ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: c0a61b78d170bfec84aa9052ddf4e2010ba79afb
Parent: 8e1243fc6309f0793b5c8d814ba79f6dffda8386
Author: Randy Palamar
Date:   Tue, 10 Jun 2025 10:54:28 -0600

core/lib: rework library syncing and locking

this removes the trivial ways of deadlocking the library while
also greatly improving throughput. for a particular dataset on my
computer the beamformer would previously achieve ~1.3 GB/s; now it
is consistently ~2.1 GB/s

this also transitions w32 to using native w32 semaphores so that
lock timeouts actually work on w32
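
[editor's note] The mechanism behind the throughput and deadlock claims: each
sync variable in shared memory is now a real lock taken with a userspace
compare-and-swap, falling back to a futex wait on Linux, always with a timeout
so a crashed or slow peer cannot wedge the library. A minimal sketch of the
Linux acquire path, using standard C11 atomics in place of the repo's atomic_*
wrappers; wait_on_value() is a hypothetical stand-in for the futex-wait wrapper
(os_wait_on_value in os_linux.c):

    #include <stdatomic.h>

    /* assumed: futex(FUTEX_WAIT) wrapper; returns 0 on timeout */
    int wait_on_value(_Atomic int *addr, int old, int timeout_ms);

    /* try to take the lock; returns 1 on success, 0 on timeout.
     * with timeout_ms == 0 this degenerates to a single try-lock. */
    int shm_lock(_Atomic int *lock, int timeout_ms)
    {
        for (;;) {
            int expected = 0;
            if (atomic_compare_exchange_strong(lock, &expected, 1))
                return 1;
            /* expected now holds the observed value; sleep until it changes */
            if (!timeout_ms || !wait_on_value(lock, expected, timeout_ms))
                return 0;
        }
    }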

Diffstat:
M beamformer.c                      | 140 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M beamformer.h                      |   5 +++--
M beamformer_parameters.h           |   2 --
M beamformer_work_queue.c           |  16 ----------------
M beamformer_work_queue.h           |  51 +++++++++++++++++++++++++++++++++++----------------
M build.c                           |   1 -
M helpers/ogl_beamformer_lib.c      | 173 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
M helpers/ogl_beamformer_lib_base.h |   4 +++-
M os_linux.c                        |  72 ++++++++++++++++++++++++++++++++++++++++++++++++------------------------
M os_win32.c                        |  79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M static.c                          |  35 ++++++++++++++---------------------
M ui.c                              |  32 ++++++++++++++++----------------
M util.c                            |   9 +++++++++
M util.h                            |  30 +++++++++++++++++++++---------
14 files changed, 391 insertions(+), 258 deletions(-)
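
[editor's note] Worth knowing before reading the diff: the futex-style
WaitOnAddress path never worked across processes on w32 (the old library
stubbed it out to a spin wait; see the removed os_wait_on_value_stub below), so
the locks are now backed by named semaphores, one per lock kind, named
"<shm_name>_lock_<index>" so the beamformer and any client open the same kernel
objects. A sketch of the acquire path under that assumption; the i32 lock word
is only mirrored so userspace can peek at lock state without a syscall:

    #include <windows.h>

    /* sketch: the named semaphore provides cross-process mutual exclusion and
     * kernel timeouts; timeout_ms == INFINITE blocks forever like the futex path */
    int w32_shm_lock(HANDLE semaphore, volatile LONG *lock_word, DWORD timeout_ms)
    {
        int acquired = WaitForSingleObject(semaphore, timeout_ms) == WAIT_OBJECT_0;
        if (acquired)
            InterlockedExchange(lock_word, 1); /* mirror state for cheap peeking */
        return acquired;
    }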

diff --git a/beamformer.c b/beamformer.c
@@ -1,6 +1,5 @@
 /* See LICENSE for license details. */
 /* TODO(rnp):
- * [ ]: refactor: BeamformGPUComputeContext
  * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
  * [ ]: reinvestigate ring buffer raw_data_ssbo
  *      - to minimize latency the main thread should manage the subbuffer upload so that the
@@ -10,6 +9,14 @@
  *        can overwrite one while the other is in use.
  *      - make use of glFenceSync to guard buffer uploads
  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
+ * [ ]: bug: re-beamform on shader reload
+ * [ ]: need to keep track of gpu memory in some way
+ *      - want to be able to store more than 16 2D frames but limit 3D frames
+ *      - maybe keep track of how much gpu memory is committed for beamformed images
+ *        and use that to determine when to loop back over existing textures
+ *      - to do this maybe use a circular linked list instead of a flat array
+ *      - then have a way of querying how many frames are available for a specific point count
+ * [ ]: bug: reinit cuda on hot-reload
  */

 #include "beamformer.h"
@@ -111,7 +118,7 @@ function void
 alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
 {
 	ComputeShaderCtx *cs = &ctx->csctx;
-	BeamformerParameters *bp = &ctx->shared_memory->parameters;
+	BeamformerParameters *bp = &((BeamformerSharedMemory *)ctx->shared_memory.region)->parameters;

 	cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
 	cs->rf_raw_size  = rf_raw_size;
@@ -161,6 +168,7 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag pl
 	u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
 	u32 frame_index = frame_id % countof(ctx->beamform_frames);
 	work->type      = BW_COMPUTE;
+	work->lock      = BeamformerSharedMemoryLockKind_DispatchCompute;
 	work->frame     = ctx->beamform_frames + frame_index;
 	work->frame->ready_to_present = 0;
 	work->frame->frame.id         = frame_id;
@@ -270,7 +278,8 @@ compute_cursor_finished(struct compute_cursor *cursor)
 function void
 do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ShaderKind shader)
 {
-	ComputeShaderCtx *csctx = &ctx->csctx;
+	ComputeShaderCtx       *csctx = &ctx->csctx;
+	BeamformerSharedMemory *sm    = ctx->shared_memory.region;

 	glUseProgram(csctx->programs[shader]);
@@ -364,10 +373,10 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame,
 		aframe->frame.id = ctx->averaged_frame_index;
 		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
 		 * this is fine for rolling averaging but what if we want to do something else */
-		ASSERT(frame >= ctx->beamform_frames);
-		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
+		assert(frame >= ctx->beamform_frames);
+		assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames));
 		u32 base_index = (u32)(frame - ctx->beamform_frames);
-		u32 to_average = ctx->shared_memory->parameters.output_points[3];
+		u32 to_average = sm->parameters.output_points[3];
 		u32 frame_count = 0;
 		u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
 		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
@@ -482,11 +491,11 @@ reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extr
 function void
-complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
+complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context)
 {
 	ComputeShaderCtx *cs = &ctx->csctx;
-	BeamformerParameters *bp = &ctx->shared_memory->parameters;
-	BeamformerSharedMemory *sm = ctx->shared_memory;
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
+	BeamformerParameters *bp = &sm->parameters;

 	BeamformWork *work = beamform_work_queue_pop(q);
 	while (work) {
@@ -517,7 +526,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 			}
 		} break;
 		case BW_UPLOAD_BUFFER: {
-			assert(!atomic_load_u32((i32 *)(barrier_offset + work->completion_barrier)));
+			ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1);
 			BeamformerUploadContext *uc = &work->upload_context;
 			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
 			switch (uc->kind) {
@@ -541,7 +550,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
 			} break;
 			case BU_KIND_PARAMETERS: {
-				ctx->ui_read_params = barrier_offset != 0;
+				ctx->ui_read_params = ctx->beamform_work_queue != q;
 				buffer = cs->shared_ubo;
 			} break;
 			case BU_KIND_RF_DATA: {
@@ -564,8 +573,26 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 				glNamedBufferSubData(buffer, 0, uc->size, (u8 *)sm + uc->shared_memory_offset);
 			}
+			ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
 		} break;
-		case BW_COMPUTE: {
+		case BW_COMPUTE_INDIRECT:{
+			fill_frame_compute_work(ctx, work, work->compute_indirect_plane);
+			DEBUG_DECL(work->type = BW_COMPUTE_INDIRECT;)
+		} /* FALLTHROUGH */
+		case BW_COMPUTE:{
+			/* NOTE(rnp): debug: here it is not a bug to release the lock if it
+			 * isn't held but elswhere it is */
+			DEBUG_DECL(if (sm->locks[work->lock])) {
+				ctx->os.shared_memory_region_unlock(&ctx->shared_memory,
+				                                    sm->locks, work->lock);
+			}
+			atomic_store_u32(&ctx->starting_compute, 0);
+
+			if (cs->shared_ubo_dirty) {
+				glNamedBufferSubData(cs->shared_ubo, 0, sizeof(sm->parameters), &sm->parameters);
+				cs->shared_ubo_dirty = 0;
+			}
+
 			atomic_store_u32(&cs->processing_compute, 1);
 			start_renderdoc_capture(gl_context);
@@ -640,14 +667,10 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 				INVALID_CODE_PATH;
 			}
 		} break;
-		default: INVALID_CODE_PATH; break;
+		InvalidDefaultCase;
 		}

 		if (can_commit) {
-			if (work->completion_barrier) {
-				i32 *value = (i32 *)(barrier_offset + work->completion_barrier);
-				ctx->os.wake_waiters(value);
-			}
 			beamform_work_queue_pop_commit(q);
 			work = beamform_work_queue_pop(q);
 		}
@@ -657,7 +680,7 @@
 DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
 {
 	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
-	BeamformerSharedMemory *sm = ctx->shared_memory;
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
 	ComputeShaderCtx *cs = &ctx->csctx;

 	glCreateBuffers(1, &cs->shared_ubo);
@@ -678,9 +701,10 @@ DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
 {
-	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
-	complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory);
-	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0);
+	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
+	complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
+	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
 }

 #include "ui.c"
@@ -700,49 +724,40 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
 		DEBUG_DECL(end_frame_capture = ctx->os.end_frame_capture);
 	}

-	BeamformerParameters *bp = &ctx->shared_memory->parameters;
-	if (ctx->shared_memory->dispatch_compute_sync) {
-		ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane;
-		atomic_store_u32(&ctx->shared_memory->dispatch_compute_sync, 0);
-		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
-		if (work) {
-			if (fill_frame_compute_work(ctx, work, current_plane))
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
+	BeamformerParameters *bp = &sm->parameters;
+	if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && !ctx->starting_compute) {
+		if (sm->start_compute_from_main) {
+			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
+			ImagePlaneTag tag = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
+			if (fill_frame_compute_work(ctx, work, tag)) {
 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
-
-			if (ctx->shared_memory->export_next_frame) {
-				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
-				if (export) {
-					/* TODO: we don't really want the beamformer opening/closing files */
-					iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
-					export->type = BW_SAVE_FRAME;
-					export->output_frame_ctx.file_handle = f;
-					if (bp->output_points[3] > 1) {
-						u32 a_index = !(ctx->averaged_frame_index %
-						                ARRAY_COUNT(ctx->averaged_frames));
-						BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
-						export->output_frame_ctx.frame = aframe;
-					} else {
-						export->output_frame_ctx.frame = work->frame;
+				if (sm->export_next_frame) {
+					BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
+					if (export) {
+						/* TODO: we don't really want the beamformer opening/closing files */
+						iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
+						export->type = BW_SAVE_FRAME;
+						export->output_frame_ctx.file_handle = f;
+						if (bp->output_points[3] > 1) {
+							static_assert(countof(ctx->averaged_frames) == 2,
+							              "fix this, we assume average frame ping pong buffer");
+							u32 a_index = !(ctx->averaged_frame_index %
+							                countof(ctx->averaged_frames));
+							BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
+							export->output_frame_ctx.frame = aframe;
+						} else {
+							export->output_frame_ctx.frame = work->frame;
+						}
+						beamform_work_queue_push_commit(ctx->beamform_work_queue);
 					}
-					beamform_work_queue_push_commit(ctx->beamform_work_queue);
+					sm->export_next_frame = 0;
 				}
-				ctx->shared_memory->export_next_frame = 0;
-			}
-
-			ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
-		}
-	}
-
-	if (ctx->start_compute) {
-		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
-			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
-			ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
-			if (fill_frame_compute_work(ctx, work, plane)) {
-				beamform_work_queue_push_commit(ctx->beamform_work_queue);
-				ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
-				ctx->start_compute = 0;
 			}
+			atomic_store_u32(&sm->start_compute_from_main, 0);
 		}
+		atomic_store_u32(&ctx->starting_compute, 1);
+		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
 	}

 	ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index,
@@ -754,14 +769,9 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
 		}
 	}

-	if (ctx->start_compute) {
-		ctx->start_compute = 0;
-		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
-	}
-
 	BeamformComputeFrame *frame_to_draw;
 	if (bp->output_points[3] > 1) {
-		u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
+		u32 a_index = !(ctx->averaged_frame_index % countof(ctx->averaged_frames));
 		frame_to_draw = ctx->averaged_frames + a_index;
 	} else {
 		frame_to_draw = ctx->beamform_frames + ctx->display_frame_index;
diff --git a/beamformer.h b/beamformer.h
@@ -87,6 +87,7 @@ typedef struct {
 	u32 raw_data_ssbo;

 	u32 shared_ubo;
+	b32 shared_ubo_dirty;

 	u32 channel_mapping_texture;
 	u32 sparse_elements_texture;
@@ -171,7 +172,7 @@ typedef struct {
 	GLParams gl;

 	uv2 window_size;
-	b32 start_compute;
+	b32 starting_compute;
 	b32 should_exit;

 	Arena ui_backing_store;
@@ -201,7 +202,7 @@ typedef struct {

 	BeamformWorkQueue *beamform_work_queue;

-	BeamformerSharedMemory *shared_memory;
+	SharedMemoryRegion shared_memory;
 } BeamformerCtx;

 struct ShaderReloadContext {
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -7,8 +7,6 @@
  * programatically would be nice. */

-#define BEAMFORMER_PARAMETERS_VERSION (2UL)
-
 /* X(enumarant, number, shader file name, needs header, pretty name) */
 #define COMPUTE_SHADERS \
 	X(CudaDecode, 0, "", 0, "CUDA Decode") \
diff --git a/beamformer_work_queue.c b/beamformer_work_queue.c
@@ -50,19 +50,3 @@ DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit)
 {
 	atomic_add_u64(&q->queue, 1);
 }
-
-function b32
-try_wait_sync(i32 *sync, i32 timeout_ms, os_wait_on_value_fn *os_wait_on_value)
-{
-	b32 result = 0;
-	for (;;) {
-		i32 current = atomic_load_u32(sync);
-		if (current && atomic_cas_u32(sync, &current, 0)) {
-			result = 1;
-			break;
-		}
-		if (!timeout_ms || !os_wait_on_value(sync, 0, timeout_ms))
-			break;
-	}
-	return result;
-}
diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h
@@ -1,12 +1,20 @@
 /* See LICENSE for license details. */
+/* TODO(rnp):
+ * [ ]: coalesce uploads if they are overwriting exist data
+ *      - use a flag field and only submit a new work if the corresponding flag is clear
+ */
+
 #ifndef _BEAMFORMER_WORK_QUEUE_H_
 #define _BEAMFORMER_WORK_QUEUE_H_

+#define BEAMFORMER_SHARED_MEMORY_VERSION (3UL)
+
 typedef struct BeamformComputeFrame BeamformComputeFrame;
 typedef struct ShaderReloadContext ShaderReloadContext;

 typedef enum {
 	BW_COMPUTE,
+	BW_COMPUTE_INDIRECT,
 	BW_RELOAD_SHADER,
 	BW_SAVE_FRAME,
 	BW_SEND_FRAME,
@@ -33,6 +41,21 @@ typedef struct {
 	iptr file_handle;
 } BeamformOutputFrameContext;

+#define BEAMFORMER_SHARED_MEMORY_LOCKS \
+	X(None)            \
+	X(Parameters)      \
+	X(ParametersHead)  \
+	X(ParametersUI)    \
+	X(FocalVectors)    \
+	X(ChannelMapping)  \
+	X(SparseElements)  \
+	X(RawData)         \
+	X(DispatchCompute)
+
+#define X(name) BeamformerSharedMemoryLockKind_##name,
+typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Count} BeamformerSharedMemoryLockKind;
+#undef X
+
 /* NOTE: discriminated union based on type */
 typedef struct {
 	union {
@@ -40,11 +63,10 @@ typedef struct {
 		BeamformerUploadContext upload_context;
 		BeamformOutputFrameContext output_frame_ctx;
 		ShaderReloadContext *shader_reload_context;
+		ImagePlaneTag compute_indirect_plane;
 		void *generic;
 	};
-	/* NOTE(rnp): mostly for __external__ processes to sync on. when passed from external
-	 * process this should be an offset from base of shared_memory */
-	iptr completion_barrier;
+	BeamformerSharedMemoryLockKind lock;
 	BeamformWorkType type;
 } BeamformWork;

@@ -68,9 +90,15 @@ typedef BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit_fn);
                                       - (uintptr_t)(sizeof(BeamformerSharedMemory) & 4095ULL))
 #define BEAMFORMER_MAX_RF_DATA_SIZE (BEAMFORMER_SHARED_MEMORY_SIZE - BEAMFORMER_RF_DATA_OFF)

-typedef struct {
+typedef align_as(64) struct {
+	u32 version;
+
+	/* NOTE(rnp): not used for locking on w32 but we can use these to peek at the status of
+	 * the lock without leaving userspace. also this struct needs a bunch of padding */
+	i32 locks[BeamformerSharedMemoryLockKind_Count];
+
 	/* NOTE(rnp): interleaved transmit angle, focal depth pairs */
-	_Alignas(64) v2 focal_vectors[256];
+	align_as(64) v2 focal_vectors[256];

 	i16 channel_mapping[256];
 	i16 sparse_elements[256];
@@ -87,21 +115,12 @@ typedef struct {
 	ComputeShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES];
 	u32 compute_stages_count;

-	i32 parameters_sync;
-	i32 parameters_head_sync;
-	i32 parameters_ui_sync;
-	i32 focal_vectors_sync;
-	i32 channel_mapping_sync;
-	i32 sparse_elements_sync;
-	i32 raw_data_sync;
-
-	i32 dispatch_compute_sync;
-	ImagePlaneTag current_image_plane;
+	/* TODO(rnp): hack: we need a different way of dispatching work for export */
+	b32 start_compute_from_main;

 	/* TODO(rnp): this shouldn't be needed */
 	b32 export_next_frame;

-	u32 version;
 	BeamformWorkQueue external_work_queue;
 } BeamformerSharedMemory;
diff --git a/build.c b/build.c
@@ -272,7 +272,6 @@ W32(b32) CreateProcessA(u8 *, u8 *, iptr, iptr, b32, u32, iptr, u8 *, iptr, iptr
 W32(b32) GetExitCodeProcess(iptr handle, u32 *);
 W32(b32) GetFileTime(iptr, iptr, iptr, iptr);
 W32(b32) MoveFileExA(c8 *, c8 *, u32);
-W32(u32) WaitForSingleObject(iptr, u32);

 function void
 os_make_directory(char *name)
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -8,6 +8,7 @@

 #define PIPE_RETRY_PERIOD_MS (100ULL)

+global SharedMemoryRegion g_shared_memory;
 global BeamformerSharedMemory *g_bp;
 global BeamformerLibErrorKind g_lib_last_error;

@@ -72,15 +73,14 @@ os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms)
 	return total_read == read_size;
 }

-static BeamformerSharedMemory *
+function SharedMemoryRegion
 os_open_shared_memory_area(char *name)
 {
-	BeamformerSharedMemory *result = 0;
+	SharedMemoryRegion result = {0};
 	i32 fd = shm_open(name, O_RDWR, S_IRUSR|S_IWUSR);
 	if (fd > 0) {
 		void *new = mmap(0, BEAMFORMER_SHARED_MEMORY_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
-		if (new != MAP_FAILED)
-			result = new;
+		if (new != MAP_FAILED) result.region = new;
 		close(fd);
 	}
 	return result;
@@ -88,15 +88,6 @@ os_open_shared_memory_area(char *name)

 #elif OS_WINDOWS

-/* TODO(rnp): temporary workaround */
-function OS_WAIT_ON_VALUE_FN(os_wait_on_value_stub)
-{
-	/* TODO(rnp): this doesn't work across processes on win32 (return 1 to cause a spin wait) */
-	return 1;
-	return WaitOnAddress(value, &current, sizeof(*value), timeout_ms);
-}
-#define os_wait_on_value os_wait_on_value_stub
-
 static Pipe
 os_open_read_pipe(char *name)
 {
@@ -145,16 +136,35 @@ os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms)
 	return total_read == read_size;
 }

-function BeamformerSharedMemory *
+function SharedMemoryRegion
 os_open_shared_memory_area(char *name)
 {
-	BeamformerSharedMemory *result = 0;
+	SharedMemoryRegion result = {0};
 	iptr h = OpenFileMappingA(FILE_MAP_ALL_ACCESS, 0, name);
 	if (h != INVALID_FILE) {
-		result = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, BEAMFORMER_SHARED_MEMORY_SIZE);
+		void *new = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0,
+		                          os_round_up_to_page_size(BEAMFORMER_SHARED_MEMORY_SIZE));
+		if (new) {
+			u8 buffer[1024];
+			Stream sb = {.data = buffer, .cap = 1024};
+			stream_append_s8s(&sb, c_str_to_s8(name), s8("_lock_"));
+			local_persist iptr semaphores[BeamformerSharedMemoryLockKind_Count];
+			local_persist w32_shared_memory_context ctx = {.semaphores = semaphores};
+			b32 all_semaphores = 1;
+			for (i32 i = 0; i < countof(semaphores); i++) {
+				Stream lb = sb;
+				stream_append_i64(&lb, i);
+				stream_append_byte(&lb, 0);
+				semaphores[i] = CreateSemaphoreA(0, 1, 1, (c8 *)lb.data);
+				all_semaphores &= semaphores[i] != INVALID_FILE;
+			}
+			if (all_semaphores) {
+				result.region     = new;
+				result.os_context = (iptr)&ctx;
+			}
+		}
 		CloseHandle(h);
 	}
-
 	return result;
 }
@@ -164,16 +174,19 @@ function b32
 check_shared_memory(void)
 {
 	b32 result = 1;
-	if (!g_bp) {
-		g_bp = os_open_shared_memory_area(OS_SHARED_MEMORY_NAME);
-		if (!g_bp) {
+	if (!g_shared_memory.region) {
+		g_shared_memory = os_open_shared_memory_area(OS_SHARED_MEMORY_NAME);
+		if (!g_shared_memory.region) {
 			result = 0;
 			g_lib_last_error = BF_LIB_ERR_KIND_SHARED_MEMORY;
 		}
-	} else if (g_bp->version != BEAMFORMER_PARAMETERS_VERSION) {
+	} else if (((BeamformerSharedMemory *)g_shared_memory.region)->version
+	           != BEAMFORMER_SHARED_MEMORY_VERSION)
+	{
 		g_lib_last_error = BF_LIB_ERR_KIND_VERSION_MISMATCH;
 		result = 0;
 	}
+	if (result) g_bp = g_shared_memory.region;
 	return result;
 }

@@ -186,17 +199,23 @@ try_push_work_queue(void)
 }

 function b32
-lib_try_wait_sync(i32 *sync, i32 timeout_ms, os_wait_on_value_fn *os_wait_on_value)
+lib_try_lock(BeamformerSharedMemoryLockKind lock, i32 timeout_ms)
 {
-	b32 result = try_wait_sync(sync, timeout_ms, os_wait_on_value);
+	b32 result = os_shared_memory_region_lock(&g_shared_memory, g_bp->locks, (i32)lock, timeout_ms);
 	if (!result) g_lib_last_error = BF_LIB_ERR_KIND_SYNC_VARIABLE;
 	return result;
 }

+function void
+lib_release_lock(BeamformerSharedMemoryLockKind lock)
+{
+	os_shared_memory_region_unlock(&g_shared_memory, g_bp->locks, (i32)lock);
+}
+
 u32
 beamformer_get_api_version(void)
 {
-	return BEAMFORMER_PARAMETERS_VERSION;
+	return BEAMFORMER_SHARED_MEMORY_VERSION;
 }

 const char *
@@ -245,64 +264,60 @@ set_beamformer_pipeline(i32 *stages, i32 stages_count)
 }

 b32
-beamformer_start_compute(u32 image_plane_tag)
+beamformer_start_compute(i32 timeout_ms)
 {
 	b32 result = 0;
-	if (image_plane_tag < IPT_LAST) {
-		if (check_shared_memory()) {
-			if (atomic_load_u32(&g_bp->dispatch_compute_sync) == 0) {
-				g_bp->current_image_plane = image_plane_tag;
-				atomic_store_u32(&g_bp->dispatch_compute_sync, 1);
+	if (check_shared_memory()) {
+		if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) {
+			if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, timeout_ms)) {
+				lib_release_lock(BeamformerSharedMemoryLockKind_DispatchCompute);
 				result = 1;
-			} else {
-				g_lib_last_error = BF_LIB_ERR_KIND_SYNC_VARIABLE;
 			}
 		}
-	} else {
-		g_lib_last_error = BF_LIB_ERR_KIND_INVALID_IMAGE_PLANE;
 	}
 	return result;
 }

 function b32
-beamformer_upload_buffer(void *data, u32 size, i32 store_offset, i32 sync_offset,
+beamformer_upload_buffer(void *data, u32 size, i32 store_offset, BeamformerSharedMemoryLockKind lock,
                          BeamformerUploadKind kind, i32 timeout_ms)
 {
 	b32 result = 0;
 	if (check_shared_memory()) {
 		BeamformWork *work = try_push_work_queue();
-		result = work && lib_try_wait_sync((i32 *)((u8 *)g_bp + sync_offset), timeout_ms, os_wait_on_value);
+		result = work && lib_try_lock(lock, timeout_ms);
 		if (result) {
 			BeamformerUploadContext *uc = &work->upload_context;
 			uc->shared_memory_offset = store_offset;
 			uc->size = size;
 			uc->kind = kind;
 			work->type = BW_UPLOAD_BUFFER;
-			work->completion_barrier = sync_offset;
+			work->lock = lock;
 			mem_copy((u8 *)g_bp + store_offset, data, size);
 			beamform_work_queue_push_commit(&g_bp->external_work_queue);
+			lib_release_lock(lock);
 		}
 	}
 	return result;
 }

 #define BEAMFORMER_UPLOAD_FNS \
-	X(channel_mapping, i16, 1, CHANNEL_MAPPING) \
-	X(sparse_elements, i16, 1, SPARSE_ELEMENTS) \
-	X(focal_vectors,   f32, 2, FOCAL_VECTORS)
+	X(channel_mapping, i16, 1, ChannelMapping, CHANNEL_MAPPING) \
+	X(sparse_elements, i16, 1, SparseElements, SPARSE_ELEMENTS) \
+	X(focal_vectors,   f32, 2, FocalVectors,   FOCAL_VECTORS)

-#define X(name, dtype, elements, command) \
+#define X(name, dtype, elements, lock_name, command) \
 b32 beamformer_push_##name (dtype *data, u32 count, i32 timeout_ms) { \
-	b32 result = 0; \
-	if (count <= countof(g_bp->name)) { \
-		result = beamformer_upload_buffer(data, count * elements * sizeof(dtype), \
-		                                  offsetof(BeamformerSharedMemory, name), \
-		                                  offsetof(BeamformerSharedMemory, name##_sync), \
-		                                  BU_KIND_##command, timeout_ms); \
-	} else { \
-		g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW; \
-	} \
-	return result; \
+	b32 result = 0; \
+	if (count <= countof(g_bp->name)) { \
+		result = beamformer_upload_buffer(data, count * elements * sizeof(dtype), \
+		                                  offsetof(BeamformerSharedMemory, name), \
+		                                  BeamformerSharedMemoryLockKind_##lock_name, \
+		                                  BU_KIND_##command, timeout_ms); \
+	} else { \
+		g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW; \
+	} \
+	return result; \
 }
 BEAMFORMER_UPLOAD_FNS
 #undef X
@@ -312,19 +327,20 @@ beamformer_push_parameters(BeamformerParameters *bp, i32 timeout_ms)
 {
 	b32 result = beamformer_upload_buffer(bp, sizeof(*bp),
 	                                      offsetof(BeamformerSharedMemory, parameters),
-	                                      offsetof(BeamformerSharedMemory, parameters_sync),
+	                                      BeamformerSharedMemoryLockKind_Parameters,
 	                                      BU_KIND_PARAMETERS, timeout_ms);
 	return result;
 }

-b32
-beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
+function b32
+beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, b32 start_from_main)
 {
 	b32 result = 0;
 	if (data_size <= BEAMFORMER_MAX_RF_DATA_SIZE) {
 		result = beamformer_upload_buffer(data, data_size, BEAMFORMER_RF_DATA_OFF,
-		                                  offsetof(BeamformerSharedMemory, raw_data_sync),
+		                                  BeamformerSharedMemoryLockKind_RawData,
 		                                  BU_KIND_RF_DATA, timeout_ms);
+		if (result && start_from_main) atomic_store_u32(&g_bp->start_compute_from_main, 1);
 	} else {
 		g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW;
 	}
@@ -332,21 +348,49 @@ beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
 }

 b32
+beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
+{
+	return beamformer_push_data_base(data, data_size, timeout_ms, 1);
+}
+
+b32
+beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag, i32 timeout_ms)
+{
+	b32 result = beamformer_push_data_base(data, data_size, timeout_ms, 0);
+	if (result) {
+		result = image_plane_tag < IPT_LAST;
+		if (result) {
+			BeamformWork *work = try_push_work_queue();
+			result = work != 0;
+			if (result) {
+				work->type = BW_COMPUTE_INDIRECT;
+				work->compute_indirect_plane = image_plane_tag;
+				beamform_work_queue_push_commit(&g_bp->external_work_queue);
+			}
+		} else {
+			g_lib_last_error = BF_LIB_ERR_KIND_INVALID_IMAGE_PLANE;
+		}
+	}
+	return result;
+}
+
+b32
 beamformer_push_parameters_ui(BeamformerUIParameters *bp, i32 timeout_ms)
 {
 	b32 result = 0;
 	if (check_shared_memory()) {
 		BeamformWork *work = try_push_work_queue();
-		result = work && lib_try_wait_sync(&g_bp->parameters_ui_sync, timeout_ms, os_wait_on_value);
+		result = work && lib_try_lock(BeamformerSharedMemoryLockKind_ParametersUI, timeout_ms);
 		if (result) {
 			BeamformerUploadContext *uc = &work->upload_context;
 			uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
 			uc->size = sizeof(g_bp->parameters);
 			uc->kind = BU_KIND_PARAMETERS;
 			work->type = BW_UPLOAD_BUFFER;
-			work->completion_barrier = offsetof(BeamformerSharedMemory, parameters_ui_sync);
+			work->lock = BeamformerSharedMemoryLockKind_ParametersUI;
 			mem_copy(&g_bp->parameters_ui, bp, sizeof(*bp));
 			beamform_work_queue_push_commit(&g_bp->external_work_queue);
+			lib_release_lock(BeamformerSharedMemoryLockKind_ParametersUI);
 		}
 	}
 	return result;
@@ -358,16 +402,17 @@ beamformer_push_parameters_head(BeamformerParametersHead *bp, i32 timeout_ms)
 	b32 result = 0;
 	if (check_shared_memory()) {
 		BeamformWork *work = try_push_work_queue();
-		result = work && lib_try_wait_sync(&g_bp->parameters_head_sync, timeout_ms, os_wait_on_value);
+		result = work && lib_try_lock(BeamformerSharedMemoryLockKind_ParametersHead, timeout_ms);
 		if (result) {
 			BeamformerUploadContext *uc = &work->upload_context;
 			uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
 			uc->size = sizeof(g_bp->parameters);
 			uc->kind = BU_KIND_PARAMETERS;
 			work->type = BW_UPLOAD_BUFFER;
-			work->completion_barrier = offsetof(BeamformerSharedMemory, parameters_head_sync);
+			work->lock = BeamformerSharedMemoryLockKind_ParametersHead;
 			mem_copy(&g_bp->parameters_head, bp, sizeof(*bp));
 			beamform_work_queue_push_commit(&g_bp->external_work_queue);
+			lib_release_lock(BeamformerSharedMemoryLockKind_ParametersHead);
 		}
 	}
 	return result;
@@ -393,14 +438,8 @@ b32
 send_data(void *data, u32 data_size)
 {
 	b32 result = 0;
-	if (beamformer_push_data(data, data_size, 0)) {
-		result = beamformer_start_compute(0);
-		if (result) {
-			/* TODO(rnp): should we just set timeout on acquiring the lock instead of this? */
-			try_wait_sync(&g_bp->raw_data_sync, -1, os_wait_on_value);
-			atomic_store_u32(&g_bp->raw_data_sync, 1);
-		}
-	}
+	if (beamformer_push_data(data, data_size, 0))
+		result = beamformer_start_compute(-1);
 	return result;
 }
diff --git a/helpers/ogl_beamformer_lib_base.h b/helpers/ogl_beamformer_lib_base.h
@@ -38,10 +38,12 @@ LIB_FN uint32_t send_data(void *data, uint32_t data_size);
 LIB_FN uint32_t beamform_data_synchronized(void *data, uint32_t data_size, uint32_t output_points[3],
                                            float *out_data, int32_t timeout_ms);

-LIB_FN uint32_t beamformer_start_compute(uint32_t image_plane_tag);
+/* NOTE: tells the beamformer to start beamforming and waits until it starts or for timeout_ms */
+LIB_FN uint32_t beamformer_start_compute(int32_t timeout_ms);

 /* NOTE: these functions only queue an upload; you must flush (old data functions or start_compute) */
 LIB_FN uint32_t beamformer_push_data(void *data, uint32_t size, int32_t timeout_ms);
+LIB_FN uint32_t beamformer_push_data_with_compute(void *data, uint32_t size, uint32_t image_plane_tag, int32_t timeout_ms);
 LIB_FN uint32_t beamformer_push_channel_mapping(int16_t *mapping, uint32_t count, int32_t timeout_ms);
 LIB_FN uint32_t beamformer_push_sparse_elements(int16_t *elements, uint32_t count, int32_t timeout_ms);
 LIB_FN uint32_t beamformer_push_focal_vectors(float *vectors, uint32_t count, int32_t timeout_ms);
diff --git a/os_linux.c b/os_linux.c
@@ -83,24 +83,25 @@ os_get_timer_counter(void)
 	return result;
 }

+function iz
+os_round_up_to_page_size(iz value)
+{
+	iz result = round_up_to(value, sysconf(_SC_PAGESIZE));
+	return result;
+}
+
 function OS_ALLOC_ARENA_FN(os_alloc_arena)
 {
-	Arena result;
-	iz pagesize = sysconf(_SC_PAGESIZE);
-	if (capacity % pagesize != 0)
-		capacity += pagesize - capacity % pagesize;
-
-	iz oldsize = old.end - old.beg;
-	if (oldsize > capacity)
-		return old;
-
-	if (old.beg)
-		munmap(old.beg, oldsize);
-
-	result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-	if (result.beg == MAP_FAILED)
-		os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
-	result.end = result.beg + capacity;
+	Arena result = old;
+	capacity = os_round_up_to_page_size(capacity);
+	iz old_size = old.end - old.beg;
+	if (old_size < capacity) {
+		if (old.beg) munmap(old.beg, old_size);
+		result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+		if (result.beg == MAP_FAILED)
+			os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
+		result.end = result.beg + capacity;
+	}
 	return result;
 }

@@ -163,15 +164,15 @@ function OS_READ_FILE_FN(os_read_file)
 	return total_read;
 }

-function void *
-os_create_shared_memory_area(char *name, iz cap)
+function SharedMemoryRegion
+os_create_shared_memory_area(Arena *arena, char *name, i32 lock_count, iz requested_capacity)
 {
-	void *result = 0;
+	iz capacity = os_round_up_to_page_size(requested_capacity);
+	SharedMemoryRegion result = {0};
 	i32 fd = shm_open(name, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
-	if (fd > 0 && ftruncate(fd, cap) != -1) {
-		void *new = mmap(NULL, cap, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
-		if (new != MAP_FAILED)
-			result = new;
+	if (fd > 0 && ftruncate(fd, capacity) != -1) {
+		void *new = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+		if (new != MAP_FAILED) result.region = new;
 	}
 	if (fd > 0) close(fd);
 	return result;
@@ -294,7 +295,30 @@ function OS_WAIT_ON_VALUE_FN(os_wait_on_value)
 function OS_WAKE_WAITERS_FN(os_wake_waiters)
 {
 	if (sync) {
-		atomic_store_u32(sync, 1);
+		atomic_store_u32(sync, 0);
 		syscall(SYS_futex, sync, FUTEX_WAKE, I32_MAX, 0, 0, 0);
 	}
 }
+
+function OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock)
+{
+	b32 result = 0;
+	for (;;) {
+		i32 current = atomic_load_u32(locks + lock_index);
+		if (current == 0 && atomic_cas_u32(locks + lock_index, &current, 1)) {
+			result = 1;
+			break;
+		}
+		if (!timeout_ms || !os_wait_on_value(locks + lock_index, current, timeout_ms))
+			break;
+	}
+	return result;
+}
+
+function OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock)
+{
+	i32 *lock = locks + lock_index;
+	assert(atomic_load_u32(lock));
+	atomic_store_u32(lock, 0);
+	os_wake_waiters(lock);
+}
diff --git a/os_win32.c b/os_win32.c
@@ -83,12 +83,17 @@ typedef struct {
 	iptr context;
 } w32_io_completion_event;

+typedef struct {
+	iptr *semaphores;
+} w32_shared_memory_context;
+
 #define W32(r) __declspec(dllimport) r __stdcall
 W32(b32)  CloseHandle(iptr);
 W32(b32)  CopyFileA(c8 *, c8 *, b32);
 W32(iptr) CreateFileA(c8 *, u32, u32, void *, u32, u32, void *);
 W32(iptr) CreateFileMappingA(iptr, void *, u32, u32, u32, c8 *);
 W32(iptr) CreateIoCompletionPort(iptr, iptr, uptr, u32);
+W32(iptr) CreateSemaphoreA(iptr, i32, i32, c8 *);
 W32(iptr) CreateThread(iptr, uz, iptr, iptr, u32, u32 *);
 W32(b32)  DeleteFileA(c8 *);
 W32(void) ExitProcess(i32);
@@ -107,8 +112,9 @@ W32(b32)  QueryPerformanceCounter(u64 *);
 W32(b32)  QueryPerformanceFrequency(u64 *);
 W32(b32)  ReadDirectoryChangesW(iptr, u8 *, u32, b32, u32, u32 *, void *, void *);
 W32(b32)  ReadFile(iptr, u8 *, i32, i32 *, void *);
-W32(b32)  ReleaseSemaphore(iptr, i64, i64 *);
+W32(b32)  ReleaseSemaphore(iptr, i32, i32 *);
 W32(i32)  SetThreadDescription(iptr, u16 *);
+W32(u32)  WaitForSingleObject(iptr, u32);
 W32(b32)  WaitOnAddress(void *, void *, uz, u32);
 W32(i32)  WakeByAddressAll(void *);
 W32(iptr) wglGetProcAddress(c8 *);
@@ -168,10 +174,9 @@ os_get_timer_counter(void)
 	return result;
 }

-function OS_ALLOC_ARENA_FN(os_alloc_arena)
+function iz
+os_round_up_to_page_size(iz value)
 {
-	Arena result = old;
-
 	struct {
 		u16 architecture;
 		u16 _pad1;
@@ -185,17 +190,18 @@ function OS_ALLOC_ARENA_FN(os_alloc_arena)
 		u16 processor_level;
 		u16 processor_revision;
 	} info;
 	GetSystemInfo(&info);
+	iz result = round_up_to(value, info.page_size);
+	return result;
+}

-	if (capacity % info.page_size != 0)
-		capacity += (info.page_size - capacity % info.page_size);
-
+function OS_ALLOC_ARENA_FN(os_alloc_arena)
+{
+	Arena result = old;
+	capacity = os_round_up_to_page_size(capacity);
 	iz old_size = old.end - old.beg;
 	if (old_size < capacity) {
-		if (old.beg)
-			VirtualFree(old.beg, old_size, MEM_RELEASE);
-
+		if (old.beg) VirtualFree(old.beg, old_size, MEM_RELEASE);
 		result.beg = VirtualAlloc(0, capacity, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
 		if (!result.beg)
 			os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
@@ -268,13 +274,34 @@ os_file_exists(char *path)
 	return result;
 }

-function void *
-os_create_shared_memory_area(char *name, iz cap)
+function SharedMemoryRegion
+os_create_shared_memory_area(Arena *arena, char *name, i32 lock_count, iz requested_capacity)
 {
-	void *result = 0;
-	iptr h = CreateFileMappingA(-1, 0, PAGE_READWRITE, 0, cap, name);
-	if (h != INVALID_FILE)
-		result = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, cap);
+	iz capacity = os_round_up_to_page_size(requested_capacity);
+	SharedMemoryRegion result = {0};
+	iptr h = CreateFileMappingA(-1, 0, PAGE_READWRITE, 0, capacity, name);
+	if (h != INVALID_FILE) {
+		void *new = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, capacity);
+		if (new) {
+			w32_shared_memory_context *ctx = push_struct(arena, typeof(*ctx));
+			ctx->semaphores = push_array(arena, typeof(*ctx->semaphores), lock_count);
+			result.os_context = (iptr)ctx;
+			result.region     = new;
+
+			Stream sb = arena_stream(*arena);
+			stream_append_s8s(&sb, c_str_to_s8(name), s8("_lock_"));
+			for (i32 i = 0; i < lock_count; i++) {
+				Stream lb = sb;
+				stream_append_i64(&lb, i);
+				stream_append_byte(&lb, 0);
+				ctx->semaphores[i] = CreateSemaphoreA(0, 1, 1, (c8 *)lb.data);
+				if (ctx->semaphores[i] == INVALID_FILE) {
+					os_fatal(s8("os_create_shared_memory_area: "
+					            "failed to create semaphore\n"));
+				}
+			}
+		}
+	}
 	return result;
 }

@@ -380,7 +407,23 @@ function OS_WAIT_ON_VALUE_FN(os_wait_on_value)
 function OS_WAKE_WAITERS_FN(os_wake_waiters)
 {
 	if (sync) {
-		atomic_add_u32(sync, 1);
+		atomic_store_u32(sync, 0);
 		WakeByAddressAll(sync);
 	}
 }
+
+function OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock)
+{
+	w32_shared_memory_context *ctx = (typeof(ctx))sm->os_context;
+	b32 result = !WaitForSingleObject(ctx->semaphores[lock_index], timeout_ms);
+	if (result) atomic_store_u32(locks + lock_index, 1);
+	return result;
+}
+
+function OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock)
+{
+	w32_shared_memory_context *ctx = (typeof(ctx))sm->os_context;
+	assert(atomic_load_u32(locks + lock_index));
+	os_wake_waiters(locks + lock_index);
+	ReleaseSemaphore(ctx->semaphores[lock_index], 1, 0);
+}
diff --git a/static.c b/static.c
@@ -212,7 +212,6 @@ function FILE_WATCH_CALLBACK_FN(load_cuda_lib)
 	return result;
 }

-
 #define GLFW_VISIBLE 0x00020004
 void glfwWindowHint(i32, i32);
 iptr glfwCreateWindow(i32, i32, char *, iptr, iptr);
@@ -229,12 +228,12 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)

 	for (;;) {
 		for (;;) {
-			i32 expected = 1;
-			if (atomic_cas_u32(&ctx->sync_variable, &expected, 0))
+			i32 expected = 0;
+			if (atomic_cas_u32(&ctx->sync_variable, &expected, 1))
 				break;

 			ctx->asleep = 1;
-			os_wait_on_value(&ctx->sync_variable, 0, -1);
+			os_wait_on_value(&ctx->sync_variable, 1, -1);
 			ctx->asleep = 0;
 		}
 		beamformer_complete_compute(ctx->user_context, ctx->arena, ctx->gl_context);
@@ -280,25 +279,19 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory)

 	ctx->beamform_work_queue = push_struct(memory, BeamformWorkQueue);

-	ctx->shared_memory = os_create_shared_memory_area(OS_SHARED_MEMORY_NAME, BEAMFORMER_SHARED_MEMORY_SIZE);
-	if (!ctx->shared_memory)
-		os_fatal(s8("Get more ram lol\n"));
-	mem_clear(ctx->shared_memory, 0, sizeof(*ctx->shared_memory));
-
-	ctx->shared_memory->version = BEAMFORMER_PARAMETERS_VERSION;
-	/* TODO(rnp): refactor - this is annoying */
-	ctx->shared_memory->parameters_sync      = 1;
-	ctx->shared_memory->parameters_head_sync = 1;
-	ctx->shared_memory->parameters_ui_sync   = 1;
-	ctx->shared_memory->raw_data_sync        = 1;
-	ctx->shared_memory->channel_mapping_sync = 1;
-	ctx->shared_memory->sparse_elements_sync = 1;
-	ctx->shared_memory->focal_vectors_sync   = 1;
+	ctx->shared_memory = os_create_shared_memory_area(memory, OS_SHARED_MEMORY_NAME,
+	                                                  BeamformerSharedMemoryLockKind_Count,
+	                                                  BEAMFORMER_SHARED_MEMORY_SIZE);
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
+	if (!sm) os_fatal(s8("Get more ram lol\n"));
+	mem_clear(sm, 0, sizeof(*sm));
+
+	sm->version = BEAMFORMER_SHARED_MEMORY_VERSION;

 	/* NOTE: default compute shader pipeline */
-	ctx->shared_memory->compute_stages[0]    = ComputeShaderKind_Decode;
-	ctx->shared_memory->compute_stages[1]    = ComputeShaderKind_DASCompute;
-	ctx->shared_memory->compute_stages_count = 2;
+	sm->compute_stages[0]    = ComputeShaderKind_Decode;
+	sm->compute_stages[1]    = ComputeShaderKind_DASCompute;
+	sm->compute_stages_count = 2;

 	if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA
 	    && load_cuda_lib(&ctx->os, s8(OS_CUDA_LIB_NAME), (iptr)&ctx->cuda_lib, *memory))
diff --git a/ui.c b/ui.c
@@ -1965,18 +1965,18 @@ draw_compute_stats_view(BeamformerCtx *ctx, Arena arena, ComputeShaderStats *sta
 	read_only local_persist s8 labels[ComputeShaderKind_Count] = {COMPUTE_SHADERS};
 	#undef X

+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
 	BeamformerUI *ui = ctx->ui;
 	f32 compute_time_sum = 0;
-	u32 stages = ctx->shared_memory->compute_stages_count;
+	u32 stages = sm->compute_stages_count;
 	TextSpec text_spec = {.font = &ui->font, .colour = FG_COLOUR, .flags = TF_LIMITED};

 	Table *table = table_new(&arena, stages + 1, 3, (TextAlignment []){TA_LEFT, TA_LEFT, TA_LEFT});
 	for (u32 i = 0; i < stages; i++) {
 		TableCell *cells = table_push_row(table, &arena, TRK_CELLS)->data;

 		Stream sb = arena_stream(arena);
-		u32 index = ctx->shared_memory->compute_stages[i];
+		ShaderKind index = (ShaderKind)sm->compute_stages[i];
 		compute_time_sum += stats->times[index];
 		stream_append_f64_e(&sb, stats->times[index]);
@@ -2824,6 +2824,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw
         ComputeShaderStats *latest_compute_stats)
 {
 	BeamformerUI *ui = ctx->ui;
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;

 	ui->latest_plane[IPT_LAST]    = frame_to_draw;
 	ui->latest_plane[frame_plane] = frame_to_draw;
@@ -2831,7 +2832,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw

 	/* TODO(rnp): there should be a better way of detecting this */
 	if (ctx->ui_read_params) {
-		mem_copy(&ui->params, &ctx->shared_memory->parameters.output_min_coordinate, sizeof(ui->params));
+		mem_copy(&ui->params, &sm->parameters.output_min_coordinate, sizeof(ui->params));
 		ui->flush_params = 0;
 		ctx->ui_read_params = 0;
 	}
@@ -2841,19 +2842,18 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw
 	ui_interact(ui, input, ctx->window_size);

 	if (ui->flush_params) {
+		i32 lock = BeamformerSharedMemoryLockKind_Parameters;
 		validate_ui_parameters(&ui->params);
-		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
-		if (work && try_wait_sync(&ctx->shared_memory->parameters_sync, 0, ctx->os.wait_on_value)) {
-			BeamformerUploadContext *uc = &work->upload_context;
-			uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
-			uc->size = sizeof(ctx->shared_memory->parameters);
-			uc->kind = BU_KIND_PARAMETERS;
-			work->type = BW_UPLOAD_BUFFER;
-			work->completion_barrier = (iptr)&ctx->shared_memory->parameters_sync;
-			mem_copy(&ctx->shared_memory->parameters_ui, &ui->params, sizeof(ui->params));
-			beamform_work_queue_push_commit(ctx->beamform_work_queue);
-			ui->flush_params = 0;
-			ctx->start_compute = 1;
+		if (ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, lock, 0)) {
+			mem_copy(&sm->parameters_ui, &ui->params, sizeof(ui->params));
+			ui->flush_params = 0;
+			ctx->csctx.shared_ubo_dirty = 1;
+			b32 dispatch = ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks,
+			                                                 BeamformerSharedMemoryLockKind_DispatchCompute,
+			                                                 0);
+			sm->start_compute_from_main |= dispatch &
+			                               ctx->beamform_frames[ctx->display_frame_index].ready_to_present;
+			ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, lock);
 		}
 	}
diff --git a/util.c b/util.c
@@ -508,6 +508,15 @@ round_down_power_of_2(u32 a)
 	return result;
 }

+function iz
+round_up_to(iz value, iz multiple)
+{
+	iz result = value;
+	if (value % multiple != 0)
+		result += multiple - value % multiple;
+	return result;
+}
+
 function b32
 uv2_equal(uv2 a, uv2 b)
 {
diff --git a/util.h b/util.h
@@ -254,6 +254,11 @@ typedef struct {
 	iptr handle;
 } FileWatchContext;

+typedef struct {
+	void *region;
+	iptr  os_context;
+} SharedMemoryRegion;
+
 #define OS_ALLOC_ARENA_FN(name) Arena name(Arena old, iz capacity)
 typedef OS_ALLOC_ARENA_FN(os_alloc_arena_fn);

@@ -291,16 +296,23 @@ typedef OS_WRITE_FILE_FN(os_write_file_fn);
 #define OS_THREAD_ENTRY_POINT_FN(name) iptr name(iptr _ctx)
 typedef OS_THREAD_ENTRY_POINT_FN(os_thread_entry_point_fn);

+#define OS_SHARED_MEMORY_LOCK_REGION_FN(name) b32 name(SharedMemoryRegion *sm, i32 *locks, i32 lock_index, i32 timeout_ms)
+typedef OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock_fn);
+
+#define OS_SHARED_MEMORY_UNLOCK_REGION_FN(name) void name(SharedMemoryRegion *sm, i32 *locks, i32 lock_index)
+typedef OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock_fn);
+
 #define OS_FNS \
-	X(add_file_watch)  \
-	X(alloc_arena)     \
-	X(close)           \
-	X(open_for_write)  \
-	X(read_file)       \
-	X(read_whole_file) \
-	X(wait_on_value)   \
-	X(wake_waiters)    \
-	X(write_new_file)  \
+	X(add_file_watch)              \
+	X(alloc_arena)                 \
+	X(close)                       \
+	X(open_for_write)              \
+	X(read_file)                   \
+	X(read_whole_file)             \
+	X(shared_memory_region_lock)   \
+	X(shared_memory_region_unlock) \
+	X(wake_waiters)                \
+	X(write_new_file)              \
 	X(write_file)

 #define RENDERDOC_GET_API_FN(name) b32 name(u32 version, void **out_api)
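
[editor's note] For downstream users, the visible change is the library entry
points in helpers/ogl_beamformer_lib_base.h: beamformer_start_compute() now
takes a timeout instead of an image plane tag, and
beamformer_push_data_with_compute() bundles the RF upload with a compute
dispatch. A usage sketch (the buffer, sizes, and 100 ms timeouts are
placeholder values; error handling elided):

    #include <stdint.h>
    #include "ogl_beamformer_lib_base.h"

    void push_one_frame(void *rf_data, uint32_t rf_size, uint32_t plane_tag)
    {
        /* upload RF data and queue an indirect compute dispatch for this plane */
        if (beamformer_push_data_with_compute(rf_data, rf_size, plane_tag, 100))
            return;

        /* fallback: push the data alone, then explicitly start compute,
         * waiting up to 100 ms for the beamformer to pick it up */
        if (beamformer_push_data(rf_data, rf_size, 100))
            beamformer_start_compute(100);
    }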