Commit: c0a61b78d170bfec84aa9052ddf4e2010ba79afb
Parent: 8e1243fc6309f0793b5c8d814ba79f6dffda8386
Author: Randy Palamar
Date: Tue, 10 Jun 2025 10:54:28 -0600
core/lib: rework library syncing and locking
this removes the trivial ways of deadlocking the library while
also greatly improving throughput. for a particular dataset on my
computer the beamformer would previously achieve ~1.3GB/s now it
is consistently ~2.1GB/s
this also transitions w32 to using w32 semaphores so that
timeouts work on w32
Diffstat:
14 files changed, 391 insertions(+), 258 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,6 +1,5 @@
/* See LICENSE for license details. */
/* TODO(rnp):
- * [ ]: refactor: BeamformGPUComputeContext
* [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
* [ ]: reinvestigate ring buffer raw_data_ssbo
* - to minimize latency the main thread should manage the subbuffer upload so that the
@@ -10,6 +9,14 @@
* can overwrite one while the other is in use.
* - make use of glFenceSync to guard buffer uploads
* [ ]: BeamformWorkQueue -> BeamformerWorkQueue
+ * [ ]: bug: re-beamform on shader reload
+ * [ ]: need to keep track of gpu memory in some way
+ * - want to be able to store more than 16 2D frames but limit 3D frames
+ * - maybe keep track of how much gpu memory is committed for beamformed images
+ * and use that to determine when to loop back over existing textures
+ * - to do this maybe use a circular linked list instead of a flat array
+ * - then have a way of querying how many frames are available for a specific point count
+ * [ ]: bug: reinit cuda on hot-reload
*/
#include "beamformer.h"
@@ -111,7 +118,7 @@ function void
alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
{
ComputeShaderCtx *cs = &ctx->csctx;
- BeamformerParameters *bp = &ctx->shared_memory->parameters;
+ BeamformerParameters *bp = &((BeamformerSharedMemory *)ctx->shared_memory.region)->parameters;
cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
cs->rf_raw_size = rf_raw_size;
@@ -161,6 +168,7 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag pl
u32 frame_id = atomic_add_u32(&ctx->next_render_frame_index, 1);
u32 frame_index = frame_id % countof(ctx->beamform_frames);
work->type = BW_COMPUTE;
+ work->lock = BeamformerSharedMemoryLockKind_DispatchCompute;
work->frame = ctx->beamform_frames + frame_index;
work->frame->ready_to_present = 0;
work->frame->frame.id = frame_id;
@@ -270,7 +278,8 @@ compute_cursor_finished(struct compute_cursor *cursor)
function void
do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ShaderKind shader)
{
- ComputeShaderCtx *csctx = &ctx->csctx;
+ ComputeShaderCtx *csctx = &ctx->csctx;
+ BeamformerSharedMemory *sm = ctx->shared_memory.region;
glUseProgram(csctx->programs[shader]);
@@ -364,10 +373,10 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame,
aframe->frame.id = ctx->averaged_frame_index;
/* TODO(rnp): hack we need a better way of specifying which frames to sum;
* this is fine for rolling averaging but what if we want to do something else */
- ASSERT(frame >= ctx->beamform_frames);
- ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
+ assert(frame >= ctx->beamform_frames);
+ assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames));
u32 base_index = (u32)(frame - ctx->beamform_frames);
- u32 to_average = ctx->shared_memory->parameters.output_points[3];
+ u32 to_average = sm->parameters.output_points[3];
u32 frame_count = 0;
u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
@@ -482,11 +491,11 @@ reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extr
}
function void
-complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
+complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context)
{
ComputeShaderCtx *cs = &ctx->csctx;
- BeamformerParameters *bp = &ctx->shared_memory->parameters;
- BeamformerSharedMemory *sm = ctx->shared_memory;
+ BeamformerSharedMemory *sm = ctx->shared_memory.region;
+ BeamformerParameters *bp = &sm->parameters;
BeamformWork *work = beamform_work_queue_pop(q);
while (work) {
@@ -517,7 +526,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
}
} break;
case BW_UPLOAD_BUFFER: {
- assert(!atomic_load_u32((i32 *)(barrier_offset + work->completion_barrier)));
+ ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1);
BeamformerUploadContext *uc = &work->upload_context;
u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
switch (uc->kind) {
@@ -541,7 +550,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
tex_element_count = ARRAY_COUNT(sm->sparse_elements);
} break;
case BU_KIND_PARAMETERS: {
- ctx->ui_read_params = barrier_offset != 0;
+ ctx->ui_read_params = ctx->beamform_work_queue != q;
buffer = cs->shared_ubo;
} break;
case BU_KIND_RF_DATA: {
@@ -564,8 +573,26 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
glNamedBufferSubData(buffer, 0, uc->size,
(u8 *)sm + uc->shared_memory_offset);
}
+ ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
} break;
- case BW_COMPUTE: {
+ case BW_COMPUTE_INDIRECT:{
+ fill_frame_compute_work(ctx, work, work->compute_indirect_plane);
+ DEBUG_DECL(work->type = BW_COMPUTE_INDIRECT;)
+ } /* FALLTHROUGH */
+ case BW_COMPUTE:{
+ /* NOTE(rnp): debug: here it is not a bug to release the lock if it
+ * isn't held but elswhere it is */
+ DEBUG_DECL(if (sm->locks[work->lock])) {
+ ctx->os.shared_memory_region_unlock(&ctx->shared_memory,
+ sm->locks, work->lock);
+ }
+ atomic_store_u32(&ctx->starting_compute, 0);
+
+ if (cs->shared_ubo_dirty) {
+ glNamedBufferSubData(cs->shared_ubo, 0, sizeof(sm->parameters), &sm->parameters);
+ cs->shared_ubo_dirty = 0;
+ }
+
atomic_store_u32(&cs->processing_compute, 1);
start_renderdoc_capture(gl_context);
@@ -640,14 +667,10 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
INVALID_CODE_PATH;
}
} break;
- default: INVALID_CODE_PATH; break;
+ InvalidDefaultCase;
}
if (can_commit) {
- if (work->completion_barrier) {
- i32 *value = (i32 *)(barrier_offset + work->completion_barrier);
- ctx->os.wake_waiters(value);
- }
beamform_work_queue_pop_commit(q);
work = beamform_work_queue_pop(q);
}
@@ -657,7 +680,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
{
BeamformerCtx *ctx = (BeamformerCtx *)user_context;
- BeamformerSharedMemory *sm = ctx->shared_memory;
+ BeamformerSharedMemory *sm = ctx->shared_memory.region;
ComputeShaderCtx *cs = &ctx->csctx;
glCreateBuffers(1, &cs->shared_ubo);
@@ -678,9 +701,10 @@ DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
{
- BeamformerCtx *ctx = (BeamformerCtx *)user_context;
- complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory);
- complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0);
+ BeamformerCtx *ctx = (BeamformerCtx *)user_context;
+ BeamformerSharedMemory *sm = ctx->shared_memory.region;
+ complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
+ complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
}
#include "ui.c"
@@ -700,49 +724,40 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
DEBUG_DECL(end_frame_capture = ctx->os.end_frame_capture);
}
- BeamformerParameters *bp = &ctx->shared_memory->parameters;
- if (ctx->shared_memory->dispatch_compute_sync) {
- ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane;
- atomic_store_u32(&ctx->shared_memory->dispatch_compute_sync, 0);
- BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
- if (work) {
- if (fill_frame_compute_work(ctx, work, current_plane))
+ BeamformerSharedMemory *sm = ctx->shared_memory.region;
+ BeamformerParameters *bp = &sm->parameters;
+ if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && !ctx->starting_compute) {
+ if (sm->start_compute_from_main) {
+ BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
+ ImagePlaneTag tag = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
+ if (fill_frame_compute_work(ctx, work, tag)) {
beamform_work_queue_push_commit(ctx->beamform_work_queue);
-
- if (ctx->shared_memory->export_next_frame) {
- BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
- if (export) {
- /* TODO: we don't really want the beamformer opening/closing files */
- iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
- export->type = BW_SAVE_FRAME;
- export->output_frame_ctx.file_handle = f;
- if (bp->output_points[3] > 1) {
- u32 a_index = !(ctx->averaged_frame_index %
- ARRAY_COUNT(ctx->averaged_frames));
- BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
- export->output_frame_ctx.frame = aframe;
- } else {
- export->output_frame_ctx.frame = work->frame;
+ if (sm->export_next_frame) {
+ BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
+ if (export) {
+ /* TODO: we don't really want the beamformer opening/closing files */
+ iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
+ export->type = BW_SAVE_FRAME;
+ export->output_frame_ctx.file_handle = f;
+ if (bp->output_points[3] > 1) {
+ static_assert(countof(ctx->averaged_frames) == 2,
+ "fix this, we assume average frame ping pong buffer");
+ u32 a_index = !(ctx->averaged_frame_index %
+ countof(ctx->averaged_frames));
+ BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
+ export->output_frame_ctx.frame = aframe;
+ } else {
+ export->output_frame_ctx.frame = work->frame;
+ }
+ beamform_work_queue_push_commit(ctx->beamform_work_queue);
}
- beamform_work_queue_push_commit(ctx->beamform_work_queue);
+ sm->export_next_frame = 0;
}
- ctx->shared_memory->export_next_frame = 0;
- }
-
- ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
- }
- }
-
- if (ctx->start_compute) {
- if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
- BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
- ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
- if (fill_frame_compute_work(ctx, work, plane)) {
- beamform_work_queue_push_commit(ctx->beamform_work_queue);
- ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
- ctx->start_compute = 0;
}
+ atomic_store_u32(&sm->start_compute_from_main, 0);
}
+ atomic_store_u32(&ctx->starting_compute, 1);
+ ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
}
ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index,
@@ -754,14 +769,9 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
}
}
- if (ctx->start_compute) {
- ctx->start_compute = 0;
- ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
- }
-
BeamformComputeFrame *frame_to_draw;
if (bp->output_points[3] > 1) {
- u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
+ u32 a_index = !(ctx->averaged_frame_index % countof(ctx->averaged_frames));
frame_to_draw = ctx->averaged_frames + a_index;
} else {
frame_to_draw = ctx->beamform_frames + ctx->display_frame_index;
diff --git a/beamformer.h b/beamformer.h
@@ -87,6 +87,7 @@ typedef struct {
u32 raw_data_ssbo;
u32 shared_ubo;
+ b32 shared_ubo_dirty;
u32 channel_mapping_texture;
u32 sparse_elements_texture;
@@ -171,7 +172,7 @@ typedef struct {
GLParams gl;
uv2 window_size;
- b32 start_compute;
+ b32 starting_compute;
b32 should_exit;
Arena ui_backing_store;
@@ -201,7 +202,7 @@ typedef struct {
BeamformWorkQueue *beamform_work_queue;
- BeamformerSharedMemory *shared_memory;
+ SharedMemoryRegion shared_memory;
} BeamformerCtx;
struct ShaderReloadContext {
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -7,8 +7,6 @@
* programatically would be nice.
*/
-#define BEAMFORMER_PARAMETERS_VERSION (2UL)
-
/* X(enumarant, number, shader file name, needs header, pretty name) */
#define COMPUTE_SHADERS \
X(CudaDecode, 0, "", 0, "CUDA Decode") \
diff --git a/beamformer_work_queue.c b/beamformer_work_queue.c
@@ -50,19 +50,3 @@ DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit)
{
atomic_add_u64(&q->queue, 1);
}
-
-function b32
-try_wait_sync(i32 *sync, i32 timeout_ms, os_wait_on_value_fn *os_wait_on_value)
-{
- b32 result = 0;
- for (;;) {
- i32 current = atomic_load_u32(sync);
- if (current && atomic_cas_u32(sync, &current, 0)) {
- result = 1;
- break;
- }
- if (!timeout_ms || !os_wait_on_value(sync, 0, timeout_ms))
- break;
- }
- return result;
-}
diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h
@@ -1,12 +1,20 @@
/* See LICENSE for license details. */
+/* TODO(rnp):
+ * [ ]: coalesce uploads if they are overwriting exist data
+ * - use a flag field and only submit a new work if the corresponding flag is clear
+ */
+
#ifndef _BEAMFORMER_WORK_QUEUE_H_
#define _BEAMFORMER_WORK_QUEUE_H_
+#define BEAMFORMER_SHARED_MEMORY_VERSION (3UL)
+
typedef struct BeamformComputeFrame BeamformComputeFrame;
typedef struct ShaderReloadContext ShaderReloadContext;
typedef enum {
BW_COMPUTE,
+ BW_COMPUTE_INDIRECT,
BW_RELOAD_SHADER,
BW_SAVE_FRAME,
BW_SEND_FRAME,
@@ -33,6 +41,21 @@ typedef struct {
iptr file_handle;
} BeamformOutputFrameContext;
+#define BEAMFORMER_SHARED_MEMORY_LOCKS \
+ X(None) \
+ X(Parameters) \
+ X(ParametersHead) \
+ X(ParametersUI) \
+ X(FocalVectors) \
+ X(ChannelMapping) \
+ X(SparseElements) \
+ X(RawData) \
+ X(DispatchCompute)
+
+#define X(name) BeamformerSharedMemoryLockKind_##name,
+typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Count} BeamformerSharedMemoryLockKind;
+#undef X
+
/* NOTE: discriminated union based on type */
typedef struct {
union {
@@ -40,11 +63,10 @@ typedef struct {
BeamformerUploadContext upload_context;
BeamformOutputFrameContext output_frame_ctx;
ShaderReloadContext *shader_reload_context;
+ ImagePlaneTag compute_indirect_plane;
void *generic;
};
- /* NOTE(rnp): mostly for __external__ processes to sync on. when passed from external
- * process this should be an offset from base of shared_memory */
- iptr completion_barrier;
+ BeamformerSharedMemoryLockKind lock;
BeamformWorkType type;
} BeamformWork;
@@ -68,9 +90,15 @@ typedef BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit_fn);
- (uintptr_t)(sizeof(BeamformerSharedMemory) & 4095ULL))
#define BEAMFORMER_MAX_RF_DATA_SIZE (BEAMFORMER_SHARED_MEMORY_SIZE - BEAMFORMER_RF_DATA_OFF)
-typedef struct {
+typedef align_as(64) struct {
+ u32 version;
+
+ /* NOTE(rnp): not used for locking on w32 but we can use these to peek at the status of
+ * the lock without leaving userspace. also this struct needs a bunch of padding */
+ i32 locks[BeamformerSharedMemoryLockKind_Count];
+
/* NOTE(rnp): interleaved transmit angle, focal depth pairs */
- _Alignas(64) v2 focal_vectors[256];
+ align_as(64) v2 focal_vectors[256];
i16 channel_mapping[256];
i16 sparse_elements[256];
@@ -87,21 +115,12 @@ typedef struct {
ComputeShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES];
u32 compute_stages_count;
- i32 parameters_sync;
- i32 parameters_head_sync;
- i32 parameters_ui_sync;
- i32 focal_vectors_sync;
- i32 channel_mapping_sync;
- i32 sparse_elements_sync;
- i32 raw_data_sync;
-
- i32 dispatch_compute_sync;
- ImagePlaneTag current_image_plane;
+ /* TODO(rnp): hack: we need a different way of dispatching work for export */
+ b32 start_compute_from_main;
/* TODO(rnp): this shouldn't be needed */
b32 export_next_frame;
- u32 version;
BeamformWorkQueue external_work_queue;
} BeamformerSharedMemory;
diff --git a/build.c b/build.c
@@ -272,7 +272,6 @@ W32(b32) CreateProcessA(u8 *, u8 *, iptr, iptr, b32, u32, iptr, u8 *, iptr, iptr
W32(b32) GetExitCodeProcess(iptr handle, u32 *);
W32(b32) GetFileTime(iptr, iptr, iptr, iptr);
W32(b32) MoveFileExA(c8 *, c8 *, u32);
-W32(u32) WaitForSingleObject(iptr, u32);
function void
os_make_directory(char *name)
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -8,6 +8,7 @@
#define PIPE_RETRY_PERIOD_MS (100ULL)
+global SharedMemoryRegion g_shared_memory;
global BeamformerSharedMemory *g_bp;
global BeamformerLibErrorKind g_lib_last_error;
@@ -72,15 +73,14 @@ os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms)
return total_read == read_size;
}
-static BeamformerSharedMemory *
+function SharedMemoryRegion
os_open_shared_memory_area(char *name)
{
- BeamformerSharedMemory *result = 0;
+ SharedMemoryRegion result = {0};
i32 fd = shm_open(name, O_RDWR, S_IRUSR|S_IWUSR);
if (fd > 0) {
void *new = mmap(0, BEAMFORMER_SHARED_MEMORY_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
- if (new != MAP_FAILED)
- result = new;
+ if (new != MAP_FAILED) result.region = new;
close(fd);
}
return result;
@@ -88,15 +88,6 @@ os_open_shared_memory_area(char *name)
#elif OS_WINDOWS
-/* TODO(rnp): temporary workaround */
-function OS_WAIT_ON_VALUE_FN(os_wait_on_value_stub)
-{
- /* TODO(rnp): this doesn't work across processes on win32 (return 1 to cause a spin wait) */
- return 1;
- return WaitOnAddress(value, &current, sizeof(*value), timeout_ms);
-}
-#define os_wait_on_value os_wait_on_value_stub
-
static Pipe
os_open_read_pipe(char *name)
{
@@ -145,16 +136,35 @@ os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms)
return total_read == read_size;
}
-function BeamformerSharedMemory *
+function SharedMemoryRegion
os_open_shared_memory_area(char *name)
{
- BeamformerSharedMemory *result = 0;
+ SharedMemoryRegion result = {0};
iptr h = OpenFileMappingA(FILE_MAP_ALL_ACCESS, 0, name);
if (h != INVALID_FILE) {
- result = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, BEAMFORMER_SHARED_MEMORY_SIZE);
+ void *new = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0,
+ os_round_up_to_page_size(BEAMFORMER_SHARED_MEMORY_SIZE));
+ if (new) {
+ u8 buffer[1024];
+ Stream sb = {.data = buffer, .cap = 1024};
+ stream_append_s8s(&sb, c_str_to_s8(name), s8("_lock_"));
+ local_persist iptr semaphores[BeamformerSharedMemoryLockKind_Count];
+ local_persist w32_shared_memory_context ctx = {.semaphores = semaphores};
+ b32 all_semaphores = 1;
+ for (i32 i = 0; i < countof(semaphores); i++) {
+ Stream lb = sb;
+ stream_append_i64(&lb, i);
+ stream_append_byte(&lb, 0);
+ semaphores[i] = CreateSemaphoreA(0, 1, 1, (c8 *)lb.data);
+ all_semaphores &= semaphores[i] != INVALID_FILE;
+ }
+ if (all_semaphores) {
+ result.region = new;
+ result.os_context = (iptr)&ctx;
+ }
+ }
CloseHandle(h);
}
-
return result;
}
@@ -164,16 +174,19 @@ function b32
check_shared_memory(void)
{
b32 result = 1;
- if (!g_bp) {
- g_bp = os_open_shared_memory_area(OS_SHARED_MEMORY_NAME);
- if (!g_bp) {
+ if (!g_shared_memory.region) {
+ g_shared_memory = os_open_shared_memory_area(OS_SHARED_MEMORY_NAME);
+ if (!g_shared_memory.region) {
result = 0;
g_lib_last_error = BF_LIB_ERR_KIND_SHARED_MEMORY;
}
- } else if (g_bp->version != BEAMFORMER_PARAMETERS_VERSION) {
+ } else if (((BeamformerSharedMemory *)g_shared_memory.region)->version
+ != BEAMFORMER_SHARED_MEMORY_VERSION)
+ {
g_lib_last_error = BF_LIB_ERR_KIND_VERSION_MISMATCH;
result = 0;
}
+ if (result) g_bp = g_shared_memory.region;
return result;
}
@@ -186,17 +199,23 @@ try_push_work_queue(void)
}
function b32
-lib_try_wait_sync(i32 *sync, i32 timeout_ms, os_wait_on_value_fn *os_wait_on_value)
+lib_try_lock(BeamformerSharedMemoryLockKind lock, i32 timeout_ms)
{
- b32 result = try_wait_sync(sync, timeout_ms, os_wait_on_value);
+ b32 result = os_shared_memory_region_lock(&g_shared_memory, g_bp->locks, (i32)lock, timeout_ms);
if (!result) g_lib_last_error = BF_LIB_ERR_KIND_SYNC_VARIABLE;
return result;
}
+function void
+lib_release_lock(BeamformerSharedMemoryLockKind lock)
+{
+ os_shared_memory_region_unlock(&g_shared_memory, g_bp->locks, (i32)lock);
+}
+
u32
beamformer_get_api_version(void)
{
- return BEAMFORMER_PARAMETERS_VERSION;
+ return BEAMFORMER_SHARED_MEMORY_VERSION;
}
const char *
@@ -245,64 +264,60 @@ set_beamformer_pipeline(i32 *stages, i32 stages_count)
}
b32
-beamformer_start_compute(u32 image_plane_tag)
+beamformer_start_compute(i32 timeout_ms)
{
b32 result = 0;
- if (image_plane_tag < IPT_LAST) {
- if (check_shared_memory()) {
- if (atomic_load_u32(&g_bp->dispatch_compute_sync) == 0) {
- g_bp->current_image_plane = image_plane_tag;
- atomic_store_u32(&g_bp->dispatch_compute_sync, 1);
+ if (check_shared_memory()) {
+ if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) {
+ if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, timeout_ms)) {
+ lib_release_lock(BeamformerSharedMemoryLockKind_DispatchCompute);
result = 1;
- } else {
- g_lib_last_error = BF_LIB_ERR_KIND_SYNC_VARIABLE;
}
}
- } else {
- g_lib_last_error = BF_LIB_ERR_KIND_INVALID_IMAGE_PLANE;
}
return result;
}
function b32
-beamformer_upload_buffer(void *data, u32 size, i32 store_offset, i32 sync_offset,
+beamformer_upload_buffer(void *data, u32 size, i32 store_offset, BeamformerSharedMemoryLockKind lock,
BeamformerUploadKind kind, i32 timeout_ms)
{
b32 result = 0;
if (check_shared_memory()) {
BeamformWork *work = try_push_work_queue();
- result = work && lib_try_wait_sync((i32 *)((u8 *)g_bp + sync_offset), timeout_ms, os_wait_on_value);
+ result = work && lib_try_lock(lock, timeout_ms);
if (result) {
BeamformerUploadContext *uc = &work->upload_context;
uc->shared_memory_offset = store_offset;
uc->size = size;
uc->kind = kind;
work->type = BW_UPLOAD_BUFFER;
- work->completion_barrier = sync_offset;
+ work->lock = lock;
mem_copy((u8 *)g_bp + store_offset, data, size);
beamform_work_queue_push_commit(&g_bp->external_work_queue);
+ lib_release_lock(lock);
}
}
return result;
}
#define BEAMFORMER_UPLOAD_FNS \
- X(channel_mapping, i16, 1, CHANNEL_MAPPING) \
- X(sparse_elements, i16, 1, SPARSE_ELEMENTS) \
- X(focal_vectors, f32, 2, FOCAL_VECTORS)
+ X(channel_mapping, i16, 1, ChannelMapping, CHANNEL_MAPPING) \
+ X(sparse_elements, i16, 1, SparseElements, SPARSE_ELEMENTS) \
+ X(focal_vectors, f32, 2, FocalVectors, FOCAL_VECTORS)
-#define X(name, dtype, elements, command) \
+#define X(name, dtype, elements, lock_name, command) \
b32 beamformer_push_##name (dtype *data, u32 count, i32 timeout_ms) { \
- b32 result = 0; \
- if (count <= countof(g_bp->name)) { \
- result = beamformer_upload_buffer(data, count * elements * sizeof(dtype), \
- offsetof(BeamformerSharedMemory, name), \
- offsetof(BeamformerSharedMemory, name##_sync), \
- BU_KIND_##command, timeout_ms); \
- } else { \
- g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW; \
- } \
- return result; \
+ b32 result = 0; \
+ if (count <= countof(g_bp->name)) { \
+ result = beamformer_upload_buffer(data, count * elements * sizeof(dtype), \
+ offsetof(BeamformerSharedMemory, name), \
+ BeamformerSharedMemoryLockKind_##lock_name, \
+ BU_KIND_##command, timeout_ms); \
+ } else { \
+ g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW; \
+ } \
+ return result; \
}
BEAMFORMER_UPLOAD_FNS
#undef X
@@ -312,19 +327,20 @@ beamformer_push_parameters(BeamformerParameters *bp, i32 timeout_ms)
{
b32 result = beamformer_upload_buffer(bp, sizeof(*bp),
offsetof(BeamformerSharedMemory, parameters),
- offsetof(BeamformerSharedMemory, parameters_sync),
+ BeamformerSharedMemoryLockKind_Parameters,
BU_KIND_PARAMETERS, timeout_ms);
return result;
}
-b32
-beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
+function b32
+beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, b32 start_from_main)
{
b32 result = 0;
if (data_size <= BEAMFORMER_MAX_RF_DATA_SIZE) {
result = beamformer_upload_buffer(data, data_size, BEAMFORMER_RF_DATA_OFF,
- offsetof(BeamformerSharedMemory, raw_data_sync),
+ BeamformerSharedMemoryLockKind_RawData,
BU_KIND_RF_DATA, timeout_ms);
+ if (result && start_from_main) atomic_store_u32(&g_bp->start_compute_from_main, 1);
} else {
g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW;
}
@@ -332,21 +348,49 @@ beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
}
b32
+beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
+{
+ return beamformer_push_data_base(data, data_size, timeout_ms, 1);
+}
+
+b32
+beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag, i32 timeout_ms)
+{
+ b32 result = beamformer_push_data_base(data, data_size, timeout_ms, 0);
+ if (result) {
+ result = image_plane_tag < IPT_LAST;
+ if (result) {
+ BeamformWork *work = try_push_work_queue();
+ result = work != 0;
+ if (result) {
+ work->type = BW_COMPUTE_INDIRECT;
+ work->compute_indirect_plane = image_plane_tag;
+ beamform_work_queue_push_commit(&g_bp->external_work_queue);
+ }
+ } else {
+ g_lib_last_error = BF_LIB_ERR_KIND_INVALID_IMAGE_PLANE;
+ }
+ }
+ return result;
+}
+
+b32
beamformer_push_parameters_ui(BeamformerUIParameters *bp, i32 timeout_ms)
{
b32 result = 0;
if (check_shared_memory()) {
BeamformWork *work = try_push_work_queue();
- result = work && lib_try_wait_sync(&g_bp->parameters_ui_sync, timeout_ms, os_wait_on_value);
+ result = work && lib_try_lock(BeamformerSharedMemoryLockKind_ParametersUI, timeout_ms);
if (result) {
BeamformerUploadContext *uc = &work->upload_context;
uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
uc->size = sizeof(g_bp->parameters);
uc->kind = BU_KIND_PARAMETERS;
work->type = BW_UPLOAD_BUFFER;
- work->completion_barrier = offsetof(BeamformerSharedMemory, parameters_ui_sync);
+ work->lock = BeamformerSharedMemoryLockKind_ParametersUI;
mem_copy(&g_bp->parameters_ui, bp, sizeof(*bp));
beamform_work_queue_push_commit(&g_bp->external_work_queue);
+ lib_release_lock(BeamformerSharedMemoryLockKind_ParametersUI);
}
}
return result;
@@ -358,16 +402,17 @@ beamformer_push_parameters_head(BeamformerParametersHead *bp, i32 timeout_ms)
b32 result = 0;
if (check_shared_memory()) {
BeamformWork *work = try_push_work_queue();
- result = work && lib_try_wait_sync(&g_bp->parameters_head_sync, timeout_ms, os_wait_on_value);
+ result = work && lib_try_lock(BeamformerSharedMemoryLockKind_ParametersHead, timeout_ms);
if (result) {
BeamformerUploadContext *uc = &work->upload_context;
uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
uc->size = sizeof(g_bp->parameters);
uc->kind = BU_KIND_PARAMETERS;
work->type = BW_UPLOAD_BUFFER;
- work->completion_barrier = offsetof(BeamformerSharedMemory, parameters_head_sync);
+ work->lock = BeamformerSharedMemoryLockKind_ParametersHead;
mem_copy(&g_bp->parameters_head, bp, sizeof(*bp));
beamform_work_queue_push_commit(&g_bp->external_work_queue);
+ lib_release_lock(BeamformerSharedMemoryLockKind_ParametersHead);
}
}
return result;
@@ -393,14 +438,8 @@ b32
send_data(void *data, u32 data_size)
{
b32 result = 0;
- if (beamformer_push_data(data, data_size, 0)) {
- result = beamformer_start_compute(0);
- if (result) {
- /* TODO(rnp): should we just set timeout on acquiring the lock instead of this? */
- try_wait_sync(&g_bp->raw_data_sync, -1, os_wait_on_value);
- atomic_store_u32(&g_bp->raw_data_sync, 1);
- }
- }
+ if (beamformer_push_data(data, data_size, 0))
+ result = beamformer_start_compute(-1);
return result;
}
diff --git a/helpers/ogl_beamformer_lib_base.h b/helpers/ogl_beamformer_lib_base.h
@@ -38,10 +38,12 @@ LIB_FN uint32_t send_data(void *data, uint32_t data_size);
LIB_FN uint32_t beamform_data_synchronized(void *data, uint32_t data_size, uint32_t output_points[3],
float *out_data, int32_t timeout_ms);
-LIB_FN uint32_t beamformer_start_compute(uint32_t image_plane_tag);
+/* NOTE: tells the beamformer to start beamforming and waits until it starts or for timeout_ms */
+LIB_FN uint32_t beamformer_start_compute(int32_t timeout_ms);
/* NOTE: these functions only queue an upload; you must flush (old data functions or start_compute) */
LIB_FN uint32_t beamformer_push_data(void *data, uint32_t size, int32_t timeout_ms);
+LIB_FN uint32_t beamformer_push_data_with_compute(void *data, uint32_t size, uint32_t image_plane_tag, int32_t timeout_ms);
LIB_FN uint32_t beamformer_push_channel_mapping(int16_t *mapping, uint32_t count, int32_t timeout_ms);
LIB_FN uint32_t beamformer_push_sparse_elements(int16_t *elements, uint32_t count, int32_t timeout_ms);
LIB_FN uint32_t beamformer_push_focal_vectors(float *vectors, uint32_t count, int32_t timeout_ms);
diff --git a/os_linux.c b/os_linux.c
@@ -83,24 +83,25 @@ os_get_timer_counter(void)
return result;
}
+function iz
+os_round_up_to_page_size(iz value)
+{
+ iz result = round_up_to(value, sysconf(_SC_PAGESIZE));
+ return result;
+}
+
function OS_ALLOC_ARENA_FN(os_alloc_arena)
{
- Arena result;
- iz pagesize = sysconf(_SC_PAGESIZE);
- if (capacity % pagesize != 0)
- capacity += pagesize - capacity % pagesize;
-
- iz oldsize = old.end - old.beg;
- if (oldsize > capacity)
- return old;
-
- if (old.beg)
- munmap(old.beg, oldsize);
-
- result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
- if (result.beg == MAP_FAILED)
- os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
- result.end = result.beg + capacity;
+ Arena result = old;
+ capacity = os_round_up_to_page_size(capacity);
+ iz old_size = old.end - old.beg;
+ if (old_size < capacity) {
+ if (old.beg) munmap(old.beg, old_size);
+ result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ if (result.beg == MAP_FAILED)
+ os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
+ result.end = result.beg + capacity;
+ }
return result;
}
@@ -163,15 +164,15 @@ function OS_READ_FILE_FN(os_read_file)
return total_read;
}
-function void *
-os_create_shared_memory_area(char *name, iz cap)
+function SharedMemoryRegion
+os_create_shared_memory_area(Arena *arena, char *name, i32 lock_count, iz requested_capacity)
{
- void *result = 0;
+ iz capacity = os_round_up_to_page_size(requested_capacity);
+ SharedMemoryRegion result = {0};
i32 fd = shm_open(name, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
- if (fd > 0 && ftruncate(fd, cap) != -1) {
- void *new = mmap(NULL, cap, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
- if (new != MAP_FAILED)
- result = new;
+ if (fd > 0 && ftruncate(fd, capacity) != -1) {
+ void *new = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ if (new != MAP_FAILED) result.region = new;
}
if (fd > 0) close(fd);
return result;
@@ -294,7 +295,30 @@ function OS_WAIT_ON_VALUE_FN(os_wait_on_value)
function OS_WAKE_WAITERS_FN(os_wake_waiters)
{
if (sync) {
- atomic_store_u32(sync, 1);
+ atomic_store_u32(sync, 0);
syscall(SYS_futex, sync, FUTEX_WAKE, I32_MAX, 0, 0, 0);
}
}
+
+function OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock)
+{
+ b32 result = 0;
+ for (;;) {
+ i32 current = atomic_load_u32(locks + lock_index);
+ if (current == 0 && atomic_cas_u32(locks + lock_index, &current, 1)) {
+ result = 1;
+ break;
+ }
+ if (!timeout_ms || !os_wait_on_value(locks + lock_index, current, timeout_ms))
+ break;
+ }
+ return result;
+}
+
+function OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock)
+{
+ i32 *lock = locks + lock_index;
+ assert(atomic_load_u32(lock));
+ atomic_store_u32(lock, 0);
+ os_wake_waiters(lock);
+}
diff --git a/os_win32.c b/os_win32.c
@@ -83,12 +83,17 @@ typedef struct {
iptr context;
} w32_io_completion_event;
+typedef struct {
+ iptr *semaphores;
+} w32_shared_memory_context;
+
#define W32(r) __declspec(dllimport) r __stdcall
W32(b32) CloseHandle(iptr);
W32(b32) CopyFileA(c8 *, c8 *, b32);
W32(iptr) CreateFileA(c8 *, u32, u32, void *, u32, u32, void *);
W32(iptr) CreateFileMappingA(iptr, void *, u32, u32, u32, c8 *);
W32(iptr) CreateIoCompletionPort(iptr, iptr, uptr, u32);
+W32(iptr) CreateSemaphoreA(iptr, i32, i32, c8 *);
W32(iptr) CreateThread(iptr, uz, iptr, iptr, u32, u32 *);
W32(b32) DeleteFileA(c8 *);
W32(void) ExitProcess(i32);
@@ -107,8 +112,9 @@ W32(b32) QueryPerformanceCounter(u64 *);
W32(b32) QueryPerformanceFrequency(u64 *);
W32(b32) ReadDirectoryChangesW(iptr, u8 *, u32, b32, u32, u32 *, void *, void *);
W32(b32) ReadFile(iptr, u8 *, i32, i32 *, void *);
-W32(b32) ReleaseSemaphore(iptr, i64, i64 *);
+W32(b32) ReleaseSemaphore(iptr, i32, i32 *);
W32(i32) SetThreadDescription(iptr, u16 *);
+W32(u32) WaitForSingleObject(iptr, u32);
W32(b32) WaitOnAddress(void *, void *, uz, u32);
W32(i32) WakeByAddressAll(void *);
W32(iptr) wglGetProcAddress(c8 *);
@@ -168,10 +174,9 @@ os_get_timer_counter(void)
return result;
}
-function OS_ALLOC_ARENA_FN(os_alloc_arena)
+function iz
+os_round_up_to_page_size(iz value)
{
- Arena result = old;
-
struct {
u16 architecture;
u16 _pad1;
@@ -185,17 +190,18 @@ function OS_ALLOC_ARENA_FN(os_alloc_arena)
u16 processor_level;
u16 processor_revision;
} info;
-
GetSystemInfo(&info);
+ iz result = round_up_to(value, info.page_size);
+ return result;
+}
- if (capacity % info.page_size != 0)
- capacity += (info.page_size - capacity % info.page_size);
-
+function OS_ALLOC_ARENA_FN(os_alloc_arena)
+{
+ Arena result = old;
+ capacity = os_round_up_to_page_size(capacity);
iz old_size = old.end - old.beg;
if (old_size < capacity) {
- if (old.beg)
- VirtualFree(old.beg, old_size, MEM_RELEASE);
-
+ if (old.beg) VirtualFree(old.beg, old_size, MEM_RELEASE);
result.beg = VirtualAlloc(0, capacity, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
if (!result.beg)
os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
@@ -268,13 +274,34 @@ os_file_exists(char *path)
return result;
}
-function void *
-os_create_shared_memory_area(char *name, iz cap)
+function SharedMemoryRegion
+os_create_shared_memory_area(Arena *arena, char *name, i32 lock_count, iz requested_capacity)
{
- void *result = 0;
- iptr h = CreateFileMappingA(-1, 0, PAGE_READWRITE, 0, cap, name);
- if (h != INVALID_FILE)
- result = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, cap);
+ iz capacity = os_round_up_to_page_size(requested_capacity);
+ SharedMemoryRegion result = {0};
+ iptr h = CreateFileMappingA(-1, 0, PAGE_READWRITE, 0, capacity, name);
+ if (h != INVALID_FILE) {
+ void *new = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, capacity);
+ if (new) {
+ w32_shared_memory_context *ctx = push_struct(arena, typeof(*ctx));
+ ctx->semaphores = push_array(arena, typeof(*ctx->semaphores), lock_count);
+ result.os_context = (iptr)ctx;
+ result.region = new;
+
+ Stream sb = arena_stream(*arena);
+ stream_append_s8s(&sb, c_str_to_s8(name), s8("_lock_"));
+ for (i32 i = 0; i < lock_count; i++) {
+ Stream lb = sb;
+ stream_append_i64(&lb, i);
+ stream_append_byte(&lb, 0);
+ ctx->semaphores[i] = CreateSemaphoreA(0, 1, 1, (c8 *)lb.data);
+ if (ctx->semaphores[i] == INVALID_FILE) {
+ os_fatal(s8("os_create_shared_memory_area: "
+ "failed to create semaphore\n"));
+ }
+ }
+ }
+ }
return result;
}
@@ -380,7 +407,23 @@ function OS_WAIT_ON_VALUE_FN(os_wait_on_value)
function OS_WAKE_WAITERS_FN(os_wake_waiters)
{
if (sync) {
- atomic_add_u32(sync, 1);
+ atomic_store_u32(sync, 0);
WakeByAddressAll(sync);
}
}
+
+function OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock)
+{
+ w32_shared_memory_context *ctx = (typeof(ctx))sm->os_context;
+ b32 result = !WaitForSingleObject(ctx->semaphores[lock_index], timeout_ms);
+ if (result) atomic_store_u32(locks + lock_index, 1);
+ return result;
+}
+
+function OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock)
+{
+ w32_shared_memory_context *ctx = (typeof(ctx))sm->os_context;
+ assert(atomic_load_u32(locks + lock_index));
+ os_wake_waiters(locks + lock_index);
+ ReleaseSemaphore(ctx->semaphores[lock_index], 1, 0);
+}
diff --git a/static.c b/static.c
@@ -212,7 +212,6 @@ function FILE_WATCH_CALLBACK_FN(load_cuda_lib)
return result;
}
-
#define GLFW_VISIBLE 0x00020004
void glfwWindowHint(i32, i32);
iptr glfwCreateWindow(i32, i32, char *, iptr, iptr);
@@ -229,12 +228,12 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
for (;;) {
for (;;) {
- i32 expected = 1;
- if (atomic_cas_u32(&ctx->sync_variable, &expected, 0))
+ i32 expected = 0;
+ if (atomic_cas_u32(&ctx->sync_variable, &expected, 1))
break;
ctx->asleep = 1;
- os_wait_on_value(&ctx->sync_variable, 0, -1);
+ os_wait_on_value(&ctx->sync_variable, 1, -1);
ctx->asleep = 0;
}
beamformer_complete_compute(ctx->user_context, ctx->arena, ctx->gl_context);
@@ -280,25 +279,19 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory)
ctx->beamform_work_queue = push_struct(memory, BeamformWorkQueue);
- ctx->shared_memory = os_create_shared_memory_area(OS_SHARED_MEMORY_NAME, BEAMFORMER_SHARED_MEMORY_SIZE);
- if (!ctx->shared_memory)
- os_fatal(s8("Get more ram lol\n"));
- mem_clear(ctx->shared_memory, 0, sizeof(*ctx->shared_memory));
-
- ctx->shared_memory->version = BEAMFORMER_PARAMETERS_VERSION;
- /* TODO(rnp): refactor - this is annoying */
- ctx->shared_memory->parameters_sync = 1;
- ctx->shared_memory->parameters_head_sync = 1;
- ctx->shared_memory->parameters_ui_sync = 1;
- ctx->shared_memory->raw_data_sync = 1;
- ctx->shared_memory->channel_mapping_sync = 1;
- ctx->shared_memory->sparse_elements_sync = 1;
- ctx->shared_memory->focal_vectors_sync = 1;
+ ctx->shared_memory = os_create_shared_memory_area(memory, OS_SHARED_MEMORY_NAME,
+ BeamformerSharedMemoryLockKind_Count,
+ BEAMFORMER_SHARED_MEMORY_SIZE);
+ BeamformerSharedMemory *sm = ctx->shared_memory.region;
+ if (!sm) os_fatal(s8("Get more ram lol\n"));
+ mem_clear(sm, 0, sizeof(*sm));
+
+ sm->version = BEAMFORMER_SHARED_MEMORY_VERSION;
/* NOTE: default compute shader pipeline */
- ctx->shared_memory->compute_stages[0] = ComputeShaderKind_Decode;
- ctx->shared_memory->compute_stages[1] = ComputeShaderKind_DASCompute;
- ctx->shared_memory->compute_stages_count = 2;
+ sm->compute_stages[0] = ComputeShaderKind_Decode;
+ sm->compute_stages[1] = ComputeShaderKind_DASCompute;
+ sm->compute_stages_count = 2;
if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA
&& load_cuda_lib(&ctx->os, s8(OS_CUDA_LIB_NAME), (iptr)&ctx->cuda_lib, *memory))
diff --git a/ui.c b/ui.c
@@ -1965,18 +1965,18 @@ draw_compute_stats_view(BeamformerCtx *ctx, Arena arena, ComputeShaderStats *sta
read_only local_persist s8 labels[ComputeShaderKind_Count] = {COMPUTE_SHADERS};
#undef X
+ BeamformerSharedMemory *sm = ctx->shared_memory.region;
BeamformerUI *ui = ctx->ui;
f32 compute_time_sum = 0;
- u32 stages = ctx->shared_memory->compute_stages_count;
+ u32 stages = sm->compute_stages_count;
TextSpec text_spec = {.font = &ui->font, .colour = FG_COLOUR, .flags = TF_LIMITED};
Table *table = table_new(&arena, stages + 1, 3, (TextAlignment []){TA_LEFT, TA_LEFT, TA_LEFT});
for (u32 i = 0; i < stages; i++) {
TableCell *cells = table_push_row(table, &arena, TRK_CELLS)->data;
-
Stream sb = arena_stream(arena);
- u32 index = ctx->shared_memory->compute_stages[i];
+ ShaderKind index = (ShaderKind)sm->compute_stages[i];
compute_time_sum += stats->times[index];
stream_append_f64_e(&sb, stats->times[index]);
@@ -2824,6 +2824,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw
ComputeShaderStats *latest_compute_stats)
{
BeamformerUI *ui = ctx->ui;
+ BeamformerSharedMemory *sm = ctx->shared_memory.region;
ui->latest_plane[IPT_LAST] = frame_to_draw;
ui->latest_plane[frame_plane] = frame_to_draw;
@@ -2831,7 +2832,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw
/* TODO(rnp): there should be a better way of detecting this */
if (ctx->ui_read_params) {
- mem_copy(&ui->params, &ctx->shared_memory->parameters.output_min_coordinate, sizeof(ui->params));
+ mem_copy(&ui->params, &sm->parameters.output_min_coordinate, sizeof(ui->params));
ui->flush_params = 0;
ctx->ui_read_params = 0;
}
@@ -2841,19 +2842,18 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw
ui_interact(ui, input, ctx->window_size);
if (ui->flush_params) {
+ i32 lock = BeamformerSharedMemoryLockKind_Parameters;
validate_ui_parameters(&ui->params);
- BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
- if (work && try_wait_sync(&ctx->shared_memory->parameters_sync, 0, ctx->os.wait_on_value)) {
- BeamformerUploadContext *uc = &work->upload_context;
- uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
- uc->size = sizeof(ctx->shared_memory->parameters);
- uc->kind = BU_KIND_PARAMETERS;
- work->type = BW_UPLOAD_BUFFER;
- work->completion_barrier = (iptr)&ctx->shared_memory->parameters_sync;
- mem_copy(&ctx->shared_memory->parameters_ui, &ui->params, sizeof(ui->params));
- beamform_work_queue_push_commit(ctx->beamform_work_queue);
- ui->flush_params = 0;
- ctx->start_compute = 1;
+ if (ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, lock, 0)) {
+ mem_copy(&sm->parameters_ui, &ui->params, sizeof(ui->params));
+ ui->flush_params = 0;
+ ctx->csctx.shared_ubo_dirty = 1;
+ b32 dispatch = ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks,
+ BeamformerSharedMemoryLockKind_DispatchCompute,
+ 0);
+ sm->start_compute_from_main |= dispatch &
+ ctx->beamform_frames[ctx->display_frame_index].ready_to_present;
+ ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, lock);
}
}
diff --git a/util.c b/util.c
@@ -508,6 +508,15 @@ round_down_power_of_2(u32 a)
return result;
}
+function iz
+round_up_to(iz value, iz multiple)
+{
+ iz result = value;
+ if (value % multiple != 0)
+ result += multiple - value % multiple;
+ return result;
+}
+
function b32
uv2_equal(uv2 a, uv2 b)
{
diff --git a/util.h b/util.h
@@ -254,6 +254,11 @@ typedef struct {
iptr handle;
} FileWatchContext;
+typedef struct {
+ void *region;
+ iptr os_context;
+} SharedMemoryRegion;
+
#define OS_ALLOC_ARENA_FN(name) Arena name(Arena old, iz capacity)
typedef OS_ALLOC_ARENA_FN(os_alloc_arena_fn);
@@ -291,16 +296,23 @@ typedef OS_WRITE_FILE_FN(os_write_file_fn);
#define OS_THREAD_ENTRY_POINT_FN(name) iptr name(iptr _ctx)
typedef OS_THREAD_ENTRY_POINT_FN(os_thread_entry_point_fn);
+#define OS_SHARED_MEMORY_LOCK_REGION_FN(name) b32 name(SharedMemoryRegion *sm, i32 *locks, i32 lock_index, i32 timeout_ms)
+typedef OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock_fn);
+
+#define OS_SHARED_MEMORY_UNLOCK_REGION_FN(name) void name(SharedMemoryRegion *sm, i32 *locks, i32 lock_index)
+typedef OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock_fn);
+
#define OS_FNS \
- X(add_file_watch) \
- X(alloc_arena) \
- X(close) \
- X(open_for_write) \
- X(read_file) \
- X(read_whole_file) \
- X(wait_on_value) \
- X(wake_waiters) \
- X(write_new_file) \
+ X(add_file_watch) \
+ X(alloc_arena) \
+ X(close) \
+ X(open_for_write) \
+ X(read_file) \
+ X(read_whole_file) \
+ X(shared_memory_region_lock) \
+ X(shared_memory_region_unlock) \
+ X(wake_waiters) \
+ X(write_new_file) \
X(write_file)
#define RENDERDOC_GET_API_FN(name) b32 name(u32 version, void **out_api)