ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: 9130418de20f2674c3152ae6ef4bafaba43b6a00
Parent: 358a6695b6f36e4ae9425bf8ce1dcf07e9c8df9a
Author: Randy Palamar
Date:   Sun, 22 Jun 2025 12:21:30 -0600

core/lib: use shared memory for export, add compute stats export

This touches many files because deleting the export pipe means
that a large portion of other trash can be removed. It also
greatly shrinks the platform layer surface area.
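
With the pipe gone, a client gets beamformed data and the new compute timings back through the shared-memory scratch region via the library. Below is a minimal client-side sketch using the entry points this commit leaves declared in helpers/ogl_beamformer_lib_base.h; the acquisition size, output grid, one-second timeouts, and include paths are illustrative, and it assumes a running beamformer whose imaging parameters were configured beforehand.

/* sketch only: drive the new shared-memory export path from a client.
 * Sizes, timeouts, and include paths below are placeholders. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "beamformer_parameters.h"   /* shader kinds + BeamformerComputeStatsTable (path illustrative) */
#include "ogl_beamformer_lib_base.h" /* LIB_FN entry points (path illustrative) */

int
main(void)
{
	/* example pipeline: decode then DAS, mirroring the default set in setup_beamformer() */
	int32_t stages[] = {BeamformerShaderKind_Decode, BeamformerShaderKind_DASCompute};
	set_beamformer_pipeline(stages, 2);

	/* placeholder acquisition: 256 channels x 2048 samples of 16-bit RF data */
	uint32_t rf_size = 256u * 2048u * sizeof(int16_t);
	void    *rf_data = calloc(1, rf_size);

	uint32_t output_points[3] = {512, 1, 1024};
	/* beamformed output is 2 floats (complex) per point */
	float *out_data = malloc((size_t)output_points[0] * output_points[1] *
	                         output_points[2] * 2 * sizeof(float));

	/* upload, beamform, and copy the result back out of the scratch region;
	 * this single call replaces the old export-pipe round trip */
	if (!beamform_data_synchronized(rf_data, rf_size, output_points, out_data, 1000))
		fprintf(stderr, "beamform_data_synchronized failed\n");

	/* new in this commit: download the last 32 frames worth of per-shader timings */
	BeamformerComputeStatsTable stats;
	if (beamformer_compute_timings(&stats, 1000)) {
		float sum = 0;
		for (int i = 0; i < 32; i++)
			sum += stats.times[i][BeamformerShaderKind_DASCompute];
		printf("average DAS compute time: %f [s]\n", sum / 32);
	}

	free(out_data);
	free(rf_data);
	return 0;
}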

Diffstat:
M beamformer.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------
M beamformer.h | 25 +++++--------------------
M beamformer_parameters.h | 16 +++++++++++++---
M beamformer_work_queue.c | 11 +++++++++++
M beamformer_work_queue.h | 62 ++++++++++++++++++++++++++++++++++----------------------------
M build.c | 2 +-
M helpers/ogl_beamformer_lib.c | 197 +++++++++++++++++++++++++++++--------------------------------------------------
M helpers/ogl_beamformer_lib_base.h | 10 ++++++----
M main_linux.c | 3 +--
M main_w32.c | 3 +--
M os_linux.c | 41 ++++++-----------------------------------
M os_win32.c | 37 ++++++-------------------------------
M static.c | 13 +++++++------
M tests/throughput.c | 9 +++++----
M ui.c | 11 +++++------
M util.h | 26 ++++----------------------
16 files changed, 291 insertions(+), 424 deletions(-)
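
Exports now land in the shared-memory scratch region, whose start is padded out to a 4 KiB boundary past the shared-memory header; the BEAMFORMER_SCRATCH_OFF macro in beamformer_work_queue.h (renamed from BEAMFORMER_RF_DATA_OFF in this commit) carries that computation. A standalone sketch of the same arithmetic, using a made-up header size, just to show the alignment behaviour:

#include <stdint.h>
#include <stdio.h>

/* stand-in for sizeof(BeamformerSharedMemory); the real value comes from the struct */
#define HEADER_SIZE (70000ULL)

/* same shape as BEAMFORMER_SCRATCH_OFF: pad the header out to a 4096-byte boundary
 * (when the header size is already aligned this still skips one extra page) */
#define SCRATCH_OFF (HEADER_SIZE + 4096ULL - (uintptr_t)(HEADER_SIZE & 4095ULL))

int
main(void)
{
	printf("header %llu -> scratch offset %llu (4096-aligned: %d)\n",
	       (unsigned long long)HEADER_SIZE, (unsigned long long)SCRATCH_OFF,
	       SCRATCH_OFF % 4096 == 0);
	return 0;
}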

diff --git a/beamformer.c b/beamformer.c @@ -160,6 +160,20 @@ push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info) t->buffer[index] = info; } +function BeamformComputeFrame * +beamformer_get_newest_frame(BeamformerCtx *ctx, b32 average_frame) +{ + BeamformComputeFrame *result = 0; + if (average_frame) { + u32 a_index = !(ctx->averaged_frame_index % countof(ctx->averaged_frames)); + result = ctx->averaged_frames + a_index; + } else { + u32 index = (ctx->next_render_frame_index - 1) % countof(ctx->beamform_frames); + result = ctx->beamform_frames + index; + } + return result; +} + function b32 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag plane) { @@ -168,7 +182,7 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag pl result = 1; u32 frame_id = atomic_add_u32(&ctx->next_render_frame_index, 1); u32 frame_index = frame_id % countof(ctx->beamform_frames); - work->type = BW_COMPUTE; + work->kind = BeamformerWorkKind_Compute; work->lock = BeamformerSharedMemoryLockKind_DispatchCompute; work->frame = ctx->beamform_frames + frame_index; work->frame->ready_to_present = 0; @@ -179,19 +193,6 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag pl } function void -export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame) -{ - uv3 dim = frame->dim; - iz out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32); - ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size); - glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg); - s8 raw = {.len = out_size, .data = ctx->export_buffer.beg}; - if (!ctx->os.write_file(handle, raw)) - ctx->os.write_file(ctx->os.error_handle, s8("failed to export frame\n")); - ctx->os.close(handle); -} - -function void do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale, u32 out_texture, uv3 out_data_dim) { @@ -200,7 +201,7 @@ do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); - glProgramUniform1f(cs->programs[ShaderKind_Sum], CS_SUM_PRESCALE_UNIFORM_LOC, in_scale); + glProgramUniform1f(cs->programs[BeamformerShaderKind_Sum], CS_SUM_PRESCALE_UNIFORM_LOC, in_scale); for (u32 i = 0; i < in_texture_count; i++) { glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); glDispatchCompute(ORONE(out_data_dim.x / 32), @@ -277,7 +278,7 @@ compute_cursor_finished(struct compute_cursor *cursor) } function void -do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ShaderKind shader) +do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, BeamformerShaderKind shader) { ComputeShaderCtx *csctx = &ctx->csctx; BeamformerSharedMemory *sm = ctx->shared_memory.region; @@ -288,9 +289,9 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, u32 input_ssbo_idx = csctx->last_output_ssbo_index; switch (shader) { - case ShaderKind_Decode: - case ShaderKind_DecodeFloat: - case ShaderKind_DecodeFloatComplex:{ + case BeamformerShaderKind_Decode: + case BeamformerShaderKind_DecodeFloat: + case BeamformerShaderKind_DecodeFloatComplex:{ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, 
GL_R8I); @@ -300,15 +301,15 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z)); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; }break; - case ShaderKind_CudaDecode:{ + case BeamformerShaderKind_CudaDecode:{ ctx->cuda_lib.decode(0, output_ssbo_idx, 0); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; }break; - case ShaderKind_CudaHilbert: + case BeamformerShaderKind_CudaHilbert: ctx->cuda_lib.hilbert(input_ssbo_idx, output_ssbo_idx); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; break; - case ShaderKind_Demodulate:{ + case BeamformerShaderKind_Demodulate:{ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32), @@ -316,7 +317,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ORONE(csctx->dec_data_dim.z)); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; }break; - case ShaderKind_MinMax:{ + case BeamformerShaderKind_MinMax:{ u32 texture = frame->frame.texture; for (u32 i = 1; i < frame->frame.mips; i++) { glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); @@ -330,7 +331,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); } }break; - case ShaderKind_DASCompute:{ + case BeamformerShaderKind_DASCompute:{ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); @@ -367,7 +368,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, #endif glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); }break; - case ShaderKind_Sum:{ + case BeamformerShaderKind_Sum:{ u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames); BeamformComputeFrame *aframe = ctx->averaged_frames + aframe_index; aframe->ready_to_present = 0; @@ -405,7 +406,7 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header); switch (ctx->kind) { - case ShaderKind_DASCompute:{ + case BeamformerShaderKind_DASCompute:{ #define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n" stream_append_s8(&sb, s8("" "layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", " @@ -417,14 +418,14 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) )); #undef X }break; - case ShaderKind_DecodeFloat: - case ShaderKind_DecodeFloatComplex:{ - if (ctx->kind == ShaderKind_DecodeFloat) + case BeamformerShaderKind_DecodeFloat: + case BeamformerShaderKind_DecodeFloatComplex:{ + if (ctx->kind == BeamformerShaderKind_DecodeFloat) stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n")); else stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n")); } /* FALLTHROUGH */ - case ShaderKind_Decode:{ + case BeamformerShaderKind_Decode:{ #define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n" stream_append_s8(&sb, s8("" "layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", " @@ -434,11 +435,11 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) )); #undef X 
}break; - case ShaderKind_MinMax:{ + case BeamformerShaderKind_MinMax:{ stream_append_s8(&sb, s8("layout(location = " str(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC) ") uniform int u_mip_map;\n\n")); }break; - case ShaderKind_Sum:{ + case BeamformerShaderKind_Sum:{ stream_append_s8(&sb, s8("layout(location = " str(CS_SUM_PRESCALE_UNIFORM_LOC) ") uniform float u_sum_prescale = 1.0;\n\n")); }break; @@ -477,7 +478,7 @@ DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader) if (new_program) { glDeleteProgram(*src->shader); *src->shader = new_program; - if (src->kind == ShaderKind_Render2D) ctx->frame_view_render_context.updated = 1; + if (src->kind == BeamformerShaderKind_Render2D) ctx->frame_view_render_context.updated = 1; } return new_program != 0; } @@ -506,32 +507,62 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co BeamformWork *work = beamform_work_queue_pop(q); while (work) { b32 can_commit = 1; - switch (work->type) { - case BW_RELOAD_SHADER: { + switch (work->kind) { + case BeamformerWorkKind_ReloadShader:{ ShaderReloadContext *src = work->shader_reload_context; b32 success = reload_compute_shader(ctx, src, s8(""), arena); - if (src->kind == ShaderKind_Decode) { + if (src->kind == BeamformerShaderKind_Decode) { /* TODO(rnp): think of a better way of doing this */ - src->kind = ShaderKind_DecodeFloatComplex; - src->shader = cs->programs + ShaderKind_DecodeFloatComplex; + src->kind = BeamformerShaderKind_DecodeFloatComplex; + src->shader = cs->programs + BeamformerShaderKind_DecodeFloatComplex; success &= reload_compute_shader(ctx, src, s8(" (F32C)"), arena); - src->kind = ShaderKind_DecodeFloat; - src->shader = cs->programs + ShaderKind_DecodeFloat; + src->kind = BeamformerShaderKind_DecodeFloat; + src->shader = cs->programs + BeamformerShaderKind_DecodeFloat; success &= reload_compute_shader(ctx, src, s8(" (F32)"), arena); - src->kind = ShaderKind_Decode; - src->shader = cs->programs + ShaderKind_Decode; + src->kind = BeamformerShaderKind_Decode; + src->shader = cs->programs + BeamformerShaderKind_Decode; } - if (success) { + if (success && ctx->csctx.raw_data_ssbo) { /* TODO(rnp): this check seems off */ - if (ctx->csctx.raw_data_ssbo) { - can_commit = 0; - ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag; - fill_frame_compute_work(ctx, work, plane); + can_commit = 0; + BeamformComputeFrame *frame = beamformer_get_newest_frame(ctx, bp->output_points[3] > 1); + fill_frame_compute_work(ctx, work, frame->image_plane_tag); + } + }break; + case BeamformerWorkKind_ExportBuffer:{ + /* TODO(rnp): better way of handling DispatchCompute barrier */ + post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute, + sm->locks, ctx->os.shared_memory_region_unlock); + ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1); + BeamformerExportContext *ec = &work->export_context; + switch (ec->kind) { + case BeamformerExportKind_BeamformedData:{ + BeamformComputeFrame *frame = beamformer_get_newest_frame(ctx, bp->output_points[3] > 1); + assert(frame->ready_to_present); + u32 texture = frame->frame.texture; + uv3 dim = frame->frame.dim; + iz out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32); + if (out_size <= ec->size) { + glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, out_size, + (u8 *)sm + BEAMFORMER_SCRATCH_OFF); } + }break; + case BeamformerExportKind_Stats:{ + ComputeTimingTable *table = ctx->compute_timing_table; + /* NOTE(rnp): do a little spin to let this finish 
updating */ + while (table->write_index != atomic_load_u32(&table->read_index)); + ComputeShaderStats *stats = ctx->compute_shader_stats; + if (sizeof(stats->table) <= ec->size) + mem_copy((u8 *)sm + BEAMFORMER_SCRATCH_OFF, &stats->table, sizeof(stats->table)); + }break; + InvalidDefaultCase; } - } break; - case BW_UPLOAD_BUFFER: { + ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock); + post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync, sm->locks, + ctx->os.shared_memory_region_unlock); + }break; + case BeamformerWorkKind_UploadBuffer:{ ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1); BeamformerUploadContext *uc = &work->upload_context; u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0; @@ -589,18 +620,14 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co atomic_and_u32(&sm->dirty_regions, ~(sm->dirty_regions & 1 << (work->lock - 1))); ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock); - } break; - case BW_COMPUTE_INDIRECT:{ + }break; + case BeamformerWorkKind_ComputeIndirect:{ fill_frame_compute_work(ctx, work, work->compute_indirect_plane); - DEBUG_DECL(work->type = BW_COMPUTE_INDIRECT;) + DEBUG_DECL(work->kind = BeamformerWorkKind_ComputeIndirect;) } /* FALLTHROUGH */ - case BW_COMPUTE:{ - /* NOTE(rnp): debug: here it is not a bug to release the lock if it - * isn't held but elswhere it is */ - DEBUG_DECL(if (sm->locks[work->lock])) { - ctx->os.shared_memory_region_unlock(&ctx->shared_memory, - sm->locks, work->lock); - } + case BeamformerWorkKind_Compute:{ + post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks, + ctx->os.shared_memory_region_unlock); push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin}); @@ -628,7 +655,6 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co } } - frame->in_flight = 1; frame->frame.min_coordinate = v4_from_f32_array(bp->output_min_coordinate); frame->frame.max_coordinate = v4_from_f32_array(bp->output_max_coordinate); frame->frame.das_shader_kind = bp->das_shader_id; @@ -636,11 +662,11 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co b32 did_sum_shader = 0; u32 stage_count = sm->compute_stages_count; - ComputeShaderKind *stages = sm->compute_stages; + BeamformerShaderKind *stages = sm->compute_stages; for (u32 i = 0; i < stage_count; i++) { - did_sum_shader |= stages[i] == ComputeShaderKind_Sum; + did_sum_shader |= stages[i] == BeamformerShaderKind_Sum; glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[i]); - do_compute_shader(ctx, arena, frame, (ShaderKind)stages[i]); + do_compute_shader(ctx, arena, frame, stages[i]); glEndQuery(GL_TIME_ELAPSED); } /* NOTE(rnp): block until work completes so that we can record timings */ @@ -650,7 +676,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co for (u32 i = 0; i < stage_count; i++) { ComputeTimingInfo info = {0}; info.kind = ComputeTimingInfoKind_Shader; - info.shader = (ShaderKind)stages[i]; + info.shader = stages[i]; glGetQueryObjectui64v(cs->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count); push_compute_timing_info(ctx->compute_timing_table, info); } @@ -668,16 +694,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd}); 
end_renderdoc_capture(gl_context); - } break; - case BW_SAVE_FRAME: { - BeamformComputeFrame *frame = work->output_frame_ctx.frame; - if (frame->ready_to_present) { - export_frame(ctx, work->output_frame_ctx.file_handle, &frame->frame); - } else { - /* TODO(rnp): should we handle this? */ - INVALID_CODE_PATH; - } - } break; + }break; InvalidDefaultCase; } @@ -695,55 +712,57 @@ coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats) * info item. this could result in garbage entries but they shouldn't really matter */ u32 target = atomic_load_u32(&t->write_index); - u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->times); + u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times); - static_assert(ShaderKind_Count + 1 <= 32, "timing coalescence bitfield test"); + static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test"); u32 seen_info_test = 0; while (t->read_index != target) { - ComputeTimingInfo info = t->buffer[(t->read_index++) % countof(t->buffer)]; + ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)]; switch (info.kind) { case ComputeTimingInfoKind_ComputeFrameBegin:{ assert(t->compute_frame_active == 0); t->compute_frame_active = 1; /* NOTE(rnp): allow multiple instances of same shader to accumulate */ - mem_clear(stats->times[stats_index], 0, sizeof(stats->times[stats_index])); + mem_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index])); }break; case ComputeTimingInfoKind_ComputeFrameEnd:{ assert(t->compute_frame_active == 1); t->compute_frame_active = 0; stats->latest_frame_index = stats_index; - stats_index = (stats_index + 1) % countof(stats->times); + stats_index = (stats_index + 1) % countof(stats->table.times); }break; case ComputeTimingInfoKind_Shader:{ - stats->times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9; + stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9; seen_info_test |= (1 << info.shader); }break; case ComputeTimingInfoKind_RF_Data:{ - stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->rf_time_deltas); + stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas); f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9; - stats->rf_time_deltas[stats->latest_rf_index] = delta; + stats->table.rf_time_deltas[stats->latest_rf_index] = delta; stats->last_rf_timer_count = info.timer_count; - seen_info_test |= (1 << ShaderKind_Count); + seen_info_test |= (1 << BeamformerShaderKind_Count); }break; } + /* NOTE(rnp): do this at the end so that stats table is always in a consistent state */ + atomic_add_u32(&t->read_index, 1); } if (seen_info_test) { - for EachEnumValue(ShaderKind, shader) { + for EachEnumValue(BeamformerShaderKind, shader) { if (seen_info_test & (1 << shader)) { f32 sum = 0; - for EachElement(stats->times, i) - sum += stats->times[i][shader]; - stats->average_times[shader] = sum / countof(stats->times); + for EachElement(stats->table.times, i) + sum += stats->table.times[i][shader]; + stats->average_times[shader] = sum / countof(stats->table.times); } } - if (seen_info_test & (1 << ShaderKind_Count)) { + if (seen_info_test & (1 << BeamformerShaderKind_Count)) { f32 sum = 0; - for EachElement(stats->rf_time_deltas, i) - sum += stats->rf_time_deltas[i]; - stats->rf_time_delta_average = sum / countof(stats->rf_time_deltas); + for EachElement(stats->table.rf_time_deltas, i) + sum += stats->table.rf_time_deltas[i]; + 
stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas); } } } @@ -805,58 +824,20 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step) BeamformerSharedMemory *sm = ctx->shared_memory.region; BeamformerParameters *bp = &sm->parameters; + b32 averaging = bp->output_points[3] > 1; if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && ctx->os.compute_worker.asleep) { if (sm->start_compute_from_main) { BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue); - ImagePlaneTag tag = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag; - if (fill_frame_compute_work(ctx, work, tag)) { + ImagePlaneTag tag = beamformer_get_newest_frame(ctx, averaging)->image_plane_tag; + if (fill_frame_compute_work(ctx, work, tag)) beamform_work_queue_push_commit(ctx->beamform_work_queue); - if (sm->export_next_frame) { - BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue); - if (export) { - /* TODO: we don't really want the beamformer opening/closing files */ - iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name); - export->type = BW_SAVE_FRAME; - export->output_frame_ctx.file_handle = f; - if (bp->output_points[3] > 1) { - static_assert(countof(ctx->averaged_frames) == 2, - "fix this, we assume average frame ping pong buffer"); - u32 a_index = !(ctx->averaged_frame_index % - countof(ctx->averaged_frames)); - BeamformComputeFrame *aframe = ctx->averaged_frames + a_index; - export->output_frame_ctx.frame = aframe; - } else { - export->output_frame_ctx.frame = work->frame; - } - beamform_work_queue_push_commit(ctx->beamform_work_queue); - } - sm->export_next_frame = 0; - } - } atomic_store_u32(&sm->start_compute_from_main, 0); } ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable); } - ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index, - ctx->next_render_frame_index - ctx->display_frame_index); - for (BeamformComputeFrame *frame = frame_next(&cfi); frame; frame = frame_next(&cfi)) { - if (frame->in_flight && frame->ready_to_present) { - frame->in_flight = 0; - ctx->display_frame_index = frame - cfi.frames; - } - } - - BeamformComputeFrame *frame_to_draw; - if (bp->output_points[3] > 1) { - u32 a_index = !(ctx->averaged_frame_index % countof(ctx->averaged_frames)); - frame_to_draw = ctx->averaged_frames + a_index; - } else { - frame_to_draw = ctx->beamform_frames + ctx->display_frame_index; - } - - draw_ui(ctx, input, frame_to_draw->ready_to_present? &frame_to_draw->frame : 0, - frame_to_draw->image_plane_tag); + BeamformComputeFrame *frame = beamformer_get_newest_frame(ctx, averaging); + draw_ui(ctx, input, frame->ready_to_present? &frame->frame : 0, frame->image_plane_tag); ctx->frame_view_render_context.updated = 0; diff --git a/beamformer.h b/beamformer.h @@ -77,7 +77,7 @@ typedef struct { #define CS_SUM_PRESCALE_UNIFORM_LOC 1 typedef struct { - u32 programs[ComputeShaderKind_Count]; + u32 programs[BeamformerShaderKind_ComputeCount]; /* NOTE: Decoded data is only relevant in the context of a single frame. We use two * buffers so that they can be swapped when chaining multiple compute stages */ @@ -110,22 +110,11 @@ typedef enum { DASShaderKind_Count } DASShaderKind; -typedef enum { - #define X(e, n, s, h, pn) ShaderKind_##e = n, - COMPUTE_SHADERS - #undef X - ShaderKind_Render2D, - ShaderKind_Count -} ShaderKind; - typedef struct { - /* NOTE(rnp): this wants to be iterated on both dimensions. it depends entirely on which - * visualization method you want to use. 
the coalescing function wants both directions */ - f32 times[32][ShaderKind_Count]; - f32 average_times[ShaderKind_Count]; + BeamformerComputeStatsTable table; + f32 average_times[BeamformerShaderKind_Count]; u64 last_rf_timer_count; - f32 rf_time_deltas[32]; f32 rf_time_delta_average; u32 latest_frame_index; @@ -144,7 +133,7 @@ typedef struct { u64 timer_count; ComputeTimingInfoKind kind; union { - ShaderKind shader; + BeamformerShaderKind shader; }; } ComputeTimingInfo; @@ -175,7 +164,6 @@ typedef struct BeamformFrame { struct BeamformComputeFrame { BeamformFrame frame; ImagePlaneTag image_plane_tag; - b32 in_flight; b32 ready_to_present; }; @@ -210,7 +198,6 @@ typedef struct { BeamformComputeFrame beamform_frames[MAX_BEAMFORMED_SAVED_FRAMES]; u32 next_render_frame_index; - u32 display_frame_index; /* NOTE: this will only be used when we are averaging */ u32 averaged_frame_index; @@ -222,8 +209,6 @@ typedef struct { * destroying itself on hot-reload */ FrameViewRenderContext frame_view_render_context; - Arena export_buffer; - CudaLib cuda_lib; OS os; Stream error_stream; @@ -244,7 +229,7 @@ struct ShaderReloadContext { u32 *shader; ShaderReloadContext *link; GLenum gl_type; - ShaderKind kind; + BeamformerShaderKind kind; }; #define BEAMFORMER_FRAME_STEP_FN(name) void name(BeamformerCtx *ctx, Arena *arena, \ diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -20,11 +20,21 @@ X(Sum, 8, "sum", 0, "Sum") typedef enum { - #define X(e, n, s, h, pn) ComputeShaderKind_##e = n, + #define X(e, n, s, h, pn) BeamformerShaderKind_##e = n, COMPUTE_SHADERS #undef X - ComputeShaderKind_Count -} ComputeShaderKind; + BeamformerShaderKind_Render2D, + BeamformerShaderKind_Count, + + BeamformerShaderKind_ComputeCount = BeamformerShaderKind_Render2D, +} BeamformerShaderKind; + +typedef struct { + /* NOTE(rnp): this wants to be iterated on both dimensions. it depends entirely on which + * visualization method you want to use. 
the coalescing function wants both directions */ + float times[32][BeamformerShaderKind_Count]; + float rf_time_deltas[32]; +} BeamformerComputeStatsTable; /* X(type, id, pretty name) */ #define DECODE_TYPES \ diff --git a/beamformer_work_queue.c b/beamformer_work_queue.c @@ -50,3 +50,14 @@ DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit) { atomic_add_u64(&q->queue, 1); } + +function void +post_sync_barrier(SharedMemoryRegion *sm, BeamformerSharedMemoryLockKind lock, i32 *locks, + os_shared_memory_region_unlock_fn *os_shared_memory_region_unlock) +{ + /* NOTE(rnp): debug: here it is not a bug to release the lock if it + * isn't held but elswhere it is */ + DEBUG_DECL(if (locks[lock])) { + os_shared_memory_region_unlock(sm, locks, lock); + } +} diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h @@ -2,19 +2,19 @@ #ifndef _BEAMFORMER_WORK_QUEUE_H_ #define _BEAMFORMER_WORK_QUEUE_H_ -#define BEAMFORMER_SHARED_MEMORY_VERSION (6UL) +#define BEAMFORMER_SHARED_MEMORY_VERSION (7UL) typedef struct BeamformComputeFrame BeamformComputeFrame; typedef struct ShaderReloadContext ShaderReloadContext; typedef enum { - BW_COMPUTE, - BW_COMPUTE_INDIRECT, - BW_RELOAD_SHADER, - BW_SAVE_FRAME, - BW_SEND_FRAME, - BW_UPLOAD_BUFFER, -} BeamformWorkType; + BeamformerWorkKind_Compute, + BeamformerWorkKind_ComputeIndirect, + BeamformerWorkKind_ReloadShader, + BeamformerWorkKind_SendFrame, + BeamformerWorkKind_ExportBuffer, + BeamformerWorkKind_UploadBuffer, +} BeamformerWorkKind; typedef enum { BU_KIND_CHANNEL_MAPPING, @@ -26,23 +26,29 @@ typedef enum { } BeamformerUploadKind; typedef struct { + BeamformerUploadKind kind; i32 size; i32 shared_memory_offset; - BeamformerUploadKind kind; } BeamformerUploadContext; +typedef enum { + BeamformerExportKind_BeamformedData, + BeamformerExportKind_Stats, +} BeamformerExportKind; + typedef struct { - BeamformComputeFrame *frame; - iptr file_handle; -} BeamformOutputFrameContext; + BeamformerExportKind kind; + i32 size; +} BeamformerExportContext; #define BEAMFORMER_SHARED_MEMORY_LOCKS \ X(None) \ - X(Parameters) \ - X(FocalVectors) \ X(ChannelMapping) \ + X(FocalVectors) \ + X(Parameters) \ + X(ScratchSpace) \ X(SparseElements) \ - X(RawData) \ + X(ExportSync) \ X(DispatchCompute) #define X(name) BeamformerSharedMemoryLockKind_##name, @@ -52,16 +58,15 @@ typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Coun /* NOTE: discriminated union based on type */ typedef struct { union { - BeamformComputeFrame *frame; - BeamformerUploadContext upload_context; - BeamformOutputFrameContext output_frame_ctx; - ShaderReloadContext *shader_reload_context; - ImagePlaneTag compute_indirect_plane; - void *generic; + BeamformComputeFrame *frame; + BeamformerUploadContext upload_context; + BeamformerExportContext export_context; + ShaderReloadContext *shader_reload_context; + ImagePlaneTag compute_indirect_plane; + void *generic; }; BeamformerSharedMemoryLockKind lock; - - BeamformWorkType type; + BeamformerWorkKind kind; } BeamformWork; typedef struct { @@ -79,11 +84,12 @@ typedef BEAMFORM_WORK_QUEUE_PUSH_FN(beamform_work_queue_push_fn); typedef BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit_fn); #define BEAMFORMER_SHARED_MEMORY_SIZE (GB(2)) -#define BEAMFORMER_RF_DATA_OFF (sizeof(BeamformerSharedMemory) + 4096ULL \ +#define BEAMFORMER_SCRATCH_OFF (sizeof(BeamformerSharedMemory) + 4096ULL \ - (uintptr_t)(sizeof(BeamformerSharedMemory) & 4095ULL)) -#define BEAMFORMER_MAX_RF_DATA_SIZE 
(BEAMFORMER_SHARED_MEMORY_SIZE - BEAMFORMER_RF_DATA_OFF) +#define BEAMFORMER_SCRATCH_SIZE (BEAMFORMER_SHARED_MEMORY_SIZE - BEAMFORMER_SCRATCH_OFF) +#define BEAMFORMER_MAX_RF_DATA_SIZE (BEAMFORMER_SCRATCH_SIZE) -typedef align_as(64) struct { +typedef struct { u32 version; /* NOTE(rnp): causes future library calls to fail. @@ -113,8 +119,8 @@ typedef align_as(64) struct { }; }; - ComputeShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES]; - u32 compute_stages_count; + BeamformerShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES]; + u32 compute_stages_count; /* TODO(rnp): hack: we need a different way of dispatching work for export */ b32 start_compute_from_main; diff --git a/build.c b/build.c @@ -698,7 +698,7 @@ main(i32 argc, char *argv[]) u64 start_time = os_get_timer_counter(); b32 result = 1; - Arena arena = os_alloc_arena((Arena){0}, MB(8)); + Arena arena = os_alloc_arena(MB(8)); check_rebuild_self(arena, argc, argv); Options options = parse_options(argc, argv); diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c @@ -6,8 +6,6 @@ #include "ogl_beamformer_lib_base.h" #include "../beamformer_work_queue.c" -#define PIPE_RETRY_PERIOD_MS (100ULL) - global SharedMemoryRegion g_shared_memory; global BeamformerSharedMemory *g_bp; global BeamformerLibErrorKind g_lib_last_error; @@ -17,20 +15,7 @@ global BeamformerLibErrorKind g_lib_last_error; #elif OS_WINDOWS #include "../os_win32.c" -#define PIPE_TYPE_BYTE 0x00 -#define PIPE_ACCESS_INBOUND 0x01 - -#define PIPE_WAIT 0x00 -#define PIPE_NOWAIT 0x01 - -#define ERROR_NO_DATA 232L -#define ERROR_PIPE_NOT_CONNECTED 233L -#define ERROR_PIPE_LISTENING 536L - -W32(iptr) CreateNamedPipeA(c8 *, u32, u32, u32, u32, u32, u32, void *); -W32(b32) DisconnectNamedPipe(iptr); W32(iptr) OpenFileMappingA(u32, b32, c8 *); -W32(void) Sleep(u32); #else #error Unsupported Platform @@ -38,41 +23,6 @@ W32(void) Sleep(u32); #if OS_LINUX -function Pipe -os_open_read_pipe(char *name) -{ - mkfifo(name, 0660); - return (Pipe){.file = open(name, O_RDONLY|O_NONBLOCK), .name = name}; -} - -static void -os_disconnect_pipe(Pipe p) -{ -} - -static void -os_close_pipe(iptr *file, char *name) -{ - if (file) close(*file); - if (name) unlink(name); - *file = INVALID_FILE; -} - -static b32 -os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms) -{ - struct pollfd pfd = {.fd = p.file, .events = POLLIN}; - iz total_read = 0; - if (poll(&pfd, 1, timeout_ms) > 0) { - iz r; - do { - r = read(p.file, (u8 *)buf + total_read, read_size - total_read); - if (r > 0) total_read += r; - } while (r != 0); - } - return total_read == read_size; -} - function SharedMemoryRegion os_open_shared_memory_area(char *name) { @@ -88,54 +38,6 @@ os_open_shared_memory_area(char *name) #elif OS_WINDOWS -static Pipe -os_open_read_pipe(char *name) -{ - iptr file = CreateNamedPipeA(name, PIPE_ACCESS_INBOUND, PIPE_TYPE_BYTE|PIPE_NOWAIT, 1, - 0, 1024UL * 1024UL, 0, 0); - return (Pipe){.file = file, .name = name}; -} - -static void -os_disconnect_pipe(Pipe p) -{ - DisconnectNamedPipe(p.file); -} - -static void -os_close_pipe(iptr *file, char *name) -{ - if (file) CloseHandle(*file); - *file = INVALID_FILE; -} - -static b32 -os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms) -{ - iz elapsed_ms = 0, total_read = 0; - while (elapsed_ms <= timeout_ms && read_size != total_read) { - u8 data; - i32 read; - b32 result = ReadFile(p.file, &data, 0, &read, 0); - if (!result) { - i32 error = GetLastError(); - if (error != ERROR_NO_DATA && - error != ERROR_PIPE_LISTENING && - error 
!= ERROR_PIPE_NOT_CONNECTED) - { - /* NOTE: pipe is in a bad state; we will never read anything */ - break; - } - Sleep(PIPE_RETRY_PERIOD_MS); - elapsed_ms += PIPE_RETRY_PERIOD_MS; - } else { - ReadFile(p.file, (u8 *)buf + total_read, read_size - total_read, &read, 0); - total_read += read; - } - } - return total_read == read_size; -} - function SharedMemoryRegion os_open_shared_memory_area(char *name) { @@ -216,6 +118,18 @@ lib_release_lock(BeamformerSharedMemoryLockKind lock) os_shared_memory_region_unlock(&g_shared_memory, g_bp->locks, (i32)lock); } +function b32 +try_wait_sync(BeamformerSharedMemoryLockKind lock, i32 timeout_ms) +{ + b32 result = 0; + if (lib_try_lock(lock, 0) && lib_try_lock(lock, timeout_ms)) { + /* TODO(rnp): non-critical race condition */ + lib_release_lock(lock); + result = 1; + } + return result; +} + u32 beamformer_get_api_version(void) { @@ -251,7 +165,7 @@ set_beamformer_pipeline(i32 *stages, i32 stages_count) if (check_shared_memory()) { g_bp->compute_stages_count = 0; for (i32 i = 0; i < stages_count; i++) { - if (BETWEEN(stages[i], 0, ComputeShaderKind_Count)) { + if (BETWEEN(stages[i], 0, BeamformerShaderKind_ComputeCount)) { g_bp->compute_stages[g_bp->compute_stages_count++] = stages[i]; } } @@ -270,16 +184,8 @@ set_beamformer_pipeline(i32 *stages, i32 stages_count) b32 beamformer_start_compute(i32 timeout_ms) { - b32 result = 0; - if (check_shared_memory()) { - if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) { - if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, timeout_ms)) { - /* TODO(rnp): non-critical race condition */ - lib_release_lock(BeamformerSharedMemoryLockKind_DispatchCompute); - result = 1; - } - } - } + b32 result = check_shared_memory() && + try_wait_sync(BeamformerSharedMemoryLockKind_DispatchCompute, timeout_ms); return result; } @@ -293,7 +199,7 @@ beamformer_upload_buffer(void *data, u32 size, i32 store_offset, BeamformerUploa result = work && lib_try_lock(lock, timeout_ms); if (result) { work->upload_context = upload_context; - work->type = BW_UPLOAD_BUFFER; + work->kind = BeamformerWorkKind_UploadBuffer; work->lock = lock; mem_copy((u8 *)g_bp + store_offset, data, size); if ((atomic_load_u32(&g_bp->dirty_regions) & (1 << (lock - 1))) == 0) { @@ -335,11 +241,11 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, b32 start_f b32 result = 0; if (data_size <= BEAMFORMER_MAX_RF_DATA_SIZE) { BeamformerUploadContext uc = {0}; - uc.shared_memory_offset = BEAMFORMER_RF_DATA_OFF; + uc.shared_memory_offset = BEAMFORMER_SCRATCH_OFF; uc.size = data_size; uc.kind = BU_KIND_RF_DATA; result = beamformer_upload_buffer(data, data_size, uc.shared_memory_offset, uc, - BeamformerSharedMemoryLockKind_RawData, timeout_ms); + BeamformerSharedMemoryLockKind_ScratchSpace, timeout_ms); if (result && start_from_main) atomic_store_u32(&g_bp->start_compute_from_main, 1); } else { g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW; @@ -363,7 +269,7 @@ beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag BeamformWork *work = try_push_work_queue(); result = work != 0; if (result) { - work->type = BW_COMPUTE_INDIRECT; + work->kind = BeamformerWorkKind_ComputeIndirect; work->compute_indirect_plane = image_plane_tag; beamform_work_queue_push_commit(&g_bp->external_work_queue); } @@ -438,6 +344,20 @@ send_data(void *data, u32 data_size) return result; } +function b32 +beamformer_export_buffer(BeamformerExportContext export_context) +{ + BeamformWork *work = try_push_work_queue(); + b32 
result = work != 0; + if (result) { + work->export_context = export_context; + work->kind = BeamformerWorkKind_ExportBuffer; + work->lock = BeamformerSharedMemoryLockKind_ScratchSpace; + beamform_work_queue_push_commit(&g_bp->external_work_queue); + } + return result; +} + b32 beamform_data_synchronized(void *data, u32 data_size, u32 output_points[3], f32 *out_data, i32 timeout_ms) { @@ -450,21 +370,46 @@ beamform_data_synchronized(void *data, u32 data_size, u32 output_points[3], f32 g_bp->parameters.output_points[0] = output_points[0]; g_bp->parameters.output_points[1] = output_points[1]; g_bp->parameters.output_points[2] = output_points[2]; - g_bp->export_next_frame = 1; - - Pipe export_pipe = os_open_read_pipe(OS_EXPORT_PIPE_NAME); - if (export_pipe.file != INVALID_FILE) { - if (send_data(data, data_size)) { - iz output_size = output_points[0] * output_points[1] * - output_points[2] * sizeof(f32) * 2; - result = os_wait_read_pipe(export_pipe, out_data, output_size, timeout_ms); - if (!result) g_lib_last_error = BF_LIB_ERR_KIND_READ_EXPORT_PIPE; - } - os_disconnect_pipe(export_pipe); - os_close_pipe(&export_pipe.file, export_pipe.name); + iz output_size = output_points[0] * output_points[1] * output_points[2] * sizeof(f32) * 2; + if (output_size <= BEAMFORMER_SCRATCH_SIZE && + beamformer_push_data_with_compute(data, data_size, 0, 0)) + { + BeamformerExportContext export; + export.kind = BeamformerExportKind_BeamformedData; + export.size = output_size; + if (beamformer_export_buffer(export) && + lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) + { + if (try_wait_sync(BeamformerSharedMemoryLockKind_ExportSync, timeout_ms)) { + mem_copy(out_data, (u8 *)g_bp + BEAMFORMER_SCRATCH_OFF, output_size); + result = 1; + } + } } else { - g_lib_last_error = BF_LIB_ERR_KIND_OPEN_EXPORT_PIPE; + g_lib_last_error = BF_LIB_ERR_KIND_EXPORT_SPACE_OVERFLOW; + } + } + return result; +} + +b32 +beamformer_compute_timings(BeamformerComputeStatsTable *output, i32 timeout_ms) +{ + b32 result = 0; + if (check_shared_memory()) { + static_assert(sizeof(*output) <= BEAMFORMER_SCRATCH_SIZE, "timing table size exceeds scratch space"); + BeamformerExportContext export; + export.kind = BeamformerExportKind_Stats; + export.size = sizeof(*output); + + if (beamformer_export_buffer(export) && + lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) + { + if (try_wait_sync(BeamformerSharedMemoryLockKind_ExportSync, timeout_ms)) { + mem_copy(output, (u8 *)g_bp + BEAMFORMER_SCRATCH_OFF, sizeof(*output)); + result = 1; + } } } return result; diff --git a/helpers/ogl_beamformer_lib_base.h b/helpers/ogl_beamformer_lib_base.h @@ -16,10 +16,9 @@ X(INVALID_IMAGE_PLANE, 5, "invalid image plane") \ X(BUFFER_OVERFLOW, 6, "passed buffer size exceeds available space") \ X(WORK_QUEUE_FULL, 7, "work queue full") \ - X(OPEN_EXPORT_PIPE, 8, "failed to open export pipe") \ - X(READ_EXPORT_PIPE, 9, "failed to read full export data from pipe") \ - X(SHARED_MEMORY, 10, "failed to open shared memory region") \ - X(SYNC_VARIABLE, 11, "failed to acquire lock within timeout period") + X(EXPORT_SPACE_OVERFLOW, 8, "not enough space for data export") \ + X(SHARED_MEMORY, 9, "failed to open shared memory region") \ + X(SYNC_VARIABLE, 10, "failed to acquire lock within timeout period") #define X(type, num, string) BF_LIB_ERR_KIND_ ##type = num, typedef enum {BEAMFORMER_LIB_ERRORS} BeamformerLibErrorKind; @@ -41,6 +40,9 @@ LIB_FN uint32_t send_data(void *data, uint32_t data_size); LIB_FN uint32_t beamform_data_synchronized(void 
*data, uint32_t data_size, uint32_t output_points[3], float *out_data, int32_t timeout_ms); +/* NOTE: downloads the last 32 frames worth of compute timings into output */ +LIB_FN uint32_t beamformer_compute_timings(BeamformerComputeStatsTable *output, int32_t timeout_ms); + /* NOTE: tells the beamformer to start beamforming and waits until it starts or for timeout_ms */ LIB_FN uint32_t beamformer_start_compute(int32_t timeout_ms); diff --git a/main_linux.c b/main_linux.c @@ -75,7 +75,7 @@ main(void) { BeamformerCtx ctx = {0}; BeamformerInput input = {.executable_reloaded = 1}; - Arena temp_memory = os_alloc_arena((Arena){0}, MB(16)); + Arena temp_memory = os_alloc_arena(MB(16)); ctx.error_stream = stream_alloc(&temp_memory, MB(1)); ctx.ui_backing_store = sub_arena(&temp_memory, MB(2), KB(4)); @@ -88,7 +88,6 @@ main(void) ctx.os.file_watch_context.handle = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); ctx.os.compute_worker.asleep = 1; ctx.os.error_handle = STDERR_FILENO; - ctx.os.export_pipe_name = OS_EXPORT_PIPE_NAME; setup_beamformer(&ctx, &input, &temp_memory); os_wake_waiters(&ctx.os.compute_worker.sync_variable); diff --git a/main_w32.c b/main_w32.c @@ -104,7 +104,7 @@ main(void) { BeamformerCtx ctx = {0}; BeamformerInput input = {.executable_reloaded = 1}; - Arena temp_memory = os_alloc_arena((Arena){0}, MB(16)); + Arena temp_memory = os_alloc_arena(MB(16)); ctx.error_stream = stream_alloc(&temp_memory, MB(1)); ctx.ui_backing_store = sub_arena(&temp_memory, MB(2), KB(4)); @@ -121,7 +121,6 @@ main(void) ctx.os.context = (iptr)&w32_ctx; ctx.os.compute_worker.asleep = 1; ctx.os.error_handle = GetStdHandle(STD_ERROR_HANDLE); - ctx.os.export_pipe_name = OS_EXPORT_PIPE_NAME; setup_beamformer(&ctx, &input, &temp_memory); os_wake_waiters(&ctx.os.compute_worker.sync_variable); diff --git a/os_linux.c b/os_linux.c @@ -4,7 +4,6 @@ * be provided by any platform the beamformer is ported to. 
*/ #define OS_SHARED_MEMORY_NAME "/ogl_beamformer_shared_memory" -#define OS_EXPORT_PIPE_NAME "/tmp/beamformer_output_pipe" #define OS_PATH_SEPARATOR_CHAR '/' #define OS_PATH_SEPARATOR "/" @@ -92,29 +91,12 @@ os_round_up_to_page_size(iz value) function OS_ALLOC_ARENA_FN(os_alloc_arena) { - Arena result = old; - capacity = os_round_up_to_page_size(capacity); - iz old_size = old.end - old.beg; - if (old_size < capacity) { - if (old.beg) munmap(old.beg, old_size); - result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - if (result.beg == MAP_FAILED) - os_fatal(s8("os_alloc_arena: couldn't allocate memory\n")); - result.end = result.beg + capacity; - } - return result; -} - -function OS_CLOSE_FN(os_close) -{ - close(file); -} - -function OS_OPEN_FOR_WRITE_FN(os_open_for_write) -{ - iptr result = open(fname, O_WRONLY|O_TRUNC); - if (result == -1) - result = INVALID_FILE; + Arena result = {0}; + capacity = os_round_up_to_page_size(capacity); + result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (result.beg == MAP_FAILED) + os_fatal(s8("os_alloc_arena: couldn't allocate memory\n")); + result.end = result.beg + capacity; return result; } @@ -153,17 +135,6 @@ os_file_exists(char *path) return result; } -function OS_READ_FILE_FN(os_read_file) -{ - iz r = 0, total_read = 0; - do { - if (r != -1) - total_read += r; - r = read(file, buf + total_read, size - total_read); - } while (r); - return total_read; -} - function SharedMemoryRegion os_create_shared_memory_area(Arena *arena, char *name, i32 lock_count, iz requested_capacity) { diff --git a/os_win32.c b/os_win32.c @@ -1,7 +1,6 @@ /* See LICENSE for license details. */ #define OS_SHARED_MEMORY_NAME "Local\\ogl_beamformer_parameters" -#define OS_EXPORT_PIPE_NAME "\\\\.\\pipe\\beamformer_output_pipe" #define OS_PATH_SEPARATOR_CHAR '\\' #define OS_PATH_SEPARATOR "\\" @@ -15,7 +14,6 @@ #define PAGE_READWRITE 0x04 #define MEM_COMMIT 0x1000 #define MEM_RESERVE 0x2000 -#define MEM_RELEASE 0x8000 #define GENERIC_WRITE 0x40000000 #define GENERIC_READ 0x80000000 @@ -120,7 +118,6 @@ W32(i32) WakeByAddressAll(void *); W32(iptr) wglGetProcAddress(c8 *); W32(b32) WriteFile(iptr, u8 *, i32, i32 *, void *); W32(void *) VirtualAlloc(u8 *, iz, u32, u32); -W32(b32) VirtualFree(u8 *, iz, u32); #ifdef _DEBUG function void * @@ -197,27 +194,12 @@ os_round_up_to_page_size(iz value) function OS_ALLOC_ARENA_FN(os_alloc_arena) { - Arena result = old; - capacity = os_round_up_to_page_size(capacity); - iz old_size = old.end - old.beg; - if (old_size < capacity) { - if (old.beg) VirtualFree(old.beg, old_size, MEM_RELEASE); - result.beg = VirtualAlloc(0, capacity, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); - if (!result.beg) - os_fatal(s8("os_alloc_arena: couldn't allocate memory\n")); - result.end = result.beg + capacity; - } - return result; -} - -function OS_CLOSE_FN(os_close) -{ - CloseHandle(file); -} - -function OS_OPEN_FOR_WRITE_FN(os_open_for_write) -{ - iptr result = CreateFileA(fname, GENERIC_WRITE, 0, 0, OPEN_EXISTING, 0, 0); + Arena result = {0}; + capacity = os_round_up_to_page_size(capacity); + result.beg = VirtualAlloc(0, capacity, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); + if (!result.beg) + os_fatal(s8("os_alloc_arena: couldn't allocate memory\n")); + result.end = result.beg + capacity; return result; } @@ -243,13 +225,6 @@ function OS_READ_WHOLE_FILE_FN(os_read_whole_file) return result; } -function OS_READ_FILE_FN(os_read_file) -{ - i32 total_read = 0; - ReadFile(file, buf, size, 
&total_read, 0); - return total_read; -} - function OS_WRITE_NEW_FILE_FN(os_write_new_file) { b32 result = 0; diff --git a/static.c b/static.c @@ -180,7 +180,7 @@ function FILE_WATCH_CALLBACK_FN(reload_shader_indirect) BeamformerCtx *ctx = src->beamformer_context; BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue); if (work) { - work->type = BW_RELOAD_SHADER; + work->kind = BeamformerWorkKind_ReloadShader, work->shader_reload_context = src; beamform_work_queue_push_commit(ctx->beamform_work_queue); os_wake_waiters(&os->compute_worker.sync_variable); @@ -291,8 +291,8 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory) sm->version = BEAMFORMER_SHARED_MEMORY_VERSION; /* NOTE: default compute shader pipeline */ - sm->compute_stages[0] = ComputeShaderKind_Decode; - sm->compute_stages[1] = ComputeShaderKind_DASCompute; + sm->compute_stages[0] = BeamformerShaderKind_Decode; + sm->compute_stages[1] = BeamformerShaderKind_DASCompute; sm->compute_stages_count = 2; if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA @@ -325,15 +325,16 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory) ); #undef X + ComputeShaderCtx *cs = &ctx->csctx; #define X(e, sn, f, nh, pretty_name) do if (s8(f).len > 0) { \ ShaderReloadContext *src = push_struct(memory, typeof(*src)); \ src->beamformer_context = ctx; \ if (nh) src->header = compute_parameters_header; \ src->path = s8(static_path_join("shaders", f ".glsl")); \ src->name = src->path; \ - src->shader = ctx->csctx.programs + ShaderKind_##e; \ + src->shader = cs->programs + BeamformerShaderKind_##e; \ src->gl_type = GL_COMPUTE_SHADER; \ - src->kind = ShaderKind_##e; \ + src->kind = BeamformerShaderKind_##e; \ src->link = src; \ os_add_file_watch(&ctx->os, memory, src->path, reload_shader_indirect, (iptr)src); \ reload_shader_indirect(&ctx->os, src->path, (iptr)src, *memory); \ @@ -372,7 +373,7 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory) render_2d->path = s8(static_path_join("shaders", "render_2d.frag.glsl")); render_2d->name = s8("shaders/render_2d.glsl"); render_2d->gl_type = GL_FRAGMENT_SHADER; - render_2d->kind = ShaderKind_Render2D; + render_2d->kind = BeamformerShaderKind_Render2D; render_2d->shader = &fvr->shader; render_2d->header = s8("" "layout(location = 0) in vec2 texture_coordinate;\n" diff --git a/tests/throughput.c b/tests/throughput.c @@ -2,6 +2,7 @@ /* TODO(rnp): * [ ]: for finer grained evaluation of throughput latency just queue a data upload * without replacing the data. 
+ * [ ]: bug: we aren't inserting rf data between each frame */ #define LIB_FN function @@ -371,9 +372,9 @@ execute_study(s8 study, Arena arena, Stream path, Options *options) i32 shader_stages[16]; i32 shader_stage_count = 0; - if (options->cuda) shader_stages[shader_stage_count++] = ComputeShaderKind_CudaDecode; - else shader_stages[shader_stage_count++] = ComputeShaderKind_Decode; - shader_stages[shader_stage_count++] = ComputeShaderKind_DASCompute; + if (options->cuda) shader_stages[shader_stage_count++] = BeamformerShaderKind_CudaDecode; + else shader_stages[shader_stage_count++] = BeamformerShaderKind_Decode; + shader_stages[shader_stage_count++] = BeamformerShaderKind_DASCompute; set_beamformer_pipeline(shader_stages, shader_stage_count); @@ -428,7 +429,7 @@ main(i32 argc, char *argv[]) signal(SIGINT, sigint); - Arena arena = os_alloc_arena((Arena){0}, KB(8)); + Arena arena = os_alloc_arena(KB(8)); Stream path = stream_alloc(&arena, KB(4)); stream_append_s8(&path, c_str_to_s8(options.remaining[0])); stream_ensure_termination(&path, OS_PATH_SEPARATOR_CHAR); diff --git a/ui.c b/ui.c @@ -2032,8 +2032,8 @@ draw_compute_progress_bar(BeamformerUI *ui, Arena arena, ComputeProgressBar *sta function v2 draw_compute_stats_view(BeamformerCtx *ctx, Arena arena, Rect r) { - #define X(e, n, s, h, pn) [ComputeShaderKind_##e] = s8_comp(pn ":"), - read_only local_persist s8 labels[ComputeShaderKind_Count] = {COMPUTE_SHADERS}; + #define X(e, n, s, h, pn) [BeamformerShaderKind_##e] = s8_comp(pn ":"), + read_only local_persist s8 labels[BeamformerShaderKind_ComputeCount] = {COMPUTE_SHADERS}; #undef X BeamformerSharedMemory *sm = ctx->shared_memory.region; @@ -2043,14 +2043,14 @@ draw_compute_stats_view(BeamformerCtx *ctx, Arena arena, Rect r) u32 stages = sm->compute_stages_count; TextSpec text_spec = {.font = &ui->font, .colour = FG_COLOUR, .flags = TF_LIMITED}; - static_assert(ShaderKind_Count <= 32, "shader kind bitfield test"); + static_assert(countof(labels) <= 32, "shader kind bitfield test"); u32 seen_shaders = 0; Table *table = table_new(&arena, stages + 2, TextAlignment_Left, TextAlignment_Left, TextAlignment_Left); for (u32 i = 0; i < stages; i++) { TableCell *cells = table_push_row(table, &arena, TRK_CELLS)->data; Stream sb = arena_stream(arena); - ShaderKind index = (ShaderKind)sm->compute_stages[i]; + BeamformerShaderKind index = sm->compute_stages[i]; if ((seen_shaders & (1 << index)) == 0) { compute_time_sum += stats->average_times[index]; stream_append_f64_e(&sb, stats->average_times[index]); @@ -3019,8 +3019,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw b32 dispatch = ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, BeamformerSharedMemoryLockKind_DispatchCompute, 0); - sm->start_compute_from_main |= dispatch & - ctx->beamform_frames[ctx->display_frame_index].ready_to_present; + sm->start_compute_from_main |= dispatch & beamformer_get_newest_frame(ctx, 0)->ready_to_present; ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, lock); } } diff --git a/util.h b/util.h @@ -43,6 +43,7 @@ #define INVALID_CODE_PATH ASSERT(0) #define INVALID_DEFAULT_CASE default: ASSERT(0); break +#define InvalidCodePath assert(0) #define InvalidDefaultCase default: assert(0); break #define arg_list(type, ...) 
(type []){__VA_ARGS__}, sizeof((type []){__VA_ARGS__}) / sizeof(type) @@ -207,18 +208,14 @@ typedef union { .size = {.x = -F32_INFINITY, .y = -F32_INFINITY}} typedef struct { - iptr file; - char *name; -} Pipe; -#define INVALID_FILE (-1) - -typedef struct { u8 *data; u32 widx; u32 cap; b32 errors; } Stream; +#define INVALID_FILE (-1) + typedef struct OS OS; typedef struct { @@ -263,7 +260,7 @@ typedef struct { iptr os_context; } SharedMemoryRegion; -#define OS_ALLOC_ARENA_FN(name) Arena name(Arena old, iz capacity) +#define OS_ALLOC_ARENA_FN(name) Arena name(iz capacity) typedef OS_ALLOC_ARENA_FN(os_alloc_arena_fn); #define OS_ADD_FILE_WATCH_FN(name) void name(OS *os, Arena *a, s8 path, \ @@ -273,18 +270,9 @@ typedef OS_ADD_FILE_WATCH_FN(os_add_file_watch_fn); #define OS_WAKE_WORKER_FN(name) void name(GLWorkerThreadContext *ctx) typedef OS_WAKE_WORKER_FN(os_wake_worker_fn); -#define OS_CLOSE_FN(name) void name(iptr file) -typedef OS_CLOSE_FN(os_close_fn); - -#define OS_OPEN_FOR_WRITE_FN(name) iptr name(c8 *fname) -typedef OS_OPEN_FOR_WRITE_FN(os_open_for_write_fn); - #define OS_READ_WHOLE_FILE_FN(name) s8 name(Arena *arena, char *file) typedef OS_READ_WHOLE_FILE_FN(os_read_whole_file_fn); -#define OS_READ_FILE_FN(name) iz name(iptr file, void *buf, iz size) -typedef OS_READ_FILE_FN(os_read_file_fn); - #define OS_WAIT_ON_VALUE_FN(name) b32 name(i32 *value, i32 current, u32 timeout_ms) typedef OS_WAIT_ON_VALUE_FN(os_wait_on_value_fn); @@ -308,10 +296,6 @@ typedef OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock_fn); #define OS_FNS \ X(add_file_watch) \ - X(alloc_arena) \ - X(close) \ - X(open_for_write) \ - X(read_file) \ X(read_whole_file) \ X(shared_memory_region_lock) \ X(shared_memory_region_unlock) \ @@ -342,8 +326,6 @@ struct OS { iptr error_handle; GLWorkerThreadContext compute_worker; - char *export_pipe_name; - DEBUG_DECL(renderdoc_start_frame_capture_fn *start_frame_capture;) DEBUG_DECL(renderdoc_end_frame_capture_fn *end_frame_capture;) };