ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: 67bd1deee3a5fa0454f0ac8d6361492c2fa9daea
Parent: 59f09bb2efea630c3fea3d4a8d246d6dcec92b72
Author: Randy Palamar
Date:   Fri,  8 Nov 2024 14:58:30 -0700

restructure beamform submissions into work queue

This moves the program towards a state where we can export
arbitrary beamformed frames.
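
In outline, beamform submissions now flow through a singly linked FIFO with an
intrusive free list, and compute-type work is rate limited to one item per
drawn frame. A condensed sketch of the new types and the pop path, distilled
from the diff below (u32/b32 and the BW_* work types come from the project
headers; the per-type payload union is elided):

	typedef struct BeamformWork {
		struct BeamformWork *next;
		/* union of per-type payloads (compute_ctx, ssbo_copy_ctx, ...) elided */
		u32 type;
	} BeamformWork;

	typedef struct {
		BeamformWork *first;     /* FIFO head */
		BeamformWork *last;      /* FIFO tail */
		BeamformWork *next_free; /* free list of retired work items */
		b32 did_compute_this_frame;
	} BeamformWorkQueue;

	static BeamformWork *
	beamform_work_queue_pop(BeamformWorkQueue *q)
	{
		BeamformWork *result = q->first;
		if (result) {
			q->first = result->next;
			if (result == q->last)
				q->last = 0;
			switch (result->type) {
			case BW_FULL_COMPUTE:
			case BW_RECOMPUTE:
			case BW_PARTIAL_COMPUTE:
				/* NOTE: only one compute is allowed per frame */
				if (q->did_compute_this_frame) result = 0;
				else                           q->did_compute_this_frame = 1;
				break;
			}
		}
		return result;
	}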

Diffstat:
Mbeamformer.c | 603++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mbeamformer.h | 93+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Mbeamformer_parameters.h | 18++++++++++++------
Mmain_generic.c | 2+-
Mstatic.c | 28+++++++++-------------------
Mui.c | 26+++++++++++++++-----------
Mutil.c | 4++++
Mutil.h | 20++++++++++++++++++++
8 files changed, 486 insertions(+), 308 deletions(-)
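
One convention introduced below is worth noting before reading the diff: a
BeamformFrame carries one texture per transducer array plus one extra to sum
into, so the final output always lives in textures[dim.w - 1], and when frame
averaging is requested (output_points.w > 1) display switches to the dedicated
averaged_frame. The texture selection in the render pass reduces to roughly
the following (paraphrased from the new code in beamformer.c, not a verbatim
excerpt):

	u32 out_texture = 0;
	if (bp->output_points.w > 1) {
		/* frame averaging enabled: show the running average */
		out_texture = ctx->averaged_frame.textures[0];
	} else {
		BeamformFrame *f = ctx->beamform_frames + ctx->displayed_frame_index;
		/* dim.w == 0 means nothing has been beamformed yet */
		if (f->dim.w)
			out_texture = f->textures[f->dim.w - 1];
	}
	glBindTextureUnit(0, out_texture);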

diff --git a/beamformer.c b/beamformer.c @@ -3,8 +3,6 @@ static f32 dt_for_frame; -#include "ui.c" - static size decoded_data_size(ComputeShaderCtx *cs) { @@ -14,63 +12,57 @@ decoded_data_size(ComputeShaderCtx *cs) } static void -alloc_output_image(BeamformerCtx *ctx, Arena a) +alloc_beamform_frame(GLParams *gp, BeamformFrame *out, uv4 out_dim, u32 frame_index, s8 name) { - BeamformerParameters *bp = &ctx->params->raw; - ComputeShaderCtx *cs = &ctx->csctx; + glDeleteTextures(out->dim.w, out->textures); - u32 max_3d_dim = ctx->gl.max_3d_texture_dim; - ctx->out_data_dim.x = CLAMP(round_down_power_of_2(ORONE(bp->output_points.x)), 1, max_3d_dim); - ctx->out_data_dim.y = CLAMP(round_down_power_of_2(ORONE(bp->output_points.y)), 1, max_3d_dim); - ctx->out_data_dim.z = CLAMP(round_down_power_of_2(ORONE(bp->output_points.z)), 1, max_3d_dim); - ctx->out_data_dim.w = CLAMP(bp->output_points.w, 1, ARRAY_COUNT(cs->sum_textures)); - bp->output_points = ctx->out_data_dim; + out->dim.x = CLAMP(round_down_power_of_2(ORONE(out_dim.x)), 1, gp->max_3d_texture_dim); + out->dim.y = CLAMP(round_down_power_of_2(ORONE(out_dim.y)), 1, gp->max_3d_texture_dim); + out->dim.z = CLAMP(round_down_power_of_2(ORONE(out_dim.z)), 1, gp->max_3d_texture_dim); + out->dim.w = CLAMP(out_dim.w, 0, MAX_MULTI_XDC_COUNT); /* NOTE: allocate storage for beamformed output data; * this is shared between compute and fragment shaders */ - uv4 odim = ctx->out_data_dim; - u32 max_dim = MAX(odim.x, MAX(odim.y, odim.z)); - ctx->out_texture_mips = _tzcnt_u32(max_dim) + 1; - - glActiveTexture(GL_TEXTURE0); - glDeleteTextures(1, &ctx->out_texture); - glGenTextures(1, &ctx->out_texture); - glBindTexture(GL_TEXTURE_3D, ctx->out_texture); - glTexStorage3D(GL_TEXTURE_3D, ctx->out_texture_mips, GL_RG32F, odim.x, odim.y, odim.z); - LABEL_GL_OBJECT(GL_TEXTURE, ctx->out_texture, s8("Beamformed_Data_Texture")); - - Stream label = stream_alloc(&a, 256); - stream_append_s8(&label, s8("Sum_Texture_")); + u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z)); + out->mips = _tzcnt_u32(max_dim) + 1; + + u8 buf[256]; + Stream label = {.data = buf, .cap = ARRAY_COUNT(buf)}; + stream_append_s8(&label, name); + stream_append_byte(&label, '['); + stream_append_u64(&label, frame_index); + stream_append_s8(&label, s8("][")); u32 sidx = label.widx; - glDeleteTextures(ARRAY_COUNT(cs->sum_textures), cs->sum_textures); - if (odim.w > 1) { - glGenTextures(odim.w, cs->sum_textures); - for (u32 i = 0; i < odim.w; i++) { - glBindTexture(GL_TEXTURE_3D, cs->sum_textures[i]); - glTexStorage3D(GL_TEXTURE_3D, ctx->out_texture_mips, GL_RG32F, odim.x, odim.y, odim.z); - stream_append_u64(&label, i); - s8 slabel = stream_to_s8(&label); - LABEL_GL_OBJECT(GL_TEXTURE, cs->sum_textures[i], slabel); - label.widx = sidx; - } - } - bp->xdc_count = CLAMP(bp->xdc_count, 1, ARRAY_COUNT(cs->array_textures)); - glDeleteTextures(ARRAY_COUNT(cs->array_textures), cs->array_textures); - glGenTextures(bp->xdc_count, cs->array_textures); - for (u32 i = 0; i < bp->xdc_count; i++) { - glBindTexture(GL_TEXTURE_3D, cs->array_textures[i]); - glTexStorage3D(GL_TEXTURE_3D, ctx->out_texture_mips, GL_RG32F, odim.x, odim.y, odim.z); + glCreateTextures(GL_TEXTURE_3D, out->dim.w, out->textures); + for (u32 i = 0; i < out->dim.w; i++) { + glTextureStorage3D(out->textures[i], out->mips, GL_RG32F, + out->dim.x, out->dim.y, out->dim.z); + stream_append_u64(&label, i); + stream_append_byte(&label, ']'); + LABEL_GL_OBJECT(GL_TEXTURE, out->textures[i], stream_to_s8(&label)); + label.widx = sidx; } +} - 
UnloadRenderTexture(ctx->fsctx.output); - /* TODO: select odim.x vs odim.y */ - ctx->fsctx.output = LoadRenderTexture(odim.x, odim.z); - LABEL_GL_OBJECT(GL_FRAMEBUFFER, ctx->fsctx.output.id, s8("Rendered_View")); - GenTextureMipmaps(&ctx->fsctx.output.texture); - //SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_ANISOTROPIC_8X); - //SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_TRILINEAR); - SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_BILINEAR); +static void +alloc_output_image(BeamformerCtx *ctx, uv4 output_dim) +{ + uv4 try_dim = {.xyz = output_dim.xyz}; + if (!uv4_equal(try_dim, ctx->averaged_frame.dim)) { + alloc_beamform_frame(&ctx->gl, &ctx->averaged_frame, try_dim, 0, + s8("Beamformed_Averaged_Data")); + uv4 odim = ctx->averaged_frame.dim; + + UnloadRenderTexture(ctx->fsctx.output); + /* TODO: select odim.x vs odim.y */ + ctx->fsctx.output = LoadRenderTexture(odim.x, odim.z); + LABEL_GL_OBJECT(GL_FRAMEBUFFER, ctx->fsctx.output.id, s8("Rendered_View")); + GenTextureMipmaps(&ctx->fsctx.output.texture); + //SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_ANISOTROPIC_8X); + //SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_TRILINEAR); + SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_BILINEAR); + } } static void @@ -145,6 +137,95 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a) LABEL_GL_OBJECT(GL_BUFFER, cs->hadamard_ssbo, s8("Hadamard_SSBO")); } +static BeamformWork * +beamform_work_queue_pop(BeamformWorkQueue *q) +{ + BeamformWork *result = q->first; + if (result) { + q->first = result->next; + if (result == q->last) { + ASSERT(result->next == 0); + q->last = 0; + } + + switch (result->type) { + case BW_FULL_COMPUTE: + case BW_RECOMPUTE: + case BW_PARTIAL_COMPUTE: + /* NOTE: only one compute is allowed per frame */ + if (q->did_compute_this_frame) + result = 0; + else + q->did_compute_this_frame = 1; + break; + } + } + return result; +} + +static BeamformWork * +beamform_work_queue_push(BeamformerCtx *ctx, Arena *a, enum beamform_work work_type) +{ + BeamformWorkQueue *q = &ctx->beamform_work_queue; + ComputeShaderCtx *cs = &ctx->csctx; + + BeamformWork *result = q->next_free; + if (result) q->next_free = result->next; + else result = alloc(a, typeof(*result), 1); + + if (result) { + result->type = work_type; + result->next = 0; + + switch (work_type) { + case BW_FULL_COMPUTE: + /* TODO: limiting to make sure we don't have too many of these in the queue */ + cs->raw_data_index++; + if (cs->raw_data_index >= ARRAY_COUNT(cs->raw_data_fences)) + cs->raw_data_index = 0; + /* FALLTHROUGH */ + case BW_RECOMPUTE: { + i32 raw_index = cs->raw_data_index; + result->compute_ctx.raw_data_ssbo_index = raw_index; + /* NOTE: if this times out it means the command queue is more than 3 + * frames behind. 
In that case we need to re-evaluate the buffer size */ + if (cs->raw_data_fences[raw_index]) { + i32 result = glClientWaitSync(cs->raw_data_fences[raw_index], 0, + 10000); + if (result == GL_TIMEOUT_EXPIRED) { + //ASSERT(0); + } + glDeleteSync(cs->raw_data_fences[raw_index]); + cs->raw_data_fences[raw_index] = NULL; + } + ctx->displayed_frame_index++; + if (ctx->displayed_frame_index >= ARRAY_COUNT(ctx->beamform_frames)) + ctx->displayed_frame_index = 0; + result->compute_ctx.frame = ctx->beamform_frames + ctx->displayed_frame_index; + result->compute_ctx.first_pass = 1; + + uv4 try_dim = ctx->params->raw.output_points; + try_dim.w = ctx->params->raw.xdc_count; + if (!uv4_equal(result->compute_ctx.frame->dim, try_dim)) { + alloc_beamform_frame(&ctx->gl, result->compute_ctx.frame, try_dim, + ctx->displayed_frame_index, + s8("Beamformed_Data")); + } + } break; + case BW_PARTIAL_COMPUTE: + case BW_SAVE_FRAME: + case BW_SEND_FRAME: + case BW_SSBO_COPY: + break; + } + + if (q->last) q->last = q->last->next = result; + else q->last = q->first = result; + } + + return result; +} + static m3 v3_to_xdc_space(v3 direction, v3 origin, v3 corner1, v3 corner2) { @@ -173,65 +254,84 @@ f32_4_to_v4(f32 *in) return result; } +static void +do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale, + u32 out_texture, uv4 out_data_dim) +{ + /* NOTE: zero output before summing */ + glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0); + + glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); + glUniform1f(cs->sum_prescale_id, in_scale); + for (u32 i = 0; i < in_texture_count; i++) { + glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); + glDispatchCompute(ORONE(out_data_dim.x / 32), + ORONE(out_data_dim.y), + ORONE(out_data_dim.z / 32)); + glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + } +} + +static void +do_beamform_shader(ComputeShaderCtx *cs, BeamformerParameters *bp, BeamformFrame *frame, + u32 rf_ssbo, iv3 compute_dim_offset, i32 compute_pass) +{ + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, rf_ssbo); + glUniform3iv(cs->volume_export_dim_offset_id, 1, compute_dim_offset.E); + glUniform1i(cs->volume_export_pass_id, compute_pass); + + for (u32 i = 0; i < frame->dim.w; i++) { + u32 texture = frame->textures[i]; + m3 xdc_transform = v3_to_xdc_space((v3){.z = 1}, + f32_4_to_v4(bp->xdc_origin + (4 * i)).xyz, + f32_4_to_v4(bp->xdc_corner1 + (4 * i)).xyz, + f32_4_to_v4(bp->xdc_corner2 + (4 * i)).xyz); + glBindImageTexture(0, texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); + glUniform1i(cs->xdc_index_id, i); + glUniformMatrix3fv(cs->xdc_transform_id, 1, GL_FALSE, xdc_transform.E); + glDispatchCompute(ORONE(frame->dim.x / 32), frame->dim.y, + ORONE(frame->dim.z / 32)); + } +} + static b32 -do_volume_computation_step(BeamformerCtx *ctx, enum compute_shaders shader) +do_partial_compute_step(BeamformerCtx *ctx, BeamformFrame *frame) { - ComputeShaderCtx *cs = &ctx->csctx; - ExportCtx *e = &ctx->export_ctx; + ComputeShaderCtx *cs = &ctx->csctx; + PartialComputeCtx *pc = &ctx->partial_compute_ctx; b32 done = 0; /* NOTE: we start this elsewhere on the first dispatch so that we can include * times such as decoding/demodulation/etc. 
*/ - if (!(e->state & ES_TIMER_ACTIVE)) { - glQueryCounter(e->timer_ids[0], GL_TIMESTAMP); - e->state |= ES_TIMER_ACTIVE; + if (!(pc->state & PCS_TIMER_ACTIVE)) { + glQueryCounter(pc->timer_ids[0], GL_TIMESTAMP); + pc->state |= PCS_TIMER_ACTIVE; } - glUseProgram(cs->programs[shader]); + glUseProgram(cs->programs[pc->shader]); glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, e->rf_data_ssbo); - - glBindImageTexture(0, e->volume_texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_R32F); - glUniform1i(cs->volume_export_pass_id, 1); /* NOTE: We must tile this otherwise GL will kill us for taking too long */ /* TODO: this could be based on multiple dimensions */ - u32 dispatch_count = e->volume_dim.z / 32; - uv4 dim_offset = {.z = !!dispatch_count * 32 * e->dispatch_index++}; - glUniform3iv(cs->volume_export_dim_offset_id, 1, (i32 *)dim_offset.E); - glDispatchCompute(ORONE(e->volume_dim.x / 32), e->volume_dim.y, 1); - if (e->dispatch_index >= dispatch_count) { - e->dispatch_index = 0; - e->state &= ~ES_COMPUTING; - done = 1; + i32 dispatch_count = frame->dim.z / 32; + iv3 dim_offset = {.z = !!dispatch_count * 32 * pc->dispatch_index++}; + do_beamform_shader(cs, &ctx->params->raw, frame, pc->rf_data_ssbo, dim_offset, 1); + + if (pc->dispatch_index >= dispatch_count) { + pc->dispatch_index = 0; + pc->state &= ~PCS_COMPUTING; + done = 1; } - glQueryCounter(e->timer_ids[1], GL_TIMESTAMP); + glQueryCounter(pc->timer_ids[1], GL_TIMESTAMP); return done; } static void -do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale, - u32 out_texture, uv4 out_data_dim) -{ - /* NOTE: zero output before summing */ - glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0); - - glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); - glUniform1f(cs->sum_prescale_id, in_scale); - for (u32 i = 0; i < in_texture_count; i++) { - glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); - glDispatchCompute(ORONE(out_data_dim.x / 32), - ORONE(out_data_dim.y), - ORONE(out_data_dim.z / 32)); - glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); - } -} - -static void -do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) +do_compute_shader(BeamformerCtx *ctx, BeamformFrame *frame, u32 raw_data_index, + enum compute_shaders shader) { ComputeShaderCtx *csctx = &ctx->csctx; uv2 rf_raw_dim = ctx->params->raw.rf_raw_dim; @@ -244,22 +344,23 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) u32 output_ssbo_idx = !csctx->last_output_ssbo_index; u32 input_ssbo_idx = csctx->last_output_ssbo_index; + switch (shader) { case CS_HADAMARD: glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, - csctx->raw_data_index * rf_raw_size, rf_raw_size); + raw_data_index * rf_raw_size, rf_raw_size); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->hadamard_ssbo); glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32), ORONE(csctx->dec_data_dim.y / 32), ORONE(csctx->dec_data_dim.z)); - csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + csctx->raw_data_fences[raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; break; case CS_CUDA_DECODE: - ctx->cuda_lib.cuda_decode(csctx->raw_data_index * rf_raw_size, output_ssbo_idx); - csctx->raw_data_fences[csctx->raw_data_index] = 
glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + ctx->cuda_lib.cuda_decode(raw_data_index * rf_raw_size, output_ssbo_idx); + csctx->raw_data_fences[raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; break; case CS_CUDA_HILBERT: @@ -275,76 +376,42 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; break; case CS_MIN_MAX: { - u32 texture = ctx->out_texture; - for (u32 i = 1; i < ctx->out_texture_mips; i++) { + u32 texture = frame->textures[frame->dim.w - 1]; + for (u32 i = 1; i < frame->mips; i++) { glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); glUniform1i(csctx->mips_level_id, i); - u32 width = ctx->out_data_dim.x >> i; - u32 height = ctx->out_data_dim.y >> i; - u32 depth = ctx->out_data_dim.z >> i; + u32 width = frame->dim.x >> i; + u32 height = frame->dim.y >> i; + u32 depth = frame->dim.z >> i; glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32)); glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); } } break; case CS_HERCULES: case CS_UFORCES: { - if (ctx->export_ctx.state & ES_START) { - /* NOTE: on the first frame of compute make a copy of the rf data */ - size rf_size = decoded_data_size(csctx); - ctx->export_ctx.state &= ~ES_START; - ctx->export_ctx.state |= ES_COMPUTING; - glCopyNamedBufferSubData(csctx->rf_data_ssbos[input_ssbo_idx], - ctx->export_ctx.rf_data_ssbo, 0, 0, rf_size); - } - - BeamformerParameters *bp = &ctx->params->raw; - - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); - glUniform3iv(csctx->volume_export_dim_offset_id, 1, (i32 []){0, 0, 0}); - glUniform1i(csctx->volume_export_pass_id, 0); - - for (u32 i = 0; i < bp->xdc_count; i++) { - u32 texture; - if (bp->xdc_count == 1) { - if (ctx->out_data_dim.w > 1) { - texture = csctx->sum_textures[csctx->sum_texture_index]; - } else { - texture = ctx->out_texture; - } - } else { - texture = csctx->array_textures[i]; - } - - m3 xdc_transform = v3_to_xdc_space((v3){.z = 1}, - f32_4_to_v4(bp->xdc_origin + (4 * i)).xyz, - f32_4_to_v4(bp->xdc_corner1 + (4 * i)).xyz, - f32_4_to_v4(bp->xdc_corner2 + (4 * i)).xyz); - glBindImageTexture(0, texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); - glUniform1i(csctx->xdc_index_id, i); - glUniformMatrix3fv(csctx->xdc_transform_id, 1, GL_FALSE, xdc_transform.E); - glDispatchCompute(ORONE(ctx->out_data_dim.x / 32), - ctx->out_data_dim.y, - ORONE(ctx->out_data_dim.z / 32)); - } - if (bp->xdc_count > 1) { + u32 rf_ssbo = csctx->rf_data_ssbos[input_ssbo_idx]; + do_beamform_shader(csctx, &ctx->params->raw, frame, rf_ssbo, (iv3){0}, 0); + if (frame->dim.w > 1) { glUseProgram(csctx->programs[CS_SUM]); glBindBufferBase(GL_UNIFORM_BUFFER, 0, csctx->shared_ubo); - u32 out; - if (ctx->out_data_dim.w > 1) out = csctx->sum_textures[csctx->sum_texture_index]; - else out = ctx->out_texture; - do_sum_shader(csctx, csctx->array_textures, bp->xdc_count, - 1 / (f32)bp->xdc_count, out, ctx->out_data_dim); + u32 input_texture_count = frame->dim.w - 1; + do_sum_shader(csctx, frame->textures, input_texture_count, + 1 / (f32)input_texture_count, frame->textures[frame->dim.w - 1], + frame->dim); } } break; case CS_SUM: { - u32 frame_count = ctx->out_data_dim.w; - if (frame_count > 1) { - do_sum_shader(csctx, csctx->sum_textures, frame_count, 1 / (f32)frame_count, - ctx->out_texture, 
ctx->out_data_dim); - csctx->sum_texture_index = (csctx->sum_texture_index + 1) % frame_count; + u32 frame_count = ctx->params->raw.output_points.w; + u32 in_textures[MAX_BEAMFORMED_SAVED_FRAMES]; + for (u32 i = 0; i < frame_count; i++) { + u32 idx = (ctx->displayed_frame_index - i) % ARRAY_COUNT(ctx->beamform_frames); + BeamformFrame *frame = ctx->beamform_frames + idx; + in_textures[i] = frame->textures[frame->dim.w - 1]; } + do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count, + ctx->averaged_frame.textures[0], ctx->averaged_frame.dim); } break; default: ASSERT(0); } @@ -353,16 +420,101 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) } static void -check_compute_timers(ComputeShaderCtx *cs, ExportCtx *e, BeamformerParametersFull *bp) +do_beamform_work(BeamformerCtx *ctx, Arena *a) +{ + BeamformerParameters *bp = &ctx->params->raw; + BeamformWorkQueue *q = &ctx->beamform_work_queue; + BeamformWork *work = beamform_work_queue_pop(q); + ComputeShaderCtx *cs = &ctx->csctx; + + while (work) { + switch (work->type) { + case BW_PARTIAL_COMPUTE: { + BeamformFrame *frame = work->compute_ctx.frame; + + if (work->compute_ctx.first_pass) { + if (ctx->params->upload) { + glNamedBufferSubData(cs->shared_ubo, 0, sizeof(*bp), bp); + ctx->params->upload = 0; + } + + /* TODO: maybe we should have some concept of compute shader + * groups, then we could define a group that does the decoding + * and filtering and apply that group directly here. For now + * we will do this dumb thing */ + u32 stage_count = ctx->params->compute_stages_count; + enum compute_shaders *stages = ctx->params->compute_stages; + for (u32 i = 0; i < stage_count; i++) { + if (stages[i] == CS_UFORCES || stages[i] == CS_HERCULES) { + /* TODO: this is not a proper solution if we have + * more beamforming shaders */ + ctx->partial_compute_ctx.shader = stages[i]; + break; + } + do_compute_shader(ctx, frame, + work->compute_ctx.raw_data_ssbo_index, + stages[i]); + } + u32 output_ssbo = ctx->partial_compute_ctx.rf_data_ssbo; + u32 input_ssbo = cs->last_output_ssbo_index; + size rf_size = decoded_data_size(cs); + glCopyNamedBufferSubData(cs->rf_data_ssbos[input_ssbo], + output_ssbo, 0, 0, rf_size); + } + + b32 done = do_partial_compute_step(ctx, frame); + if (!done) { + BeamformWork *new; + /* NOTE: this push must not fail */ + new = beamform_work_queue_push(ctx, a, BW_PARTIAL_COMPUTE); + new->compute_ctx.first_pass = 0; + } + } break; + case BW_FULL_COMPUTE: + case BW_RECOMPUTE: { + BeamformFrame *frame = work->compute_ctx.frame; + + if (work->compute_ctx.first_pass) { + if (ctx->params->upload) { + glNamedBufferSubData(cs->shared_ubo, 0, sizeof(*bp), bp); + ctx->params->upload = 0; + } + } + + u32 stage_count = ctx->params->compute_stages_count; + enum compute_shaders *stages = ctx->params->compute_stages; + for (u32 i = 0; i < stage_count; i++) + do_compute_shader(ctx, frame, work->compute_ctx.raw_data_ssbo_index, + stages[i]); + ctx->flags |= GEN_MIPMAPS; + } break; + } + + + work->next = q->next_free; + q->next_free = work; + work = beamform_work_queue_pop(q); + } + + if (q->did_compute_this_frame) { + u32 tidx = ctx->csctx.timer_index; + glDeleteSync(ctx->csctx.timer_fences[tidx]); + ctx->csctx.timer_fences[tidx] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + ctx->csctx.timer_index = (tidx + 1) % ARRAY_COUNT(ctx->csctx.timer_fences); + } +} + +static void +check_compute_timers(ComputeShaderCtx *cs, PartialComputeCtx *pc, BeamformerParametersFull *bp) { /* NOTE: volume generation running timer */ 
- if (e->state & ES_TIMER_ACTIVE) { + if (pc->state & PCS_TIMER_ACTIVE) { u64 start_ns = 0, end_ns = 0; - glGetQueryObjectui64v(e->timer_ids[0], GL_QUERY_RESULT, &start_ns); - glGetQueryObjectui64v(e->timer_ids[1], GL_QUERY_RESULT, &end_ns); + glGetQueryObjectui64v(pc->timer_ids[0], GL_QUERY_RESULT, &start_ns); + glGetQueryObjectui64v(pc->timer_ids[1], GL_QUERY_RESULT, &end_ns); u64 elapsed_ns = end_ns - start_ns; - e->runtime += (f32)elapsed_ns * 1e-9; - e->state &= ~ES_TIMER_ACTIVE; + pc->runtime += (f32)elapsed_ns * 1e-9; + pc->state &= ~PCS_TIMER_ACTIVE; } /* NOTE: main timers for display portion of the program */ @@ -384,8 +536,10 @@ check_compute_timers(ComputeShaderCtx *cs, ExportCtx *e, BeamformerParametersFul } } +#include "ui.c" + DEBUG_EXPORT void -do_beamformer(BeamformerCtx *ctx, Arena arena) +do_beamformer(BeamformerCtx *ctx, Arena *arena) { dt_for_frame = GetFrameTime(); @@ -395,108 +549,53 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) } /* NOTE: Store the compute time for the last frame. */ - check_compute_timers(&ctx->csctx, &ctx->export_ctx, ctx->params); + check_compute_timers(&ctx->csctx, &ctx->partial_compute_ctx, ctx->params); BeamformerParameters *bp = &ctx->params->raw; /* NOTE: Check for and Load RF Data into GPU */ if (ctx->platform.poll_pipe(ctx->data_pipe)) { - ComputeShaderCtx *cs = &ctx->csctx; - if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim)) - alloc_shader_storage(ctx, arena); - - if (!uv4_equal(ctx->out_data_dim, bp->output_points)) - alloc_output_image(ctx, arena); - - cs->raw_data_index = (cs->raw_data_index + 1) % ARRAY_COUNT(cs->raw_data_fences); - i32 raw_index = ctx->csctx.raw_data_index; - /* NOTE: if this times out it means the command queue is more than 3 frames behind. - * In that case we need to re-evaluate the buffer size */ - if (ctx->csctx.raw_data_fences[raw_index]) { - i32 result = glClientWaitSync(cs->raw_data_fences[raw_index], 0, 10000); - if (result == GL_TIMEOUT_EXPIRED) { - //ASSERT(0); + BeamformWork *work = beamform_work_queue_push(ctx, arena, BW_FULL_COMPUTE); + /* NOTE: we can only read in the new data if we get back a work item. 
+ * otherwise we have too many frames in flight and should wait until the + * next frame to try again */ + if (work) { + ComputeShaderCtx *cs = &ctx->csctx; + if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim)) { + alloc_shader_storage(ctx, *arena); + /* TODO: we may need to invalidate all queue items here */ } - glDeleteSync(cs->raw_data_fences[raw_index]); - cs->raw_data_fences[raw_index] = NULL; - } - uv2 rf_raw_dim = cs->rf_raw_dim; - size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16); - - void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size; - size rlen = ctx->platform.read_pipe(ctx->data_pipe, rf_data_buf, rf_raw_size); - if (rlen != rf_raw_size) { - ctx->partial_transfer_count++; - } else { - ctx->flags |= DO_COMPUTE; - switch (ctx->gl.vendor_id) { - case GL_VENDOR_INTEL: - /* TODO: intel complains about this buffer being busy even with - * MAP_UNSYNCHRONIZED_BIT */ - case GL_VENDOR_AMD: - break; - case GL_VENDOR_NVIDIA: - glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rf_raw_size, - rf_raw_size, rf_data_buf); + u32 raw_index = work->compute_ctx.raw_data_ssbo_index; + uv2 rf_raw_dim = cs->rf_raw_dim; + size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16); + void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size; + + alloc_output_image(ctx, bp->output_points); + + size rlen = ctx->platform.read_pipe(ctx->data_pipe, rf_data_buf, rf_raw_size); + if (rlen != rf_raw_size) { + stream_append_s8(&ctx->error_stream, s8("Partial Read Occurred: ")); + stream_append_i64(&ctx->error_stream, rlen); + stream_append_byte(&ctx->error_stream, '/'); + stream_append_i64(&ctx->error_stream, rf_raw_size); + stream_append_s8(&ctx->error_stream, s8("\n\0")); + TraceLog(LOG_WARNING, (c8 *)stream_to_s8(&ctx->error_stream).data); + ctx->error_stream.widx = 0; + } else { + switch (ctx->gl.vendor_id) { + case GL_VENDOR_INTEL: + case GL_VENDOR_AMD: + break; + case GL_VENDOR_NVIDIA: + glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rlen, + rlen, rf_data_buf); + } } } } - /* NOTE: we are starting a volume computation on this frame so make some space */ - if (ctx->export_ctx.state & ES_START) { - ExportCtx *e = &ctx->export_ctx; - e->runtime = 0; - uv4 edim = e->volume_dim; - - /* NOTE: get a timestamp here which will include decoding/demodulating/etc. 
*/ - glQueryCounter(e->timer_ids[0], GL_TIMESTAMP); - e->state |= ES_TIMER_ACTIVE; - - glDeleteTextures(1, &e->volume_texture); - glCreateTextures(GL_TEXTURE_3D, 1, &e->volume_texture); - glTextureStorage3D(e->volume_texture, 1, GL_R32F, edim.x, edim.y, edim.z); - LABEL_GL_OBJECT(GL_TEXTURE, e->volume_texture, s8("Beamformed_Volume")); - - glDeleteBuffers(1, &e->rf_data_ssbo); - glCreateBuffers(1, &e->rf_data_ssbo); - glNamedBufferStorage(e->rf_data_ssbo, decoded_data_size(&ctx->csctx), 0, 0); - LABEL_GL_OBJECT(GL_BUFFER, e->rf_data_ssbo, s8("Volume_RF_SSBO")); - } - - if (ctx->flags & DO_COMPUTE || ctx->export_ctx.state & ES_START) { - if (ctx->params->upload && !(ctx->export_ctx.state & ES_COMPUTING)) { - glNamedBufferSubData(ctx->csctx.shared_ubo, 0, sizeof(*bp), bp); - ctx->params->upload = 0; - } - - u32 stages = ctx->params->compute_stages_count; - for (u32 i = 0; i < stages; i++) { - do_compute_shader(ctx, ctx->params->compute_stages[i]); - } - ctx->flags &= ~DO_COMPUTE; - ctx->flags |= GEN_MIPMAPS; - - u32 tidx = ctx->csctx.timer_index; - glDeleteSync(ctx->csctx.timer_fences[tidx]); - ctx->csctx.timer_fences[tidx] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - ctx->csctx.timer_index = (tidx + 1) % ARRAY_COUNT(ctx->csctx.timer_fences); - } - - if (ctx->export_ctx.state & ES_COMPUTING) { - /* TODO: this could probably be adapted to do FORCES as well */ - b32 done = do_volume_computation_step(ctx, CS_HERCULES); - if (done) { - ExportCtx *e = &ctx->export_ctx; - uv4 dim = e->volume_dim; - size volume_out_size = dim.x * dim.y * dim.z * sizeof(f32); - e->volume_buf = ctx->platform.alloc_arena(e->volume_buf, volume_out_size); - glGetTextureImage(e->volume_texture, 0, GL_RED, GL_FLOAT, volume_out_size, - e->volume_buf.beg); - s8 raw = {.len = volume_out_size, .data = e->volume_buf.beg}; - if (!ctx->platform.write_new_file("raw_volume.bin", raw)) - TraceLog(LOG_WARNING, "failed to write output volume\n"); - } - } + ctx->beamform_work_queue.did_compute_this_frame = 0; + do_beamform_work(ctx, arena); /* NOTE: draw output image texture using render fragment shader */ BeginTextureMode(ctx->fsctx.output); @@ -504,7 +603,15 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) BeginShaderMode(ctx->fsctx.shader); FragmentShaderCtx *fs = &ctx->fsctx; glUseProgram(fs->shader.id); - glBindTextureUnit(0, ctx->out_texture); + u32 out_texture = 0; + if (bp->output_points.w > 1) { + out_texture = ctx->averaged_frame.textures[0]; + } else { + BeamformFrame *f = ctx->beamform_frames + ctx->displayed_frame_index; + /* NOTE: verify we have actually beamformed something yet */ + if (f->dim.w) out_texture = f->textures[f->dim.w - 1]; + } + glBindTextureUnit(0, out_texture); glUniform1f(fs->db_cutoff_id, fs->db); glUniform1f(fs->threshold_id, fs->threshold); DrawTexture(fs->output.texture, 0, 0, WHITE); @@ -520,7 +627,7 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) ctx->flags &= ~GEN_MIPMAPS; } - draw_ui(ctx, arena); + draw_ui(ctx, *arena); if (IsKeyPressed(KEY_R)) ctx->flags |= RELOAD_SHADERS; diff --git a/beamformer.h b/beamformer.h @@ -29,8 +29,7 @@ enum program_flags { SHOULD_EXIT = 1 << 0, RELOAD_SHADERS = 1 << 1, - GEN_MIPMAPS = 1 << 29, - DO_COMPUTE = 1 << 30, + GEN_MIPMAPS = 1 << 30, }; enum gl_vendor_ids { @@ -128,14 +127,6 @@ typedef struct { GLsync timer_fences[MAX_FRAMES_IN_FLIGHT]; f32 last_frame_time[CS_LAST]; - /* NOTE: circular buffer of textures for averaging. 
- * Only allocated up to configured frame average count */ - u32 sum_textures[16]; - u32 sum_texture_index; - - /* NOTE: array output textures. Only allocated up to configured array count */ - u32 array_textures[4]; - /* NOTE: the raw_data_ssbo is allocated at 3x the required size to allow for tiled * transfers when the GPU is running behind the CPU. It is not mapped on NVIDIA because * their drivers _will_ store the buffer in the system memory. This doesn't happen @@ -174,22 +165,32 @@ typedef struct { f32 threshold; } FragmentShaderCtx; -enum export_state { - ES_START = (1 << 0), - ES_COMPUTING = (1 << 1), - ES_TIMER_ACTIVE = (1 << 2), +enum { + PCS_COMPUTING, + PCS_TIMER_ACTIVE, }; typedef struct { - Arena volume_buf; + /* NOTE: we always have one extra texture to sum into; thus the final output data + * is always found in textures[dim.w - 1] */ + u32 textures[MAX_MULTI_XDC_COUNT + 1]; + uv4 dim; + u32 mips; +} BeamformFrame; + +typedef struct { + /* TODO: possibly both of these should be stored elsewhere */ + Arena export_buf; uv4 volume_dim; + + BeamformFrame frame; u32 timer_ids[2]; f32 runtime; - u32 volume_texture; u32 rf_data_ssbo; - u32 state; + u32 shader; u32 dispatch_index; -} ExportCtx; + u32 state; +} PartialComputeCtx; typedef struct { enum gl_vendor_ids vendor_id; @@ -201,6 +202,49 @@ typedef struct { i32 max_ubo_size; } GLParams; +enum beamform_work { + BW_FULL_COMPUTE, + BW_RECOMPUTE, + BW_PARTIAL_COMPUTE, + BW_SAVE_FRAME, + BW_SEND_FRAME, + BW_SSBO_COPY, +}; + +typedef struct { + u32 source_ssbo; + u32 dest_ssbo; +} BeamformSSBOCopy; + +typedef struct { + BeamformFrame *frame; + u32 raw_data_ssbo_index; + b32 first_pass; +} BeamformCompute; + +typedef struct { + BeamformFrame *frame; + iptr output_handle; +} BeamformOutputFrame; + +/* NOTE: discriminated union based on type */ +typedef struct BeamformWork { + struct BeamformWork *next; + union { + BeamformSSBOCopy ssbo_copy_ctx; + BeamformCompute compute_ctx; + BeamformOutputFrame output_frame_ctx; + }; + u32 type; +} BeamformWork; + +typedef struct { + BeamformWork *first; + BeamformWork *last; + BeamformWork *next_free; + b32 did_compute_this_frame; +} BeamformWorkQueue; + typedef struct BeamformerCtx { GLParams gl; @@ -213,13 +257,14 @@ typedef struct BeamformerCtx { InputState is; - uv4 out_data_dim; - u32 out_texture; - u32 out_texture_mips; + BeamformFrame beamform_frames[MAX_BEAMFORMED_SAVED_FRAMES]; + u32 displayed_frame_index; + /* NOTE: this will only be used when we are averaging */ + BeamformFrame averaged_frame; ComputeShaderCtx csctx; FragmentShaderCtx fsctx; - ExportCtx export_ctx; + PartialComputeCtx partial_compute_ctx; Pipe data_pipe; u32 partial_transfer_count; @@ -228,9 +273,11 @@ typedef struct BeamformerCtx { Platform platform; Stream error_stream; + BeamformWorkQueue beamform_work_queue; + BeamformerParametersFull *params; } BeamformerCtx; -#define LABEL_GL_OBJECT(type, id, s) glObjectLabel(type, id, (s).len, (c8 *)(s).data) +#define LABEL_GL_OBJECT(type, id, s) {s8 _s = (s); glObjectLabel(type, id, _s.len, (c8 *)_s.data);} #endif /*_BEAMFORMER_H_ */ diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -11,14 +11,16 @@ enum compute_shaders { CS_LAST }; +#define MAX_BEAMFORMED_SAVED_FRAMES 16 +#define MAX_MULTI_XDC_COUNT 4 /* NOTE: This struct follows the OpenGL std140 layout. 
DO NOT modify unless you have * read and understood the rules, particulary with regards to _member alignment_ */ typedef struct { u16 channel_mapping[512]; /* Transducer Channel to Verasonics Channel */ u32 uforces_channels[128]; /* Channels used for virtual UFORCES elements */ - f32 xdc_origin[16]; /* [m] (4 v4s) Corner of transducer being treated as origin */ - f32 xdc_corner1[16]; /* [m] (4 v4s) Corner of transducer along first axis (arbitrary) */ - f32 xdc_corner2[16]; /* [m] (4 v4s) Corner of transducer along second axis (arbitrary) */ + f32 xdc_origin[4 * MAX_MULTI_XDC_COUNT]; /* [m] Corner of transducer being treated as origin */ + f32 xdc_corner1[4 * MAX_MULTI_XDC_COUNT]; /* [m] Corner of transducer along first axis */ + f32 xdc_corner2[4 * MAX_MULTI_XDC_COUNT]; /* [m] Corner of transducer along second axis */ uv4 dec_data_dim; /* Samples * Channels * Acquisitions; last element ignored */ uv4 output_points; /* Width * Height * Depth * (Frame Average Count) */ v4 output_min_coordinate; /* [m] Back-Top-Left corner of output region (w ignored) */ @@ -36,15 +38,19 @@ typedef struct { f32 _pad[1]; } BeamformerParameters; +/* NOTE: garbage to get the prepocessor to properly stringize the value of a macro */ +#define str_(x) #x +#define str(x) str_(x) + #define COMPUTE_SHADER_HEADER "\ #version 460 core\n\ \n\ layout(std140, binding = 0) uniform parameters {\n\ uvec4 channel_mapping[64]; /* Transducer Channel to Verasonics Channel */\n\ uvec4 uforces_channels[32]; /* Channels used for virtual UFORCES elements */\n\ - vec4 xdc_origin[4]; /* [m] Corner of transducer being treated as origin */\n\ - vec4 xdc_corner1[4]; /* [m] Corner of transducer along first axis (arbitrary) */\n\ - vec4 xdc_corner2[4]; /* [m] Corner of transducer along second axis (arbitrary) */\n\ + vec4 xdc_origin[" str(MAX_MULTI_XDC_COUNT) "]; /* [m] Corner of transducer being treated as origin */\n\ + vec4 xdc_corner1[" str(MAX_MULTI_XDC_COUNT) "]; /* [m] Corner of transducer along first axis (arbitrary) */\n\ + vec4 xdc_corner2[" str(MAX_MULTI_XDC_COUNT) "]; /* [m] Corner of transducer along second axis (arbitrary) */\n\ uvec4 dec_data_dim; /* Samples * Channels * Acquisitions; last element ignored */\n\ uvec4 output_points; /* Width * Height * Depth * (Frame Average Count) */\n\ vec4 output_min_coord; /* [m] Top left corner of output region */\n\ diff --git a/main_generic.c b/main_generic.c @@ -44,7 +44,7 @@ main(void) setup_beamformer(&ctx, temp_memory); while(!(ctx.flags & SHOULD_EXIT)) { - do_program_step(&ctx, temp_memory); + do_program_step(&ctx, &temp_memory); } /* NOTE: make sure this will get cleaned up after external diff --git a/static.c b/static.c @@ -20,7 +20,8 @@ static struct { #else static void *debug_lib; -typedef void do_beamformer_fn(BeamformerCtx *, Arena); +/* TODO: move this to a header */ +typedef void do_beamformer_fn(BeamformerCtx *, Arena *); static do_beamformer_fn *do_beamformer; static void @@ -173,13 +174,6 @@ compile_shader(Arena a, u32 type, s8 shader) } static void -init_fragment_shader_ctx(FragmentShaderCtx *ctx, uv4 out_data_dim) -{ - ctx->db = -50.0f; - ctx->threshold = 40.0f; -} - -static void reload_shaders(BeamformerCtx *ctx, Arena a) { ComputeShaderCtx *csctx = &ctx->csctx; @@ -211,7 +205,6 @@ reload_shaders(BeamformerCtx *ctx, Arena a) glDeleteProgram(csctx->programs[i]); csctx->programs[i] = rlLoadComputeShaderProgram(shader_id); LABEL_GL_OBJECT(GL_PROGRAM, csctx->programs[i], compute_shaders[i].label); - ctx->flags |= DO_COMPUTE; } glDeleteShader(shader_id); @@ -262,8 +255,7 
@@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory) { ctx->window_size = (uv2){.w = 1280, .h = 840}; - ctx->out_data_dim = (uv4){.x = 1, .y = 1, .z = 1}; - ctx->export_ctx.volume_dim = (uv4){.x = 1, .y = 1, .z = 1}; + ctx->partial_compute_ctx.volume_dim = (uv4){.x = 1, .y = 1, .z = 1}; SetConfigFlags(FLAG_VSYNC_HINT); InitWindow(ctx->window_size.w, ctx->window_size.h, "OGL Beamformer"); @@ -280,7 +272,8 @@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory) ctx->font = LoadFontEx("assets/IBMPlexSans-Bold.ttf", 28, 0, 0); ctx->small_font = LoadFontEx("assets/IBMPlexSans-Bold.ttf", 22, 0, 0); - init_fragment_shader_ctx(&ctx->fsctx, ctx->out_data_dim); + ctx->fsctx.db = -50.0f; + ctx->fsctx.threshold = 40.0f; ctx->data_pipe = os_open_named_pipe(OS_PIPE_NAME); ctx->params = os_open_shared_memory_area(OS_SMEM_NAME, sizeof(ctx->params)); @@ -288,7 +281,6 @@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory) ASSERT(ctx->data_pipe.file != INVALID_FILE); ASSERT(ctx->params); - ctx->params->raw.output_points = ctx->out_data_dim; /* NOTE: default compute shader pipeline */ ctx->params->compute_stages[0] = CS_HADAMARD; ctx->params->compute_stages[1] = CS_DEMOD; @@ -311,15 +303,13 @@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory) LABEL_GL_OBJECT(GL_BUFFER, ctx->csctx.shared_ubo, s8("Beamformer_Parameters")); glGenQueries(ARRAY_COUNT(ctx->csctx.timer_fences) * CS_LAST, (u32 *)ctx->csctx.timer_ids); - glGenQueries(ARRAY_COUNT(ctx->export_ctx.timer_ids), ctx->export_ctx.timer_ids); + glGenQueries(ARRAY_COUNT(ctx->partial_compute_ctx.timer_ids), ctx->partial_compute_ctx.timer_ids); - /* NOTE: do not DO_COMPUTE on first frame */ reload_shaders(ctx, temp_memory); - ctx->flags &= ~DO_COMPUTE; } static void -do_program_step(BeamformerCtx *ctx, Arena temp_memory) +do_program_step(BeamformerCtx *ctx, Arena *memory) { do_debug(&ctx->error_stream); if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA) @@ -327,8 +317,8 @@ do_program_step(BeamformerCtx *ctx, Arena temp_memory) if (ctx->flags & RELOAD_SHADERS) { ctx->flags &= ~RELOAD_SHADERS; - reload_shaders(ctx, temp_memory); + reload_shaders(ctx, *memory); } - do_beamformer(ctx, temp_memory); + do_beamformer(ctx, memory); } diff --git a/ui.c b/ui.c @@ -2,10 +2,13 @@ static void ui_start_compute(BeamformerCtx *ctx) { - ctx->flags |= DO_COMPUTE; - if (ctx->params->raw.output_points.w > 1) { - for (u32 i = 0; i < ctx->params->raw.output_points.w; i++) - glClearTexImage(ctx->csctx.sum_textures[i], 0, GL_RED, GL_FLOAT, 0); + /* NOTE: we do not allow ui to start a work if no work was previously completed */ + Arena a = {0}; + beamform_work_queue_push(ctx, &a, BW_RECOMPUTE); + for (u32 i = 0; i < ARRAY_COUNT(ctx->beamform_frames); i++) { + BeamformFrame *frame = ctx->beamform_frames + i; + if (frame->dim.w && frame->textures[frame->dim.w - 1]) + glClearTexImage(frame->textures[frame->dim.w - 1], 0, GL_RED, GL_FLOAT, 0); } ctx->params->upload = 1; } @@ -560,19 +563,21 @@ draw_settings_ui(BeamformerCtx *ctx, Arena arena, Rect r, v2 mouse) draw_r.pos.y += 2 * LISTING_LINE_PAD; draw_r.size.y -= 2 * LISTING_LINE_PAD; - bmv = (BPModifiableValue){&ctx->export_ctx.volume_dim.x, bmv_store_power_of_two, + #if 0 + /* TODO: work this into the work queue */ + bmv = (BPModifiableValue){&ctx->partial_compute_ctx.volume_dim.x, bmv_store_power_of_two, .ilimits = (iv2){.x = 1, .y = ctx->gl.max_3d_texture_dim}, MV_INT, 1, 1}; draw_r = do_text_input_listing(s8("Export Dimension X:"), s8(""), bmv, ctx, arena, draw_r, mouse, hover_t + idx++); - bmv = 
(BPModifiableValue){&ctx->export_ctx.volume_dim.y, bmv_store_power_of_two, + bmv = (BPModifiableValue){&ctx->partial_compute_ctx.volume_dim.y, bmv_store_power_of_two, .ilimits = (iv2){.x = 1, .y = ctx->gl.max_3d_texture_dim}, MV_INT, 1, 1}; draw_r = do_text_input_listing(s8("Export Dimension Y:"), s8(""), bmv, ctx, arena, draw_r, mouse, hover_t + idx++); - bmv = (BPModifiableValue){&ctx->export_ctx.volume_dim.z, bmv_store_power_of_two, + bmv = (BPModifiableValue){&ctx->partial_compute_ctx.volume_dim.z, bmv_store_power_of_two, .ilimits = (iv2){.x = 1, .y = ctx->gl.max_3d_texture_dim}, MV_INT, 1, 1}; draw_r = do_text_input_listing(s8("Export Dimension Z:"), s8(""), bmv, ctx, arena, @@ -582,11 +587,10 @@ draw_settings_ui(BeamformerCtx *ctx, Arena arena, Rect r, v2 mouse) btn_r.size.h = ctx->font.baseSize * 1.3; btn_r.size.w *= 0.6; if (do_text_button(ctx, s8("Dump Raw Volume"), btn_r, mouse, hover_t + idx++)) { - if (!ctx->export_ctx.state) { - ctx->export_ctx.state = ES_START; - ctx->flags |= DO_COMPUTE; + if (!ctx->partial_compute_ctx.state) { } } + #endif /* NOTE: if C compilers didn't suck this would be a static assert */ ASSERT(idx <= ARRAY_COUNT(hover_t)); @@ -631,7 +635,7 @@ draw_debug_overlay(BeamformerCtx *ctx, Arena arena, Rect r) } static s8 totals[2] = {s8("Compute Total:"), s8("Volume Total:")}; - f32 times[2] = {compute_time_sum, ctx->export_ctx.runtime}; + f32 times[2] = {compute_time_sum, ctx->partial_compute_ctx.runtime}; for (u32 i = 0; i < ARRAY_COUNT(totals); i++) { pos.y -= measure_text(ctx->font, totals[i]).y; draw_text(ctx->font, totals[i], pos, 0, colour_from_normalized(FG_COLOUR)); diff --git a/util.c b/util.c @@ -47,6 +47,10 @@ mem_move(u8 *src, u8 *dest, size n) static void * alloc_(Arena *a, size len, size align, size count) { + /* NOTE: special case 0 arena */ + if (a->beg == 0) + return 0; + size padding = -(uintptr_t)a->beg & (align - 1); size available = a->end - a->beg - padding; if (available < 0 || count > available / len) diff --git a/util.h b/util.h @@ -11,6 +11,10 @@ #define asm __asm__ #endif +#ifndef typeof +#define typeof __typeof__ +#endif + #ifndef unreachable #ifdef _MSC_VER #define unreachable() __assume(0) @@ -62,6 +66,7 @@ typedef ptrdiff_t size; typedef ptrdiff_t iptr; typedef struct { u8 *beg, *end; } Arena; +typedef struct { Arena *arena; u8 *old_beg; } TempArena; typedef struct { size len; u8 *data; } s8; #define s8(s) (s8){.len = ARRAY_COUNT(s) - 1, .data = (u8 *)s} @@ -79,13 +84,28 @@ typedef union { } iv2; typedef union { + struct { i32 x, y, z; }; + struct { i32 w, h, d; }; + iv2 xy; + i32 E[3]; +} iv3; + +typedef union { struct { u32 x, y; }; struct { u32 w, h; }; u32 E[2]; } uv2; typedef union { + struct { u32 x, y, z; }; + struct { u32 w, h, d; }; + uv2 xy; + u32 E[3]; +} uv3; + +typedef union { struct { u32 x, y, z, w; }; + uv3 xyz; u32 E[4]; } uv4;
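
Taken together, the per-frame flow in do_beamformer() now reduces to roughly
the following (heavily condensed sketch of the new code above; parameter
upload, error reporting, and the draw/UI passes are elided):

	BeamformerParameters *bp = &ctx->params->raw;

	if (ctx->platform.poll_pipe(ctx->data_pipe)) {
		/* new RF data is only read if a work item is available; otherwise
		 * too many frames are already in flight and we retry next frame */
		BeamformWork *work = beamform_work_queue_push(ctx, arena, BW_FULL_COMPUTE);
		if (work) {
			alloc_output_image(ctx, bp->output_points);
			/* read_pipe() fills the raw data slot selected by
			 * work->compute_ctx.raw_data_ssbo_index, then the data is
			 * uploaded to the raw data SSBO */
		}
	}

	/* drain the queue; the pop path allows at most one compute item per frame */
	ctx->beamform_work_queue.did_compute_this_frame = 0;
	do_beamform_work(ctx, arena);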