split volume computation step to a seperate function - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 66d6d9e5f8d9d3bded52fcb3bc189f209050dfa9
Parent: 9e0caec50c40eb9b313d84a6332fa02c9823a519
Author: Randy Palamar
Date:   Sun,  8 Sep 2024 10:20:46 -0600

split volume computation step to a seperate function

This also fixes the volume computation to actually use the copied
rf_data buffer and removes the need to run the whole beamforming
pipe for each volume step.

Also the HERCULES shader now ignores the y-axis position when not
doing a volume computation.

Diffstat:
M beamformer.c  | 106 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M shaders/hercules.glsl  | 4 ++++

2 files changed, 64 insertions(+), 46 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -103,6 +103,39 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 	glNamedBufferStorage(cs->hadamard_ssbo, hadamard_elements * sizeof(i32), hadamard, 0);
 }
 
+static b32
+do_volume_computation_step(BeamformerCtx *ctx, enum compute_shaders shader)
+{
+	ComputeShaderCtx *cs = &ctx->csctx;
+	ExportCtx        *e  = &ctx->export_ctx;
+
+	b32 done = 0;
+
+	/* TODO: volume computation running timer */
+	glUseProgram(cs->programs[shader]);
+	glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, e->rf_data_ssbo);
+
+	glActiveTexture(GL_TEXTURE0);
+	glBindTexture(GL_TEXTURE_3D, e->volume_texture);
+	glBindImageTexture(0, e->volume_texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_R32F);
+	glUniform1i(e->volume_texture_id, 0);
+	glUniform1i(cs->volume_export_pass_id, 1);
+
+	/* NOTE: We must tile this otherwise GL will kill us for taking too long */
+	/* TODO: this could be based on multiple dimensions */
+	u32 dispatch_count = e->volume_dim.z / 32;
+	uv4 dim_offset = {.z = !!dispatch_count * 32 * e->dispatch_index++};
+	glUniform3iv(cs->volume_export_dim_offset_id, 1, (i32 *)dim_offset.E);
+	glDispatchCompute(ORONE(e->volume_dim.x / 32), e->volume_dim.y, 1);
+	if (e->dispatch_index >= dispatch_count) {
+		e->dispatch_index = 0;
+		e->state          = 0;
+		done              = 1;
+	}
+	return done;
+}
+
 static void
 do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
 {
@@ -113,7 +146,6 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
 	glBeginQuery(GL_TIME_ELAPSED, csctx->timer_ids[csctx->timer_index][shader]);
 
 	glUseProgram(csctx->programs[shader]);
-
 	glBindBufferBase(GL_UNIFORM_BUFFER, 0, csctx->shared_ubo);
 
 	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
@@ -171,39 +203,15 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
 		break;
 	case CS_HERCULES:
 	case CS_UFORCES:
-		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
-
-		/* NOTE: Do a volume computation before doing the normal display path */
-		if (ctx->export_ctx.state & (ES_START|ES_COMPUTING)) {
-			ExportCtx *e = &ctx->export_ctx;
+		if (ctx->export_ctx.state & ES_START) {
 			/* NOTE: on the first frame of compute make a copy of the rf data */
-			if (e->state & ES_START) {
-				size rf_size = decoded_data_size(csctx);
-				e->state &= ~ES_START;
-				glCopyNamedBufferSubData(csctx->rf_data_ssbos[input_ssbo_idx],
-				                         e->rf_data_ssbo, 0, 0, rf_size);
-			}
-
-			glActiveTexture(GL_TEXTURE0);
-			glBindTexture(GL_TEXTURE_3D, e->volume_texture);
-			glBindImageTexture(0, e->volume_texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_R32F);
-			glUniform1i(e->volume_texture_id, 0);
-			glUniform1i(csctx->volume_export_pass_id, 1);
-
-			e->state |= ES_COMPUTING;
-
-			/* NOTE: We must tile this otherwise GL will kill us for taking too long */
-			/* TODO: this could be based on multiple dimensions */
-			u32 dispatch_count = e->volume_dim.z / 32;
-			uv4 dim_offset = {.z = !!dispatch_count * 32 * e->dispatch_index++};
-			glUniform3iv(csctx->volume_export_dim_offset_id, 1, (i32 *)dim_offset.E);
-			glDispatchCompute(ORONE(e->volume_dim.x / 32), e->volume_dim.y, 1);
-			if (e->dispatch_index >= dispatch_count) {
-				e->dispatch_index = 0;
-				e->state          = ES_DONE;
-			}
+			size rf_size          = decoded_data_size(csctx);
+			ctx->export_ctx.state = ES_COMPUTING;
+			glCopyNamedBufferSubData(csctx->rf_data_ssbos[input_ssbo_idx],
+			                         ctx->export_ctx.rf_data_ssbo, 0, 0, rf_size);
 		}
 
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
 		glUniform3iv(csctx->volume_export_dim_offset_id, 1, (i32 []){0, 0, 0});
 		glUniform1i(csctx->volume_export_pass_id, 0);
 		glActiveTexture(GL_TEXTURE0 + ctx->out_texture_unit);
@@ -297,24 +305,25 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 		else                     ctx->partial_transfer_count++;
 	}
 
-	if (ctx->flags & DO_COMPUTE || ctx->export_ctx.state & ES_COMPUTING) {
+	/* NOTE: we are starting a volume computation on this frame so make some space */
+	if (ctx->export_ctx.state & ES_START) {
+		ExportCtx *e = &ctx->export_ctx;
+		uv4 edim = e->volume_dim;
+		glDeleteTextures(1, &e->volume_texture);
+		glCreateTextures(GL_TEXTURE_3D, 1, &e->volume_texture);
+		glTextureStorage3D(e->volume_texture, 1, GL_R32F, edim.x, edim.y, edim.z);
+
+		glDeleteBuffers(1, &e->rf_data_ssbo);
+		glCreateBuffers(1, &e->rf_data_ssbo);
+		glNamedBufferStorage(e->rf_data_ssbo, decoded_data_size(&ctx->csctx), 0, 0);
+	}
+
+	if (ctx->flags & DO_COMPUTE || ctx->export_ctx.state & ES_START) {
 		if (ctx->params->upload && !(ctx->export_ctx.state & ES_COMPUTING)) {
 			glNamedBufferSubData(ctx->csctx.shared_ubo, 0, sizeof(*bp), bp);
 			ctx->params->upload = 0;
 		}
 
-		if (ctx->export_ctx.state & ES_START) {
-			ExportCtx *e = &ctx->export_ctx;
-			uv4 edim = e->volume_dim;
-			glDeleteTextures(1, &e->volume_texture);
-			glCreateTextures(GL_TEXTURE_3D, 1, &e->volume_texture);
-			glTextureStorage3D(e->volume_texture, 1, GL_R32F, edim.x, edim.y, edim.z);
-
-			glDeleteBuffers(1, &e->rf_data_ssbo);
-			glCreateBuffers(1, &e->rf_data_ssbo);
-			glNamedBufferStorage(e->rf_data_ssbo, decoded_data_size(&ctx->csctx), 0, 0);
-		}
-
 		u32 stages = ctx->params->compute_stages_count;
 		for (u32 i = 0; i < stages; i++) {
 			do_compute_shader(ctx, ctx->params->compute_stages[i]);
@@ -326,8 +335,12 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 		glDeleteSync(ctx->csctx.timer_fences[tidx]);
 		ctx->csctx.timer_fences[tidx] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 		ctx->csctx.timer_index = (tidx + 1) % ARRAY_COUNT(ctx->csctx.timer_fences);
+	}
 
-		if (ctx->export_ctx.state & ES_DONE) {
+	if (ctx->export_ctx.state & ES_COMPUTING) {
+		/* TODO: this could probably be adapted to do FORCES as well */
+		b32 done = do_volume_computation_step(ctx, CS_HERCULES);
+		if (done) {
 			ExportCtx *e = &ctx->export_ctx;
 			uv4 dim = e->volume_dim;
 			size volume_out_size = dim.x * dim.y * dim.z * sizeof(f32);
@@ -337,7 +350,6 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 			s8 raw = {.len = volume_out_size, .data = e->volume_buf.beg};
 			if (!os_write_file("raw_volume.bin", raw))
 				TraceLog(LOG_WARNING, "failed to write output volume\n");
-			ctx->export_ctx.state = 0;
 		}
 	}
 
@@ -347,6 +359,8 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 		BeginShaderMode(ctx->fsctx.shader);
 			FragmentShaderCtx *fs = &ctx->fsctx;
 			glUseProgram(fs->shader.id);
+			glActiveTexture(GL_TEXTURE0 + ctx->out_texture_unit);
+			glBindTexture(GL_TEXTURE_3D, ctx->out_texture);
 			glUniform1i(fs->out_data_tex_id, ctx->out_texture_unit);
 			glUniform1f(fs->db_cutoff_id, fs->db);
 			DrawTexture(fs->output.texture, 0, 0, WHITE);
diff --git a/shaders/hercules.glsl b/shaders/hercules.glsl
@@ -85,6 +85,10 @@ void main()
 		output_min_coord.z + voxel.z * output_size.z / out_data_dim.z
 	);
 
+	/* TODO: off_axis_position */
+	if (u_volume_export_pass == 0)
+		image_point.y = 0;
+
 	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
 	 *
 	 *                  /        |x_e - x_i|\

M	beamformer.c	\|	106	+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M	shaders/hercules.glsl	\|	4	++++