ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

beamformer.c (14082B)


      1 /* See LICENSE for license details. */
      2 
      3 #include "beamformer_internal.h"
      4 
      5 /* NOTE(rnp): magic variables to force discrete GPU usage on laptops with multiple devices.
        * Both are exported, non-zero globals; presumably the NVIDIA (Optimus) and AMD (PowerXpress)
        * drivers look these names up in the executable's export table - verify against vendor docs. */
      6 EXPORT i32 NvOptimusEnablement = 1;
      7 EXPORT i32 AmdPowerXpressRequestHighPerformance = 1;
      8 
      9 #if !BEAMFORMER_DEBUG
     10 #include "beamformer_core.c"
     11 #else
     12 
        /* NOTE: function type behind the beamformer_frame_step pointer defined below. The
         * other `*_fn` types used by the list are not declared in this part of the file. */
     13 typedef void beamformer_frame_step_fn(BeamformerInput *);
     14 
        /* NOTE: X-macro list of the entry points that are looked up by name in
         * beamformer_debug_hot_reload(). Define X(name) before expanding the list. */
     15 #define BEAMFORMER_DEBUG_ENTRY_POINTS \
     16 	X(beamformer_debug_ui_deinit)  \
     17 	X(beamformer_complete_compute) \
     18 	X(beamformer_frame_step)       \
     19 	X(beamformer_rf_upload)        \
     20 
        /* NOTE: one function pointer per entry point: `<name>_fn *<name>`, so the rest of the
         * file calls them exactly like ordinary functions. (`global` is presumably the
         * project's file-scope storage macro - TODO confirm.) */
     21 #define X(name) global name ##_fn *name;
     22 BEAMFORMER_DEBUG_ENTRY_POINTS
     23 #undef X
     24 
     25 BEAMFORMER_EXPORT void
     26 beamformer_debug_hot_release(BeamformerInput *input)
     27 {
     28 	BeamformerCtx *ctx = BeamformerContextMemory(input->memory);
     29 
     30 	// TODO(rnp): this will deadlock if live imaging is active
     31 	/* NOTE(rnp): spin until compute thread finishes its work (we will probably
     32 	 * never reload while compute is in progress but just incase). */
     33 	spin_wait(atomic_load_u32(&ctx->upload_worker.awake));
     34 	spin_wait(atomic_load_u32(&ctx->compute_worker.awake));
     35 }
     36 
     37 BEAMFORMER_EXPORT void
     38 beamformer_debug_hot_reload(OSLibrary library, BeamformerInput *input)
     39 {
     40 	#define X(name) name = os_lookup_symbol(library, #name);
     41 	BEAMFORMER_DEBUG_ENTRY_POINTS
     42 	#undef X
     43 
     44 	s8 info = beamformer_info("reloaded main executable");
     45 	os_console_log(info.data, info.len);
     46 }
     47 
     48 #endif /* BEAMFORMER_DEBUG */
     49 
     50 function no_return void
     51 fatal(s8 message)
     52 {
     53 	os_fatal(message.data, message.len);
     54 	unreachable();
     55 }
     56 
     57 #include "vulkan.c"
     58 
     59 // TODO(rnp): this doesn't belong here, but will be removed
     60 // once vulkan migration is complete
        // NOTE: hand-written prototype; load_gl() calls it with the name of each
        // OpenGL procedure and casts the returned pointer to the procedure's type.
     61 void * glfwGetProcAddress(char *);
     62 
     63 function void
     64 gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, const void *userctx)
     65 {
     66 	Stream *e = (Stream *)userctx;
     67 	stream_append_s8s(e, s8("[OpenGL] "), (s8){.len = len, .data = (u8 *)msg}, s8("\n"));
     68 	os_console_log(e->data, e->widx);
     69 	stream_reset(e, 0);
     70 }
     71 
     72 function void
     73 load_gl(Stream *err)
     74 {
     75 	#define X(name, ret, params) name = (name##_fn *)glfwGetProcAddress(#name);
     76 	OGLProcedureList
     77 	OGLRequiredExtensionProcedureList
     78 	#undef X
     79 
     80 	stream_reset(err, 0);
     81 	#define X(name, ret, params) if (!name) stream_append_s8(err, s8("missing required GL function: " #name "\n"));
     82 	OGLProcedureList
     83 	OGLRequiredExtensionProcedureListBase
     84 	#if OS_WINDOWS
     85 	  OGLRequiredExtensionProcedureListW32
     86 	#else
     87 	  OGLRequiredExtensionProcedureListLinux
     88 	#endif
     89 	#undef X
     90 
     91 	if (err->widx) fatal(stream_to_s8(err));
     92 }
     93 
     94 function void
     95 beamformer_load_cuda_library(BeamformerCtx *ctx, OSLibrary cuda, Arena arena)
     96 {
     97 	/* TODO(rnp): (25.10.30) registering the rf buffer with CUDA is currently
     98 	 * causing a major performance regression. for now we are disabling its use
     99 	 * altogether. it will be reenabled once the issue can be fixed */
    100 	b32 result = 0 && vk_gpu_info()->vendor == GPUVendor_NVIDIA && ValidHandle(cuda);
    101 	if (result) {
    102 		Stream err = arena_stream(arena);
    103 
    104 		stream_append_s8(&err, beamformer_info("loading CUDA library functions"));
    105 		#define X(name, symname) cuda_## name = os_lookup_symbol(cuda, symname);
    106 		CUDALibraryProcedureList
    107 		#undef X
    108 
    109 		os_console_log(err.data, err.widx);
    110 	}
    111 
    112 	#define X(name, symname) if (!cuda_## name) cuda_## name = cuda_ ## name ## _stub;
    113 	CUDALibraryProcedureList
    114 	#undef X
    115 }
    116 
    117 function void
    118 worker_thread_sleep(GLWorkerThreadContext *ctx, BeamformerSharedMemory *sm)
    119 {
    120 	for (;;) {
    121 		i32 expected = 0;
    122 		if (atomic_cas_u32(&ctx->sync_variable, &expected, 1) ||
    123 		    atomic_load_u32(&sm->live_imaging_parameters.active))
    124 		{
    125 			break;
    126 		}
    127 
    128 		/* TODO(rnp): clean this crap up; we shouldn't need two values to communicate this */
    129 		atomic_store_u32(&ctx->awake, 0);
    130 		os_wait_on_address(&ctx->sync_variable, 1, (u32)-1);
    131 		atomic_store_u32(&ctx->awake, 1);
    132 	}
    133 }
    134 
    135 function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
    136 {
    137 	GLWorkerThreadContext *ctx = user_context;
    138 
    139 	BeamformerCtx *beamformer = (BeamformerCtx *)ctx->user_context;
    140 
    141 	for (;;) {
    142 		worker_thread_sleep(ctx, beamformer->shared_memory);
    143 		asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg);
    144 		beamformer_complete_compute(beamformer, &ctx->arena);
    145 	}
    146 
    147 	unreachable();
    148 
    149 	return 0;
    150 }
    151 
    152 function OS_THREAD_ENTRY_POINT_FN(beamformer_upload_entry_point)
    153 {
    154 	GLWorkerThreadContext         *ctx = user_context;
    155 	BeamformerUploadThreadContext *up  = (typeof(up))ctx->user_context;
    156 
    157 	for (;;) {
    158 		worker_thread_sleep(ctx, up->shared_memory);
    159 		beamformer_rf_upload(up);
    160 	}
    161 
    162 	unreachable();
    163 
    164 	return 0;
    165 }
    166 
        /* One-time startup of the beamformer.
         *
         * In order: splits the caller provided memory block (input->memory) into arenas,
         * creates the main window, loads Vulkan, allocates the buffer for beamformed data,
         * sets up the CUDA and OpenGL function pointers, initializes the shared memory
         * region, starts the compute and upload worker threads, installs the OpenGL debug
         * callback, and (when shaders are not baked in) registers file watches for the
         * reloadable shaders. Unrecoverable problems go through fatal().
         *
         * On return ctx->state is BeamformerState_Running. */
    167 BEAMFORMER_EXPORT void
    168 beamformer_init(BeamformerInput *input)
    169 {
        	/* NOTE: fixed-size regions for the workers, the UI, and the error stream are split
        	 * off first (presumably from the end of the block, going by sub_arena_end's name);
        	 * the context itself is then the first thing pushed onto what remains */
    170 	Arena  memory        = arena_from_memory(input->memory, input->memory_size);
    171 	Arena  compute_arena = sub_arena_end(&memory, MB(2), KB(4));
    172 	Arena  upload_arena  = sub_arena_end(&memory, KB(4), KB(4));
    173 	Arena  ui_arena      = sub_arena_end(&memory, MB(2), KB(4));
    174 	Stream error         = arena_stream(sub_arena_end(&memory, MB(1), 1));
    175 
    176 	BeamformerCtx *ctx   = push_struct(&memory, BeamformerCtx);
    177 
    178 	str8 window_title = str8("VK Beamformer");
    179 	ctx->main_window  = os_window_create(window_title.data, window_title.length, 1280, 840);
    180 	ctx->window_size  = (iv2){{1280, 840}};
    181 
        	/* NOTE: hold back the last 4096 bytes as scratch space for temporary strings;
        	 * they are given back to `memory` at the bottom of this function */
    182 	Arena scratch = {.beg = memory.end - 4096L, .end = memory.end};
    183 	memory.end = scratch.beg;
    184 
    185 	ctx->error_stream          = error;
    186 	ctx->ui_backing_store      = ui_arena;
    187 	ctx->compute_worker.arena  = compute_arena;
    188 	ctx->upload_worker.arena   = upload_arena;
    189 
    190 	#if BEAMFORMER_RENDERDOC_HOOKS
    191 	start_frame_capture       = input->renderdoc_start_frame_capture;
    192 	end_frame_capture         = input->renderdoc_end_frame_capture;
    193 	set_capture_path_template = input->renderdoc_set_capture_file_path_template;
    194 	#endif
    195 
    196 	vk_load(input->vulkan_library_handle, &memory, &ctx->error_stream);
    197 
    198 	BeamformerComputeContext *cs = &ctx->compute_context;
    199 
    200 	// NOTE(rnp): allocate beamformed image ring buffer
    201 	{
    202 		u64 gpu_heap_size = vk_gpu_info()->gpu_heap_size;
        		/* NOTE: candidate buffer sizes, largest first */
    203 		u64 trial_sizes[] = {
    204 			GB(4),
    205 			GB(2),
    206 			GB(1) + MB(512),
    207 			GB(1),
    208 		};
    209 
        		/* NOTE: start at the first candidate that is at most half of the GPU heap */
    210 		u32 base_index = 0;
    211 		for EachElement(trial_sizes, it) {
    212 			if (gpu_heap_size >= 2 * trial_sizes[it])
    213 				break;
    214 			base_index++;
    215 		}
    216 
        		/* NOTE: from there, try each candidate in turn and stop at the first one for
        		 * which the buffer reports a non-zero size */
    217 		for (u32 i = base_index; i < countof(trial_sizes); i++) {
    218 			// TODO(rnp): it may be better to download data from this using the transfer queue
    219 			VulkanTimeline timelines[] = {VulkanTimeline_Compute, VulkanTimeline_Graphics};
    220 			GPUBufferAllocateInfo allocate_info = {
    221 				.size            = trial_sizes[i],
    222 				.flags           = VulkanUsageFlag_TransferSource|VulkanUsageFlag_HostReadWrite,
    223 				.timeline_count  = countof(timelines),
    224 				.timelines_used  = timelines,
    225 				.label           = str8("BeamformedData"),
    226 			};
    227 			vk_buffer_allocate(cs->backlog.buffer, &allocate_info);
    228 			if (cs->backlog.buffer->size > 0)
    229 				break;
    230 		}
    231 		if (cs->backlog.buffer->size == 0) {
    232 			// NOTE(rnp): if this becomes an issue we may be able to get by in some other way
    233 			fatal(s8("Failed to allocate space for beamformed data\n"));
    234 		}
    235 
        		/* NOTE: bind the new buffer to the BeamformedData shader buffer slot */
    236 		BeamformerShaderResourceInfo shader_resource_infos[] = {
    237 			{
    238 				.kind   = BeamformerShaderResourceKind_Buffer,
    239 				.handle = cs->backlog.buffer->handle,
    240 				.slot   = BeamformerShaderBufferSlot_BeamformedData,
    241 			},
    242 		};
    243 		vk_bind_shader_resources(shader_resource_infos, countof(shader_resource_infos));
    244 	}
    245 
        	/* NOTE: `memory` is passed by value here, so this call does not move the
        	 * arena's position as seen by this function */
    246 	beamformer_load_cuda_library(ctx, input->cuda_library_handle, memory);
    247 
    248 	load_gl(&ctx->error_stream);
    249 
        	/* NOTE: the shared region must be able to hold a whole BeamformerSharedMemory;
        	 * it is cleared before the fields below are filled in */
    250 	ctx->shared_memory      = input->shared_memory;
    251 	ctx->shared_memory_size = input->shared_memory_size;
    252 	if (ctx->shared_memory_size < (i64)sizeof(*ctx->shared_memory))
    253 		fatal(s8("Get more ram lol\n"));
    254 	zero_struct(ctx->shared_memory);
    255 
    256 	ctx->shared_memory->version = BEAMFORMER_SHARED_MEMORY_VERSION;
    257 	ctx->shared_memory->reserved_parameter_blocks = 1;
    258 
    259 	ctx->shared_memory->beamformed_frame_buffer_size = cs->backlog.buffer->size;
    260 
    261 	// TODO(rnp): dynamic rf data buffer slot usage
    262 	// NOTE(rnp): will be same as the max size we were able to get for the frame buffer
    263 	ctx->shared_memory->capabilities.max_rf_data_size = cs->backlog.buffer->size
    264 	                                                    / BeamformerMaxRawDataFramesInFlight;
    265 
    266 	ctx->shared_memory->capabilities.cuda    = cuda_supported();
    267 	// TODO(rnp): re-enable hilbert support, with and without cuda
    268 	ctx->shared_memory->capabilities.hilbert = 0;
    269 
    270 	/* TODO(rnp): I'm not sure if it's a good idea to pre-reserve a bunch of semaphores
    271 	 * on w32 but that's what we are doing for now */
    272 	#if OS_WINDOWS
    273 	{
        		/* NOTE: names are built as "<shared memory name>_lock_<index>" in a temporary
        		 * stream over the unused part of `memory` */
    274 		Stream sb = arena_stream(memory);
    275 		stream_append(&sb, input->shared_memory_name, input->shared_memory_name_length);
    276 		stream_append_s8(&sb, s8("_lock_"));
    277 		i32 start_index = sb.widx;
    278 		for EachElement(os_w32_shared_memory_semaphores, it) {
        			/* NOTE: rewind to just after "_lock_", then append the index and a 0 byte */
    279 			stream_reset(&sb, start_index);
    280 			stream_append_u64(&sb, it);
    281 			stream_append_byte(&sb, 0);
    282 			os_w32_shared_memory_semaphores[it] = os_w32_create_semaphore((c8 *)sb.data, 1, 1);
    283 			if InvalidHandle(os_w32_shared_memory_semaphores[it])
    284 				fatal(beamformer_info("init: failed to create w32 shared memory semaphore\n"));
    285 
    286 			/* NOTE(rnp): hacky garbage because CreateSemaphore will just open an existing
    287 			 * semaphore without any indication. Sometimes the other side of the shared memory
    288 			 * will provide incorrect parameters or will otherwise fail and it's faster to
    289 			 * restart this program than to get that application to release the semaphores */
    290 			/* TODO(rnp): figure out something more robust */
    291 			os_w32_semaphore_release(os_w32_shared_memory_semaphores[it], 1);
    292 		}
    293 	}
    294 	#endif
    295 
        	/* NOTE: compute worker; its thread is handed the whole BeamformerCtx */
    296 	GLWorkerThreadContext *worker = &ctx->compute_worker;
    297 	/* TODO(rnp): we should lock this down after we have something working */
    298 	worker->user_context = (iptr)ctx;
    299 	worker->handle       = os_create_thread("[compute]", worker, compute_worker_thread_entry_point);
    300 
        	/* NOTE: upload worker; its thread only gets the fields copied into upctx */
    301 	GLWorkerThreadContext         *upload = &ctx->upload_worker;
    302 	BeamformerUploadThreadContext *upctx  = push_struct(&memory, typeof(*upctx));
    303 	upload->user_context        = (iptr)upctx;
    304 	upctx->rf_buffer            = &cs->rf_buffer;
    305 	upctx->shared_memory        = ctx->shared_memory;
    306 	upctx->shared_memory_size   = ctx->shared_memory_size;
    307 	upctx->compute_timing_table = ctx->compute_timing_table;
    308 	upctx->compute_worker_sync  = &ctx->compute_worker.sync_variable;
    309 	upload->handle = os_create_thread("[upload]", upload, beamformer_upload_entry_point);
    310 
    311 	/* NOTE: set up OpenGL debug logging; the callback is always registered but
        	 * GL_DEBUG_OUTPUT is only enabled when _DEBUG is defined */
    312 	Stream *gl_error_stream = push_struct(&memory, Stream);
    313 	*gl_error_stream        = stream_alloc(&memory, 1024);
    314 	glDebugMessageCallback(gl_debug_logger, gl_error_stream);
    315 #ifdef _DEBUG
    316 	glEnable(GL_DEBUG_OUTPUT);
    317 #endif
    318 
        	/* NOTE: register one file watch per reloadable shader. Each path is built in a
        	 * copy of `scratch`, so every iteration reuses the same bytes; the reload context
        	 * given to the watch is pushed onto `memory` instead */
    319 	if (!BakeShaders)
    320 	{
    321 		for EachElement(beamformer_reloadable_compute_shader_info_indices, it) {
    322 			i32   index = beamformer_reloadable_compute_shader_info_indices[it];
    323 			Arena temp  = scratch;
    324 			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
    325 			                             beamformer_reloadable_shader_files[index][0]);
    326 			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
    327 			frc->kind                 = BeamformerFileReloadKind_ComputeShader;
    328 			frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index];
    329 			os_add_file_watch((char *)file.data, file.len, frc);
    330 		}
    331 
        		/* NOTE: same as the loop above, for the helpers index table */
    332 		for EachElement(beamformer_reloadable_compute_helpers_shader_info_indices, it) {
    333 			i32   index = beamformer_reloadable_compute_helpers_shader_info_indices[it];
    334 			Arena temp  = scratch;
    335 			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
    336 			                             beamformer_reloadable_shader_files[index][0]);
    337 			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
    338 			frc->kind                 = BeamformerFileReloadKind_ComputeShader;
    339 			frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index];
    340 			os_add_file_watch((char *)file.data, file.len, frc);
    341 		}
    342 
        		/* NOTE: internal shaders use their own reload kind and also record which
        		 * entry of cs->compute_internal_pipelines they belong to */
    343 		for EachElement(beamformer_reloadable_compute_internal_shader_info_indices, it) {
    344 			i32   index = beamformer_reloadable_compute_internal_shader_info_indices[it];
    345 			Arena temp  = scratch;
    346 			s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"),
    347 			                             beamformer_reloadable_shader_files[index][0]);
    348 			BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc));
    349 			frc->kind                   = BeamformerFileReloadKind_ComputeInternalShader;
    350 			frc->shader_reload.shader   = beamformer_reloadable_shader_kinds[index];
    351 			frc->shader_reload.pipeline = cs->compute_internal_pipelines + it;
    352 			os_add_file_watch((char *)file.data, file.len, frc);
    353 		}
    354 	}
    355 
        	/* NOTE: give the scratch bytes back and store what is left of the arena in ctx */
    356 	memory.end = scratch.end;
    357 	ctx->arena = memory;
    358 	ctx->state = BeamformerState_Running;
    359 }
    360 
    361 BEAMFORMER_EXPORT void
    362 beamformer_terminate(BeamformerInput *input)
    363 {
    364 	/* NOTE(rnp): work around pebkac when the beamformer is closed while we are doing live
    365 	 * imaging. if the verasonics is blocked in an external function (calling the library
    366 	 * to start compute) it is impossible for us to get it to properly shut down which
    367 	 * will sometimes result in us needing to power cycle the system. set the shared memory
    368 	 * into an error state and release dispatch lock so that future calls will error instead
    369 	 * of blocking.
    370 	 */
    371 	BeamformerCtx *          ctx = BeamformerContextMemory(input->memory);
    372 	BeamformerSharedMemory * sm  = input->shared_memory;
    373 	if (ctx->state != BeamformerState_Terminated) {
    374 		if (sm) {
    375 			BeamformerSharedMemoryLockKind lock = BeamformerSharedMemoryLockKind_DispatchCompute;
    376 			atomic_store_u32(&sm->invalid, 1);
    377 			atomic_store_u32(&sm->external_work_queue.ridx, sm->external_work_queue.widx);
    378 			DEBUG_DECL(if (sm->locks[lock])) {
    379 				beamformer_shared_memory_release_lock(sm, (i32)lock);
    380 			}
    381 
    382 			atomic_or_u32(&sm->live_imaging_dirty_flags, BeamformerLiveImagingDirtyFlags_StopImaging);
    383 		}
    384 
    385 		beamformer_debug_ui_deinit(ctx);
    386 
    387 		ctx->state = BeamformerState_Terminated;
    388 	}
    389 }
    390 
    391 BEAMFORMER_EXPORT u32
    392 beamformer_should_close(BeamformerInput *input)
    393 {
    394 	BeamformerCtx * ctx = BeamformerContextMemory(input->memory);
    395 	if (ctx->state == BeamformerState_ShouldClose)
    396 		beamformer_terminate(input);
    397 	return ctx->state == BeamformerState_Terminated;
    398 }