ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

ogl_beamformer_lib.c (24634B)


      1 /* See LICENSE for license details. */
      2 #include "../compiler.h"
      3 
      4 #define BEAMFORMER_IMPORT static
      5 
      6 #include "../beamformer.h"
      7 
      8 #include "../util.h"
      9 
     10 #include "../generated/beamformer.meta.c"
     11 #include "../beamformer_parameters.h"
     12 #include "ogl_beamformer_lib_base.h"
     13 
     14 #if OS_LINUX
     15 #include "../os_linux.c"
     16 #elif OS_WINDOWS
     17 #include "../os_win32.c"
     18 
     19 W32(iptr) OpenFileMappingA(u32, b32, c8 *);
     20 
     21 #else
     22 #error Unsupported Platform
     23 #endif
     24 
     25 #include "../util_os.c"
     26 #include "../beamformer_shared_memory.c"
     27 
/* NOTE(review): process-wide library state; fields are unsynchronized —
 * presumably the library is used from a single client thread; confirm. */
global struct {
	BeamformerSharedMemory *bp;                 /* mapped shared memory; 0 until first successful attach */
	i32                     timeout_ms;         /* default lock timeout used by the non *_at wrappers */
	BeamformerLibErrorKind  last_error;         /* last failure recorded by lib_error_check_() */
	i64                     shared_memory_size; /* byte length of the mapping (used for the scratch arena) */
} g_beamformer_library_context;
     34 
     35 #if OS_LINUX
     36 
     37 function s8
     38 os_open_shared_memory_area(char *name)
     39 {
     40 	s8 result = {0};
     41 	i32 fd = shm_open(name, O_RDWR, S_IRUSR|S_IWUSR);
     42 	if (fd > 0) {
     43 		struct stat sb;
     44 		if (fstat(fd, &sb) != -1) {
     45 			void *new = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
     46 			if (new != MAP_FAILED) {
     47 				result.data = new;
     48 				result.len  = sb.st_size;
     49 			}
     50 		}
     51 		close(fd);
     52 	}
     53 	return result;
     54 }
     55 
/* Linux: unmaps a region previously returned by os_open_shared_memory_area() */
function void
os_close_shared_memory_area(void *memory, i64 size)
{
	munmap(memory, size);
}
     61 
     62 #elif OS_WINDOWS
     63 
     64 W32(u64) VirtualQuery(void *base_address, void *memory_basic_info, u64 memory_basic_info_size);
     65 W32(b32) UnmapViewOfFile(void *);
     66 
/* Opens/creates the named Win32 semaphores ("<shm name>_lock_<index>") that
 * back the shared memory region locks. Returns 1 only when every semaphore
 * was obtained; on partial failure all acquired handles are closed again. */
function b32
os_reserve_region_locks(void)
{
	u8 buffer[1024];
	Stream sb = {.data = buffer, .cap = countof(buffer)};
	stream_append_s8(&sb, s8(OS_SHARED_MEMORY_NAME "_lock_"));

	/* remember the prefix length so each iteration can rewind and append its index */
	i32 start_index    = sb.widx;
	u32 reserved_count = 0;
	for EachElement(os_w32_shared_memory_semaphores, it) {
		stream_reset(&sb, start_index);
		stream_append_u64(&sb, it);
		stream_append_byte(&sb, 0); /* NUL terminate for the c8 * API */
		os_w32_shared_memory_semaphores[it] = os_w32_create_semaphore((c8 *)sb.data, 1, 1);
		if InvalidHandle(os_w32_shared_memory_semaphores[it])
			break;
		reserved_count++;
	}

	b32 result = reserved_count == countof(os_w32_shared_memory_semaphores);
	if (!result) {
		/* roll back: don't leak handles when only some semaphores were created */
		for (u32 i = 0; i < reserved_count; i++)
			CloseHandle(os_w32_shared_memory_semaphores[i].value[0]);
	}

	return result;
}
     94 
/* Win32: opens the beamformer's named file mapping, maps the whole object,
 * determines its size via VirtualQuery, and reserves the region lock
 * semaphores. Returns {data, len}; data is 0 on any failure. */
function s8
os_open_shared_memory_area(char *name)
{
	/* hand-rolled MEMORY_BASIC_INFORMATION so no windows.h include is needed;
	 * layout must match the Win32 definition exactly */
	struct alignas(16) {
		void *BaseAddress;
		void *AllocationBase;
		u32   AllocationProtect;
		u32   __alignment1;
		u64   RegionSize;
		u32   State;
		u32   Protect;
		u32   Type;
		u32   __alignment2;
	} memory_basic_info;

	s8 result = {0};
	iptr h = OpenFileMappingA(FILE_MAP_ALL_ACCESS, 0, name);
	if (h != INVALID_FILE) {
		// NOTE(rnp): a size of 0 maps the whole region, we can determine its size after
		void *new = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, 0);
		if (new &&
		    VirtualQuery(new, &memory_basic_info, sizeof(memory_basic_info)) == sizeof(memory_basic_info) &&
		    os_reserve_region_locks())
		{
			result.data = new;
			result.len  = (i64)memory_basic_info.RegionSize;
		}

		/* any partial failure above: don't leak the mapped view */
		if (new && !result.data)
			UnmapViewOfFile(new);

		/* the mapping handle is not needed once the view exists */
		CloseHandle(h);
	}
	return result;
}
    130 
/* Win32: releases the view; `size` is unused (UnmapViewOfFile only takes the base) */
function void
os_close_shared_memory_area(void *memory, i64 size)
{
	UnmapViewOfFile(memory);
}
    136 
    137 #endif
    138 
    139 #define lib_error_check(c, e) lib_error_check_(c, BeamformerLibErrorKind_##e)
    140 function b32
    141 lib_error_check_(b32 condition, BeamformerLibErrorKind error_kind)
    142 {
    143 	b32 result = condition;
    144 	if (!result) g_beamformer_library_context.last_error = error_kind;
    145 	assert(result);
    146 	return result;
    147 }
    148 
/* Lazily attaches to the beamformer's shared memory on first use and caches
 * the mapping in g_beamformer_library_context. Returns 1 when the mapping is
 * attached, version-compatible, and not flagged invalid by the beamformer;
 * otherwise records the appropriate error and returns 0. */
function b32
check_shared_memory(void)
{
	b32 result = g_beamformer_library_context.bp != 0;
	if unlikely(!g_beamformer_library_context.bp) {
		s8 shared_memory = os_open_shared_memory_area(OS_SHARED_MEMORY_NAME);
		if (lib_error_check(shared_memory.data != 0, SharedMemory)) {
			BeamformerSharedMemory *bp = (BeamformerSharedMemory *)shared_memory.data;
			/* reject mappings created by an incompatible beamformer build */
			result = lib_error_check(bp->version == BEAMFORMER_SHARED_MEMORY_VERSION, VersionMismatch);
			if (result) {
				g_beamformer_library_context.bp                 = bp;
				g_beamformer_library_context.shared_memory_size = shared_memory.len;
			} else {
				os_close_shared_memory_area(shared_memory.data, shared_memory.len);
			}
		}
	}

	/* even a cached mapping may have been invalidated by the beamformer since */
	if likely(g_beamformer_library_context.bp)
		result = lib_error_check(likely(!g_beamformer_library_context.bp->invalid), InvalidAccess);
	return result;
}
    171 
    172 function b32
    173 valid_parameter_block(u32 block)
    174 {
    175 	b32 result = check_shared_memory();
    176 	if (result) {
    177 		result = lib_error_check(block < g_beamformer_library_context.bp->reserved_parameter_blocks,
    178 		                         ParameterBlockUnallocated);
    179 	}
    180 	return result;
    181 }
    182 
/* Reserves a slot on the external work queue; returns 0 (recording
 * WorkQueueFull) when no space is available. On success the caller fills the
 * slot and then calls beamform_work_queue_push_commit(). */
function BeamformWork *
try_push_work_queue(void)
{
	BeamformWork *result = beamform_work_queue_push(&g_beamformer_library_context.bp->external_work_queue);
	lib_error_check(result != 0, WorkQueueFull);
	return result;
}
    190 
/* Takes a shared memory lock, waiting up to timeout_ms; records SyncVariable
 * on failure. Successful callers must pair with lib_release_lock(). */
function b32
lib_try_lock(i32 lock, i32 timeout_ms)
{
	b32 result = beamformer_shared_memory_take_lock(g_beamformer_library_context.bp, lock, (u32)timeout_ms);
	lib_error_check(result, SyncVariable);
	return result;
}
    198 
/* Releases a lock previously acquired with lib_try_lock() */
function void
lib_release_lock(i32 lock)
{
	beamformer_shared_memory_release_lock(g_beamformer_library_context.bp, lock);
}
    204 
/* Returns the shared memory/ABI version this library was built against; it
 * must match the running beamformer (checked in check_shared_memory()) */
u32
beamformer_get_api_version(void)
{
	return BEAMFORMER_SHARED_MEMORY_VERSION;
}
    210 
/* Maps an error kind to a static human-readable string; out-of-range values
 * clamp to the trailing "invalid error kind" entry */
const char *
beamformer_error_string(BeamformerLibErrorKind kind)
{
	/* X-macro expands BEAMFORMER_LIB_ERRORS into one string literal per kind */
	#define X(type, num, string) string,
	local_persist const char *error_string_table[] = {BEAMFORMER_LIB_ERRORS "invalid error kind"};
	#undef X
	return error_string_table[MIN(kind, countof(error_string_table) - 1)];
}
    219 
/* Returns the error recorded by the most recent failing library call */
BeamformerLibErrorKind
beamformer_get_last_error(void)
{
	return g_beamformer_library_context.last_error;
}
    225 
    226 const char *
    227 beamformer_get_last_error_string(void)
    228 {
    229 	return beamformer_error_string(beamformer_get_last_error());
    230 }
    231 
/* Sets the default lock timeout used by the non *_at convenience wrappers.
 * NOTE(review): stored into an i32 field, so values > INT32_MAX wrap
 * negative — presumably callers only pass small timeouts; confirm. */
void
beamformer_set_global_timeout(u32 timeout_ms)
{
	g_beamformer_library_context.timeout_ms = timeout_ms;
}
    237 
    238 b32
    239 beamformer_reserve_parameter_blocks(uint32_t count)
    240 {
    241 	b32 result = 0;
    242 	if (check_shared_memory() &&
    243 	    lib_error_check(count <= BeamformerMaxParameterBlocks, ParameterBlockOverflow))
    244 	{
    245 		g_beamformer_library_context.bp->reserved_parameter_blocks = count;
    246 		result = 1;
    247 	}
    248 	return result;
    249 }
    250 
    251 function b32
    252 validate_parameters(BeamformerParameters *bp)
    253 {
    254 	if (!lib_error_check(Between(bp->contrast_mode, 0, BeamformerContrastMode_Count - 1), InvalidContrastMode))
    255 		return 0;
    256 
    257 	u32 contrast_raw_sample_count = bp->acquisition_count * bp->sample_count * beamformer_contrast_mode_samples[bp->contrast_mode];
    258 	if (!lib_error_check(contrast_raw_sample_count <= bp->raw_data_dimensions.x, DataSizeMismatch))
    259 		return 0;
    260 
    261 	return 1;
    262 }
    263 
    264 function b32
    265 validate_pipeline(i32 *shaders, u32 shader_count, BeamformerDataKind data_kind)
    266 {
    267 	b32 data_kind_test = Between(data_kind, 0, BeamformerDataKind_Count - 1) &&
    268 	                     data_kind != BeamformerDataKind_Float16 &&
    269 	                     data_kind != BeamformerDataKind_Float16Complex;
    270 	if (!lib_error_check(data_kind_test, InvalidDataKind))
    271 		return 0;
    272 
    273 	if (!lib_error_check(shader_count <= BeamformerMaxComputeShaderStages, ComputeStageOverflow))
    274 		return 0;
    275 
    276 	for (u32 i = 0; i < shader_count; i++) {
    277 		b32 stage_test = Between(shaders[i], BeamformerShaderKind_ComputeFirst, BeamformerShaderKind_ComputeLast);
    278 		if (!lib_error_check(stage_test, InvalidComputeStage))
    279 			return 0;
    280 
    281 		if (shaders[i] == BeamformerShaderKind_Demodulate &&
    282 		    !lib_error_check(!beamformer_data_kind_complex[data_kind], InvalidDemodulationDataKind))
    283 		{
    284 			return 0;
    285 		}
    286 	}
    287 
    288 	b32 start_stage_test = shaders[0] == BeamformerShaderKind_Demodulate ||
    289 	                       shaders[0] == BeamformerShaderKind_Decode;
    290 	if (!lib_error_check(start_stage_test, InvalidStartShader))
    291 		return 0;
    292 
    293 	return 1;
    294 }
    295 
/* Copies `size` bytes of `data` into parameter block `block` at `block_offset`
 * while holding that block's lock, then marks `region_id` dirty so the
 * beamformer re-uploads it. Returns 0 on invalid block or lock timeout. */
function b32
parameter_block_region_upload(void *data, u32 size, u32 block, BeamformerParameterBlockRegions region_id,
                              u32 block_offset, i32 timeout_ms)
{
	/* per-block locks are numbered directly after the fixed lock kinds */
	i32 lock   = BeamformerSharedMemoryLockKind_Count + (i32)block;
	b32 result = valid_parameter_block(block) && lib_try_lock(lock, timeout_ms);
	if (result) {
		mem_copy((u8 *)beamformer_parameter_block(g_beamformer_library_context.bp, block) + block_offset,
		         data, size);
		mark_parameter_block_region_dirty(g_beamformer_library_context.bp, block, region_id);
		lib_release_lock(lock);
	}
	return result;
}
    310 
/* Writes the shader parameters of one pipeline stage in parameter block
 * `block`. stage_index wraps modulo BeamformerMaxComputeShaderStages instead
 * of failing. NOTE(review): `&parameter` (an i32) is uploaded with
 * sizeof(BeamformerShaderParameters) — assumes that struct is exactly one
 * i32 wide; confirm against beamformer_parameters.h. */
b32
beamformer_set_pipeline_stage_parameters_at(u32 stage_index, i32 parameter, u32 block)
{
	/* byte offset of pipeline.parameters[stage] within the parameter block */
	u32 offset  = BeamformerParameterBlockRegionOffsets[BeamformerParameterBlockRegion_ComputePipeline];
	offset     += offsetof(BeamformerComputePipeline, parameters);
	offset     += (stage_index % BeamformerMaxComputeShaderStages) * sizeof(BeamformerShaderParameters);
	b32 result  = parameter_block_region_upload(&parameter, sizeof(BeamformerShaderParameters), block,
	                                            BeamformerParameterBlockRegion_ComputePipeline, offset,
	                                            g_beamformer_library_context.timeout_ms);
	return result;
}
    322 
    323 b32
    324 beamformer_set_pipeline_stage_parameters(u32 stage_index, i32 parameter)
    325 {
    326 	b32 result = beamformer_set_pipeline_stage_parameters_at(stage_index, parameter, 0);
    327 	return result;
    328 }
    329 
/* Replaces the compute pipeline of parameter block `block`: validates the
 * stage list and data kind, then copies the stages under the block's lock and
 * marks the ComputePipeline region dirty. Returns 0 on validation failure,
 * bad block index, or lock timeout. */
b32
beamformer_push_pipeline_at(i32 *shaders, u32 shader_count, BeamformerDataKind data_kind, u32 block)
{
	b32 result = 0;
	if (check_shared_memory() && validate_pipeline(shaders, shader_count, data_kind)) {
		/* per-block locks are numbered directly after the fixed lock kinds */
		i32 lock = BeamformerSharedMemoryLockKind_Count + (i32)block;
		if (valid_parameter_block(block) && lib_try_lock(lock, g_beamformer_library_context.timeout_ms)) {
			BeamformerParameterBlock *b = beamformer_parameter_block(g_beamformer_library_context.bp, block);
			mem_copy(&b->pipeline.shaders, shaders, shader_count * sizeof(*shaders));
			mark_parameter_block_region_dirty(g_beamformer_library_context.bp, block,
			                                  BeamformerParameterBlockRegion_ComputePipeline);
			b->pipeline.shader_count = shader_count;
			b->pipeline.data_kind    = data_kind;
			lib_release_lock(lock);
			result = 1;
		}
	}
	return result;
}
    349 
    350 b32
    351 beamformer_push_pipeline(i32 *shaders, u32 shader_count, BeamformerDataKind data_kind)
    352 {
    353 	b32 result = beamformer_push_pipeline_at(shaders, shader_count, data_kind, 0);
    354 	return result;
    355 }
    356 
    357 b32
    358 beamformer_create_filter(BeamformerFilterParameters *filter, u8 filter_slot, u8 parameter_block)
    359 {
    360 	b32 result = 0;
    361 	if (lib_error_check(filter->kind >= 0 && filter->kind < BeamformerFilterKind_Count, InvalidFilterKind)) {
    362 		if (check_shared_memory()) {
    363 			BeamformWork *work = try_push_work_queue();
    364 			if (work) {
    365 				BeamformerCreateFilterContext *ctx = &work->create_filter_context;
    366 				work->kind = BeamformerWorkKind_CreateFilter;
    367 				ctx->parameters      = *filter;
    368 				ctx->filter_slot     = filter_slot     % BeamformerFilterSlots;
    369 				ctx->parameter_block = parameter_block % BeamformerMaxParameterBlocks;
    370 				beamform_work_queue_push_commit(&g_beamformer_library_context.bp->external_work_queue);
    371 				result = 1;
    372 			}
    373 		}
    374 	}
    375 	return result;
    376 }
    377 
/* Nudges the beamformer to process queued work by poking the DispatchCompute
 * lock with a zero timeout; the result is deliberately ignored.
 * NOTE(review): presumably the beamformer side releases/waits on this lock to
 * wake its compute dispatch — confirm against the beamformer main loop. */
function void
beamformer_flush_commands(void)
{
	i32 lock = BeamformerSharedMemoryLockKind_DispatchCompute;
	beamformer_shared_memory_take_lock(g_beamformer_library_context.bp, lock, 0);
}
    384 
/* X-macro table of upload entry points: one (array name, element type,
 * elements per entry, dirty region) tuple per uploadable parameter block array */
#define BEAMFORMER_UPLOAD_FNS \
	X(channel_mapping,               i16, 1, ChannelMapping) \
	X(focal_vectors,                 f32, 2, FocalVectors)   \
	X(sparse_elements,               i16, 1, SparseElements) \
	X(transmit_receive_orientations, u8,  1, TransmitReceiveOrientations)

/* beamformer_push_<name>_at(data, count, block): bounds-checks `count` against
 * the destination array's capacity, then uploads into the named region of
 * parameter block `block` using the global timeout */
#define X(name, dtype, elements, region_name) \
b32 beamformer_push_##name ##_at(dtype *data, u32 count, u32 block) { \
	b32 result = 0; \
	if (lib_error_check(count <= countof(((BeamformerParameterBlock *)0)->name), BufferOverflow)) { \
		result = parameter_block_region_upload(data, count * elements * sizeof(dtype), block, \
		                                       BeamformerParameterBlockRegion_##region_name,  \
		                                       offsetof(BeamformerParameterBlock, name),      \
		                                       g_beamformer_library_context.timeout_ms);      \
	} \
	return result; \
}
BEAMFORMER_UPLOAD_FNS
#undef X

/* beamformer_push_<name>(data, count): same upload targeting block 0 */
#define X(name, dtype, ...) \
b32 beamformer_push_##name (dtype *data, u32 count) { \
	b32 result = beamformer_push_##name ##_at(data, count, 0); \
	return result; \
}
BEAMFORMER_UPLOAD_FNS
#undef X
    412 
/* Signature of a contrast reduction kernel: output[s] = a[s] - b[s] - c[s],
 * where a/b/c are three acquisitions stored back to back in input_v */
#define BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(name) void name(void *restrict output_v, \
                                                           void *restrict input_v, \
                                                           u32 sample_count)
typedef BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(beamformer_reduce_a1s2_contrast_fn);

/* one kernel per storage element type; the complex data kinds reuse the real
 * kernel with a doubled sample count (see beamformer_push_data_base) */
#define BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST \
	X(i16) \
	X(f32) \
	X(f16) \

/* guard: the data kind -> kernel index map must be revisited whenever a new
 * data kind is appended to the enum */
static_assert(BeamformerDataKind_Float16Complex == (BeamformerDataKind_Count - 1), "");

#define X(type, ...) \
function BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(beamformer_reduce_a1s2_contrast_##type) \
{ \
	type *input_a = (type *)input_v + 0 * sample_count; \
	type *input_b = (type *)input_v + 1 * sample_count; \
	type *input_c = (type *)input_v + 2 * sample_count; \
	type *output  = (type *)output_v; \
	for (u32 sample = 0; sample < sample_count; sample++) \
		output[sample] = input_a[sample] - input_b[sample] - input_c[sample]; \
}
BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST
#undef X
    437 
/* Copies one frame of raw RF data into the shared memory scratch arena,
 * de-interleaving channels via the block's channel mapping and, for contrast
 * modes, reducing the acquisition triplets on the fly. On success publishes
 * the (block, size) pair for the beamformer to consume.
 * NOTE(review): the UploadRF lock is taken but never released here —
 * presumably the beamformer releases it after consuming the data, and a
 * failed ScratchSpace lock leaves UploadRF held; confirm intent. */
function b32
beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, u32 block)
{
	b32 result = 0;
	Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
	                                                       g_beamformer_library_context.shared_memory_size);
	BeamformerParameterBlock *b  = beamformer_parameter_block(g_beamformer_library_context.bp, block);
	BeamformerParameters     *bp = &b->parameters;
	BeamformerDataKind     data_kind     = b->pipeline.data_kind;
	BeamformerContrastMode contrast_mode = bp->contrast_mode;

	/* size: bytes kept after de-interleave/reduction; raw_size: bytes expected from the caller */
	u32 size     = bp->acquisition_count * bp->sample_count * bp->channel_count * beamformer_data_kind_byte_size[data_kind];
	u32 raw_size = bp->raw_data_dimensions.x * bp->raw_data_dimensions.y * beamformer_data_kind_byte_size[data_kind];

	if (lib_error_check(size <= arena_capacity(&scratch, u8), BufferOverflow) &&
	    lib_error_check(size <= data_size && data_size == raw_size, DataSizeMismatch))
	{
		if (lib_try_lock(BeamformerSharedMemoryLockKind_UploadRF, timeout_ms)) {
			if (lib_try_lock(BeamformerSharedMemoryLockKind_ScratchSpace, 0)) {
				u32 channel_count      = bp->channel_count;
				u32 out_channel_stride = beamformer_data_kind_byte_size[data_kind] * bp->sample_count * bp->acquisition_count;
				u32 in_channel_stride  = beamformer_data_kind_byte_size[data_kind] * bp->raw_data_dimensions.x;

				for (u32 channel = 0; channel < channel_count; channel++) {
					/* channel_mapping reorders hardware channels into logical order */
					u16 data_channel = (u16)b->channel_mapping[channel];
					u32 out_off = out_channel_stride * channel;
					u32 in_off  = in_channel_stride  * data_channel;
					switch (contrast_mode) {
					default:{
						/* NOTE(rnp): non temporal copy would be better, but we can't ensure
						 * 64 byte boundaries. */
						memory_copy(scratch.beg + out_off, (u8 *)data + in_off, out_channel_stride);
					}break;

					case BeamformerContrastMode_A1S2:{
						/* pick the reduction kernel by element storage type;
						 * complex kinds share the real kernel (see sample_count below) */
						read_only local_persist u8 reduce_a1s2_index_map[] = {
							[BeamformerDataKind_Int16]          = 0,
							[BeamformerDataKind_Int16Complex]   = 0,
							[BeamformerDataKind_Float32]        = 1,
							[BeamformerDataKind_Float32Complex] = 1,
							[BeamformerDataKind_Float16]        = 2,
							[BeamformerDataKind_Float16Complex] = 2,
						};
						static_assert(BeamformerDataKind_Float16Complex == (BeamformerDataKind_Count - 1), "");

						read_only local_persist beamformer_reduce_a1s2_contrast_fn *reduce_a1s2_fn_table[] = {
							#define X(type, ...) beamformer_reduce_a1s2_contrast_##type,
							BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST
							#undef X
						};

						// TODO(rnp): HACK: for some unknown reason loading contrast data after loading
						// non-contrast data causes the dataset to not be stored correctly (it looks
						// like mix of the old and new dataset). Putting this here fixes the issue.
						// Counter-intuitively this improves throughput on my zen4 test computer,
						// however it obviously should not be needed.
						memory_clear(scratch.beg + out_off, 0, out_channel_stride);

						/* element_count doubles the run for complex kinds so the
						 * real kernel processes re/im pairs as scalars */
						u32 sample_count = bp->sample_count * beamformer_data_kind_element_count[data_kind];
						reduce_a1s2_fn_table[reduce_a1s2_index_map[data_kind]](scratch.beg + out_off,
						                                                       (u8 *)data + in_off,
						                                                       sample_count);
					}break;
					}
				}

				lib_release_lock(BeamformerSharedMemoryLockKind_ScratchSpace);
				/* TODO(rnp): need a better way to communicate this */
				u64 rf_block_rf_size = (u64)block << 32ULL | (u64)size;
				atomic_store_u64(&g_beamformer_library_context.bp->rf_block_rf_size, rf_block_rf_size);
				result = 1;
			}
		}
	}
	return result;
}
    514 
/* Uploads one frame of raw data and queues an indirect compute dispatch for
 * it on parameter block `parameter_slot`, tagged with `image_plane_tag`.
 * Returns 0 on validation failure, upload failure, or a full work queue. */
b32
beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag, u32 parameter_slot)
{
	b32 result = 0;
	if (check_shared_memory()) {
		u32 reserved_blocks = g_beamformer_library_context.bp->reserved_parameter_blocks;
		if (lib_error_check(image_plane_tag < BeamformerViewPlaneTag_Count, InvalidImagePlane) &&
		    lib_error_check(parameter_slot < reserved_blocks, ParameterBlockUnallocated) &&
		    beamformer_push_data_base(data, data_size, g_beamformer_library_context.timeout_ms, parameter_slot))
		{
			BeamformWork *work = try_push_work_queue();
			if (work) {
				work->kind = BeamformerWorkKind_ComputeIndirect;
				work->compute_indirect_context.view_plane      = image_plane_tag;
				work->compute_indirect_context.parameter_block = parameter_slot;
				beamform_work_queue_push_commit(&g_beamformer_library_context.bp->external_work_queue);
				/* wake the beamformer so the dispatch is picked up promptly */
				beamformer_flush_commands();
				result = 1;
			}
		}
	}
	return result;
}
    538 
/* Validates and uploads a full BeamformerParameters struct into `block`, then
 * raises the NotifyUI flag so an attached UI picks up the new values */
b32
beamformer_push_parameters_at(BeamformerParameters *bp, u32 block)
{
	b32 result = check_shared_memory() && validate_parameters(bp);
	if (result) {
		result = parameter_block_region_upload(bp, sizeof(*bp), block,
		                                       BeamformerParameterBlockRegion_Parameters,
		                                       offsetof(BeamformerParameterBlock, parameters),
		                                       g_beamformer_library_context.timeout_ms);
		if (result) {
			BeamformerParameterBlock *pb = beamformer_parameter_block(g_beamformer_library_context.bp, block);
			atomic_or_u32(&pb->region_update_flags, 1u << BeamformerParameterRegionFlag_NotifyUI);
		}
	}
	return result;
}
    555 
    556 b32
    557 beamformer_push_parameters(BeamformerParameters *bp)
    558 {
    559 	b32 result = beamformer_push_parameters_at(bp, 0);
    560 	return result;
    561 }
    562 
/* Expands a BeamformerSimpleParameters bundle into the individual uploads:
 * parameters, pipeline, channel mapping, focal vectors, orientations,
 * (conditionally) sparse elements, and per-stage shader parameters. The
 * result is the AND of every upload; on failure some regions may already
 * have been updated. */
b32
beamformer_push_simple_parameters_at(BeamformerSimpleParameters *bp, u32 block)
{
	b32 result = check_shared_memory();
	if (result) {
		/* repack parallel (angle, depth) arrays into the v2 layout the beamformer expects */
		alignas(64) v2 focal_vectors[countof(bp->steering_angles)];
		for (u32 i = 0; i < countof(bp->steering_angles); i++)
			focal_vectors[i] = (v2){{bp->steering_angles[i], bp->focal_depths[i]}};

		result &= beamformer_push_parameters_at((BeamformerParameters *)bp, block);
		result &= beamformer_push_pipeline_at(bp->compute_stages, bp->compute_stages_count, (BeamformerDataKind)bp->data_kind, block);
		result &= beamformer_push_channel_mapping_at(bp->channel_mapping, bp->channel_count, block);
		result &= beamformer_push_focal_vectors_at((f32 *)focal_vectors, countof(focal_vectors), block);
		result &= beamformer_push_transmit_receive_orientations_at(bp->transmit_receive_orientations,
		                                                           bp->acquisition_count, block);

		/* only the sparse acquisition kinds carry element indices */
		if (bp->acquisition_kind == BeamformerAcquisitionKind_UFORCES ||
		    bp->acquisition_kind == BeamformerAcquisitionKind_UHERCULES)
		{
			result &= beamformer_push_sparse_elements_at(bp->sparse_elements, bp->acquisition_count, block);
		}

		for (u32 stage = 0; stage < bp->compute_stages_count; stage++)
			result &= beamformer_set_pipeline_stage_parameters_at(stage, bp->compute_stage_parameters[stage], block);
	}
	return result;
}
    590 
    591 b32
    592 beamformer_push_simple_parameters(BeamformerSimpleParameters *bp)
    593 {
    594 	b32 result = beamformer_push_simple_parameters_at(bp, 0);
    595 	return result;
    596 }
    597 
/* Queues an ExportBuffer work item and arms the ExportSync handshake lock
 * that beamformer_export() later waits on.
 * NOTE(review): if the ExportSync lock can't be taken, the already-pushed
 * work slot is abandoned uncommitted — presumably harmless since only
 * committed slots are consumed; confirm against the queue implementation. */
function b32
beamformer_export_buffer(BeamformerExportContext export_context)
{
	BeamformWork *work = try_push_work_queue();
	b32 result = work && lib_try_lock(BeamformerSharedMemoryLockKind_ExportSync, 0);
	if (result) {
		work->export_context = export_context;
		work->kind = BeamformerWorkKind_ExportBuffer;
		work->lock = BeamformerSharedMemoryLockKind_ScratchSpace;
		beamform_work_queue_push_commit(&g_beamformer_library_context.bp->external_work_queue);
	}
	return result;
}
    611 
/* Queues an export, waits (up to timeout_ms) for the beamformer to signal
 * completion via ExportSync, then copies export.size bytes out of the scratch
 * arena into `out`. Returns 0 on queue failure or timeout. */
function b32
beamformer_export(BeamformerExportContext export, void *out, i32 timeout_ms)
{
	b32 result = 0;
	if (beamformer_export_buffer(export)) {
		/* NOTE(rnp): if this fails it just means that the work from push_data hasn't
		 * started yet. This is here to catch the other case where the work started
		 * and finished before we finished queuing the export work item */
		beamformer_flush_commands();

		if (lib_try_lock(BeamformerSharedMemoryLockKind_ExportSync, timeout_ms)) {
			if (lib_try_lock(BeamformerSharedMemoryLockKind_ScratchSpace, 0)) {
				Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
				                                                       g_beamformer_library_context.shared_memory_size);
				mem_copy(out, scratch.beg, export.size);
				lib_release_lock(BeamformerSharedMemoryLockKind_ScratchSpace);
				result = 1;
			}
			lib_release_lock(BeamformerSharedMemoryLockKind_ExportSync);
		}
	}
	return result;
}
    635 
/* One-shot convenience: pushes parameters, uploads `data`, runs the compute
 * pipeline, and (when out_data is non-0) blocks until the beamformed volume
 * has been exported into out_data. Returns 0 on any failure along the way. */
b32
beamformer_beamform_data(BeamformerSimpleParameters *bp, void *data, uint32_t data_size,
                         void *out_data, int32_t timeout_ms)
{
	b32 result = beamformer_push_simple_parameters(bp);
	if (result) {
		/* clamp degenerate (zero) axes so the output size is never 0 */
		iv3 output_points = bp->output_points.xyz;
		output_points.E[0] = Max(1, output_points.E[0]);
		output_points.E[1] = Max(1, output_points.E[1]);
		output_points.E[2] = Max(1, output_points.E[2]);

		/* demodulation/hilbert stages produce complex (2 x f32) samples */
		b32 complex = 0;
		for (u32 stage = 0; stage < bp->compute_stages_count; stage++) {
			BeamformerShaderKind shader = (BeamformerShaderKind)bp->compute_stages[stage];
			complex |= shader == BeamformerShaderKind_Demodulate || shader == BeamformerShaderKind_CudaHilbert;
		}

		iz output_size = output_points.x * output_points.y * output_points.z * (i32)sizeof(f32);
		if (complex) output_size *= 2;

		Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
		                                                       g_beamformer_library_context.shared_memory_size);
		/* export only fits when the result fits in the scratch arena */
		if (out_data) result &= lib_error_check(output_size <= arena_capacity(&scratch, u8), ExportSpaceOverflow);

		if (result) {
			result = beamformer_push_data_with_compute(data, data_size, 0, 0);
			if (result && out_data) {
				BeamformerExportContext export;
				export.kind = BeamformerExportKind_BeamformedData;
				export.size = (u32)output_size;
				result = beamformer_export(export, out_data, timeout_ms);
			}
		}
	}
	return result;
}
    672 
    673 b32
    674 beamformer_compute_timings(BeamformerComputeStatsTable *output, i32 timeout_ms)
    675 {
    676 	b32 result = 0;
    677 	if (check_shared_memory()) {
    678 		Arena scratch = beamformer_shared_memory_scratch_arena(g_beamformer_library_context.bp,
    679 		                                                       g_beamformer_library_context.shared_memory_size);
    680 		if (lib_error_check((iz)sizeof(*output) <= arena_capacity(&scratch, u8), ExportSpaceOverflow)) {
    681 			BeamformerExportContext export;
    682 			export.kind = BeamformerExportKind_Stats;
    683 			export.size = sizeof(*output);
    684 			result = beamformer_export(export, output, timeout_ms);
    685 		}
    686 	}
    687 	return result;
    688 }
    689 
/* Claims the lowest set live-parameter dirty flag: returns its bit index and
 * clears it, or -1 when no flags are set (or shared memory is unavailable).
 * NOTE(review): the read and the clear are two separate atomic ops, so two
 * concurrent readers could claim the same flag — presumably there is a single
 * consumer; confirm. */
i32
beamformer_live_parameters_get_dirty_flag(void)
{
	i32 result = -1;
	if (check_shared_memory()) {
		u32 flag = ctz_u32(g_beamformer_library_context.bp->live_imaging_dirty_flags);
		if (flag != 32) { /* ctz of 0 yields 32: no bits set */
			atomic_and_u32(&g_beamformer_library_context.bp->live_imaging_dirty_flags, ~(1u << flag));
			result = (i32)flag;
		}
	}
	return result;
}
    703 
    704 BeamformerLiveImagingParameters *
    705 beamformer_get_live_parameters(void)
    706 {
    707 	BeamformerLiveImagingParameters *result = 0;
    708 	if (check_shared_memory()) result = &g_beamformer_library_context.bp->live_imaging_parameters;
    709 	return result;
    710 }
    711 
    712 b32
    713 beamformer_set_live_parameters(BeamformerLiveImagingParameters *new)
    714 {
    715 	b32 result = 0;
    716 	if (check_shared_memory()) {
    717 		mem_copy(&g_beamformer_library_context.bp->live_imaging_parameters, new, sizeof(*new));
    718 		store_fence();
    719 		result = 1;
    720 	}
    721 	return result;
    722 }