beamformer.c (14082B)
1 /* See LICENSE for license details. */ 2 3 #include "beamformer_internal.h" 4 5 /* NOTE(rnp): magic variables to force discrete GPU usage on laptops with multiple devices */ 6 EXPORT i32 NvOptimusEnablement = 1; 7 EXPORT i32 AmdPowerXpressRequestHighPerformance = 1; 8 9 #if !BEAMFORMER_DEBUG 10 #include "beamformer_core.c" 11 #else 12 13 typedef void beamformer_frame_step_fn(BeamformerInput *); 14 15 #define BEAMFORMER_DEBUG_ENTRY_POINTS \ 16 X(beamformer_debug_ui_deinit) \ 17 X(beamformer_complete_compute) \ 18 X(beamformer_frame_step) \ 19 X(beamformer_rf_upload) \ 20 21 #define X(name) global name ##_fn *name; 22 BEAMFORMER_DEBUG_ENTRY_POINTS 23 #undef X 24 25 BEAMFORMER_EXPORT void 26 beamformer_debug_hot_release(BeamformerInput *input) 27 { 28 BeamformerCtx *ctx = BeamformerContextMemory(input->memory); 29 30 // TODO(rnp): this will deadlock if live imaging is active 31 /* NOTE(rnp): spin until compute thread finishes its work (we will probably 32 * never reload while compute is in progress but just incase). */ 33 spin_wait(atomic_load_u32(&ctx->upload_worker.awake)); 34 spin_wait(atomic_load_u32(&ctx->compute_worker.awake)); 35 } 36 37 BEAMFORMER_EXPORT void 38 beamformer_debug_hot_reload(OSLibrary library, BeamformerInput *input) 39 { 40 #define X(name) name = os_lookup_symbol(library, #name); 41 BEAMFORMER_DEBUG_ENTRY_POINTS 42 #undef X 43 44 s8 info = beamformer_info("reloaded main executable"); 45 os_console_log(info.data, info.len); 46 } 47 48 #endif /* BEAMFORMER_DEBUG */ 49 50 function no_return void 51 fatal(s8 message) 52 { 53 os_fatal(message.data, message.len); 54 unreachable(); 55 } 56 57 #include "vulkan.c" 58 59 // TODO(rnp): this doesn't belong here, but will be removed 60 // once vulkan migration is complete 61 void * glfwGetProcAddress(char *); 62 63 function void 64 gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, const void *userctx) 65 { 66 Stream *e = (Stream *)userctx; 67 stream_append_s8s(e, s8("[OpenGL] "), (s8){.len = len, .data = (u8 *)msg}, s8("\n")); 68 os_console_log(e->data, e->widx); 69 stream_reset(e, 0); 70 } 71 72 function void 73 load_gl(Stream *err) 74 { 75 #define X(name, ret, params) name = (name##_fn *)glfwGetProcAddress(#name); 76 OGLProcedureList 77 OGLRequiredExtensionProcedureList 78 #undef X 79 80 stream_reset(err, 0); 81 #define X(name, ret, params) if (!name) stream_append_s8(err, s8("missing required GL function: " #name "\n")); 82 OGLProcedureList 83 OGLRequiredExtensionProcedureListBase 84 #if OS_WINDOWS 85 OGLRequiredExtensionProcedureListW32 86 #else 87 OGLRequiredExtensionProcedureListLinux 88 #endif 89 #undef X 90 91 if (err->widx) fatal(stream_to_s8(err)); 92 } 93 94 function void 95 beamformer_load_cuda_library(BeamformerCtx *ctx, OSLibrary cuda, Arena arena) 96 { 97 /* TODO(rnp): (25.10.30) registering the rf buffer with CUDA is currently 98 * causing a major performance regression. for now we are disabling its use 99 * altogether. it will be reenabled once the issue can be fixed */ 100 b32 result = 0 && vk_gpu_info()->vendor == GPUVendor_NVIDIA && ValidHandle(cuda); 101 if (result) { 102 Stream err = arena_stream(arena); 103 104 stream_append_s8(&err, beamformer_info("loading CUDA library functions")); 105 #define X(name, symname) cuda_## name = os_lookup_symbol(cuda, symname); 106 CUDALibraryProcedureList 107 #undef X 108 109 os_console_log(err.data, err.widx); 110 } 111 112 #define X(name, symname) if (!cuda_## name) cuda_## name = cuda_ ## name ## _stub; 113 CUDALibraryProcedureList 114 #undef X 115 } 116 117 function void 118 worker_thread_sleep(GLWorkerThreadContext *ctx, BeamformerSharedMemory *sm) 119 { 120 for (;;) { 121 i32 expected = 0; 122 if (atomic_cas_u32(&ctx->sync_variable, &expected, 1) || 123 atomic_load_u32(&sm->live_imaging_parameters.active)) 124 { 125 break; 126 } 127 128 /* TODO(rnp): clean this crap up; we shouldn't need two values to communicate this */ 129 atomic_store_u32(&ctx->awake, 0); 130 os_wait_on_address(&ctx->sync_variable, 1, (u32)-1); 131 atomic_store_u32(&ctx->awake, 1); 132 } 133 } 134 135 function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point) 136 { 137 GLWorkerThreadContext *ctx = user_context; 138 139 BeamformerCtx *beamformer = (BeamformerCtx *)ctx->user_context; 140 141 for (;;) { 142 worker_thread_sleep(ctx, beamformer->shared_memory); 143 asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg); 144 beamformer_complete_compute(beamformer, &ctx->arena); 145 } 146 147 unreachable(); 148 149 return 0; 150 } 151 152 function OS_THREAD_ENTRY_POINT_FN(beamformer_upload_entry_point) 153 { 154 GLWorkerThreadContext *ctx = user_context; 155 BeamformerUploadThreadContext *up = (typeof(up))ctx->user_context; 156 157 for (;;) { 158 worker_thread_sleep(ctx, up->shared_memory); 159 beamformer_rf_upload(up); 160 } 161 162 unreachable(); 163 164 return 0; 165 } 166 167 BEAMFORMER_EXPORT void 168 beamformer_init(BeamformerInput *input) 169 { 170 Arena memory = arena_from_memory(input->memory, input->memory_size); 171 Arena compute_arena = sub_arena_end(&memory, MB(2), KB(4)); 172 Arena upload_arena = sub_arena_end(&memory, KB(4), KB(4)); 173 Arena ui_arena = sub_arena_end(&memory, MB(2), KB(4)); 174 Stream error = arena_stream(sub_arena_end(&memory, MB(1), 1)); 175 176 BeamformerCtx *ctx = push_struct(&memory, BeamformerCtx); 177 178 str8 window_title = str8("VK Beamformer"); 179 ctx->main_window = os_window_create(window_title.data, window_title.length, 1280, 840); 180 ctx->window_size = (iv2){{1280, 840}}; 181 182 Arena scratch = {.beg = memory.end - 4096L, .end = memory.end}; 183 memory.end = scratch.beg; 184 185 ctx->error_stream = error; 186 ctx->ui_backing_store = ui_arena; 187 ctx->compute_worker.arena = compute_arena; 188 ctx->upload_worker.arena = upload_arena; 189 190 #if BEAMFORMER_RENDERDOC_HOOKS 191 start_frame_capture = input->renderdoc_start_frame_capture; 192 end_frame_capture = input->renderdoc_end_frame_capture; 193 set_capture_path_template = input->renderdoc_set_capture_file_path_template; 194 #endif 195 196 vk_load(input->vulkan_library_handle, &memory, &ctx->error_stream); 197 198 BeamformerComputeContext *cs = &ctx->compute_context; 199 200 // NOTE(rnp): allocate beamformed image ring buffer 201 { 202 u64 gpu_heap_size = vk_gpu_info()->gpu_heap_size; 203 u64 trial_sizes[] = { 204 GB(4), 205 GB(2), 206 GB(1) + MB(512), 207 GB(1), 208 }; 209 210 u32 base_index = 0; 211 for EachElement(trial_sizes, it) { 212 if (gpu_heap_size >= 2 * trial_sizes[it]) 213 break; 214 base_index++; 215 } 216 217 for (u32 i = base_index; i < countof(trial_sizes); i++) { 218 // TODO(rnp): it may be better to download data from this using the transfer queue 219 VulkanTimeline timelines[] = {VulkanTimeline_Compute, VulkanTimeline_Graphics}; 220 GPUBufferAllocateInfo allocate_info = { 221 .size = trial_sizes[i], 222 .flags = VulkanUsageFlag_TransferSource|VulkanUsageFlag_HostReadWrite, 223 .timeline_count = countof(timelines), 224 .timelines_used = timelines, 225 .label = str8("BeamformedData"), 226 }; 227 vk_buffer_allocate(cs->backlog.buffer, &allocate_info); 228 if (cs->backlog.buffer->size > 0) 229 break; 230 } 231 if (cs->backlog.buffer->size == 0) { 232 // NOTE(rnp): if this becomes an issue we may be able to get by in some other way 233 fatal(s8("Failed to allocate space for beamformed data\n")); 234 } 235 236 BeamformerShaderResourceInfo shader_resource_infos[] = { 237 { 238 .kind = BeamformerShaderResourceKind_Buffer, 239 .handle = cs->backlog.buffer->handle, 240 .slot = BeamformerShaderBufferSlot_BeamformedData, 241 }, 242 }; 243 vk_bind_shader_resources(shader_resource_infos, countof(shader_resource_infos)); 244 } 245 246 beamformer_load_cuda_library(ctx, input->cuda_library_handle, memory); 247 248 load_gl(&ctx->error_stream); 249 250 ctx->shared_memory = input->shared_memory; 251 ctx->shared_memory_size = input->shared_memory_size; 252 if (ctx->shared_memory_size < (i64)sizeof(*ctx->shared_memory)) 253 fatal(s8("Get more ram lol\n")); 254 zero_struct(ctx->shared_memory); 255 256 ctx->shared_memory->version = BEAMFORMER_SHARED_MEMORY_VERSION; 257 ctx->shared_memory->reserved_parameter_blocks = 1; 258 259 ctx->shared_memory->beamformed_frame_buffer_size = cs->backlog.buffer->size; 260 261 // TODO(rnp): dynamic rf data buffer slot usage 262 // NOTE(rnp): will be same as the max size we were able to get for the frame buffer 263 ctx->shared_memory->capabilities.max_rf_data_size = cs->backlog.buffer->size 264 / BeamformerMaxRawDataFramesInFlight; 265 266 ctx->shared_memory->capabilities.cuda = cuda_supported(); 267 // TODO(rnp): re-enable hilbert support, with and without cuda 268 ctx->shared_memory->capabilities.hilbert = 0; 269 270 /* TODO(rnp): I'm not sure if its a good idea to pre-reserve a bunch of semaphores 271 * on w32 but thats what we are doing for now */ 272 #if OS_WINDOWS 273 { 274 Stream sb = arena_stream(memory); 275 stream_append(&sb, input->shared_memory_name, input->shared_memory_name_length); 276 stream_append_s8(&sb, s8("_lock_")); 277 i32 start_index = sb.widx; 278 for EachElement(os_w32_shared_memory_semaphores, it) { 279 stream_reset(&sb, start_index); 280 stream_append_u64(&sb, it); 281 stream_append_byte(&sb, 0); 282 os_w32_shared_memory_semaphores[it] = os_w32_create_semaphore((c8 *)sb.data, 1, 1); 283 if InvalidHandle(os_w32_shared_memory_semaphores[it]) 284 fatal(beamformer_info("init: failed to create w32 shared memory semaphore\n")); 285 286 /* NOTE(rnp): hacky garbage because CreateSemaphore will just open an existing 287 * semaphore without any indication. Sometimes the other side of the shared memory 288 * will provide incorrect parameters or will otherwise fail and its faster to 289 * restart this program than to get that application to release the semaphores */ 290 /* TODO(rnp): figure out something more robust */ 291 os_w32_semaphore_release(os_w32_shared_memory_semaphores[it], 1); 292 } 293 } 294 #endif 295 296 GLWorkerThreadContext *worker = &ctx->compute_worker; 297 /* TODO(rnp): we should lock this down after we have something working */ 298 worker->user_context = (iptr)ctx; 299 worker->handle = os_create_thread("[compute]", worker, compute_worker_thread_entry_point); 300 301 GLWorkerThreadContext *upload = &ctx->upload_worker; 302 BeamformerUploadThreadContext *upctx = push_struct(&memory, typeof(*upctx)); 303 upload->user_context = (iptr)upctx; 304 upctx->rf_buffer = &cs->rf_buffer; 305 upctx->shared_memory = ctx->shared_memory; 306 upctx->shared_memory_size = ctx->shared_memory_size; 307 upctx->compute_timing_table = ctx->compute_timing_table; 308 upctx->compute_worker_sync = &ctx->compute_worker.sync_variable; 309 upload->handle = os_create_thread("[upload]", upload, beamformer_upload_entry_point); 310 311 /* NOTE: set up OpenGL debug logging */ 312 Stream *gl_error_stream = push_struct(&memory, Stream); 313 *gl_error_stream = stream_alloc(&memory, 1024); 314 glDebugMessageCallback(gl_debug_logger, gl_error_stream); 315 #ifdef _DEBUG 316 glEnable(GL_DEBUG_OUTPUT); 317 #endif 318 319 if (!BakeShaders) 320 { 321 for EachElement(beamformer_reloadable_compute_shader_info_indices, it) { 322 i32 index = beamformer_reloadable_compute_shader_info_indices[it]; 323 Arena temp = scratch; 324 s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"), 325 beamformer_reloadable_shader_files[index][0]); 326 BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc)); 327 frc->kind = BeamformerFileReloadKind_ComputeShader; 328 frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index]; 329 os_add_file_watch((char *)file.data, file.len, frc); 330 } 331 332 for EachElement(beamformer_reloadable_compute_helpers_shader_info_indices, it) { 333 i32 index = beamformer_reloadable_compute_helpers_shader_info_indices[it]; 334 Arena temp = scratch; 335 s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"), 336 beamformer_reloadable_shader_files[index][0]); 337 BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc)); 338 frc->kind = BeamformerFileReloadKind_ComputeShader; 339 frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index]; 340 os_add_file_watch((char *)file.data, file.len, frc); 341 } 342 343 for EachElement(beamformer_reloadable_compute_internal_shader_info_indices, it) { 344 i32 index = beamformer_reloadable_compute_internal_shader_info_indices[it]; 345 Arena temp = scratch; 346 s8 file = push_s8_from_parts(&temp, os_path_separator(), s8("shaders"), 347 beamformer_reloadable_shader_files[index][0]); 348 BeamformerFileReloadContext *frc = push_struct(&memory, typeof(*frc)); 349 frc->kind = BeamformerFileReloadKind_ComputeInternalShader; 350 frc->shader_reload.shader = beamformer_reloadable_shader_kinds[index]; 351 frc->shader_reload.pipeline = cs->compute_internal_pipelines + it; 352 os_add_file_watch((char *)file.data, file.len, frc); 353 } 354 } 355 356 memory.end = scratch.end; 357 ctx->arena = memory; 358 ctx->state = BeamformerState_Running; 359 } 360 361 BEAMFORMER_EXPORT void 362 beamformer_terminate(BeamformerInput *input) 363 { 364 /* NOTE(rnp): work around pebkac when the beamformer is closed while we are doing live 365 * imaging. if the verasonics is blocked in an external function (calling the library 366 * to start compute) it is impossible for us to get it to properly shut down which 367 * will sometimes result in us needing to power cycle the system. set the shared memory 368 * into an error state and release dispatch lock so that future calls will error instead 369 * of blocking. 370 */ 371 BeamformerCtx * ctx = BeamformerContextMemory(input->memory); 372 BeamformerSharedMemory * sm = input->shared_memory; 373 if (ctx->state != BeamformerState_Terminated) { 374 if (sm) { 375 BeamformerSharedMemoryLockKind lock = BeamformerSharedMemoryLockKind_DispatchCompute; 376 atomic_store_u32(&sm->invalid, 1); 377 atomic_store_u32(&sm->external_work_queue.ridx, sm->external_work_queue.widx); 378 DEBUG_DECL(if (sm->locks[lock])) { 379 beamformer_shared_memory_release_lock(sm, (i32)lock); 380 } 381 382 atomic_or_u32(&sm->live_imaging_dirty_flags, BeamformerLiveImagingDirtyFlags_StopImaging); 383 } 384 385 beamformer_debug_ui_deinit(ctx); 386 387 ctx->state = BeamformerState_Terminated; 388 } 389 } 390 391 BEAMFORMER_EXPORT u32 392 beamformer_should_close(BeamformerInput *input) 393 { 394 BeamformerCtx * ctx = BeamformerContextMemory(input->memory); 395 if (ctx->state == BeamformerState_ShouldClose) 396 beamformer_terminate(input); 397 return ctx->state == BeamformerState_Terminated; 398 }