mc.c (14236B)
1 /* Oblique Monte Carlo 2 * Written by Randy Palamar - March-April 2024 3 * Based on: Hybrid-MC by Li Li 4 * Reference: Wang, "Biomedical optics", chapter 3 & 6 5 */ 6 7 /* Details: 8 * - Mostly uses Cartesian coordinates. Polar coordinates are used for 9 * specifying incident location. 10 * - Beam faces in positive z direction incident from anywhere in r-theta 11 * plane. Initial launch direction is always towards origin. 12 */ 13 14 #include <immintrin.h> 15 #include <math.h> 16 #include <pthread.h> 17 #include <stdarg.h> 18 #include <stddef.h> 19 #include <stdint.h> 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include <time.h> 23 24 #if defined(_DEBUG) && defined(__clang__) 25 #define ASSERT(c) do if (!(c)) __builtin_debugtrap(); while(0) 26 #elif defined(_DEBUG) /* as usual GCC is bad */ 27 #define ASSERT(c) do if (!(c)) asm("int3; nop"); while(0) 28 #else 29 #define ASSERT(a) 30 #endif 31 32 #define SGN(x) ((x) >= 0 ? 1 : -1) 33 #define ABS(x) ((x) >= 0 ? x : -x) 34 #define LEN(a) (sizeof(a) / sizeof(*a)) 35 #define DEG2RAD(a) ((a) * M_PI / 180.0) 36 37 #define ZERO 1.0e-6 38 #define BIGVAL 1.0e6 39 #define EPS 1.0e-9 40 41 typedef uint64_t u64; 42 typedef uint32_t u32; 43 typedef uint8_t u8; 44 typedef u32 b32; 45 typedef double f64; 46 typedef ptrdiff_t size; 47 48 typedef struct { u8 *data; size len; } s8; 49 #define s8(s) (s8){(u8 *)s, (LEN(s) - 1)} 50 51 typedef struct { f64 x, y, z; } Vec3; 52 typedef struct { f64 r, theta, z; } Vec3Pol; 53 typedef struct { f64 top, bot, left, right; } Rect; 54 typedef struct { u32 Nx, Ny; f64 *b; } Mat2; 55 56 typedef struct { 57 Vec3 pos; 58 Vec3 dir; 59 f64 w; 60 u32 n_scatters; 61 u32 dead; 62 } Photon; 63 64 typedef struct { 65 pthread_t id; 66 u64 rand_state[4]; 67 Mat2 Rd_xy; 68 u64 N_photons; 69 u32 highest_photon_scatters; 70 b32 done; 71 } WorkerCtx; 72 73 #include "config.h" 74 75 /* these are only modified during init() */ 76 static f64 g_dx, g_dy; 77 static f64 g_xoff, g_yoff; 78 static f64 g_mu_t; 79 80 static f64 g_d = 1e6; /* layer thickness in [cm] */ 81 82 /* these will be modified by multiple threads */ 83 static struct { pthread_mutex_t lock; u64 N; } completed_photons; 84 85 static void die(const char *, ...); 86 87 #if defined(__unix__) || defined(__APPLE__) 88 #include "posix.c" 89 #elif defined(_WIN32) 90 #include "win32.c" 91 #else 92 #error Unsupported Platform! 93 #endif 94 95 static void 96 die(const char *fmt, ...) 97 { 98 va_list ap; 99 100 va_start(ap, fmt); 101 vfprintf(stderr, fmt, ap); 102 va_end(ap); 103 104 os_pause(); 105 exit(1); 106 } 107 108 static Vec3 109 polar_to_cartesian(Vec3Pol v) 110 { 111 return (Vec3){ 112 .x = v.r * cos(v.theta), 113 .y = v.r * sin(v.theta), 114 .z = v.z 115 }; 116 } 117 118 static Vec3Pol 119 cartesian_to_polar(Vec3 v) 120 { 121 return (Vec3Pol){ 122 .r = sqrt(v.x * v.x + v.y * v.y), 123 .theta = atan2(v.y, v.x), 124 .z = v.z 125 }; 126 } 127 128 static size 129 c_str_len(char *s) 130 { 131 char *t; 132 for (t = s; *t; t++); 133 return t - s; 134 } 135 136 static void 137 mem_copy(s8 src, s8 dest) 138 { 139 ASSERT(src.len <= dest.len); 140 for (size i = 0; i < src.len; i++) 141 dest.data[i] = src.data[i]; 142 } 143 144 /* concatenates nstrs together. 0 terminates output for use with bad APIs */ 145 static s8 146 s8concat(s8 *strs, size nstrs) 147 { 148 s8 out = {0}; 149 for (size i = 0; i < nstrs; i++) 150 out.len += strs[i].len; 151 152 out.data = malloc(out.len + 1); 153 if (out.data == NULL) 154 die("s8concat\n"); 155 156 s8 tmp = out; 157 for (size i = 0; i < nstrs; i++) { 158 mem_copy(strs[i], tmp); 159 tmp.data += strs[i].len; 160 tmp.len -= strs[i].len; 161 } 162 out.data[out.len] = 0; 163 164 return out; 165 } 166 167 static void 168 dump_output(s8 pre, Mat2 Rd_xy) 169 { 170 s8 xy = s8("_xy.csv"); 171 s8 rd = s8("_Rd_xy.csv"); 172 s8 out = s8concat((s8 []){pre, xy}, 2); 173 os_file f = os_open(out, OS_WRITE); 174 175 u8 dbuf[4096]; 176 s8 buf = { .data = dbuf }; 177 178 os_write(f, s8("x [cm],y [cm]\n")); 179 for (u32 i = 0; i < g_Nx; i++) { 180 f64 x = (i + 0.5) * g_dx - g_xoff; 181 f64 y = (i + 0.5) * g_dy - g_yoff; 182 buf.len = snprintf((char *)buf.data, sizeof(dbuf), 183 "%e,%e\n", x, y); 184 os_write(f, buf); 185 } 186 187 printf("x,y axis written to: %s\n", out.data); 188 os_close(f); 189 free(out.data); 190 out = s8concat((s8 []){pre, rd}, 2); 191 f = os_open(out, OS_WRITE); 192 193 f64 scale = g_N_photons_per_line * g_N_lines * g_dx * g_dy; 194 f64 *b = Rd_xy.b; 195 for (u32 i = 0; i < Rd_xy.Nx; i++) { 196 for (u32 j = 0; j < Rd_xy.Ny; j++) { 197 buf.len = snprintf((char *)buf.data, sizeof(dbuf), 198 "%e,", b[j] / scale); 199 os_write(f, buf); 200 } 201 os_seek(f, -1, OS_SEEK_CUR); 202 os_write(f, s8("\n")); 203 b += Rd_xy.Ny; 204 } 205 printf("Rd_xy written to: %s\n", out.data); 206 os_close(f); 207 free(out.data); 208 } 209 210 /* fill in extra global ctx values */ 211 static void 212 init(void) 213 { 214 if (g_Nx != g_Ny) 215 die("Nx != Ny, output must be square!\n"); 216 f64 w = g_extent.right - g_extent.left; 217 f64 h = g_extent.top - g_extent.bot; 218 g_dx = w / g_Nx; 219 g_dy = h / g_Ny; 220 g_xoff = w / 2; 221 g_yoff = h / 2; 222 g_mu_t = g_mu_a + g_mu_s; 223 } 224 225 static void 226 random_init(u64 s[4]) 227 { 228 struct timespec ts; 229 clock_gettime(CLOCK_REALTIME, &ts); 230 s[0] = (intptr_t)&printf ^ ts.tv_sec ^ (u64)ts.tv_nsec; 231 s[1] = (intptr_t)&malloc ^ ts.tv_sec ^ (u64)ts.tv_nsec; 232 s[2] = (intptr_t)&die ^ ts.tv_sec ^ (u64)ts.tv_nsec; 233 s[3] = (intptr_t)&sin ^ ts.tv_sec ^ (u64)ts.tv_nsec; 234 } 235 236 static u64 237 xoroshiro256star(u64 s[4]) 238 { 239 u64 ts1 = s[1] * 5; 240 u64 result = ((ts1 << 7) | (ts1 >> 57)) * 9; 241 u64 t = s[1] << 17; 242 s[2] ^= s[0]; 243 s[3] ^= s[1]; 244 s[1] ^= s[2]; 245 s[0] ^= s[3]; 246 s[2] ^= t; 247 s[3] = ((s[3] << 45) | (s[3] >> 19)); 248 return result; 249 } 250 251 static f64 252 random_uniform(u64 s[4]) 253 { 254 return xoroshiro256star(s) / (f64)UINT64_MAX; 255 } 256 257 static Mat2 258 alloc_mat2(u32 x, u32 y) 259 { 260 Mat2 m; 261 m.b = calloc(x * y, sizeof(f64)); 262 if (m.b == NULL) 263 die("calloc\n"); 264 m.Nx = x; 265 m.Ny = y; 266 return m; 267 } 268 269 static void 270 sum_mat2(Mat2 m1, Mat2 m2) 271 { 272 u64 N_total = m1.Nx * m1.Ny; 273 f64 *b1 = m1.b; 274 f64 *b2 = m2.b; 275 276 #if defined(__AVX__) 277 while (N_total >= 4) { 278 __m256d v1 = _mm256_load_pd(b1); 279 __m256d v2 = _mm256_load_pd(b2); 280 _mm256_store_pd(b1, _mm256_add_pd(v1, v2)); 281 N_total -= 4; 282 b1 += 4; 283 b2 += 4; 284 } 285 #endif 286 for (u64 i = 0; i < N_total; i++) 287 b1[i] += b2[i]; 288 } 289 290 static Vec3 291 launch_direction_from_polar_angle(f64 theta) 292 { 293 f64 rdir = sin(g_theta_i) * g_n0 / g_n; 294 return (Vec3){ 295 .x = -rdir * cos(theta), 296 .y = -rdir * sin(theta), 297 .z = sqrt(1 - rdir * rdir) 298 }; 299 } 300 301 static void 302 launch_photon(Photon *p, Vec3Pol pos) 303 { 304 p->pos = polar_to_cartesian(pos); 305 p->dir = launch_direction_from_polar_angle(pos.theta); 306 p->n_scatters = 0; 307 p->dead = 0; 308 309 Vec3Pol dir = cartesian_to_polar(p->dir); 310 f64 sin_ti = sin(g_theta_i); 311 f64 cos_ti = cos(g_theta_i); 312 f64 sin_tt = dir.r; 313 f64 cos_tt = dir.z; 314 f64 raux1 = sin_ti * cos_tt - cos_ti * sin_tt; 315 raux1 *= raux1; 316 f64 raux2 = sin_ti * cos_tt + cos_ti * sin_tt; 317 raux2 *= raux2; 318 f64 rsp = raux1 / raux2 + raux1 * (1 - raux2) / raux2 / (1 - raux1); 319 rsp *= 0.5; 320 p->w = (1.0 - rsp); 321 } 322 323 static void 324 move_photon(Photon *p, f64 s) 325 { 326 p->pos.x += s * p->dir.x; 327 p->pos.y += s * p->dir.y; 328 p->pos.z += s * p->dir.z; 329 } 330 331 static void 332 absorb_photon(Photon *p) 333 { 334 f64 delta_w = p->w * g_mu_a / g_mu_t; /* eq 3.24 */ 335 p->w -= delta_w; 336 if (p->w < 0) 337 p->dead = 1; 338 } 339 340 static void 341 reflect_or_transmit_photon(Photon *p, WorkerCtx *ctx) 342 { 343 f64 sin_ai = sqrt(1 - p->dir.z * p->dir.z); 344 f64 sin_at = sin_ai * g_n / g_n0; /* eq. 3.35 */ 345 346 f64 r_ai; 347 if (sin_at < ZERO) { /* roughly normal */ 348 r_ai = (g_n - g_n0) / (g_n + g_n0); 349 r_ai *= r_ai; 350 } else if (sin_at > 1.0) { /* TIR */ 351 r_ai = 1; 352 } else { 353 f64 A = sin_ai * sqrt(1 - sin_at * sin_at) 354 - sin_at * sqrt(1 - sin_ai * sin_ai); 355 A *= A; 356 f64 B = sin_ai * sqrt(1 - sin_at * sin_at) 357 + sin_at * sqrt(1 - sin_ai * sin_ai); 358 B *= B; 359 r_ai = 0.5 * (2 * A - A * A - A * B) / (B * (1 - A)); /* eq 3.36 */ 360 } 361 362 f64 r = random_uniform(ctx->rand_state); 363 if (r <= r_ai) { 364 /* rebound to current layer */ 365 p->dir.z = -p->dir.z; 366 } else { 367 /* cross the boundary */ 368 if (p->dir.z < -ZERO) { 369 /* hit upper boundary. record reflactance if 370 * we are in bounds of output region */ 371 if (p->pos.x > -g_xoff && p->pos.x < g_xoff && 372 p->pos.y > -g_yoff && p->pos.y < g_yoff) { 373 u32 ri = (p->pos.y + g_yoff) / g_dy; 374 u32 ci = (p->pos.x + g_xoff) / g_dx; 375 ctx->Rd_xy.b[ri * ctx->Rd_xy.Nx + ci] += p->w; 376 } 377 } 378 p->dead = 1; 379 } 380 } 381 382 static void 383 scatter_photon(Photon *p, u64 rand_state[4]) 384 { 385 if (p->dead) 386 return; 387 388 f64 cos_t, phi; 389 f64 g = g_anisotropy; 390 if (g != 0) { 391 f64 r = random_uniform(rand_state); 392 f64 aa = (1 - g * g) / (1 - g + 2 * g * r); 393 cos_t = (1 + g * g - aa * aa) / (2 * g); 394 if (cos_t < -1) 395 cos_t = -1; 396 else if (cos_t > 1) 397 cos_t = 1; 398 } else { 399 cos_t = 2 * random_uniform(rand_state) - 1; 400 } /* eq. (3.28) */ 401 402 f64 sin_t, sin_phi, cos_phi; 403 sin_t = sqrt(1 - cos_t * cos_t); 404 phi = 2 * M_PI * random_uniform(rand_state); /* eq. (3.29) */ 405 cos_phi = cos(phi); 406 sin_phi = sqrt(1 - cos_phi * cos_phi); 407 408 Vec3 u_mu; 409 if (ABS(p->dir.z) < (1 - ZERO)) { 410 /* eq. (3.30) */ 411 Vec3 mu = p->dir; 412 f64 mu_zsq = sqrt(1 - mu.z * mu.z); 413 u_mu.x = sin_t * (mu.x * mu.z * cos_phi - mu.y * sin_phi) 414 / mu_zsq + mu.x * cos_t; 415 u_mu.y = sin_t * (mu.y * mu.z * cos_phi + mu.x * sin_phi) 416 / mu_zsq + mu.y * cos_t; 417 u_mu.z = -sin_t * cos_phi * mu_zsq + mu.z * cos_t; 418 } else { 419 /* close to normal propagation */ 420 /* eq. (3.31) */ 421 u_mu = (Vec3){ 422 .x = sin_t * cos_phi, 423 .y = sin_t * sin_phi, 424 .z = SGN(p->dir.z) * cos_t 425 }; 426 } 427 p->dir = u_mu; 428 p->n_scatters++; 429 } 430 431 static void 432 check_photon_life(Photon *p, u64 rand_state[4]) 433 { 434 if (p->dead) 435 return; 436 437 if (p->w > 1e-4) 438 return; 439 440 f64 m = 10; 441 f64 e = random_uniform(rand_state); 442 if (m * e > 1) { 443 p->dead = 1; 444 } else { 445 p->w *= m; 446 } 447 } 448 449 static f64 450 next_step(f64 s, u64 rand_state[4]) 451 { 452 if (s < ZERO) { 453 f64 r = random_uniform(rand_state); 454 s = -log(r + EPS); 455 } 456 return s; 457 } 458 459 static f64 460 step_towards_boundary(Photon *p) 461 { 462 f64 db; 463 if (p->dir.z < -ZERO) /* travel up */ 464 db = -p->pos.z / p->dir.z; 465 else if (p->dir.z > ZERO) /* travel down */ 466 db = (g_d - p->pos.z) / p->dir.z; 467 else 468 db = BIGVAL; 469 return db; 470 } 471 472 static void 473 simulate_photon(Photon *p, WorkerCtx *ctx) 474 { 475 f64 step = 0, boundary_dist = 0; 476 do { 477 step = next_step(step, ctx->rand_state); 478 boundary_dist = step_towards_boundary(p); 479 if (boundary_dist * g_mu_t <= step) { 480 move_photon(p, boundary_dist); 481 step -= boundary_dist * g_mu_t; 482 reflect_or_transmit_photon(p, ctx); 483 } else { 484 move_photon(p, step / g_mu_t); 485 absorb_photon(p); 486 scatter_photon(p, ctx->rand_state); 487 } 488 check_photon_life(p, ctx->rand_state); 489 } while (!p->dead); 490 } 491 492 static void 493 bump_completed_photons(u64 n) 494 { 495 pthread_mutex_lock(&completed_photons.lock); 496 completed_photons.N += n; 497 pthread_mutex_unlock(&completed_photons.lock); 498 } 499 500 static void * 501 worker_thread(WorkerCtx *ctx) 502 { 503 ctx->Rd_xy = alloc_mat2(g_Nx, g_Ny); 504 random_init(ctx->rand_state); 505 506 u64 photons_per_line = ctx->N_photons / g_N_lines; 507 u64 photon_inc = photons_per_line / 32; 508 u64 photon_rem = photons_per_line % 32; 509 for (u64 n_lines = g_N_lines; n_lines; n_lines--) { 510 /* cache starting photon; nothing here changes between runs */ 511 Photon p_start; 512 Vec3Pol pos = g_incidence_location; 513 pos.theta += (n_lines - 1) * 2 * M_PI / (f64)g_N_lines; 514 launch_photon(&p_start, pos); 515 for (u64 i = 1; i <= photons_per_line; i++) { 516 Photon p; 517 s8 src = {(u8 *)&p_start, sizeof(Photon)}; 518 s8 dest = {(u8 *)&p, sizeof(Photon)}; 519 mem_copy(src, dest); 520 simulate_photon(&p, ctx); 521 if (ctx->highest_photon_scatters < p.n_scatters) 522 ctx->highest_photon_scatters = p.n_scatters; 523 if (i % photon_inc == 0) 524 bump_completed_photons(photon_inc); 525 } 526 527 if (photon_rem != 0) 528 bump_completed_photons(photon_rem); 529 } 530 ctx->done = 1; 531 532 return NULL; 533 } 534 535 static void 536 print_progress(time_t start) 537 { 538 pthread_mutex_lock(&completed_photons.lock); 539 u64 n_done = completed_photons.N; 540 pthread_mutex_unlock(&completed_photons.lock); 541 542 time_t now; 543 time(&now); 544 u64 total_photons = g_N_photons_per_line * g_N_lines; 545 u64 n_remaining = total_photons - n_done; 546 f64 photons_per_sec = n_done / (f64)(now - start); 547 f64 sec_per_line = g_N_photons_per_line / photons_per_sec; 548 549 u32 secs_remaining = n_remaining / photons_per_sec; 550 u32 mins_remaining = secs_remaining / 60; 551 secs_remaining = secs_remaining % 60; 552 553 /* move cursor to line start and clear line */ 554 fputs("\r\x1B[K", stdout); 555 printf("\x1B[36;1m[%0.1f%%]\x1B[0m Photons/s = %0.2f | s/Line: %0.2f " 556 "| Time Remaining: %um %us", 100 * n_done/(f64)total_photons, 557 photons_per_sec, sec_per_line, mins_remaining, secs_remaining); 558 fflush(stdout); 559 } 560 561 int 562 main(int argc, char *argv[]) 563 { 564 if (argv[1]) 565 g_output_prefix = (s8){(u8 *)argv[1], c_str_len(argv[1])}; 566 if (g_output_prefix.len == 0) 567 die("usage: %s [output_prefix]\n", argv[0]); 568 569 /* TODO: check if prefix contains directories and ensure they exist */ 570 571 init(); 572 573 Mat2 Rd_xy_out = alloc_mat2(g_Nx, g_Ny); 574 575 pthread_mutex_init(&completed_photons.lock, NULL); 576 577 u32 thread_count = os_get_core_count(); 578 WorkerCtx *threads = calloc(thread_count, sizeof(WorkerCtx)); 579 if (!threads) 580 die("couldn't allocate thread contexts\n"); 581 582 /* adjust counts to make n_photons an even 583 * multiple of both g_N_lines and thread_count */ 584 g_N_photons_per_line += g_N_lines * g_N_photons_per_line % thread_count; 585 u64 n_photons = g_N_lines * g_N_photons_per_line / thread_count; 586 587 time_t tstart, tend; 588 time(&tstart); 589 for (u32 i = 0; i < thread_count; i++) { 590 threads[i].N_photons = n_photons; 591 pthread_create(&threads[i].id, NULL, 592 (void *(*)(void*))worker_thread, &threads[i]); 593 } 594 595 b32 all_done = 0; 596 u32 highest_photon_scatters = 0; 597 while (!all_done) { 598 struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 }; 599 nanosleep(&ts, NULL); 600 all_done = 1; 601 for (u32 i = 0; i < thread_count; i++) { 602 all_done &= threads[i].done; 603 if (threads[i].done && threads[i].Rd_xy.b) { 604 sum_mat2(Rd_xy_out, threads[i].Rd_xy); 605 free(threads[i].Rd_xy.b); 606 threads[i].Rd_xy.b = NULL; 607 if (highest_photon_scatters < threads[i].highest_photon_scatters) 608 highest_photon_scatters = threads[i].highest_photon_scatters; 609 } 610 } 611 print_progress(tstart); 612 } 613 time(&tend); 614 u32 secs = tend - tstart; 615 printf("\nRuntime: %um %us\n", secs/60, secs%60); 616 printf("Highest Number of Scatters: %u\n", highest_photon_scatters); 617 618 pthread_mutex_destroy(&completed_photons.lock); 619 620 dump_output(g_output_prefix, Rd_xy_out); 621 622 return 0; 623 }