ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

util.c (16091B)


      1 /* See LICENSE for license details. */
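        /* NOTE: 12x12 Hadamard matrix (stored transposed); fill_hadamard_transpose combines it
         * with a power-of-two Hadamard matrix via a Kronecker product for sizes divisible by 12 */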
      2 static i32 hadamard_12_12_transpose[] = {
      3 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
      4 	1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1,
      5 	1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,
      6 	1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,
      7 	1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,
      8 	1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,
      9 	1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,
     10 	1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1,
     11 	1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,
     12 	1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1,
     13 	1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,
     14 	1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1,
     15 };
     16 
     17 #define zero_struct(s) mem_clear(s, 0, sizeof(*s))
     18 static void *
     19 mem_clear(void *p_, u8 c, iz size)
     20 {
     21 	u8 *p = p_;
     22 	while (size > 0) p[--size] = c;
     23 	return p;
     24 }
     25 
     26 static void
     27 mem_copy(void *restrict dest, void *restrict src, uz n)
     28 {
     29 	u8 *s = src, *d = dest;
     30 	for (; n; n--) *d++ = *s++;
     31 }
     32 
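        /* NOTE: handles overlapping ranges: copies forwards when dest is below src, backwards otherwise */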
     33 static void
     34 mem_move(u8 *dest, u8 *src, iz n)
     35 {
     36 	if (dest < src) mem_copy(dest, src, n);
     37 	else            while (n) { n--; dest[n] = src[n]; }
     38 }
     39 
     40 
     41 static u8 *
     42 arena_commit(Arena *a, iz size)
     43 {
     44 	ASSERT(a->end - a->beg >= size);
     45 	u8 *result = a->beg;
     46 	a->beg += size;
     47 	return result;
     48 }
     49 
     50 static void
     51 arena_pop(Arena *a, iz length)
     52 {
     53 	a->beg -= length;
     54 }
     55 
     56 #define alloc(a, t, n)    (t *)alloc_(a, sizeof(t), _Alignof(t), n)
     57 #define push_struct(a, t) (t *)alloc_(a, sizeof(t), _Alignof(t), 1)
     58 static void *
     59 alloc_(Arena *a, iz len, iz align, iz count)
     60 {
     61 	/* NOTE: special case 0 arena */
     62 	if (a->beg == 0)
     63 		return 0;
     64 
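        	/* NOTE: -beg & (align - 1) is the number of bytes needed to round beg up to the
        	 * next multiple of align (align must be a power of two) */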
     65 	iz padding   = -(uintptr_t)a->beg & (align - 1);
     66 	iz available = a->end - a->beg - padding;
     67 	if (available < 0 || count > available / len)
     68 		ASSERT(0 && "arena OOM\n");
     69 	void *p = a->beg + padding;
     70 	a->beg += padding + count * len;
     71 	/* TODO: Performance? */
     72 	return mem_clear(p, 0, count * len);
     73 }
     74 
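        /* NOTE: minimal dynamic arrays backed by an Arena; works with any struct that has
         * data/count/capacity members. Capacity doubles starting from DA_INITIAL_CAP; if the
         * array is not the most recent arena allocation it is first copied to the arena tip.
         * Hypothetical usage:
         *   struct { FooT *data; iz count; iz capacity; } foos = {0};
         *   *da_push(&arena, &foos) = value;
         */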
     75 enum { DA_INITIAL_CAP = 8 };
     76 #define da_reserve(a, s, n) \
     77   (s)->data = da_reserve_((a), (s)->data, &(s)->capacity, (s)->count + (n), \
     78                           _Alignof(typeof(*(s)->data)), sizeof(*(s)->data))
     79 
     80 #define da_push(a, s) \
     81   ((s)->count == (s)->capacity  \
     82     ? da_reserve(a, s, 1),      \
     83       (s)->data + (s)->count++  \
     84     : (s)->data + (s)->count++)
     85 
     86 static void *
     87 da_reserve_(Arena *a, void *data, iz *capacity, iz needed, iz align, iz size)
     88 {
     89 	iz cap = *capacity;
     90 
     91 	/* NOTE(rnp): handle both 0 initialized DAs and DAs that need to be moved (they started
     92 	 * on the stack or someone allocated something in the middle of the arena during usage) */
     93 	if (!data || a->beg != (u8 *)data + cap * size) {
     94 		void *copy = alloc_(a, size, align, cap);
     95 		if (data) mem_copy(copy, data, cap * size);
     96 		data = copy;
     97 	}
     98 
     99 	if (!cap) cap = DA_INITIAL_CAP;
    100 	while (cap < needed) cap *= 2;
    101 	alloc_(a, size, align, cap - *capacity);
    102 	*capacity = cap;
    103 	return data;
    104 }
    105 
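        /* NOTE: carves an aligned sub allocation of len bytes off the front of a and returns
         * it as an independent Arena */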
    106 static Arena
    107 sub_arena(Arena *a, iz len, iz align)
    108 {
    109 	Arena result = {0};
    110 
    111 	iz padding = -(uintptr_t)a->beg & (align - 1);
    112 	result.beg   = a->beg + padding;
    113 	result.end   = result.beg + len;
    114 	arena_commit(a, len + padding);
    115 
    116 	return result;
    117 }
    118 
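        /* NOTE: begin/end_temp_arena save and restore the arena mark so allocations made in
         * between can be discarded as a group */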
    119 static TempArena
    120 begin_temp_arena(Arena *a)
    121 {
    122 	TempArena result = {.arena = a, .old_beg = a->beg};
    123 	return result;
    124 }
    125 
    126 static void
    127 end_temp_arena(TempArena ta)
    128 {
    129 	Arena *a = ta.arena;
    130 	if (a) {
    131 		ASSERT(a->beg >= ta.old_beg);
    132 		a->beg = ta.old_beg;
    133 	}
    134 }
    135 
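        /* NOTE: encodes cp as UTF-8 into out and returns the number of bytes written (1-4);
         * codepoints above 0x10FFFF are emitted as '?' */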
    136 static u32
    137 utf8_encode(u8 *out, u32 cp)
    138 {
    139 	u32 result = 1;
    140 	if (cp <= 0x7F) {
    141 		out[0] = cp & 0x7F;
    142 	} else if (cp <= 0x7FF) {
    143 		result = 2;
    144 		out[0] = ((cp >>  6) & 0x1F) | 0xC0;
    145 		out[1] = ((cp >>  0) & 0x3F) | 0x80;
    146 	} else if (cp <= 0xFFFF) {
    147 		result = 3;
    148 		out[0] = ((cp >> 12) & 0x0F) | 0xE0;
    149 		out[1] = ((cp >>  6) & 0x3F) | 0x80;
    150 		out[2] = ((cp >>  0) & 0x3F) | 0x80;
    151 	} else if (cp <= 0x10FFFF) {
    152 		result = 4;
    153 		out[0] = ((cp >> 18) & 0x07) | 0xF0;
    154 		out[1] = ((cp >> 12) & 0x3F) | 0x80;
    155 		out[2] = ((cp >>  6) & 0x3F) | 0x80;
    156 		out[3] = ((cp >>  0) & 0x3F) | 0x80;
    157 	} else {
    158 		out[0] = '?';
    159 	}
    160 	return result;
    161 }
    162 
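        /* NOTE: decodes a single codepoint (including surrogate pairs); consumed is the number
         * of u16 units used, cp is U32_MAX when length is 0 */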
    163 static UnicodeDecode
    164 utf16_decode(u16 *data, iz length)
    165 {
    166 	UnicodeDecode result = {.cp = U32_MAX};
    167 	if (length) {
    168 		result.consumed = 1;
    169 		result.cp = data[0];
    170 		if (length > 1 && BETWEEN(data[0], 0xD800, 0xDBFF)
    171 		               && BETWEEN(data[1], 0xDC00, 0xDFFF))
    172 		{
    173 			result.consumed = 2;
    174 			result.cp = 0x10000 + ((data[0] - 0xD800) << 10) + (data[1] - 0xDC00);
    175 		}
    176 	}
    177 	return result;
    178 }
    179 
    180 static u32
    181 utf16_encode(u16 *out, u32 cp)
    182 {
    183 	u32 result = 1;
    184 	if (cp == U32_MAX) {
    185 		out[0] = '?';
    186 	} else if (cp < 0x10000) {
    187 		out[0] = cp;
    188 	} else {
    189 		u32 value = cp - 0x10000;
    190 		out[0] = 0xD800 + (value >> 10u);
    191 		out[1] = 0xDC00 + (value & 0x3FFu);
    192 		result = 2;
    193 	}
    194 	return result;
    195 }
    196 
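        /* NOTE: takes over the unused portion of the arena as fixed capacity stream storage;
         * the arena is marked fully used afterwards */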
    197 static Stream
    198 arena_stream(Arena *a)
    199 {
    200 	Stream result = {0};
    201 	result.data   = a->beg;
    202 	result.cap    = a->end - a->beg;
    203 	a->beg = a->end;
    204 	return result;
    205 }
    206 
    207 static Stream
    208 stream_alloc(Arena *a, iz cap)
    209 {
    210 	Stream result = {.cap = cap};
    211 	result.data = alloc(a, u8, cap);
    212 	return result;
    213 }
    214 
    215 static s8
    216 stream_to_s8(Stream *s)
    217 {
    218 	s8 result = {.len = s->widx, .data = s->data};
    219 	return result;
    220 }
    221 
    222 static void
    223 stream_reset(Stream *s, iz index)
    224 {
    225 	s->errors = s->cap <= index;
    226 	if (!s->errors)
    227 		s->widx = index;
    228 }
    229 
    230 static void
    231 stream_commit(Stream *s, iz count)
    232 {
    233 	s->errors |= !BETWEEN(s->widx + count, 0, s->cap);
    234 	if (!s->errors)
    235 		s->widx += count;
    236 }
    237 
    238 static void
    239 stream_append(Stream *s, void *data, iz count)
    240 {
    241 	s->errors |= (s->cap - s->widx) < count;
    242 	if (!s->errors) {
    243 		mem_copy(s->data + s->widx, data, count);
    244 		s->widx += count;
    245 	}
    246 }
    247 
    248 static void
    249 stream_append_byte(Stream *s, u8 b)
    250 {
    251 	stream_append(s, &b, 1);
    252 }
    253 
    254 static void
    255 stream_pad(Stream *s, u8 b, i32 n)
    256 {
    257 	while (n > 0) stream_append_byte(s, b), n--;
    258 }
    259 
    260 static void
    261 stream_append_s8(Stream *s, s8 str)
    262 {
    263 	stream_append(s, str.data, str.len);
    264 }
    265 
    266 static void
    267 stream_append_s8_array(Stream *s, s8 *strs, iz count)
    268 {
    269 	for (iz i = 0; i < count; i++)
    270 		stream_append(s, strs[i].data, strs[i].len);
    271 }
    272 
    273 static void
    274 stream_append_u64(Stream *s, u64 n)
    275 {
    276 	u8 tmp[64];
    277 	u8 *end = tmp + sizeof(tmp);
    278 	u8 *beg = end;
    279 	do { *--beg = '0' + (n % 10); } while (n /= 10);
    280 	stream_append(s, beg, end - beg);
    281 }
    282 
    283 static void
    284 stream_append_hex_u64(Stream *s, u64 n)
    285 {
    286 	if (!s->errors) {
    287 		static u8 hex[16] = {"0123456789abcdef"};
    288 		u8  buf[16];
    289 		u8 *end = buf + sizeof(buf);
    290 		u8 *beg = end;
    291 		while (n) {
    292 			*--beg = hex[n & 0x0F];
    293 			n >>= 4;
    294 		}
    295 		while (end - beg < 2)
    296 			*--beg = '0';
    297 		stream_append(s, beg, end - beg);
    298 	}
    299 }
    300 
    301 static void
    302 stream_append_i64(Stream *s, i64 n)
    303 {
    304 	if (n < 0) {
    305 		stream_append_byte(s, '-');
    306 		n *= -1;
    307 	}
    308 	stream_append_u64(s, n);
    309 }
    310 
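        /* NOTE: prec is a power of ten selecting the fractional digit count (e.g. 100 -> 2 digits) */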
    311 static void
    312 stream_append_f64(Stream *s, f64 f, i64 prec)
    313 {
    314 	if (f < 0) {
    315 		stream_append_byte(s, '-');
    316 		f *= -1;
    317 	}
    318 
    319 	/* NOTE: round last digit */
    320 	f += 0.5f / prec;
    321 
    322 	if (f >= (f64)(-1UL >> 1)) {
    323 		stream_append_s8(s, s8("inf"));
    324 	} else {
    325 		u64 integral = f;
    326 		u64 fraction = (f - integral) * prec;
    327 		stream_append_u64(s, integral);
    328 		stream_append_byte(s, '.');
    329 		for (i64 i = prec / 10; i > 1; i /= 10) {
    330 			if (i > fraction)
    331 				stream_append_byte(s, '0');
    332 		}
    333 		stream_append_u64(s, fraction);
    334 	}
    335 }
    336 
    337 static void
    338 stream_append_f64_e(Stream *s, f64 f)
    339 {
    340 	/* TODO: there should be a better way of doing this */
    341 	#if 0
    342 	/* NOTE: we ignore subnormal numbers for now */
    343 	union { f64 f; u64 u; } u = {.f = f};
    344 	i32 exponent = ((u.u >> 52) & 0x7ff) - 1023;
    345 	f32 log_10_of_2 = 0.301f;
    346 	i32 scale       = (exponent * log_10_of_2);
    347 	/* NOTE: normalize f */
    348 	for (i32 i = ABS(scale); i > 0; i--)
    349 		f *= (scale > 0)? 0.1f : 10.0f;
    350 	#else
    351 	i32 scale = 0;
    352 	if (f != 0) {
    353 		while (f > 1) {
    354 			f *= 0.1f;
    355 			scale++;
    356 		}
    357 		while (f < 1) {
    358 			f *= 10.0f;
    359 			scale--;
    360 		}
    361 	}
    362 	#endif
    363 
    364 	i32 prec = 100;
    365 	stream_append_f64(s, f, prec);
    366 	stream_append_byte(s, 'e');
    367 	stream_append_byte(s, scale >= 0? '+' : '-');
    368 	for (i32 i = prec / 10; i > 1; i /= 10)
    369 		stream_append_byte(s, '0');
    370 	stream_append_u64(s, ABS(scale));
    371 }
    372 
    373 static void
    374 stream_append_v2(Stream *s, v2 v)
    375 {
    376 	stream_append_byte(s, '{');
    377 	stream_append_f64(s, v.x, 100);
    378 	stream_append_s8(s, s8(", "));
    379 	stream_append_f64(s, v.y, 100);
    380 	stream_append_byte(s, '}');
    381 }
    382 
    383 /* NOTE(rnp): FNV-1a hash */
    384 static u64
    385 s8_hash(s8 v)
    386 {
    387 	u64 h = 0x3243f6a8885a308d; /* digits of pi */
    388 	for (; v.len; v.len--) {
    389 		h ^= v.data[v.len - 1] & 0xFF;
    390 		h *= 1111111111111111111; /* random prime */
    391 	}
    392 	return h;
    393 }
    394 
    395 static s8
    396 c_str_to_s8(char *cstr)
    397 {
    398 	s8 result = {.data = (u8 *)cstr};
    399 	if (cstr) { while (*cstr) { result.len++; cstr++; } }
    400 	return result;
    401 }
    402 
    403 /* NOTE(rnp): returns < 0 if byte is not found */
    404 static iz
    405 s8_scan_backwards(s8 s, u8 byte)
    406 {
    407 	iz result = s.len;
    408 	while (result && s.data[result - 1] != byte) result--;
    409 	result--;
    410 	return result;
    411 }
    412 
    413 static s8
    414 s8_cut_head(s8 s, iz cut)
    415 {
    416 	s8 result = s;
    417 	if (cut > 0) {
    418 		result.data += cut;
    419 		result.len  -= cut;
    420 	}
    421 	return result;
    422 }
    423 
    424 static s8
    425 s8_alloc(Arena *a, iz len)
    426 {
    427 	return (s8){ .data = alloc(a, u8, len), .len = len };
    428 }
    429 
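        /* NOTE: converts UTF-16 to UTF-8 in the arena; the result is NUL terminated but len
         * excludes the terminator, and unused bytes are returned to the arena */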
    430 static s8
    431 s16_to_s8(Arena *a, s16 in)
    432 {
    433 	s8 result = {0};
    434 	if (in.len) {
    435 		iz commit = in.len * 4;
    436 		iz length = 0;
    437 		u8 *data = arena_commit(a, commit + 1);
    438 		u16 *beg = in.data;
    439 		u16 *end = in.data + in.len;
    440 		while (beg < end) {
    441 			UnicodeDecode decode = utf16_decode(beg, end - beg);
    442 			length += utf8_encode(data + length, decode.cp);
    443 			beg    += decode.consumed;
    444 		}
    445 		data[length] = 0;
    446 		result = (s8){.len = length, .data = data};
    447 		arena_pop(a, commit - length);
    448 	}
    449 	return result;
    450 }
    451 
    452 static s16
    453 s8_to_s16(Arena *a, s8 in)
    454 {
    455 	s16 result = {0};
    456 	if (in.len) {
    457 		iz required = 2 * in.len + 1;
    458 		u16 *data   = alloc(a, u16, required);
    459 		iz length   = 0;
    460 		/* TODO(rnp): utf8_decode */
    461 		for (iz i = 0; i < in.len; i++) {
    462 			u32 cp  = in.data[i];
    463 			length += utf16_encode(data + length, cp);
    464 		}
    465 		result = (s16){.len = length, .data = data};
    466 		arena_pop(a, (required - length) * (iz)sizeof(u16));
    467 	}
    468 	return result;
    469 }
    470 
    471 static s8
    472 push_s8(Arena *a, s8 str)
    473 {
    474 	s8 result = s8_alloc(a, str.len);
    475 	mem_copy(result.data, str.data, result.len);
    476 	return result;
    477 }
    478 
    479 static s8
    480 push_s8_zero(Arena *a, s8 str)
    481 {
    482 	s8 result   = s8_alloc(a, str.len + 1);
    483 	result.len -= 1;
    484 	mem_copy(result.data, str.data, result.len);
    485 	return result;
    486 }
    487 
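        /* NOTE: largest power of two <= a; assumes a != 0 */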
    488 static u32
    489 round_down_power_of_2(u32 a)
    490 {
    491 	u32 result = 0x80000000UL >> clz_u32(a);
    492 	return result;
    493 }
    494 
    495 static b32
    496 uv2_equal(uv2 a, uv2 b)
    497 {
    498 	return a.x == b.x && a.y == b.y;
    499 }
    500 
    501 static b32
    502 uv3_equal(uv3 a, uv3 b)
    503 {
    504 	return a.x == b.x && a.y == b.y && a.z == b.z;
    505 }
    506 
    507 static v3
    508 cross(v3 a, v3 b)
    509 {
    510 	v3 result = {
    511 		.x = a.y * b.z - a.z * b.y,
    512 		.y = a.z * b.x - a.x * b.z,
    513 		.z = a.x * b.y - a.y * b.x,
    514 	};
    515 	return result;
    516 }
    517 
    518 static v3
    519 sub_v3(v3 a, v3 b)
    520 {
    521 	v3 result = {
    522 		.x = a.x - b.x,
    523 		.y = a.y - b.y,
    524 		.z = a.z - b.z,
    525 	};
    526 	return result;
    527 }
    528 
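        /* NOTE: returns the squared length */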
    529 static f32
    530 length_v3(v3 a)
    531 {
    532 	f32 result = a.x * a.x + a.y * a.y + a.z * a.z;
    533 	return result;
    534 }
    535 
    536 static v3
    537 normalize_v3(v3 a)
    538 {
    539 	f32 length = sqrt_f32(length_v3(a)); /* NOTE: length_v3 returns the squared length */
    540 	v3 result = {.x = a.x / length, .y = a.y / length, .z = a.z / length};
    541 	return result;
    542 }
    543 
    544 static v2
    545 clamp_v2_rect(v2 v, Rect r)
    546 {
    547 	v2 result = v;
    548 	result.x = CLAMP(v.x, r.pos.x, r.pos.x + r.size.x);
    549 	result.y = CLAMP(v.y, r.pos.y, r.pos.y + r.size.y);
    550 	return result;
    551 }
    552 
    553 static v2
    554 add_v2(v2 a, v2 b)
    555 {
    556 	v2 result = {
    557 		.x = a.x + b.x,
    558 		.y = a.y + b.y,
    559 	};
    560 	return result;
    561 }
    562 
    563 static v2
    564 sub_v2(v2 a, v2 b)
    565 {
    566 	v2 result = {
    567 		.x = a.x - b.x,
    568 		.y = a.y - b.y,
    569 	};
    570 	return result;
    571 }
    572 
    573 static v2
    574 scale_v2(v2 a, f32 scale)
    575 {
    576 	v2 result = {
    577 		.x = a.x * scale,
    578 		.y = a.y * scale,
    579 	};
    580 	return result;
    581 }
    582 
    583 static v2
    584 mul_v2(v2 a, v2 b)
    585 {
    586 	v2 result = {
    587 		.x = a.x * b.x,
    588 		.y = a.y * b.y,
    589 	};
    590 	return result;
    591 }
    592 
    593 static v2
    594 div_v2(v2 a, v2 b)
    595 {
    596 	v2 result;
    597 	result.x = a.x / b.x;
    598 	result.y = a.y / b.y;
    599 	return result;
    600 }
    601 
    602 
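        /* NOTE: truncates each component toward zero via an unsigned cast; only meaningful for
         * non-negative inputs */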
    603 static v2
    604 floor_v2(v2 a)
    605 {
    606 	v2 result;
    607 	result.x = (u32)a.x;
    608 	result.y = (u32)a.y;
    609 	return result;
    610 }
    611 
    612 static f32
    613 magnitude_v2(v2 a)
    614 {
    615 	f32 result = sqrt_f32(a.x * a.x + a.y * a.y);
    616 	return result;
    617 }
    618 
    619 static b32
    620 uv4_equal(uv4 a, uv4 b)
    621 {
    622 	return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w;
    623 }
    624 
    625 static v4
    626 sub_v4(v4 a, v4 b)
    627 {
    628 	v4 result;
    629 	result.x = a.x - b.x;
    630 	result.y = a.y - b.y;
    631 	result.z = a.z - b.z;
    632 	result.w = a.w - b.w;
    633 	return result;
    634 }
    635 
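        /* NOTE: split_rect_* divide rect at the given fraction of its width/height; cut_rect_*
         * divide at an absolute position. Either output pointer may be NULL */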
    636 static void
    637 split_rect_horizontal(Rect rect, f32 fraction, Rect *left, Rect *right)
    638 {
    639 	if (left) {
    640 		left->pos    = rect.pos;
    641 		left->size.h = rect.size.h;
    642 		left->size.w = rect.size.w * fraction;
    643 	}
    644 	if (right) {
    645 		right->pos    = rect.pos;
    646 		right->pos.x += rect.size.w * fraction;
    647 		right->size.h = rect.size.h;
    648 		right->size.w = rect.size.w * (1.0f - fraction);
    649 	}
    650 }
    651 
    652 static void
    653 split_rect_vertical(Rect rect, f32 fraction, Rect *top, Rect *bot)
    654 {
    655 	if (top) {
    656 		top->pos    = rect.pos;
    657 		top->size.w = rect.size.w;
    658 		top->size.h = rect.size.h * fraction;
    659 	}
    660 	if (bot) {
    661 		bot->pos    = rect.pos;
    662 		bot->pos.y += rect.size.h * fraction;
    663 		bot->size.w = rect.size.w;
    664 		bot->size.h = rect.size.h * (1.0f - fraction);
    665 	}
    666 }
    667 
    668 static void
    669 cut_rect_horizontal(Rect rect, f32 at, Rect *left, Rect *right)
    670 {
    671 	at = MIN(at, rect.size.w);
    672 	if (left) {
    673 		*left = rect;
    674 		left->size.w = at;
    675 	}
    676 	if (right) {
    677 		*right = rect;
    678 		right->pos.x  += at;
    679 		right->size.w -= at;
    680 	}
    681 }
    682 
    683 static void
    684 cut_rect_vertical(Rect rect, f32 at, Rect *top, Rect *bot)
    685 {
    686 	at = MIN(at, rect.size.h);
    687 	if (top) {
    688 		*top = rect;
    689 		top->size.h = at;
    690 	}
    691 	if (bot) {
    692 		*bot = rect;
    693 		bot->pos.y  += at;
    694 		bot->size.h -= at;
    695 	}
    696 }
    697 
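        /* NOTE: minimal decimal parser: optional leading '-', no exponents and no validation of
         * the input characters */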
    698 static f64
    699 parse_f64(s8 s)
    700 {
    701 	f64 integral = 0, fractional = 0, sign = 1;
    702 
    703 	if (s.len && *s.data == '-') {
    704 		sign = -1;
    705 		s.data++;
    706 		s.len--;
    707 	}
    708 
    709 	while (s.len && *s.data != '.') {
    710 		integral *= 10;
    711 		integral += *s.data - '0';
    712 		s.data++;
    713 		s.len--;
    714 	}
    715 
    716 	if (s.len && *s.data == '.') { s.data++; s.len--; }
    717 
    718 	while (s.len) {
    719 		ASSERT(s.data[s.len - 1] != '.');
    720 		fractional /= 10;
    721 		fractional += (f64)(s.data[--s.len] - '0') / 10.0;
    722 	}
    723 	f64 result = sign * (integral + fractional);
    724 	return result;
    725 }
    726 
    727 static FileWatchDirectory *
    728 lookup_file_watch_directory(FileWatchContext *ctx, u64 hash)
    729 {
    730 	FileWatchDirectory *result = 0;
    731 
    732 	for (u32 i = 0; i < ctx->directory_watch_count; i++) {
    733 		FileWatchDirectory *test = ctx->directory_watches + i;
    734 		if (test->hash == hash) {
    735 			result = test;
    736 			break;
    737 		}
    738 	}
    739 
    740 	return result;
    741 }
    742 
    743 static void
    744 insert_file_watch(FileWatchDirectory *dir, s8 name, iptr user_data, file_watch_callback *callback)
    745 {
    746 	ASSERT(dir->file_watch_count < ARRAY_COUNT(dir->file_watches));
    747 	FileWatch *fw = dir->file_watches + dir->file_watch_count++;
    748 	fw->hash      = s8_hash(name);
    749 	fw->user_data = user_data;
    750 	fw->callback  = callback;
    751 }
    752 
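        /* NOTE: writes scale * b into a b_dim sized block of out (row stride out_stride) four
         * elements at a time; b_dim.x must be a multiple of 4 */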
    753 static void
    754 fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim)
    755 {
    756 	f32x4 vscale = dup_f32x4(scale);
    757 	for (u32 i = 0; i < b_dim.y; i++) {
    758 		for (u32 j = 0; j < b_dim.x; j += 4, b += 4) {
    759 			f32x4 vb = cvt_i32x4_f32x4(load_i32x4(b));
    760 			store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vb)), out + j);
    761 		}
    762 		out += out_stride;
    763 	}
    764 }
    765 
    766 /* NOTE: this won't check for valid space/etc and assumes row major order */
    767 static void
    768 kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim)
    769 {
    770 	uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y};
    771 	ASSERT(out_dim.y % 4 == 0);
    772 	for (u32 i = 0; i < a_dim.y; i++) {
    773 		i32 *vout = out;
    774 		for (u32 j = 0; j < a_dim.x; j++, a++) {
    775 			fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
    776 			vout += b_dim.y;
    777 		}
    778 		out += out_dim.y * b_dim.x;
    779 	}
    780 }
    781 
    782 /* NOTE/TODO: to support even more hadamard sizes use the Paley construction */
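        /* NOTE: power of two sizes use Sylvester doubling (H_2k = [[H, H], [H, -H]]); sizes
         * divisible by 12 build the power of two part in tmp and combine it with the 12x12
         * matrix above via a Kronecker product, so tmp must hold (dim / 12)^2 entries */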
    783 static void
    784 fill_hadamard_transpose(i32 *out, i32 *tmp, u32 dim)
    785 {
    786 	ASSERT(dim);
    787 	b32 power_of_2     = ISPOWEROF2(dim);
    788 	b32 multiple_of_12 = dim % 12 == 0;
    789 
    790 	if (!power_of_2 && !multiple_of_12)
    791 		return;
    792 
    793 	if (!power_of_2) {
    794 		ASSERT(multiple_of_12);
    795 		dim /= 12;
    796 	}
    797 
    798 	i32 *m;
    799 	if (power_of_2) m = out;
    800 	else            m = tmp;
    801 
    802 	#define IND(i, j) ((i) * dim + (j))
    803 	m[0] = 1;
    804 	for (u32 k = 1; k < dim; k *= 2) {
    805 		for (u32 i = 0; i < k; i++) {
    806 			for (u32 j = 0; j < k; j++) {
    807 				i32 val = m[IND(i, j)];
    808 				m[IND(i + k, j)]     =  val;
    809 				m[IND(i, j + k)]     =  val;
    810 				m[IND(i + k, j + k)] = -val;
    811 			}
    812 		}
    813 	}
    814 	#undef IND
    815 
    816 	if (!power_of_2)
    817 		kronecker_product(out, tmp, (uv2){.x = dim, .y = dim}, hadamard_12_12_transpose,
    818 		                  (uv2){.x = 12, .y = 12});
    819 }