ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | LICENSE

util.c (14915B)


      1 /* See LICENSE for license details. */
/* NOTE: transpose of a 12x12 Hadamard matrix (entries +1/-1, rows mutually
 * orthogonal). Serves as the base factor for building 12 * 2^k sized
 * Hadamard matrices via a Kronecker product in fill_hadamard_transpose() */
static i32 hadamard_12_12_transpose[] = {
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1,
	1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,
	1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,
	1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,
	1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,
	1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,
	1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1,
	1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,
	1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1,
	1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,
	1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1,
};
     16 
     17 #define zero_struct(s) mem_clear(s, 0, sizeof(*s))
     18 static void *
     19 mem_clear(void *p_, u8 c, iz size)
     20 {
     21 	u8 *p = p_;
     22 	while (size > 0) p[--size] = c;
     23 	return p;
     24 }
     25 
     26 static void
     27 mem_copy(void *restrict dest, void *restrict src, uz n)
     28 {
     29 	ASSERT(n >= 0);
     30 	u8 *s = src, *d = dest;
     31 	for (; n; n--) *d++ = *s++;
     32 }
     33 
     34 static void
     35 mem_move(u8 *dest, u8 *src, iz n)
     36 {
     37 	if (dest < src) mem_copy(dest, src, n);
     38 	else            while (n) { n--; dest[n] = src[n]; }
     39 }
     40 
     41 
     42 static u8 *
     43 arena_commit(Arena *a, iz size)
     44 {
     45 	ASSERT(a->end - a->beg >= size);
     46 	u8 *result = a->beg;
     47 	a->beg += size;
     48 	return result;
     49 }
     50 
/* Returns the most recently committed length bytes to the arena.
 * NOTE: length is a byte count, not an element count. */
static void
arena_pop(Arena *a, iz length)
{
	a->beg -= length;
}
     56 
#define alloc(a, t, n)    (t *)alloc_(a, sizeof(t), _Alignof(t), n)
#define push_struct(a, t) (t *)alloc_(a, sizeof(t), _Alignof(t), 1)
/* Bump-allocates count elements of len bytes each, aligned to align, from
 * arena a. Returned memory is zeroed. Asserts on exhaustion. align must be
 * a power of two (the padding computation relies on it). */
static void *
alloc_(Arena *a, iz len, iz align, iz count)
{
	/* NOTE: special case 0 arena */
	if (a->beg == 0)
		return 0;

	/* bytes needed to reach the next align boundary */
	iz padding   = -(uintptr_t)a->beg & (align - 1);
	iz available = a->end - a->beg - padding;
	/* division form avoids overflow in count * len while checking fit */
	if (available < 0 || count > available / len)
		ASSERT(0 && "arena OOM\n");
	void *p = a->beg + padding;
	a->beg += padding + count * len;
	/* TODO: Performance? */
	return mem_clear(p, 0, count * len);
}
     75 
     76 static Arena
     77 sub_arena(Arena *a, iz len, iz align)
     78 {
     79 	Arena result = {0};
     80 
     81 	iz padding = -(uintptr_t)a->beg & (align - 1);
     82 	result.beg   = a->beg + padding;
     83 	result.end   = result.beg + len;
     84 	arena_commit(a, len + padding);
     85 
     86 	return result;
     87 }
     88 
     89 static TempArena
     90 begin_temp_arena(Arena *a)
     91 {
     92 	TempArena result = {.arena = a, .old_beg = a->beg};
     93 	return result;
     94 }
     95 
     96 static void
     97 end_temp_arena(TempArena ta)
     98 {
     99 	Arena *a = ta.arena;
    100 	if (a) {
    101 		ASSERT(a->beg >= ta.old_beg)
    102 		a->beg = ta.old_beg;
    103 	}
    104 }
    105 
/* Encodes code point cp as UTF-8 into out (caller provides >= 4 bytes);
 * returns the number of bytes written (1-4). cp > 0x10FFFF emits a single
 * '?'. NOTE(review): surrogate code points (0xD800-0xDFFF) are not
 * rejected and will be encoded as 3-byte sequences -- confirm callers. */
static u32
utf8_encode(u8 *out, u32 cp)
{
	u32 result = 1;
	if (cp <= 0x7F) {
		/* 1 byte: 0xxxxxxx */
		out[0] = cp & 0x7F;
	} else if (cp <= 0x7FF) {
		/* 2 bytes: 110xxxxx 10xxxxxx */
		result = 2;
		out[0] = ((cp >>  6) & 0x1F) | 0xC0;
		out[1] = ((cp >>  0) & 0x3F) | 0x80;
	} else if (cp <= 0xFFFF) {
		/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		result = 3;
		out[0] = ((cp >> 12) & 0x0F) | 0xE0;
		out[1] = ((cp >>  6) & 0x3F) | 0x80;
		out[2] = ((cp >>  0) & 0x3F) | 0x80;
	} else if (cp <= 0x10FFFF) {
		/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		result = 4;
		out[0] = ((cp >> 18) & 0x07) | 0xF0;
		out[1] = ((cp >> 12) & 0x3F) | 0x80;
		out[2] = ((cp >>  6) & 0x3F) | 0x80;
		out[3] = ((cp >>  0) & 0x3F) | 0x80;
	} else {
		out[0] = '?';
	}
	return result;
}
    132 
    133 static UnicodeDecode
    134 utf16_decode(u16 *data, iz length)
    135 {
    136 	UnicodeDecode result = {.cp = U32_MAX};
    137 	if (length) {
    138 		result.consumed = 1;
    139 		result.cp = data[0];
    140 		if (length > 1 && BETWEEN(data[0], 0xD800, 0xDBFF)
    141 		               && BETWEEN(data[1], 0xDC00, 0xDFFF))
    142 		{
    143 			result.consumed = 2;
    144 			result.cp = ((data[0] - 0xD800) << 10) | ((data[1] - 0xDC00) + 0x10000);
    145 		}
    146 	}
    147 	return result;
    148 }
    149 
    150 static u32
    151 utf16_encode(u16 *out, u32 cp)
    152 {
    153 	u32 result = 1;
    154 	if (cp == U32_MAX) {
    155 		out[0] = '?';
    156 	} else if (cp < 0x10000) {
    157 		out[0] = cp;
    158 	} else {
    159 		u32 value = cp - 0x10000;
    160 		out[0] = 0xD800 + (value >> 10u);
    161 		out[1] = 0xDC00 + (value & 0x3FFu);
    162 		result = 2;
    163 	}
    164 	return result;
    165 }
    166 
    167 static Stream
    168 arena_stream(Arena *a)
    169 {
    170 	Stream result = {0};
    171 	result.data   = a->beg;
    172 	result.cap    = a->end - a->beg;
    173 	a->beg = a->end;
    174 	return result;
    175 }
    176 
    177 static Stream
    178 stream_alloc(Arena *a, iz cap)
    179 {
    180 	Stream result = {.cap = cap};
    181 	result.data = alloc(a, u8, cap);
    182 	return result;
    183 }
    184 
    185 static s8
    186 stream_to_s8(Stream *s)
    187 {
    188 	s8 result = {.len = s->widx, .data = s->data};
    189 	return result;
    190 }
    191 
    192 static void
    193 stream_reset(Stream *s, iz index)
    194 {
    195 	s->errors = s->cap <= index;
    196 	if (!s->errors)
    197 		s->widx = index;
    198 }
    199 
    200 static void
    201 stream_commit(Stream *s, iz count)
    202 {
    203 	s->errors |= !BETWEEN(s->widx + count, 0, s->cap);
    204 	if (!s->errors)
    205 		s->widx += count;
    206 }
    207 
    208 static void
    209 stream_append(Stream *s, void *data, iz count)
    210 {
    211 	s->errors |= (s->cap - s->widx) < count;
    212 	if (!s->errors) {
    213 		mem_copy(s->data + s->widx, data, count);
    214 		s->widx += count;
    215 	}
    216 }
    217 
    218 static void
    219 stream_append_byte(Stream *s, u8 b)
    220 {
    221 	stream_append(s, &b, 1);
    222 }
    223 
    224 static void
    225 stream_append_s8(Stream *s, s8 str)
    226 {
    227 	stream_append(s, str.data, str.len);
    228 }
    229 
    230 static void
    231 stream_append_s8_array(Stream *s, s8 *strs, iz count)
    232 {
    233 	for (iz i = 0; i < count; i++)
    234 		stream_append(s, strs[i].data, strs[i].len);
    235 }
    236 
    237 static void
    238 stream_append_u64(Stream *s, u64 n)
    239 {
    240 	u8 tmp[64];
    241 	u8 *end = tmp + sizeof(tmp);
    242 	u8 *beg = end;
    243 	do { *--beg = '0' + (n % 10); } while (n /= 10);
    244 	stream_append(s, beg, end - beg);
    245 }
    246 
    247 static void
    248 stream_append_hex_u64(Stream *s, u64 n)
    249 {
    250 	if (!s->errors) {
    251 		static u8 hex[16] = {"0123456789abcdef"};
    252 		u8  buf[16];
    253 		u8 *end = buf + sizeof(buf);
    254 		u8 *beg = end;
    255 		while (n) {
    256 			*--beg = hex[n & 0x0F];
    257 			n >>= 4;
    258 		}
    259 		while (end - beg < 2)
    260 			*--beg = '0';
    261 		stream_append(s, beg, end - beg);
    262 	}
    263 }
    264 
    265 static void
    266 stream_append_i64(Stream *s, i64 n)
    267 {
    268 	if (n < 0) {
    269 		stream_append_byte(s, '-');
    270 		n *= -1;
    271 	}
    272 	stream_append_u64(s, n);
    273 }
    274 
/* Appends f in fixed-point decimal. prec selects the fractional scale:
 * prec = 10^k prints k fractional digits (e.g. prec = 100 -> 2 digits).
 * NOTE(review): assumes prec is a positive power of 10 -- confirm callers. */
static void
stream_append_f64(Stream *s, f64 f, i64 prec)
{
	if (f < 0) {
		stream_append_byte(s, '-');
		f *= -1;
	}

	/* NOTE: round last digit */
	f += 0.5f / prec;

	/* values too large for the u64 integral part print as "inf" */
	if (f >= (f64)(-1UL >> 1)) {
		stream_append_s8(s, s8("inf"));
	} else {
		u64 integral = f;
		u64 fraction = (f - integral) * prec;
		stream_append_u64(s, integral);
		stream_append_byte(s, '.');
		/* zero-pad the fraction out to its full digit count */
		for (i64 i = prec / 10; i > 1; i /= 10) {
			if (i > fraction)
				stream_append_byte(s, '0');
		}
		stream_append_u64(s, fraction);
	}
}
    300 
/* Appends f in scientific-style notation: a mantissa with 2 fractional
 * digits, then 'e', an explicit sign, and a zero-padded decimal exponent
 * (e.g. "1.50e+03"). NOTE(review): f is assumed to be non-negative --
 * a negative value would never terminate the normalization loops below;
 * confirm against callers. */
static void
stream_append_f64_e(Stream *s, f64 f)
{
	/* TODO: there should be a better way of doing this */
	#if 0
	/* NOTE: we ignore subnormal numbers for now */
	union { f64 f; u64 u; } u = {.f = f};
	i32 exponent = ((u.u >> 52) & 0x7ff) - 1023;
	f32 log_10_of_2 = 0.301f;
	i32 scale       = (exponent * log_10_of_2);
	/* NOTE: normalize f */
	for (i32 i = ABS(scale); i > 0; i--)
		f *= (scale > 0)? 0.1f : 10.0f;
	#else
	/* normalize the mantissa to roughly [1, 10) by repeated scaling,
	 * tracking the base-10 exponent in scale */
	i32 scale = 0;
	if (f != 0) {
		while (f > 1) {
			f *= 0.1f;
			scale++;
		}
		while (f < 1) {
			f *= 10.0f;
			scale--;
		}
	}
	#endif

	i32 prec = 100;
	stream_append_f64(s, f, prec);
	stream_append_byte(s, 'e');
	stream_append_byte(s, scale >= 0? '+' : '-');
	/* zero-pad the exponent to the same digit count as the fraction */
	for (i32 i = prec / 10; i > 1; i /= 10)
		stream_append_byte(s, '0');
	stream_append_u64(s, ABS(scale));
}
    336 
/* Appends v formatted as "{x, y}" with 2 fractional digits per component. */
static void
stream_append_v2(Stream *s, v2 v)
{
	stream_append_byte(s, '{');
	stream_append_f64(s, v.x, 100);
	stream_append_s8(s, s8(", "));
	stream_append_f64(s, v.y, 100);
	stream_append_byte(s, '}');
}
    346 
    347 /* NOTE(rnp): FNV-1a hash */
    348 static u64
    349 s8_hash(s8 v)
    350 {
    351 	u64 h = 0x3243f6a8885a308d; /* digits of pi */
    352 	for (; v.len; v.len--) {
    353 		h ^= v.data[v.len - 1] & 0xFF;
    354 		h *= 1111111111111111111; /* random prime */
    355 	}
    356 	return h;
    357 }
    358 
    359 static s8
    360 c_str_to_s8(char *cstr)
    361 {
    362 	s8 result = {.data = (u8 *)cstr};
    363 	while (*cstr) { result.len++; cstr++; }
    364 	return result;
    365 }
    366 
    367 /* NOTE(rnp): returns < 0 if byte is not found */
    368 static iz
    369 s8_scan_backwards(s8 s, u8 byte)
    370 {
    371 	iz result = s.len;
    372 	while (result && s.data[result - 1] != byte) result--;
    373 	result--;
    374 	return result;
    375 }
    376 
    377 static s8
    378 s8_cut_head(s8 s, iz cut)
    379 {
    380 	s8 result = s;
    381 	if (cut > 0) {
    382 		result.data += cut;
    383 		result.len  -= cut;
    384 	}
    385 	return result;
    386 }
    387 
    388 static s8
    389 s8_alloc(Arena *a, iz len)
    390 {
    391 	return (s8){ .data = alloc(a, u8, len), .len = len };
    392 }
    393 
    394 static s8
    395 s16_to_s8(Arena *a, s16 in)
    396 {
    397 	s8 result = {0};
    398 	if (in.len) {
    399 		iz commit = in.len * 4;
    400 		iz length = 0;
    401 		u8 *data = arena_commit(a, commit + 1);
    402 		u16 *beg = in.data;
    403 		u16 *end = in.data + in.len;
    404 		while (beg < end) {
    405 			UnicodeDecode decode = utf16_decode(beg, end - beg);
    406 			length += utf8_encode(data + length, decode.cp);
    407 			beg    += decode.consumed;
    408 		}
    409 		data[length] = 0;
    410 		result = (s8){.len = length, .data = data};
    411 		arena_pop(a, commit - length);
    412 	}
    413 	return result;
    414 }
    415 
    416 static s16
    417 s8_to_s16(Arena *a, s8 in)
    418 {
    419 	s16 result = {0};
    420 	if (in.len) {
    421 		iz required = 2 * in.len + 1;
    422 		u16 *data   = alloc(a, u16, required);
    423 		iz length   = 0;
    424 		/* TODO(rnp): utf8_decode */
    425 		for (iz i = 0; i < in.len; i++) {
    426 			u32 cp  = in.data[i];
    427 			length += utf16_encode(data + length, cp);
    428 		}
    429 		result = (s16){.len = length, .data = data};
    430 		arena_pop(a, required - length);
    431 	}
    432 	return result;
    433 }
    434 
    435 static s8
    436 push_s8(Arena *a, s8 str)
    437 {
    438 	s8 result = s8_alloc(a, str.len);
    439 	mem_copy(result.data, str.data, result.len);
    440 	return result;
    441 }
    442 
    443 static s8
    444 push_s8_zero(Arena *a, s8 str)
    445 {
    446 	s8 result   = s8_alloc(a, str.len + 1);
    447 	result.len -= 1;
    448 	mem_copy(result.data, str.data, result.len);
    449 	return result;
    450 }
    451 
/* Returns the largest power of two <= a.
 * NOTE(review): behaviour for a == 0 depends on clz_u32(0); many clz
 * implementations are undefined for zero input -- confirm callers
 * never pass 0. */
static u32
round_down_power_of_2(u32 a)
{
	u32 result = 0x80000000UL >> clz_u32(a);
	return result;
}
    458 
    459 static b32
    460 uv2_equal(uv2 a, uv2 b)
    461 {
    462 	return a.x == b.x && a.y == b.y;
    463 }
    464 
    465 static b32
    466 uv3_equal(uv3 a, uv3 b)
    467 {
    468 	return a.x == b.x && a.y == b.y && a.z == b.z;
    469 }
    470 
    471 static v3
    472 cross(v3 a, v3 b)
    473 {
    474 	v3 result = {
    475 		.x = a.y * b.z - a.z * b.y,
    476 		.y = a.z * b.x - a.x * b.z,
    477 		.z = a.x * b.y - a.y * b.x,
    478 	};
    479 	return result;
    480 }
    481 
    482 static v3
    483 sub_v3(v3 a, v3 b)
    484 {
    485 	v3 result = {
    486 		.x = a.x - b.x,
    487 		.y = a.y - b.y,
    488 		.z = a.z - b.z,
    489 	};
    490 	return result;
    491 }
    492 
    493 static f32
    494 length_v3(v3 a)
    495 {
    496 	f32 result = a.x * a.x + a.y * a.y + a.z * a.z;
    497 	return result;
    498 }
    499 
    500 static v3
    501 normalize_v3(v3 a)
    502 {
    503 	f32 length = length_v3(a);
    504 	v3 result = {.x = a.x / length, .y = a.y / length, .z = a.z / length};
    505 	return result;
    506 }
    507 
    508 static v2
    509 clamp_v2_rect(v2 v, Rect r)
    510 {
    511 	v2 result = v;
    512 	result.x = CLAMP(v.x, r.pos.x, r.pos.x + r.size.x);
    513 	result.y = CLAMP(v.y, r.pos.y, r.pos.y + r.size.y);
    514 	return result;
    515 }
    516 
    517 static v2
    518 add_v2(v2 a, v2 b)
    519 {
    520 	v2 result = {
    521 		.x = a.x + b.x,
    522 		.y = a.y + b.y,
    523 	};
    524 	return result;
    525 }
    526 
    527 static v2
    528 sub_v2(v2 a, v2 b)
    529 {
    530 	v2 result = {
    531 		.x = a.x - b.x,
    532 		.y = a.y - b.y,
    533 	};
    534 	return result;
    535 }
    536 
    537 static v2
    538 scale_v2(v2 a, f32 scale)
    539 {
    540 	v2 result = {
    541 		.x = a.x * scale,
    542 		.y = a.y * scale,
    543 	};
    544 	return result;
    545 }
    546 
    547 static v2
    548 mul_v2(v2 a, v2 b)
    549 {
    550 	v2 result = {
    551 		.x = a.x * b.x,
    552 		.y = a.y * b.y,
    553 	};
    554 	return result;
    555 }
    556 
    557 static v2
    558 floor_v2(v2 a)
    559 {
    560 	v2 result;
    561 	result.x = (u32)a.x;
    562 	result.y = (u32)a.y;
    563 	return result;
    564 }
    565 
    566 static f32
    567 magnitude_v2(v2 a)
    568 {
    569 	f32 result = sqrt_f32(a.x * a.x + a.y * a.y);
    570 	return result;
    571 }
    572 
    573 static b32
    574 uv4_equal(uv4 a, uv4 b)
    575 {
    576 	return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w;
    577 }
    578 
    579 static v4
    580 sub_v4(v4 a, v4 b)
    581 {
    582 	v4 result;
    583 	result.x = a.x - b.x;
    584 	result.y = a.y - b.y;
    585 	result.z = a.z - b.z;
    586 	result.w = a.w - b.w;
    587 	return result;
    588 }
    589 
    590 static void
    591 split_rect_horizontal(Rect rect, f32 fraction, Rect *left, Rect *right)
    592 {
    593 	if (left) {
    594 		left->pos    = rect.pos;
    595 		left->size.h = rect.size.h;
    596 		left->size.w = rect.size.w * fraction;
    597 	}
    598 	if (right) {
    599 		right->pos    = rect.pos;
    600 		right->pos.x += rect.size.w * fraction;
    601 		right->size.h = rect.size.h;
    602 		right->size.w = rect.size.w * (1.0f - fraction);
    603 	}
    604 }
    605 
    606 static void
    607 split_rect_vertical(Rect rect, f32 fraction, Rect *top, Rect *bot)
    608 {
    609 	if (top) {
    610 		top->pos    = rect.pos;
    611 		top->size.w = rect.size.w;
    612 		top->size.h = rect.size.h * fraction;
    613 	}
    614 	if (bot) {
    615 		bot->pos    = rect.pos;
    616 		bot->pos.y += rect.size.h * fraction;
    617 		bot->size.w = rect.size.w;
    618 		bot->size.h = rect.size.h * (1.0f - fraction);
    619 	}
    620 }
    621 
    622 static void
    623 cut_rect_horizontal(Rect rect, f32 at, Rect *left, Rect *right)
    624 {
    625 	at = MIN(at, rect.size.w);
    626 	if (left) {
    627 		*left = rect;
    628 		left->size.w = at;
    629 	}
    630 	if (right) {
    631 		*right = rect;
    632 		right->pos.x  += at;
    633 		right->size.w -= at;
    634 	}
    635 }
    636 
    637 static void
    638 cut_rect_vertical(Rect rect, f32 at, Rect *top, Rect *bot)
    639 {
    640 	at = MIN(at, rect.size.h);
    641 	if (top) {
    642 		*top = rect;
    643 		top->size.h = at;
    644 	}
    645 	if (bot) {
    646 		*bot = rect;
    647 		bot->pos.y  += at;
    648 		bot->size.h -= at;
    649 	}
    650 }
    651 
    652 static f64
    653 parse_f64(s8 s)
    654 {
    655 	f64 integral = 0, fractional = 0, sign = 1;
    656 
    657 	if (s.len && *s.data == '-') {
    658 		sign = -1;
    659 		s.data++;
    660 		s.len--;
    661 	}
    662 
    663 	while (s.len && *s.data != '.') {
    664 		integral *= 10;
    665 		integral += *s.data - '0';
    666 		s.data++;
    667 		s.len--;
    668 	}
    669 
    670 	if (*s.data == '.') { s.data++; s.len--; }
    671 
    672 	while (s.len) {
    673 		ASSERT(s.data[s.len - 1] != '.');
    674 		fractional /= 10;
    675 		fractional += (f64)(s.data[--s.len] - '0') / 10.0;
    676 	}
    677 	f64 result = sign * (integral + fractional);
    678 	return result;
    679 }
    680 
    681 static FileWatchDirectory *
    682 lookup_file_watch_directory(FileWatchContext *ctx, u64 hash)
    683 {
    684 	FileWatchDirectory *result = 0;
    685 
    686 	for (u32 i = 0; i < ctx->directory_watch_count; i++) {
    687 		FileWatchDirectory *test = ctx->directory_watches + i;
    688 		if (test->hash == hash) {
    689 			result = test;
    690 			break;
    691 		}
    692 	}
    693 
    694 	return result;
    695 }
    696 
    697 static void
    698 insert_file_watch(FileWatchDirectory *dir, s8 name, iptr user_data, file_watch_callback *callback)
    699 {
    700 	ASSERT(dir->file_watch_count < ARRAY_COUNT(dir->file_watches));
    701 	FileWatch *fw = dir->file_watches + dir->file_watch_count++;
    702 	fw->hash      = s8_hash(name);
    703 	fw->user_data = user_data;
    704 	fw->callback  = callback;
    705 }
    706 
/* Writes scale * b into out, where consecutive rows of out are out_stride
 * elements apart. The inner loop processes 4 elements per SIMD step, so
 * b_dim.x must be a multiple of 4 (the Hadamard caller asserts this).
 * NOTE(review): the i32 values round-trip through f32 for the multiply --
 * exact only while magnitudes stay within f32's 24-bit integer range. */
static void
fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim)
{
	f32x4 vscale = dup_f32x4(scale);
	for (u32 i = 0; i < b_dim.y; i++) {
		for (u32 j = 0; j < b_dim.x; j += 4, b += 4) {
			f32x4 vb = cvt_i32x4_f32x4(load_i32x4(b));
			store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vb)), out + j);
		}
		out += out_stride;
	}
}
    719 
/* NOTE: this won't check for valid space/etc and assumes row major order */
/* Computes out = a (x) b (Kronecker product) for i32 matrices.
 * NOTE(review): rows of out are advanced by out_dim.y, which only equals
 * the row length when the result is square (true for the Hadamard usage
 * here) -- confirm before using with non-square operands. */
static void
kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim)
{
	uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y};
	/* fill_kronecker_sub_matrix consumes 4 elements per SIMD step */
	ASSERT(out_dim.y % 4 == 0);
	for (u32 i = 0; i < a_dim.y; i++) {
		i32 *vout = out;
		for (u32 j = 0; j < a_dim.x; j++, a++) {
			/* each element of a scales one b-sized sub-block of out */
			fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
			vout += b_dim.y;
		}
		out += out_dim.y * b_dim.x;
	}
}
    735 
/* NOTE/TODO: to support even more hadamard sizes use the Paley construction */
/* Fills out with a dim x dim Hadamard matrix. Supported dims: powers of two
 * (Sylvester construction built directly in out) and 12 * 2^k (Sylvester
 * factor built in tmp, then Kronecker-multiplied with the 12x12 base
 * matrix). Any other dim leaves out untouched. tmp needs (dim/12)^2
 * entries when dim is a multiple of 12; it is unused otherwise.
 * NOTE(review): a multiple of 12 whose quotient is NOT a power of two
 * (e.g. 36) would make the doubling loop write out of bounds -- confirm
 * callers only pass 2^n or 12 * 2^n. */
static void
fill_hadamard_transpose(i32 *out, i32 *tmp, u32 dim)
{
	ASSERT(dim);
	b32 power_of_2     = ISPOWEROF2(dim);
	b32 multiple_of_12 = dim % 12 == 0;

	if (!power_of_2 && !multiple_of_12)
		return;

	if (!power_of_2) {
		ASSERT(multiple_of_12);
		/* build only the power-of-two factor; the 12x12 part comes from
		 * the Kronecker product below */
		dim /= 12;
	}

	i32 *m;
	if (power_of_2) m = out;
	else            m = tmp;

	/* Sylvester doubling: H_{2k} = [[H_k, H_k], [H_k, -H_k]] */
	#define IND(i, j) ((i) * dim + (j))
	m[0] = 1;
	for (u32 k = 1; k < dim; k *= 2) {
		for (u32 i = 0; i < k; i++) {
			for (u32 j = 0; j < k; j++) {
				i32 val = m[IND(i, j)];
				m[IND(i + k, j)]     =  val;
				m[IND(i, j + k)]     =  val;
				m[IND(i + k, j + k)] = -val;
			}
		}
	}
	#undef IND

	if (!power_of_2)
		kronecker_product(out, tmp, (uv2){.x = dim, .y = dim}, hadamard_12_12_transpose,
		                  (uv2){.x = 12, .y = 12});
}