ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

util.c (8619B)


/* See LICENSE for license details. */
static i32 hadamard_12_12_transpose[] = {
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1,
	1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,
	1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,
	1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,
	1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,
	1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,
	1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1,
	1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,
	1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1,
	1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,
	1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1,
};

/* NOTE: fills len bytes starting at p_ with the byte c */
static void *
mem_clear(void *p_, u8 c, size len)
{
	u8 *p = p_;
	while (len) p[--len] = c;
	return p;
}

/* NOTE: src and dest must not overlap */
static void
mem_copy(void *restrict src, void *restrict dest, size n)
{
	ASSERT(n >= 0);
	u8 *s = src, *d = dest;
	for (; n; n--) *d++ = *s++;
}

/* NOTE: copies backwards when dest > src so overlapping ranges are handled */
static void
mem_move(u8 *src, u8 *dest, size n)
{
	if (dest < src) mem_copy(src, dest, n);
	else            while (n) { n--; dest[n] = src[n]; }
}

#define alloc(a, t, n)  (t *)alloc_(a, sizeof(t), _Alignof(t), n)
static void *
alloc_(Arena *a, size len, size align, size count)
{
	/* NOTE: special case 0 arena */
	if (a->beg == 0)
		return 0;

	size padding   = -(uintptr_t)a->beg & (align - 1);
	size available = a->end - a->beg - padding;
	if (available < 0 || count > available / len)
		ASSERT(0 && "arena OOM\n");
	void *p = a->beg + padding;
	a->beg += padding + count * len;
	/* TODO: Performance? */
	return mem_clear(p, 0, count * len);
}

static Arena
sub_arena(Arena *a, size size)
{
	Arena result = {0};
	if ((a->end - a->beg) >= size) {
		result.beg  = a->beg;
		result.end  = a->beg + size;
		a->beg     += size;
	}
	return result;
}

static TempArena
begin_temp_arena(Arena *a)
{
	TempArena result = {.arena = a, .old_beg = a->beg};
	return result;
}

static void
end_temp_arena(TempArena ta)
{
	Arena *a = ta.arena;
	if (a) {
		ASSERT(a->beg >= ta.old_beg);
		a->beg = ta.old_beg;
	}
}

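/* Example (sketch, not part of util.c): typical temp arena usage. Assumes an
 * Arena that was initialized elsewhere and the f32 typedef used throughout
 * this codebase; the names here are illustrative only. */
#if 0
static void
scratch_example(Arena *arena)
{
	TempArena scratch = begin_temp_arena(arena);
	f32 *samples = alloc(arena, f32, 128); /* zero initialized by alloc_() */
	/* ... fill and use samples ... */
	end_temp_arena(scratch);               /* releases everything allocated since begin_temp_arena() */
}
#endif
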
/* NOTE: the stream takes ownership of all remaining space in the arena */
static Stream
arena_stream(Arena *a)
{
	Stream result = {0};
	result.data   = a->beg;
	result.cap    = a->end - a->beg;
	a->beg = a->end;
	return result;
}

static Stream
stream_alloc(Arena *a, size cap)
{
	Stream result = {.cap = cap};
	result.data = alloc(a, u8, cap);
	return result;
}

static s8
stream_to_s8(Stream *s)
{
	s8 result = {.len = s->widx, .data = s->data};
	return result;
}

static void
stream_append_byte(Stream *s, u8 b)
{
	s->errors |= s->widx + 1 > s->cap;
	if (!s->errors)
		s->data[s->widx++] = b;
}

static void
stream_append_s8(Stream *s, s8 str)
{
	s->errors |= (s->cap - s->widx) < str.len;
	if (!s->errors) {
		mem_copy(str.data, s->data + s->widx, str.len);
		s->widx += str.len;
	}
}

static void
stream_append_s8_array(Stream *s, s8 *strs, size count)
{
	while (count > 0) {
		stream_append_s8(s, *strs);
		strs++;
		count--;
	}
}

static void
stream_append_u64(Stream *s, u64 n)
{
	u8 tmp[64];
	u8 *end = tmp + sizeof(tmp);
	u8 *beg = end;
	do { *--beg = '0' + (n % 10); } while (n /= 10);
	stream_append_s8(s, (s8){.len = end - beg, .data = beg});
}

static void
stream_append_i64(Stream *s, i64 n)
{
	if (n < 0) {
		stream_append_byte(s, '-');
		n *= -1;
	}
	stream_append_u64(s, n);
}

static void
stream_append_f64(Stream *s, f64 f, i64 prec)
{
	if (f < 0) {
		stream_append_byte(s, '-');
		f *= -1;
	}

	/* NOTE: round last digit */
	f += 0.5f / prec;

	if (f >= (f64)(-1UL >> 1)) {
		stream_append_s8(s, s8("inf"));
	} else {
		u64 integral = f;
		u64 fraction = (f - integral) * prec;
		stream_append_u64(s, integral);
		stream_append_byte(s, '.');
		for (i64 i = prec / 10; i > 1; i /= 10) {
			if (i > fraction)
				stream_append_byte(s, '0');
		}
		stream_append_u64(s, fraction);
	}
}

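/* Example: stream_append_f64(s, 3.14159, 100) appends "3.14"; prec selects the
 * number of fractional digits as a power of 10 (100 -> 2 digits). */
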
static void
stream_append_f64_e(Stream *s, f64 f)
{
	/* TODO: there should be a better way of doing this */
	#if 0
	/* NOTE: we ignore subnormal numbers for now */
	union { f64 f; u64 u; } u = {.f = f};
	i32 exponent = ((u.u >> 52) & 0x7ff) - 1023;
	f32 log_10_of_2 = 0.301f;
	i32 scale       = (exponent * log_10_of_2);
	/* NOTE: normalize f */
	for (i32 i = ABS(scale); i > 0; i--)
		f *= (scale > 0)? 0.1f : 10.0f;
	#else
	i32 scale = 0;
	if (f != 0) {
		while (f > 1) {
			f *= 0.1f;
			scale++;
		}
		while (f < 1) {
			f *= 10.0f;
			scale--;
		}
	}
	#endif

	i32 prec = 100;
	stream_append_f64(s, f, prec);
	stream_append_byte(s, 'e');
	stream_append_byte(s, scale >= 0? '+' : '-');
	for (i32 i = prec / 10; i > 1; i /= 10)
		stream_append_byte(s, '0');
	stream_append_u64(s, ABS(scale));
}

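/* Example (sketch, not part of util.c): composing a formatted label with the
 * Stream helpers above; the function and variable names are illustrative only. */
#if 0
static s8
format_depth_mm(Arena *arena, f64 depth_m)
{
	Stream s = stream_alloc(arena, 64);
	stream_append_s8(&s, s8("depth: "));
	stream_append_f64(&s, depth_m * 1e3, 100); /* prec = 100 -> two fractional digits */
	stream_append_s8(&s, s8(" [mm]"));
	return stream_to_s8(&s);
}
#endif
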
static void
stream_append_variable(Stream *s, Variable *var)
{
	switch (var->type) {
	case VT_F32: {
		f32 *f32_val = var->store;
		stream_append_f64(s, *f32_val * var->display_scale, 100);
	} break;
	case VT_I32: {
		i32 *i32_val = var->store;
		stream_append_i64(s, *i32_val * var->display_scale);
	} break;
	default: INVALID_CODE_PATH;
	}
}

static s8
cstr_to_s8(char *cstr)
{
	s8 result = {.data = (u8 *)cstr};
	while (*cstr) { result.len++; cstr++; }
	return result;
}

static s8
s8_cut_head(s8 s, size cut)
{
	s8 result = s;
	if (cut > 0) {
		result.data += cut;
		result.len -= cut;
	}
	return result;
}

static s8
s8alloc(Arena *a, size len)
{
	return (s8){ .data = alloc(a, u8, len), .len = len };
}

static s8
push_s8(Arena *a, s8 str)
{
	s8 result = s8alloc(a, str.len);
	mem_copy(str.data, result.data, result.len);
	return result;
}

static b32
uv4_equal(uv4 a, uv4 b)
{
	return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w;
}

static u32
round_down_power_of_2(u32 a)
{
	u32 result = 0x80000000UL >> clz_u32(a);
	return result;
}

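/* Example: round_down_power_of_2(48) == 32, i.e. 0x80000000 >> clz_u32(48)
 * with clz_u32(48) == 26. */
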
static v3
cross(v3 a, v3 b)
{
	v3 result = {
		.x = a.y * b.z - a.z * b.y,
		.y = a.z * b.x - a.x * b.z,
		.z = a.x * b.y - a.y * b.x,
	};
	return result;
}

static v3
sub_v3(v3 a, v3 b)
{
	v3 result = {
		.x = a.x - b.x,
		.y = a.y - b.y,
		.z = a.z - b.z,
	};
	return result;
}

static f32
length_v3(v3 a)
{
	f32 result = sqrt_f32(a.x * a.x + a.y * a.y + a.z * a.z);
	return result;
}

static v3
normalize_v3(v3 a)
{
	f32 length = length_v3(a);
	v3 result = {.x = a.x / length, .y = a.y / length, .z = a.z / length};
	return result;
}

static v2
sub_v2(v2 a, v2 b)
{
	v2 result = {
		.x = a.x - b.x,
		.y = a.y - b.y,
	};
	return result;
}

static v2
mul_v2(v2 a, v2 b)
{
	v2 result = {
		.x = a.x * b.x,
		.y = a.y * b.y,
	};
	return result;
}

static f32
magnitude_v2(v2 a)
{
	f32 result = sqrt_f32(a.x * a.x + a.y * a.y);
	return result;
}

static f64
parse_f64(s8 s)
{
	f64 integral = 0, fractional = 0, sign = 1;

	if (s.len && *s.data == '-') {
		sign = -1;
		s.data++;
		s.len--;
	}

	while (s.len && *s.data != '.') {
		integral *= 10;
		integral += *s.data - '0';
		s.data++;
		s.len--;
	}

	if (s.len && *s.data == '.') { s.data++; s.len--; }

	while (s.len) {
		ASSERT(s.data[s.len - 1] != '.');
		fractional /= 10;
		fractional += (f64)(s.data[--s.len] - '0') / 10.0;
	}
	f64 result = sign * (integral + fractional);
	return result;
}

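/* Example: parse_f64(s8("-1.5")) returns -1.5; the parser assumes a well formed
 * decimal number (digits with an optional leading '-' and a single '.'). */
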
static void
fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim)
{
	f32x4 vscale = dup_f32x4(scale);
	for (u32 i = 0; i < b_dim.y; i++) {
		for (u32 j = 0; j < b_dim.x; j += 4, b += 4) {
			f32x4 vb = cvt_i32x4_f32x4(load_i32x4(b));
			store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vb)), out + j);
		}
		out += out_stride;
	}
}

/* NOTE: this won't check for valid space/etc and assumes row major order */
static void
kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim)
{
	uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y};
	ASSERT(out_dim.y % 4 == 0);
	for (u32 i = 0; i < a_dim.y; i++) {
		i32 *vout = out;
		for (u32 j = 0; j < a_dim.x; j++, a++) {
			fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
			vout += b_dim.y;
		}
		out += out_dim.y * b_dim.x;
	}
}

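/* Example: the Kronecker product of an (m x n) matrix with a (p x q) matrix is
 * an (m*p x n*q) matrix; fill_hadamard_transpose() below uses this to combine a
 * small power-of-2 Hadamard with the 12x12 matrix above, e.g. a 2x2 factor with
 * the 12x12 matrix yields the 24x24 Hadamard. */
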
/* NOTE/TODO: to support even more hadamard sizes use the Paley construction */
static void
fill_hadamard_transpose(i32 *out, i32 *tmp, u32 dim)
{
	ASSERT(dim);
	b32 power_of_2     = ISPOWEROF2(dim);
	b32 multiple_of_12 = dim % 12 == 0;

	if (!power_of_2 && !multiple_of_12)
		return;

	if (!power_of_2) {
		ASSERT(multiple_of_12);
		dim /= 12;
	}

	i32 *m;
	if (power_of_2) m = out;
	else            m = tmp;

	#define IND(i, j) ((i) * dim + (j))
	m[0] = 1;
	for (u32 k = 1; k < dim; k *= 2) {
		for (u32 i = 0; i < k; i++) {
			for (u32 j = 0; j < k; j++) {
				i32 val = m[IND(i, j)];
				m[IND(i + k, j)]     =  val;
				m[IND(i, j + k)]     =  val;
				m[IND(i + k, j + k)] = -val;
			}
		}
	}
	#undef IND

	if (!power_of_2)
		kronecker_product(out, tmp, (uv2){.x = dim, .y = dim}, hadamard_12_12_transpose,
		                  (uv2){.x = 12, .y = 12});
}
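
/* Example (sketch, not part of util.c): generating a 48x48 transposed Hadamard
 * matrix; the scratch buffer only needs to hold the 4x4 power-of-2 factor. */
#if 0
static i32 *
make_hadamard_48(Arena *arena)
{
	i32 *out = alloc(arena, i32, 48 * 48);
	i32 *tmp = alloc(arena, i32, 4 * 4); /* 48 / 12 == 4 */
	fill_hadamard_transpose(out, tmp, 48);
	return out;
}
#endif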