util.c (8619B)
/* See LICENSE for license details. */

/* NOTE: 12x12 Hadamard matrix (stored transposed); used by fill_hadamard_transpose()
 * for non power of two sizes */
static i32 hadamard_12_12_transpose[] = {
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1,
	 1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,
	 1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,
	 1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,  1,
	 1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1,
	 1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1, -1,
	 1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1, -1,
	 1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,
	 1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,  1,
	 1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1,
	 1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1,
};

static void *
mem_clear(void *p_, u8 c, size len)
{
	u8 *p = p_;
	while (len) p[--len] = c;
	return p;
}

/* NOTE: argument order is (src, dest), the reverse of libc memcpy */
static void
mem_copy(void *restrict src, void *restrict dest, size n)
{
	ASSERT(n >= 0);
	u8 *s = src, *d = dest;
	for (; n; n--) *d++ = *s++;
}

static void
mem_move(u8 *src, u8 *dest, size n)
{
	if (dest < src) mem_copy(src, dest, n);
	else            while (n) { n--; dest[n] = src[n]; }
}

#define alloc(a, t, n) (t *)alloc_(a, sizeof(t), _Alignof(t), n)
static void *
alloc_(Arena *a, size len, size align, size count)
{
	/* NOTE: special case 0 arena */
	if (a->beg == 0)
		return 0;

	size padding   = -(uintptr_t)a->beg & (align - 1);
	size available = a->end - a->beg - padding;
	if (available < 0 || count > available / len)
		ASSERT(0 && "arena OOM\n");
	void *p  = a->beg + padding;
	a->beg  += padding + count * len;
	/* TODO: Performance? */
	return mem_clear(p, 0, count * len);
}

static Arena
sub_arena(Arena *a, size size)
{
	Arena result = {0};
	if ((a->end - a->beg) >= size) {
		result.beg  = a->beg;
		result.end  = a->beg + size;
		a->beg     += size;
	}
	return result;
}

static TempArena
begin_temp_arena(Arena *a)
{
	TempArena result = {.arena = a, .old_beg = a->beg};
	return result;
}

static void
end_temp_arena(TempArena ta)
{
	Arena *a = ta.arena;
	if (a) {
		ASSERT(a->beg >= ta.old_beg);
		a->beg = ta.old_beg;
	}
}

static Stream
arena_stream(Arena *a)
{
	Stream result = {0};
	result.data   = a->beg;
	result.cap    = a->end - a->beg;
	a->beg        = a->end;
	return result;
}

static Stream
stream_alloc(Arena *a, size cap)
{
	Stream result = {.cap = cap};
	result.data   = alloc(a, u8, cap);
	return result;
}

static s8
stream_to_s8(Stream *s)
{
	s8 result = {.len = s->widx, .data = s->data};
	return result;
}

/* NOTE: on overflow the errors flag latches and further appends are dropped */
static void
stream_append_byte(Stream *s, u8 b)
{
	s->errors |= s->widx + 1 > s->cap;
	if (!s->errors)
		s->data[s->widx++] = b;
}

static void
stream_append_s8(Stream *s, s8 str)
{
	s->errors |= (s->cap - s->widx) < str.len;
	if (!s->errors) {
		mem_copy(str.data, s->data + s->widx, str.len);
		s->widx += str.len;
	}
}

static void
stream_append_s8_array(Stream *s, s8 *strs, size count)
{
	while (count > 0) {
		stream_append_s8(s, *strs);
		strs++;
		count--;
	}
}

static void
stream_append_u64(Stream *s, u64 n)
{
	u8 tmp[64];
	u8 *end = tmp + sizeof(tmp);
	u8 *beg = end;
	do { *--beg = '0' + (n % 10); } while (n /= 10);
	stream_append_s8(s, (s8){.len = end - beg, .data = beg});
}

static void
stream_append_i64(Stream *s, i64 n)
{
	if (n < 0) {
		stream_append_byte(s, '-');
		n *= -1;
	}
	stream_append_u64(s, n);
}
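
/* NOTE: illustrative sketch only (hypothetical helper, kept out of the build):
 * one way the arena and stream helpers above fit together. It assumes the
 * Arena/Stream structs declared elsewhere in the project expose exactly the
 * fields used above (beg/end and data/widx/cap/errors); real callers likely get
 * their arena memory from the platform layer rather than a caller-provided arena. */
#if 0
static s8
example_label(Arena *a, s8 prefix, i64 index)
{
	/* carve a small stream out of the arena and build "<prefix>-<index>" */
	Stream s = stream_alloc(a, 64);
	stream_append_s8(&s, prefix);
	stream_append_byte(&s, '-');
	stream_append_i64(&s, index);
	/* if the stream ran out of space, s.errors is set and later appends were dropped */
	return stream_to_s8(&s);
}
#endif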

static void
stream_append_f64(Stream *s, f64 f, i64 prec)
{
	if (f < 0) {
		stream_append_byte(s, '-');
		f *= -1;
	}

	/* NOTE: round last digit */
	f += 0.5f / prec;

	if (f >= (f64)(-1UL >> 1)) {
		stream_append_s8(s, s8("inf"));
	} else {
		u64 integral = f;
		u64 fraction = (f - integral) * prec;
		stream_append_u64(s, integral);
		stream_append_byte(s, '.');
		for (i64 i = prec / 10; i > 1; i /= 10) {
			if (i > fraction)
				stream_append_byte(s, '0');
		}
		stream_append_u64(s, fraction);
	}
}

static void
stream_append_f64_e(Stream *s, f64 f)
{
	/* TODO: there should be a better way of doing this */
#if 0
	/* NOTE: we ignore subnormal numbers for now */
	union { f64 f; u64 u; } u = {.f = f};
	i32 exponent = ((u.u >> 52) & 0x7ff) - 1023;
	f32 log_10_of_2 = 0.301f;
	i32 scale = (exponent * log_10_of_2);
	/* NOTE: normalize f */
	for (i32 i = ABS(scale); i > 0; i--)
		f *= (scale > 0)? 0.1f : 10.0f;
#else
	i32 scale = 0;
	if (f != 0) {
		while (f > 1) {
			f *= 0.1f;
			scale++;
		}
		while (f < 1) {
			f *= 10.0f;
			scale--;
		}
	}
#endif

	i32 prec = 100;
	stream_append_f64(s, f, prec);
	stream_append_byte(s, 'e');
	stream_append_byte(s, scale >= 0? '+' : '-');
	for (i32 i = prec / 10; i > 1; i /= 10)
		stream_append_byte(s, '0');
	stream_append_u64(s, ABS(scale));
}

static void
stream_append_variable(Stream *s, Variable *var)
{
	switch (var->type) {
	case VT_F32: {
		f32 *f32_val = var->store;
		stream_append_f64(s, *f32_val * var->display_scale, 100);
	} break;
	case VT_I32: {
		i32 *i32_val = var->store;
		stream_append_i64(s, *i32_val * var->display_scale);
	} break;
	default: INVALID_CODE_PATH;
	}
}

static s8
cstr_to_s8(char *cstr)
{
	s8 result = {.data = (u8 *)cstr};
	while (*cstr) { result.len++; cstr++; }
	return result;
}

static s8
s8_cut_head(s8 s, size cut)
{
	s8 result = s;
	if (cut > 0) {
		result.data += cut;
		result.len  -= cut;
	}
	return result;
}

static s8
s8alloc(Arena *a, size len)
{
	return (s8){ .data = alloc(a, u8, len), .len = len };
}

static s8
push_s8(Arena *a, s8 str)
{
	s8 result = s8alloc(a, str.len);
	mem_copy(str.data, result.data, result.len);
	return result;
}

static b32
uv4_equal(uv4 a, uv4 b)
{
	return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w;
}

static u32
round_down_power_of_2(u32 a)
{
	u32 result = 0x80000000UL >> clz_u32(a);
	return result;
}

static v3
cross(v3 a, v3 b)
{
	v3 result = {
		.x = a.y * b.z - a.z * b.y,
		.y = a.z * b.x - a.x * b.z,
		.z = a.x * b.y - a.y * b.x,
	};
	return result;
}

static v3
sub_v3(v3 a, v3 b)
{
	v3 result = {
		.x = a.x - b.x,
		.y = a.y - b.y,
		.z = a.z - b.z,
	};
	return result;
}

/* NOTE: returns the squared length */
static f32
length_v3(v3 a)
{
	f32 result = a.x * a.x + a.y * a.y + a.z * a.z;
	return result;
}

static v3
normalize_v3(v3 a)
{
	f32 length = sqrt_f32(length_v3(a));
	v3 result  = {.x = a.x / length, .y = a.y / length, .z = a.z / length};
	return result;
}

static v2
sub_v2(v2 a, v2 b)
{
	v2 result = {
		.x = a.x - b.x,
		.y = a.y - b.y,
	};
	return result;
}
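
/* NOTE: illustrative sketch only (hypothetical helper, kept out of the build):
 * the prec argument of stream_append_f64 is a power of ten selecting the number
 * of fractional digits (100 -> 2 digits, 1000 -> 3 digits) and the last digit is
 * rounded. Assuming the stream has space, the calls below append "3.14" and
 * "2.72e-03". */
#if 0
static void
example_append_floats(Stream *s)
{
	stream_append_f64(s, 3.14159, 100);     /* appends "3.14" */
	stream_append_byte(s, ' ');
	stream_append_f64_e(s, 0.0027182818);   /* appends "2.72e-03" */
}
#endif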
static v2
mul_v2(v2 a, v2 b)
{
	v2 result = {
		.x = a.x * b.x,
		.y = a.y * b.y,
	};
	return result;
}

static f32
magnitude_v2(v2 a)
{
	f32 result = sqrt_f32(a.x * a.x + a.y * a.y);
	return result;
}

/* NOTE: parses [-]digits[.digits]; no exponent or validation */
static f64
parse_f64(s8 s)
{
	f64 integral = 0, fractional = 0, sign = 1;

	if (s.len && *s.data == '-') {
		sign = -1;
		s.data++;
		s.len--;
	}

	while (s.len && *s.data != '.') {
		integral *= 10;
		integral += *s.data - '0';
		s.data++;
		s.len--;
	}

	if (s.len && *s.data == '.') { s.data++; s.len--; }

	while (s.len) {
		ASSERT(s.data[s.len - 1] != '.');
		fractional /= 10;
		fractional += (f64)(s.data[--s.len] - '0') / 10.0;
	}
	f64 result = sign * (integral + fractional);
	return result;
}

/* NOTE: writes scale * b into a b_dim sized block of out */
static void
fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim)
{
	f32x4 vscale = dup_f32x4(scale);
	for (u32 i = 0; i < b_dim.y; i++) {
		for (u32 j = 0; j < b_dim.x; j += 4, b += 4) {
			f32x4 vb = cvt_i32x4_f32x4(load_i32x4(b));
			store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vb)), out + j);
		}
		out += out_stride;
	}
}

/* NOTE: this won't check for valid space/etc and assumes row major order */
static void
kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim)
{
	uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y};
	ASSERT(out_dim.y % 4 == 0);
	for (u32 i = 0; i < a_dim.y; i++) {
		i32 *vout = out;
		for (u32 j = 0; j < a_dim.x; j++, a++) {
			fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
			vout += b_dim.y;
		}
		out += out_dim.y * b_dim.x;
	}
}

/* NOTE/TODO: to support even more hadamard sizes use the Paley construction */
static void
fill_hadamard_transpose(i32 *out, i32 *tmp, u32 dim)
{
	ASSERT(dim);
	b32 power_of_2     = ISPOWEROF2(dim);
	b32 multiple_of_12 = dim % 12 == 0;

	if (!power_of_2 && !multiple_of_12)
		return;

	if (!power_of_2) {
		ASSERT(multiple_of_12);
		dim /= 12;
	}

	i32 *m;
	if (power_of_2) m = out;
	else            m = tmp;

#define IND(i, j) ((i) * dim + (j))
	m[0] = 1;
	for (u32 k = 1; k < dim; k *= 2) {
		for (u32 i = 0; i < k; i++) {
			for (u32 j = 0; j < k; j++) {
				i32 val = m[IND(i, j)];
				m[IND(i + k, j)]     =  val;
				m[IND(i, j + k)]     =  val;
				m[IND(i + k, j + k)] = -val;
			}
		}
	}
#undef IND

	if (!power_of_2)
		kronecker_product(out, tmp, (uv2){.x = dim, .y = dim}, hadamard_12_12_transpose,
		                  (uv2){.x = 12, .y = 12});
}
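
/* NOTE: illustrative sketch only (hypothetical helper, kept out of the build):
 * building a 24x24 Hadamard matrix (24 = 2 * 12). fill_hadamard_transpose()
 * first builds the 2x2 Sylvester matrix in tmp, then Kronecker-expands it with
 * the 12x12 table at the top of this file into out. Buffer sizes follow from
 * the requested dimension; the arena passed in is assumed to be large enough. */
#if 0
static i32 *
example_hadamard_24(Arena *a)
{
	i32 *out = alloc(a, i32, 24 * 24);
	i32 *tmp = alloc(a, i32, 2 * 2);   /* scratch for the power-of-two factor */
	fill_hadamard_transpose(out, tmp, 24);
	return out;
}
#endif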