jdict.c (13838B)
1 /* See LICENSE for license details. */ 2 #ifdef _DEBUG 3 #define ASSERT(c) do { __asm("int3; nop"); } while (0) 4 #else 5 #define ASSERT(c) {} 6 #endif 7 8 #ifndef unreachable 9 #define unreachable() __builtin_unreachable() 10 #endif 11 12 #define ARRAY_COUNT(a) (sizeof(a) / sizeof(*a)) 13 #define ISSPACE(c) ((c) == ' ' || (c) == '\n' || (c) == '\t') 14 15 #define MEGABYTE (1024ULL * 1024ULL) 16 17 typedef struct { 18 size len; 19 u8 *s; 20 } s8; 21 #define s8(cstr) (s8){.len = ARRAY_COUNT(cstr) - 1, .s = (u8 *)cstr} 22 23 typedef struct { 24 u8 *data; 25 u32 cap; 26 u32 widx; 27 i32 fd; 28 b32 errors; 29 } Stream; 30 31 typedef struct { 32 u8 *beg, *end; 33 #ifdef _DEBUG_ARENA 34 size min_capacity_remaining; 35 #endif 36 } Arena; 37 38 typedef struct { 39 Stream *dir_name; 40 void *dirfd; 41 } PathStream; 42 43 #include "yomidict.c" 44 45 #define YOMI_TOKS_PER_ENT 10 46 47 /* Number of hash table slots (1 << HT_EXP) */ 48 #define HT_EXP 20 49 50 typedef struct DictDef { 51 s8 text; 52 struct DictDef *next; 53 } DictDef; 54 55 typedef struct { 56 s8 term; 57 DictDef *def; 58 } DictEnt; 59 60 struct ht { 61 DictEnt **ents; 62 i32 len; 63 }; 64 65 typedef struct { 66 s8 rom; 67 s8 name; 68 struct ht ht; 69 } Dict; 70 71 #include "config.h" 72 73 static void __attribute__((noreturn)) os_exit(i32); 74 75 static b32 os_write(iptr, s8); 76 static s8 os_read_whole_file(char *, Arena *, u32); 77 static b32 os_read_stdin(u8 *, size); 78 79 static PathStream os_begin_path_stream(Stream *, Arena *, u32); 80 static s8 os_get_valid_file(PathStream *, s8, Arena *, u32); 81 static void os_end_path_stream(PathStream *); 82 83 static Stream error_stream; 84 static Stream stdout_stream; 85 86 static void 87 stream_flush(Stream *s) 88 { 89 if (s->fd <= 0) { 90 s->errors = 1; 91 } else if (s->widx) { 92 s->errors = !os_write(s->fd, (s8){.len = s->widx, .s = s->data}); 93 if (!s->errors) s->widx = 0; 94 } 95 } 96 97 static void 98 stream_append_byte(Stream *s, u8 b) 99 { 100 if (s->widx + 1 > s->cap) 101 stream_flush(s); 102 if (!s->errors) 103 s->data[s->widx++] = b; 104 } 105 106 static void 107 stream_append_s8(Stream *s, s8 str) 108 { 109 if (str.len > s->cap - s->widx) 110 stream_flush(s); 111 s->errors |= (s->cap - s->widx) < str.len; 112 if (!s->errors) { 113 for (size i = 0; i < str.len; i++) 114 s->data[s->widx++] = str.s[i]; 115 } 116 } 117 118 static void 119 stream_ensure_newline(Stream *s) 120 { 121 if (s->widx && s->data[s->widx - 1] != '\n') 122 stream_append_byte(s, '\n'); 123 } 124 125 #ifdef _DEBUG_ARENA 126 static void 127 stream_append_u64(Stream *s, u64 n) 128 { 129 u8 tmp[64]; 130 u8 *end = tmp + sizeof(tmp); 131 u8 *beg = end; 132 do { *--beg = '0' + (n % 10); } while (n /= 10); 133 stream_append_s8(s, (s8){.len = end - beg, .s = beg}); 134 } 135 #endif 136 137 static s8 138 cstr_to_s8(char *cstr) 139 { 140 s8 result = {.s = (u8 *)cstr}; 141 if (cstr) while (*cstr) { result.len++; cstr++; } 142 return result; 143 } 144 145 static void __attribute__((noreturn)) 146 die(Stream *s) 147 { 148 stream_ensure_newline(s); 149 stream_flush(s); 150 os_exit(1); 151 } 152 153 static void * 154 mem_clear(void *p_, u8 c, size len) 155 { 156 u8 *p = p_; 157 while (len) p[--len] = c; 158 return p; 159 } 160 161 enum arena_flags { 162 ARENA_NONE = 0 << 0, 163 ARENA_NO_CLEAR = 1 << 0, 164 ARENA_ALLOC_END = 1 << 1, 165 }; 166 167 #define alloc(a, t, n, flags) (t *)alloc_(a, sizeof(t), _Alignof(t), n, flags) 168 static void * 169 alloc_(Arena *a, size len, size align, size count, u32 flags) 170 { 171 size padding; 172 if (flags & ARENA_ALLOC_END) padding = (usize)a->end & (align - 1); 173 else padding = -(usize)a->beg & (align - 1); 174 175 size available = a->end - a->beg - padding; 176 if (available <= 0 || available / len <= count) 177 ASSERT(0); 178 179 void *result; 180 if (flags & ARENA_ALLOC_END) { 181 a->end -= padding + count * len; 182 result = a->end; 183 } else { 184 result = a->beg + padding; 185 a->beg += padding + count * len; 186 } 187 188 #ifdef _DEBUG_ARENA 189 if (a->end - a->beg < a->min_capacity_remaining) 190 a->min_capacity_remaining = a->end - a->beg; 191 #endif 192 193 if (flags & ARENA_NO_CLEAR) return result; 194 else return mem_clear(result, 0, count * len); 195 } 196 197 static void 198 usage(s8 argv0) 199 { 200 stream_append_s8(&error_stream, s8("usage: ")); 201 stream_append_s8(&error_stream, argv0); 202 stream_append_s8(&error_stream, s8(" [-d path] [-F FS] [-i] term ...\n")); 203 die(&error_stream); 204 } 205 206 static s8 207 s8_dup(Arena *a, s8 old) 208 { 209 s8 result = {.len = old.len, .s = alloc(a, u8, old.len, ARENA_NO_CLEAR)}; 210 for (size i = 0; i < old.len; i++) 211 result.s[i] = old.s[i]; 212 return result; 213 } 214 215 static b32 216 s8_equal(s8 a, s8 b) 217 { 218 i32 result = 0; 219 if (a.len != b.len) 220 return 0; 221 /* NOTE: we assume short strings in this program */ 222 for (size i = 0; i < a.len; i++) 223 result += b.s[i] - a.s[i]; 224 return result == 0; 225 } 226 227 static s8 228 s8_cut_head(s8 s, size count) 229 { 230 s8 result = s; 231 result.s += count; 232 result.len -= count; 233 return result; 234 } 235 236 /* 237 * trim whitespace from start and end of str 238 * returns a new s8 (same memory) 239 */ 240 static s8 241 s8trim(s8 str) 242 { 243 u8 *p = str.s + str.len - 1; 244 245 for (; str.len && ISSPACE(*p); str.len--, p--); 246 for (; str.len && ISSPACE(*str.s); str.len--, str.s++); 247 248 return str; 249 } 250 251 /* replace escaped control chars with their actual char */ 252 static s8 253 unescape(s8 str) 254 { 255 for (size i = 0; i < str.len; i++) { 256 if (str.s[i] == '\\') { 257 switch (str.s[i + 1]) { 258 case 'n': str.s[i] = '\n'; break; 259 case 't': str.s[i] = '\t'; break; 260 default: continue; 261 } 262 str.len--; 263 for (size j = i + 1; j < str.len; j++) 264 str.s[j] = str.s[j + 1]; 265 } 266 } 267 return str; 268 } 269 270 /* FNV-1a hash */ 271 static u64 272 hash(s8 v) 273 { 274 u64 h = 0x3243f6a8885a308d; /* digits of pi */ 275 for (; v.len; v.len--) { 276 h ^= v.s[v.len - 1] & 0xFF; 277 h *= 1111111111111111111; /* random prime */ 278 } 279 return h; 280 } 281 282 static i32 283 ht_lookup(u64 hash, int exp, i32 idx) 284 { 285 u32 mask = ((u32)1 << exp) - 1; 286 u32 step = (hash >> (64 - exp)) | 1; 287 return (idx + step) & mask; 288 } 289 290 static DictEnt ** 291 intern(struct ht *t, s8 key) 292 { 293 u64 h = hash(key); 294 i32 i = h; 295 for (;;) { 296 i = ht_lookup(h, HT_EXP, i); 297 if (!t->ents[i]) { 298 /* empty slot */ 299 #ifdef _DEBUG 300 if ((u32)t->len + 1 == (u32)1<<(HT_EXP - 1)) { 301 stream_append_s8(&error_stream, 302 s8("intern: ht exceeded 0.5 fill factor\n")); 303 } 304 #endif 305 t->len++; 306 return t->ents + i; 307 } else if (s8_equal(t->ents[i]->term, key)) { 308 /* found; return the stored instance */ 309 return t->ents + i; 310 } 311 /* NOTE: else relookup and try again */ 312 } 313 } 314 315 static void 316 parse_term_bank(Arena *a, struct ht *ht, s8 data) 317 { 318 /* allocate tokens */ 319 size ntoks = (1 << HT_EXP) * YOMI_TOKS_PER_ENT + 1; 320 YomiTok *toks = alloc(a, YomiTok, ntoks, ARENA_ALLOC_END|ARENA_NO_CLEAR); 321 322 YomiScanner s = {0}; 323 yomi_scanner_init(&s, (char *)data.s, data.len); 324 i32 r; 325 while ((r = yomi_scan(&s, toks, ntoks)) < 0) { 326 switch (r) { 327 case YOMI_ERROR_NOMEM: 328 goto cleanup; 329 case YOMI_ERROR_INVAL: 330 case YOMI_ERROR_MALFO: 331 stream_append_s8(&error_stream, s8("yomi_parse: ")); 332 if (r == YOMI_ERROR_INVAL) 333 stream_append_s8(&error_stream, s8("YOMI_ERROR_INVAL\n")); 334 else 335 stream_append_s8(&error_stream, s8("YOMI_ERROR_MALFO\n")); 336 goto cleanup; 337 } 338 } 339 340 for (i32 i = 0; i < r; i++) { 341 YomiTok *base_tok = toks + i; 342 if (base_tok->type != YOMI_ENTRY) 343 continue; 344 345 YomiTok *tstr = NULL, *tdefs = NULL; 346 for (usize j = 1; j < base_tok->len; j++) { 347 348 switch (base_tok[j].type) { 349 case YOMI_STR: 350 if (tstr == NULL) 351 tstr = base_tok + j; 352 break; 353 case YOMI_ARRAY: 354 if (tdefs == NULL) 355 tdefs = base_tok + j; 356 default: /* FALLTHROUGH */ 357 break; 358 } 359 } 360 361 /* check if entry was valid */ 362 if (tdefs == NULL || tstr == NULL) { 363 stream_append_s8(&error_stream, s8("parse_term_bank: invalid entry: got ")); 364 if (!tdefs) stream_append_s8(&error_stream, s8("tdefs")); 365 else stream_append_s8(&error_stream, s8("tstr")); 366 stream_append_s8(&error_stream, s8(" == NULL\n")); 367 break; 368 } 369 370 s8 mem_term = {.len = tstr->end - tstr->start, .s = data.s + tstr->start}; 371 DictEnt **n = intern(ht, mem_term); 372 373 if (!*n) { 374 *n = alloc(a, DictEnt, 1, 0); 375 (*n)->term = s8_dup(a, mem_term); 376 } else { 377 if (!s8_equal((*n)->term, mem_term)) { 378 stream_append_s8(&error_stream, s8("hash collision: ")); 379 stream_append_s8(&error_stream, mem_term); 380 stream_append_byte(&error_stream, '\t'); 381 stream_append_s8(&error_stream, (*n)->term); 382 stream_append_byte(&error_stream, '\n'); 383 } 384 } 385 386 for (usize i = 1; i <= tdefs->len; i++) { 387 DictDef *def = alloc(a, DictDef, 1, ARENA_NO_CLEAR); 388 def->text = s8_dup(a, (s8){.len = tdefs[i].end - tdefs[i].start, 389 .s = data.s + tdefs[i].start}); 390 def->next = (*n)->def; 391 (*n)->def = def; 392 } 393 } 394 395 cleanup: 396 stream_ensure_newline(&error_stream); 397 } 398 399 static int 400 make_dict(Arena *a, Dict *d) 401 { 402 u8 *starting_arena_end = a->end; 403 Stream path = {.cap = 1 * MEGABYTE}; 404 path.data = alloc(a, u8, path.cap, ARENA_ALLOC_END|ARENA_NO_CLEAR); 405 d->ht.ents = alloc(a, DictEnt *, 1 << HT_EXP, 0); 406 407 stream_append_s8(&path, prefix); 408 stream_append_s8(&path, os_path_sep); 409 stream_append_s8(&path, d->rom); 410 PathStream ps = os_begin_path_stream(&path, a, ARENA_ALLOC_END); 411 412 u8 *arena_end = a->end; 413 s8 fn_pre = s8("term"); 414 for (s8 filedata = os_get_valid_file(&ps, fn_pre, a, ARENA_ALLOC_END); 415 filedata.len; 416 filedata = os_get_valid_file(&ps, fn_pre, a, ARENA_ALLOC_END)) 417 { 418 parse_term_bank(a, &d->ht, filedata); 419 a->end = arena_end; 420 } 421 os_end_path_stream(&ps); 422 423 a->end = starting_arena_end; 424 425 return 1; 426 } 427 428 static void 429 make_dicts(Arena *a, Dict *dicts, u32 ndicts) 430 { 431 for (u32 i = 0; i < ndicts; i++) { 432 if (!make_dict(a, &dicts[i])) { 433 stream_append_s8(&error_stream, s8("make_dict failed for: ")); 434 stream_append_s8(&error_stream, dicts[i].rom); 435 stream_append_byte(&error_stream, '\n'); 436 } 437 } 438 } 439 440 static DictEnt * 441 find_ent(s8 term, Dict *d) 442 { 443 u64 h = hash(term); 444 i32 i = ht_lookup(h, HT_EXP, (i32)h); 445 return d->ht.ents[i]; 446 } 447 448 static void 449 find_and_print(s8 term, Dict *d) 450 { 451 DictEnt *ent = find_ent(term, d); 452 453 if (!ent || !s8_equal(term, ent->term)) 454 return; 455 456 b32 print_for_readability = s8_equal(fsep, s8("\n")); 457 b32 printed_header = 0; 458 for (DictDef *def = ent->def; def; def = def->next) { 459 if (print_for_readability) 460 def->text = unescape(def->text); 461 /* NOTE: some dictionaries are "hand-made" by idiots and have definitions 462 * with only white space in them */ 463 def->text = s8trim(def->text); 464 if (def->text.len) { 465 if (!print_for_readability) { 466 stream_append_s8(&stdout_stream, d->name); 467 } else if (!printed_header) { 468 stream_append_s8(&stdout_stream, s8("\x1b[36;1m")); 469 stream_append_s8(&stdout_stream, d->name); 470 stream_append_s8(&stdout_stream, s8("\x1b[0m")); 471 printed_header = 1; 472 } 473 474 stream_append_s8(&stdout_stream, fsep); 475 stream_append_s8(&stdout_stream, def->text); 476 stream_append_byte(&stdout_stream, '\n'); 477 } 478 } 479 if (print_for_readability && printed_header) 480 stream_append_byte(&stdout_stream, '\n'); 481 stream_flush(&stdout_stream); 482 } 483 484 static void 485 find_and_print_defs(Arena *a, Dict *dict, s8 *terms, u32 nterms) 486 { 487 if (!make_dict(a, dict)) { 488 stream_append_s8(&error_stream, s8("failed to allocate dict: ")); 489 stream_append_s8(&error_stream, dict->rom); 490 stream_append_byte(&stdout_stream, '\n'); 491 return; 492 } 493 494 for (u32 i = 0; i < nterms; i++) 495 find_and_print(terms[i], dict); 496 } 497 498 static b32 499 get_stdin_line(Stream *buf) 500 { 501 b32 result = 0; 502 for (; buf->widx < buf->cap; buf->widx++) { 503 u8 *c = buf->data + buf->widx; 504 if (!os_read_stdin(c, 1) || *c == (u8)-1) { 505 break; 506 } else if (*c == '\n') { 507 result = 1; 508 break; 509 } 510 } 511 return result; 512 } 513 514 static void 515 repl(Arena *a, Dict *dicts, u32 ndicts) 516 { 517 Stream buf = {.cap = 4096}; 518 buf.data = alloc(a, u8, buf.cap, ARENA_NO_CLEAR); 519 520 make_dicts(a, dicts, ndicts); 521 522 fsep = s8("\n"); 523 for (;;) { 524 stream_append_s8(&stdout_stream, repl_prompt); 525 stream_flush(&stdout_stream); 526 if (!get_stdin_line(&buf)) 527 break; 528 s8 trimmed = s8trim((s8){.len = buf.widx, .s = buf.data}); 529 for (u32 i = 0; i < ndicts; i++) 530 find_and_print(trimmed, &dicts[i]); 531 buf.widx = 0; 532 } 533 stream_append_s8(&stdout_stream, repl_quit); 534 } 535 536 static i32 537 jdict(Arena *a, i32 argc, char *argv[]) 538 { 539 Dict *dicts = NULL; 540 i32 ndicts = 0, nterms = 0; 541 i32 iflag = 0; 542 543 s8 argv0 = cstr_to_s8(argv[0]); 544 for (argv++, argc--; argv[0] && argv[0][0] == '-' && argv[0][1]; argc--, argv++) { 545 /* NOTE: '--' to end parameters */ 546 if (argv[0][1] == '-' && argv[0][2] == 0) { 547 argv++; 548 argc--; 549 break; 550 } 551 switch (argv[0][1]) { 552 case 'F': 553 if (!argv[1] || !argv[1][0]) 554 usage(argv0); 555 fsep = unescape(cstr_to_s8(argv[1])); 556 argv++; 557 break; 558 case 'd': { 559 if (!argv[1] || !argv[1][0]) 560 usage(argv0); 561 s8 dname = cstr_to_s8(argv[1]); 562 for (u32 j = 0; j < ARRAY_COUNT(default_dict_map); j++) { 563 if (s8_equal(dname, default_dict_map[j].rom)) { 564 dicts = &default_dict_map[j]; 565 ndicts++; 566 break; 567 } 568 } 569 if (dicts == NULL) { 570 stream_append_s8(&error_stream, s8("invalid dictionary name: ")); 571 stream_append_s8(&error_stream, dname); 572 die(&error_stream); 573 } 574 argv++; 575 } break; 576 case 'i': iflag = 1; break; 577 default: usage(argv0); break; 578 } 579 } 580 581 if (ndicts == 0) { 582 dicts = default_dict_map; 583 ndicts = ARRAY_COUNT(default_dict_map); 584 } 585 586 /* NOTE: remaining argv elements are search terms */ 587 nterms = argc; 588 s8 *terms = alloc(a, s8, nterms, 0); 589 for (i32 i = 0; argc && *argv; argv++, i++, argc--) 590 terms[i] = cstr_to_s8(*argv); 591 592 if (nterms == 0 && iflag == 0) 593 usage(argv0); 594 595 if (iflag == 0) 596 for (i32 i = 0; i < ndicts; i++) 597 find_and_print_defs(a, &dicts[i], terms, nterms); 598 else 599 repl(a, dicts, ndicts); 600 601 #ifdef _DEBUG_ARENA 602 stream_append_s8(&error_stream, s8("min remaining arena capacity: ")); 603 stream_append_u64(&error_stream, memory.min_capacity_remaining); 604 stream_append_s8(&error_stream, s8("\nremaining arena capacity: ")); 605 stream_append_u64(&error_stream, memory.end - memory.beg); 606 #endif 607 608 stream_ensure_newline(&error_stream); 609 stream_flush(&error_stream); 610 611 stream_ensure_newline(&stdout_stream); 612 stream_flush(&stdout_stream); 613 614 return 0; 615 }