jdict

command line tool for looking up terms in yomidict dictionaries
git clone anongit@rnpnr.xyz:jdict.git
Log | Files | Refs | Feed | README | LICENSE

jdict.c (13983B)


      1 /* See LICENSE for license details. */
      2 #ifndef asm
      3 #ifdef __asm
      4 #define asm __asm
      5 #else
      6 #define asm __asm__
      7 #endif
      8 #endif
      9 
     10 #define FORCE_INLINE inline __attribute__((always_inline))
     11 
     12 #ifdef __ARM_ARCH_ISA_A64
     13 /* TODO? debuggers just loop here forever and need a manual PC increment (jump +1 in gdb) */
     14 #define debugbreak() asm volatile ("brk 0xf000")
     15 #elif __x86_64__
     16 #define debugbreak() asm volatile ("int3; nop")
     17 #endif
     18 
     19 #ifdef _DEBUG
     20 #define ASSERT(c) do { debugbreak(); } while (0)
     21 #else
     22 #define ASSERT(c) {}
     23 #endif
     24 
     25 #ifndef unreachable
     26 #define unreachable() __builtin_unreachable()
     27 #endif
     28 
     29 #define ARRAY_COUNT(a) (sizeof(a) / sizeof(*a))
     30 #define ISSPACE(c)     ((c) == ' ' || (c) == '\n' || (c) == '\t')
     31 
     32 #define MEGABYTE (1024ULL * 1024ULL)
     33 
     34 typedef struct {
     35 	size len;
     36 	u8   *s;
     37 } s8;
     38 #define s8(cstr) (s8){.len = ARRAY_COUNT(cstr) - 1, .s = (u8 *)cstr}
     39 
     40 typedef struct {
     41 	u8   *data;
     42 	u32   cap;
     43 	u32   widx;
     44 	i32   fd;
     45 	b32   errors;
     46 } Stream;
     47 
     48 typedef struct {
     49 	u8 *beg, *end;
     50 #ifdef _DEBUG_ARENA
     51 	size min_capacity_remaining;
     52 #endif
     53 } Arena;
     54 
     55 #include "yomidict.c"
     56 
     57 #define YOMI_TOKS_PER_ENT 10
     58 
     59 /* Number of hash table slots (1 << HT_EXP) */
     60 #define HT_EXP 20
     61 
     62 typedef struct DictDef {
     63 	s8 text;
     64 	struct DictDef *next;
     65 } DictDef;
     66 
     67 typedef struct {
     68 	s8 term;
     69 	DictDef *def;
     70 } DictEnt;
     71 
     72 struct ht {
     73 	DictEnt **ents;
     74 	i32 len;
     75 };
     76 
     77 typedef struct {
     78 	s8 rom;
     79 	s8 name;
     80 	struct ht ht;
     81 } Dict;
     82 
     83 #include "config.h"
     84 
     85 static void __attribute__((noreturn)) os_exit(i32);
     86 
     87 static b32 os_write(iptr, s8);
     88 static b32 os_read_stdin(u8 *, size);
     89 
     90 static iptr os_begin_path_stream(Stream *, Arena *, u32);
     91 static s8   os_get_valid_file(iptr, s8, Arena *, u32);
     92 static void os_end_path_stream(iptr);
     93 
     94 static Stream error_stream;
     95 static Stream stdout_stream;
     96 
     97 static void
     98 stream_flush(Stream *s)
     99 {
    100 	if (s->fd <= 0) {
    101 		s->errors = 1;
    102 	} else if (s->widx) {
    103 		s->errors = !os_write(s->fd, (s8){.len = s->widx, .s = s->data});
    104 		if (!s->errors) s->widx = 0;
    105 	}
    106 }
    107 
    108 static void
    109 stream_append_byte(Stream *s, u8 b)
    110 {
    111 	if (s->widx + 1 > s->cap)
    112 		stream_flush(s);
    113 	if (!s->errors)
    114 		s->data[s->widx++] = b;
    115 }
    116 
    117 static void
    118 stream_append_s8(Stream *s, s8 str)
    119 {
    120 	if (str.len > s->cap - s->widx)
    121 		stream_flush(s);
    122 	s->errors |= (s->cap - s->widx) < str.len;
    123 	if (!s->errors) {
    124 		for (size i = 0; i < str.len; i++)
    125 			s->data[s->widx++] = str.s[i];
    126 	}
    127 }
    128 
    129 static void
    130 stream_ensure_newline(Stream *s)
    131 {
    132 	if (s->widx && s->data[s->widx - 1] != '\n')
    133 		stream_append_byte(s, '\n');
    134 }
    135 
    136 #ifdef _DEBUG_ARENA
    137 static void
    138 stream_append_u64(Stream *s, u64 n)
    139 {
    140 	u8 tmp[64];
    141 	u8 *end = tmp + sizeof(tmp);
    142 	u8 *beg = end;
    143 	do { *--beg = '0' + (n % 10); } while (n /= 10);
    144 	stream_append_s8(s, (s8){.len = end - beg, .s = beg});
    145 }
    146 #endif
    147 
    148 static s8
    149 cstr_to_s8(char *cstr)
    150 {
    151 	s8 result = {.s = (u8 *)cstr};
    152 	if (cstr) while (*cstr) { result.len++; cstr++; }
    153 	return result;
    154 }
    155 
    156 static void __attribute__((noreturn))
    157 die(Stream *s)
    158 {
    159 	stream_ensure_newline(s);
    160 	stream_flush(s);
    161 	os_exit(1);
    162 }
    163 
    164 static void *
    165 mem_clear(void *p_, u8 c, size len)
    166 {
    167 	u8 *p = p_;
    168 	while (len) p[--len] = c;
    169 	return p;
    170 }
    171 
    172 enum arena_flags {
    173 	ARENA_NONE      = 0 << 0,
    174 	ARENA_NO_CLEAR  = 1 << 0,
    175 	ARENA_ALLOC_END = 1 << 1,
    176 };
    177 
    178 #define alloc(a, t, n, flags)  (t *)alloc_(a, sizeof(t), _Alignof(t), n, flags)
    179 static void *
    180 alloc_(Arena *a, size len, size align, size count, u32 flags)
    181 {
    182 	size padding;
    183 	if (flags & ARENA_ALLOC_END) padding =  (usize)a->end & (align - 1);
    184 	else                         padding = -(usize)a->beg & (align - 1);
    185 
    186 	size available = a->end - a->beg - padding;
    187 	if (available <= 0 || available / len <= count)
    188 		ASSERT(0);
    189 
    190 	void *result;
    191 	if (flags & ARENA_ALLOC_END) {
    192 		a->end -= padding + count * len;
    193 		result  = a->end;
    194 	} else {
    195 		result  = a->beg + padding;
    196 		a->beg += padding + count * len;
    197 	}
    198 
    199 #ifdef _DEBUG_ARENA
    200 	if (a->end - a->beg < a->min_capacity_remaining)
    201 		a->min_capacity_remaining = a->end - a->beg;
    202 #endif
    203 
    204 	if (flags & ARENA_NO_CLEAR) return result;
    205 	else                        return mem_clear(result, 0, count * len);
    206 }
    207 
    208 static void
    209 usage(s8 argv0)
    210 {
    211 	stream_append_s8(&error_stream, s8("usage: "));
    212 	stream_append_s8(&error_stream, argv0);
    213 	stream_append_s8(&error_stream, s8(" [-d path] [-F FS] [-i] term ...\n"));
    214 	die(&error_stream);
    215 }
    216 
    217 static s8
    218 s8_dup(Arena *a, s8 old)
    219 {
    220 	s8 result = {.len = old.len, .s = alloc(a, u8, old.len, ARENA_NO_CLEAR)};
    221 	for (size i = 0; i < old.len; i++)
    222 		result.s[i] = old.s[i];
    223 	return result;
    224 }
    225 
    226 static b32
    227 s8_equal(s8 a, s8 b)
    228 {
    229 	i32 result = 0;
    230 	if (a.len != b.len)
    231 		return 0;
    232 	/* NOTE: we assume short strings in this program */
    233 	for (size i = 0; i < a.len; i++)
    234 		result += b.s[i] - a.s[i];
    235 	return result == 0;
    236 }
    237 
    238 static s8
    239 s8_cut_head(s8 s, size count)
    240 {
    241 	s8 result   = s;
    242 	result.s   += count;
    243 	result.len -= count;
    244 	return result;
    245 }
    246 
    247 /*
    248  * trim whitespace from start and end of str
    249  * returns a new s8 (same memory)
    250  */
    251 static s8
    252 s8trim(s8 str)
    253 {
    254 	u8 *p = str.s + str.len - 1;
    255 
    256 	for (; str.len && ISSPACE(*p); str.len--, p--);
    257 	for (; str.len && ISSPACE(*str.s); str.len--, str.s++);
    258 
    259 	return str;
    260 }
    261 
    262 /* replace escaped control chars with their actual char */
    263 static s8
    264 unescape(s8 str)
    265 {
    266 	for (size i = 0; i < str.len; i++) {
    267 		if (str.s[i] == '\\') {
    268 			switch (str.s[i + 1]) {
    269 			case 'n': str.s[i] = '\n'; break;
    270 			case 't': str.s[i] = '\t'; break;
    271 			default: continue;
    272 			}
    273 			str.len--;
    274 			for (size j = i + 1; j < str.len; j++)
    275 				str.s[j] = str.s[j + 1];
    276 		}
    277 	}
    278 	return str;
    279 }
    280 
    281 /* FNV-1a hash */
    282 static u64
    283 hash(s8 v)
    284 {
    285 	u64 h = 0x3243f6a8885a308d; /* digits of pi */
    286 	for (; v.len; v.len--) {
    287 		h ^= v.s[v.len - 1] & 0xFF;
    288 		h *= 1111111111111111111; /* random prime */
    289 	}
    290 	return h;
    291 }
    292 
    293 static i32
    294 ht_lookup(u64 hash, int exp, i32 idx)
    295 {
    296 	u32 mask = ((u32)1 << exp) - 1;
    297 	u32 step = (hash >> (64 - exp)) | 1;
    298 	return (idx + step) & mask;
    299 }
    300 
    301 static DictEnt **
    302 intern(struct ht *t, s8 key)
    303 {
    304 	u64 h = hash(key);
    305 	i32 i = h;
    306 	for (;;) {
    307 		i = ht_lookup(h, HT_EXP, i);
    308 		if (!t->ents[i]) {
    309 			/* empty slot */
    310 			#ifdef _DEBUG
    311 			if ((u32)t->len + 1 == (u32)1<<(HT_EXP - 1)) {
    312 				stream_append_s8(&error_stream,
    313 				                 s8("intern: ht exceeded 0.5 fill factor\n"));
    314 			}
    315 			#endif
    316 			t->len++;
    317 			return t->ents + i;
    318 		} else if (s8_equal(t->ents[i]->term, key)) {
    319 			/* found; return the stored instance */
    320 			return t->ents + i;
    321 		}
    322 		/* NOTE: else relookup and try again */
    323 	}
    324 }
    325 
    326 static void
    327 parse_term_bank(Arena *a, struct ht *ht, s8 data)
    328 {
    329 	/* allocate tokens */
    330 	size ntoks = (1 << HT_EXP) * YOMI_TOKS_PER_ENT + 1;
    331 	YomiTok *toks = alloc(a, YomiTok, ntoks, ARENA_ALLOC_END|ARENA_NO_CLEAR);
    332 
    333 	YomiScanner s = {0};
    334 	yomi_scanner_init(&s, (char *)data.s, data.len);
    335 	i32 r;
    336 	while ((r = yomi_scan(&s, toks, ntoks)) < 0) {
    337 		switch (r) {
    338 		case YOMI_ERROR_NOMEM:
    339 			goto cleanup;
    340 		case YOMI_ERROR_INVAL:
    341 		case YOMI_ERROR_MALFO:
    342 			stream_append_s8(&error_stream, s8("yomi_parse: "));
    343 			if (r == YOMI_ERROR_INVAL)
    344 				stream_append_s8(&error_stream, s8("YOMI_ERROR_INVAL\n"));
    345 			else
    346 				stream_append_s8(&error_stream, s8("YOMI_ERROR_MALFO\n"));
    347 			goto cleanup;
    348 		}
    349 	}
    350 
    351 	for (i32 i = 0; i < r; i++) {
    352 		YomiTok *base_tok = toks + i;
    353 		if (base_tok->type != YOMI_ENTRY)
    354 			continue;
    355 
    356 		YomiTok *tstr = 0, *tdefs = 0;
    357 		for (usize j = 1; j < base_tok->len; j++) {
    358 			switch (base_tok[j].type) {
    359 			case YOMI_STR:   if (!tstr)  tstr  = base_tok + j; break;
    360 			case YOMI_ARRAY: if (!tdefs) tdefs = base_tok + j; break;
    361 			default: break;
    362 			}
    363 		}
    364 
    365 		/* check if entry was valid */
    366 		if (!tdefs || !tstr) {
    367 			stream_append_s8(&error_stream, s8("parse_term_bank: invalid entry: missing "));
    368 			if (!tdefs) stream_append_s8(&error_stream, s8("definition token\n"));
    369 			else        stream_append_s8(&error_stream, s8("name token\n"));
    370 			break;
    371 		}
    372 
    373 		s8 mem_term = {.len = tstr->end - tstr->start, .s = data.s + tstr->start};
    374 		DictEnt **n = intern(ht, mem_term);
    375 
    376 		if (!*n) {
    377 			*n         = alloc(a, DictEnt, 1, 0);
    378 			(*n)->term = s8_dup(a, mem_term);
    379 		} else {
    380 			if (!s8_equal((*n)->term, mem_term)) {
    381 				stream_append_s8(&error_stream, s8("hash collision: "));
    382 				stream_append_s8(&error_stream, mem_term);
    383 				stream_append_byte(&error_stream, '\t');
    384 				stream_append_s8(&error_stream, (*n)->term);
    385 				stream_append_byte(&error_stream, '\n');
    386 			}
    387 		}
    388 
    389 		for (usize i = 1; i <= tdefs->len; i++) {
    390 			DictDef *def = alloc(a, DictDef, 1, ARENA_NO_CLEAR);
    391 			def->text = s8_dup(a, (s8){.len = tdefs[i].end - tdefs[i].start,
    392 			                           .s = data.s + tdefs[i].start});
    393 			def->next = (*n)->def;
    394 			(*n)->def = def;
    395 		}
    396 	}
    397 
    398 cleanup:
    399 	stream_ensure_newline(&error_stream);
    400 }
    401 
    402 static int
    403 make_dict(Arena *a, Dict *d)
    404 {
    405 	u8 *starting_arena_end = a->end;
    406 	Stream path = {.cap = 1 * MEGABYTE};
    407 	path.data   = alloc(a, u8, path.cap, ARENA_ALLOC_END|ARENA_NO_CLEAR);
    408 	d->ht.ents  = alloc(a, DictEnt *, 1 << HT_EXP, 0);
    409 
    410 	stream_append_s8(&path, prefix);
    411 	stream_append_s8(&path, os_path_sep);
    412 	stream_append_s8(&path, d->rom);
    413 	iptr path_stream = os_begin_path_stream(&path, a, ARENA_ALLOC_END);
    414 
    415 	u8 *arena_end = a->end;
    416 	s8 fn_pre = s8("term");
    417 	for (s8 filedata = os_get_valid_file(path_stream, fn_pre, a, ARENA_ALLOC_END);
    418 	     filedata.len;
    419 	     filedata = os_get_valid_file(path_stream, fn_pre, a, ARENA_ALLOC_END))
    420 	{
    421 		parse_term_bank(a, &d->ht, filedata);
    422 		a->end = arena_end;
    423 	}
    424 	os_end_path_stream(path_stream);
    425 
    426 	a->end = starting_arena_end;
    427 
    428 	return 1;
    429 }
    430 
    431 static void
    432 make_dicts(Arena *a, Dict *dicts, u32 ndicts)
    433 {
    434 	for (u32 i = 0; i < ndicts; i++) {
    435 		if (!make_dict(a, &dicts[i])) {
    436 			stream_append_s8(&error_stream, s8("make_dict failed for: "));
    437 			stream_append_s8(&error_stream, dicts[i].rom);
    438 			stream_append_byte(&error_stream, '\n');
    439 		}
    440 	}
    441 }
    442 
    443 static DictEnt *
    444 find_ent(s8 term, Dict *d)
    445 {
    446 	u64 h = hash(term);
    447 	i32 i = ht_lookup(h, HT_EXP, (i32)h);
    448 	return d->ht.ents[i];
    449 }
    450 
    451 static void
    452 find_and_print(s8 term, Dict *d)
    453 {
    454 	DictEnt *ent = find_ent(term, d);
    455 
    456 	if (!ent || !s8_equal(term, ent->term))
    457 		return;
    458 
    459 	b32 print_for_readability = s8_equal(fsep, s8("\n"));
    460 	b32 printed_header        = 0;
    461 	for (DictDef *def = ent->def; def; def = def->next) {
    462 		if (print_for_readability)
    463 			def->text = unescape(def->text);
    464 		/* NOTE: some dictionaries are "hand-made" by idiots and have definitions
    465 		 * with only white space in them */
    466 		def->text = s8trim(def->text);
    467 		if (def->text.len) {
    468 			if (!print_for_readability) {
    469 				stream_append_s8(&stdout_stream, d->name);
    470 			} else if (!printed_header) {
    471 				stream_append_s8(&stdout_stream, s8("\x1b[36;1m"));
    472 				stream_append_s8(&stdout_stream, d->name);
    473 				stream_append_s8(&stdout_stream, s8("\x1b[0m"));
    474 				printed_header = 1;
    475 			}
    476 
    477 			stream_append_s8(&stdout_stream, fsep);
    478 			stream_append_s8(&stdout_stream, def->text);
    479 			stream_append_byte(&stdout_stream, '\n');
    480 		}
    481 	}
    482 	if (print_for_readability && printed_header)
    483 		stream_append_byte(&stdout_stream, '\n');
    484 	stream_flush(&stdout_stream);
    485 }
    486 
    487 static void
    488 find_and_print_defs(Arena *a, Dict *dict, s8 *terms, u32 nterms)
    489 {
    490 	if (!make_dict(a, dict)) {
    491 		stream_append_s8(&error_stream, s8("failed to allocate dict: "));
    492 		stream_append_s8(&error_stream, dict->rom);
    493 		stream_append_byte(&stdout_stream, '\n');
    494 		return;
    495 	}
    496 
    497 	for (u32 i = 0; i < nterms; i++)
    498 		find_and_print(terms[i], dict);
    499 }
    500 
    501 static b32
    502 get_stdin_line(Stream *buf)
    503 {
    504 	b32 result = 0;
    505 	for (; buf->widx < buf->cap; buf->widx++) {
    506 		u8 *c = buf->data + buf->widx;
    507 		if (!os_read_stdin(c, 1) || *c == (u8)-1) {
    508 			break;
    509 		} else if (*c == '\n') {
    510 			result = 1;
    511 			break;
    512 		}
    513 	}
    514 	return result;
    515 }
    516 
    517 static void
    518 repl(Arena *a, Dict *dicts, u32 ndicts)
    519 {
    520 	Stream buf = {.cap = 4096};
    521 	buf.data   = alloc(a, u8, buf.cap, ARENA_NO_CLEAR);
    522 
    523 	make_dicts(a, dicts, ndicts);
    524 
    525 	fsep = s8("\n");
    526 	for (;;) {
    527 		stream_append_s8(&stdout_stream, repl_prompt);
    528 		stream_flush(&stdout_stream);
    529 		if (!get_stdin_line(&buf))
    530 			break;
    531 		s8 trimmed = s8trim((s8){.len = buf.widx, .s = buf.data});
    532 		for (u32 i = 0; i < ndicts; i++)
    533 			find_and_print(trimmed, &dicts[i]);
    534 		buf.widx = 0;
    535 	}
    536 	stream_append_s8(&stdout_stream, repl_quit);
    537 }
    538 
    539 static i32
    540 jdict(Arena *a, i32 argc, char *argv[])
    541 {
    542 	Dict *dicts = 0;
    543 	i32 ndicts = 0, nterms = 0;
    544 	i32 iflag = 0;
    545 
    546 	s8 argv0 = cstr_to_s8(argv[0]);
    547 	for (argv++, argc--; argv[0] && argv[0][0] == '-' && argv[0][1]; argc--, argv++) {
    548 		/* NOTE: '--' to end parameters */
    549 		if (argv[0][1] == '-' && argv[0][2] == 0) {
    550 			argv++;
    551 			argc--;
    552 			break;
    553 		}
    554 		switch (argv[0][1]) {
    555 		case 'F':
    556 			if (!argv[1] || !argv[1][0])
    557 				usage(argv0);
    558 			fsep = unescape(cstr_to_s8(argv[1]));
    559 			argv++;
    560 			break;
    561 		case 'd': {
    562 			if (!argv[1] || !argv[1][0])
    563 				usage(argv0);
    564 			s8 dname = cstr_to_s8(argv[1]);
    565 			for (u32 j = 0; j < ARRAY_COUNT(default_dict_map); j++) {
    566 				if (s8_equal(dname, default_dict_map[j].rom)) {
    567 					dicts = &default_dict_map[j];
    568 					ndicts++;
    569 					break;
    570 				}
    571 			}
    572 			if (!dicts) {
    573 				stream_append_s8(&error_stream, s8("invalid dictionary name: "));
    574 				stream_append_s8(&error_stream, dname);
    575 				die(&error_stream);
    576 			}
    577 			argv++;
    578 		} break;
    579 		case 'i': iflag = 1;   break;
    580 		default: usage(argv0); break;
    581 		}
    582 	}
    583 
    584 	if (ndicts == 0) {
    585 		dicts  = default_dict_map;
    586 		ndicts = ARRAY_COUNT(default_dict_map);
    587 	}
    588 
    589 	/* NOTE: remaining argv elements are search terms */
    590 	nterms = argc;
    591 	s8 *terms = alloc(a, s8, nterms, 0);
    592 	for (i32 i = 0; argc && *argv; argv++, i++, argc--)
    593 		terms[i] = cstr_to_s8(*argv);
    594 
    595 	if (nterms == 0 && iflag == 0)
    596 		usage(argv0);
    597 
    598 	if (iflag == 0)
    599 		for (i32 i = 0; i < ndicts; i++)
    600 			find_and_print_defs(a, &dicts[i], terms, nterms);
    601 	else
    602 		repl(a, dicts, ndicts);
    603 
    604 #ifdef _DEBUG_ARENA
    605 	stream_append_s8(&error_stream, s8("min remaining arena capacity: "));
    606 	stream_append_u64(&error_stream, memory.min_capacity_remaining);
    607 	stream_append_s8(&error_stream, s8("\nremaining arena capacity: "));
    608 	stream_append_u64(&error_stream, memory.end - memory.beg);
    609 #endif
    610 
    611 	stream_ensure_newline(&error_stream);
    612 	stream_flush(&error_stream);
    613 
    614 	stream_ensure_newline(&stdout_stream);
    615 	stream_flush(&stdout_stream);
    616 
    617 	return 0;
    618 }