jdict

command line tool for looking up terms in yomidict dictionaries
git clone anongit@rnpnr.xyz:jdict.git
Log | Files | Refs | Feed | README | LICENSE

jdict.c (13838B)


      1 /* See LICENSE for license details. */
      2 #ifdef _DEBUG
      3 #define ASSERT(c) do { __asm("int3; nop"); } while (0)
      4 #else
      5 #define ASSERT(c) {}
      6 #endif
      7 
      8 #ifndef unreachable
      9 #define unreachable() __builtin_unreachable()
     10 #endif
     11 
     12 #define ARRAY_COUNT(a) (sizeof(a) / sizeof(*a))
     13 #define ISSPACE(c)     ((c) == ' ' || (c) == '\n' || (c) == '\t')
     14 
     15 #define MEGABYTE (1024ULL * 1024ULL)
     16 
     17 typedef struct {
     18 	size len;
     19 	u8   *s;
     20 } s8;
     21 #define s8(cstr) (s8){.len = ARRAY_COUNT(cstr) - 1, .s = (u8 *)cstr}
     22 
     23 typedef struct {
     24 	u8   *data;
     25 	u32   cap;
     26 	u32   widx;
     27 	i32   fd;
     28 	b32   errors;
     29 } Stream;
     30 
     31 typedef struct {
     32 	u8 *beg, *end;
     33 #ifdef _DEBUG_ARENA
     34 	size min_capacity_remaining;
     35 #endif
     36 } Arena;
     37 
     38 typedef struct {
     39 	Stream *dir_name;
     40 	void   *dirfd;
     41 } PathStream;
     42 
     43 #include "yomidict.c"
     44 
     45 #define YOMI_TOKS_PER_ENT 10
     46 
     47 /* Number of hash table slots (1 << HT_EXP) */
     48 #define HT_EXP 20
     49 
     50 typedef struct DictDef {
     51 	s8 text;
     52 	struct DictDef *next;
     53 } DictDef;
     54 
     55 typedef struct {
     56 	s8 term;
     57 	DictDef *def;
     58 } DictEnt;
     59 
     60 struct ht {
     61 	DictEnt **ents;
     62 	i32 len;
     63 };
     64 
     65 typedef struct {
     66 	s8 rom;
     67 	s8 name;
     68 	struct ht ht;
     69 } Dict;
     70 
     71 #include "config.h"
     72 
     73 static void __attribute__((noreturn)) os_exit(i32);
     74 
     75 static b32 os_write(iptr, s8);
     76 static s8  os_read_whole_file(char *, Arena *, u32);
     77 static b32 os_read_stdin(u8 *, size);
     78 
     79 static PathStream os_begin_path_stream(Stream *, Arena *, u32);
     80 static s8         os_get_valid_file(PathStream *, s8, Arena *, u32);
     81 static void       os_end_path_stream(PathStream *);
     82 
     83 static Stream error_stream;
     84 static Stream stdout_stream;
     85 
     86 static void
     87 stream_flush(Stream *s)
     88 {
     89 	if (s->fd <= 0) {
     90 		s->errors = 1;
     91 	} else if (s->widx) {
     92 		s->errors = !os_write(s->fd, (s8){.len = s->widx, .s = s->data});
     93 		if (!s->errors) s->widx = 0;
     94 	}
     95 }
     96 
     97 static void
     98 stream_append_byte(Stream *s, u8 b)
     99 {
    100 	if (s->widx + 1 > s->cap)
    101 		stream_flush(s);
    102 	if (!s->errors)
    103 		s->data[s->widx++] = b;
    104 }
    105 
    106 static void
    107 stream_append_s8(Stream *s, s8 str)
    108 {
    109 	if (str.len > s->cap - s->widx)
    110 		stream_flush(s);
    111 	s->errors |= (s->cap - s->widx) < str.len;
    112 	if (!s->errors) {
    113 		for (size i = 0; i < str.len; i++)
    114 			s->data[s->widx++] = str.s[i];
    115 	}
    116 }
    117 
    118 static void
    119 stream_ensure_newline(Stream *s)
    120 {
    121 	if (s->widx && s->data[s->widx - 1] != '\n')
    122 		stream_append_byte(s, '\n');
    123 }
    124 
    125 #ifdef _DEBUG_ARENA
    126 static void
    127 stream_append_u64(Stream *s, u64 n)
    128 {
    129 	u8 tmp[64];
    130 	u8 *end = tmp + sizeof(tmp);
    131 	u8 *beg = end;
    132 	do { *--beg = '0' + (n % 10); } while (n /= 10);
    133 	stream_append_s8(s, (s8){.len = end - beg, .s = beg});
    134 }
    135 #endif
    136 
    137 static s8
    138 cstr_to_s8(char *cstr)
    139 {
    140 	s8 result = {.s = (u8 *)cstr};
    141 	if (cstr) while (*cstr) { result.len++; cstr++; }
    142 	return result;
    143 }
    144 
    145 static void __attribute__((noreturn))
    146 die(Stream *s)
    147 {
    148 	stream_ensure_newline(s);
    149 	stream_flush(s);
    150 	os_exit(1);
    151 }
    152 
    153 static void *
    154 mem_clear(void *p_, u8 c, size len)
    155 {
    156 	u8 *p = p_;
    157 	while (len) p[--len] = c;
    158 	return p;
    159 }
    160 
    161 enum arena_flags {
    162 	ARENA_NONE      = 0 << 0,
    163 	ARENA_NO_CLEAR  = 1 << 0,
    164 	ARENA_ALLOC_END = 1 << 1,
    165 };
    166 
    167 #define alloc(a, t, n, flags)  (t *)alloc_(a, sizeof(t), _Alignof(t), n, flags)
    168 static void *
    169 alloc_(Arena *a, size len, size align, size count, u32 flags)
    170 {
    171 	size padding;
    172 	if (flags & ARENA_ALLOC_END) padding =  (usize)a->end & (align - 1);
    173 	else                         padding = -(usize)a->beg & (align - 1);
    174 
    175 	size available = a->end - a->beg - padding;
    176 	if (available <= 0 || available / len <= count)
    177 		ASSERT(0);
    178 
    179 	void *result;
    180 	if (flags & ARENA_ALLOC_END) {
    181 		a->end -= padding + count * len;
    182 		result  = a->end;
    183 	} else {
    184 		result  = a->beg + padding;
    185 		a->beg += padding + count * len;
    186 	}
    187 
    188 #ifdef _DEBUG_ARENA
    189 	if (a->end - a->beg < a->min_capacity_remaining)
    190 		a->min_capacity_remaining = a->end - a->beg;
    191 #endif
    192 
    193 	if (flags & ARENA_NO_CLEAR) return result;
    194 	else                        return mem_clear(result, 0, count * len);
    195 }
    196 
    197 static void
    198 usage(s8 argv0)
    199 {
    200 	stream_append_s8(&error_stream, s8("usage: "));
    201 	stream_append_s8(&error_stream, argv0);
    202 	stream_append_s8(&error_stream, s8(" [-d path] [-F FS] [-i] term ...\n"));
    203 	die(&error_stream);
    204 }
    205 
    206 static s8
    207 s8_dup(Arena *a, s8 old)
    208 {
    209 	s8 result = {.len = old.len, .s = alloc(a, u8, old.len, ARENA_NO_CLEAR)};
    210 	for (size i = 0; i < old.len; i++)
    211 		result.s[i] = old.s[i];
    212 	return result;
    213 }
    214 
    215 static b32
    216 s8_equal(s8 a, s8 b)
    217 {
    218 	i32 result = 0;
    219 	if (a.len != b.len)
    220 		return 0;
    221 	/* NOTE: we assume short strings in this program */
    222 	for (size i = 0; i < a.len; i++)
    223 		result += b.s[i] - a.s[i];
    224 	return result == 0;
    225 }
    226 
    227 static s8
    228 s8_cut_head(s8 s, size count)
    229 {
    230 	s8 result   = s;
    231 	result.s   += count;
    232 	result.len -= count;
    233 	return result;
    234 }
    235 
    236 /*
    237  * trim whitespace from start and end of str
    238  * returns a new s8 (same memory)
    239  */
    240 static s8
    241 s8trim(s8 str)
    242 {
    243 	u8 *p = str.s + str.len - 1;
    244 
    245 	for (; str.len && ISSPACE(*p); str.len--, p--);
    246 	for (; str.len && ISSPACE(*str.s); str.len--, str.s++);
    247 
    248 	return str;
    249 }
    250 
    251 /* replace escaped control chars with their actual char */
    252 static s8
    253 unescape(s8 str)
    254 {
    255 	for (size i = 0; i < str.len; i++) {
    256 		if (str.s[i] == '\\') {
    257 			switch (str.s[i + 1]) {
    258 			case 'n': str.s[i] = '\n'; break;
    259 			case 't': str.s[i] = '\t'; break;
    260 			default: continue;
    261 			}
    262 			str.len--;
    263 			for (size j = i + 1; j < str.len; j++)
    264 				str.s[j] = str.s[j + 1];
    265 		}
    266 	}
    267 	return str;
    268 }
    269 
    270 /* FNV-1a hash */
    271 static u64
    272 hash(s8 v)
    273 {
    274 	u64 h = 0x3243f6a8885a308d; /* digits of pi */
    275 	for (; v.len; v.len--) {
    276 		h ^= v.s[v.len - 1] & 0xFF;
    277 		h *= 1111111111111111111; /* random prime */
    278 	}
    279 	return h;
    280 }
    281 
    282 static i32
    283 ht_lookup(u64 hash, int exp, i32 idx)
    284 {
    285 	u32 mask = ((u32)1 << exp) - 1;
    286 	u32 step = (hash >> (64 - exp)) | 1;
    287 	return (idx + step) & mask;
    288 }
    289 
    290 static DictEnt **
    291 intern(struct ht *t, s8 key)
    292 {
    293 	u64 h = hash(key);
    294 	i32 i = h;
    295 	for (;;) {
    296 		i = ht_lookup(h, HT_EXP, i);
    297 		if (!t->ents[i]) {
    298 			/* empty slot */
    299 			#ifdef _DEBUG
    300 			if ((u32)t->len + 1 == (u32)1<<(HT_EXP - 1)) {
    301 				stream_append_s8(&error_stream,
    302 				                 s8("intern: ht exceeded 0.5 fill factor\n"));
    303 			}
    304 			#endif
    305 			t->len++;
    306 			return t->ents + i;
    307 		} else if (s8_equal(t->ents[i]->term, key)) {
    308 			/* found; return the stored instance */
    309 			return t->ents + i;
    310 		}
    311 		/* NOTE: else relookup and try again */
    312 	}
    313 }
    314 
    315 static void
    316 parse_term_bank(Arena *a, struct ht *ht, s8 data)
    317 {
    318 	/* allocate tokens */
    319 	size ntoks = (1 << HT_EXP) * YOMI_TOKS_PER_ENT + 1;
    320 	YomiTok *toks = alloc(a, YomiTok, ntoks, ARENA_ALLOC_END|ARENA_NO_CLEAR);
    321 
    322 	YomiScanner s = {0};
    323 	yomi_scanner_init(&s, (char *)data.s, data.len);
    324 	i32 r;
    325 	while ((r = yomi_scan(&s, toks, ntoks)) < 0) {
    326 		switch (r) {
    327 		case YOMI_ERROR_NOMEM:
    328 			goto cleanup;
    329 		case YOMI_ERROR_INVAL:
    330 		case YOMI_ERROR_MALFO:
    331 			stream_append_s8(&error_stream, s8("yomi_parse: "));
    332 			if (r == YOMI_ERROR_INVAL)
    333 				stream_append_s8(&error_stream, s8("YOMI_ERROR_INVAL\n"));
    334 			else
    335 				stream_append_s8(&error_stream, s8("YOMI_ERROR_MALFO\n"));
    336 			goto cleanup;
    337 		}
    338 	}
    339 
    340 	for (i32 i = 0; i < r; i++) {
    341 		YomiTok *base_tok = toks + i;
    342 		if (base_tok->type != YOMI_ENTRY)
    343 			continue;
    344 
    345 		YomiTok *tstr = NULL, *tdefs = NULL;
    346 		for (usize j = 1; j < base_tok->len; j++) {
    347 
    348 			switch (base_tok[j].type) {
    349 			case YOMI_STR:
    350 				if (tstr == NULL)
    351 					tstr = base_tok + j;
    352 				break;
    353 			case YOMI_ARRAY:
    354 				if (tdefs == NULL)
    355 					tdefs = base_tok + j;
    356 			default: /* FALLTHROUGH */
    357 				break;
    358 			}
    359 		}
    360 
    361 		/* check if entry was valid */
    362 		if (tdefs == NULL || tstr == NULL) {
    363 			stream_append_s8(&error_stream, s8("parse_term_bank: invalid entry: got "));
    364 			if (!tdefs) stream_append_s8(&error_stream, s8("tdefs"));
    365 			else        stream_append_s8(&error_stream, s8("tstr"));
    366 			stream_append_s8(&error_stream, s8(" == NULL\n"));
    367 			break;
    368 		}
    369 
    370 		s8 mem_term = {.len = tstr->end - tstr->start, .s = data.s + tstr->start};
    371 		DictEnt **n = intern(ht, mem_term);
    372 
    373 		if (!*n) {
    374 			*n         = alloc(a, DictEnt, 1, 0);
    375 			(*n)->term = s8_dup(a, mem_term);
    376 		} else {
    377 			if (!s8_equal((*n)->term, mem_term)) {
    378 				stream_append_s8(&error_stream, s8("hash collision: "));
    379 				stream_append_s8(&error_stream, mem_term);
    380 				stream_append_byte(&error_stream, '\t');
    381 				stream_append_s8(&error_stream, (*n)->term);
    382 				stream_append_byte(&error_stream, '\n');
    383 			}
    384 		}
    385 
    386 		for (usize i = 1; i <= tdefs->len; i++) {
    387 			DictDef *def = alloc(a, DictDef, 1, ARENA_NO_CLEAR);
    388 			def->text = s8_dup(a, (s8){.len = tdefs[i].end - tdefs[i].start,
    389 			                           .s = data.s + tdefs[i].start});
    390 			def->next = (*n)->def;
    391 			(*n)->def = def;
    392 		}
    393 	}
    394 
    395 cleanup:
    396 	stream_ensure_newline(&error_stream);
    397 }
    398 
    399 static int
    400 make_dict(Arena *a, Dict *d)
    401 {
    402 	u8 *starting_arena_end = a->end;
    403 	Stream path = {.cap = 1 * MEGABYTE};
    404 	path.data   = alloc(a, u8, path.cap, ARENA_ALLOC_END|ARENA_NO_CLEAR);
    405 	d->ht.ents  = alloc(a, DictEnt *, 1 << HT_EXP, 0);
    406 
    407 	stream_append_s8(&path, prefix);
    408 	stream_append_s8(&path, os_path_sep);
    409 	stream_append_s8(&path, d->rom);
    410 	PathStream ps = os_begin_path_stream(&path, a, ARENA_ALLOC_END);
    411 
    412 	u8 *arena_end = a->end;
    413 	s8 fn_pre = s8("term");
    414 	for (s8 filedata = os_get_valid_file(&ps, fn_pre, a, ARENA_ALLOC_END);
    415 	     filedata.len;
    416 	     filedata = os_get_valid_file(&ps, fn_pre, a, ARENA_ALLOC_END))
    417 	{
    418 		parse_term_bank(a, &d->ht, filedata);
    419 		a->end = arena_end;
    420 	}
    421 	os_end_path_stream(&ps);
    422 
    423 	a->end = starting_arena_end;
    424 
    425 	return 1;
    426 }
    427 
    428 static void
    429 make_dicts(Arena *a, Dict *dicts, u32 ndicts)
    430 {
    431 	for (u32 i = 0; i < ndicts; i++) {
    432 		if (!make_dict(a, &dicts[i])) {
    433 			stream_append_s8(&error_stream, s8("make_dict failed for: "));
    434 			stream_append_s8(&error_stream, dicts[i].rom);
    435 			stream_append_byte(&error_stream, '\n');
    436 		}
    437 	}
    438 }
    439 
    440 static DictEnt *
    441 find_ent(s8 term, Dict *d)
    442 {
    443 	u64 h = hash(term);
    444 	i32 i = ht_lookup(h, HT_EXP, (i32)h);
    445 	return d->ht.ents[i];
    446 }
    447 
    448 static void
    449 find_and_print(s8 term, Dict *d)
    450 {
    451 	DictEnt *ent = find_ent(term, d);
    452 
    453 	if (!ent || !s8_equal(term, ent->term))
    454 		return;
    455 
    456 	b32 print_for_readability = s8_equal(fsep, s8("\n"));
    457 	b32 printed_header        = 0;
    458 	for (DictDef *def = ent->def; def; def = def->next) {
    459 		if (print_for_readability)
    460 			def->text = unescape(def->text);
    461 		/* NOTE: some dictionaries are "hand-made" by idiots and have definitions
    462 		 * with only white space in them */
    463 		def->text = s8trim(def->text);
    464 		if (def->text.len) {
    465 			if (!print_for_readability) {
    466 				stream_append_s8(&stdout_stream, d->name);
    467 			} else if (!printed_header) {
    468 				stream_append_s8(&stdout_stream, s8("\x1b[36;1m"));
    469 				stream_append_s8(&stdout_stream, d->name);
    470 				stream_append_s8(&stdout_stream, s8("\x1b[0m"));
    471 				printed_header = 1;
    472 			}
    473 
    474 			stream_append_s8(&stdout_stream, fsep);
    475 			stream_append_s8(&stdout_stream, def->text);
    476 			stream_append_byte(&stdout_stream, '\n');
    477 		}
    478 	}
    479 	if (print_for_readability && printed_header)
    480 		stream_append_byte(&stdout_stream, '\n');
    481 	stream_flush(&stdout_stream);
    482 }
    483 
    484 static void
    485 find_and_print_defs(Arena *a, Dict *dict, s8 *terms, u32 nterms)
    486 {
    487 	if (!make_dict(a, dict)) {
    488 		stream_append_s8(&error_stream, s8("failed to allocate dict: "));
    489 		stream_append_s8(&error_stream, dict->rom);
    490 		stream_append_byte(&stdout_stream, '\n');
    491 		return;
    492 	}
    493 
    494 	for (u32 i = 0; i < nterms; i++)
    495 		find_and_print(terms[i], dict);
    496 }
    497 
    498 static b32
    499 get_stdin_line(Stream *buf)
    500 {
    501 	b32 result = 0;
    502 	for (; buf->widx < buf->cap; buf->widx++) {
    503 		u8 *c = buf->data + buf->widx;
    504 		if (!os_read_stdin(c, 1) || *c == (u8)-1) {
    505 			break;
    506 		} else if (*c == '\n') {
    507 			result = 1;
    508 			break;
    509 		}
    510 	}
    511 	return result;
    512 }
    513 
    514 static void
    515 repl(Arena *a, Dict *dicts, u32 ndicts)
    516 {
    517 	Stream buf = {.cap = 4096};
    518 	buf.data   = alloc(a, u8, buf.cap, ARENA_NO_CLEAR);
    519 
    520 	make_dicts(a, dicts, ndicts);
    521 
    522 	fsep = s8("\n");
    523 	for (;;) {
    524 		stream_append_s8(&stdout_stream, repl_prompt);
    525 		stream_flush(&stdout_stream);
    526 		if (!get_stdin_line(&buf))
    527 			break;
    528 		s8 trimmed = s8trim((s8){.len = buf.widx, .s = buf.data});
    529 		for (u32 i = 0; i < ndicts; i++)
    530 			find_and_print(trimmed, &dicts[i]);
    531 		buf.widx = 0;
    532 	}
    533 	stream_append_s8(&stdout_stream, repl_quit);
    534 }
    535 
    536 static i32
    537 jdict(Arena *a, i32 argc, char *argv[])
    538 {
    539 	Dict *dicts = NULL;
    540 	i32 ndicts = 0, nterms = 0;
    541 	i32 iflag = 0;
    542 
    543 	s8 argv0 = cstr_to_s8(argv[0]);
    544 	for (argv++, argc--; argv[0] && argv[0][0] == '-' && argv[0][1]; argc--, argv++) {
    545 		/* NOTE: '--' to end parameters */
    546 		if (argv[0][1] == '-' && argv[0][2] == 0) {
    547 			argv++;
    548 			argc--;
    549 			break;
    550 		}
    551 		switch (argv[0][1]) {
    552 		case 'F':
    553 			if (!argv[1] || !argv[1][0])
    554 				usage(argv0);
    555 			fsep = unescape(cstr_to_s8(argv[1]));
    556 			argv++;
    557 			break;
    558 		case 'd': {
    559 			if (!argv[1] || !argv[1][0])
    560 				usage(argv0);
    561 			s8 dname = cstr_to_s8(argv[1]);
    562 			for (u32 j = 0; j < ARRAY_COUNT(default_dict_map); j++) {
    563 				if (s8_equal(dname, default_dict_map[j].rom)) {
    564 					dicts = &default_dict_map[j];
    565 					ndicts++;
    566 					break;
    567 				}
    568 			}
    569 			if (dicts == NULL) {
    570 				stream_append_s8(&error_stream, s8("invalid dictionary name: "));
    571 				stream_append_s8(&error_stream, dname);
    572 				die(&error_stream);
    573 			}
    574 			argv++;
    575 		} break;
    576 		case 'i': iflag = 1;   break;
    577 		default: usage(argv0); break;
    578 		}
    579 	}
    580 
    581 	if (ndicts == 0) {
    582 		dicts  = default_dict_map;
    583 		ndicts = ARRAY_COUNT(default_dict_map);
    584 	}
    585 
    586 	/* NOTE: remaining argv elements are search terms */
    587 	nterms = argc;
    588 	s8 *terms = alloc(a, s8, nterms, 0);
    589 	for (i32 i = 0; argc && *argv; argv++, i++, argc--)
    590 		terms[i] = cstr_to_s8(*argv);
    591 
    592 	if (nterms == 0 && iflag == 0)
    593 		usage(argv0);
    594 
    595 	if (iflag == 0)
    596 		for (i32 i = 0; i < ndicts; i++)
    597 			find_and_print_defs(a, &dicts[i], terms, nterms);
    598 	else
    599 		repl(a, dicts, ndicts);
    600 
    601 #ifdef _DEBUG_ARENA
    602 	stream_append_s8(&error_stream, s8("min remaining arena capacity: "));
    603 	stream_append_u64(&error_stream, memory.min_capacity_remaining);
    604 	stream_append_s8(&error_stream, s8("\nremaining arena capacity: "));
    605 	stream_append_u64(&error_stream, memory.end - memory.beg);
    606 #endif
    607 
    608 	stream_ensure_newline(&error_stream);
    609 	stream_flush(&error_stream);
    610 
    611 	stream_ensure_newline(&stdout_stream);
    612 	stream_flush(&stdout_stream);
    613 
    614 	return 0;
    615 }