jdict

command line tool for looking up terms in yomidict dictionaries
git clone anongit@rnpnr.xyz:jdict.git
Log | Files | Refs | Feed | README | LICENSE

jdict.c (7809B)


      1 /* See LICENSE for license details. */
      2 #include <dirent.h>
      3 #include <fcntl.h>
      4 #include <limits.h>
      5 #include <stddef.h>
      6 #include <stdint.h>
      7 #include <stdio.h>
      8 #include <stdlib.h>
      9 #include <string.h>
     10 #include <sys/mman.h>
     11 #include <unistd.h>
     12 
     13 #include "arg.h"
     14 #include "util.c"
     15 #include "yomidict.c"
     16 
     17 #define YOMI_TOKS_PER_ENT 10
     18 
     19 /* buffer length for interactive mode */
     20 #define BUFLEN 256
     21 
     22 /* Number of hash table slots (1 << HT_EXP) */
     23 #define HT_EXP 20
     24 
     25 typedef uint64_t u64;
     26 typedef uint32_t u32;
     27 typedef int32_t  i32;
     28 
     29 typedef struct {
     30 	s8 term;
     31 	s8 *defs;
     32 	size_t ndefs;
     33 } DictEnt;
     34 
     35 struct ht {
     36 	DictEnt **ents;
     37 	i32 len;
     38 };
     39 
     40 typedef struct {
     41 	const char *rom;
     42 	const char *name;
     43 	struct ht ht;
     44 } Dict;
     45 
     46 #include "config.h"
     47 
     48 char *argv0;
     49 
     50 static void
     51 usage(void)
     52 {
     53 	die("usage: %s [-d path] [-F FS] [-i] term ...\n", argv0);
     54 }
     55 
     56 static void
     57 merge_ents(DictEnt *a, DictEnt *b)
     58 {
     59 	size_t i, nlen = a->ndefs + b->ndefs;
     60 
     61 	if (nlen == 0)
     62 		return;
     63 
     64 	a->defs = xreallocarray(a->defs, nlen, sizeof(s8));
     65 
     66 	for (i = 0; i < b->ndefs; i++)
     67 		a->defs[a->ndefs + i] = b->defs[i];
     68 	a->ndefs = nlen;
     69 }
     70 
     71 /* FNV-1a hash */
     72 static u64
     73 hash(s8 v)
     74 {
     75 	u64 h = 0x3243f6a8885a308d; /* digits of pi */
     76 	for (; v.len; v.len--) {
     77 		h ^= v.s[v.len - 1] & 0xFF;
     78 		h *= 1111111111111111111; /* random prime */
     79 	}
     80 	return h;
     81 }
     82 
     83 static i32
     84 ht_lookup(u64 hash, int exp, i32 idx)
     85 {
     86 	u32 mask = ((u32)1 << exp) - 1;
     87 	u32 step = (hash >> (64 - exp)) | 1;
     88 	return (idx + step) & mask;
     89 }
     90 
     91 static DictEnt *
     92 intern(struct ht *t, DictEnt *e)
     93 {
     94 	s8 key = e->term;
     95 	u64 h = hash(key);
     96 	i32 i = h;
     97 	for (;;) {
     98 		i = ht_lookup(h, HT_EXP, i);
     99 		if (!t->ents[i]) {
    100 			/* empty slot */
    101 			if ((u32)t->len + 1 == (u32)1<<(HT_EXP - 1)) {
    102 				fputs("intern: ht exceeded 0.5 fill factor\n", stderr);
    103 				return NULL;
    104 			}
    105 			t->len++;
    106 			t->ents[i] = e;
    107 			return e;
    108 		} else if (!s8cmp(t->ents[i]->term, e->term)) {
    109 			/* found; return the stored instance */
    110 			return t->ents[i];
    111 		}
    112 	}
    113 }
    114 
    115 static size_t
    116 count_term_banks(const char *path)
    117 {
    118 	DIR *dir;
    119 	struct dirent *dent;
    120 	size_t nbanks = 0;
    121 
    122 	if (!(dir = opendir(path)))
    123 		die("opendir(): failed to open: %s\n", path);
    124 
    125 	/* count term banks in path */
    126 	while ((dent = readdir(dir)) != NULL)
    127 		if (dent->d_type == DT_REG)
    128 			nbanks++;
    129 	/* remove index.json from count */
    130 	nbanks--;
    131 
    132 	closedir(dir);
    133 	return nbanks;
    134 }
    135 
    136 /* takes a token of type YOMI_ENTRY and creates a DictEnt */
    137 static DictEnt *
    138 make_ent(YomiTok *toks, char *data)
    139 {
    140 	size_t i;
    141 	DictEnt *d;
    142 	YomiTok *tstr = NULL, *tdefs = NULL;
    143 
    144 	if (toks[0].type != YOMI_ENTRY) {
    145 		fprintf(stderr, "toks[0].type = %d\n", toks[0].type);
    146 		return NULL;
    147 	}
    148 
    149 	for (i = 1; i < toks[0].len; i++)
    150 		switch (toks[i].type) {
    151 		case YOMI_STR:
    152 			if (tstr == NULL)
    153 				tstr = &toks[i];
    154 			break;
    155 		case YOMI_ARRAY:
    156 			if (tdefs == NULL)
    157 				tdefs = &toks[i];
    158 		default: /* FALLTHROUGH */
    159 			break;
    160 		}
    161 
    162 	/* check if entry was valid */
    163 	if (tdefs == NULL || tstr == NULL) {
    164 		fprintf(stderr, "make_ent: %s == NULL\n",
    165 		        tdefs == NULL? "tdefs" : "tstr");
    166 		return NULL;
    167 	}
    168 
    169 	d = xreallocarray(NULL, 1, sizeof(DictEnt));
    170 	d->term = s8dup(data + tstr->start, tstr->end - tstr->start);
    171 	d->ndefs = tdefs->len;
    172 	d->defs = xreallocarray(NULL, d->ndefs, sizeof(s8));
    173 	for (i = 1; i <= d->ndefs; i++)
    174 		d->defs[i - 1] = s8dup(data + tdefs[i].start,
    175 		                       tdefs[i].end - tdefs[i].start);
    176 
    177 	return d;
    178 }
    179 
    180 static void
    181 parse_term_bank(struct ht *ht, const char *tbank)
    182 {
    183 	int i = 0, r, ntoks, fd;
    184 	size_t flen;
    185 	char *data;
    186 	YomiTok *toks = NULL;
    187 	YomiScanner s = {0};
    188 	DictEnt *e, *n;
    189 
    190 	if ((fd = open(tbank, O_RDONLY)) < 0)
    191 		die("can't open file: %s\n", tbank);
    192 	flen = lseek(fd, 0, SEEK_END);
    193 	data = mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);
    194 	close(fd);
    195 
    196 	if (data == MAP_FAILED)
    197 		die("couldn't mmap file: %s\n", tbank);
    198 
    199 	/* allocate tokens */
    200 	ntoks = (1 << HT_EXP) * YOMI_TOKS_PER_ENT + 1;
    201 	toks = xreallocarray(toks, ntoks, sizeof(YomiTok));
    202 
    203 	yomi_scanner_init(&s, data, flen);
    204 	while ((r = yomi_scan(&s, toks, ntoks)) < 0) {
    205 		switch (r) {
    206 		case YOMI_ERROR_NOMEM:
    207 			goto cleanup;
    208 		case YOMI_ERROR_INVAL:
    209 		case YOMI_ERROR_MALFO:
    210 			fprintf(stderr, "yomi_parse: %s\n",
    211 			        r == YOMI_ERROR_INVAL? "YOMI_ERROR_INVAL"
    212 			        : "YOMI_ERROR_MALFO");
    213 			goto cleanup;
    214 		}
    215 	}
    216 
    217 	for (i = 0; i < r; i++) {
    218 		if (toks[i].type != YOMI_ENTRY)
    219 			continue;
    220 
    221 		if ((e = make_ent(&toks[i], data)) == NULL)
    222 			break;
    223 		if ((n = intern(ht, e)) == NULL)
    224 			break;
    225 		if (n == e)
    226 			continue;
    227 		/* hash table entry already exists, append new defs */
    228 		if (s8cmp(n->term, e->term)) {
    229 			fputs("hash collision: ", stderr);
    230 			fwrite(e->term.s, e->term.len, 1, stderr);
    231 			fputc('\t', stderr);
    232 			fwrite(n->term.s, n->term.len, 1, stderr);
    233 			fputc('\n', stderr);
    234 		}
    235 		merge_ents(n, e);
    236 		free(e->term.s);
    237 		free(e->defs);
    238 		free(e);
    239 	}
    240 
    241 cleanup:
    242 	munmap(data, flen);
    243 	free(toks);
    244 }
    245 
    246 static int
    247 make_dict(Dict *d)
    248 {
    249 	char path[PATH_MAX - 20], tbank[PATH_MAX];
    250 	size_t nbanks;
    251 
    252 	d->ht.ents = xreallocarray(NULL, sizeof(DictEnt *), 1 << HT_EXP);
    253 
    254 	snprintf(path, LEN(path), "%s/%s", prefix, d->rom);
    255 	if ((nbanks = count_term_banks(path)) == 0) {
    256 		fprintf(stderr, "no term banks found: %s\n", path);
    257 		return 0;
    258 	}
    259 
    260 	for (size_t i = 1; i <= nbanks; i++) {
    261 		snprintf(tbank, LEN(tbank), "%s/term_bank_%zu.json", path, i);
    262 		parse_term_bank(&d->ht, tbank);
    263 	}
    264 
    265 	return 1;
    266 }
    267 
    268 static void
    269 make_dicts(Dict *dicts, size_t ndicts)
    270 {
    271 	for (size_t i = 0; i < ndicts; i++)
    272 		if (!make_dict(&dicts[i]))
    273 			die("make_dict(%s): returned NULL\n", dicts[i].rom);
    274 }
    275 
    276 static DictEnt *
    277 find_ent(s8 term, Dict *d)
    278 {
    279 	u64 h = hash(term);
    280 	i32 i = ht_lookup(h, HT_EXP, (i32)h);
    281 	return d->ht.ents[i];
    282 }
    283 
    284 static void
    285 find_and_print(s8 term, Dict *d)
    286 {
    287 	DictEnt *ent = find_ent(term, d);
    288 	size_t i;
    289 
    290 	if (!ent || s8cmp(term, ent->term))
    291 		return;
    292 
    293 	for (i = 0; i < ent->ndefs; i++) {
    294 		if (!s8cmp(fsep, s8("\n")))
    295 			ent->defs[i] = unescape(ent->defs[i]);
    296 		fputs(d->name, stdout);
    297 		fwrite(fsep.s, fsep.len, 1, stdout);
    298 		fwrite(ent->defs[i].s, ent->defs[i].len, 1, stdout);
    299 		fputc('\n', stdout);
    300 	}
    301 }
    302 
    303 static void
    304 find_and_print_defs(Dict *dict, s8 *terms, size_t nterms)
    305 {
    306 	size_t i;
    307 
    308 	if (!make_dict(dict)) {
    309 		fputs("failed to allocate dict: ", stdout);
    310 		puts(dict->rom);
    311 		return;
    312 	}
    313 
    314 	for (i = 0; i < nterms; i++)
    315 		find_and_print(terms[i], dict);
    316 }
    317 
    318 static void
    319 repl(Dict *dicts, size_t ndicts)
    320 {
    321 	char t[BUFLEN];
    322 	s8 buf = {t, BUFLEN};
    323 	size_t i;
    324 
    325 	make_dicts(dicts, ndicts);
    326 
    327 	fsep = s8("\n");
    328 	for (;;) {
    329 		fputs(repl_prompt, stdout);
    330 		fflush(stdout);
    331 		buf.len = BUFLEN;
    332 		if (fgets(buf.s, buf.len, stdin) == NULL)
    333 			break;
    334 		buf.len = strlen(buf.s);
    335 		for (i = 0; i < ndicts; i++)
    336 			find_and_print(s8trim(buf), &dicts[i]);
    337 	}
    338 	puts(repl_quit);
    339 }
    340 
    341 int
    342 main(int argc, char *argv[])
    343 {
    344 	s8 *terms = NULL;
    345 	char *t;
    346 	Dict *dicts = NULL;
    347 	size_t i, ndicts = 0, nterms = 0;
    348 	int iflag = 0;
    349 
    350 	argv0 = argv[0];
    351 
    352 	ARGBEGIN {
    353 	case 'd':
    354 		t = EARGF(usage());
    355 		for (i = 0; i < LEN(default_dict_map); i++) {
    356 			if (strcmp(t, default_dict_map[i].rom) == 0) {
    357 				dicts = &default_dict_map[i];
    358 				ndicts++;
    359 				break;
    360 			}
    361 		}
    362 		if (dicts == NULL)
    363 			die("invalid dictionary name: %s\n", t);
    364 		break;
    365 	case 'F':
    366 		t = EARGF(usage());
    367 		fsep = unescape((s8){t, strlen(t)});
    368 		break;
    369 	case 'i':
    370 		iflag = 1;
    371 		break;
    372 	default:
    373 		usage();
    374 	} ARGEND
    375 
    376 	if (ndicts == 0) {
    377 		dicts = default_dict_map;
    378 		ndicts = LEN(default_dict_map);
    379 	}
    380 
    381 	/* remaining argv elements are terms to search for */
    382 	for (i = 0; argc && *argv; argv++, i++, argc--) {
    383 		terms = xreallocarray(terms, ++nterms, sizeof(s8));
    384 		terms[i].s = *argv;
    385 		terms[i].len = strlen(terms[i].s);
    386 	}
    387 
    388 	if (nterms == 0 && iflag == 0)
    389 		usage();
    390 
    391 	if (iflag == 0)
    392 		for (i = 0; i < ndicts; i++)
    393 			find_and_print_defs(&dicts[i], terms, nterms);
    394 	else
    395 		repl(dicts, ndicts);
    396 
    397 	free(terms);
    398 
    399 	return 0;
    400 }