jdict

command line tool for looking up terms in yomidict dictionaries
git clone anongit@rnpnr.xyz:jdict.git
Log | Files | Refs | Feed | README | LICENSE

Commit: f344c7ef726eb97e83f04b77e5b67094b5b3b1cd
Author: Randy Palamar
Date:   Sat, 18 Jun 2022 23:30:22 -0600

initial import of jdict

this program lets you search yomidict dictionaries from the command line

Diffstat:
A LICENSE | 15 +++++++++++++++
A Makefile | 29 +++++++++++++++++++++++++++++
A arg.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
A config.def.h | 14 ++++++++++++++
A config.mk | 6 ++++++
A jdict.c | 340 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A util.c | 31 +++++++++++++++++++++++++++++++
A util.h | 15 +++++++++++++++
A yomidict.c | 259 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A yomidict.h | 41 +++++++++++++++++++++++++++++++++++++++++
10 files changed, 800 insertions(+), 0 deletions(-)

diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License (ISC)
+
+© 2022 Randy Palamar <palamar@ualberta.ca>
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,29 @@
+# See LICENSE for license details.
+include config.mk
+
+SRC = jdict.c yomidict.c util.c
+OBJ = $(SRC:.c=.o)
+
+default: jdict
+
+config.h:
+	cp config.def.h $@
+
+.c.o:
+	$(CC) $(CFLAGS) -o $@ -c $<
+
+$(OBJ): config.h
+
+jdict: $(OBJ)
+	$(CC) -o $@ $(OBJ) $(LDFLAGS)
+
+install: jdict
+	mkdir -p $(PREFIX)/bin
+	cp jdict $(PREFIX)/bin
+	chmod 755 $(PREFIX)/bin/jdict
+
+uninstall:
+	rm -f $(PREFIX)/bin/jdict
+
+clean:
+	rm -f *.o jdict
diff --git a/arg.h b/arg.h
@@ -0,0 +1,50 @@
+/*
+ * Copy me if you can.
+ * by 20h
+ */
+
+#ifndef ARG_H__
+#define ARG_H__
+
+extern char *argv0;
+
+/* use main(int argc, char *argv[]) */
+#define ARGBEGIN	for (argv0 = *argv, argv++, argc--;\
+					argv[0] && argv[0][0] == '-'\
+					&& argv[0][1];\
+					argc--, argv++) {\
+				char argc_;\
+				char **argv_;\
+				int brk_;\
+				if (argv[0][1] == '-' && argv[0][2] == '\0') {\
+					argv++;\
+					argc--;\
+					break;\
+				}\
+				int i_;\
+				for (i_ = 1, brk_ = 0, argv_ = argv;\
+						argv[0][i_] && !brk_;\
+						i_++) {\
+					if (argv_ != argv)\
+						break;\
+					argc_ = argv[0][i_];\
+					switch (argc_)
+
+#define ARGEND			}\
+			}
+
+#define ARGC()		argc_
+
+#define EARGF(x)	((argv[0][i_+1] == '\0' && argv[1] == NULL)?\
+				((x), abort(), (char *)0) :\
+				(brk_ = 1, (argv[0][i_+1] != '\0')?\
+					(&argv[0][i_+1]) :\
+					(argc--, argv++, argv[0])))
+
+#define ARGF()		((argv[0][i_+1] == '\0' && argv[1] == NULL)?\
+				(char *)0 :\
+				(brk_ = 1, (argv[0][i_+1] != '\0')?\
+					(&argv[0][i_+1]) :\
+					(argc--, argv++, argv[0])))
+
+#endif
diff --git a/config.def.h b/config.def.h
@@ -0,0 +1,14 @@
+/* See LICENSE for license details. */
+
+/* max terms per term bank, all dicts should use this stride */
+#define DICT_STRIDE 10000
+
+/* dir where unzipped yomidicts are stored */
+static char *prefix = "/usr/share/yomidicts";
+
+/* default yomidicts to search */
+static char *default_dicts[] = {
+	"daijirin",
+	"daijisen",
+	"koujien"
+};
diff --git a/config.mk b/config.mk
@@ -0,0 +1,6 @@
+# See LICENSE for license details.
+PREFIX = /usr/local
+
+CPPFLAGS = -D_BSD_SOURCE -D_DEFAULT_SOURCE
+CFLAGS = -O2 -std=c99 -Wall -pedantic $(CPPFLAGS) $(INCS)
+LDFLAGS = -s -static
diff --git a/jdict.c b/jdict.c
@@ -0,0 +1,340 @@
+/* See LICENSE for license details. */
+#include <dirent.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "arg.h"
+#include "util.h"
+#include "yomidict.h"
+
+#include "config.h"
+
+#define YOMI_TOKS_PER_ENT 10
+
+typedef struct {
+	char *term;
+	char **defs;
+	size_t ndefs;
+} DictEnt;
+
+char *argv0;
+
+/* releases the dict and term lists built by main() */
+static void
+cleanup(char **dicts, char **terms)
+{
+	/* default_dicts is static storage and must not be freed */
+	if (dicts != default_dicts)
+		free(dicts);
+	free(terms);
+}
+
+/* prints the usage message and exits */
+static void
+usage(void)
+{
+	die("usage: %s [-d path] term ...\n", argv0);
+}
+
+/*
+ * takes a token of type YOMI_ENTRY and creates a DictEnt
+ *
+ * data is the term bank text that tok indexes into. The returned DictEnt
+ * and the strings it points to are heap allocated and owned by the caller.
+ * Returns NULL when tok is not a YOMI_ENTRY.
+ */
+static DictEnt *
+make_ent(YomiTok *tok, char *data)
+{
+	size_t i;
+	DictEnt *d;
+	YomiTok *tstr, *tdefs;
+
+	if (tok->type != YOMI_ENTRY)
+		return NULL;
+
+	/* FIXME: hacky but works */
+	/* term = YOMI_ENT tok + 1 */
+	tstr = tok + 1;
+	/* definition array = YOMI_ENT tok + 6 */
+	tdefs = tok + 6;
+
+	d = xreallocarray(NULL, 1, sizeof(DictEnt));
+	d->term = strndup(data + tstr->start, tstr->end - tstr->start);
+	if (d->term == NULL)
+		die("strndup()\n");
+	d->ndefs = tdefs->len;
+	/* always allocate at least one slot: a zero sized request may return NULL */
+	d->defs = xreallocarray(NULL, d->ndefs ? d->ndefs : 1, sizeof(char *));
+	for (i = 1; i <= d->ndefs; i++) {
+		d->defs[i-1] = strndup(data + (tdefs + i)->start,
+			(tdefs + i)->end - (tdefs + i)->start);
+		if (d->defs[i-1] == NULL)
+			die("strndup()\n");
+	}
+
+	return d;
+}
+
+/*
+ * parses the term bank file tbank and appends its entries to ents
+ *
+ * toks is scratch space for ntoks tokens; nents is updated with the new
+ * entry count. Returns the (possibly moved) array or NULL if parsing failed.
+ * Dies if the file cannot be opened, is empty or cannot be mapped.
+ */
+static DictEnt *
+parse_term_bank(DictEnt *ents, size_t *nents, const char *tbank, YomiTok *toks, size_t ntoks)
+{
+	int fd;
+	off_t flen;
+	ssize_t r, i;
+	size_t found = 0, need;
+	char *data;
+	YomiParser p;
+	DictEnt *e;
+
+	if ((fd = open(tbank, O_RDONLY)) < 0)
+		die("open(): failed to open: %s\n", tbank);
+	if ((flen = lseek(fd, 0, SEEK_END)) <= 0)
+		die("lseek(): empty or unreadable: %s\n", tbank);
+	data = mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (data == MAP_FAILED)
+		die("mmap(): failed to map: %s\n", tbank);
+	/* the mapping stays valid after the descriptor is closed */
+	close(fd);
+
+	yomi_init(&p);
+	r = yomi_parse(&p, toks, ntoks, data, flen);
+	if (r < 0) {
+		munmap(data, flen);
+		return NULL;
+	}
+
+	/* count the entries so the array is grown by exactly what is needed */
+	for (i = 0; i < r; i++)
+		if (toks[i].type == YOMI_ENTRY)
+			found++;
+
+	/* never request 0 elements: callers treat a NULL return as failure */
+	need = *nents + found;
+	ents = xreallocarray(ents, need ? need : 1, sizeof(DictEnt));
+	for (i = 0; i < r; i++) {
+		if (toks[i].type != YOMI_ENTRY)
+			continue;
+		e = make_ent(&toks[i], data);
+		if (e == NULL) {
+			munmap(data, flen);
+			return NULL;
+		}
+		/* copy the entry into the array and drop the temporary shell */
+		memcpy(&ents[(*nents)++], e, sizeof(DictEnt));
+		free(e);
+	}
+
+	munmap(data, flen);
+
+	return ents;
+}
+
+/*
+ * loads every term bank found in the dictionary dir path
+ *
+ * stride is the max number of terms per bank and sizes the token scratch
+ * buffer. nents is incremented by the number of entries loaded. Returns the
+ * entry array or NULL if no bank was loaded or a bank failed to parse.
+ */
+static DictEnt *
+make_dict(const char *path, size_t stride, size_t *nents)
+{
+	char tbank[PATH_MAX];
+	size_t i, ntoks, nbanks = 0;
+	DIR *dir;
+	struct dirent *dent;
+	YomiTok *toks = NULL;
+	DictEnt *dict = NULL;
+
+	ntoks = stride * YOMI_TOKS_PER_ENT + 1;
+	if ((ntoks - 1) / YOMI_TOKS_PER_ENT != stride)
+		die("stride multiplication overflowed: %s\n", path);
+
+	if (!(dir = opendir(path)))
+		die("opendir(): failed to open: %s\n", path);
+
+	/* count term banks in path; other files (index.json, tag banks) are skipped */
+	while ((dent = readdir(dir)) != NULL)
+		if (strncmp(dent->d_name, "term_bank_", 10) == 0)
+			nbanks++;
+
+	closedir(dir);
+
+	toks = xreallocarray(toks, ntoks, sizeof(YomiTok));
+
+	for (i = 1; i <= nbanks; i++) {
+		snprintf(tbank, sizeof(tbank), "%s/term_bank_%d.json", path, (int)i);
+		dict = parse_term_bank(dict, nents, tbank, toks, ntoks);
+		if (dict == NULL)
+			break;
+	}
+	free(toks);
+
+	return dict;
+}
+
+/* qsort() comparison function ordering DictEnts by term */
+static int
+entcmp(const void *va, const void *vb)
+{
+	const DictEnt *a = va, *b = vb;
+	return strcmp(a->term, b->term);
+}
+
+/*
+ * binary search for term in ents, which must be sorted with entcmp()
+ *
+ * Returns the matching entry or NULL if term is not present.
+ */
+static DictEnt *
+find_ent(const char *term, DictEnt *ents, size_t nents)
+{
+	size_t mid;
+	int r;
+
+	while (nents > 0) {
+		mid = nents / 2;
+		r = strcmp(term, ents[mid].term);
+		if (r == 0)
+			return &ents[mid];
+		if (r < 0) {
+			/* keep the mid elements before ents[mid] */
+			nents = mid;
+		} else {
+			/* keep the nents - mid - 1 elements after ents[mid] */
+			ents += mid + 1;
+			nents -= mid + 1;
+		}
+	}
+
+	return NULL;
+}
+
+/* replaces each escaped "\n" sequence in str with a real newline, in place */
+static char *
+fix_newlines(char *str)
+{
+	char *t = str;
+
+	while ((t = strstr(t, "\\n")) != NULL) {
+		t[0] = '\n';
+		t++;
+		/* shift the tail (including the NUL) left over the stale 'n' */
+		memmove(t, t + 1, strlen(t + 1) + 1);
+	}
+
+	return str;
+}
+
+/* prints every definition of ent, one per line */
+static void
+print_ent(DictEnt *ent)
+{
+	size_t i;
+	for (i = 0; i < ent->ndefs; i++)
+		printf("%s\n", fix_newlines(ent->defs[i]));
+}
+
+/*
+ * looks up each term in each dict and prints the definitions found
+ *
+ * Returns 0 if every term was found in every dict and -1 if a dict failed
+ * to load or any term was missing.
+ */
+static int
+find_and_print_defs(char **terms, size_t nterms, char **dicts, size_t ndicts)
+{
+	/* NOTE: the 18 presumably reserves room for the bank file name added in make_dict() */
+	char path[PATH_MAX - 18];
+	size_t i, j, k;
+	size_t nents;
+	int ret = 0;
+	DictEnt *ent, *ents;
+
+	for (i = 0; i < ndicts; i++) {
+		snprintf(path, LEN(path), "%s/%s", prefix, dicts[i]);
+		nents = 0;
+		ents = make_dict(path, DICT_STRIDE, &nents);
+		if (ents == NULL)
+			return -1;
+		qsort(ents, nents, sizeof(DictEnt), entcmp);
+
+		printf("%s\n", dicts[i]);
+		for (j = 0; j < nterms; j++) {
+			ent = find_ent(terms[j], ents, nents);
+			if (ent == NULL) {
+				/* keep going: the term may be in another dict */
+				printf("term not found:%s\n", terms[j]);
+				ret = -1;
+				continue;
+			}
+			print_ent(ent);
+		}
+
+		for (j = 0; j < nents; j++) {
+			for (k = 0; k < ents[j].ndefs; k++)
+				free(ents[j].defs[k]);
+			free(ents[j].defs);
+			free(ents[j].term);
+		}
+		free(ents);
+	}
+	return ret;
+}
+
+/* parses options, collects the search terms and prints their definitions */
+int
+main(int argc, char *argv[])
+{
+	char **dicts = NULL, **terms = NULL;
+	size_t ndicts = 0, nterms = 0;
+	int i, r;
+
+	argv0 = argv[0];
+
+	ARGBEGIN {
+	case 'd':
+		/* append so that repeated -d flags are all searched */
+		dicts = xreallocarray(dicts, ++ndicts, sizeof(char *));
+		dicts[ndicts - 1] = EARGF(usage());
+		break;
+	default:
+		usage();
+	} ARGEND
+
+	if (ndicts == 0) {
+		dicts = default_dicts;
+		ndicts = LEN(default_dicts);
+	}
+
+	/* remaining argv elements are terms to search for */
+	for (i = 0; argc && *argv; argv++, i++, argc--) {
+		terms = xreallocarray(terms, ++nterms, sizeof(char *));
+		terms[i] = *argv;
+	}
+
+	if (nterms == 0) {
+		cleanup(dicts, terms);
+		usage();
+	}
+
+	r = find_and_print_defs(terms, nterms, dicts, ndicts);
+
+	cleanup(dicts, terms);
+
+	return r == 0 ? 0 : 1;
+}
diff --git a/util.c b/util.c
@@ -0,0 +1,31 @@
+/* See LICENSE for license details. */
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util.h"
+
+/* prints a printf() style message to stderr and exits with status 1 */
+void
+die(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+
+	exit(1);
+}
+
+/* reallocarray() wrapper that dies instead of returning NULL */
+void *
+xreallocarray(void *o, size_t n, size_t s)
+{
+	void *new;
+
+	if (!(new = reallocarray(o, n, s)))
+		die("reallocarray()\n");
+
+	return new;
+}
diff --git a/util.h b/util.h
@@ -0,0 +1,15 @@
+/* See LICENSE for license details. */
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <stddef.h>
+
+/* number of elements in the array a; a must be an array, not a pointer */
+#define LEN(a) (sizeof(a) / sizeof(*(a)))
+
+/* prints a printf() style message to stderr and exits with status 1 */
+void die(const char *fmt, ...);
+/* reallocarray() wrapper that dies instead of returning NULL */
+void *xreallocarray(void *o, size_t n, size_t s);
+
+#endif
diff --git a/yomidict.c b/yomidict.c
@@ -0,0 +1,259 @@
+/* See LICENSE for license details.
+ *
+ * yomidict.c implements a simple parser for yomichan dictionary text. This is
+ * all it knows how to do. Finding and reading term banks as well as searching
+ * through parsed entries should be implemented elsewhere.
+ */
+#include <ctype.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+#include "yomidict.h"
+
+/* resets p so that parsing starts at the beginning of a new bank */
+void
+yomi_init(YomiParser *p)
+{
+	p->pos = 0;
+	p->toknext = 0;
+	p->parent = -1;
+}
+
+/*
+ * hands out the next unused token from toks with its fields reset
+ *
+ * Returns NULL when all ntoks tokens are in use.
+ */
+static YomiTok *
+yomi_alloc_tok(YomiParser *p, YomiTok *toks, size_t ntoks)
+{
+	YomiTok *t;
+
+	if (ntoks <= p->toknext)
+		return NULL;
+
+	t = &toks[p->toknext++];
+	t->type = YOMI_UNDEF;
+	t->parent = -1;
+	t->start = -1;
+	t->end = -1;
+	t->len = 0;
+
+	return t;
+}
+
+/*
+ * parses the string starting at the opening quote s[p->pos] into t
+ *
+ * On success t spans the text between the quotes, p->pos is left on the
+ * closing quote and 0 is returned. On failure p->pos is restored and
+ * YOMI_ERROR_INVAL (bad escape) or YOMI_ERROR_MALFO (unterminated string)
+ * is returned.
+ */
+static int
+yomi_parse_str(YomiParser *p, YomiTok *t, const char *s, size_t slen)
+{
+	size_t i, start = p->pos;
+	int c;
+
+	/* skip leading quote */
+	p->pos++;
+
+	for (; p->pos < slen && s[p->pos]; p->pos++) {
+		c = s[p->pos];
+
+		/* end of str */
+		if (c == '\"') {
+			t->start = start + 1;
+			t->end = p->pos;
+			t->parent = p->parent;
+			t->type = YOMI_STR;
+			return 0;
+		}
+
+		/* handle escape chars */
+		if (c == '\\' && p->pos + 1 < slen) {
+			p->pos++;
+			switch (s[p->pos]) {
+			case '/': /* FALLTHROUGH */
+			case '\"':
+			case '\\':
+			case 'b':
+			case 'f':
+			case 'n':
+			case 'r':
+			case 't':
+				break;
+			case 'u': /* unicode symbol */
+				p->pos++;
+				for (i = 0; i < 4 && p->pos < slen && s[p->pos]; i++) {
+					/* ctype functions need an unsigned char value */
+					if (!isxdigit((unsigned char)s[p->pos])) {
+						p->pos = start;
+						return YOMI_ERROR_INVAL;
+					}
+					p->pos++;
+				}
+				p->pos--;
+				break;
+			default:
+				p->pos = start;
+				return YOMI_ERROR_INVAL;
+			}
+		}
+	}
+	p->pos = start;
+	return YOMI_ERROR_MALFO;
+}
+
+/*
+ * parses the integer starting at s[p->pos] into t
+ *
+ * A single leading '-' is accepted. On success p->pos is left on the last
+ * digit and 0 is returned. On failure p->pos is restored and
+ * YOMI_ERROR_INVAL (unexpected character) or YOMI_ERROR_MALFO (input ended
+ * before a delimiter) is returned.
+ */
+static int
+yomi_parse_num(YomiParser *p, YomiTok *t, const char *s, size_t slen)
+{
+	size_t start = p->pos;
+
+	for (; p->pos < slen && s[p->pos]; p->pos++) {
+		switch (s[p->pos]) {
+		case ' ':
+		case ',':
+		case '\n':
+		case '\r':
+		case '\t':
+		case ']':
+			t->parent = p->parent;
+			t->start = start;
+			t->end = p->pos;
+			t->type = YOMI_NUM;
+			p->pos--;
+			return 0;
+		}
+		/* negative numbers: allow a sign, but only as the first char */
+		if (s[p->pos] == '-' && p->pos == start)
+			continue;
+		/* ctype functions need an unsigned char value */
+		if (!isdigit((unsigned char)s[p->pos])) {
+			p->pos = start;
+			return YOMI_ERROR_INVAL;
+		}
+	}
+	p->pos = start;
+	return YOMI_ERROR_MALFO;
+}
+
+/*
+ * tokenizes the term bank text bank of blen bytes into toks
+ *
+ * p must have been set up with yomi_init(). Returns the number of tokens
+ * used or a negative YOMI_ERROR_* code.
+ */
+ssize_t
+yomi_parse(YomiParser *p, YomiTok *toks, size_t ntoks,
+	const char *bank, size_t blen)
+{
+	YomiTok *tok, *t;
+	size_t count = p->toknext;
+	int r;
+
+	if (toks == NULL)
+		return -1;
+
+	for (; p->pos < blen && bank[p->pos]; p->pos++) {
+		switch (bank[p->pos]) {
+		case '[': /* YOMI_ARRAY || YOMI_ENTRY */
+			count++;
+
+			tok = yomi_alloc_tok(p, toks, ntoks);
+			if (!tok)
+				return YOMI_ERROR_NOMEM;
+
+			t = NULL;
+			if (p->parent != -1) {
+				t = &toks[p->parent];
+				t->len++;
+			}
+
+			/* an array directly inside an array is a dictionary entry */
+			if (t && t->type == YOMI_ARRAY)
+				tok->type = YOMI_ENTRY;
+			else
+				tok->type = YOMI_ARRAY;
+
+			tok->start = p->pos;
+			tok->parent = p->parent;
+			p->parent = p->toknext - 1; /* the current tok */
+			break;
+
+		case ']':
+			if (p->toknext < 1 || p->parent == -1)
+				return YOMI_ERROR_INVAL;
+
+			tok = &toks[p->parent];
+			for (;;) {
+				if (tok->start != -1 && tok->end == -1) {
+					/* inside unfinished tok */
+					tok->end = p->pos + 1;
+					p->parent = tok->parent;
+					break;
+				} else if (tok->parent == -1) {
+					/* this is the super tok */
+					break;
+				} else {
+					tok = &toks[tok->parent];
+				}
+			}
+			break;
+
+		case ',':
+			if (p->parent != -1 &&
+			    toks[p->parent].type != YOMI_ARRAY &&
+			    toks[p->parent].type != YOMI_ENTRY)
+				p->parent = toks[p->parent].parent;
+			break;
+
+		case '\"':
+			tok = yomi_alloc_tok(p, toks, ntoks);
+			if (tok == NULL)
+				return YOMI_ERROR_NOMEM;
+
+			r = yomi_parse_str(p, tok, bank, blen);
+			if (r != 0)
+				return r;
+
+			count++;
+			if (p->parent != -1)
+				toks[p->parent].len++;
+			else
+				toks[0].len++;
+			break;
+
+		case ' ': /* ignore whitespace between values */
+		case '\n':
+		case '\r':
+		case '\t':
+			break;
+
+		default:
+			tok = yomi_alloc_tok(p, toks, ntoks);
+			if (tok == NULL)
+				return YOMI_ERROR_NOMEM;
+
+			r = yomi_parse_num(p, tok, bank, blen);
+			if (r != 0)
+				return r;
+
+			count++;
+			if (p->parent != -1)
+				toks[p->parent].len++;
+			else
+				toks[0].len++;
+		}
+	}
+	return count;
+}
diff --git a/yomidict.h b/yomidict.h
@@ -0,0 +1,41 @@
+/* See LICENSE for license details. */
+#ifndef YOMIDICT_H
+#define YOMIDICT_H
+
+#include <stddef.h>
+#include <sys/types.h>
+
+typedef enum {
+	YOMI_UNDEF = 0,
+	YOMI_ENTRY = 1,
+	YOMI_ARRAY = 2,
+	YOMI_STR = 4,
+	YOMI_NUM = 8
+} YomiType;
+
+typedef struct {
+	YomiType type;
+	size_t start; /* offset of first byte; (size_t)-1 while unset */
+	size_t end; /* offset one past last byte; (size_t)-1 while unset */
+	size_t len; /* number of child toks */
+	ssize_t parent; /* parent tok number, -1 if none */
+} YomiTok;
+
+typedef struct {
+	size_t pos; /* offset in yomi bank */
+	size_t toknext; /* index of the next unused tok */
+	ssize_t parent; /* parent tok of current element */
+} YomiParser;
+
+enum {
+	YOMI_ERROR_NOMEM = -1,
+	YOMI_ERROR_INVAL = -2,
+	YOMI_ERROR_MALFO = -3
+};
+
+/* prepares a parser for a new term bank */
+void yomi_init(YomiParser *);
+/* tokenizes a term bank; returns the token count or a YOMI_ERROR_* code */
+ssize_t yomi_parse(YomiParser *, YomiTok *, size_t, const char *, size_t);
+
+#endif