Commit: f344c7ef726eb97e83f04b77e5b67094b5b3b1cd
Author: Randy Palamar
Date: Sat, 18 Jun 2022 23:30:22 -0600
initial import of jdict
this program lets you search yomidict dictionaries from the command line
Diffstat:
A | LICENSE | | | 15 | +++++++++++++++ |
A | Makefile | | | 29 | +++++++++++++++++++++++++++++ |
A | arg.h | | | 50 | ++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | config.def.h | | | 14 | ++++++++++++++ |
A | config.mk | | | 6 | ++++++ |
A | jdict.c | | | 273 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | util.c | | | 29 | +++++++++++++++++++++++++++++ |
A | util.h | | | 5 | +++++ |
A | yomidict.c | | | 223 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | yomidict.h | | | 31 | +++++++++++++++++++++++++++++++ |
10 files changed, 675 insertions(+), 0 deletions(-)
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License (ISC)
+
+© 2022 Randy Palamar <palamar@ualberta.ca>
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,29 @@
+# See LICENSE for license details.
+include config.mk
+
+SRC = jdict.c yomidict.c util.c
+OBJ = $(SRC:.c=.o)
+
+# first target, so a bare `make` builds the binary
+default: jdict
+
+# create the user-editable config from the defaults if it does not exist yet
+config.h:
+	cp config.def.h $@
+
+.c.o:
+	$(CC) $(CFLAGS) -o $@ -c $<
+
+# jdict.c includes config.h; depending on it here makes sure it exists
+# before any object is compiled
+$(OBJ): config.h
+
+jdict: $(OBJ)
+	$(CC) -o $@ $(OBJ) $(LDFLAGS)
+
+install: jdict
+	mkdir -p $(PREFIX)/bin
+	cp jdict $(PREFIX)/bin
+	chmod 755 $(PREFIX)/bin/jdict
+
+# -f: do not fail when the binary was never installed
+uninstall:
+	rm -f $(PREFIX)/bin/jdict
+
+# -f: do not fail when there is nothing to remove
+clean:
+	rm -f *.o jdict
+
+# these targets do not name files, so never treat them as up to date
+.PHONY: default install uninstall clean
diff --git a/arg.h b/arg.h
@@ -0,0 +1,50 @@
+/*
+ * Copy me if you can.
+ * by 20h
+ */
+
+#ifndef ARG_H__
+#define ARG_H__
+
+/* program name; ARGBEGIN stores argv[0] here. Declared only, the program defines it. */
+extern char *argv0;
+
+/*
+ * ARGBEGIN { case 'x': ...; } ARGEND
+ *
+ * use main(int argc, char *argv[])
+ *
+ * Saves argv[0] in argv0, then walks every leading argument that starts
+ * with '-' and has at least one more character. A lone "--" is consumed
+ * and ends option parsing. For each remaining character of an option
+ * argument the switch body supplied by the user runs with ARGC() set to
+ * that character. argc and argv are modified in place, so after ARGEND
+ * argv[0] is the first argument that was not consumed as an option.
+ *
+ * argv_ remembers which argument is being scanned: when EARGF()/ARGF()
+ * advanced argv to fetch a separate option value, the inner loop stops.
+ */
+#define ARGBEGIN for (argv0 = *argv, argv++, argc--;\
+ argv[0] && argv[0][0] == '-'\
+ && argv[0][1];\
+ argc--, argv++) {\
+ char argc_;\
+ char **argv_;\
+ int brk_;\
+ if (argv[0][1] == '-' && argv[0][2] == '\0') {\
+ argv++;\
+ argc--;\
+ break;\
+ }\
+ int i_;\
+ for (i_ = 1, brk_ = 0, argv_ = argv;\
+ argv[0][i_] && !brk_;\
+ i_++) {\
+ if (argv_ != argv)\
+ break;\
+ argc_ = argv[0][i_];\
+ switch (argc_)
+
+/* closes the two loops opened by ARGBEGIN */
+#define ARGEND }\
+ }
+
+/* the option character currently being handled */
+#define ARGC() argc_
+
+/*
+ * Fetch the value of the current option: the rest of the current argument
+ * if it is not empty, otherwise the next argument (which is consumed).
+ * Sets brk_ so the remaining characters are not parsed as options. When no
+ * value is available x is evaluated and abort() is called.
+ */
+#define EARGF(x) ((argv[0][i_+1] == '\0' && argv[1] == NULL)?\
+ ((x), abort(), (char *)0) :\
+ (brk_ = 1, (argv[0][i_+1] != '\0')?\
+ (&argv[0][i_+1]) :\
+ (argc--, argv++, argv[0])))
+
+/* like EARGF() but yields a null pointer when no value is available */
+#define ARGF() ((argv[0][i_+1] == '\0' && argv[1] == NULL)?\
+ (char *)0 :\
+ (brk_ = 1, (argv[0][i_+1] != '\0')?\
+ (&argv[0][i_+1]) :\
+ (argc--, argv++, argv[0])))
+
+#endif
diff --git a/config.def.h b/config.def.h
@@ -0,0 +1,14 @@
+/* See LICENSE for license details. */
+
+/* max terms per term bank, all dicts should use this stride; jdict sizes its
+ * token buffer from this value, so a bank with more terms may not fit */
+#define DICT_STRIDE 10000
+
+/* dir where unzipped yomidicts are stored; a dictionary named NAME is read
+ * from prefix/NAME */
+static char *prefix = "/usr/share/yomidicts";
+
+/* default yomidicts to search when no -d option is given; every element
+ * needs its own trailing comma, adjacent string literals are concatenated */
+static char *default_dicts[] = {
+	"daijirin",
+	"daijisen",
+	"koujien"
+};
diff --git a/config.mk b/config.mk
@@ -0,0 +1,6 @@
+# See LICENSE for license details.
+PREFIX = /usr/local
+
+# _DEFAULT_SOURCE replaces _BSD_SOURCE in glibc >= 2.20; defining both keeps
+# older libcs working and avoids the deprecation warning on newer ones
+CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE
+CFLAGS = -O2 -std=c99 -Wall -pedantic $(CPPFLAGS) $(INCS)
+LDFLAGS = -s -static
diff --git a/jdict.c b/jdict.c
@@ -0,0 +1,273 @@
+/* See LICENSE for license details. */
+#include <dirent.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "arg.h"
+#include "util.h"
+#include "yomidict.h"
+
+#include "config.h"
+
+/* token budget per dictionary entry: used to size the token buffer and to
+ * estimate how many entries a parsed term bank contains */
+#define YOMI_TOKS_PER_ENT 10
+
+/* one dictionary entry: a term and its definitions, all heap allocated */
+typedef struct {
+ char *term; /* copy of the term text */
+ char **defs; /* array of ndefs definition strings */
+ size_t ndefs; /* number of elements in defs */
+} DictEnt;
+
+/* program name, assigned in main() and printed by usage() */
+char *argv0;
+
+/*
+ * Release the argument arrays built by main().
+ *
+ * dicts is freed only when it is not the static default_dicts table, which
+ * must never be passed to free(). terms may be NULL. Both parameters are
+ * passed by value, so the caller's pointers are left untouched and must not
+ * be used after this call.
+ */
+static void
+cleanup(char **dicts, char **terms)
+{
+	if (dicts != default_dicts)
+		free(dicts);
+	free(terms);
+}
+
+/* print the command line synopsis through die(); does not return */
+static void
+usage(void)
+{
+ die("usage: %s [-d path] term ...\n", argv0);
+}
+
+/*
+ * Takes a token of type YOMI_ENTRY and creates a DictEnt.
+ *
+ * tok must point into the token array filled by yomi_parse() and data must
+ * be the term bank text the token offsets refer to. Returns NULL when tok is
+ * not a YOMI_ENTRY. The caller owns the returned entry, its term, its defs
+ * array and every string stored in it. Allocation failures end the program
+ * through die().
+ */
+static DictEnt *
+make_ent(YomiTok *tok, char *data)
+{
+	size_t i;
+	DictEnt *d;
+	YomiTok *tstr, *tdefs, *tdef;
+
+	if (tok->type != YOMI_ENTRY)
+		return NULL;
+
+	/* FIXME: hacky but works, relies on a fixed token layout per entry */
+	/* term = YOMI_ENT tok + 1 */
+	tstr = tok + 1;
+	/* definition array = YOMI_ENT tok + 6 */
+	tdefs = tok + 6;
+
+	d = xreallocarray(NULL, 1, sizeof(*d));
+	d->term = strndup(data + tstr->start, tstr->end - tstr->start);
+	if (d->term == NULL)
+		die("strndup()\n");
+
+	d->ndefs = tdefs->len;
+	d->defs = xreallocarray(NULL, d->ndefs, sizeof(*d->defs));
+	/* the definitions are the ndefs tokens right after the array token */
+	for (i = 0; i < d->ndefs; i++) {
+		tdef = tdefs + 1 + i;
+		d->defs[i] = strndup(data + tdef->start, tdef->end - tdef->start);
+		if (d->defs[i] == NULL)
+			die("strndup()\n");
+	}
+
+	return d;
+}
+
+/*
+ * Parse the term bank file tbank and append its entries to ents.
+ *
+ * ents holds *nents entries on entry (it may be NULL when *nents is 0) and is
+ * grown with xreallocarray(); *nents is advanced for every entry appended.
+ * toks is a scratch buffer of ntoks tokens that is overwritten.
+ *
+ * Returns the (possibly moved) entry array, or NULL when the file cannot be
+ * opened, is empty, cannot be mapped, fails to parse, or an entry cannot be
+ * built. On a NULL return after the array was grown the array is not freed;
+ * the callers treat NULL as fatal.
+ */
+static DictEnt *
+parse_term_bank(DictEnt *ents, size_t *nents, const char *tbank, YomiTok *toks, size_t ntoks)
+{
+	int fd;
+	off_t end;
+	ssize_t r;
+	size_t flen, i, nnew = 0, total;
+	char *data;
+	YomiParser p;
+	DictEnt *e, *ret = NULL;
+
+	if ((fd = open(tbank, O_RDONLY)) < 0)
+		return NULL;
+
+	/* the offset of the end of the file is its length */
+	end = lseek(fd, 0, SEEK_END);
+	if (end <= 0) {
+		close(fd);
+		return NULL;
+	}
+	flen = end;
+
+	data = mmap(NULL, flen, PROT_READ, MAP_PRIVATE, fd, 0);
+	/* the mapping stays valid after the descriptor is closed */
+	close(fd);
+	if (data == MAP_FAILED)
+		return NULL;
+
+	yomi_init(&p);
+	r = yomi_parse(&p, toks, ntoks, data, flen);
+	if (r < 0)
+		goto out;
+
+	/* count the entries first so the array is grown by the exact amount */
+	for (i = 0; i < (size_t)r; i++)
+		if (toks[i].type == YOMI_ENTRY)
+			nnew++;
+
+	/* never ask for zero elements: that result is implementation-defined
+	 * and a NULL would be mistaken for a failed allocation */
+	total = *nents + nnew;
+	ents = xreallocarray(ents, total ? total : 1, sizeof(*ents));
+
+	for (i = 0; i < (size_t)r; i++) {
+		if (toks[i].type != YOMI_ENTRY)
+			continue;
+		e = make_ent(&toks[i], data);
+		if (e == NULL)
+			goto out;
+		/* the array takes over the strings, only the shell is freed */
+		ents[(*nents)++] = *e;
+		free(e);
+	}
+	ret = ents;
+
+out:
+	munmap(data, flen);
+
+	return ret;
+}
+
+/*
+ * Load every term bank of the dictionary stored in the directory path.
+ *
+ * stride is the maximum number of terms per term bank and sizes the token
+ * buffer. The banks are expected to be named term_bank_1.json up to
+ * term_bank_N.json, where N is the number of directory entries whose name
+ * starts with "term_bank_". *nents is advanced by the number of entries read.
+ *
+ * Returns the heap-allocated entry array, or NULL when the directory holds no
+ * term bank or a bank cannot be read. Ends the program through die() when the
+ * directory cannot be opened or the token count overflows.
+ */
+static DictEnt *
+make_dict(const char *path, size_t stride, size_t *nents)
+{
+	static const char bankpfx[] = "term_bank_";
+	char tbank[PATH_MAX];
+	size_t i, ntoks, nbanks = 0;
+	DIR *dir;
+	struct dirent *dent;
+	YomiTok *toks;
+	DictEnt *dict = NULL;
+
+	ntoks = stride * YOMI_TOKS_PER_ENT + 1;
+	if ((ntoks - 1) / YOMI_TOKS_PER_ENT != stride)
+		die("stride multiplication overflowed: %s\n", path);
+
+	if (!(dir = opendir(path)))
+		die("opendir(): failed to open: %s\n", path);
+
+	/* count term banks by name, other files in the directory are ignored */
+	while ((dent = readdir(dir)) != NULL)
+		if (strncmp(dent->d_name, bankpfx, sizeof(bankpfx) - 1) == 0)
+			nbanks++;
+
+	closedir(dir);
+
+	toks = xreallocarray(NULL, ntoks, sizeof(*toks));
+
+	for (i = 1; i <= nbanks; i++) {
+		snprintf(tbank, sizeof(tbank), "%s/term_bank_%zu.json", path, i);
+		dict = parse_term_bank(dict, nents, tbank, toks, ntoks);
+		if (dict == NULL)
+			break;
+	}
+	/* the token buffer is scratch space, release it on every path */
+	free(toks);
+
+	return dict;
+}
+
+/* qsort() comparator: orders DictEnt elements by term using strcmp() */
+static int
+entcmp(const void *va, const void *vb)
+{
+	const DictEnt *lhs = va;
+	const DictEnt *rhs = vb;
+
+	return strcmp(lhs->term, rhs->term);
+}
+
+
+/*
+ * Binary search for term in ents, which must hold nents entries sorted by
+ * term in strcmp() order.
+ *
+ * Returns a pointer to a matching entry, or NULL when there is none or
+ * nents is 0. When several entries share the term any one of them may be
+ * returned.
+ */
+static DictEnt *
+find_ent(const char *term, DictEnt *ents, size_t nents)
+{
+	size_t lo = 0, hi = nents, mid;
+	int r;
+
+	/* search the half-open range [lo, hi) */
+	while (lo < hi) {
+		mid = lo + (hi - lo) / 2;
+		r = strcmp(term, ents[mid].term);
+		if (r == 0)
+			return &ents[mid];
+		if (r < 0)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	return NULL;
+}
+
+/*
+ * Replace every two character sequence backslash + 'n' in str with a real
+ * newline character. The string is edited in place and shrinks by one byte
+ * per replacement. Returns str.
+ */
+static char *
+fix_newlines(char *str)
+{
+	char *src = str, *dst = str;
+
+	while (*src != '\0') {
+		if (src[0] == '\\' && src[1] == 'n') {
+			*dst++ = '\n';
+			src += 2;
+		} else {
+			*dst++ = *src++;
+		}
+	}
+	*dst = '\0';
+
+	return str;
+}
+
+/*
+ * Print every definition of ent on its own line. The definitions are passed
+ * through fix_newlines() first, which modifies them in place.
+ */
+static void
+print_ent(DictEnt *ent)
+{
+	size_t n = 0;
+
+	while (n < ent->ndefs) {
+		char *def = fix_newlines(ent->defs[n]);
+
+		printf("%s\n", def);
+		n++;
+	}
+}
+
+/* free every entry of ents (term, definitions, definition array) and ents */
+static void
+free_ents(DictEnt *ents, size_t nents)
+{
+	size_t i, j;
+
+	for (i = 0; i < nents; i++) {
+		for (j = 0; j < ents[i].ndefs; j++)
+			free(ents[i].defs[j]);
+		free(ents[i].defs);
+		free(ents[i].term);
+	}
+	free(ents);
+}
+
+/*
+ * Look up every term in every dictionary and print the definitions found.
+ *
+ * For each of the ndicts names in dicts the dictionary below prefix is
+ * loaded, sorted and searched for each of the nterms terms. The dictionary
+ * name is printed before its results.
+ *
+ * Returns 0 on success and -1 as soon as a dictionary cannot be loaded or a
+ * term is missing from a dictionary; later terms and dictionaries are then
+ * not searched. The loaded dictionary is released on every path.
+ */
+static int
+find_and_print_defs(char **terms, size_t nterms, char **dicts, size_t ndicts)
+{
+	/* smaller than PATH_MAX to leave room for the term bank file name
+	 * that make_dict() appends to this path */
+	char path[PATH_MAX - 18];
+	size_t i, j;
+	size_t nents;
+	DictEnt *ent, *ents;
+
+	for (i = 0; i < ndicts; i++) {
+		snprintf(path, LEN(path), "%s/%s", prefix, dicts[i]);
+		nents = 0;
+		ents = make_dict(path, DICT_STRIDE, &nents);
+		if (ents == NULL)
+			return -1;
+		/* find_ent() needs the entries sorted by term */
+		qsort(ents, nents, sizeof(DictEnt), entcmp);
+
+		printf("%s\n", dicts[i]);
+		for (j = 0; j < nterms; j++) {
+			ent = find_ent(terms[j], ents, nents);
+			if (ent == NULL) {
+				printf("term not found:%s\n", terms[j]);
+				free_ents(ents, nents);
+				return -1;
+			}
+			print_ent(ent);
+		}
+
+		free_ents(ents, nents);
+	}
+	return 0;
+}
+
+/*
+ * jdict [-d path] term ...
+ *
+ * Every -d option adds one dictionary to search; without any -d the
+ * dictionaries in default_dicts are used. All remaining arguments are terms
+ * to look up. Exits with status 0 when every term was found in every
+ * dictionary and with status 1 otherwise.
+ */
+int
+main(int argc, char *argv[])
+{
+	char **dicts = NULL, **terms = NULL;
+	size_t ndicts = 0, nterms = 0;
+	int r;
+
+	argv0 = argv[0];
+
+	ARGBEGIN {
+	case 'd':
+		dicts = xreallocarray(dicts, ++ndicts, sizeof(char *));
+		/* store into the slot that was just added, not slot 0 */
+		dicts[ndicts - 1] = EARGF(usage());
+		break;
+	default:
+		usage();
+	} ARGEND
+
+	if (ndicts == 0) {
+		dicts = default_dicts;
+		ndicts = LEN(default_dicts);
+	}
+
+	/* remaining argv elements are terms to search for */
+	for (; argc && *argv; argv++, argc--) {
+		terms = xreallocarray(terms, ++nterms, sizeof(char *));
+		terms[nterms - 1] = *argv;
+	}
+
+	if (nterms == 0) {
+		cleanup(dicts, terms);
+		usage();
+	}
+
+	r = find_and_print_defs(terms, nterms, dicts, ndicts);
+
+	cleanup(dicts, terms);
+
+	/* report a failed lookup to the caller through the exit status */
+	return r == 0 ? 0 : 1;
+}
diff --git a/util.c b/util.c
@@ -0,0 +1,29 @@
+/* See LICENSE for license details. */
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "util.h"
+
+/*
+ * Print a printf-style message to stderr and exit with status 1.
+ * fmt and the following arguments are passed to vfprintf() unchanged, so
+ * the message needs its own trailing newline. Does not return.
+ */
+void
+die(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ exit(1);
+}
+
+/*
+ * reallocarray() that never returns NULL: resizes o to hold n elements of
+ * s bytes each and ends the program through die() when that fails.
+ */
+void *
+xreallocarray(void *o, size_t n, size_t s)
+{
+	void *p = reallocarray(o, n, s);
+
+	if (p == NULL)
+		die("reallocarray()\n");
+
+	return p;
+}
diff --git a/util.h b/util.h
@@ -0,0 +1,5 @@
+/* See LICENSE for license details. */
+#ifndef UTIL_H
+#define UTIL_H
+
+/* size_t, so the header can be included on its own */
+#include <stddef.h>
+
+/* number of elements of an array; a must be a real array, not a pointer */
+#define LEN(a) (sizeof(a) / sizeof((a)[0]))
+
+/* print a printf-style message to stderr and exit with status 1 */
+void die(const char *, ...);
+/* reallocarray() wrapper that calls die() instead of returning NULL */
+void *xreallocarray(void *, size_t, size_t);
+
+#endif
diff --git a/yomidict.c b/yomidict.c
@@ -0,0 +1,223 @@
+/* See LICENSE for license details.
+ *
+ * yomidict.c implements a simple parser for yomichan dictionary text. This is
+ * all it knows how to do. Finding and reading term banks as well as searching
+ * through parsed entries should be implemented elsewhere.
+ */
+#include <ctype.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+#include "yomidict.h"
+
+/* reset p so the next yomi_parse() starts at offset 0 with no tokens and no
+ * parent token */
+void
+yomi_init(YomiParser *p)
+{
+	const YomiParser fresh = { .pos = 0, .toknext = 0, .parent = -1 };
+
+	*p = fresh;
+}
+
+/*
+ * Hand out the next unused token of toks and advance p->toknext.
+ * The token gets no parent, no start, no end (all -1) and a length of 0;
+ * its type is left for the caller to set. Returns NULL when all ntoks
+ * tokens are in use.
+ */
+static YomiTok *
+yomi_alloc_tok(YomiParser *p, YomiTok *toks, size_t ntoks)
+{
+	YomiTok *fresh;
+
+	if (p->toknext >= ntoks)
+		return NULL;
+
+	fresh = toks + p->toknext;
+	p->toknext++;
+
+	fresh->len = 0;
+	fresh->start = fresh->end = fresh->parent = -1;
+
+	return fresh;
+}
+
+/*
+ * Parse the string that starts with the quote at s[p->pos].
+ *
+ * On success t becomes a YOMI_STR token spanning the text between the
+ * quotes, p->pos is left on the closing quote and 0 is returned. On failure
+ * p->pos is restored and YOMI_ERROR_INVAL (bad escape sequence) or
+ * YOMI_ERROR_MALFO (no closing quote before slen or a NUL byte) is returned.
+ */
+static int
+yomi_parse_str(YomiParser *p, YomiTok *t, const char *s, size_t slen)
+{
+	size_t i, start = p->pos;
+	int c;
+
+	/* skip leading quote */
+	p->pos++;
+
+	for (; p->pos < slen && s[p->pos]; p->pos++) {
+		c = s[p->pos];
+
+		/* end of str */
+		if (c == '\"') {
+			t->start = start + 1;
+			t->end = p->pos;
+			t->parent = p->parent;
+			t->type = YOMI_STR;
+			return 0;
+		}
+
+		/* handle escape chars */
+		if (c == '\\' && p->pos + 1 < slen) {
+			p->pos++;
+			switch (s[p->pos]) {
+			case '/': /* FALLTHROUGH */
+			case '\"':
+			case '\\':
+			case 'b':
+			case 'f':
+			case 'n':
+			case 'r':
+			case 't':
+				break;
+			case 'u': /* unicode symbol */
+				p->pos++;
+				for (i = 0; i < 4 && p->pos < slen && s[p->pos]; i++) {
+					/* <ctype.h> functions need an unsigned char
+					 * value, a negative char is undefined */
+					if (!isxdigit((unsigned char)s[p->pos])) {
+						p->pos = start;
+						return YOMI_ERROR_INVAL;
+					}
+					p->pos++;
+				}
+				/* the loop increment steps past the last digit */
+				p->pos--;
+				break;
+			default:
+				p->pos = start;
+				return YOMI_ERROR_INVAL;
+			}
+		}
+	}
+	p->pos = start;
+	return YOMI_ERROR_MALFO;
+}
+
+/*
+ * Parse the run of decimal digits that starts at s[p->pos].
+ *
+ * The number ends at a space, comma, newline, carriage return, tab or ']'.
+ * On success t becomes a YOMI_NUM token, p->pos is left on the last digit so
+ * the caller sees the terminator next, and 0 is returned. On failure p->pos
+ * is restored and YOMI_ERROR_INVAL (non-digit byte) or YOMI_ERROR_MALFO (no
+ * terminator before slen or a NUL byte) is returned.
+ */
+static int
+yomi_parse_num(YomiParser *p, YomiTok *t, const char *s, size_t slen)
+{
+	size_t start = p->pos;
+
+	for (; p->pos < slen && s[p->pos]; p->pos++) {
+		switch (s[p->pos]) {
+		case ' ':
+		case ',':
+		case '\n':
+		case '\r':
+		case '\t':
+		case ']':
+			t->parent = p->parent;
+			t->start = start;
+			t->end = p->pos;
+			t->type = YOMI_NUM;
+			p->pos--;
+			return 0;
+		}
+		/* <ctype.h> functions need an unsigned char value, a negative
+		 * char (any non-ASCII byte) is undefined */
+		if (!isdigit((unsigned char)s[p->pos])) {
+			p->pos = start;
+			return YOMI_ERROR_INVAL;
+		}
+	}
+	p->pos = start;
+	return YOMI_ERROR_MALFO;
+}
+
+/*
+ * Tokenize the term bank text bank (blen bytes) into toks (ntoks tokens).
+ *
+ * Scanning starts at p->pos and stops at blen or at the first NUL byte.
+ * Returns the number of tokens in use (p->toknext on entry plus the tokens
+ * produced by this call) or a negative value: -1 when toks is NULL,
+ * YOMI_ERROR_NOMEM when toks is full, YOMI_ERROR_INVAL or YOMI_ERROR_MALFO
+ * for text that cannot be parsed.
+ */
+ssize_t
+yomi_parse(YomiParser *p, YomiTok *toks, size_t ntoks,
+ const char *bank, size_t blen)
+{
+ YomiTok *tok, *t;
+ size_t count = p->toknext;
+ int r;
+
+ if (toks == NULL)
+ return -1;
+
+ for (; p->pos < blen && bank[p->pos]; p->pos++) {
+ switch (bank[p->pos]) {
+ case '[': /* YOMI_ARRAY || YOMI_ENTRY */
+ count++;
+
+ tok = yomi_alloc_tok(p, toks, ntoks);
+ if (!tok)
+ return YOMI_ERROR_NOMEM;
+
+ /* the new token is one more element of its parent */
+ t = NULL;
+ if (p->parent != -1) {
+ t = &toks[p->parent];
+ t->len++;
+ }
+
+ /* an array directly inside a YOMI_ARRAY is an entry, every
+ * other array (top level or inside an entry) is a YOMI_ARRAY */
+ if (t && t->type == YOMI_ARRAY)
+ tok->type = YOMI_ENTRY;
+ else
+ tok->type = YOMI_ARRAY;
+
+ tok->start = p->pos;
+ tok->parent = p->parent;
+ p->parent = p->toknext - 1; /* the current tok */
+ break;
+
+ case ']':
+ if (p->toknext < 1 || p->parent == -1)
+ return YOMI_ERROR_INVAL;
+
+ /* walk up from the current parent to the innermost token
+ * that was started but not finished and close it */
+ tok = &toks[p->parent];
+ for (;;) {
+ if (tok->start != -1 && tok->end == -1) {
+ /* inside unfinished tok */
+ tok->end = p->pos + 1;
+ p->parent = tok->parent;
+ break;
+ } else if (tok->parent == -1) {
+ /* this is the super tok */
+ break;
+ } else {
+ tok = &toks[tok->parent];
+ }
+ }
+ break;
+
+ case ',':
+ /* when the current parent is not an array or entry, step
+ * back to its parent */
+ if (p->parent != -1 &&
+ toks[p->parent].type != YOMI_ARRAY &&
+ toks[p->parent].type != YOMI_ENTRY)
+ p->parent = toks[p->parent].parent;
+ break;
+
+ case '\"':
+ tok = yomi_alloc_tok(p, toks, ntoks);
+ if (tok == NULL)
+ return YOMI_ERROR_NOMEM;
+
+ r = yomi_parse_str(p, tok, bank, blen);
+ if (r != 0)
+ return r;
+
+ /* the string is one more element of its parent; without a
+ * parent the length of token 0 is incremented instead */
+ count++;
+ if (p->parent != -1)
+ toks[p->parent].len++;
+ else
+ toks[0].len++;
+
+ /* no break above: the string case deliberately continues into
+ * the whitespace cases and leaves through their break */
+ case ' ': /* FALLTHROUGH */
+ case '\n':
+ case '\r':
+ case '\t':
+ break;
+
+ /* every other byte is handed to the number parser */
+ default:
+ tok = yomi_alloc_tok(p, toks, ntoks);
+ if (tok == NULL)
+ return YOMI_ERROR_NOMEM;
+
+ r = yomi_parse_num(p, tok, bank, blen);
+ if (r != 0)
+ return r;
+
+ count++;
+ if (p->parent != -1)
+ toks[p->parent].len++;
+ else
+ toks[0].len++;
+ }
+ }
+ return count;
+}
diff --git a/yomidict.h b/yomidict.h
@@ -0,0 +1,31 @@
+/* See LICENSE for license details. */
+#ifndef YOMIDICT_H
+#define YOMIDICT_H
+
+/* size_t and ssize_t, so the header can be included on its own */
+#include <sys/types.h>
+
+/* kind of a token produced by yomi_parse() */
+typedef enum {
+	YOMI_UNDEF = 0,
+	YOMI_ENTRY = 1, /* array nested directly inside a YOMI_ARRAY */
+	YOMI_ARRAY = 2, /* any other array */
+	YOMI_STR = 4,
+	YOMI_NUM = 8
+} YomiType;
+
+/* one token; start and end are byte offsets into the parsed text */
+typedef struct {
+	YomiType type;
+	size_t start;
+	size_t end;
+	size_t len; /* number of direct child tokens */
+	size_t parent; /* parent tok number, (size_t)-1 when there is none */
+} YomiTok;
+
+/* parser state, reset with yomi_init() before parsing a new text */
+typedef struct {
+	size_t pos; /* offset in yomi bank */
+	size_t toknext; /* index of the next unused token */
+	ssize_t parent; /* parent tok of current element, -1 when there is none */
+} YomiParser;
+
+/* negative return values of yomi_parse() */
+enum {
+	YOMI_ERROR_NOMEM = -1, /* token array is full */
+	YOMI_ERROR_INVAL = -2, /* invalid character in the text */
+	YOMI_ERROR_MALFO = -3 /* text ended inside a string or number */
+};
+
+void yomi_init(YomiParser *);
+ssize_t yomi_parse(YomiParser *, YomiTok *, size_t, const char *, size_t);
+
+#endif