jdict

command line tool for looking up terms in yomidict dictionaries
git clone anongit@rnpnr.xyz:jdict.git
Log | Files | Refs | Feed | README | LICENSE

Commit: 048e7dd0d3ffdfe6066be5818ccb9289420a7037
Parent: bc3f25b9e58c0d75c2b0e0659ec303a5e7c4c200
Author: Randy Palamar
Date:   Sun,  5 Nov 2023 15:23:37 -0700

make parser an opaque type and rename to YomiScanner

first, its not a parser its a lexer. second, each scanner should
be tied to a particular set of data.

hiding the type should make it easier to optimize

Diffstat:
Mjdict.c | 7++++---
Myomidict.c | 125+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Myomidict.h | 10+++-------
3 files changed, 78 insertions(+), 64 deletions(-)

diff --git a/jdict.c b/jdict.c @@ -147,7 +147,7 @@ parse_term_bank(DictEnt *ents, size_t *nents, const char *tbank, size_t *stride) size_t i, flen; char *data; YomiTok *toks = NULL; - YomiParser p; + YomiScanner *s = NULL; DictEnt *e; if ((fd = open(tbank, O_RDONLY)) < 0) @@ -165,8 +165,8 @@ parse_term_bank(DictEnt *ents, size_t *nents, const char *tbank, size_t *stride) die("stride multiplication overflowed: %s\n", tbank); toks = xreallocarray(toks, ntoks, sizeof(YomiTok)); - yomi_init(&p); - while ((r = yomi_parse(&p, toks, ntoks, data, flen)) < 0) { + s = yomi_scanner_new(data, flen); + while ((r = yomi_parse(s, toks, ntoks)) < 0) { switch (r) { case YOMI_ERROR_NOMEM: /* allocate more mem and try again */ @@ -205,6 +205,7 @@ parse_term_bank(DictEnt *ents, size_t *nents, const char *tbank, size_t *stride) cleanup: munmap(data, flen); free(toks); + free(s); return ents; } diff --git a/yomidict.c b/yomidict.c @@ -4,30 +4,45 @@ * all it knows how to do. Finding and reading term banks as well as searching * through parsed entries should be implemented elsewhere. */ +#include <stddef.h> +#include <stdlib.h> + +#include "util.h" #include "yomidict.h" -#define NULL 0 #define ul unsigned long #define ISDIGIT(c) ((c) >= '0' && (c) <= '9') -void -yomi_init(YomiParser *p) +struct YomiScanner { + const char *data; + ul len; + ul pos; /* offset in yomi bank */ + ul toknext; + int parent; /* parent tok of current element */ +}; + +YomiScanner * +yomi_scanner_new(const char *data, unsigned long datalen) { - p->pos = 0; - p->toknext = 0; - p->parent = -1; + YomiScanner *s = xreallocarray(NULL, sizeof(YomiScanner), 1); + s->data = data; + s->len = datalen; + s->pos = 0; + s->toknext = 0; + s->parent = -1; + return s; } static YomiTok * -yomi_alloc_tok(YomiParser *p, YomiTok *toks, ul ntoks) +yomi_alloc_tok(YomiScanner *s, YomiTok *toks, ul ntoks) { YomiTok *t; - if (ntoks <= p->toknext) + if (ntoks <= s->toknext) return NULL; - t = &toks[p->toknext++]; + t = &toks[s->toknext++]; t->parent = -1; t->start = -1; t->end = -1; @@ -37,100 +52,102 @@ yomi_alloc_tok(YomiParser *p, YomiTok *toks, ul ntoks) } static int -yomi_parse_str(YomiParser *p, YomiTok *t, const char *s, ul slen) +yomi_parse_str(YomiScanner *s, YomiTok *t) { - ul start = p->pos++; + const char *d = s->data; + ul start = s->pos++; - for (; p->pos < slen; p->pos++) { + for (; s->pos < s->len; s->pos++) { /* skip over escaped " */ - if (s[p->pos] == '\\' && p->pos + 1 < slen && s[p->pos + 1] == '\"') { - p->pos++; + if (d[s->pos] == '\\' && s->pos + 1 < s->len && d[s->pos + 1] == '\"') { + s->pos++; continue; } /* end of str */ - if (s[p->pos] == '\"') { + if (d[s->pos] == '\"') { t->start = start + 1; - t->end = p->pos; - t->parent = p->parent; + t->end = s->pos; + t->parent = s->parent; t->type = YOMI_STR; return 0; } } - p->pos = start; + s->pos = start; return YOMI_ERROR_MALFO; } static int -yomi_parse_num(YomiParser *p, YomiTok *t, const char *s, ul slen) +yomi_parse_num(YomiScanner *s, YomiTok *t) { - ul start = p->pos; + const char *d = s->data; + ul start = s->pos; - for (; p->pos < slen && s[p->pos]; p->pos++) { - switch (s[p->pos]) { + for (; s->pos < s->len && d[s->pos]; s->pos++) { + switch (d[s->pos]) { case ' ': case ',': case '\n': case '\r': case '\t': case ']': - t->parent = p->parent; + t->parent = s->parent; t->start = start; - t->end = p->pos; + t->end = s->pos; t->type = YOMI_NUM; - p->pos--; + s->pos--; return 0; } - if (!ISDIGIT(s[p->pos])) { - p->pos = start; + if (!ISDIGIT(d[s->pos])) { + s->pos = start; return YOMI_ERROR_INVAL; } } - p->pos = start; + s->pos = start; return YOMI_ERROR_MALFO; } int -yomi_parse(YomiParser *p, YomiTok *toks, ul ntoks, const char *bank, ul blen) +yomi_parse(YomiScanner *s, YomiTok *toks, ul ntoks) { YomiTok *tok; - int r, count = p->toknext; + int r, count = s->toknext; if (toks == NULL) return -1; - for (; p->pos < blen && bank[p->pos]; p->pos++) { - switch (bank[p->pos]) { + for (; s->pos < s->len && s->data[s->pos]; s->pos++) { + switch (s->data[s->pos]) { case '[': /* YOMI_ARRAY || YOMI_ENTRY */ count++; - tok = yomi_alloc_tok(p, toks, ntoks); + tok = yomi_alloc_tok(s, toks, ntoks); if (!tok) return YOMI_ERROR_NOMEM; - if (p->parent == -1 || toks[p->parent].type != YOMI_ARRAY) { + if (s->parent == -1 || toks[s->parent].type != YOMI_ARRAY) { tok->type = YOMI_ARRAY; } else { tok->type = YOMI_ENTRY; - toks[p->parent].len++; + toks[s->parent].len++; } - tok->start = p->pos; - tok->parent = p->parent; - p->parent = p->toknext - 1; /* the current tok */ + tok->start = s->pos; + tok->parent = s->parent; + s->parent = s->toknext - 1; /* the current tok */ break; case ']': - if (p->toknext < 1 || p->parent == -1) + if (s->toknext < 1 || s->parent == -1) return YOMI_ERROR_INVAL; - tok = &toks[p->parent]; + tok = &toks[s->parent]; for (;;) { if (tok->start != -1 && tok->end == -1) { /* inside unfinished tok */ - tok->end = p->pos + 1; - p->parent = tok->parent; + tok->end = s->pos + 1; + s->parent = tok->parent; break; } else if (tok->parent == -1) { /* this is the super tok */ @@ -142,24 +159,24 @@ yomi_parse(YomiParser *p, YomiTok *toks, ul ntoks, const char *bank, ul blen) break; case ',': - if (p->parent != -1 && - toks[p->parent].type != YOMI_ARRAY && - toks[p->parent].type != YOMI_ENTRY) - p->parent = toks[p->parent].parent; + if (s->parent != -1 && + toks[s->parent].type != YOMI_ARRAY && + toks[s->parent].type != YOMI_ENTRY) + s->parent = toks[s->parent].parent; break; case '\"': - tok = yomi_alloc_tok(p, toks, ntoks); + tok = yomi_alloc_tok(s, toks, ntoks); if (tok == NULL) return YOMI_ERROR_NOMEM; - r = yomi_parse_str(p, tok, bank, blen); + r = yomi_parse_str(s, tok); if (r != 0) return r; count++; - if (p->parent != -1) - toks[p->parent].len++; + if (s->parent != -1) + toks[s->parent].len++; else toks[0].len++; @@ -170,17 +187,17 @@ yomi_parse(YomiParser *p, YomiTok *toks, ul ntoks, const char *bank, ul blen) break; default: - tok = yomi_alloc_tok(p, toks, ntoks); + tok = yomi_alloc_tok(s, toks, ntoks); if (tok == NULL) return YOMI_ERROR_NOMEM; - r = yomi_parse_num(p, tok, bank, blen); + r = yomi_parse_num(s, tok); if (r != 0) return r; count++; - if (p->parent != -1) - toks[p->parent].len++; + if (s->parent != -1) + toks[s->parent].len++; else toks[0].len++; } diff --git a/yomidict.h b/yomidict.h @@ -15,11 +15,7 @@ typedef struct { YomiType type; } YomiTok; -typedef struct { - unsigned long pos; /* offset in yomi bank */ - unsigned long toknext; - int parent; /* parent tok of current element */ -} YomiParser; +typedef struct YomiScanner YomiScanner; enum { YOMI_ERROR_NOMEM = -1, @@ -27,5 +23,5 @@ enum { YOMI_ERROR_MALFO = -3 }; -void yomi_init(YomiParser *); -int yomi_parse(YomiParser *, YomiTok *, unsigned long, const char *, unsigned long); +YomiScanner *yomi_scanner_new(const char *, unsigned long); +int yomi_parse(YomiScanner *, YomiTok *, unsigned long);