jdict: infer the stride to avoid some reallocs - jdict - command line tool for looking up terms in yomidict dictionaries

Commit: 5c50927610dd9fc5595e6a54079c88cc1ab5e4aa
Parent: ab073161505c05f79206c99834238d6529b63be9
Author: Randy Palamar
Date:   Thu,  9 Nov 2023 06:24:34 -0700

jdict: infer the stride to avoid some reallocs

If people use jdict with dictionaries they didn't generate, they won't
know the stride. We can make a guess, then parse the first term bank
to figure out what the stride actually is. Then we can allocate
enough space for all the entries (with a small amount of waste).

This gives an insignificant speedup for now, but the benefit should
be larger if multithreading is added.

Diffstat:
M README.md  | 18 +++++++++---------
M config.def.h  | 8 ++++----
M jdict.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------------

3 files changed, 77 insertions(+), 58 deletions(-)
diff --git a/README.md b/README.md
@@ -1,17 +1,17 @@
 # jdict
 A command line lookup tool similar to the browser extension
-[Yomichan](https://github.com/FooSoft/yomichan).
+[Yomichan][].
 
 ## Example Output
 	$ jdict 百戦錬磨
 	【三省堂　スーパー大辞林】
 	ひゃくせん-れんま [5] 【百戦練磨・百戦錬磨】
 	多くの戦いできたえられること。多くの経験を積んでいること。「―の勇士」
-	
+
 	【大辞泉】
 	ひゃくせん‐れんま【百戦錬磨】
 	たびたびの戦いで鍛えられていること。また、経験が豊かで処理能力にすぐれていること。「―のつわもの」
-	
+
 	【広辞苑】
 	ひゃくせん‐れんま【百戦錬磨】
 	かずかずの実戦や経験を積んできたえられていること。「―の強者つわもの」
@@ -19,12 +19,10 @@ A command line lookup tool similar to the browser extension
 
 ## Installation
 
-This tool reads dictionaries created by
-[yomichan-import](https://github.com/FooSoft/yomichan-import/). The
-zip file created by `yomichan-import` needs to be extracted and stored
-in the prefix specified in `config.h`. The folder name should be also
-specified in `config.h` in addition to the stride parameter used to
-generate the dictionary.
+This tool reads dictionaries created by [yomichan-import][]. The
+zip file created by `yomichan-import` needs to be extracted and
+stored in the prefix specified in `config.h`. The folder name should
+be also specified in `config.h`.
 
 After modifying `config.h`, `config.mk` can also be modified to suit
 your system and then the following can be used to install (using root
@@ -32,3 +30,5 @@ as needed):
 
 	make clean install
 
+[Yomichan]: https://github.com/FooSoft/yomichan
+[yomichan-import]: https://github.com/FooSoft/yomichan-import/
diff --git a/config.def.h b/config.def.h
@@ -11,9 +11,9 @@ static char *repl_quit = "\n\033[36m(=^ᆺ^)ﾉ　バイバイ～\033[0m";
 static struct Dict {
 	const char *rom;
 	const char *name;
-	size_t stride;
 } default_dict_map[] = {
-	{"daijirin", "【三省堂　スーパー大辞林】", 10000},
-	{"daijisen", "【大辞泉】", 10000},
-	{"koujien", "【広辞苑】", 10000},
+	/* folder name      display name */
+	{"daijirin",      "【三省堂　スーパー大辞林】"},
+	{"daijisen",      "【大辞泉】"},
+	{"koujien",       "【広辞苑】"},
 };
diff --git a/jdict.c b/jdict.c
@@ -3,6 +3,7 @@
 #include <fcntl.h>
 #include <limits.h>
 #include <stddef.h>
+#include <stdint.h> /* for SIZE_MAX */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -15,6 +16,7 @@
 
 #include "config.h"
 
+#define YOMI_STRIDE_GUESS 10000UL
 #define YOMI_TOKS_PER_ENT 10
 #define YOMI_TOK_DELTA (YOMI_TOKS_PER_ENT * 100)
 
@@ -96,6 +98,27 @@ dedup(DictEnt *ents, size_t *nents)
 	return xreallocarray(dents, *nents, sizeof(DictEnt));
 }
 
+static size_t
+count_term_banks(const char *path)
+{
+	DIR *dir;
+	struct dirent *dent;
+	size_t nbanks = 0;
+
+	if (!(dir = opendir(path)))
+		die("opendir(): failed to open: %s\n", path);
+
+	/* count term banks in path */
+	while ((dent = readdir(dir)) != NULL)
+		if (dent->d_type == DT_REG)
+			nbanks++;
+	/* remove index.json from count */
+	nbanks--;
+
+	closedir(dir);
+	return nbanks;
+}
+
 /* takes a token of type YOMI_ENTRY and creates a DictEnt */
 static DictEnt *
 make_ent(YomiTok *toks, char *data)
@@ -140,16 +163,19 @@ make_ent(YomiTok *toks, char *data)
 	return d;
 }
 
-static DictEnt *
-parse_term_bank(DictEnt *ents, size_t *nents, const char *tbank, size_t *stride)
+static size_t
+parse_term_bank(DictEnt *ents, size_t len, const char *tbank)
 {
 	int r, ntoks, fd;
-	size_t i, flen;
+	size_t flen, i = 0, nents = 0;
 	char *data;
 	YomiTok *toks = NULL;
 	YomiScanner *s = NULL;
 	DictEnt *e;
 
+	if (len == 0)
+		return 0;
+
 	if ((fd = open(tbank, O_RDONLY)) < 0)
 		die("can't open file: %s\n", tbank);
 	flen = lseek(fd, 0, SEEK_END);
@@ -160,46 +186,33 @@ parse_term_bank(DictEnt *ents, size_t *nents, const char *tbank, size_t *stride)
 		die("couldn't mmap file: %s\n", tbank);
 
 	/* allocate tokens */
-	ntoks = *stride * YOMI_TOKS_PER_ENT + 1;
-	if ((ntoks - 1) / YOMI_TOKS_PER_ENT != *stride)
-		die("stride multiplication overflowed: %s\n", tbank);
+	if ((SIZE_MAX - 1) / YOMI_TOKS_PER_ENT < len)
+		die("ntoks multiplication overflowed: %s\n", tbank);
+	ntoks = len * YOMI_TOKS_PER_ENT + 1;
 	toks = xreallocarray(toks, ntoks, sizeof(YomiTok));
 
 	s = yomi_scanner_new(data, flen);
 	while ((r = yomi_scan(s, toks, ntoks)) < 0) {
 		switch (r) {
 		case YOMI_ERROR_NOMEM:
-			/* allocate more mem and try again */
-			if (ntoks + YOMI_TOK_DELTA < 0)
-				die("too many toks: %s\n", tbank);
-			ntoks += YOMI_TOK_DELTA;
-			toks = xreallocarray(toks, ntoks, sizeof(YomiTok));
-			*stride = ntoks/YOMI_TOKS_PER_ENT;
-			break;
-		case YOMI_ERROR_INVAL: /* FALLTHROUGH */
+			goto cleanup;
+		case YOMI_ERROR_INVAL:
 		case YOMI_ERROR_MALFO:
 			fprintf(stderr, "yomi_parse: %s\n",
 			        r == YOMI_ERROR_INVAL? "YOMI_ERROR_INVAL"
 			        : "YOMI_ERROR_MALFO");
-			free(ents);
-			ents = NULL;
 			goto cleanup;
 		}
 	}
 
-	ents = xreallocarray(ents, (*nents) + r/YOMI_TOKS_PER_ENT, sizeof(DictEnt));
 	for (i = 0; i < r; i++) {
 		if (toks[i].type != YOMI_ENTRY)
 			continue;
 
 		e = make_ent(&toks[i], data);
-		if (e != NULL) {
-			memcpy(&ents[(*nents)++], e, sizeof(DictEnt));
-		} else {
-			free(ents);
-			ents = NULL;
+		if (e == NULL)
 			break;
-		}
+		memcpy(&ents[nents++], e, sizeof(DictEnt));
 	}
 
 cleanup:
@@ -207,43 +220,49 @@ cleanup:
 	free(toks);
 	free(s);
 
-	return ents;
+	return nents;
 }
 
 static DictEnt *
-make_dict(struct Dict *dict, size_t *nents)
+make_dict(struct Dict *dict, size_t *nlen)
 {
 	char path[PATH_MAX - 20], tbank[PATH_MAX];
-	size_t i, nbanks = 0;
-	DIR *dir;
-	struct dirent *dent;
+	size_t i, nbanks, nents = 0, lents = 0;
 	DictEnt *ents = NULL;
 
 	snprintf(path, LEN(path), "%s/%s", prefix, dict->rom);
-	if (!(dir = opendir(path)))
-		die("opendir(): failed to open: %s\n", path);
 
-	/* count term banks in path */
-	while ((dent = readdir(dir)) != NULL)
-		if (dent->d_type == DT_REG)
-			nbanks++;
-	/* remove index.json from count */
-	nbanks--;
-
-	closedir(dir);
-	if (nbanks == 0) {
-		fputs("nbanks == 0\n", stderr);
+	if ((nbanks = count_term_banks(path)) == 0)  {
+		fprintf(stderr, "no term banks found: %s\n", path);
 		return NULL;
 	}
 
-	for (i = 1; i <= nbanks; i++) {
+	/* parse first bank to get a guess for the total number of entries */
+	snprintf(tbank, LEN(tbank), "%s/term_bank_%d.json", path, 1);
+	do {
+		lents += YOMI_STRIDE_GUESS;
+		ents = xreallocarray(ents, lents, sizeof(DictEnt));
+		nents = parse_term_bank(ents, lents, tbank);
+	} while (nents == 0);
+
+	/* alloc enough memory for all ents */
+	if (SIZE_MAX / nbanks < nents)
+		die("dict has too many entries: %s\n", dict->rom);
+	lents = nents * nbanks;
+	ents = xreallocarray(ents, lents, sizeof(DictEnt));
+
+	for (i = 2; i <= nbanks; i++) {
+		size_t rem = lents - nents;
 		snprintf(tbank, LEN(tbank), "%s/term_bank_%d.json", path, (int)i);
-		ents = parse_term_bank(ents, nents, tbank, &dict->stride);
-		if (ents == NULL)
+		nents += parse_term_bank(&ents[nents], rem, tbank);
+		if (lents - nents == rem) {
+			free(ents);
 			return NULL;
+		}
 	}
-	qsort(ents, *nents, sizeof(DictEnt), entcmp);
-	ents = dedup(ents, nents);
+	qsort(ents, nents, sizeof(DictEnt), entcmp);
+	ents = dedup(ents, &nents);
+	*nlen = nents;
 
 	return ents;
 }

M README.md    | 18 +++++++++---------
M config.def.h | 8 ++++----
M jdict.c      | 109 ++++++++++++++++++++++++++++++++++++++++++++++---------------------------------