initial import - aozora2fmt - a tool for converting Aozora Bunko files to better formats

Commit: dd2f3a6212170fc29b8b22d96e1ca91217cf018f
Author: Randy Palamar
Date:   Fri, 15 Jul 2022 07:57:55 -0600

initial import

Diffstat:
A LICENSE  | 15 +++++++++++++++
A Makefile  | 22 ++++++++++++++++++++++
A README.md  | 20 ++++++++++++++++++++
A aozora2fmt.go  | 363 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

4 files changed, 420 insertions(+), 0 deletions(-)
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License (ISC)
+
+© 2022 Randy Palamar <palamar@ualberta.ca>
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,22 @@
+# See LICENSE for license details.
+PREFIX = /usr/local
+
+LDFLAGS = -s
+
+BINS = aozora2fmt
+
+default: $(BINS)
+
+aozora2fmt: aozora2fmt.go
+	go build -ldflags "$(LDFLAGS)" $@.go
+
+install: $(BINS)
+	mkdir -p $(PREFIX)/bin
+	cp $(BINS) $(PREFIX)/bin
+	chmod 755 $(BINS:%=$(PREFIX)/bin/%)
+
+uninstall:
+	rm $(BINS:%=$(PREFIX)/bin/%)
+
+clean:
+	rm $(BINS)
diff --git a/README.md b/README.md
@@ -0,0 +1,20 @@
+# aozora2fmt
+
+A command line tool for converting Aozora Bunko
+([青空文庫](https://www.aozora.gr.jp/index.html)) files to better
+formats.
+
+## Description
+
+This tool walks through the text and replaces the following:
+* JIS codepoint markers are replaced with the correct UTF-8 character
+* Ruby text markers are replaced with tags appropriate for the output format
+* Headers are replaced with markers appropriate for the output format
+* Page breaks are replaced with a marker appropriate for the output format
+* The info block at the start of the file is removed (unless `[-d]` is specified)
+
+## Installation
+
+Simply clone the repository and run:
+
+	make install
diff --git a/aozora2fmt.go b/aozora2fmt.go
@@ -0,0 +1,363 @@
+/* See LICENSE for license details. */
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+)
+
+type OutFmt struct {
+	ruby  string /* Ruby output format */
+	hdr   string /* Header format */
+	shdr  string /* Subheader format */
+	sshdr string /* Subsubheader format */
+	pb    string /* Page Break text */
+}
+
+func usage() {
+	fmt.Fprintf(os.Stderr, "usage: %s [-d] [-f format] file\n", os.Args[0])
+	flag.PrintDefaults()
+}
+
+func get_outfmt(fmt string) *OutFmt {
+	of := new(OutFmt)
+
+	switch fmt {
+	case "tex":
+		of.ruby  = "\\ruby{%s}{%s}"
+		of.hdr   = "\\chapter{%s}"
+		of.shdr  = "\\section*{%s}"
+		of.sshdr = "\\subsection*{%s}"
+		of.pb    = "\\newpage"
+	case "md":
+		of.ruby  = "<ruby>%s<rp>《</rp><rt>%s</rt><rp>》</rp></ruby>"
+		of.hdr   = "# %s"
+		of.shdr  = "## %s"
+		of.sshdr = "### %s"
+		of.pb    = "<div style='break-after:always'></div>"
+	case "plain":
+		of.ruby  = "[%s:%s]"
+		of.hdr   = "%s"
+		of.shdr  = "%s"
+		of.sshdr = "%s"
+		of.pb    = ""
+	}
+
+	return of
+}
+
+func accent_map() map[string]string {
+	/* https://web.archive.org/web/20220206093806/http://aozora.gr.jp/accent_separation.html */
+	return map[string]string {
+		"A&": "Å",
+		"A'": "Á",
+		"A:": "Ä",
+		"AE&": "Æ",
+		"A^": "Â",
+		"A_": "Ā",
+		"A`": "À",
+		"A~": "Ã",
+		"C'": "Ć",
+		"C,": "Ç",
+		"C^": "Ĉ",
+		"D/": "Đ",
+		"E'": "É",
+		"E:": "Ë",
+		"E^": "Ê",
+		"E_": "Ē",
+		"E`": "È",
+		"E~": "Ẽ",
+		"G^": "Ĝ",
+		"H^": "Ĥ",
+		"I'": "Í",
+		"I:": "Ï",
+		"I^": "Î",
+		"I_": "Ī",
+		"I`": "Ì",
+		"I~": "Ĩ",
+		"J^": "Ĵ",
+		"L'": "Ĺ",
+		"L/": "Ł",
+		"M'": "Ḿ",
+		"N'": "Ń",
+		"N`": "Ǹ",
+		"N~": "Ñ",
+		"O'": "Ó",
+		"O/": "Ø",
+		"O:": "Ö",
+		"OE&": "Œ",
+		"O^": "Ô",
+		"O_": "Ō",
+		"O`": "Ò",
+		"O~": "Õ",
+		"R'": "Ŕ",
+		"S'": "Ś",
+		"S,": "Ş",
+		"S^": "Ŝ",
+		"T,": "Ţ",
+		"U&": "Ů",
+		"U'": "Ú",
+		"U:": "Ü",
+		"U^": "Û",
+		"U_": "Ū",
+		"U`": "Ù",
+		"U~": "Ũ",
+		"Y'": "Ý",
+		"Z'": "Ź",
+		"a&": "å",
+		"a'": "á",
+		"a:": "ä",
+		"a^": "â",
+		"a_": "ā",
+		"a`": "à",
+		"ae&": "æ",
+		"a~": "ã",
+		"c'": "ć",
+		"c,": "ç",
+		"c^": "ĉ",
+		"d/": "đ",
+		"e'": "é",
+		"e:": "ë",
+		"e^": "ê",
+		"e_": "ē",
+		"e`": "è",
+		"e~": "ẽ",
+		"g^": "ĝ",
+		"h/": "ħ",
+		"h^": "ĥ",
+		"i'": "í",
+		"i/": "ɨ",
+		"i:": "ï",
+		"i^": "î",
+		"i_": "ī",
+		"i`": "ì",
+		"i~": "ĩ",
+		"j^": "ĵ",
+		"l'": "ĺ",
+		"l/": "ł",
+		"m'": "ḿ",
+		"n'": "ń",
+		"n`": "ǹ",
+		"n~": "ñ",
+		"o'": "ó",
+		"o/": "ø",
+		"o:": "ö",
+		"o^": "ô",
+		"o_": "ō",
+		"o`": "ò",
+		"oe&": "œ",
+		"o~": "õ",
+		"r'": "ŕ",
+		"s&": "ß",
+		"s'": "ś",
+		"s,": "ş",
+		"s^": "ŝ",
+		"t,": "ţ",
+		"u&": "ů",
+		"u'": "ú",
+		"u:": "ü",
+		"u^": "û",
+		"u_": "ū",
+		"u`": "ù",
+		"u~": "ũ",
+		"y'": "ý",
+		"y:": "ÿ",
+		"z'": "ź",
+	}
+}
+
+func jis_map() map[int]string {
+	/* https://kanji.jitenon.jp/ */
+	/* http://www13.plala.or.jp/bigdata/index_kanji.html */
+	return map[int]string {
+		311476: "匇",
+		311524: "噱",
+		311589: "媧",
+		318428: "彘",
+		318431: "彽",
+		318445: "怳",
+		318454: "惝",
+		318455: "惸",
+		318459: "愷",
+		318466: "戢",
+		318477: "挘",
+		318615: "橛",
+		318662: "泫",
+		318740: "炷",
+		318764: "燄",
+		318771: "犍",
+		318822: "璆",
+		318881: "眶",
+		318885: "睜",
+		319155: "蛼",
+		319239: "蹰",
+		319278: "鄢",
+		319413: "騃",
+		319484: "鼹",
+		421283: "戕",
+		428874: "譃",
+		429267: "餼",
+		429268: "饀",
+		429271: "饍",
+		429337: "魳",
+	}
+}
+
+func replace_jis(str string) string {
+	exp := regexp.MustCompile(`※［＃([^］]+)］`)
+
+	for _, matches := range exp.FindAllStringSubmatch(str, -1) {
+		sub_exp := regexp.MustCompile(`第(\d)水準(\d)-(\d\d)-(\d\d)`)
+	
+		nums := sub_exp.FindStringSubmatch(str)
+		if nums == nil {
+			/* the same character appeared multiple times in str */
+			continue
+		}
+		num, _ := strconv.Atoi(nums[1] + nums[2] + nums[3] + nums[4])
+
+		m := jis_map()
+		replacement, ok := m[num]
+		if !ok {
+			log.Printf("jis code not implemented: %d: %s\n", num, matches[0])
+			continue
+		}
+
+		str = strings.Replace(str, matches[0], replacement, -1)
+	}
+
+	return str
+}
+
+func replace_ruby(str string, of *OutFmt) string {
+	kanji := `\x{3400}-\x{4DBF}` +   /* CJK Unified Ideographs Extension A */
+		 `\x{4E00}-\x{9FFF}` +   /* CJK Unified Ideographs */
+		 `\x{F900}-\x{FAFF}` +   /* CJK Compatibility Ideographs */
+		 `\x{20000}-\x{2FA1F}` + /* CJK Unified Ideographs Extension B - F, Supplement */
+		 `〆〻〇々ヶ`
+	ruby_exp := regexp.MustCompile(`[｜]?([` + kanji + `]+)《([^》]+)》`)
+	for _, matches := range ruby_exp.FindAllStringSubmatch(str, -1) {
+		replacement := fmt.Sprintf(of.ruby, matches[1], matches[2])
+		str = strings.Replace(str, matches[0], replacement, -1)
+	}
+
+	bouten_exp := regexp.MustCompile(`［＃「([^」]+)」に傍点］`)
+	for _, matches := range bouten_exp.FindAllStringSubmatch(str, -1) {
+		bouten := strings.Repeat("﹅", utf8.RuneCountInString(matches[1]))
+		replacement := fmt.Sprintf(of.ruby, matches[1], bouten)
+		str = strings.Replace(str, matches[1] + matches[0], replacement, -1)
+	}
+
+	return str
+}
+
+func replace_accents(str string) string {
+	exp := regexp.MustCompile(`〔([^〕]+)〕`)
+	
+	for _, matches := range exp.FindAllStringSubmatch(str, -1) {
+		str = strings.Replace(str, matches[0], matches[1], -1)
+
+		m := accent_map()
+		for key := range m {
+			str = strings.ReplaceAll(str, key, m[key])
+		}
+	}
+
+	return str
+}
+
+func replace_hdrs(str string, of *OutFmt) string {
+	exp := regexp.MustCompile(`\n\n［[^［]+［＃「([^」]+)」は([大中小])見出し］\n\n\n`)
+	slices := exp.FindAllStringSubmatch(str, -1)
+	if slices == nil {
+		exp = regexp.MustCompile(`\n\n\n([^\n]+)\n\n\n`)
+		for _, matches := range exp.FindAllStringSubmatch(str, -1) {
+			replacement := "\n" + fmt.Sprintf(of.hdr, matches[1]) + "\n"
+			str = strings.Replace(str, matches[0], replacement, -1)
+		}
+		return str
+	}
+
+	for _, matches := range slices {
+		var replacement string
+		switch matches[2] {
+		case "大":
+			replacement = fmt.Sprintf(of.hdr, matches[1])
+		case "中":
+			replacement = fmt.Sprintf(of.shdr, matches[1])
+		case "小":
+			replacement = fmt.Sprintf(of.sshdr, matches[1])
+		default:
+			log.Printf("bad hdr: %s\n", matches[0])
+			replacement = matches[1]
+		}
+		str = strings.Replace(str, matches[0], replacement + "\n", -1)
+	}
+
+	return str
+}
+
+func trim_info(str string) string {
+	delim := "\n" + strings.Repeat("-", 55) + "\n"
+
+	slices := strings.Split(str, delim)
+
+	return strings.Join([]string{slices[0], slices[2]}, "")
+}
+
+func parse(file string, of *OutFmt, debug bool) string {
+	f, err := os.Open(file)
+	defer f.Close()
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	var lines []string
+	r := bufio.NewScanner(f)
+	for r.Scan() {
+		line := strings.Trim(r.Text(), "　")
+		line = replace_jis(line)
+		line = replace_ruby(line, of)
+		line = replace_accents(line)
+		lines = append(lines, line)
+	}
+
+	out := strings.Join(lines, "\n\n");
+	out = replace_hdrs(out, of)
+	out = strings.Replace(out, "［＃改ページ］", of.pb, -1)
+
+	if (debug == false) {
+		out = trim_info(out)
+	}
+
+	return out
+}
+
+func main() {
+	var (
+		debug = flag.Bool("d", false, "debug mode")
+		format = flag.String("f", "plain", "output format [plain|md|tex]")
+	)
+
+	flag.Usage = usage
+	flag.Parse()
+
+	if flag.NArg() != 1 {
+		usage()
+		os.Exit(1)
+	}
+
+	log.SetFlags(log.Lshortfile)
+
+	of := get_outfmt(*format)
+	out := parse(flag.Arg(0), of, *debug) 
+
+	fmt.Printf("%s\n", out)
+}

A	LICENSE	\|	15	+++++++++++++++
A	Makefile	\|	22	++++++++++++++++++++++
A	README.md	\|	20	++++++++++++++++++++
A	aozora2fmt.go	\|	363	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++