aozora2fmt

a tool for converting Aozora Bunko files to better formats
git clone anongit@rnpnr.xyz:aozora2fmt.git
Log | Files | Refs | Feed | README | LICENSE

main.go (4803B)


      1 /* See LICENSE for license details. */
      2 package main
      3 
      4 import (
      5 	"bufio"
      6 	"flag"
      7 	"fmt"
      8 	"log"
      9 	"os"
     10 	"regexp"
     11 	"strings"
     12 	"unicode/utf8"
     13 
     14 	"aozora2fmt"
     15 )
     16 
     17 type OutFmt struct {
     18 	ruby  string /* Ruby output format */
     19 	hdr   string /* Header format */
     20 	shdr  string /* Subheader format */
     21 	sshdr string /* Subsubheader format */
     22 	pb    string /* Page Break text */
     23 }
     24 
     25 func usage() {
     26 	fmt.Fprintf(os.Stderr, "usage: %s [-d] [-f format] file\n", os.Args[0])
     27 	flag.PrintDefaults()
     28 }
     29 
     30 func get_outfmt(fmt string) *OutFmt {
     31 	of := new(OutFmt)
     32 
     33 	switch fmt {
     34 	case "tex":
     35 		of.ruby  = "\\ruby{%s}{%s}"
     36 		of.hdr   = "\\chapter{%s}"
     37 		of.shdr  = "\\section*{%s}"
     38 		of.sshdr = "\\subsection*{%s}"
     39 		of.pb    = "\\newpage"
     40 	case "md":
     41 		of.ruby  = "<ruby>%s<rp>《</rp><rt>%s</rt><rp>》</rp></ruby>"
     42 		of.hdr   = "# %s"
     43 		of.shdr  = "## %s"
     44 		of.sshdr = "### %s"
     45 		of.pb    = "<div style='break-after:always'></div>"
     46 	case "plain":
     47 		of.ruby  = "[%s:%s]"
     48 		of.hdr   = "%s"
     49 		of.shdr  = "%s"
     50 		of.sshdr = "%s"
     51 		of.pb    = ""
     52 	}
     53 
     54 	return of
     55 }
     56 
     57 func replace_jis(str string) string {
     58 	exp := regexp.MustCompile(`※[#[^」]+」、([^]]+)]`)
     59 
     60 	for _, matches := range exp.FindAllStringSubmatch(str, -1) {
     61 		p, m, k, t := 0, 0, 0, 0
     62 		n, _ := fmt.Sscanf(matches[1], `第%01d水準%01d-%02d-%02d`, &p, &m, &k, &t)
     63 
     64 		if n != 4 {
     65 			/* the same character appeared multiple times in str */
     66 			continue
     67 		}
     68 
     69 		str = strings.Replace(str, matches[0], aozora2fmt.Jis2Utf8(m, k, t), -1)
     70 	}
     71 
     72 	return str
     73 }
     74 
     75 func replace_ruby(str string, of *OutFmt) string {
     76 	kanji := `\x{3400}-\x{4DBF}` +   /* CJK Unified Ideographs Extension A */
     77 		 `\x{4E00}-\x{9FFF}` +   /* CJK Unified Ideographs */
     78 		 `\x{F900}-\x{FAFF}` +   /* CJK Compatibility Ideographs */
     79 		 `\x{20000}-\x{2FA1F}` + /* CJK Unified Ideographs Extension B - F, Supplement */
     80 		 `〆〻〇々ヶ`
     81 	ruby_exp := regexp.MustCompile(`[|]?([` + kanji + `]+)《([^》]+)》`)
     82 	for _, matches := range ruby_exp.FindAllStringSubmatch(str, -1) {
     83 		replacement := fmt.Sprintf(of.ruby, matches[1], matches[2])
     84 		str = strings.Replace(str, matches[0], replacement, -1)
     85 	}
     86 
     87 	bouten_exp := regexp.MustCompile(`[#「([^」]+)」に傍点]`)
     88 	for _, matches := range bouten_exp.FindAllStringSubmatch(str, -1) {
     89 		bouten := strings.Repeat("﹅", utf8.RuneCountInString(matches[1]))
     90 		replacement := fmt.Sprintf(of.ruby, matches[1], bouten)
     91 		str = strings.Replace(str, matches[1] + matches[0], replacement, -1)
     92 	}
     93 
     94 	return str
     95 }
     96 
     97 func replace_accents(str string) string {
     98 	exp := regexp.MustCompile(`〔([^〕]+)〕`)
     99 
    100 	for _, matches := range exp.FindAllStringSubmatch(str, -1) {
    101 		str = strings.Replace(str, matches[0], matches[1], -1)
    102 
    103 		m := aozora2fmt.AccentMap()
    104 		for key := range m {
    105 			str = strings.ReplaceAll(str, key, m[key])
    106 		}
    107 	}
    108 
    109 	return str
    110 }
    111 
    112 func replace_hdrs(str string, of *OutFmt) string {
    113 	exp := regexp.MustCompile(`\n\n[[^[]+[#「([^」]+)」は([大中小])見出し]\n\n\n`)
    114 	slices := exp.FindAllStringSubmatch(str, -1)
    115 	if slices == nil {
    116 		exp = regexp.MustCompile(`\n\n\n([^\n]+)\n\n\n`)
    117 		for _, matches := range exp.FindAllStringSubmatch(str, -1) {
    118 			replacement := "\n" + fmt.Sprintf(of.hdr, matches[1]) + "\n"
    119 			str = strings.Replace(str, matches[0], replacement, -1)
    120 		}
    121 		return str
    122 	}
    123 
    124 	for _, matches := range slices {
    125 		var replacement string
    126 		switch matches[2] {
    127 		case "大":
    128 			replacement = fmt.Sprintf(of.hdr, matches[1])
    129 		case "中":
    130 			replacement = fmt.Sprintf(of.shdr, matches[1])
    131 		case "小":
    132 			replacement = fmt.Sprintf(of.sshdr, matches[1])
    133 		default:
    134 			log.Printf("bad hdr: %s\n", matches[0])
    135 			replacement = matches[1]
    136 		}
    137 		str = strings.Replace(str, matches[0], replacement + "\n", -1)
    138 	}
    139 
    140 	return str
    141 }
    142 
    143 func trim_info(str string) string {
    144 	delim := "\n" + strings.Repeat("-", 55) + "\n"
    145 
    146 	slices := strings.Split(str, delim)
    147 
    148 	return strings.Join([]string{slices[0], slices[2]}, "")
    149 }
    150 
    151 func parse(file string, of *OutFmt, debug bool) string {
    152 	f, err := os.Open(file)
    153 	defer f.Close()
    154 	if err != nil {
    155 		log.Fatal(err)
    156 	}
    157 
    158 	var lines []string
    159 	r := bufio.NewScanner(f)
    160 	for r.Scan() {
    161 		line := strings.Trim(r.Text(), " ")
    162 		line = replace_jis(line)
    163 		line = replace_ruby(line, of)
    164 		line = replace_accents(line)
    165 		lines = append(lines, line)
    166 	}
    167 
    168 	out := strings.Join(lines, "\n\n");
    169 	out = replace_hdrs(out, of)
    170 	out = strings.Replace(out, "[#改ページ]", of.pb, -1)
    171 
    172 	if (debug == false) {
    173 		out = trim_info(out)
    174 	}
    175 
    176 	return out
    177 }
    178 
    179 func main() {
    180 	var (
    181 		debug = flag.Bool("d", false, "debug mode")
    182 		format = flag.String("f", "plain", "output format [plain|md|tex]")
    183 	)
    184 
    185 	flag.Usage = usage
    186 	flag.Parse()
    187 
    188 	if flag.NArg() != 1 {
    189 		usage()
    190 		os.Exit(1)
    191 	}
    192 
    193 	log.SetFlags(log.Lshortfile)
    194 
    195 	of := get_outfmt(*format)
    196 	out := parse(flag.Arg(0), of, *debug) 
    197 
    198 	fmt.Printf("%s\n", out)
    199 }