links

lynx-like text mode web browser
git clone anongit@rnpnr.xyz:links.git
Log | Files | Refs | Feed | README | LICENSE

charsets.c (10621B)


      1 /* charsets.c
      2  * (c) 2002 Mikulas Patocka, Karel 'Clock' Kulhavy
      3  * This file is a part of the Links program, released under GPL.
      4  */
      5 
      6 #include <limits.h>
      7 #include <wctype.h>
      8 
      9 #include "links.h"
     10 
/*
 * Description of one codepage.  get_cp_name() and get_cp_mime_name() read
 * these fields from the codepages[] table (presumably supplied by
 * codepage.inc -- confirm).
 */
struct codepage_desc {
	/* Name of the codepage; this is what get_cp_name() returns. */
	const char *name;
	/*
	 * Alias list; may be NULL.  get_cp_mime_name() returns aliases[0]
	 * when the list is present.
	 */
	const char *const *aliases;
};
     15 
     16 #include "codepage.inc"
     17 #include "entity.inc"
     18 #include "upcase.inc"
     19 
/*
 * One two-byte, NUL-terminated string per byte value: strings[i] holds a
 * single byte followed by the terminator.  get_translation_table_to_utf_8()
 * points each conv_table entry's u.str at the matching row.
 *
 * NOTE(review): rows 027 and 037 contain "\033" instead of their own value;
 * every other row is the identity.  This looks deliberate but the reason is
 * not visible here -- confirm before "fixing" it.
 */
static const unsigned char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010",
	"\011", "\012", "\013", "\014", "\015", "\016", "\017", "\020", "\021",
	"\022", "\023", "\024", "\025", "\026", "\033", "\030", "\031", "\032",
	"\033", "\034", "\035", "\036", "\033", "\040", "\041", "\042", "\043",
	"\044", "\045", "\046", "\047", "\050", "\051", "\052", "\053", "\054",
	"\055", "\056", "\057", "\060", "\061", "\062", "\063", "\064", "\065",
	"\066", "\067", "\070", "\071", "\072", "\073", "\074", "\075", "\076",
	"\077", "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117", "\120",
	"\121", "\122", "\123", "\124", "\125", "\126", "\127", "\130", "\131",
	"\132", "\133", "\134", "\135", "\136", "\137", "\140", "\141", "\142",
	"\143", "\144", "\145", "\146", "\147", "\150", "\151", "\152", "\153",
	"\154", "\155", "\156", "\157", "\160", "\161", "\162", "\163", "\164",
	"\165", "\166", "\167", "\170", "\171", "\172", "\173", "\174", "\175",
	"\176", "\177", "\200", "\201", "\202", "\203", "\204", "\205", "\206",
	"\207", "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227", "\230",
	"\231", "\232", "\233", "\234", "\235", "\236", "\237", "\240", "\241",
	"\242", "\243", "\244", "\245", "\246", "\247", "\250", "\251", "\252",
	"\253", "\254", "\255", "\256", "\257", "\260", "\261", "\262", "\263",
	"\264", "\265", "\266", "\267", "\270", "\271", "\272", "\273", "\274",
	"\275", "\276", "\277", "\300", "\301", "\302", "\303", "\304", "\305",
	"\306", "\307", "\310", "\311", "\312", "\313", "\314", "\315", "\316",
	"\317", "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337", "\340",
	"\341", "\342", "\343", "\344", "\345", "\346", "\347", "\350", "\351",
	"\352", "\353", "\354", "\355", "\356", "\357", "\360", "\361", "\362",
	"\363", "\364", "\365", "\366", "\367", "\370", "\371", "\372", "\373",
	"\374", "\375", "\376", "\377",
};
     51 
     52 unsigned int
     53 locase(unsigned int a)
     54 {
     55 	if (a >= 'A' && a <= 'Z')
     56 		a += 0x20;
     57 	return a;
     58 }
     59 
     60 unsigned int
     61 upcase(unsigned int a)
     62 {
     63 	if (a >= 'a' && a <= 'z')
     64 		a -= 0x20;
     65 	return a;
     66 }
     67 
     68 unsigned char *
     69 u2cp(int u)
     70 {
     71 	return encode_utf_8(u);
     72 }
     73 
     74 static unsigned char utf_buffer[7];
     75 
     76 unsigned char *
     77 encode_utf_8(int u)
     78 {
     79 	memset(utf_buffer, 0, 7);
     80 	if (u < 0)
     81 		;
     82 	else if (u < 0x80)
     83 		utf_buffer[0] = (unsigned char)u;
     84 	else if (u < 0x800) {
     85 		utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f);
     86 		utf_buffer[1] = 0x80 | (u & 0x3f);
     87 	} else if (u < 0x10000) {
     88 		utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f);
     89 		utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f);
     90 		utf_buffer[2] = 0x80 | (u & 0x3f);
     91 	} else if (u < 0x200000) {
     92 		utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f);
     93 		utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f);
     94 		utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f);
     95 		utf_buffer[3] = 0x80 | (u & 0x3f);
     96 	} else if (u < 0x4000000) {
     97 		utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f);
     98 		utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f);
     99 		utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f);
    100 		utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f);
    101 		utf_buffer[4] = 0x80 | (u & 0x3f);
    102 	} else {
    103 		utf_buffer[0] = 0xfc | ((u >> 30) & 0x01);
    104 		utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f);
    105 		utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f);
    106 		utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f);
    107 		utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f);
    108 		utf_buffer[5] = 0x80 | (u & 0x3f);
    109 	}
    110 	return utf_buffer;
    111 }
    112 
    113 static struct conv_table *
    114 get_translation_table_to_utf_8(int from)
    115 {
    116 	static struct conv_table utf_table[256];
    117 	static int init = 0, lfr = -1;
    118 	int i;
    119 	if (from == lfr)
    120 		return utf_table;
    121 	lfr = from;
    122 	if (!init) {
    123 		for (i = 0; i < 256; i++)
    124 			utf_table[i].u.str = (unsigned char *)strings[i];
    125 		init = 1;
    126 	}
    127 	return utf_table;
    128 }
    129 
/*
 * Classification of a UTF-8 lead byte, indexed by the byte value.
 *
 * get_utf_8() uses the entry `l` three ways: it keeps the low `l` bits of
 * the lead byte as payload, it reads one continuation byte for each step
 * from `l` up to 5, and it looks up the smallest acceptable result in
 * min_utf_8[l].
 *
 *   0x00         6  rejected (get_utf_8 returns 0)
 *   0x01 - 0x7f  7  single byte
 *   0x80 - 0xc1  6  rejected (stray continuation / overlong lead)
 *   0xc2 - 0xdf  5  one continuation byte
 *   0xe0 - 0xef  4  two continuation bytes
 *   0xf0 - 0xf7  3  three continuation bytes
 *   0xf8 - 0xfb  2  four continuation bytes
 *   0xfc - 0xfd  1  five continuation bytes
 *   0xfe - 0xff  6  rejected
 *
 * Class 6 is rejected because a 6-bit payload can never reach
 * min_utf_8[6] (0x100).
 */
unsigned char utf_8_1[256] = {
	6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
	5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
	3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 6, 6,
};
    143 
    144 static const unsigned min_utf_8[8] = {
    145 	0, 0x4000000, 0x200000, 0x10000, 0x800, 0x80, 0x100, 0x1,
    146 };
    147 
    148 unsigned
    149 get_utf_8(unsigned char **s)
    150 {
    151 	unsigned v, min, c;
    152 	int l;
    153 	unsigned char *p = *s;
    154 	l = utf_8_1[p[0]];
    155 	min = min_utf_8[l];
    156 	v = p[0] & ((1 << l) - 1);
    157 	(*s)++;
    158 	while (l++ <= 5) {
    159 		c = **s - 0x80;
    160 		if (c >= 0x40) {
    161 			return 0;
    162 		}
    163 		(*s)++;
    164 		v = (v << 6) + c;
    165 	}
    166 	if (v < min)
    167 		return 0;
    168 	if (v > 0x10FFFF)
    169 		return 0;
    170 	return v;
    171 }
    172 
    173 struct conv_table *
    174 get_translation_table(const int from, const int to)
    175 {
    176 	if (from == -1 || to == -1)
    177 		return NULL;
    178 	return get_translation_table_to_utf_8(from);
    179 }
    180 
    181 static inline int
    182 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
    183 {
    184 	while (l2) {
    185 		if (*s1 > *s2)
    186 			return 1;
    187 		if (!*s1 || *s1 < *s2)
    188 			return -1;
    189 		s1++;
    190 		s2++;
    191 		l2--;
    192 	}
    193 	return !!*s1;
    194 }
    195 
    196 int
    197 get_entity_number(unsigned char *st, int l)
    198 {
    199 	int n = 0;
    200 	unsigned char c;
    201 	if (upcase(st[0]) == 'X') {
    202 		st++;
    203 		l--;
    204 		if (!l)
    205 			return -1;
    206 		do {
    207 			c = upcase(*(st++));
    208 			if (c >= '0' && c <= '9')
    209 				n = n * 16 + c - '0';
    210 			else if (c >= 'A' && c <= 'F')
    211 				n = n * 16 + c - 'A' + 10;
    212 			else
    213 				return -1;
    214 			if (n > 0x10FFFF)
    215 				return -1;
    216 		} while (--l);
    217 	} else {
    218 		if (!l)
    219 			return -1;
    220 		do {
    221 			c = *(st++);
    222 			if (c >= '0' && c <= '9')
    223 				n = n * 10 + c - '0';
    224 			else
    225 				return -1;
    226 			if (n > 0x10FFFF)
    227 				return -1;
    228 		} while (--l);
    229 	}
    230 	return n;
    231 }
    232 
    233 unsigned char *
    234 get_entity_string(unsigned char *st, int l)
    235 {
    236 	int n, c, m, s, e;
    237 	if (l <= 0)
    238 		return NULL;
    239 	if (st[0] == '#') {
    240 		if ((n = get_entity_number(st + 1, l - 1)) == -1 || l == 1)
    241 			return NULL;
    242 		if (n < 32 && get_attr_val_nl != 2)
    243 			n = 32;
    244 	} else {
    245 		for (s = 0, e = N_ENTITIES - 1; s <= e;) {
    246 			m = (s + e) / 2;
    247 			c = xxstrcmp(cast_uchar entities[m].s, st, l);
    248 			if (!c) {
    249 				n = entities[m].c;
    250 				goto f;
    251 			}
    252 			if (c > 0)
    253 				e = m - 1;
    254 			else
    255 				s = m + 1;
    256 		}
    257 		return NULL;
    258 f:;
    259 	}
    260 
    261 	return u2cp(n);
    262 }
    263 
/*
 * Produce a newly allocated, NUL-terminated copy of the `l` bytes at `c`,
 * translating bytes >= 128 through the conversion table `ct` (when given)
 * and expanding "&entity;" references (when `dopt` is non-NULL and
 * dopt->plain is clear).
 *
 * The output buffer starts at ALLOC_GR bytes and is enlarged by ALLOC_GR
 * every time the write position reaches a multiple of ALLOC_GR, so there is
 * always room for the final terminator.  The `bp & (ALLOC_GR - 1)` test
 * presumably relies on ALLOC_GR being a power of two -- confirm.
 */
unsigned char *
convert_string(struct conv_table *ct, unsigned char *c, int l,
               struct document_options *dopt)
{
	unsigned char *buffer, *e = NULL;
	struct conv_table *t;
	int i, bp = 0, pp = 0;
	if (!ct) {
		/* Without a table only '&' needs work; otherwise plain copy. */
		for (i = 0; i < l; i++)
			if (c[i] == '&')
				goto xx;
		return memacpy(c, l);
xx:;
	}
	buffer = xmalloc(ALLOC_GR);
	while (pp < l) {
		if (c[pp] < 128 && c[pp] != '&') {
			/*
			 * put_c copies the byte at pp verbatim.  It is also
			 * the fallback target for every "cannot convert"
			 * case below.
			 */
put_c:
			buffer[bp++] = c[pp++];
			if (!(bp & (ALLOC_GR - 1))) {
				if ((unsigned)bp > INT_MAX - ALLOC_GR)
					overalloc();
				buffer = xrealloc(buffer, bp + ALLOC_GR);
			}
			continue;
		}
		if (c[pp] != '&') {
			/* High byte: pass through if there is no table. */
			if (!ct)
				goto put_c;
			t = ct;
			i = pp;
			/*
			 * Walk the table: an entry with .t set links to a
			 * sub-table indexed by the next input byte, an entry
			 * with .t clear carries the replacement string.
			 */
decode:
			if (!t[c[i]].t) {
				e = t[c[i]].u.str;
			} else {
				t = t[c[i++]].u.tbl;
				/* Input ended mid-sequence: emit c[pp] as is. */
				if (i >= l)
					goto put_c;
				goto decode;
			}
			pp = i + 1;
		} else {
			i = pp + 1;
			/* Entities are left alone without options or in plain mode. */
			if (!dopt || dopt->plain)
				goto put_c;
			while (i < l && !is_entity_terminator(c[i]))
				i++;
			/* Unknown entity: emit the '&' literally and carry on. */
			if (!(e = get_entity_string(&c[pp + 1], i - pp - 1)))
				goto put_c;
			/* Skip the entity and a directly following ';' if present. */
			pp = i + (i < l && c[i] == ';');
		}
		/* Empty replacement: nothing to emit. */
		if (!e[0])
			continue;
		/* Single-byte replacement. */
		if (!e[1]) {
			buffer[bp++] = e[0];
			if (!(bp & (ALLOC_GR - 1))) {
				if ((unsigned)bp > INT_MAX - ALLOC_GR)
					overalloc();
				buffer = xrealloc(buffer, bp + ALLOC_GR);
			}
			continue;
		}
		/* Multi-byte replacement: copy up to its terminator. */
		while (*e) {
			buffer[bp++] = *(e++);
			if (!(bp & (ALLOC_GR - 1))) {
				if ((unsigned)bp > INT_MAX - ALLOC_GR)
					overalloc();
				buffer = xrealloc(buffer, bp + ALLOC_GR);
			}
		}
	}
	buffer[bp] = 0;
	return buffer;
}
    338 
    339 unsigned char *
    340 convert(int from, int to, unsigned char *c, struct document_options *dopt)
    341 {
    342 	unsigned char *cc;
    343 	struct conv_table *ct;
    344 
    345 	for (cc = c; *cc; cc++)
    346 		if (*cc == '&' && dopt && !dopt->plain)
    347 			goto need_table;
    348 	return stracpy(c);
    349 
    350 need_table:
    351 	ct = get_translation_table(from, to);
    352 	return convert_string(ct, c, strlen((char *)c), dopt);
    353 }
    354 
    355 unsigned char *
    356 get_cp_name(int index)
    357 {
    358 	if (index < 0)
    359 		return (unsigned char *)"none";
    360 	return (unsigned char *)codepages[index].name;
    361 }
    362 
    363 unsigned char *
    364 get_cp_mime_name(int index)
    365 {
    366 	if (!codepages[index].aliases)
    367 		return NULL;
    368 	return (unsigned char *)codepages[index].aliases[0];
    369 }
    370 
    371 unsigned
    372 uni_locase(unsigned ch)
    373 {
    374 	return towlower(ch);
    375 }
    376 
    377 #define UP_EQUAL(a, b) unicode_upcase[a].o == (b)
    378 #define UP_ABOVE(a, b) unicode_upcase[a].o > (b)
    379 
    380 unsigned
    381 charset_upcase(unsigned ch, int cp)
    382 {
    383 	return towupper(ch);
    384 }
    385 
    386 void
    387 charset_upcase_string(unsigned char **chp, int cp)
    388 {
    389 	unsigned char *ch = *chp;
    390 	ch = unicode_upcase_string(ch);
    391 	free(*chp);
    392 	*chp = ch;
    393 }
    394 
    395 unsigned char *
    396 unicode_upcase_string(unsigned char *ch)
    397 {
    398 	unsigned char *r = NULL;
    399 	unsigned int c;
    400 	size_t rl = 0;
    401 	for (;;) {
    402 		GET_UTF_8(ch, c);
    403 		if (!c)
    404 			break;
    405 		c = towupper(c);
    406 		rl = add_to_str(&r, rl, encode_utf_8(c));
    407 	}
    408 	return r;
    409 }
    410 
    411 unsigned char *
    412 to_utf8_upcase(unsigned char *str, int cp)
    413 {
    414 	unsigned char *str1, *str2;
    415 	str1 = stracpy(str);
    416 	str2 = unicode_upcase_string(str1);
    417 	free(str1);
    418 	return str2;
    419 }
    420 
    421 int
    422 compare_case_utf8(unsigned char *u1, unsigned char *u2)
    423 {
    424 	unsigned char *x1, *uu1 = u1;
    425 	unsigned c1, c2;
    426 	int cc1;
    427 	for (;;) {
    428 		GET_UTF_8(u2, c2);
    429 		if (!c2)
    430 			return (int)(u1 - uu1);
    431 skip_discr:
    432 		GET_UTF_8(u1, c1);
    433 		BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE,
    434 		           c1, cc1);
    435 		if (cc1 != -1)
    436 			c1 = unicode_upcase[cc1].n;
    437 		if (c1 == 0xad)
    438 			goto skip_discr;
    439 		if (c1 != c2)
    440 			return 0;
    441 		if (c1 == ' ') {
    442 			do {
    443 				x1 = u1;
    444 				GET_UTF_8(u1, c1);
    445 				BIN_SEARCH(array_elements(unicode_upcase),
    446 				           UP_EQUAL, UP_ABOVE, c1, cc1);
    447 				if (cc1 >= 0)
    448 					c1 = unicode_upcase[cc1].n;
    449 			} while (c1 == ' ');
    450 			u1 = x1;
    451 		}
    452 	}
    453 }