charsets.c (10621B)
1 /* charsets.c 2 * (c) 2002 Mikulas Patocka, Karel 'Clock' Kulhavy 3 * This file is a part of the Links program, released under GPL. 4 */ 5 6 #include <limits.h> 7 #include <wctype.h> 8 9 #include "links.h" 10 11 struct codepage_desc { 12 const char *name; 13 const char *const *aliases; 14 }; 15 16 #include "codepage.inc" 17 #include "entity.inc" 18 #include "upcase.inc" 19 20 static const unsigned char strings[256][2] = { 21 "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", 22 "\011", "\012", "\013", "\014", "\015", "\016", "\017", "\020", "\021", 23 "\022", "\023", "\024", "\025", "\026", "\033", "\030", "\031", "\032", 24 "\033", "\034", "\035", "\036", "\033", "\040", "\041", "\042", "\043", 25 "\044", "\045", "\046", "\047", "\050", "\051", "\052", "\053", "\054", 26 "\055", "\056", "\057", "\060", "\061", "\062", "\063", "\064", "\065", 27 "\066", "\067", "\070", "\071", "\072", "\073", "\074", "\075", "\076", 28 "\077", "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107", 29 "\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117", "\120", 30 "\121", "\122", "\123", "\124", "\125", "\126", "\127", "\130", "\131", 31 "\132", "\133", "\134", "\135", "\136", "\137", "\140", "\141", "\142", 32 "\143", "\144", "\145", "\146", "\147", "\150", "\151", "\152", "\153", 33 "\154", "\155", "\156", "\157", "\160", "\161", "\162", "\163", "\164", 34 "\165", "\166", "\167", "\170", "\171", "\172", "\173", "\174", "\175", 35 "\176", "\177", "\200", "\201", "\202", "\203", "\204", "\205", "\206", 36 "\207", "\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217", 37 "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227", "\230", 38 "\231", "\232", "\233", "\234", "\235", "\236", "\237", "\240", "\241", 39 "\242", "\243", "\244", "\245", "\246", "\247", "\250", "\251", "\252", 40 "\253", "\254", "\255", "\256", "\257", "\260", "\261", "\262", "\263", 41 "\264", "\265", "\266", "\267", "\270", "\271", "\272", "\273", "\274", 42 "\275", "\276", "\277", "\300", "\301", "\302", "\303", "\304", "\305", 43 "\306", "\307", "\310", "\311", "\312", "\313", "\314", "\315", "\316", 44 "\317", "\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327", 45 "\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337", "\340", 46 "\341", "\342", "\343", "\344", "\345", "\346", "\347", "\350", "\351", 47 "\352", "\353", "\354", "\355", "\356", "\357", "\360", "\361", "\362", 48 "\363", "\364", "\365", "\366", "\367", "\370", "\371", "\372", "\373", 49 "\374", "\375", "\376", "\377", 50 }; 51 52 unsigned int 53 locase(unsigned int a) 54 { 55 if (a >= 'A' && a <= 'Z') 56 a += 0x20; 57 return a; 58 } 59 60 unsigned int 61 upcase(unsigned int a) 62 { 63 if (a >= 'a' && a <= 'z') 64 a -= 0x20; 65 return a; 66 } 67 68 unsigned char * 69 u2cp(int u) 70 { 71 return encode_utf_8(u); 72 } 73 74 static unsigned char utf_buffer[7]; 75 76 unsigned char * 77 encode_utf_8(int u) 78 { 79 memset(utf_buffer, 0, 7); 80 if (u < 0) 81 ; 82 else if (u < 0x80) 83 utf_buffer[0] = (unsigned char)u; 84 else if (u < 0x800) { 85 utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f); 86 utf_buffer[1] = 0x80 | (u & 0x3f); 87 } else if (u < 0x10000) { 88 utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f); 89 utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f); 90 utf_buffer[2] = 0x80 | (u & 0x3f); 91 } else if (u < 0x200000) { 92 utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f); 93 utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f); 94 utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f); 95 utf_buffer[3] = 0x80 | (u & 0x3f); 96 } else if (u < 0x4000000) { 97 utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f); 98 utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f); 99 utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f); 100 utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f); 101 utf_buffer[4] = 0x80 | (u & 0x3f); 102 } else { 103 utf_buffer[0] = 0xfc | ((u >> 30) & 0x01); 104 utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f); 105 utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f); 106 utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f); 107 utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f); 108 utf_buffer[5] = 0x80 | (u & 0x3f); 109 } 110 return utf_buffer; 111 } 112 113 static struct conv_table * 114 get_translation_table_to_utf_8(int from) 115 { 116 static struct conv_table utf_table[256]; 117 static int init = 0, lfr = -1; 118 int i; 119 if (from == lfr) 120 return utf_table; 121 lfr = from; 122 if (!init) { 123 for (i = 0; i < 256; i++) 124 utf_table[i].u.str = (unsigned char *)strings[i]; 125 init = 1; 126 } 127 return utf_table; 128 } 129 130 unsigned char utf_8_1[256] = { 131 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 132 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 133 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 134 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 135 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 136 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 137 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 138 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 139 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 140 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 141 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 6, 6, 142 }; 143 144 static const unsigned min_utf_8[8] = { 145 0, 0x4000000, 0x200000, 0x10000, 0x800, 0x80, 0x100, 0x1, 146 }; 147 148 unsigned 149 get_utf_8(unsigned char **s) 150 { 151 unsigned v, min, c; 152 int l; 153 unsigned char *p = *s; 154 l = utf_8_1[p[0]]; 155 min = min_utf_8[l]; 156 v = p[0] & ((1 << l) - 1); 157 (*s)++; 158 while (l++ <= 5) { 159 c = **s - 0x80; 160 if (c >= 0x40) { 161 return 0; 162 } 163 (*s)++; 164 v = (v << 6) + c; 165 } 166 if (v < min) 167 return 0; 168 if (v > 0x10FFFF) 169 return 0; 170 return v; 171 } 172 173 struct conv_table * 174 get_translation_table(const int from, const int to) 175 { 176 if (from == -1 || to == -1) 177 return NULL; 178 return get_translation_table_to_utf_8(from); 179 } 180 181 static inline int 182 xxstrcmp(unsigned char *s1, unsigned char *s2, int l2) 183 { 184 while (l2) { 185 if (*s1 > *s2) 186 return 1; 187 if (!*s1 || *s1 < *s2) 188 return -1; 189 s1++; 190 s2++; 191 l2--; 192 } 193 return !!*s1; 194 } 195 196 int 197 get_entity_number(unsigned char *st, int l) 198 { 199 int n = 0; 200 unsigned char c; 201 if (upcase(st[0]) == 'X') { 202 st++; 203 l--; 204 if (!l) 205 return -1; 206 do { 207 c = upcase(*(st++)); 208 if (c >= '0' && c <= '9') 209 n = n * 16 + c - '0'; 210 else if (c >= 'A' && c <= 'F') 211 n = n * 16 + c - 'A' + 10; 212 else 213 return -1; 214 if (n > 0x10FFFF) 215 return -1; 216 } while (--l); 217 } else { 218 if (!l) 219 return -1; 220 do { 221 c = *(st++); 222 if (c >= '0' && c <= '9') 223 n = n * 10 + c - '0'; 224 else 225 return -1; 226 if (n > 0x10FFFF) 227 return -1; 228 } while (--l); 229 } 230 return n; 231 } 232 233 unsigned char * 234 get_entity_string(unsigned char *st, int l) 235 { 236 int n, c, m, s, e; 237 if (l <= 0) 238 return NULL; 239 if (st[0] == '#') { 240 if ((n = get_entity_number(st + 1, l - 1)) == -1 || l == 1) 241 return NULL; 242 if (n < 32 && get_attr_val_nl != 2) 243 n = 32; 244 } else { 245 for (s = 0, e = N_ENTITIES - 1; s <= e;) { 246 m = (s + e) / 2; 247 c = xxstrcmp(cast_uchar entities[m].s, st, l); 248 if (!c) { 249 n = entities[m].c; 250 goto f; 251 } 252 if (c > 0) 253 e = m - 1; 254 else 255 s = m + 1; 256 } 257 return NULL; 258 f:; 259 } 260 261 return u2cp(n); 262 } 263 264 unsigned char * 265 convert_string(struct conv_table *ct, unsigned char *c, int l, 266 struct document_options *dopt) 267 { 268 unsigned char *buffer, *e = NULL; 269 struct conv_table *t; 270 int i, bp = 0, pp = 0; 271 if (!ct) { 272 for (i = 0; i < l; i++) 273 if (c[i] == '&') 274 goto xx; 275 return memacpy(c, l); 276 xx:; 277 } 278 buffer = xmalloc(ALLOC_GR); 279 while (pp < l) { 280 if (c[pp] < 128 && c[pp] != '&') { 281 put_c: 282 buffer[bp++] = c[pp++]; 283 if (!(bp & (ALLOC_GR - 1))) { 284 if ((unsigned)bp > INT_MAX - ALLOC_GR) 285 overalloc(); 286 buffer = xrealloc(buffer, bp + ALLOC_GR); 287 } 288 continue; 289 } 290 if (c[pp] != '&') { 291 if (!ct) 292 goto put_c; 293 t = ct; 294 i = pp; 295 decode: 296 if (!t[c[i]].t) { 297 e = t[c[i]].u.str; 298 } else { 299 t = t[c[i++]].u.tbl; 300 if (i >= l) 301 goto put_c; 302 goto decode; 303 } 304 pp = i + 1; 305 } else { 306 i = pp + 1; 307 if (!dopt || dopt->plain) 308 goto put_c; 309 while (i < l && !is_entity_terminator(c[i])) 310 i++; 311 if (!(e = get_entity_string(&c[pp + 1], i - pp - 1))) 312 goto put_c; 313 pp = i + (i < l && c[i] == ';'); 314 } 315 if (!e[0]) 316 continue; 317 if (!e[1]) { 318 buffer[bp++] = e[0]; 319 if (!(bp & (ALLOC_GR - 1))) { 320 if ((unsigned)bp > INT_MAX - ALLOC_GR) 321 overalloc(); 322 buffer = xrealloc(buffer, bp + ALLOC_GR); 323 } 324 continue; 325 } 326 while (*e) { 327 buffer[bp++] = *(e++); 328 if (!(bp & (ALLOC_GR - 1))) { 329 if ((unsigned)bp > INT_MAX - ALLOC_GR) 330 overalloc(); 331 buffer = xrealloc(buffer, bp + ALLOC_GR); 332 } 333 } 334 } 335 buffer[bp] = 0; 336 return buffer; 337 } 338 339 unsigned char * 340 convert(int from, int to, unsigned char *c, struct document_options *dopt) 341 { 342 unsigned char *cc; 343 struct conv_table *ct; 344 345 for (cc = c; *cc; cc++) 346 if (*cc == '&' && dopt && !dopt->plain) 347 goto need_table; 348 return stracpy(c); 349 350 need_table: 351 ct = get_translation_table(from, to); 352 return convert_string(ct, c, strlen((char *)c), dopt); 353 } 354 355 unsigned char * 356 get_cp_name(int index) 357 { 358 if (index < 0) 359 return (unsigned char *)"none"; 360 return (unsigned char *)codepages[index].name; 361 } 362 363 unsigned char * 364 get_cp_mime_name(int index) 365 { 366 if (!codepages[index].aliases) 367 return NULL; 368 return (unsigned char *)codepages[index].aliases[0]; 369 } 370 371 unsigned 372 uni_locase(unsigned ch) 373 { 374 return towlower(ch); 375 } 376 377 #define UP_EQUAL(a, b) unicode_upcase[a].o == (b) 378 #define UP_ABOVE(a, b) unicode_upcase[a].o > (b) 379 380 unsigned 381 charset_upcase(unsigned ch, int cp) 382 { 383 return towupper(ch); 384 } 385 386 void 387 charset_upcase_string(unsigned char **chp, int cp) 388 { 389 unsigned char *ch = *chp; 390 ch = unicode_upcase_string(ch); 391 free(*chp); 392 *chp = ch; 393 } 394 395 unsigned char * 396 unicode_upcase_string(unsigned char *ch) 397 { 398 unsigned char *r = NULL; 399 unsigned int c; 400 size_t rl = 0; 401 for (;;) { 402 GET_UTF_8(ch, c); 403 if (!c) 404 break; 405 c = towupper(c); 406 rl = add_to_str(&r, rl, encode_utf_8(c)); 407 } 408 return r; 409 } 410 411 unsigned char * 412 to_utf8_upcase(unsigned char *str, int cp) 413 { 414 unsigned char *str1, *str2; 415 str1 = stracpy(str); 416 str2 = unicode_upcase_string(str1); 417 free(str1); 418 return str2; 419 } 420 421 int 422 compare_case_utf8(unsigned char *u1, unsigned char *u2) 423 { 424 unsigned char *x1, *uu1 = u1; 425 unsigned c1, c2; 426 int cc1; 427 for (;;) { 428 GET_UTF_8(u2, c2); 429 if (!c2) 430 return (int)(u1 - uu1); 431 skip_discr: 432 GET_UTF_8(u1, c1); 433 BIN_SEARCH(array_elements(unicode_upcase), UP_EQUAL, UP_ABOVE, 434 c1, cc1); 435 if (cc1 != -1) 436 c1 = unicode_upcase[cc1].n; 437 if (c1 == 0xad) 438 goto skip_discr; 439 if (c1 != c2) 440 return 0; 441 if (c1 == ' ') { 442 do { 443 x1 = u1; 444 GET_UTF_8(u1, c1); 445 BIN_SEARCH(array_elements(unicode_upcase), 446 UP_EQUAL, UP_ABOVE, c1, cc1); 447 if (cc1 >= 0) 448 c1 = unicode_upcase[cc1].n; 449 } while (c1 == ' '); 450 u1 = x1; 451 } 452 } 453 }