Commit: 20b2be7e6e78c9ac59165855854494a762e14f71
Parent: 6160a04640f513c17cf282212f0444cb954db875
Author: Randy Palamar
Date: Sun, 16 Feb 2025 09:21:57 -0700
util: utf8_encode: add a replacement char for invalid codepoints
Diffstat:
M | util.c | | | 23 | +++++++++++++---------- |
1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/util.c b/util.c
@@ -574,27 +574,30 @@ static s8
utf8_encode(u32 cp)
{
static u8 buf[4];
- s8 ret = { .data = buf, .len = -1 };
- if (cp < 0x80) {
- ret.len = 1;
+ s8 result = {.data = buf};
+ if (cp <= 0x7F) {
+ result.len = 1;
buf[0] = cp & 0x7F;
- } else if (cp < 0x800) {
- ret.len = 2;
+ } else if (cp <= 0x7FF) {
+ result.len = 2;
buf[0] = ((cp >> 6) & 0x1F) | 0xC0;
buf[1] = ((cp >> 0) & 0x3F) | 0x80;
- } else if (cp < 0x10000) {
- ret.len = 3;
+ } else if (cp <= 0xFFFF) {
+ result.len = 3;
buf[0] = ((cp >> 12) & 0x0F) | 0xE0;
buf[1] = ((cp >> 6) & 0x3F) | 0x80;
buf[2] = ((cp >> 0) & 0x3F) | 0x80;
- } else if (cp < 0x200000) {
- ret.len = 4;
+ } else if (cp <= 0x10FFFF) {
+ result.len = 4;
buf[0] = ((cp >> 18) & 0x07) | 0xF0;
buf[1] = ((cp >> 12) & 0x3F) | 0x80;
buf[2] = ((cp >> 6) & 0x3F) | 0x80;
buf[3] = ((cp >> 0) & 0x3F) | 0x80;
+ } else {
+ buf[0] = '?';
+ result.len = 1;
}
- return ret;
+ return result;
}
#include "extern/utf8_decode.c"