Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 31 additions & 16 deletions cutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,17 +276,19 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)

/* Decode a single code point from a UTF-8 encoded array of bytes
`p` is a valid pointer to an array of bytes
`max_len` is the number of bytes available in the array
`pp` is a valid pointer to a `const uint8_t *` to store a pointer
to the byte following the current sequence.
Return the code point at `p`, in the range `0..0x10FFFF`
Return 0xFFFD on error. Only a single byte is consumed in this case.
The maximum length for a UTF-8 byte sequence is 4 bytes.
This implements the algorithm specified in whatwg.org, except it accepts
UTF-8 encoded surrogates as JavaScript allows them in strings.
The source string is assumed to have at least UTF8_CHAR_LEN_MAX bytes
or be null terminated.
If `p[0]` is '\0', the return value is `0` and the byte is consumed.
cf: https://encoding.spec.whatwg.org/#utf-8-decoder
*/
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp)
{
uint32_t c;
uint8_t lower, upper;
Expand All @@ -305,10 +307,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
case 0xD4: case 0xD5: case 0xD6: case 0xD7:
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
if (max_len < 2) {
// need more bytes
break;
}
if (*p >= 0x80 && *p <= 0xBF) {
*pp = p + 1;
return ((c - 0xC0) << 6) + (*p - 0x80);
Expand All @@ -324,10 +322,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
case 0xEC: case 0xED: case 0xEE: case 0xEF:
lower = 0x80;
need2:
if (max_len < 3) {
// need more bytes
break;
}
if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
*pp = p + 2;
return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
Expand All @@ -346,10 +340,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
lower = 0x80;
upper = 0xBF;
need3:
if (max_len < 4) {
// need more bytes
break;
}
if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
&& p[2] >= 0x80 && p[2] <= 0xBF) {
*pp = p + 3;
Expand All @@ -366,6 +356,31 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
return 0xFFFD;
}

/* Length-bounded front end for utf8_decode().
   `p` is a valid pointer to an array of bytes
   `max_len` is the number of bytes available at `p`
   `pp` receives the pointer to the byte following the consumed input.
   The lead byte is checked against the number of available bytes
   before delegating to utf8_decode(): a lead byte below 0x80 needs
   1 byte, below 0xE0 at most 2, below 0xF0 at most 3, and anything
   else is only passed on when at least 4 bytes are available.
   Return 0xFFFD without consuming anything if `max_len` is 0.
   Return 0xFFFD and consume a single byte if the lead byte may
   require more bytes than are available.
   Otherwise return whatever utf8_decode() returns.
 */
uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp) {
    /* empty input: nothing can be consumed */
    if (max_len == 0) {
        *pp = p;
        return 0xFFFD;
    }

    /* the lead byte is only inspected when fewer than 4 bytes remain */
    if (max_len >= 4
    ||  (max_len == 3 && *p < 0xF0)
    ||  (max_len == 2 && *p < 0xE0)
    ||  (max_len == 1 && *p < 0x80)) {
        return utf8_decode(p, pp);
    }

    /* possibly truncated sequence: report an error, skip one byte */
    *pp = p + 1;
    return 0xFFFD;
}

/* Scan a UTF-8 encoded buffer for content type
`buf` is a valid pointer to a UTF-8 encoded string
`len` is the number of bytes to scan
Expand Down Expand Up @@ -399,7 +414,7 @@ int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
len++;
if (*p++ >= 0x80) {
/* parse UTF-8 sequence, check for encoding error */
uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
uint32_t c = utf8_decode_len(p - 1, p_end - (p - 1), &p_next);
if (p_next == p)
kind |= UTF8_HAS_ERRORS;
p = p_next;
Expand Down Expand Up @@ -464,7 +479,7 @@ size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_
uint32_t c = *p++;
if (c >= 0x80) {
/* parse utf-8 sequence */
c = utf8_decode(p - 1, p_end - (p - 1), &p);
c = utf8_decode_len(p - 1, p_end - (p - 1), &p);
/* encoding errors are converted as 0xFFFD and use a single byte */
if (c > 0xFFFF) {
if (i < dest_len)
Expand Down
3 changes: 2 additions & 1 deletion cutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,8 @@ enum {
int utf8_scan(const char *buf, size_t len, size_t *plen);
size_t utf8_encode_len(uint32_t c);
size_t utf8_encode(uint8_t *buf, uint32_t c);
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp);
uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp);
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
Expand Down
6 changes: 3 additions & 3 deletions libregexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
normal_char:
p++;
if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p)
return re_parse_error(s, "invalid UTF-8 sequence");
p = p_next;
Expand Down Expand Up @@ -1125,12 +1125,12 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
} else if (c == '>') {
break;
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p)
return -1;
p = p_next;
if (is_hi_surrogate(c)) {
d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
d = utf8_decode(p, &p_next);
if (is_lo_surrogate(d)) {
c = from_surrogate(c, d);
p = p_next;
Expand Down
2 changes: 1 addition & 1 deletion quickjs-libc.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ static JSValue js_printf_internal(JSContext *ctx,
string_arg = JS_ToCString(ctx, argv[i++]);
if (!string_arg)
goto fail;
int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
int32_arg = utf8_decode((const uint8_t *)string_arg, &p);
JS_FreeCString(ctx, string_arg);
} else {
if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
Expand Down
32 changes: 16 additions & 16 deletions quickjs.c
Original file line number Diff line number Diff line change
Expand Up @@ -10049,7 +10049,7 @@ static int skip_spaces(const char *pc)
if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
break;
} else {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not a space */
if (!lre_is_space(c))
break;
Expand Down Expand Up @@ -18724,7 +18724,7 @@ static __exception int js_parse_template_part(JSParseState *s,
s->eol = &p[-1];
s->mark = p;
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p) {
js_parse_error(s, "invalid UTF-8 sequence");
goto fail;
Expand Down Expand Up @@ -18830,7 +18830,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
}
goto fail;
} else if (c >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
if (p_next == p + 1) {
goto invalid_utf8;
}
Expand All @@ -18856,7 +18856,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
break;
}
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p)
goto invalid_utf8;
p = p_next;
Expand Down Expand Up @@ -18928,7 +18928,7 @@ static __exception int js_parse_regexp(JSParseState *s)
else if (c == '\0' && p >= s->buf_end)
goto eof_error;
else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p) {
goto invalid_utf8;
}
Expand All @@ -18937,7 +18937,7 @@ static __exception int js_parse_regexp(JSParseState *s)
goto eol_error;
}
} else if (c >= 0x80) {
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p) {
invalid_utf8:
js_parse_error(s, "invalid UTF-8 sequence");
Expand All @@ -18957,7 +18957,7 @@ static __exception int js_parse_regexp(JSParseState *s)

/* flags */
for(;;) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
if (!lre_js_is_ident_next(c))
break;
Expand Down Expand Up @@ -19031,7 +19031,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
c = lre_parse_escape(&p_next, TRUE);
*pident_has_escape = TRUE;
} else if (c >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
}
if (!lre_js_is_ident_next(c))
Expand Down Expand Up @@ -19135,7 +19135,7 @@ static __exception int next_token(JSParseState *s)
s->got_lf = TRUE; /* considered as LF for ASI */
p++;
} else if (*p >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
c = utf8_decode(p, &p);
/* ignore invalid UTF-8 in comments */
if (c == CP_LS || c == CP_PS) {
s->got_lf = TRUE; /* considered as LF for ASI */
Expand All @@ -19156,7 +19156,7 @@ static __exception int next_token(JSParseState *s)
if (*p == '\r' || *p == '\n')
break;
if (*p >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
c = utf8_decode(p, &p);
/* ignore invalid UTF-8 in comments */
/* LS or PS are considered as line terminator */
if (c == CP_LS || c == CP_PS) {
Expand Down Expand Up @@ -19256,7 +19256,7 @@ static __exception int next_token(JSParseState *s)
if (c == '\\' && *p_next == 'u') {
c = lre_parse_escape(&p_next, TRUE);
} else if (c >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
if (p_next == p + 1)
goto invalid_utf8;
}
Expand Down Expand Up @@ -19328,7 +19328,7 @@ static __exception int next_token(JSParseState *s)
goto fail;
/* reject `10instanceof Number` */
if (JS_VALUE_IS_NAN(ret) ||
lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
lre_js_is_ident_next(utf8_decode(p, &p_next))) {
JS_FreeValue(s->ctx, ret);
js_parse_error(s, "invalid number literal");
goto fail;
Expand Down Expand Up @@ -19521,7 +19521,7 @@ static __exception int next_token(JSParseState *s)
break;
default:
if (c >= 0x80) { /* non-ASCII code-point */
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
c = utf8_decode(p, &p_next);
if (p_next == p + 1)
goto invalid_utf8;
p = p_next;
Expand Down Expand Up @@ -19631,7 +19631,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
}
} else
if (c >= 0x80) {
c = utf8_decode(p - 1, s->buf_end - p, &p_next);
c = utf8_decode(p - 1, &p_next);
if (p_next == p) {
json_parse_error(s, p - 1, "Bad UTF-8 sequence");
goto fail;
Expand Down Expand Up @@ -19835,7 +19835,7 @@ static __exception int json_next_token(JSParseState *s)
break;
default:
if (c >= 0x80) {
c = utf8_decode(p, s->buf_end - p, &p_next);
c = utf8_decode(p, &p_next);
if (p_next == p + 1) {
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
} else {
Expand Down Expand Up @@ -19958,7 +19958,7 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
if (*p == '\n' || *p == '\r') {
break;
} else if (*p >= 0x80) {
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
c = utf8_decode(p, &p);
/* purposely ignore UTF-8 encoding errors in this comment line */
if (c == CP_LS || c == CP_PS)
break;
Expand Down