---
src/codepoint.c | 73 +++++++++++++++----------------------------------
1 file changed, 22 insertions(+), 51 deletions(-)
diff --git a/src/codepoint.c b/src/codepoint.c
index 0b63184..6cb0d4f 100644
--- a/src/codepoint.c
+++ b/src/codepoint.c
@@ -8,73 +8,44 @@
size_t
cp_decode(const uint8_t *str, Codepoint *p)
{
- size_t off, j, k, l;
+ size_t rank, i, len;
struct {
uint8_t lower;
uint8_t upper;
uint8_t mask;
- uint8_t bits;
+ Codepoint lowest;
} lookup[] = {
- { 0x00, 0x7F, 0xFF, 7 }, /* 00000000 - 01111111, 01111111 */
- { 0xC0, 0xDF, 0x1F, 11 }, /* 11000000 - 11011111, 00011111 */
- { 0xE0, 0xEF, 0x0F, 16 }, /* 11100000 - 11101111, 00001111 */
- { 0xF0, 0xF7, 0x07, 21 }, /* 11110000 - 11110111, 00000111 */
- { 0xF8, 0xFB, 0x03, 26 }, /* 11111000 - 11111011, 00000011 */
- { 0xFC, 0xFD, 0x01, 31 }, /* 11111100 - 11111101, 00000001 */
+ { 0x00, 0x7F, 0xFF, UINT32_C(0x000000) },
+ { 0xC0, 0xDF, 0x1F, UINT32_C(0x000080) },
+ { 0xE0, 0xEF, 0x0F, UINT32_C(0x000800) },
+ { 0xF0, 0xF7, 0x07, UINT32_C(0x010000) }
};
- /* empty string */
- if (str[0] == '\0') {
- *p = 0;
- return 0;
- }
-
- /* find out in which ranges str[0] is */
- for (off = 0; off < LEN(lookup); off++) {
- if (BETWEEN(str[0], lookup[off].lower, lookup[off].upper)) {
- *p = str[0] & lookup[off].mask;
+ for (rank = 0; rank < LEN(lookup); rank++)
+ if (BETWEEN(str[0], lookup[rank].lower, lookup[rank].upper))
break;
- }
- }
- if (off == 0) {
- /* ASCII */
- return 1;
- } else if (off == LEN(lookup)) {
- /* not in ranges */
+ if (rank == LEN(lookup)) {
+ /* Out of range */
*p = CP_INVALID;
return 1;
}
- /* off denotes the number of upcoming expected code units */
- for (j = 0; j < off; j++) {
- if (str[j] == '\0') {
- *p = CP_INVALID;
- return j;
- }
- if ((str[1 + j] & 0xC0) != 0x80) {
+ *p = (Codepoint)(str[0] & lookup[rank].mask);
+ len = rank + 1;
+ for (i = 1; i < len; i++) {
+ if ((str[i] & 0xC0) != 0x80) {
+ /* Not continuation of character */
*p = CP_INVALID;
- return 1 + j;
+ return 1;
}
- *p <<= 6;
- *p |= str[1 + j] & 0x3F; /* 00111111 */
+ *p = (*p << 6) | (str[i] & 0x3F);
}
- if (*p == 0) {
- if (off != 0) {
- /* overencoded NUL */
- *p = CP_INVALID;
- }
- } else {
- /* determine effective bytes */
- for (k = 0; ((*p << k) & (1 << 31)) == 0; k++)
- ;
- for (l = 0; l < off; l++) {
- if ((32 - k) <= lookup[l].bits) {
- *p = CP_INVALID;
- }
- }
+ if (*p < lookup[rank].lowest || BETWEEN(*p, 0xD800, 0xDFFF) || *p > UINT32_C(0x10FFFF)) {
+ /* Overencoded, surrogate, or out of range */
+ *p = CP_INVALID;
+ return 1;
}
-
- return 1 + j;
+ return len;
}
--
2.26.2
Received on Wed May 27 2020 - 15:22:35 CEST
This archive was generated by hypermail 2.3.0 : Wed May 27 2020 - 15:24:30 CEST