[hackers] [libgrapheme][PATCH] Simplify cp_decode and be more strict

From: Mattias Andrée <maandree_AT_kth.se>
Date: Wed, 27 May 2020 15:22:35 +0200

No special cases for NUL and ASCII.

Simplified check of overencoded codepoints:
lowest non-overencoded codepoints are stored in the lookup table

Do not accept surrogate code points.
From RFC 3629:
        The definition of UTF-8 prohibits encoding character
        numbers between U+D800 and U+DFFF, [...]

Do not accept plane 17 and above:
From RFC 3629:
        Restricted the range of characters to 0000-10FFFF
        (the UTF-16 accessible range).
---
 src/codepoint.c | 73 +++++++++++++++----------------------------------
 1 file changed, 22 insertions(+), 51 deletions(-)
diff --git a/src/codepoint.c b/src/codepoint.c
index 0b63184..6cb0d4f 100644
--- a/src/codepoint.c
+++ b/src/codepoint.c
_AT_@ -8,73 +8,44 @@
 size_t
 cp_decode(const uint8_t *str, Codepoint *p)
 {
-	size_t off, j, k, l;
+	size_t rank, i, len;
 
 	struct {
 		uint8_t lower;
 		uint8_t upper;
 		uint8_t mask;
-		uint8_t bits;
+		Codepoint lowest;
 	} lookup[] = {
-		{ 0x00, 0x7F, 0xFF,  7 }, /* 00000000 - 01111111, 01111111 */
-		{ 0xC0, 0xDF, 0x1F, 11 }, /* 11000000 - 11011111, 00011111 */
-		{ 0xE0, 0xEF, 0x0F, 16 }, /* 11100000 - 11101111, 00001111 */
-		{ 0xF0, 0xF7, 0x07, 21 }, /* 11110000 - 11110111, 00000111 */
-		{ 0xF8, 0xFB, 0x03, 26 }, /* 11111000 - 11111011, 00000011 */
-		{ 0xFC, 0xFD, 0x01, 31 }, /* 11111100 - 11111101, 00000001 */
+	        { 0x00, 0x7F, 0xFF, UINT32_C(0x000000) },
+	        { 0xC0, 0xDF, 0x1F, UINT32_C(0x000080) },
+	        { 0xE0, 0xEF, 0x0F, UINT32_C(0x000800) },
+	        { 0xF0, 0xF7, 0x07, UINT32_C(0x010000) }
 	};
 
-	/* empty string */
-	if (str[0] == '\0') {
-		*p = 0;
-		return 0;
-	}
-
-	/* find out in which ranges str[0] is */
-	for (off = 0; off < LEN(lookup); off++) {
-		if (BETWEEN(str[0], lookup[off].lower, lookup[off].upper)) {
-			*p = str[0] & lookup[off].mask;
+	for (rank = 0; rank < LEN(lookup); rank++)
+		if (BETWEEN(str[0], lookup[rank].lower, lookup[rank].upper))
 			break;
-		}
-	}
-	if (off == 0) {
-		/* ASCII */
-		return 1;
-	} else if (off == LEN(lookup)) {
-		/* not in ranges */
+	if (rank == LEN(lookup)) {
+		/* Out of range */
 		*p = CP_INVALID;
 		return 1;
 	}
 
-	/* off denotes the number of upcoming expected code units */
-	for (j = 0; j < off; j++) {
-		if (str[j] == '\0') {
-			*p = CP_INVALID;
-			return j;
-		}
-		if ((str[1 + j] & 0xC0) != 0x80) {
+	*p = (Codepoint)(str[0] & lookup[rank].mask);
+	len = rank + 1;
+	for (i = 1; i < len; i++) {
+		if ((str[i] & 0xC0) != 0x80) {
+			/* Not continuation of character */
 			*p = CP_INVALID;
-			return 1 + j;
+			return 1;
 		}
-		*p <<= 6;
-		*p |= str[1 + j] & 0x3F; /* 00111111 */
+		*p = (*p << 6) | (str[i] & 0x3F);
 	}
 
-	if (*p == 0) {
-		if (off != 0) {
-			/* overencoded NUL */
-			*p = CP_INVALID;
-		}
-	} else {
-		/* determine effective bytes */
-		for (k = 0; ((*p << k) & (1 << 31)) == 0; k++)
-			;
-		for (l = 0; l < off; l++) {
-			if ((32 - k) <= lookup[l].bits) {
-				*p = CP_INVALID;
-			}
-		}
+	if (*p < lookup[rank].lowest || BETWEEN(*p, 0xD800, 0xDFFF) || *p > UINT32_C(0x10FFFF)) {
+		/* Overencoded, surrogate, or out of range */
+		*p = CP_INVALID;
+		return 1;
 	}
-
-	return 1 + j;
+	return len;
 }
-- 
2.26.2
Received on Wed May 27 2020 - 15:22:35 CEST

This archive was generated by hypermail 2.3.0 : Wed May 27 2020 - 15:24:30 CEST