[hackers] [libgrapheme] Add automatic UTF-8-decoder-tests || Laslo Hunhold

From: <git_AT_suckless.org>
Date: Thu, 28 May 2020 13:00:20 +0200 (CEST)

commit 04bab2a4c09816c37c8e06aa38dfc7f2cab8c680
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Thu May 28 12:57:37 2020 +0200
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Thu May 28 12:57:37 2020 +0200

    Add automatic UTF-8-decoder-tests
    
    The 23 tests should cover all cases and provide safety against any
    possible regressions.
    
    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/data/gbt.awk b/data/gbt.awk
index 41c635a..5fd7c0a 100644
--- a/data/gbt.awk
+++ b/data/gbt.awk
_AT_@ -6,7 +6,7 @@ BEGIN {
 
         printf("struct test {\n\tCodepoint *cp;\n\tsize_t cplen;\n");
         printf("\tsize_t *len;\n\tsize_t lenlen;\n\tchar *descr;\n};\n\n");
- printf("struct test t[] = {\n");
+ printf("static const struct test t[] = {\n");
 }
 
 $0 ~ /^#/ || $0 ~ /^\s*$/ { next }
diff --git a/src/test_body.c b/src/test_body.c
index 25dedd2..536de8f 100644
--- a/src/test_body.c
+++ b/src/test_body.c
_AT_@ -3,15 +3,277 @@
 #include <stdio.h>
 
 #include "boundary.h"
+#include "codepoint.h"
 
 #define LEN(x) (sizeof(x) / sizeof(*x))
 
+/* UTF-8 decoder test vectors: valid, truncated, malformed, overlong,
+ * surrogate and above-U+10FFFF input, each with expected length and cp */
+
+static const struct {
+ uint8_t *arr; /* byte array */
+ size_t len; /* number of bytes in array */
+ size_t exp_len; /* expected length returned */
+ uint32_t exp_cp; /* expected codepoint returned */
+} dec_test[] = {
+ {
+ /* empty sequence
+ * [ ] ->
+ * INVALID
+ */
+ .arr = NULL,
+ .len = 0,
+ .exp_len = 1,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid lead byte
+ * [ 11111101 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xFD },
+ .len = 1,
+ .exp_len = 1,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* valid 1-byte sequence
+ * [ 00000001 ] ->
+ * 0000001
+ */
+ .arr = (uint8_t[]){ 0x01 },
+ .len = 1,
+ .exp_len = 1,
+ .exp_cp = 0x1,
+ },
+ {
+ /* valid 2-byte sequence
+ * [ 11000011 10111111 ] ->
+ * 00011111111
+ */
+ .arr = (uint8_t[]){ 0xC3, 0xBF },
+ .len = 2,
+ .exp_len = 2,
+ .exp_cp = 0xff,
+ },
+ {
+ /* invalid 2-byte sequence (second byte missing)
+ * [ 11000011 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC3 },
+ .len = 1,
+ .exp_len = 2,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 2-byte sequence (second byte malformed)
+ * [ 11000011 11111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC3, 0xFF },
+ .len = 2,
+ .exp_len = 1,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 2-byte sequence (overlong encoded)
+ * [ 11000001 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xC1, 0xBF },
+ .len = 2,
+ .exp_len = 2,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* valid 3-byte sequence
+ * [ 11100000 10111111 10111111 ] ->
+ * 0000111111111111
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = 0xfff,
+ },
+ {
+ /* invalid 3-byte sequence (second byte missing)
+ * [ 11100000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0 },
+ .len = 1,
+ .exp_len = 3,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (second byte malformed)
+ * [ 11100000 01111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+ .len = 3,
+ .exp_len = 1,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (third byte missing)
+ * [ 11100000 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF },
+ .len = 2,
+ .exp_len = 3,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (third byte malformed)
+ * [ 11100000 10111111 01111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+ .len = 3,
+ .exp_len = 2,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (overlong encoded)
+ * [ 11100000 10011111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 3-byte sequence (UTF-16 surrogate half)
+ * [ 11101101 10100000 10000000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+ .len = 3,
+ .exp_len = 3,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* valid 4-byte sequence
+ * [ 11110011 10111111 10111111 10111111 ] ->
+ * 011111111111111111111
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = 0xfffff,
+ },
+ {
+ /* invalid 4-byte sequence (second byte missing)
+ * [ 11110011 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3 },
+ .len = 1,
+ .exp_len = 4,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (second byte malformed)
+ * [ 11110011 01111111 10111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+ .len = 4,
+ .exp_len = 1,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (third byte missing)
+ * [ 11110011 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF },
+ .len = 2,
+ .exp_len = 4,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (third byte malformed)
+ * [ 11110011 10111111 01111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+ .len = 4,
+ .exp_len = 2,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (fourth byte missing)
+ * [ 11110011 10111111 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+ .len = 3,
+ .exp_len = 4,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (fourth byte malformed)
+ * [ 11110011 10111111 10111111 01111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+ .len = 4,
+ .exp_len = 3,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (overlong encoded)
+ * [ 11110000 10000000 10000001 10111111 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = CP_INVALID,
+ },
+ {
+ /* invalid 4-byte sequence (UTF-16-unrepresentable)
+ * [ 11110100 10010000 10000000 10000000 ] ->
+ * INVALID
+ */
+ .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+ .len = 4,
+ .exp_len = 4,
+ .exp_cp = CP_INVALID,
+ },
+};
+
 int main(void)
 {
         int state;
- size_t i, j, k, len, failed = 0;
+ size_t i, j, k, len, failed;
+
+ /* UTF-8 decoder test */
+ for (i = 0, failed = 0; i < LEN(dec_test); i++) {
+ size_t len;
+ uint32_t cp;
+
+ len = grapheme_cp_decode(&cp, dec_test[i].arr,
+ dec_test[i].len);
 
- for (i = 0; i < LEN(t); i++) {
+ if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) {
+ fprintf(stderr, "Failed UTF-8-decoder test %zu: "
+ "Expected (%zx,%u), but got (%zx,%u)\n",
+ i, dec_test[i].exp_len,
+ dec_test[i].exp_cp, len, cp);
+ failed++; /* else summary always claims a full pass */
+ }
+ }
+ printf("UTF-8 decoder test: Passed %zu out of %zu tests.\n",
+ LEN(dec_test) - failed, LEN(dec_test));
+
+ /* grapheme break test */
+ for (i = 0, failed = 0; i < LEN(t); i++) {
                 for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
                         if ((j + 1) == t[i].cplen ||
                             boundary(t[i].cp[j], t[i].cp[j + 1], &state)) {
_AT_@ -28,8 +290,8 @@ int main(void)
                         }
                 }
         }
-
- printf("Passed %zu out of %zu tests.\n", LEN(t) - failed, LEN(t));
+ printf("Grapheme break test: Passed %zu out of %zu tests.\n",
+ LEN(t) - failed, LEN(t));
 
         return (failed > 0) ? 1 : 0;
 }
Received on Thu May 28 2020 - 13:00:20 CEST

This archive was generated by hypermail 2.3.0 : Thu May 28 2020 - 13:00:36 CEST