[hackers] [libgrapheme] Refine types (uint8_t -> char, uint32_t -> uint_least32_t) || Laslo Hunhold from git_AT_suckless.org on 2021-12-11 (hackers mail list archive)

From: <git_AT_suckless.org>
Date: Sat, 11 Dec 2021 14:25:26 +0100 (CET)

commit c0e14c9b89c1ac78b72b7d8840261fbb7285d07a
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Sat Dec 11 14:17:39 2021 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Sat Dec 11 14:17:39 2021 +0100

    Refine types (uint8_t -> char, uint32_t -> uint_least32_t)

    The type uint32_t is not guaranteed by the standard to be present,
    but uint_least32_t is. If a libgrapheme user passes a pointer to a
    uint32_t (instead of a uint_least32_t), there will be no problem, as
    the presence of uint32_t immediately implies uint32_t ==
    uint_least32_t. However, we won't depend on uint32_t internally and
    are strict about using uint_least32_t. The type name is a mouthful,
    but still clearer and not much longer than "long int" (which is
    guaranteed to be at least 32 bits).

    Regarding uint8_t, it was a bit clumsy to require it in the API. C does
    not guarantee that a byte is actually an octet (i.e. char can have
    more than 8 bits), and even though the relevance of platforms with a
    non-8-bit char seems to be waning, I don't want to rely on that. But
    more importantly, accepting "char *" saves some casts on the user side.
    Adapting the lg_utf8_* functions is trivial, as it requires just
    being careful with casts. The cast "signed char" <-> "unsigned char"
    is unproblematic, so every time we need the bit representation, we
    explicitly cast to unsigned char and are done with it. Likewise, every
    time we write to a char, we make sure that what we pass is explicitly
    an unsigned char.
    This became a bit awkward in the test cases where we have char-arrays
    with hex literals. As C does not really have a concept of a sub-int
    literal, all hexadecimal literals had to first be explicitly cast to
    unsigned char, but that's it.

    One more aspect where we've become more portable. :)

    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/gen/util.c b/gen/util.c
index ec5afb7..b43a3dc 100644
--- a/gen/util.c
+++ b/gen/util.c
_AT_@ -39,7 +39,7 @@ valid_hexstring(const char *str)
}

static int
-cp_parse(const char *str, uint32_t *cp)
+cp_parse(const char *str, uint_least32_t *cp)
{
         if (!valid_hexstring(str)) {
                 return 1;
_AT_@ -348,13 +348,13 @@ segment_test_list_print(struct segment_test *st, size_t numsegtests,
         printf("/* Automatically generated by %s */\n"
                "#include <stdint.h>\n#include <stddef.h>\n\n", progname);

- printf("static const struct {\n\tuint32_t *cp;\n"
+ printf("static const struct {\n\tuint_least32_t *cp;\n"
                "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
                "\tchar *descr;\n} %s[] = {\n", identifier);
         for (i = 0; i < numsegtests; i++) {
                 printf("\t{\n");

- printf("\t\t.cp = (uint32_t[]){");
+ printf("\t\t.cp = (uint_least32_t[]){");
                 for (j = 0; j < st[i].cplen; j++) {
                         printf(" UINT32_C(0x%06X)", st[i].cp[j]);
                         if (j + 1 < st[i].cplen) {
diff --git a/gen/util.h b/gen/util.h
index 9461416..e269888 100644
--- a/gen/util.h
+++ b/gen/util.h
_AT_@ -8,8 +8,8 @@
#define LEN(x) (sizeof (x) / sizeof *(x))

struct range {
- uint32_t lower;
- uint32_t upper;
+ uint_least32_t lower;
+ uint_least32_t upper;
};

struct property {
_AT_@ -21,7 +21,7 @@ struct property {
};

struct segment_test {
- uint32_t *cp;
+ uint_least32_t *cp;
         size_t cplen;
         size_t *len;
         size_t lenlen;
diff --git a/grapheme.h b/grapheme.h
index 6a6fe4f..62d7cb9 100644
--- a/grapheme.h
+++ b/grapheme.h
_AT_@ -20,9 +20,9 @@ typedef struct lg_internal_segmentation_state {

size_t lg_grapheme_nextbreak(const char *);

-int lg_grapheme_isbreak(uint32_t, uint32_t, LG_SEGMENTATION_STATE *);
+int lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);

-size_t lg_utf8_decode(const uint8_t *, size_t, uint32_t *);
-size_t lg_utf8_encode(uint32_t, uint8_t *, size_t);
+size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
+size_t lg_utf8_encode(uint_least32_t, char *, size_t);

#endif /* GRAPHEME_H */
diff --git a/src/grapheme.c b/src/grapheme.c
index 2feaa2f..731ad37 100644
--- a/src/grapheme.c
+++ b/src/grapheme.c
_AT_@ -13,7 +13,7 @@ enum {
};

int
-lg_grapheme_isbreak(uint32_t a, uint32_t b, LG_SEGMENTATION_STATE *state)
+lg_grapheme_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state)
{
         struct lg_internal_heisenstate *p[2] = { 0 };
         int ret = 1, flags = 0;
_AT_@ -179,7 +179,7 @@ hasbreak:
size_t
lg_grapheme_nextbreak(const char *str)
{
- uint32_t cp0, cp1;
+ uint_least32_t cp0, cp1;
         size_t ret, len = 0;
         LG_SEGMENTATION_STATE state = { 0 };

_AT_@ -200,14 +200,14 @@ lg_grapheme_nextbreak(const char *str)
          */

         /* get first code point */
- len += lg_utf8_decode((uint8_t *)str, 5, &cp0);
+ len += lg_utf8_decode(str, 5, &cp0);
         if (cp0 == LG_CODEPOINT_INVALID) {
                 return len;
         }

         while (cp0 != 0) {
                 /* get next code point */
- ret = lg_utf8_decode((uint8_t *)(str + len), 5, &cp1);
+ ret = lg_utf8_decode(str + len, 5, &cp1);

                 if (cp1 == LG_CODEPOINT_INVALID ||
                     lg_grapheme_isbreak(cp0, cp1, &state)) {
diff --git a/src/utf8.c b/src/utf8.c
index 4488359..1cb5e17 100644
--- a/src/utf8.c
+++ b/src/utf8.c
_AT_@ -8,10 +8,10 @@

/* lookup-table for the types of sequence first bytes */
static const struct {
- uint8_t lower; /* lower bound of sequence first byte */
- uint8_t upper; /* upper bound of sequence first byte */
- uint32_t mincp; /* smallest non-overlong encoded code point */
- uint32_t maxcp; /* largest encodable code point */
+ uint8_t lower; /* lower bound of sequence first byte */
+ uint8_t upper; /* upper bound of sequence first byte */
+ uint_least32_t mincp; /* smallest non-overlong encoded code point */
+ uint_least32_t maxcp; /* largest encodable code point */
         /*
          * implicit: table-offset represents the number of following
          * bytes of the form 10xxxxxx (6 bits capacity each)
_AT_@ -21,37 +21,44 @@ static const struct {
                 /* 0xxxxxxx */
                 .lower = 0x00, /* 00000000 */
                 .upper = 0x7F, /* 01111111 */
- .mincp = (uint32_t)0,
- .maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
+ .mincp = (uint_least32_t)0,
+ .maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
         },
         [1] = {
                 /* 110xxxxx */
                 .lower = 0xC0, /* 11000000 */
                 .upper = 0xDF, /* 11011111 */
- .mincp = (uint32_t)1 << 7,
- .maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
+ .mincp = (uint_least32_t)1 << 7,
+ .maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
         },
         [2] = {
                 /* 1110xxxx */
                 .lower = 0xE0, /* 11100000 */
                 .upper = 0xEF, /* 11101111 */
- .mincp = (uint32_t)1 << 11,
- .maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
+ .mincp = (uint_least32_t)1 << 11,
+ .maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
         },
         [3] = {
                 /* 11110xxx */
                 .lower = 0xF0, /* 11110000 */
                 .upper = 0xF7, /* 11110111 */
- .mincp = (uint32_t)1 << 16,
- .maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
+ .mincp = (uint_least32_t)1 << 16,
+ .maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
         },
};

size_t
-lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
+lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
{
         size_t off, i;

+ /*
+ * char is guaranteed to be at least 8 bits, but it could
+ * be more. We assume that the encoding is faithful such
+ * that any higher bits are zero. If we encounter anything
+ * else, we treat it as an encoding error.
+ */
+
         if (n == 0) {
                 /* a sequence must be at least 1 byte long */
                 *cp = LG_CODEPOINT_INVALID;
_AT_@ -60,13 +67,15 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)

         /* identify sequence type with the first byte */
         for (off = 0; off < LEN(lut); off++) {
- if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+ if (BETWEEN((unsigned char)s[0], lut[off].lower,
+ lut[off].upper)) {
                         /*
                          * first byte is within the bounds; fill
                          * p with the the first bits contained in
                          * the first byte (by subtracting the high bits)
+ * and discarding any higher bits than 8
                          */
- *cp = s[0] - lut[off].lower;
+ *cp = ((unsigned char)s[0] - lut[off].lower) & 0xff;
                         break;
                 }
         }
_AT_@ -92,7 +101,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
          * (i.e. between 0x80 (10000000) and 0xBF (10111111))
          */
         for (i = 1; i <= off; i++) {
- if(!BETWEEN(s[i], 0x80, 0xBF)) {
+ if(!BETWEEN((unsigned char)s[i], 0x80, 0xBF)) {
                         /*
                          * byte does not match format; return
                          * number of bytes processed excluding the
_AT_@ -106,7 +115,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
                  * shift code point by 6 bits and add the 6 stored bits
                  * in s[i] to it using the bitmask 0x3F (00111111)
                  */
- *cp = (*cp << 6) | (s[i] & 0x3F);
+ *cp = (*cp << 6) | ((unsigned char)s[i] & 0x3F);
         }

         if (*cp < lut[off].mincp ||
_AT_@ -125,7 +134,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
}

size_t
-lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
+lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
{
         size_t off, i;

_AT_@ -161,7 +170,7 @@ lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
          * We do not overwrite the mask because we guaranteed earlier
          * that there are no bits higher than the mask allows.
          */
- s[0] = lut[off].lower | (cp >> (6 * off));
+ s[0] = (unsigned char)(lut[off].lower | (cp >> (6 * off)));

         for (i = 1; i <= off; i++) {
                 /*
_AT_@ -170,7 +179,7 @@ lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
                  * extract from the properly-shifted value using the
                  * mask 00111111 (0x3F)
                  */
- s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+ s[i] = (unsigned char)(0x80 | ((cp >> (6 * (off - i))) & 0x3F));
         }

         return 1 + off;
diff --git a/src/util.c b/src/util.c
index 5bbe926..16663f8 100644
--- a/src/util.c
+++ b/src/util.c
_AT_@ -41,14 +41,14 @@ heisenstate_set(struct lg_internal_heisenstate *h, int slot, int state)
static int
cp_cmp(const void *a, const void *b)
{
- uint32_t cp = *(uint32_t *)a;
- uint32_t *range = (uint32_t *)b;
+ uint_least32_t cp = *(uint_least32_t *)a;
+ uint_least32_t *range = (uint_least32_t *)b;

         return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
}

int
-has_property(uint32_t cp, struct lg_internal_heisenstate *cpstate,
+has_property(uint_least32_t cp, struct lg_internal_heisenstate *cpstate,
              const struct range_list *proptable, int property)
{
         int res;
diff --git a/src/util.h b/src/util.h
index 3a9cccd..065097b 100644
--- a/src/util.h
+++ b/src/util.h
_AT_@ -10,8 +10,8 @@
#define LEN(x) (sizeof(x) / sizeof(*(x)))

struct range {
- uint32_t lower;
- uint32_t upper;
+ uint_least32_t lower;
+ uint_least32_t upper;
};

struct range_list {
_AT_@ -22,7 +22,7 @@ struct range_list {
int heisenstate_get(struct lg_internal_heisenstate *, int);
int heisenstate_set(struct lg_internal_heisenstate *, int, int);

-int has_property(uint32_t, struct lg_internal_heisenstate *,
+int has_property(uint_least32_t, struct lg_internal_heisenstate *,
                  const struct range_list *, int);

#endif /* UTIL_H */
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
index 1182fb0..ee71cf9 100644
--- a/test/utf8-decode.c
+++ b/test/utf8-decode.c
_AT_@ -9,7 +9,7 @@
#define LEN(x) (sizeof(x) / sizeof(*(x)))

static const struct {
- uint8_t *arr; /* UTF-8 byte sequence */
+ char *arr; /* UTF-8 byte sequence */
         size_t len; /* length of UTF-8 byte sequence */
         size_t exp_len; /* expected length returned */
         uint32_t exp_cp; /* expected code point returned */
_AT_@ -29,7 +29,9 @@ static const struct {
                  * [ 11111101 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xFD },
+ .arr = (char[]){
+ (unsigned char)0xFD,
+ },
                 .len = 1,
                 .exp_len = 1,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -39,7 +41,9 @@ static const struct {
                  * [ 00000001 ] ->
                  * 0000001
                  */
- .arr = (uint8_t[]){ 0x01 },
+ .arr = (char[]){
+ (unsigned char)0x01,
+ },
                 .len = 1,
                 .exp_len = 1,
                 .exp_cp = 0x1,
_AT_@ -49,7 +53,10 @@ static const struct {
                  * [ 11000011 10111111 ] ->
                  * 00011111111
                  */
- .arr = (uint8_t[]){ 0xC3, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xC3,
+ (unsigned char)0xBF,
+ },
                 .len = 2,
                 .exp_len = 2,
                 .exp_cp = 0xFF,
_AT_@ -59,7 +66,9 @@ static const struct {
                  * [ 11000011 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xC3 },
+ .arr = (char[]){
+ (unsigned char)0xC3
+ },
                 .len = 1,
                 .exp_len = 2,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -69,7 +78,10 @@ static const struct {
                  * [ 11000011 11111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xC3, 0xFF },
+ .arr = (char[]){
+ (unsigned char)0xC3,
+ (unsigned char)0xFF,
+ },
                 .len = 2,
                 .exp_len = 1,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -79,7 +91,10 @@ static const struct {
                  * [ 11000001 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xC1, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xC1,
+ (unsigned char)0xBF,
+ },
                 .len = 2,
                 .exp_len = 2,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -89,7 +104,11 @@ static const struct {
                  * [ 11100000 10111111 10111111 ] ->
                  * 0000111111111111
                  */
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
                 .len = 3,
                 .exp_len = 3,
                 .exp_cp = 0xFFF,
_AT_@ -99,7 +118,9 @@ static const struct {
                  * [ 11100000 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0 },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ },
                 .len = 1,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -109,7 +130,11 @@ static const struct {
                  * [ 11100000 01111111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0x7F,
+ (unsigned char)0xBF,
+ },
                 .len = 3,
                 .exp_len = 1,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -119,7 +144,10 @@ static const struct {
                  * [ 11100000 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0xBF,
+ },
                 .len = 2,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -129,7 +157,11 @@ static const struct {
                  * [ 11100000 10111111 01111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0xBF,
+ (unsigned char)0x7F,
+ },
                 .len = 3,
                 .exp_len = 2,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -139,7 +171,11 @@ static const struct {
                  * [ 11100000 10011111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0x9F,
+ (unsigned char)0xBF,
+ },
                 .len = 3,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -149,7 +185,11 @@ static const struct {
                  * [ 11101101 10100000 10000000 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+ .arr = (char[]){
+ (unsigned char)0xED,
+ (unsigned char)0xA0,
+ (unsigned char)0x80,
+ },
                 .len = 3,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -159,7 +199,12 @@ static const struct {
                  * [ 11110011 10111111 10111111 10111111 ] ->
                  * 011111111111111111111
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
                 .len = 4,
                 .exp_len = 4,
                 .exp_cp = UINT32_C(0xFFFFF),
_AT_@ -169,7 +214,9 @@ static const struct {
                  * [ 11110011 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3 },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ },
                 .len = 1,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -179,7 +226,12 @@ static const struct {
                  * [ 11110011 01111111 10111111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0x7F,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
                 .len = 4,
                 .exp_len = 1,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -189,7 +241,10 @@ static const struct {
                  * [ 11110011 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ },
                 .len = 2,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -199,7 +254,12 @@ static const struct {
                  * [ 11110011 10111111 01111111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0x7F,
+ (unsigned char)0xBF,
+ },
                 .len = 4,
                 .exp_len = 2,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -209,7 +269,11 @@ static const struct {
                  * [ 11110011 10111111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
                 .len = 3,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -219,7 +283,12 @@ static const struct {
                  * [ 11110011 10111111 10111111 01111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ (unsigned char)0x7F,
+ },
                 .len = 4,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -229,7 +298,12 @@ static const struct {
                  * [ 11110000 10000000 10000001 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF0,
+ (unsigned char)0x80,
+ (unsigned char)0x81,
+ (unsigned char)0xBF,
+ },
                 .len = 4,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -239,7 +313,12 @@ static const struct {
                  * [ 11110100 10010000 10000000 10000000 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+ .arr = (char[]){
+ (unsigned char)0xF4,
+ (unsigned char)0x90,
+ (unsigned char)0x80,
+ (unsigned char)0x80,
+ },
                 .len = 4,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -254,7 +333,7 @@ main(void)
         /* UTF-8 decoder test */
         for (i = 0, failed = 0; i < LEN(dec_test); i++) {
                 size_t len;
- uint32_t cp;
+ uint_least32_t cp;

                 len = lg_utf8_decode(dec_test[i].arr,
                                      dec_test[i].len, &cp);
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
index 2f978d2..4ecf32a 100644
--- a/test/utf8-encode.c
+++ b/test/utf8-encode.c
_AT_@ -9,44 +9,66 @@
#define LEN(x) (sizeof(x) / sizeof(*(x)))

static const struct {
- uint32_t cp; /* input code point */
- uint8_t *exp_arr; /* expected UTF-8 byte sequence */
- size_t exp_len; /* expected length of UTF-8 sequence */
+ uint_least32_t cp; /* input code point */
+ char *exp_arr; /* expected UTF-8 byte sequence */
+ size_t exp_len; /* expected length of UTF-8 sequence */
} enc_test[] = {
         {
                 /* invalid code point (UTF-16 surrogate half) */
                 .cp = UINT32_C(0xD800),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_arr = (char[]){
+ (unsigned char)0xEF,
+ (unsigned char)0xBF,
+ (unsigned char)0xBD,
+ },
                 .exp_len = 3,
         },
         {
                 /* invalid code point (UTF-16-unrepresentable) */
                 .cp = UINT32_C(0x110000),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_arr = (char[]){
+ (unsigned char)0xEF,
+ (unsigned char)0xBF,
+ (unsigned char)0xBD,
+ },
                 .exp_len = 3,
         },
         {
                 /* code point encoded to a 1-byte sequence */
                 .cp = 0x01,
- .exp_arr = (uint8_t[]){ 0x01 },
+ .exp_arr = (char[]){
+ (unsigned char)0x01
+ },
                 .exp_len = 1,
         },
         {
                 /* code point encoded to a 2-byte sequence */
                 .cp = 0xFF,
- .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+ .exp_arr = (char[]){
+ (unsigned char)0xC3,
+ (unsigned char)0xBF,
+ },
                 .exp_len = 2,
         },
         {
                 /* code point encoded to a 3-byte sequence */
                 .cp = 0xFFF,
- .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .exp_arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
                 .exp_len = 3,
         },
         {
                 /* code point encoded to a 4-byte sequence */
                 .cp = UINT32_C(0xFFFFF),
- .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .exp_arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
                 .exp_len = 4,
         },
};
_AT_@ -58,7 +80,7 @@ main(void)

         /* UTF-8 encoder test */
         for (i = 0, failed = 0; i < LEN(enc_test); i++) {
- uint8_t arr[4];
+ char arr[4];
                 size_t len;

                 len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));
Received on Sat Dec 11 2021 - 14:25:26 CET

This archive was generated by hypermail 2.3.0 : Sat Dec 11 2021 - 14:36:31 CET