[hackers] [libgrapheme] Encourage strict aliasing for library users (uint8_t * -> char *) || Laslo Hunhold

From: <git_AT_suckless.org>
Date: Fri, 17 Dec 2021 00:47:45 +0100 (CET)

commit b99a40eefc2ec1ad8714ed210a3aeedfb3283159
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Fri Dec 17 00:34:27 2021 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Fri Dec 17 00:34:27 2021 +0100

    Encourage strict aliasing for library users (uint8_t * -> char *)
    
    After a long-winded discussion with Michael Forney, who has a really
    deep understanding of the C specification, he rightfully pointed out
    that using uint8_t * might look good on paper, but leads to subtle
    problems due to intricacies within the C99 specification.
    
    While you can alias any pointer to character types (char, unsigned char,
    signed char), uint8_t is not a character type and aliasing to it breaks
    the strict aliasing rule. This is not a problem in practice as gcc
    is the only big compiler enforcing strict aliasing and uint8_t is
    usually defined as unsigned char, inheriting the aliasing property for
    technical reasons, but strictly speaking uint8_t is not a character
    type.
    
    With uint8_t * in the API, library users would've been forced to cast
    any input-string to uint8_t *, breaking the strict aliasing rule. A
    lot of code relies on this or conveniently disables strict aliasing
    through compiler flags, but using char-arrays is the only really
    portable and safe way to work with it.
    That char is usually 8 bits wide and strongly indicates that we're
    dealing with a string is one strong point for using char *; another is
    that C11 introduced UTF-8 string literals of the form u8"..." which are
    of type char[]. In this sense, using char * ensures some form of
    forward-compatibility and fits nicely within the spec that's slowly
    converging towards UTF-8.
    
    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/grapheme.h b/grapheme.h
index bd5244b..3294c8e 100644
--- a/grapheme.h
+++ b/grapheme.h
_AT_@ -19,11 +19,11 @@ typedef struct lg_internal_segmentation_state {
 
 #define LG_CODEPOINT_INVALID UINT32_C(0xFFFD)
 
-size_t lg_grapheme_nextbreak(const uint8_t *);
+size_t lg_grapheme_nextbreak(const char *);
 
 bool lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);
 
-size_t lg_utf8_decode(const uint8_t *, size_t, uint_least32_t *);
-size_t lg_utf8_encode(uint_least32_t, uint8_t *, size_t);
+size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
+size_t lg_utf8_encode(uint_least32_t, char *, size_t);
 
 #endif /* GRAPHEME_H */
diff --git a/man/lg_grapheme_nextbreak.3 b/man/lg_grapheme_nextbreak.3
index 795e1b4..ff78395 100644
--- a/man/lg_grapheme_nextbreak.3
+++ b/man/lg_grapheme_nextbreak.3
_AT_@ -7,7 +7,7 @@
 .Sh SYNOPSIS
 .In grapheme.h
 .Ft size_t
-.Fn lg_grapheme_nextbreak "const uint8_t *str"
+.Fn lg_grapheme_nextbreak "const char *str"
 .Sh DESCRIPTION
 .Fn lg_grapheme_nextbreak
 computes the offset (in bytes) to the next grapheme
_AT_@ -52,7 +52,7 @@ main(void)
 
         /* print each grapheme cluster with byte-length */
         for (; *s != '\\0';) {
- len = lg_grapheme_nextbreak((uint8_t *)s);
+ len = lg_grapheme_nextbreak(s);
                 printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
                 s += len;
         }
diff --git a/src/grapheme.c b/src/grapheme.c
index 56993af..78d0993 100644
--- a/src/grapheme.c
+++ b/src/grapheme.c
_AT_@ -179,7 +179,7 @@ hasbreak:
 }
 
 size_t
-lg_grapheme_nextbreak(const uint8_t *str)
+lg_grapheme_nextbreak(const char *str)
 {
         uint_least32_t cp0, cp1;
         size_t ret, len = 0;
diff --git a/src/utf8.c b/src/utf8.c
index b21c920..327deea 100644
--- a/src/utf8.c
+++ b/src/utf8.c
_AT_@ -48,7 +48,7 @@ static const struct {
 };
 
 size_t
-lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
+lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
 {
         size_t off, i;
 
_AT_@ -60,13 +60,14 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 
         /* identify sequence type with the first byte */
         for (off = 0; off < LEN(lut); off++) {
- if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+ if (BETWEEN(((unsigned char *)s)[0], lut[off].lower,
+ lut[off].upper)) {
                         /*
                          * first byte is within the bounds; fill
                          * p with the the first bits contained in
                          * the first byte (by subtracting the high bits)
                          */
- *cp = s[0] - lut[off].lower;
+ *cp = ((unsigned char *)s)[0] - lut[off].lower;
                         break;
                 }
         }
_AT_@ -74,6 +75,9 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
                 /*
                  * first byte does not match a sequence type;
                  * set cp as invalid and return 1 byte processed
+ *
+ * this also includes the cases where bits higher than
+ * the 8th are set on systems with CHAR_BIT > 8
                  */
                 *cp = LG_CODEPOINT_INVALID;
                 return 1;
_AT_@ -92,12 +96,16 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
          * (i.e. between 0x80 (10000000) and 0xBF (10111111))
          */
         for (i = 1; i <= off; i++) {
- if(!BETWEEN(s[i], 0x80, 0xBF)) {
+ if(!BETWEEN(((unsigned char *)s)[i], 0x80, 0xBF)) {
                         /*
                          * byte does not match format; return
                          * number of bytes processed excluding the
                          * unexpected character as recommended since
                          * Unicode 6 (chapter 3)
+ *
+ * this also includes the cases where bits
+ * higher than the 8th are set on systems
+ * with CHAR_BIT > 8
                          */
                         *cp = LG_CODEPOINT_INVALID;
                         return 1 + (i - 1);
_AT_@ -106,7 +114,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
                  * shift code point by 6 bits and add the 6 stored bits
                  * in s[i] to it using the bitmask 0x3F (00111111)
                  */
- *cp = (*cp << 6) | (s[i] & 0x3F);
+ *cp = (*cp << 6) | (((unsigned char *)s)[i] & 0x3F);
         }
 
         if (*cp < lut[off].mincp ||
_AT_@ -125,7 +133,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint_least32_t *cp)
 }
 
 size_t
-lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
+lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
 {
         size_t off, i;
 
_AT_@ -165,7 +173,7 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
          * We do not overwrite the mask because we guaranteed earlier
          * that there are no bits higher than the mask allows.
          */
- s[0] = lut[off].lower | (uint8_t)(cp >> (6 * off));
+ ((unsigned char *)s)[0] = lut[off].lower | (uint8_t)(cp >> (6 * off));
 
         for (i = 1; i <= off; i++) {
                 /*
_AT_@ -174,7 +182,8 @@ lg_utf8_encode(uint_least32_t cp, uint8_t *s, size_t n)
                  * extract from the properly-shifted value using the
                  * mask 00111111 (0x3F)
                  */
- s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+ ((unsigned char *)s)[i] = 0x80 |
+ ((cp >> (6 * (off - i))) & 0x3F);
         }
 
         return 1 + off;
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
index 0fd6f77..b4dc7f2 100644
--- a/test/utf8-decode.c
+++ b/test/utf8-decode.c
_AT_@ -8,7 +8,7 @@
 #include "util.h"
 
 static const struct {
- uint8_t *arr; /* UTF-8 byte sequence */
+ char *arr; /* UTF-8 byte sequence */
         size_t len; /* length of UTF-8 byte sequence */
         size_t exp_len; /* expected length returned */
         uint_least32_t exp_cp; /* expected code point returned */
_AT_@ -28,7 +28,7 @@ static const struct {
                  * [ 11111101 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xFD },
+ .arr = (char *)(unsigned char[]){ 0xFD },
                 .len = 1,
                 .exp_len = 1,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -38,7 +38,7 @@ static const struct {
                  * [ 00000001 ] ->
                  * 0000001
                  */
- .arr = (uint8_t[]){ 0x01 },
+ .arr = (char *)(unsigned char[]){ 0x01 },
                 .len = 1,
                 .exp_len = 1,
                 .exp_cp = 0x1,
_AT_@ -48,7 +48,7 @@ static const struct {
                  * [ 11000011 10111111 ] ->
                  * 00011111111
                  */
- .arr = (uint8_t[]){ 0xC3, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
                 .len = 2,
                 .exp_len = 2,
                 .exp_cp = 0xFF,
_AT_@ -58,7 +58,7 @@ static const struct {
                  * [ 11000011 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xC3 },
+ .arr = (char *)(unsigned char[]){ 0xC3 },
                 .len = 1,
                 .exp_len = 2,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -68,7 +68,7 @@ static const struct {
                  * [ 11000011 11111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xC3, 0xFF },
+ .arr = (char *)(unsigned char[]){ 0xC3, 0xFF },
                 .len = 2,
                 .exp_len = 1,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -78,7 +78,7 @@ static const struct {
                  * [ 11000001 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xC1, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xC1, 0xBF },
                 .len = 2,
                 .exp_len = 2,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -88,7 +88,7 @@ static const struct {
                  * [ 11100000 10111111 10111111 ] ->
                  * 0000111111111111
                  */
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
                 .len = 3,
                 .exp_len = 3,
                 .exp_cp = 0xFFF,
_AT_@ -98,7 +98,7 @@ static const struct {
                  * [ 11100000 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0 },
+ .arr = (char *)(unsigned char[]){ 0xE0 },
                 .len = 1,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -108,7 +108,7 @@ static const struct {
                  * [ 11100000 01111111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
                 .len = 3,
                 .exp_len = 1,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -118,7 +118,7 @@ static const struct {
                  * [ 11100000 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF },
                 .len = 2,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -128,7 +128,7 @@ static const struct {
                  * [ 11100000 10111111 01111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
                 .len = 3,
                 .exp_len = 2,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -138,7 +138,7 @@ static const struct {
                  * [ 11100000 10011111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
                 .len = 3,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -148,7 +148,7 @@ static const struct {
                  * [ 11101101 10100000 10000000 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+ .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
                 .len = 3,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -158,7 +158,7 @@ static const struct {
                  * [ 11110011 10111111 10111111 10111111 ] ->
                  * 011111111111111111111
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
                 .len = 4,
                 .exp_len = 4,
                 .exp_cp = UINT32_C(0xFFFFF),
_AT_@ -168,7 +168,7 @@ static const struct {
                  * [ 11110011 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3 },
+ .arr = (char *)(unsigned char[]){ 0xF3 },
                 .len = 1,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -178,7 +178,7 @@ static const struct {
                  * [ 11110011 01111111 10111111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
                 .len = 4,
                 .exp_len = 1,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -188,7 +188,7 @@ static const struct {
                  * [ 11110011 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF },
                 .len = 2,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -198,7 +198,7 @@ static const struct {
                  * [ 11110011 10111111 01111111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
                 .len = 4,
                 .exp_len = 2,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -208,7 +208,7 @@ static const struct {
                  * [ 11110011 10111111 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
                 .len = 3,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -218,7 +218,7 @@ static const struct {
                  * [ 11110011 10111111 10111111 01111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
                 .len = 4,
                 .exp_len = 3,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -228,7 +228,7 @@ static const struct {
                  * [ 11110000 10000000 10000001 10111111 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+ .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
                 .len = 4,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
_AT_@ -238,7 +238,7 @@ static const struct {
                  * [ 11110100 10010000 10000000 10000000 ] ->
                  * INVALID
                  */
- .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+ .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
                 .len = 4,
                 .exp_len = 4,
                 .exp_cp = LG_CODEPOINT_INVALID,
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
index 99f5d48..9ebaccf 100644
--- a/test/utf8-encode.c
+++ b/test/utf8-encode.c
_AT_@ -9,43 +9,43 @@
 
 static const struct {
         uint_least32_t cp; /* input code point */
- uint8_t *exp_arr; /* expected UTF-8 byte sequence */
+ char *exp_arr; /* expected UTF-8 byte sequence */
         size_t exp_len; /* expected length of UTF-8 sequence */
 } enc_test[] = {
         {
                 /* invalid code point (UTF-16 surrogate half) */
                 .cp = UINT32_C(0xD800),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD },
                 .exp_len = 3,
         },
         {
                 /* invalid code point (UTF-16-unrepresentable) */
                 .cp = UINT32_C(0x110000),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_arr = (char *)(unsigned char[]){ 0xEF, 0xBF, 0xBD },
                 .exp_len = 3,
         },
         {
                 /* code point encoded to a 1-byte sequence */
                 .cp = 0x01,
- .exp_arr = (uint8_t[]){ 0x01 },
+ .exp_arr = (char *)(unsigned char[]){ 0x01 },
                 .exp_len = 1,
         },
         {
                 /* code point encoded to a 2-byte sequence */
                 .cp = 0xFF,
- .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+ .exp_arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
                 .exp_len = 2,
         },
         {
                 /* code point encoded to a 3-byte sequence */
                 .cp = 0xFFF,
- .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .exp_arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
                 .exp_len = 3,
         },
         {
                 /* code point encoded to a 4-byte sequence */
                 .cp = UINT32_C(0xFFFFF),
- .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .exp_arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
                 .exp_len = 4,
         },
 };
_AT_@ -59,7 +59,7 @@ main(int argc, char *argv[])
 
         /* UTF-8 encoder test */
         for (i = 0, failed = 0; i < LEN(enc_test); i++) {
- uint8_t arr[4];
+ char arr[4];
                 size_t len;
 
                 len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));
Received on Fri Dec 17 2021 - 00:47:45 CET

This archive was generated by hypermail 2.3.0 : Fri Dec 17 2021 - 00:48:34 CET