[hackers] [libgrapheme] Refine types (uint8_t -> char, uint32_t -> uint_least32_t) || Laslo Hunhold
commit c0e14c9b89c1ac78b72b7d8840261fbb7285d07a
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Sat Dec 11 14:17:39 2021 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Sat Dec 11 14:17:39 2021 +0100
Refine types (uint8_t -> char, uint32_t -> uint_least32_t)
The type uint32_t is not guaranteed by the standard to be present,
but uint_least32_t is. If a libgrapheme user passes a pointer
to a uint32_t (instead of a uint_least32_t), there will be no problem,
as the presence of uint32_t immediately implies uint32_t ==
uint_least32_t. However, we won't depend on that internally and will
strictly use uint_least32_t. The type name is a mouthful, but still
clearer and not much longer than "long int" (which is guaranteed to be
at least 32 bits).
Regarding uint8_t, it was a bit clumsy to require it in the API. C does
not guarantee that a byte is actually an octet (i.e. char can have
more than 8 bits), and even though the relevance of platforms with a
non-8-bit char seems to be waning, I don't want to rely on char being
exactly 8 bits wide. But more importantly,
accepting "char *" saves some casts on the user-side.
Adapting the lg_utf8_* functions is trivial, as it requires just
being careful with casts. The cast "signed char" <-> "unsigned char"
is unproblematic, so every time we need the bit representation, we
explicitly cast to unsigned char and are done with it. Likewise, every
time we write to a char, we make sure that what we pass is explicitly
an unsigned char.
This became a bit awkward in the test cases where we have char-arrays
with hex literals. As C does not really have a concept of a sub-int
literal, all hexadecimal literals had to first be explicitly cast to
unsigned char, but that's it.
One more aspect where we've become more portable. :)
Signed-off-by: Laslo Hunhold <dev_AT_frign.de>
diff --git a/gen/util.c b/gen/util.c
index ec5afb7..b43a3dc 100644
--- a/gen/util.c
+++ b/gen/util.c
@@ -39,7 +39,7 @@ valid_hexstring(const char *str)
}
static int
-cp_parse(const char *str, uint32_t *cp)
+cp_parse(const char *str, uint_least32_t *cp)
{
if (!valid_hexstring(str)) {
return 1;
@@ -348,13 +348,13 @@ segment_test_list_print(struct segment_test *st, size_t numsegtests,
printf("/* Automatically generated by %s */\n"
"#include <stdint.h>\n#include <stddef.h>\n\n", progname);
- printf("static const struct {\n\tuint32_t *cp;\n"
+ printf("static const struct {\n\tuint_least32_t *cp;\n"
"\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
"\tchar *descr;\n} %s[] = {\n", identifier);
for (i = 0; i < numsegtests; i++) {
printf("\t{\n");
- printf("\t\t.cp = (uint32_t[]){");
+ printf("\t\t.cp = (uint_least32_t[]){");
for (j = 0; j < st[i].cplen; j++) {
printf(" UINT32_C(0x%06X)", st[i].cp[j]);
if (j + 1 < st[i].cplen) {
diff --git a/gen/util.h b/gen/util.h
index 9461416..e269888 100644
--- a/gen/util.h
+++ b/gen/util.h
@@ -8,8 +8,8 @@
#define LEN(x) (sizeof (x) / sizeof *(x))
struct range {
- uint32_t lower;
- uint32_t upper;
+ uint_least32_t lower;
+ uint_least32_t upper;
};
struct property {
@@ -21,7 +21,7 @@ struct property {
};
struct segment_test {
- uint32_t *cp;
+ uint_least32_t *cp;
size_t cplen;
size_t *len;
size_t lenlen;
diff --git a/grapheme.h b/grapheme.h
index 6a6fe4f..62d7cb9 100644
--- a/grapheme.h
+++ b/grapheme.h
@@ -20,9 +20,9 @@ typedef struct lg_internal_segmentation_state {
size_t lg_grapheme_nextbreak(const char *);
-int lg_grapheme_isbreak(uint32_t, uint32_t, LG_SEGMENTATION_STATE *);
+int lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);
-size_t lg_utf8_decode(const uint8_t *, size_t, uint32_t *);
-size_t lg_utf8_encode(uint32_t, uint8_t *, size_t);
+size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
+size_t lg_utf8_encode(uint_least32_t, char *, size_t);
#endif /* GRAPHEME_H */
diff --git a/src/grapheme.c b/src/grapheme.c
index 2feaa2f..731ad37 100644
--- a/src/grapheme.c
+++ b/src/grapheme.c
@@ -13,7 +13,7 @@ enum {
};
int
-lg_grapheme_isbreak(uint32_t a, uint32_t b, LG_SEGMENTATION_STATE *state)
+lg_grapheme_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state)
{
struct lg_internal_heisenstate *p[2] = { 0 };
int ret = 1, flags = 0;
@@ -179,7 +179,7 @@ hasbreak:
size_t
lg_grapheme_nextbreak(const char *str)
{
- uint32_t cp0, cp1;
+ uint_least32_t cp0, cp1;
size_t ret, len = 0;
LG_SEGMENTATION_STATE state = { 0 };
@@ -200,14 +200,14 @@ lg_grapheme_nextbreak(const char *str)
*/
/* get first code point */
- len += lg_utf8_decode((uint8_t *)str, 5, &cp0);
+ len += lg_utf8_decode(str, 5, &cp0);
if (cp0 == LG_CODEPOINT_INVALID) {
return len;
}
while (cp0 != 0) {
/* get next code point */
- ret = lg_utf8_decode((uint8_t *)(str + len), 5, &cp1);
+ ret = lg_utf8_decode(str + len, 5, &cp1);
if (cp1 == LG_CODEPOINT_INVALID ||
lg_grapheme_isbreak(cp0, cp1, &state)) {
diff --git a/src/utf8.c b/src/utf8.c
index 4488359..1cb5e17 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -8,10 +8,10 @@
/* lookup-table for the types of sequence first bytes */
static const struct {
- uint8_t lower; /* lower bound of sequence first byte */
- uint8_t upper; /* upper bound of sequence first byte */
- uint32_t mincp; /* smallest non-overlong encoded code point */
- uint32_t maxcp; /* largest encodable code point */
+ uint8_t lower; /* lower bound of sequence first byte */
+ uint8_t upper; /* upper bound of sequence first byte */
+ uint_least32_t mincp; /* smallest non-overlong encoded code point */
+ uint_least32_t maxcp; /* largest encodable code point */
/*
* implicit: table-offset represents the number of following
* bytes of the form 10xxxxxx (6 bits capacity each)
@@ -21,37 +21,44 @@ static const struct {
/* 0xxxxxxx */
.lower = 0x00, /* 00000000 */
.upper = 0x7F, /* 01111111 */
- .mincp = (uint32_t)0,
- .maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
+ .mincp = (uint_least32_t)0,
+ .maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
},
[1] = {
/* 110xxxxx */
.lower = 0xC0, /* 11000000 */
.upper = 0xDF, /* 11011111 */
- .mincp = (uint32_t)1 << 7,
- .maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
+ .mincp = (uint_least32_t)1 << 7,
+ .maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
},
[2] = {
/* 1110xxxx */
.lower = 0xE0, /* 11100000 */
.upper = 0xEF, /* 11101111 */
- .mincp = (uint32_t)1 << 11,
- .maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
+ .mincp = (uint_least32_t)1 << 11,
+ .maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
},
[3] = {
/* 11110xxx */
.lower = 0xF0, /* 11110000 */
.upper = 0xF7, /* 11110111 */
- .mincp = (uint32_t)1 << 16,
- .maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
+ .mincp = (uint_least32_t)1 << 16,
+ .maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
},
};
size_t
-lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
+lg_utf8_decode(const char *s, size_t n, uint_least32_t *cp)
{
size_t off, i;
+ /*
+ * char is guaranteed to be at least 8 bits, but it could
+ * be more. We assume that the encoding is faithful such
+ * that any higher bits are zero. If we encounter anything
+ * else, we treat it as an encoding error.
+ */
+
if (n == 0) {
/* a sequence must be at least 1 byte long */
*cp = LG_CODEPOINT_INVALID;
@@ -60,13 +67,15 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
/* identify sequence type with the first byte */
for (off = 0; off < LEN(lut); off++) {
- if (BETWEEN(s[0], lut[off].lower, lut[off].upper)) {
+ if (BETWEEN((unsigned char)s[0], lut[off].lower,
+ lut[off].upper)) {
/*
* first byte is within the bounds; fill
* p with the the first bits contained in
* the first byte (by subtracting the high bits)
+ * and discarding any higher bits than 8
*/
- *cp = s[0] - lut[off].lower;
+ *cp = ((unsigned char)s[0] - lut[off].lower) & 0xff;
break;
}
}
@@ -92,7 +101,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
* (i.e. between 0x80 (10000000) and 0xBF (10111111))
*/
for (i = 1; i <= off; i++) {
- if(!BETWEEN(s[i], 0x80, 0xBF)) {
+ if(!BETWEEN((unsigned char)s[i], 0x80, 0xBF)) {
/*
* byte does not match format; return
* number of bytes processed excluding the
@@ -106,7 +115,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
* shift code point by 6 bits and add the 6 stored bits
* in s[i] to it using the bitmask 0x3F (00111111)
*/
- *cp = (*cp << 6) | (s[i] & 0x3F);
+ *cp = (*cp << 6) | ((unsigned char)s[i] & 0x3F);
}
if (*cp < lut[off].mincp ||
@@ -125,7 +134,7 @@ lg_utf8_decode(const uint8_t *s, size_t n, uint32_t *cp)
}
size_t
-lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
+lg_utf8_encode(uint_least32_t cp, char *s, size_t n)
{
size_t off, i;
@@ -161,7 +170,7 @@ lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
* We do not overwrite the mask because we guaranteed earlier
* that there are no bits higher than the mask allows.
*/
- s[0] = lut[off].lower | (cp >> (6 * off));
+ s[0] = (unsigned char)(lut[off].lower | (cp >> (6 * off)));
for (i = 1; i <= off; i++) {
/*
@@ -170,7 +179,7 @@ lg_utf8_encode(uint32_t cp, uint8_t *s, size_t n)
* extract from the properly-shifted value using the
* mask 00111111 (0x3F)
*/
- s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+ s[i] = (unsigned char)(0x80 | ((cp >> (6 * (off - i))) & 0x3F));
}
return 1 + off;
diff --git a/src/util.c b/src/util.c
index 5bbe926..16663f8 100644
--- a/src/util.c
+++ b/src/util.c
@@ -41,14 +41,14 @@ heisenstate_set(struct lg_internal_heisenstate *h, int slot, int state)
static int
cp_cmp(const void *a, const void *b)
{
- uint32_t cp = *(uint32_t *)a;
- uint32_t *range = (uint32_t *)b;
+ uint_least32_t cp = *(uint_least32_t *)a;
+ uint_least32_t *range = (uint_least32_t *)b;
return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
}
int
-has_property(uint32_t cp, struct lg_internal_heisenstate *cpstate,
+has_property(uint_least32_t cp, struct lg_internal_heisenstate *cpstate,
const struct range_list *proptable, int property)
{
int res;
diff --git a/src/util.h b/src/util.h
index 3a9cccd..065097b 100644
--- a/src/util.h
+++ b/src/util.h
@@ -10,8 +10,8 @@
#define LEN(x) (sizeof(x) / sizeof(*(x)))
struct range {
- uint32_t lower;
- uint32_t upper;
+ uint_least32_t lower;
+ uint_least32_t upper;
};
struct range_list {
@@ -22,7 +22,7 @@ struct range_list {
int heisenstate_get(struct lg_internal_heisenstate *, int);
int heisenstate_set(struct lg_internal_heisenstate *, int, int);
-int has_property(uint32_t, struct lg_internal_heisenstate *,
+int has_property(uint_least32_t, struct lg_internal_heisenstate *,
const struct range_list *, int);
#endif /* UTIL_H */
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
index 1182fb0..ee71cf9 100644
--- a/test/utf8-decode.c
+++ b/test/utf8-decode.c
@@ -9,7 +9,7 @@
#define LEN(x) (sizeof(x) / sizeof(*(x)))
static const struct {
- uint8_t *arr; /* UTF-8 byte sequence */
+ char *arr; /* UTF-8 byte sequence */
size_t len; /* length of UTF-8 byte sequence */
size_t exp_len; /* expected length returned */
uint32_t exp_cp; /* expected code point returned */
@@ -29,7 +29,9 @@ static const struct {
* [ 11111101 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xFD },
+ .arr = (char[]){
+ (unsigned char)0xFD,
+ },
.len = 1,
.exp_len = 1,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -39,7 +41,9 @@ static const struct {
* [ 00000001 ] ->
* 0000001
*/
- .arr = (uint8_t[]){ 0x01 },
+ .arr = (char[]){
+ (unsigned char)0x01,
+ },
.len = 1,
.exp_len = 1,
.exp_cp = 0x1,
@@ -49,7 +53,10 @@ static const struct {
* [ 11000011 10111111 ] ->
* 00011111111
*/
- .arr = (uint8_t[]){ 0xC3, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xC3,
+ (unsigned char)0xBF,
+ },
.len = 2,
.exp_len = 2,
.exp_cp = 0xFF,
@@ -59,7 +66,9 @@ static const struct {
* [ 11000011 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xC3 },
+ .arr = (char[]){
+ (unsigned char)0xC3
+ },
.len = 1,
.exp_len = 2,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -69,7 +78,10 @@ static const struct {
* [ 11000011 11111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xC3, 0xFF },
+ .arr = (char[]){
+ (unsigned char)0xC3,
+ (unsigned char)0xFF,
+ },
.len = 2,
.exp_len = 1,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -79,7 +91,10 @@ static const struct {
* [ 11000001 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xC1, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xC1,
+ (unsigned char)0xBF,
+ },
.len = 2,
.exp_len = 2,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -89,7 +104,11 @@ static const struct {
* [ 11100000 10111111 10111111 ] ->
* 0000111111111111
*/
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
.len = 3,
.exp_len = 3,
.exp_cp = 0xFFF,
@@ -99,7 +118,9 @@ static const struct {
* [ 11100000 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0 },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ },
.len = 1,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -109,7 +130,11 @@ static const struct {
* [ 11100000 01111111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0, 0x7F, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0x7F,
+ (unsigned char)0xBF,
+ },
.len = 3,
.exp_len = 1,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -119,7 +144,10 @@ static const struct {
* [ 11100000 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0xBF,
+ },
.len = 2,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -129,7 +157,11 @@ static const struct {
* [ 11100000 10111111 01111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0, 0xBF, 0x7F },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0xBF,
+ (unsigned char)0x7F,
+ },
.len = 3,
.exp_len = 2,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -139,7 +171,11 @@ static const struct {
* [ 11100000 10011111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xE0, 0x9F, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0x9F,
+ (unsigned char)0xBF,
+ },
.len = 3,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -149,7 +185,11 @@ static const struct {
* [ 11101101 10100000 10000000 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xED, 0xA0, 0x80 },
+ .arr = (char[]){
+ (unsigned char)0xED,
+ (unsigned char)0xA0,
+ (unsigned char)0x80,
+ },
.len = 3,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -159,7 +199,12 @@ static const struct {
* [ 11110011 10111111 10111111 10111111 ] ->
* 011111111111111111111
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
.len = 4,
.exp_len = 4,
.exp_cp = UINT32_C(0xFFFFF),
@@ -169,7 +214,9 @@ static const struct {
* [ 11110011 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3 },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ },
.len = 1,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -179,7 +226,12 @@ static const struct {
* [ 11110011 01111111 10111111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0x7F, 0xBF, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0x7F,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
.len = 4,
.exp_len = 1,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -189,7 +241,10 @@ static const struct {
* [ 11110011 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ },
.len = 2,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -199,7 +254,12 @@ static const struct {
* [ 11110011 10111111 01111111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0x7F, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0x7F,
+ (unsigned char)0xBF,
+ },
.len = 4,
.exp_len = 2,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -209,7 +269,11 @@ static const struct {
* [ 11110011 10111111 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
.len = 3,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -219,7 +283,12 @@ static const struct {
* [ 11110011 10111111 10111111 01111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0x7F },
+ .arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ (unsigned char)0x7F,
+ },
.len = 4,
.exp_len = 3,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -229,7 +298,12 @@ static const struct {
* [ 11110000 10000000 10000001 10111111 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF0, 0x80, 0x81, 0xBF },
+ .arr = (char[]){
+ (unsigned char)0xF0,
+ (unsigned char)0x80,
+ (unsigned char)0x81,
+ (unsigned char)0xBF,
+ },
.len = 4,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -239,7 +313,12 @@ static const struct {
* [ 11110100 10010000 10000000 10000000 ] ->
* INVALID
*/
- .arr = (uint8_t[]){ 0xF4, 0x90, 0x80, 0x80 },
+ .arr = (char[]){
+ (unsigned char)0xF4,
+ (unsigned char)0x90,
+ (unsigned char)0x80,
+ (unsigned char)0x80,
+ },
.len = 4,
.exp_len = 4,
.exp_cp = LG_CODEPOINT_INVALID,
@@ -254,7 +333,7 @@ main(void)
/* UTF-8 decoder test */
for (i = 0, failed = 0; i < LEN(dec_test); i++) {
size_t len;
- uint32_t cp;
+ uint_least32_t cp;
len = lg_utf8_decode(dec_test[i].arr,
dec_test[i].len, &cp);
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
index 2f978d2..4ecf32a 100644
--- a/test/utf8-encode.c
+++ b/test/utf8-encode.c
@@ -9,44 +9,66 @@
#define LEN(x) (sizeof(x) / sizeof(*(x)))
static const struct {
- uint32_t cp; /* input code point */
- uint8_t *exp_arr; /* expected UTF-8 byte sequence */
- size_t exp_len; /* expected length of UTF-8 sequence */
+ uint_least32_t cp; /* input code point */
+ char *exp_arr; /* expected UTF-8 byte sequence */
+ size_t exp_len; /* expected length of UTF-8 sequence */
} enc_test[] = {
{
/* invalid code point (UTF-16 surrogate half) */
.cp = UINT32_C(0xD800),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_arr = (char[]){
+ (unsigned char)0xEF,
+ (unsigned char)0xBF,
+ (unsigned char)0xBD,
+ },
.exp_len = 3,
},
{
/* invalid code point (UTF-16-unrepresentable) */
.cp = UINT32_C(0x110000),
- .exp_arr = (uint8_t[]){ 0xEF, 0xBF, 0xBD },
+ .exp_arr = (char[]){
+ (unsigned char)0xEF,
+ (unsigned char)0xBF,
+ (unsigned char)0xBD,
+ },
.exp_len = 3,
},
{
/* code point encoded to a 1-byte sequence */
.cp = 0x01,
- .exp_arr = (uint8_t[]){ 0x01 },
+ .exp_arr = (char[]){
+ (unsigned char)0x01
+ },
.exp_len = 1,
},
{
/* code point encoded to a 2-byte sequence */
.cp = 0xFF,
- .exp_arr = (uint8_t[]){ 0xC3, 0xBF },
+ .exp_arr = (char[]){
+ (unsigned char)0xC3,
+ (unsigned char)0xBF,
+ },
.exp_len = 2,
},
{
/* code point encoded to a 3-byte sequence */
.cp = 0xFFF,
- .exp_arr = (uint8_t[]){ 0xE0, 0xBF, 0xBF },
+ .exp_arr = (char[]){
+ (unsigned char)0xE0,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
.exp_len = 3,
},
{
/* code point encoded to a 4-byte sequence */
.cp = UINT32_C(0xFFFFF),
- .exp_arr = (uint8_t[]){ 0xF3, 0xBF, 0xBF, 0xBF },
+ .exp_arr = (char[]){
+ (unsigned char)0xF3,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ (unsigned char)0xBF,
+ },
.exp_len = 4,
},
};
@@ -58,7 +80,7 @@ main(void)
/* UTF-8 encoder test */
for (i = 0, failed = 0; i < LEN(enc_test); i++) {
- uint8_t arr[4];
+ char arr[4];
size_t len;
len = lg_utf8_encode(enc_test[i].cp, arr, LEN(arr));
Received on Sat Dec 11 2021 - 14:25:26 CET
This archive was generated by hypermail 2.3.0
: Sat Dec 11 2021 - 14:36:31 CET