[hackers] [libgrapheme] Rewrite grapheme_next_character_break() and add size-parameter || Laslo Hunhold from git_AT_suckless.org on 2021-12-19 (hackers mail list archive)

From: <git_AT_suckless.org>
Date: Sun, 19 Dec 2021 00:56:07 +0100 (CET)

commit f8e8649a4fd88e61f9473400f44b9b1c5fce9e7c
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Sun Dec 19 00:52:23 2021 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Sun Dec 19 00:52:23 2021 +0100

    Rewrite grapheme_next_character_break() and add size-parameter

    You will not always have a NUL-terminated string to look at;
    sometimes it is a length-bounded "raw" array in memory. Comparable to how
    we already do it in grapheme_decode_utf8() to handle NUL-terminated
    strings, we add a len-parameter to grapheme_next_character_break()
    that can be set to SIZE_MAX to indicate that the string doesn't have
    a known bound but is instead NUL-terminated. Otherwise, if len is
    not SIZE_MAX, we have a proper bound.

    It was planned anyway, but this was a good point to rewrite the function
    to make it more readable and simplify it. There was especially no reason
    to call grapheme_decode_utf8() more than once.

    This will bring 99% feature-parity with what most people do with
    ICU without all the unnecessary cruft, boiler-plate and incantations
    you need with ICU.

    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/grapheme.h b/grapheme.h
index ea8a02d..c2def7c 100644
--- a/grapheme.h
+++ b/grapheme.h
@@ -19,7 +19,7 @@ typedef struct grapheme_internal_segmentation_state {

#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)

-size_t grapheme_next_character_break(const char *);
+size_t grapheme_next_character_break(const char *, size_t);

bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);

diff --git a/man/grapheme_next_character_break.3 b/man/grapheme_next_character_break.3
index 1e96383..962b2ce 100644
--- a/man/grapheme_next_character_break.3
+++ b/man/grapheme_next_character_break.3
@@ -7,19 +7,30 @@
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
-.Fn grapheme_next_character_break "const char *str"
+.Fn grapheme_next_character_break "const char *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_character_break
function computes the offset (in bytes) to the next grapheme
cluster break (see
.Xr libgrapheme 7 )
-in the UTF-8-encoded NUL-terminated string
-.Va str .
+in the UTF-8-encoded string
+.Va str
+of length
+.Va len .
If a grapheme cluster begins at
.Va str
this offset is equal to the length of said grapheme cluster.
.Pp
+If
+.Va len
+is set to
+.Dv SIZE_MAX
+(stdint.h is already included by grapheme.h) the string
+.Va str
+is interpreted to be NUL-terminated and processing stops when a
+NUL-byte is encountered.
+.Pp
For non-UTF-8 input data
.Xr grapheme_is_character_break 3
can be used instead.
@@ -48,15 +59,24 @@ main(void)
                   "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
                   "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
                   "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
- size_t len;
+ size_t ret, len, off;

         printf("Input: \\"%s\\"\\n", s);

         /* print each grapheme cluster with byte-length */
- for (; *s != '\\0';) {
- len = grapheme_next_character_break(s);
- printf("%2zu bytes | %.*s\\n", len, (int)len, s, len);
- s += len;
+ printf("Grapheme clusters in NUL-delimited input:\\n");
+ for (off = 0; s[off] != '\\0'; off += ret) {
+ ret = grapheme_next_character_break(s + off, SIZE_MAX);
+ printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
+ }
+ printf("\\n");
+
+ /* do the same, but this time string is length-delimited */
+ len = 17;
+ printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
+ for (off = 0; off < len; off += ret) {
+ ret = grapheme_next_character_break(s + off, len - off);
+ printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
         }

         return 0;
diff --git a/src/character.c b/src/character.c
index 8f1143f..2215543 100644
--- a/src/character.c
+++ b/src/character.c
@@ -179,50 +179,41 @@ hasbreak:
}

size_t
-grapheme_next_character_break(const char *str)
+grapheme_next_character_break(const char *str, size_t len)
{
- uint_least32_t cp0, cp1;
- size_t ret, len = 0;
         GRAPHEME_STATE state = { 0 };
+ uint_least32_t cp0 = 0, cp1 = 0;
+ size_t off, ret;

- if (str == NULL) {
+ if (str == NULL || len == 0) {
                 return 0;
         }

- /*
- * grapheme_decode_utf8, when it encounters an unexpected byte,
- * does not count it to the error and instead assumes that the
- * unexpected byte is the beginning of a new sequence.
- * This way, when the string ends with a null byte, we never
- * miss it, even if the previous UTF-8 sequence terminates
- * unexpectedly, as it would either act as an unexpected byte,
- * saved for later, or as a null byte itself, that we can catch.
- * We pass SIZE_MAX to the length, as we will never read beyond
- * the null byte for the reasons given above.
- */
-
- /* get first codepoint */
- len += grapheme_decode_utf8(str, SIZE_MAX, &cp0);
- if (cp0 == GRAPHEME_INVALID_CODEPOINT) {
- return len;
- }
+ for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
+ cp0 = cp1;
+ ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
+ SIZE_MAX : len - off, &cp1);

- while (cp0 != 0) {
- /* get next codepoint */
- ret = grapheme_decode_utf8(str + len, SIZE_MAX, &cp1);
+ if (len != SIZE_MAX && ret > (len - off)) {
+ /* string ended abruptly, simply accept cropping */
+ ret = len - off;
+ }

- if (cp1 == GRAPHEME_INVALID_CODEPOINT ||
- grapheme_is_character_break(cp0, cp1, &state)) {
- /* we read an invalid cp or have a breakpoint */
+ if (len == SIZE_MAX && cp1 == 0) {
+ /* we hit a NUL-byte and are done */
                         break;
- } else {
- /* we don't have a breakpoint, continue */
- len += ret;
                 }

- /* prepare next round */
- cp0 = cp1;
+ if (off == 0) {
+ /*
+ * we skip the first round, as we need both
+ * cp0 and cp1 to be initialized
+ */
+ continue;
+ } else if (grapheme_is_character_break(cp0, cp1, &state)) {
+ break;
+ }
         }

- return len;
+ return off;
}
Received on Sun Dec 19 2021 - 00:56:07 CET

This archive was generated by hypermail 2.3.0 : Sun Dec 19 2021 - 01:00:33 CET