[hackers] [libgrapheme] Rename functions/defines/files from "grapheme" to "character" || Laslo Hunhold from git_AT_suckless.org on 2021-12-18 (hackers mail list archive)

From: <git_AT_suckless.org>
Date: Sat, 18 Dec 2021 12:15:21 +0100 (CET)

commit dfda0db8503b0051addc96368840b06c22fa8eeb
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Sat Dec 18 12:02:20 2021 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Sat Dec 18 12:11:08 2021 +0100

    Rename functions/defines/files from "grapheme" to "character"

    It was always confusing to have "grapheme" used in different contexts.
    One is the library name, which is also the prefix for all constants,
    the other is to indicate we have functions for analyzing grapheme
    clusters.

    We rename all functions related to graphemes to operate on "characters"
    instead, where these are user-perceived characters. This naming choice
    is not out of the ordinary and also what libunistring for instance uses.

    Additionally, rename gen/grapheme.c to gen/character-prop.c to indicate
    we extract properties, improving readability. This also removes a bit
    of ambiguity regarding internal constants prefixed with GRAPHEME_, which
    might suggest that these were "officially" from grapheme.h, even though
    they serve only an internal use for characters specifically.

    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/Makefile b/Makefile
index 64327f3..8f6d694 100644
--- a/Makefile
+++ b/Makefile
@@ -10,17 +10,17 @@ DATA =\
         data/GraphemeBreakTest.txt\

GEN =\
- gen/grapheme\
- gen/grapheme-test\
+ gen/character-prop\
+ gen/character-test\

SRC =\
- src/grapheme\
+ src/character\
         src/utf8\
         src/util\

TEST =\
- test/grapheme\
- test/grapheme-performance\
+ test/character\
+ test/character-performance\
         test/utf8-decode\
         test/utf8-encode\

@@ -34,27 +34,27 @@ MAN7 = man/libgrapheme.7

all: libgrapheme.a libgrapheme.so

-gen/grapheme.o: gen/grapheme.c config.mk gen/util.h
-gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h
+gen/character-prop.o: gen/character-prop.c config.mk gen/util.h
+gen/character-test.o: gen/character-test.c config.mk gen/util.h
gen/util.o: gen/util.c config.mk gen/util.h
-src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h
+src/character.o: src/character.c config.mk gen/character-prop.h grapheme.h src/util.h
src/utf8.o: src/utf8.c config.mk grapheme.h
src/util.o: src/util.c config.mk grapheme.h src/util.h
-test/grapheme.o: test/grapheme.c config.mk gen/grapheme-test.h grapheme.h test/util.h
-test/grapheme-performance.o: test/grapheme-performance.c config.mk gen/grapheme-test.h grapheme.h test/util.h
+test/character.o: test/character.c config.mk gen/character-test.h grapheme.h test/util.h
+test/character-performance.o: test/character-performance.c config.mk gen/character-test.h grapheme.h test/util.h
test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/util.h
test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
test/util.o: test/util.c config.mk test/util.h

-gen/grapheme: gen/grapheme.o gen/util.o
-gen/grapheme-test: gen/grapheme-test.o gen/util.o
-test/grapheme: test/grapheme.o test/util.o libgrapheme.a
-test/grapheme-performance: test/grapheme-performance.o test/util.o libgrapheme.a
+gen/character-prop: gen/character-prop.o gen/util.o
+gen/character-test: gen/character-test.o gen/util.o
+test/character: test/character.o test/util.o libgrapheme.a
+test/character-performance: test/character-performance.o test/util.o libgrapheme.a
test/utf8-encode: test/utf8-encode.o test/util.o libgrapheme.a
test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a

-gen/grapheme.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/grapheme
-gen/grapheme-test.h: data/GraphemeBreakTest.txt gen/grapheme-test
+gen/character-prop.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character-prop
+gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test

data/emoji-data.txt:
         wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
diff --git a/gen/grapheme.c b/gen/character-prop.c
similarity index 67%
rename from gen/grapheme.c
rename to gen/character-prop.c
index 1b01125..5a0bbbc 100644
--- a/gen/grapheme.c
+++ b/gen/character-prop.c
@@ -8,72 +8,72 @@

static struct property segment_property[] = {
         {
- .enumname = "GRAPHEME_PROP_CONTROL",
+ .enumname = "CHARACTER_PROP_CONTROL",
                 .identifier = "Control",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_CR",
+ .enumname = "CHARACTER_PROP_CR",
                 .identifier = "CR",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_EXTEND",
+ .enumname = "CHARACTER_PROP_EXTEND",
                 .identifier = "Extend",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC",
+ .enumname = "CHARACTER_PROP_EXTENDED_PICTOGRAPHIC",
                 .identifier = "Extended_Pictographic",
                 .fname = FILE_EMOJI,
         },
         {
- .enumname = "GRAPHEME_PROP_HANGUL_L",
+ .enumname = "CHARACTER_PROP_HANGUL_L",
                 .identifier = "L",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_HANGUL_V",
+ .enumname = "CHARACTER_PROP_HANGUL_V",
                 .identifier = "V",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_HANGUL_T",
+ .enumname = "CHARACTER_PROP_HANGUL_T",
                 .identifier = "T",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_HANGUL_LV",
+ .enumname = "CHARACTER_PROP_HANGUL_LV",
                 .identifier = "LV",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_HANGUL_LVT",
+ .enumname = "CHARACTER_PROP_HANGUL_LVT",
                 .identifier = "LVT",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_LF",
+ .enumname = "CHARACTER_PROP_LF",
                 .identifier = "LF",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_PREPEND",
+ .enumname = "CHARACTER_PROP_PREPEND",
                 .identifier = "Prepend",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_REGIONAL_INDICATOR",
+ .enumname = "CHARACTER_PROP_REGIONAL_INDICATOR",
                 .identifier = "Regional_Indicator",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_SPACINGMARK",
+ .enumname = "CHARACTER_PROP_SPACINGMARK",
                 .identifier = "SpacingMark",
                 .fname = FILE_GRAPHEME,
         },
         {
- .enumname = "GRAPHEME_PROP_ZWJ",
+ .enumname = "CHARACTER_PROP_ZWJ",
                 .identifier = "ZWJ",
                 .fname = FILE_GRAPHEME,
         },
@@ -86,7 +86,7 @@ main(int argc, char *argv[])

         property_list_parse(segment_property, LEN(segment_property));
         property_list_print(segment_property, LEN(segment_property),
- "grapheme_prop", argv[0]);
+ "character_prop", argv[0]);
         property_list_free(segment_property, LEN(segment_property));

         return 0;
diff --git a/gen/grapheme-test.c b/gen/character-test.c
similarity index 82%
rename from gen/grapheme-test.c
rename to gen/character-test.c
index 174d01b..d4235cb 100644
--- a/gen/grapheme-test.c
+++ b/gen/character-test.c
@@ -12,7 +12,7 @@ main(int argc, char *argv[])
         (void)argc;

         segment_test_list_parse("data/GraphemeBreakTest.txt", &st, &numsegtests);
- segment_test_list_print(st, numsegtests, "grapheme_test", argv[0]);
+ segment_test_list_print(st, numsegtests, "character_test", argv[0]);
         segment_test_list_free(st, numsegtests);

         return 0;
diff --git a/grapheme.h b/grapheme.h
index f8eefba..ff25358 100644
--- a/grapheme.h
+++ b/grapheme.h
@@ -19,9 +19,9 @@ typedef struct lg_internal_segmentation_state {

#define LG_INVALID_CODE_POINT UINT32_C(0xFFFD)

-size_t lg_grapheme_nextbreak(const char *);
+size_t lg_character_nextbreak(const char *);

-bool lg_grapheme_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);
+bool lg_character_isbreak(uint_least32_t, uint_least32_t, LG_SEGMENTATION_STATE *);

size_t lg_utf8_decode(const char *, size_t, uint_least32_t *);
size_t lg_utf8_encode(uint_least32_t, char *, size_t);
diff --git a/src/character.c b/src/character.c
new file mode 100644
index 0000000..798fec3
--- /dev/null
+++ b/src/character.c
@@ -0,0 +1,228 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../gen/character-prop.h"
+#include "../grapheme.h"
+#include "util.h"
+
+enum {
+ CHARACTER_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
+ CHARACTER_FLAG_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
+};
+
+bool
+lg_character_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state)
+{
+ struct lg_internal_heisenstate *p[2] = { 0 };
+ uint_least16_t flags = 0;
+ bool isbreak = true;
+
+ /* set state depending on state pointer */
+ if (state != NULL) {
+ p[0] = &(state->a);
+ p[1] = &(state->b);
+ flags = state->flags;
+ }
+
+ /* skip printable ASCII */
+ if ((a >= 0x20 && a <= 0x7E) &&
+ (b >= 0x20 && b <= 0x7E)) {
+ goto hasbreak;
+ }
+
+ /*
+ * Apply grapheme cluster breaking algorithm (UAX #29), see
+ * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+ */
+
+ /*
+ * update flags, if state-pointer given
+ */
+ if (has_property(b, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) {
+ if (has_property(a, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) {
+ /* one more RI is on the left side of the seam, flip state */
+ flags ^= CHARACTER_FLAG_RI_ODD;
+ } else {
+ /* an RI appeared on the right side but the left
+ side is not an RI, reset state (number 0 is even) */
+ flags &= ~CHARACTER_FLAG_RI_ODD;
+ }
+ }
+ if (!(flags & CHARACTER_FLAG_EMOJI) &&
+ ((has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) ||
+ (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)))) {
+ flags |= CHARACTER_FLAG_EMOJI;
+ } else if ((flags & CHARACTER_FLAG_EMOJI) &&
+ ((has_property(a, p[0], character_prop, CHARACTER_PROP_ZWJ) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) ||
+ (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTEND) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)) ||
+ (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTEND) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) ||
+ (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) ||
+ (has_property(a, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND)))) {
+ /* CHARACTER_FLAG_EMOJI remains */
+ } else {
+ flags &= ~CHARACTER_FLAG_EMOJI;
+ }
+
+ /* write updated flags to state, if given */
+ if (state != NULL) {
+ state->flags = flags;
+ }
+
+ /*
+ * apply rules
+ */
+
+ /* skip GB1 and GB2, as they are never satisfied here */
+
+ /* GB3 */
+ if (has_property(a, p[0], character_prop, CHARACTER_PROP_CR) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_LF)) {
+ goto nobreak;
+ }
+
+ /* GB4 */
+ if (has_property(a, p[0], character_prop, CHARACTER_PROP_CONTROL) ||
+ has_property(a, p[0], character_prop, CHARACTER_PROP_CR) ||
+ has_property(a, p[0], character_prop, CHARACTER_PROP_LF)) {
+ goto hasbreak;
+ }
+
+ /* GB5 */
+ if (has_property(b, p[1], character_prop, CHARACTER_PROP_CONTROL) ||
+ has_property(b, p[1], character_prop, CHARACTER_PROP_CR) ||
+ has_property(b, p[1], character_prop, CHARACTER_PROP_LF)) {
+ goto hasbreak;
+ }
+
+ /* GB6 */
+ if (has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_L) &&
+ (has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_L) ||
+ has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_V) ||
+ has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_LV) ||
+
+ has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_LVT))) {
+ goto nobreak;
+ }
+
+ /* GB7 */
+ if ((has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_LV) ||
+ has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_V)) &&
+ (has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_V) ||
+ has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_T))) {
+ goto nobreak;
+ }
+
+ /* GB8 */
+ if ((has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_LVT) ||
+ has_property(a, p[0], character_prop, CHARACTER_PROP_HANGUL_T)) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_HANGUL_T)) {
+ goto nobreak;
+ }
+
+ /* GB9 */
+ if (has_property(b, p[1], character_prop, CHARACTER_PROP_EXTEND) ||
+ has_property(b, p[1], character_prop, CHARACTER_PROP_ZWJ)) {
+ goto nobreak;
+ }
+
+ /* GB9a */
+ if (has_property(b, p[1], character_prop, CHARACTER_PROP_SPACINGMARK)) {
+ goto nobreak;
+ }
+
+ /* GB9b */
+ if (has_property(a, p[0], character_prop, CHARACTER_PROP_PREPEND)) {
+ goto nobreak;
+ }
+
+ /* GB11 */
+ if ((flags & CHARACTER_FLAG_EMOJI) &&
+ has_property(a, p[0], character_prop, CHARACTER_PROP_ZWJ) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) {
+ goto nobreak;
+ }
+
+ /* GB12/GB13 */
+ if (has_property(a, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) &&
+ has_property(b, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) &&
+ (flags & CHARACTER_FLAG_RI_ODD)) {
+ goto nobreak;
+ }
+
+ /* GB999 */
+ goto hasbreak;
+nobreak:
+ isbreak = false;
+hasbreak:
+ if (state != NULL) {
+ /* move b-state to a-state, discard b-state */
+ memcpy(&(state->a), &(state->b), sizeof(state->a));
+ memset(&(state->b), 0, sizeof(state->b));
+
+ /* reset flags */
+ if (isbreak) {
+ state->flags = 0;
+ }
+ }
+
+ return isbreak;
+}
+
+size_t
+lg_character_nextbreak(const char *str)
+{
+ uint_least32_t cp0, cp1;
+ size_t ret, len = 0;
+ LG_SEGMENTATION_STATE state = { 0 };
+
+ if (str == NULL) {
+ return 0;
+ }
+
+ /*
+ * lg_utf8_decode, when it encounters an unexpected byte,
+ * does not count it to the error and instead assumes that the
+ * unexpected byte is the beginning of a new sequence.
+ * This way, when the string ends with a null byte, we never
+ * miss it, even if the previous UTF-8 sequence terminates
+ * unexpectedly, as it would either act as an unexpected byte,
+ * saved for later, or as a null byte itself, that we can catch.
+ * We pass (size_t)-1 to the length, as we will never read beyond
+ * the null byte for the reasons given above.
+ */
+
+ /* get first code point */
+ len += lg_utf8_decode(str, (size_t)-1, &cp0);
+ if (cp0 == LG_INVALID_CODE_POINT) {
+ return len;
+ }
+
+ while (cp0 != 0) {
+ /* get next code point */
+ ret = lg_utf8_decode(str + len, (size_t)-1, &cp1);
+
+ if (cp1 == LG_INVALID_CODE_POINT ||
+ lg_character_isbreak(cp0, cp1, &state)) {
+ /* we read an invalid cp or have a breakpoint */
+ break;
+ } else {
+ /* we don't have a breakpoint, continue */
+ len += ret;
+ }
+
+ /* prepare next round */
+ cp0 = cp1;
+ }
+
+ return len;
+}
diff --git a/src/grapheme.c b/src/grapheme.c
deleted file mode 100644
index 8dae759..0000000
--- a/src/grapheme.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "../gen/grapheme.h"
-#include "../grapheme.h"
-#include "util.h"
-
-enum {
- GRAPHEME_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
- GRAPHEME_FLAG_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
-};
-
-bool
-lg_grapheme_isbreak(uint_least32_t a, uint_least32_t b, LG_SEGMENTATION_STATE *state)
-{
- struct lg_internal_heisenstate *p[2] = { 0 };
- uint_least16_t flags = 0;
- bool isbreak = true;
-
- /* set state depending on state pointer */
- if (state != NULL) {
- p[0] = &(state->a);
- p[1] = &(state->b);
- flags = state->flags;
- }
-
- /* skip printable ASCII */
- if ((a >= 0x20 && a <= 0x7E) &&
- (b >= 0x20 && b <= 0x7E)) {
- goto hasbreak;
- }
-
- /*
- * Apply grapheme cluster breaking algorithm (UAX #29), see
- * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
- */
-
- /*
- * update flags, if state-pointer given
- */
- if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
- if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
- /* one more RI is on the left side of the seam, flip state */
- flags ^= GRAPHEME_FLAG_RI_ODD;
- } else {
- /* an RI appeared on the right side but the left
- side is not an RI, reset state (number 0 is even) */
- flags &= ~GRAPHEME_FLAG_RI_ODD;
- }
- }
- if (!(flags & GRAPHEME_FLAG_EMOJI) &&
- ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
- (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
- flags |= GRAPHEME_FLAG_EMOJI;
- } else if ((flags & GRAPHEME_FLAG_EMOJI) &&
- ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) ||
- (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) ||
- (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
- (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
- (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
- /* GRAPHEME_FLAG_EMOJI remains */
- } else {
- flags &= ~GRAPHEME_FLAG_EMOJI;
- }
-
- /* write updated flags to state, if given */
- if (state != NULL) {
- state->flags = flags;
- }
-
- /*
- * apply rules
- */
-
- /* skip GB1 and GB2, as they are never satisfied here */
-
- /* GB3 */
- if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) {
- goto nobreak;
- }
-
- /* GB4 */
- if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
- has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) ||
- has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_LF)) {
- goto hasbreak;
- }
-
- /* GB5 */
- if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CR) ||
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) {
- goto hasbreak;
- }
-
- /* GB6 */
- if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) &&
- (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) ||
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
-
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) {
- goto nobreak;
- }
-
- /* GB7 */
- if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
- has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) &&
- (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) {
- goto nobreak;
- }
-
- /* GB8 */
- if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) ||
- has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) {
- goto nobreak;
- }
-
- /* GB9 */
- if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND) ||
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) {
- goto nobreak;
- }
-
- /* GB9a */
- if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) {
- goto nobreak;
- }
-
- /* GB9b */
- if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) {
- goto nobreak;
- }
-
- /* GB11 */
- if ((flags & GRAPHEME_FLAG_EMOJI) &&
- has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) {
- goto nobreak;
- }
-
- /* GB12/GB13 */
- if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
- has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
- (flags & GRAPHEME_FLAG_RI_ODD)) {
- goto nobreak;
- }
-
- /* GB999 */
- goto hasbreak;
-nobreak:
- isbreak = false;
-hasbreak:
- if (state != NULL) {
- /* move b-state to a-state, discard b-state */
- memcpy(&(state->a), &(state->b), sizeof(state->a));
- memset(&(state->b), 0, sizeof(state->b));
-
- /* reset flags */
- if (isbreak) {
- state->flags = 0;
- }
- }
-
- return isbreak;
-}
-
-size_t
-lg_grapheme_nextbreak(const char *str)
-{
- uint_least32_t cp0, cp1;
- size_t ret, len = 0;
- LG_SEGMENTATION_STATE state = { 0 };
-
- if (str == NULL) {
- return 0;
- }
-
- /*
- * lg_utf8_decode, when it encounters an unexpected byte,
- * does not count it to the error and instead assumes that the
- * unexpected byte is the beginning of a new sequence.
- * This way, when the string ends with a null byte, we never
- * miss it, even if the previous UTF-8 sequence terminates
- * unexpectedly, as it would either act as an unexpected byte,
- * saved for later, or as a null byte itself, that we can catch.
- * We pass (size_t)-1 to the length, as we will never read beyond
- * the null byte for the reasons given above.
- */
-
- /* get first code point */
- len += lg_utf8_decode(str, (size_t)-1, &cp0);
- if (cp0 == LG_INVALID_CODE_POINT) {
- return len;
- }
-
- while (cp0 != 0) {
- /* get next code point */
- ret = lg_utf8_decode(str + len, (size_t)-1, &cp1);
-
- if (cp1 == LG_INVALID_CODE_POINT ||
- lg_grapheme_isbreak(cp0, cp1, &state)) {
- /* we read an invalid cp or have a breakpoint */
- break;
- } else {
- /* we don't have a breakpoint, continue */
- len += ret;
- }
-
- /* prepare next round */
- cp0 = cp1;
- }
-
- return len;
-}
diff --git a/test/grapheme-performance.c b/test/character-performance.c
similarity index 74%
rename from test/grapheme-performance.c
rename to test/character-performance.c
index 4bfd429..5d25e82 100644
--- a/test/grapheme-performance.c
+++ b/test/character-performance.c
@@ -6,7 +6,7 @@
#include <time.h>

#include "../grapheme.h"
-#include "../gen/grapheme-test.h"
+#include "../gen/character-test.h"
#include "util.h"

#define NUM_ITERATIONS 1000
@@ -23,18 +23,18 @@ main(int argc, char *argv[])
         (void)argc;

         /* allocate and generate buffer */
- for (i = 0, bufsiz = 0; i < LEN(grapheme_test); i++) {
- bufsiz += grapheme_test[i].cplen;
+ for (i = 0, bufsiz = 0; i < LEN(character_test); i++) {
+ bufsiz += character_test[i].cplen;
         }
         if (!(buf = calloc(bufsiz, sizeof(*buf)))) {
                 fprintf(stderr, "%s: calloc: Out of memory.\n", argv[0]);
                 return 1;
         }
- for (i = 0, off = 0; i < LEN(grapheme_test); i++) {
- for (j = 0; j < grapheme_test[i].cplen; j++) {
- buf[off + j] = grapheme_test[i].cp[j];
+ for (i = 0, off = 0; i < LEN(character_test); i++) {
+ for (j = 0; j < character_test[i].cplen; j++) {
+ buf[off + j] = character_test[i].cp[j];
                 }
- off += grapheme_test[i].cplen;
+ off += character_test[i].cplen;
         }

         /* run test */
@@ -45,7 +45,7 @@ main(int argc, char *argv[])
         for (i = 0; i < NUM_ITERATIONS; i++) {
                 memset(&state, 0, sizeof(state));
                 for (j = 0; j < bufsiz - 1; j++) {
- (void)lg_grapheme_isbreak(buf[j], buf[j+1], &state);
+ (void)lg_character_isbreak(buf[j], buf[j+1], &state);
                 }
                 if (i % (NUM_ITERATIONS / 10) == 0) {
                         printf(".");
diff --git a/test/grapheme.c b/test/character.c
similarity index 50%
rename from test/grapheme.c
rename to test/character.c
index 5af2c46..f7f3ce8 100644
--- a/test/grapheme.c
+++ b/test/character.c
@@ -5,7 +5,7 @@
#include <string.h>

#include "../grapheme.h"
-#include "../gen/grapheme-test.h"
+#include "../gen/character-test.h"
#include "util.h"

int
@@ -16,19 +16,19 @@ main(int argc, char *argv[])

         (void)argc;

- /* grapheme break test */
- for (i = 0, failed = 0; i < LEN(grapheme_test); i++) {
+ /* character break test */
+ for (i = 0, failed = 0; i < LEN(character_test); i++) {
                 memset(&state, 0, sizeof(state));
- for (j = 0, k = 0, len = 1; j < grapheme_test[i].cplen; j++) {
- if ((j + 1) == grapheme_test[i].cplen ||
- lg_grapheme_isbreak(grapheme_test[i].cp[j],
- grapheme_test[i].cp[j + 1],
- &state)) {
+ for (j = 0, k = 0, len = 1; j < character_test[i].cplen; j++) {
+ if ((j + 1) == character_test[i].cplen ||
+ lg_character_isbreak(character_test[i].cp[j],
+ character_test[i].cp[j + 1],
+ &state)) {
                                 /* check if our resulting length matches */
- if (k == grapheme_test[i].lenlen ||
- len != grapheme_test[i].len[k++]) {
+ if (k == character_test[i].lenlen ||
+ len != character_test[i].len[k++]) {
                                         fprintf(stderr, "%s: Failed test \"%s\".\n",
- argv[0], grapheme_test[i].descr);
+ argv[0], character_test[i].descr);
                                         failed++;
                                         break;
                                 }
@@ -39,7 +39,7 @@ main(int argc, char *argv[])
                 }
         }
         printf("%s: %zu/%zu tests passed.\n", argv[0],
- LEN(grapheme_test) - failed, LEN(grapheme_test));
+ LEN(character_test) - failed, LEN(character_test));

         return (failed > 0) ? 1 : 0;
}
Received on Sat Dec 18 2021 - 12:15:21 CET

This archive was generated by hypermail 2.3.0 : Sat Dec 18 2021 - 12:24:30 CET