[hackers] [libgrapheme] Implement word-segmentation || Laslo Hunhold

From: <git_AT_suckless.org>
Date: Mon, 6 Jun 2022 22:40:33 +0200 (CEST)

commit 1a67a7fc7df73df260f8b7a18effd3b3e2b161c7
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Mon Jun 6 22:16:46 2022 +0200
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Mon Jun 6 22:40:22 2022 +0200

    Implement word-segmentation
    
    This was a tough nut to crack and took a lot of hours and multiple
    rewrites to get right.
    
    The first issue was that some codepoints could be in multiple classes
    at the same time, requiring the implementation of a "conflict-handler"
    in the data parser.
    
    The segmentation algorithm itself then was highly complicated, as it
    parses the data on two levels involving ignoring certain character
    property classes and doing so gracefully and simultaneously.
    
    Now it works though and passes the 1800+ tests provided by the Unicode
    consortium. The LUTs are highly compressed and the complete library
    still only weighs in at around 92K, which is lightweight given what
    it does. If you link statically, it will cut away most of it as well.
    
    What needed to be rethought was the general API-structure. It is
    impossible to do word-segmentation on a 2-codepoint-comparison-with-
    state basis and the only "form" is a function taking the entire
    array and returning the offset to the next break. The API was adapted
    accordingly.
    
    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/Makefile b/Makefile
index 7bb10d9..74f3352 100644
--- a/Makefile
+++ b/Makefile
_AT_@ -7,25 +7,32 @@ include config.mk
 BENCHMARK =\
         benchmark/character\
         benchmark/utf8-decode\
+ benchmark/word\
 
 DATA =\
         data/emoji-data.txt\
         data/GraphemeBreakProperty.txt\
         data/GraphemeBreakTest.txt\
+ data/WordBreakProperty.txt\
+ data/WordBreakTest.txt\
 
 GEN =\
         gen/character\
         gen/character-test\
+ gen/word\
+ gen/word-test\
 
 SRC =\
         src/character\
         src/utf8\
         src/util\
+ src/word\
 
 TEST =\
         test/character\
         test/utf8-decode\
         test/utf8-encode\
+ test/word\
 
 MAN3 =\
         man/grapheme_decode_utf8.3\
_AT_@ -40,27 +47,38 @@ all: libgrapheme.a libgrapheme.so
 benchmark/character.o: benchmark/character.c config.mk gen/character-test.h grapheme.h benchmark/util.h
 benchmark/utf8-decode.o: benchmark/utf8-decode.c config.mk gen/character-test.h grapheme.h benchmark/util.h
 benchmark/util.o: benchmark/util.c config.mk benchmark/util.h
+benchmark/word.o: benchmark/word.c config.mk gen/word-test.h grapheme.h benchmark/util.h
 gen/character.o: gen/character.c config.mk gen/util.h
 gen/character-test.o: gen/character-test.c config.mk gen/util.h
+gen/word.o: gen/word.c config.mk gen/util.h
+gen/word-test.o: gen/word-test.c config.mk gen/util.h
 gen/util.o: gen/util.c config.mk gen/util.h
 src/character.o: src/character.c config.mk gen/character.h grapheme.h src/util.h
 src/utf8.o: src/utf8.c config.mk grapheme.h
 src/util.o: src/util.c config.mk gen/types.h grapheme.h src/util.h
+src/word.o: src/word.c config.mk gen/word.h grapheme.h src/util.h
 test/character.o: test/character.c config.mk gen/character-test.h grapheme.h test/util.h
 test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/util.h
 test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
 test/util.o: test/util.c config.mk test/util.h
+test/word.o: test/word.c config.mk gen/word-test.h grapheme.h test/util.h
 
 benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a
 benchmark/utf8-decode: benchmark/utf8-decode.o benchmark/util.o libgrapheme.a
+benchmark/word: benchmark/word.o benchmark/util.o libgrapheme.a
 gen/character: gen/character.o gen/util.o
 gen/character-test: gen/character-test.o gen/util.o
+gen/word: gen/word.o gen/util.o
+gen/word-test: gen/word-test.o gen/util.o
 test/character: test/character.o test/util.o libgrapheme.a
 test/utf8-encode: test/utf8-encode.o test/util.o libgrapheme.a
 test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a
+test/word: test/word.o test/util.o libgrapheme.a
 
 gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character
 gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test
+gen/word.h: data/WordBreakProperty.txt gen/word
+gen/word-test.h: data/WordBreakTest.txt gen/word-test
 
 data/emoji-data.txt:
         wget -O $_AT_ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
_AT_@ -71,6 +89,12 @@ data/GraphemeBreakProperty.txt:
 data/GraphemeBreakTest.txt:
         wget -O $_AT_ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt
 
+data/WordBreakProperty.txt:
+ wget -O $_AT_ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakProperty.txt
+
+data/WordBreakTest.txt:
+ wget -O $_AT_ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakTest.txt
+
 $(BENCHMARK):
         $(CC) -o $_AT_ $(LDFLAGS) $@.o benchmark/util.o libgrapheme.a -lutf8proc
 
diff --git a/benchmark/character.c b/benchmark/character.c
index 2ef0d1c..8d7eb40 100644
--- a/benchmark/character.c
+++ b/benchmark/character.c
_AT_@ -12,7 +12,7 @@
 
 #include <utf8proc.h>
 
-#define NUM_ITERATIONS 1000000
+#define NUM_ITERATIONS 100000
 
 struct break_benchmark_payload {
         uint_least32_t *buf;
_AT_@ -56,7 +56,8 @@ main(int argc, char *argv[])
 
         (void)argc;
 
- if ((p.buf = generate_cp_test_buffer(character_test, LEN(character_test),
+ if ((p.buf = generate_cp_test_buffer(character_break_test,
+ LEN(character_break_test),
                                              &(p.buflen))) == NULL) {
                 return 1;
         }
diff --git a/benchmark/utf8-decode.c b/benchmark/utf8-decode.c
index 68d28fe..fa8abfa 100644
--- a/benchmark/utf8-decode.c
+++ b/benchmark/utf8-decode.c
_AT_@ -64,8 +64,8 @@ main(int argc, char *argv[])
 
         (void)argc;
 
- p.buf = generate_utf8_test_buffer(character_test,
- LEN(character_test),
+ p.buf = generate_utf8_test_buffer(character_break_test,
+ LEN(character_break_test),
                                           &(p.buflen));
 
         /* convert cp-buffer to stupid custom libutf8proc-uint8-type */
diff --git a/benchmark/word.c b/benchmark/word.c
new file mode 100644
index 0000000..51a7b5a
--- /dev/null
+++ b/benchmark/word.c
_AT_@ -0,0 +1,52 @@
+/* See LICENSE file for copyright and license details. */
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../grapheme.h"
+#include "../gen/word-test.h"
+#include "util.h"
+
+#define NUM_ITERATIONS 100000
+
+struct break_benchmark_payload {
+ uint_least32_t *buf;
+ size_t buflen;
+};
+
+void
+libgrapheme(const void *payload)
+{
+ const struct break_benchmark_payload *p = payload;
+ size_t off;
+
+ for (off = 0; off < p->buflen; ) {
+ off += grapheme_next_word_break(p->buf + off, p->buflen - off);
+ }
+}
+
+int
+main(int argc, char *argv[])
+{
+ struct break_benchmark_payload p;
+ double baseline = (double)NAN;
+
+ (void)argc;
+
+ if ((p.buf = generate_cp_test_buffer(word_break_test,
+ LEN(word_break_test),
+ &(p.buflen))) == NULL) {
+ return 1;
+ }
+
+ printf("%s\n", argv[0]);
+ run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "codepoint",
+ &baseline, NUM_ITERATIONS, p.buflen - 1);
+
+ free(p.buf);
+
+ return 0;
+}
diff --git a/gen/character-test.c b/gen/character-test.c
index f620935..0570afb 100644
--- a/gen/character-test.c
+++ b/gen/character-test.c
_AT_@ -12,7 +12,7 @@ main(int argc, char *argv[])
         (void)argc;
 
         break_test_list_parse("data/GraphemeBreakTest.txt", &test, &testlen);
- break_test_list_print(test, testlen, "character_test", argv[0]);
+ break_test_list_print(test, testlen, "character_break_test", argv[0]);
         break_test_list_free(test, testlen);
 
         return 0;
diff --git a/gen/character.c b/gen/character.c
index d8e1c4a..9dea0c4 100644
--- a/gen/character.c
+++ b/gen/character.c
_AT_@ -1,4 +1,6 @@
 /* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
 #include "util.h"
 
 #define FILE_EMOJI "data/emoji-data.txt"
_AT_@ -89,7 +91,7 @@ main(int argc, char *argv[])
 
         properties_generate_break_property(char_break_property,
                                            LEN(char_break_property),
- "char", argv[0]);
+ NULL, "char", argv[0]);
 
         return 0;
 }
diff --git a/gen/util.c b/gen/util.c
index 5f6488c..de92c9f 100644
--- a/gen/util.c
+++ b/gen/util.c
_AT_@ -3,6 +3,7 @@
 #include <errno.h>
 #include <inttypes.h>
 #include <stdbool.h>
+#include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
_AT_@ -20,6 +21,7 @@ struct properties_payload {
         const struct property_spec *spec;
         uint_least8_t speclen;
         int (*set_value)(struct properties_payload *, uint_least32_t, uint_least8_t);
+ uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t, uint_least8_t);
 };
 
 struct properties_compressed {
_AT_@ -421,11 +423,18 @@ set_value_bp(struct properties_payload *payload, uint_least32_t cp,
              uint_least8_t value)
 {
         if (payload->prop[cp].break_property != 0) {
- fprintf(stderr, "set_value_bp: "
- "Character break property overwrite (%s <- %s).\n",
- payload->spec[payload->prop[cp].break_property].enumname,
- payload->spec[value].enumname);
- return 1;
+ if (payload->handle_conflict == NULL) {
+ fprintf(stderr, "set_value_bp: "
+ "Unhandled character break property "
+ "overwrite for 0x%06X (%s <- %s).\n",
+ cp, payload->spec[payload->prop[cp].
+ break_property].enumname,
+ payload->spec[value].enumname);
+ return 1;
+ } else {
+ value = payload->handle_conflict(cp,
+ payload->prop[cp].break_property, value);
+ }
         }
         payload->prop[cp].break_property = value;
 
_AT_@ -440,8 +449,11 @@ get_value_bp(const struct properties *prop, size_t offset)
 
 void
 properties_generate_break_property(const struct property_spec *spec,
- uint_least8_t speclen, const char *prefix,
- const char *argv0)
+ uint_least8_t speclen,
+ uint_least8_t (*handle_conflict)(
+ uint_least32_t, uint_least8_t,
+ uint_least8_t), const char *prefix,
+ const char *argv0)
 {
         struct properties_compressed comp;
         struct properties_major_minor mm;
_AT_@ -461,6 +473,7 @@ properties_generate_break_property(const struct property_spec *spec,
         payload.spec = spec;
         payload.speclen = speclen;
         payload.set_value = set_value_bp;
+ payload.handle_conflict = handle_conflict;
 
         /* parse each file exactly once and ignore NULL-fields */
         for (i = 0; i < speclen; i++) {
diff --git a/gen/util.h b/gen/util.h
index 119bcc4..bc1f3be 100644
--- a/gen/util.h
+++ b/gen/util.h
_AT_@ -23,8 +23,10 @@ void parse_file_with_callback(const char *, int (*callback)(const char *,
                               char **, size_t, char *, void *), void *payload);
 
 void properties_generate_break_property(const struct property_spec *,
- uint_least8_t, const char *,
- const char *);
+ uint_least8_t, uint_least8_t
+ (*handle_conflict)(uint_least32_t,
+ uint_least8_t, uint_least8_t),
+ const char *, const char *);
 
 void break_test_list_parse(char *, struct break_test **, size_t *);
 void break_test_list_print(const struct break_test *, size_t,
diff --git a/gen/word-test.c b/gen/word-test.c
new file mode 100644
index 0000000..76cc7d6
--- /dev/null
+++ b/gen/word-test.c
_AT_@ -0,0 +1,19 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+#include "util.h"
+
+int
+main(int argc, char *argv[])
+{
+ struct break_test *test = NULL;
+ size_t testlen = 0;
+
+ (void)argc;
+
+ break_test_list_parse("data/WordBreakTest.txt", &test, &testlen);
+ break_test_list_print(test, testlen, "word_break_test", argv[0]);
+ break_test_list_free(test, testlen);
+
+ return 0;
+}
diff --git a/gen/word.c b/gen/word.c
new file mode 100644
index 0000000..97e0d18
--- /dev/null
+++ b/gen/word.c
_AT_@ -0,0 +1,158 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "util.h"
+
+#define FILE_EMOJI "data/emoji-data.txt"
+#define FILE_WORD "data/WordBreakProperty.txt"
+
+static const struct property_spec word_break_property[] = {
+ {
+ .enumname = "OTHER",
+ .file = NULL,
+ .ucdname = NULL,
+ },
+ {
+ .enumname = "ALETTER",
+ .file = FILE_WORD,
+ .ucdname = "ALetter",
+ },
+ {
+ .enumname = "BOTH_ALETTER_EXTPICT",
+ .file = NULL,
+ .ucdname = NULL,
+ },
+ {
+ .enumname = "CR",
+ .file = FILE_WORD,
+ .ucdname = "CR",
+ },
+ {
+ .enumname = "DOUBLE_QUOTE",
+ .file = FILE_WORD,
+ .ucdname = "Double_Quote",
+ },
+ {
+ .enumname = "EXTEND",
+ .file = FILE_WORD,
+ .ucdname = "Extend",
+ },
+ {
+ .enumname = "EXTENDED_PICTOGRAPHIC",
+ .file = FILE_EMOJI,
+ .ucdname = "Extended_Pictographic",
+ },
+ {
+ .enumname = "EXTENDNUMLET",
+ .file = FILE_WORD,
+ .ucdname = "ExtendNumLet",
+ },
+ {
+ .enumname = "FORMAT",
+ .file = FILE_WORD,
+ .ucdname = "Format",
+ },
+ {
+ .enumname = "HEBREW_LETTER",
+ .file = FILE_WORD,
+ .ucdname = "Hebrew_Letter",
+ },
+ {
+ .enumname = "KATAKANA",
+ .file = FILE_WORD,
+ .ucdname = "Katakana",
+ },
+ {
+ .enumname = "LF",
+ .file = FILE_WORD,
+ .ucdname = "LF",
+ },
+ {
+ .enumname = "MIDLETTER",
+ .file = FILE_WORD,
+ .ucdname = "MidLetter",
+ },
+ {
+ .enumname = "MIDNUM",
+ .file = FILE_WORD,
+ .ucdname = "MidNum",
+ },
+ {
+ .enumname = "MIDNUMLET",
+ .file = FILE_WORD,
+ .ucdname = "MidNumLet",
+ },
+ {
+ .enumname = "NEWLINE",
+ .file = FILE_WORD,
+ .ucdname = "Newline",
+ },
+ {
+ .enumname = "NUMERIC",
+ .file = FILE_WORD,
+ .ucdname = "Numeric",
+ },
+ {
+ .enumname = "REGIONAL_INDICATOR",
+ .file = FILE_WORD,
+ .ucdname = "Regional_Indicator",
+ },
+ {
+ .enumname = "SINGLE_QUOTE",
+ .file = FILE_WORD,
+ .ucdname = "Single_Quote",
+ },
+ {
+ .enumname = "WSEGSPACE",
+ .file = FILE_WORD,
+ .ucdname = "WSegSpace",
+ },
+ {
+ .enumname = "ZWJ",
+ .file = FILE_WORD,
+ .ucdname = "ZWJ",
+ },
+};
+
+static uint_least8_t
+handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
+{
+ uint_least8_t result;
+
+ (void)cp;
+
+ if ((!strcmp(word_break_property[prop1].enumname, "ALETTER") &&
+ !strcmp(word_break_property[prop2].enumname, "EXTENDED_PICTOGRAPHIC")) ||
+ (!strcmp(word_break_property[prop1].enumname, "EXTENDED_PICTOGRAPHIC") &&
+ !strcmp(word_break_property[prop2].enumname, "ALETTER"))) {
+ for (result = 0; result < LEN(word_break_property); result++) {
+ if (!strcmp(word_break_property[result].enumname,
+ "BOTH_ALETTER_EXTPICT")) {
+ break;
+ }
+ }
+ if (result == LEN(word_break_property)) {
+ fprintf(stderr, "handle_conflict: Internal error.\n");
+ exit(1);
+ }
+ } else {
+ fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
+ exit(1);
+ }
+
+ return result;
+}
+
+int
+main(int argc, char *argv[])
+{
+ (void)argc;
+
+ properties_generate_break_property(word_break_property,
+ LEN(word_break_property),
+ handle_conflict, "word", argv[0]);
+
+ return 0;
+}
diff --git a/grapheme.h b/grapheme.h
index ddcfa5e..39ee8f6 100644
--- a/grapheme.h
+++ b/grapheme.h
_AT_@ -15,10 +15,14 @@ typedef struct grapheme_internal_segmentation_state {
 
 #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
 
-size_t grapheme_next_character_break(const char *, size_t);
-
 bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
 
+size_t grapheme_next_character_break(const uint_least32_t *, size_t);
+size_t grapheme_next_word_break(const uint_least32_t *, size_t);
+
+size_t grapheme_next_character_break_utf8(const char *, size_t);
+size_t grapheme_next_word_break_utf8(const char *, size_t);
+
 size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
 size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
 
diff --git a/man/grapheme_next_character_break.3 b/man/grapheme_next_character_break_utf8.3
similarity index 83%
rename from man/grapheme_next_character_break.3
rename to man/grapheme_next_character_break_utf8.3
index 9e0245b..9fe6356 100644
--- a/man/grapheme_next_character_break.3
+++ b/man/grapheme_next_character_break_utf8.3
_AT_@ -1,16 +1,16 @@
 .Dd 2021-12-22
-.Dt GRAPHEME_NEXT_CHARACTER_BREAK 3
+.Dt GRAPHEME_NEXT_CHARACTER_BREAK_UTF8 3
 .Os suckless.org
 .Sh NAME
-.Nm grapheme_next_character_break
+.Nm grapheme_next_character_break_utf8
 .Nd determine byte-offset to next grapheme cluster break
 .Sh SYNOPSIS
 .In grapheme.h
 .Ft size_t
-.Fn grapheme_next_character_break "const char *str" "size_t len"
+.Fn grapheme_next_character_break_utf8 "const char *str" "size_t len"
 .Sh DESCRIPTION
 The
-.Fn grapheme_next_character_break
+.Fn grapheme_next_character_break_utf8
 function computes the offset (in bytes) to the next grapheme
 cluster break (see
 .Xr libgrapheme 7 )
_AT_@ -36,7 +36,7 @@ For non-UTF-8 input data
 can be used instead.
 .Sh RETURN VALUES
 The
-.Fn grapheme_next_character_break
+.Fn grapheme_next_character_break_utf8
 function returns the offset (in bytes) to the next grapheme cluster
 break in
 .Va str
_AT_@ -66,7 +66,7 @@ main(void)
         /* print each grapheme cluster with byte-length */
         printf("Grapheme clusters in NUL-delimited input:\\n");
         for (off = 0; s[off] != '\\0'; off += ret) {
- ret = grapheme_next_character_break(s + off, SIZE_MAX);
+ ret = grapheme_next_character_break_utf8(s + off, SIZE_MAX);
                 printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
         }
         printf("\\n");
_AT_@ -75,7 +75,7 @@ main(void)
         len = 17;
         printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
         for (off = 0; off < len; off += ret) {
- ret = grapheme_next_character_break(s + off, len - off);
+ ret = grapheme_next_character_break_utf8(s + off, len - off);
                 printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
         }
 
_AT_@ -86,7 +86,7 @@ main(void)
 .Xr grapheme_is_character_break 3 ,
 .Xr libgrapheme 7
 .Sh STANDARDS
-.Fn grapheme_next_character_break
+.Fn grapheme_next_character_break_utf8
 is compliant with the Unicode 14.0.0 specification.
 .Sh AUTHORS
 .An Laslo Hunhold Aq Mt dev_AT_frign.de
diff --git a/src/character.c b/src/character.c
index b085e70..ac79327 100644
--- a/src/character.c
+++ b/src/character.c
_AT_@ -102,7 +102,7 @@ static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
                 UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
 };
 
-static enum char_break_property
+static inline enum char_break_property
 get_break_prop(uint_least32_t cp)
 {
         if (likely(cp <= 0x10FFFF)) {
_AT_@ -132,14 +132,14 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
                 state->prop_set = true;
 
                 /* update flags */
- state->gb11_flag = flag_update_gb11[cp0_prop +
- NUM_CHAR_BREAK_PROPS *
- state->gb11_flag] &
- UINT16_C(1 << cp1_prop);
- state->gb12_13_flag = flag_update_gb12_13[cp0_prop +
- NUM_CHAR_BREAK_PROPS *
- state->gb12_13_flag] &
- UINT16_C(1 << cp1_prop);
+ state->gb11_flag =
+ flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
+ state->gb11_flag] &
+ UINT16_C(1 << cp1_prop);
+ state->gb12_13_flag =
+ flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
+ state->gb12_13_flag] &
+ UINT16_C(1 << cp1_prop);
 
                 /*
                  * Apply grapheme cluster breaking algorithm (UAX #29), see
_AT_@ -177,7 +177,31 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
 }
 
 size_t
-grapheme_next_character_break(const char *str, size_t len)
+grapheme_next_character_break(const uint_least32_t *str, size_t len)
+{
+ GRAPHEME_STATE state = { 0 };
+ size_t off;
+
+ if (str == NULL || len == 0) {
+ return 0;
+ }
+
+ for (off = 1; off < len; off++) {
+ if (grapheme_is_character_break(str[off - 1], str[off], &state)) {
+ break;
+ }
+ }
+
+ /* with no breaks we break at the end */
+ if (off == len) {
+ return len;
+ } else {
+ return off;
+ }
+}
+
+size_t
+grapheme_next_character_break_utf8(const char *str, size_t len)
 {
         GRAPHEME_STATE state = { 0 };
         uint_least32_t cp0 = 0, cp1 = 0;
diff --git a/src/word.c b/src/word.c
new file mode 100644
index 0000000..4e7c411
--- /dev/null
+++ b/src/word.c
_AT_@ -0,0 +1,373 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../gen/word.h"
+#include "../grapheme.h"
+#include "util.h"
+
+static inline enum word_break_property
+get_break_prop(uint_least32_t cp)
+{
+ if (likely(cp <= 0x10FFFF)) {
+ return (enum word_break_property)
+ word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
+ } else {
+ return WORD_BREAK_PROP_OTHER;
+ }
+}
+
+static inline size_t
+get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
+{
+ if (offset < len) {
+ *cp = ((const uint_least32_t *)str)[offset];
+ return 1;
+ } else {
+ *cp = GRAPHEME_INVALID_CODEPOINT;
+ return 0;
+ }
+}
+
+static inline size_t
+get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp)
+{
+ if (offset < len) {
+ return grapheme_decode_utf8((const char *)str + offset,
+ len - offset, cp);
+ } else {
+ *cp = GRAPHEME_INVALID_CODEPOINT;
+ return 0;
+ }
+}
+
+static size_t
+next_word_break(const void *str, size_t len, size_t (*get_codepoint)
+ (const void *, size_t, size_t, uint_least32_t *))
+{
+ struct {
+ enum word_break_property a, b, c, d;
+ } raw, skip;
+ enum word_break_property res;
+ uint_least32_t cp;
+ size_t off, tmp, new_off;
+ bool ri_even = true;
+
+ /* check degenerate cases */
+ if (str == NULL || len == 0) {
+ return 0;
+ }
+
+ /*
+ * Apply word breaking algorithm (UAX #29), see
+ * https://unicode.org/reports/tr29/#Word_Boundary_Rules
+ *
+ * There are 4 slots (a, b, c, d) of "break" properties and
+ * we check if there is a break in the middle between b and c.
+ *
+ * The position of this middle spot is determined by off,
+ * which gives the offset of the first element on the right
+ * hand side of said spot, or, in other words, gives the number
+ * of elements on the left hand side.
+ *
+ * It is further complicated by the fact that the algorithm
+ * expects you to skip certain characters for the second
+ * half of the rules (after WB4). Thus, we do not only have
+ * the "raw" properties as described above, but also the "skip"
+ * properties, where the skip.a and skip.b, for instance,
+ * give the two preceding character properties behind the
+ * currently investigated breakpoint.
+ *
+ */
+
+ /*
+ * Initialize the different properties such that we have
+ * a good state after the state-update in the loop
+ */
+ raw.b = NUM_WORD_BREAK_PROPS;
+ if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
+ return 1;
+ }
+ raw.c = get_break_prop(cp);
+ (void)get_codepoint(str, len, off, &cp);
+ raw.d = get_break_prop(cp);
+ skip.a = skip.b = NUM_WORD_BREAK_PROPS;
+
+ for (; off < len; off = new_off) {
+ /*
+ * Update left side (a and b) of the skip state by
+ * "shifting in" the raw.c property as long as it is
+ * not one of the "ignored" character properties.
+ * While at it, update the RI-counter.
+ *
+ */
+ if (raw.c != WORD_BREAK_PROP_EXTEND &&
+ raw.c != WORD_BREAK_PROP_FORMAT &&
+ raw.c != WORD_BREAK_PROP_ZWJ) {
+ skip.a = skip.b;
+ skip.b = raw.c;
+
+ if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
+ /*
+ * The property we just shifted in is
+ * a regional indicator, increasing the
+ * number of consecutive RIs on the left
+ * side of the breakpoint by one, changing
+ * the oddness.
+ *
+ */
+ ri_even = !ri_even;
+ } else {
+ /*
+ * We saw no regional indicator, so the
+ * number of consecutive RIs on the left
+ * side of the breakpoint is zero, which
+ * is an even number.
+ *
+ */
+ ri_even = true;
+ }
+ }
+
+ /*
+ * Update right side (b and c) of the skip state by
+ * starting at the breakpoint and detecting the two
+ * following non-ignored character classes
+ *
+ */
+ skip.c = NUM_WORD_BREAK_PROPS;
+ for (tmp = off; tmp < len; ) {
+ tmp += get_codepoint(str, len, tmp, &cp);
+ res = get_break_prop(cp);
+
+ if (res != WORD_BREAK_PROP_EXTEND &&
+ res != WORD_BREAK_PROP_FORMAT &&
+ res != WORD_BREAK_PROP_ZWJ) {
+ skip.c = res;
+ break;
+ }
+ }
+ skip.d = NUM_WORD_BREAK_PROPS;
+ for (; tmp < len; ) {
+ tmp += get_codepoint(str, len, tmp, &cp);
+ res = get_break_prop(cp);
+
+ if (res != WORD_BREAK_PROP_EXTEND &&
+ res != WORD_BREAK_PROP_FORMAT &&
+ res != WORD_BREAK_PROP_ZWJ) {
+ skip.d = res;
+ break;
+ }
+ }
+
+ /*
+ * Update the raw state by simply shifting everything
+ * in and, if we still have data left, determining
+ * the character class of the next codepoint.
+ *
+ */
+ raw.a = raw.b;
+ raw.b = raw.c;
+ raw.c = raw.d;
+ if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
+ get_codepoint(str, len, new_off, &cp);
+ raw.d = get_break_prop(cp);
+ } else {
+ raw.d = NUM_WORD_BREAK_PROPS;
+ }
+
+ /* WB3 */
+ if (raw.b == WORD_BREAK_PROP_CR &&
+ raw.c == WORD_BREAK_PROP_LF) {
+ continue;
+ }
+
+ /* WB3a */
+ if (raw.b == WORD_BREAK_PROP_NEWLINE ||
+ raw.b == WORD_BREAK_PROP_CR ||
+ raw.b == WORD_BREAK_PROP_LF) {
+ break;
+ }
+
+ /* WB3b */
+ if (raw.c == WORD_BREAK_PROP_NEWLINE ||
+ raw.c == WORD_BREAK_PROP_CR ||
+ raw.c == WORD_BREAK_PROP_LF) {
+ break;
+ }
+
+ /* WB3c */
+ if (raw.b == WORD_BREAK_PROP_ZWJ &&
+ (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
+ raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
+ continue;
+ }
+
+ /* WB3d */
+ if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
+ raw.c == WORD_BREAK_PROP_WSEGSPACE) {
+ continue;
+ }
+
+ /* WB4 */
+ if (raw.c == WORD_BREAK_PROP_EXTEND ||
+ raw.c == WORD_BREAK_PROP_FORMAT ||
+ raw.c == WORD_BREAK_PROP_ZWJ) {
+ continue;
+ }
+
+ /* WB5 */
+ if ((skip.b == WORD_BREAK_PROP_ALETTER ||
+ skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ (skip.c == WORD_BREAK_PROP_ALETTER ||
+ skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ continue;
+ }
+
+ /* WB6 */
+ if ((skip.b == WORD_BREAK_PROP_ALETTER ||
+ skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ (skip.c == WORD_BREAK_PROP_MIDLETTER ||
+ skip.c == WORD_BREAK_PROP_MIDNUMLET ||
+ skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ len > 2 &&
+ (skip.d == WORD_BREAK_PROP_ALETTER ||
+ skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ continue;
+ }
+
+ /* WB7 */
+ if ((skip.b == WORD_BREAK_PROP_MIDLETTER ||
+ skip.b == WORD_BREAK_PROP_MIDNUMLET ||
+ skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ (skip.c == WORD_BREAK_PROP_ALETTER ||
+ skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ len > 2 &&
+ (skip.a == WORD_BREAK_PROP_ALETTER ||
+ skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ continue;
+ }
+
+ /* WB7a */
+ if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
+ skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
+ continue;
+ }
+
+ /* WB7b */
+ if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
+ skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+ len > 2 &&
+ skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
+ continue;
+ }
+
+ /* WB7c */
+ if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
+ skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
+ off > 1 &&
+ skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
+ continue;
+ }
+
+ /* WB8 */
+ if (skip.b == WORD_BREAK_PROP_NUMERIC &&
+ skip.c == WORD_BREAK_PROP_NUMERIC) {
+ continue;
+ }
+
+ /* WB9 */
+ if ((skip.b == WORD_BREAK_PROP_ALETTER ||
+ skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
+ skip.c == WORD_BREAK_PROP_NUMERIC) {
+ continue;
+ }
+
+ /* WB10 */
+ if (skip.b == WORD_BREAK_PROP_NUMERIC &&
+ (skip.c == WORD_BREAK_PROP_ALETTER ||
+ skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
+ continue;
+ }
+
+ /* WB11 */
+ if ((skip.b == WORD_BREAK_PROP_MIDNUM ||
+ skip.b == WORD_BREAK_PROP_MIDNUMLET ||
+ skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ skip.c == WORD_BREAK_PROP_NUMERIC &&
+ off > 1 &&
+ skip.a == WORD_BREAK_PROP_NUMERIC) {
+ continue;
+ }
+
+ /* WB12 */
+ if (skip.b == WORD_BREAK_PROP_NUMERIC &&
+ (skip.c == WORD_BREAK_PROP_MIDNUM ||
+ skip.c == WORD_BREAK_PROP_MIDNUMLET ||
+ skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
+ len > 2 &&
+ skip.d == WORD_BREAK_PROP_NUMERIC) {
+ continue;
+ }
+
+ /* WB13 */
+ if (skip.b == WORD_BREAK_PROP_KATAKANA &&
+ skip.c == WORD_BREAK_PROP_KATAKANA) {
+ continue;
+ }
+
+ /* WB13a */
+ if ((skip.b == WORD_BREAK_PROP_ALETTER ||
+ skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.b == WORD_BREAK_PROP_HEBREW_LETTER ||
+ skip.b == WORD_BREAK_PROP_NUMERIC ||
+ skip.b == WORD_BREAK_PROP_KATAKANA ||
+ skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
+ skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
+ continue;
+ }
+
+ /* WB13b */
+ if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
+ (skip.c == WORD_BREAK_PROP_ALETTER ||
+ skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
+ skip.c == WORD_BREAK_PROP_HEBREW_LETTER ||
+ skip.c == WORD_BREAK_PROP_NUMERIC ||
+ skip.c == WORD_BREAK_PROP_KATAKANA)) {
+ continue;
+ }
+
+ /* WB15 and WB16 */
+ if (!ri_even &&
+ skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
+ continue;
+ }
+
+ /* WB999 */
+ break;
+ }
+
+ return off;
+}
+
+size_t
+grapheme_next_word_break(const uint_least32_t *str, size_t len)
+{
+ return next_word_break(str, len, get_codepoint);
+}
+
+size_t
+grapheme_next_word_break_utf8(const char *str, size_t len)
+{
+ return next_word_break(str, len, get_codepoint_utf8);
+}
diff --git a/test/character.c b/test/character.c
index 39f5865..f87022e 100644
--- a/test/character.c
+++ b/test/character.c
_AT_@ -3,19 +3,15 @@
 #include <stdint.h>
 
 #include "../gen/character-test.h"
+#include "../grapheme.h"
 #include "util.h"
 
-static bool
-is_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
-{
- return grapheme_is_character_break(cp0, cp1, state);
-}
-
 int
 main(int argc, char *argv[])
 {
         (void)argc;
 
- return run_break_tests(is_break, character_test,
- LEN(character_test), argv[0]);
+ return run_break_tests(grapheme_next_character_break,
+ character_break_test,
+ LEN(character_break_test), argv[0]);
 }
diff --git a/test/util.c b/test/util.c
index 05f5e73..b4ca8b6 100644
--- a/test/util.c
+++ b/test/util.c
_AT_@ -10,30 +10,25 @@
 #include "util.h"
 
 int
-run_break_tests(bool (*is_break)(uint_least32_t, uint_least32_t, GRAPHEME_STATE *),
+run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
                 const struct break_test *test, size_t testlen, const char *argv0)
 {
         GRAPHEME_STATE state;
- size_t i, j, k, len, failed;
+ size_t i, j, off, res, failed;
 
         /* character break test */
         for (i = 0, failed = 0; i < testlen; i++) {
- memset(&state, 0, sizeof(state));
- for (j = 0, k = 0, len = 1; j < test[i].cplen; j++) {
- if ((j + 1) == test[i].cplen ||
- is_break(test[i].cp[j], test[i].cp[j + 1],
- &state)) {
- /* check if our resulting length matches */
- if (k == test[i].lenlen ||
- len != test[i].len[k++]) {
- fprintf(stderr, "%s: Failed test \"%s\".\n",
- argv0, test[i].descr);
- failed++;
- break;
- }
- len = 1;
- } else {
- len++;
+ for (j = 0, off = 0; off < test[i].cplen; off += res) {
+ res = next_break(test[i].cp + off, test[i].cplen - off);
+
+ /* check if our resulting offset matches */
+ if (j == test[i].lenlen ||
+ res != test[i].len[j++]) {
+ fprintf(stderr, "%s: Failed test %zu \"%s\".\n",
+ argv0, i, test[i].descr);
+ fprintf(stderr, "J=%zu: EXPECTED len %zu, got %zu\n", j-1, test[i].len[j-1], res);
+ failed++;
+ break;
                         }
                 }
         }
diff --git a/test/util.h b/test/util.h
index ef61b18..1c6e18f 100644
--- a/test/util.h
+++ b/test/util.h
_AT_@ -7,8 +7,8 @@
 
 #define LEN(x) (sizeof(x) / sizeof(*(x)))
 
-int run_break_tests(bool (*is_break)(uint_least32_t, uint_least32_t,
- GRAPHEME_STATE *), const struct break_test *test,
- size_t testlen, const char *);
+int run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
+ const struct break_test *test, size_t testlen,
+ const char *);
 
 #endif /* UTIL_H */
diff --git a/test/word.c b/test/word.c
new file mode 100644
index 0000000..fe6d82c
--- /dev/null
+++ b/test/word.c
_AT_@ -0,0 +1,16 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "../gen/word-test.h"
+#include "../grapheme.h"
+#include "util.h"
+
+int
+main(int argc, char *argv[])
+{
+ (void)argc;
+
+ return run_break_tests(grapheme_next_word_break, word_break_test,
+ LEN(word_break_test), argv[0]);
+}
Received on Mon Jun 06 2022 - 22:40:33 CEST

This archive was generated by hypermail 2.3.0 : Mon Jun 06 2022 - 22:48:29 CEST