[hackers] [libgrapheme] Introduce mostly branchless character break detection || Laslo Hunhold

From: <git_AT_suckless.org>
Date: Sun, 2 Jan 2022 13:05:43 +0100 (CET)

commit 072bb271868a3583da1fe432caa132cbb6c4ce9d
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Sun Jan 2 12:48:10 2022 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Sun Jan 2 12:48:10 2022 +0100

    Introduce mostly branchless character break detection
    
    Given the ever-increasing CPU-cache-sizes compared to the attempts of
    branch-prediction, where the CPU executes from cache-miss to cache-miss,
    the optimum is to have basically no branches at all.
    
    In the case of character-break-detection, this would necessitate a
    lookup-table over all 0x110000 (~1.1 million) codepoints that would
    be, if we calculate one byte per entry, around 1.1MB, which is totally
    excessive.
    However, the generator added in 759f02fbe74f11f64ba451b3a18090cbbb10add2
    achieves a compression ratio of 97.91% using a two-stage-compression,
    yielding a lookup-table of size roughly around 55KB.
    
    55KB easily fits even within L1 (which is often around 64KB), let alone
    L2 and L3 which are getting even larger. The CPU does not have to do any
    branch prediction as it can anticipate the cache-access.
    
    More interesting are the break rule: Different from libutf8proc, which
    also employs the two-stage approach to determine character-properties,
    it then uses normal branching to actually apply the algorithm.
    However, we can do it better: Given the limited number of break
    properties, we can formulate lookup-tables to determine breaks,
    optionally even including flags which can be added as offsets within the
    tables. The flag-updates themselves can also be achieved with LUTs.
    
    So in the end, we don't branch, as the CPU simply accesses
    easily-predictable cache-areas. The branches still left can be evaluated
    later on for further optimization.
    
    The benchmarks speak for themselves: We are 50-150% faster than
    libutf8proc now, yielding a total speedup of around 40x.
    
    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/Makefile b/Makefile
index e90a986..2a016ad 100644
--- a/Makefile
+++ b/Makefile
_AT_@ -13,7 +13,6 @@ DATA =\
         data/GraphemeBreakTest.txt\
 
 GEN =\
- gen/character-prop\
         gen/character-test\
         gen/properties\
 
_AT_@ -43,7 +42,7 @@ gen/character-prop.o: gen/character-prop.c config.mk gen/util.h
 gen/character-test.o: gen/character-test.c config.mk gen/util.h
 gen/properties.o: gen/properties.c config.mk gen/util.h
 gen/util.o: gen/util.c config.mk gen/util.h
-src/character.o: src/character.c config.mk gen/character-prop.h grapheme.h src/util.h
+src/character.o: src/character.c config.mk gen/properties.h grapheme.h src/util.h
 src/utf8.o: src/utf8.c config.mk grapheme.h
 src/util.o: src/util.c config.mk gen/types.h grapheme.h src/util.h
 test/character.o: test/character.c config.mk gen/character-test.h grapheme.h test/util.h
_AT_@ -52,14 +51,12 @@ test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
 test/util.o: test/util.c config.mk test/util.h
 
 benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a
-gen/character-prop: gen/character-prop.o gen/util.o
 gen/character-test: gen/character-test.o gen/util.o
 gen/properties: gen/properties.o gen/util.o
 test/character: test/character.o test/util.o libgrapheme.a
 test/utf8-encode: test/utf8-encode.o test/util.o libgrapheme.a
 test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a
 
-gen/character-prop.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character-prop
 gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test
 gen/properties.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/properties
 
diff --git a/gen/character-prop.c b/gen/character-prop.c
deleted file mode 100644
index 5a0bbbc..0000000
--- a/gen/character-prop.c
+++ /dev/null
_AT_@ -1,93 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-
-#include "util.h"
-
-#define FILE_EMOJI "data/emoji-data.txt"
-#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt"
-
-static struct property segment_property[] = {
- {
- .enumname = "CHARACTER_PROP_CONTROL",
- .identifier = "Control",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_CR",
- .identifier = "CR",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_EXTEND",
- .identifier = "Extend",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_EXTENDED_PICTOGRAPHIC",
- .identifier = "Extended_Pictographic",
- .fname = FILE_EMOJI,
- },
- {
- .enumname = "CHARACTER_PROP_HANGUL_L",
- .identifier = "L",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_HANGUL_V",
- .identifier = "V",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_HANGUL_T",
- .identifier = "T",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_HANGUL_LV",
- .identifier = "LV",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_HANGUL_LVT",
- .identifier = "LVT",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_LF",
- .identifier = "LF",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_PREPEND",
- .identifier = "Prepend",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_REGIONAL_INDICATOR",
- .identifier = "Regional_Indicator",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_SPACINGMARK",
- .identifier = "SpacingMark",
- .fname = FILE_GRAPHEME,
- },
- {
- .enumname = "CHARACTER_PROP_ZWJ",
- .identifier = "ZWJ",
- .fname = FILE_GRAPHEME,
- },
-};
-
-int
-main(int argc, char *argv[])
-{
- (void)argc;
-
- property_list_parse(segment_property, LEN(segment_property));
- property_list_print(segment_property, LEN(segment_property),
- "character_prop", argv[0]);
- property_list_free(segment_property, LEN(segment_property));
-
- return 0;
-}
diff --git a/grapheme.h b/grapheme.h
index 1f08f55..ddcfa5e 100644
--- a/grapheme.h
+++ b/grapheme.h
_AT_@ -6,15 +6,11 @@
 #include <stddef.h>
 #include <stdint.h>
 
-struct grapheme_internal_heisenstate {
- uint_least64_t determined;
- uint_least64_t state;
-};
-
 typedef struct grapheme_internal_segmentation_state {
- struct grapheme_internal_heisenstate cp0;
- struct grapheme_internal_heisenstate cp1;
- uint_least16_t flags;
+ uint_least8_t prop;
+ bool prop_set;
+ bool gb11_flag;
+ bool gb12_13_flag;
 } GRAPHEME_STATE;
 
 #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
diff --git a/src/character.c b/src/character.c
index ae8f3df..80a9a79 100644
--- a/src/character.c
+++ b/src/character.c
_AT_@ -4,178 +4,175 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "../gen/character-prop.h"
+#include "../gen/properties.h"
 #include "../grapheme.h"
 #include "util.h"
 
-enum {
- CHARACTER_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
- CHARACTER_FLAG_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
+static const uint_least16_t dont_break[NUM_BREAK_PROPS] = {
+ [BREAK_PROP_OTHER] =
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_CR] =
+ UINT16_C(1 << BREAK_PROP_LF), /* GB3 */
+ [BREAK_PROP_EXTEND] =
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_HANGUL_L] =
+ UINT16_C(1 << BREAK_PROP_HANGUL_L) | /* GB6 */
+ UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB6 */
+ UINT16_C(1 << BREAK_PROP_HANGUL_LV) | /* GB6 */
+ UINT16_C(1 << BREAK_PROP_HANGUL_LVT) | /* GB6 */
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_HANGUL_V] =
+ UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB7 */
+ UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB7 */
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_HANGUL_T] =
+ UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB8 */
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_HANGUL_LV] =
+ UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB7 */
+ UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB7 */
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_HANGUL_LVT] =
+ UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB8 */
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_PREPEND] =
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK) | /* GB9a */
+ (UINT16_C(0xFFFF) &
+ ~(UINT16_C(1 << BREAK_PROP_CR) |
+ UINT16_C(1 << BREAK_PROP_LF) |
+ UINT16_C(1 << BREAK_PROP_CONTROL)
+ )
+ ), /* GB9b */
+ [BREAK_PROP_REGIONAL_INDICATOR] =
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_SPACINGMARK] =
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [BREAK_PROP_ZWJ] =
+ UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+};
+static const uint_least16_t flag_update_gb11[2 * NUM_BREAK_PROPS] = {
+ [BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
+ UINT16_C(1 << BREAK_PROP_ZWJ) |
+ UINT16_C(1 << BREAK_PROP_EXTEND),
+ [BREAK_PROP_ZWJ + NUM_BREAK_PROPS] =
+ UINT16_C(1 << BREAK_PROP_EXTENDED_PICTOGRAPHIC),
+ [BREAK_PROP_EXTEND + NUM_BREAK_PROPS] =
+ UINT16_C(1 << BREAK_PROP_EXTEND) |
+ UINT16_C(1 << BREAK_PROP_ZWJ),
+ [BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_BREAK_PROPS] =
+ UINT16_C(1 << BREAK_PROP_ZWJ) |
+ UINT16_C(1 << BREAK_PROP_EXTEND),
+};
+static const uint_least16_t dont_break_gb11[2 * NUM_BREAK_PROPS] = {
+ [BREAK_PROP_ZWJ + NUM_BREAK_PROPS] =
+ UINT16_C(1 << BREAK_PROP_EXTENDED_PICTOGRAPHIC),
+};
+static const uint_least16_t flag_update_gb12_13[2 * NUM_BREAK_PROPS] = {
+ [BREAK_PROP_REGIONAL_INDICATOR] =
+ UINT16_C(1 << BREAK_PROP_REGIONAL_INDICATOR),
+};
+static const uint_least16_t dont_break_gb12_13[2 * NUM_BREAK_PROPS] = {
+ [BREAK_PROP_REGIONAL_INDICATOR + NUM_BREAK_PROPS] =
+ UINT16_C(1 << BREAK_PROP_REGIONAL_INDICATOR),
 };
 
-bool
-grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
+static enum break_property
+get_break_prop(uint_least32_t cp)
 {
- struct grapheme_internal_heisenstate *p[2] = { 0 };
- uint_least16_t flags = 0;
- bool isbreak = true;
-
- /* set state depending on state pointer */
- if (state != NULL) {
- p[0] = &(state->cp0);
- p[1] = &(state->cp1);
- flags = state->flags;
+ if (cp > 0x10FFFF) {
+ return BREAK_PROP_OTHER;
+ } else {
+ return prop[minor[major[cp >> 8] + (cp & 0xff)]].break_property;
         }
+}
 
- /* skip printable ASCII */
- if ((cp0 >= 0x20 && cp0 <= 0x7E) &&
- (cp1 >= 0x20 && cp1 <= 0x7E)) {
- goto hasbreak;
- }
+bool
+grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
+{
+ enum break_property cp0_prop, cp1_prop;
+ bool notbreak = false;
 
- /*
- * Apply grapheme cluster breaking algorithm (UAX #29), see
- * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
- */
-
- /*
- * update flags, if state-pointer given
- */
- if (has_property(cp1, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) {
- if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR)) {
- /* one more RI is on the left side of the seam, flip state */
- flags ^= CHARACTER_FLAG_RI_ODD;
+ if (state) {
+ if (!state->prop_set) {
+ cp0_prop = get_break_prop(cp0);
                 } else {
- /* an RI appeared on the right side but the left
- side is not an RI, reset state (number 0 is even) */
- flags &= ~CHARACTER_FLAG_RI_ODD;
+ cp0_prop = state->prop;
                 }
- }
- if (!(flags & CHARACTER_FLAG_EMOJI) &&
- ((has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_ZWJ)) ||
- (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTEND)))) {
- flags |= CHARACTER_FLAG_EMOJI;
- } else if ((flags & CHARACTER_FLAG_EMOJI) &&
- ((has_property(cp0, p[0], character_prop, CHARACTER_PROP_ZWJ) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) ||
- (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTEND) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTEND)) ||
- (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTEND) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_ZWJ)) ||
- (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_ZWJ)) ||
- (has_property(cp0, p[0], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTEND)))) {
- /* CHARACTER_FLAG_EMOJI remains */
- } else {
- flags &= ~CHARACTER_FLAG_EMOJI;
- }
-
- /* write updated flags to state, if given */
- if (state != NULL) {
- state->flags = flags;
- }
-
- /*
- * apply rules
- */
-
- /* skip GB1 and GB2, as they are never satisfied here */
-
- /* GB3 */
- if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_CR) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_LF)) {
- goto nobreak;
- }
-
- /* GB4 */
- if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_CONTROL) ||
- has_property(cp0, p[0], character_prop, CHARACTER_PROP_CR) ||
- has_property(cp0, p[0], character_prop, CHARACTER_PROP_LF)) {
- goto hasbreak;
- }
-
- /* GB5 */
- if (has_property(cp1, p[1], character_prop, CHARACTER_PROP_CONTROL) ||
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_CR) ||
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_LF)) {
- goto hasbreak;
- }
-
- /* GB6 */
- if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_L) &&
- (has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_L) ||
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_V) ||
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_LV) ||
-
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_LVT))) {
- goto nobreak;
- }
-
- /* GB7 */
- if ((has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_LV) ||
- has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_V)) &&
- (has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_V) ||
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_T))) {
- goto nobreak;
- }
-
- /* GB8 */
- if ((has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_LVT) ||
- has_property(cp0, p[0], character_prop, CHARACTER_PROP_HANGUL_T)) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_HANGUL_T)) {
- goto nobreak;
- }
-
- /* GB9 */
- if (has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTEND) ||
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_ZWJ)) {
- goto nobreak;
- }
-
- /* GB9a */
- if (has_property(cp1, p[1], character_prop, CHARACTER_PROP_SPACINGMARK)) {
- goto nobreak;
- }
-
- /* GB9b */
- if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_PREPEND)) {
- goto nobreak;
- }
-
- /* GB11 */
- if ((flags & CHARACTER_FLAG_EMOJI) &&
- has_property(cp0, p[0], character_prop, CHARACTER_PROP_ZWJ) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_EXTENDED_PICTOGRAPHIC)) {
- goto nobreak;
- }
-
- /* GB12/GB13 */
- if (has_property(cp0, p[0], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) &&
- has_property(cp1, p[1], character_prop, CHARACTER_PROP_REGIONAL_INDICATOR) &&
- (flags & CHARACTER_FLAG_RI_ODD)) {
- goto nobreak;
- }
-
- /* GB999 */
- goto hasbreak;
-nobreak:
- isbreak = false;
-hasbreak:
- if (state != NULL) {
- /* move b-state to a-state, discard b-state */
- memcpy(&(state->cp0), &(state->cp1), sizeof(state->cp0));
- memset(&(state->cp1), 0, sizeof(state->cp1));
-
- /* reset flags */
- if (isbreak) {
- state->flags = 0;
+ cp1_prop = get_break_prop(cp1);
+
+ /* preserve prop of right codepoint for next iteration */
+ state->prop = cp1_prop;
+ state->prop_set = true;
+
+ /* update flags */
+ state->gb11_flag = flag_update_gb11[cp0_prop +
+ NUM_BREAK_PROPS *
+ state->gb11_flag] &
+ UINT16_C(1 << cp1_prop);
+ state->gb12_13_flag = flag_update_gb12_13[cp0_prop +
+ NUM_BREAK_PROPS *
+ state->gb12_13_flag] &
+ UINT16_C(1 << cp1_prop);
+
+ /*
+ * Apply grapheme cluster breaking algorithm (UAX #29), see
+ * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+ */
+ notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
+ (dont_break_gb11[cp0_prop + state->gb11_flag *
+ NUM_BREAK_PROPS] &
+ UINT16_C(1 << cp1_prop)) ||
+ (dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
+ NUM_BREAK_PROPS] &
+ UINT16_C(1 << cp1_prop));
+
+ /* update or reset flags (when we have a break) */
+ if (!notbreak) {
+ state->gb11_flag = state->gb12_13_flag = false;
                 }
- }
-
- return isbreak;
+ } else {
+ cp0_prop = get_break_prop(cp0);
+ cp1_prop = get_break_prop(cp1);
+
+ /*
+ * Apply grapheme cluster breaking algorithm (UAX #29), see
+ * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+ *
+ * Given we have no state, this behaves as if the state-booleans
+ * were all set to false
+ */
+ notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
+ (dont_break_gb11[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
+ (dont_break_gb12_13[cp0_prop] & UINT16_C(1 << cp1_prop));
+ }
+
+ return !notbreak;
 }
 
 size_t
diff --git a/src/util.c b/src/util.c
index 60af166..7b8e176 100644
--- a/src/util.c
+++ b/src/util.c
_AT_@ -1,77 +1,8 @@
 /* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
 
 #include "../gen/types.h"
 #include "../grapheme.h"
 #include "util.h"
-
-/* 64-slot (0,...,63) optionally undetermined binary state */
-
-int
-heisenstate_get(struct grapheme_internal_heisenstate *h, int slot)
-{
- if (h == NULL || slot >= 64 || slot < 0 ||
- !(h->determined & (1 << slot))) {
- /* no state given, slot out of range or undetermined */
- return -1;
- } else {
- /* slot determined, return state (0 or 1) */
- return (h->state & (1 << slot)) ? 1 : 0;
- }
-}
-
-int
-heisenstate_set(struct grapheme_internal_heisenstate *h, int slot, int state)
-{
- if (h == NULL || slot >= 64 || slot < 0) {
- /* no state given or slot out of range */
- return 1;
- } else {
- h->determined |= (UINT64_C(1) << slot);
- if (state) {
- h->state |= (UINT64_C(1) << slot);
- } else {
- h->state &= ~(UINT64_C(1) << slot);
- }
- }
-
- return 0;
-}
-
-static int
-cp_cmp(const void *a, const void *b)
-{
- const uint_least32_t cp = *(const uint_least32_t *)a;
- const uint_least32_t *range = (const uint_least32_t *)b;
-
- if (cp < range[0]) {
- return -1;
- } else if (cp > range[1]) {
- return 1;
- } else {
- return 0;
- }
-}
-
-int
-has_property(uint_least32_t cp, struct grapheme_internal_heisenstate *cpstate,
- const struct range_list *proptable, int property)
-{
- int res;
-
- if (cpstate == NULL ||
- (res = heisenstate_get(cpstate, property)) == -1) {
- /* make a lookup */
- res = bsearch(&cp, proptable[property].data,
- proptable[property].len,
- sizeof(*proptable[property].data), cp_cmp) ?
- 1 : 0;
-
- if (cpstate != NULL) {
- heisenstate_set(cpstate, property, res);
- }
- }
-
- return res;
-}
diff --git a/src/util.h b/src/util.h
index 1108025..049af2f 100644
--- a/src/util.h
+++ b/src/util.h
_AT_@ -10,10 +10,4 @@
 
 #define LEN(x) (sizeof(x) / sizeof(*(x)))
 
-int heisenstate_get(struct grapheme_internal_heisenstate *, int);
-int heisenstate_set(struct grapheme_internal_heisenstate *, int, int);
-
-int has_property(uint_least32_t, struct grapheme_internal_heisenstate *,
- const struct range_list *, int);
-
 #endif /* UTIL_H */
Received on Sun Jan 02 2022 - 13:05:43 CET

This archive was generated by hypermail 2.3.0 : Sun Jan 02 2022 - 13:12:33 CET