commit 65b354f0fcb1d925f4340dbb4415ea06e8af2bec
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Sun Sep 1 22:42:18 2024 +0200
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Sun Sep 1 22:45:28 2024 +0200
Update grapheme break algorithm to Unicode version 15.1.0
While the change to the algorithm looks harmless in the specification,
it comes at the price of more complexity because we have to keep track
of a relatively complex state for a sequence of Indic conjunct breaks.
Fortunately, adding so many additional classes only decreases the
compression ratio for the grapheme cluster LUTs by ~0.5%.
We now pass all 1187 character tests.
Signed-off-by: Laslo Hunhold <dev_AT_frign.de>
diff --git a/Makefile b/Makefile
index 4789e1c..d768101 100644
--- a/Makefile
+++ b/Makefile
@@ -196,7 +196,7 @@ src/sentence.o: src/sentence.c Makefile config.mk gen/sentence.h grapheme.h src/
src/utf8.o: src/utf8.c Makefile config.mk grapheme.h
src/util.o: src/util.c Makefile config.mk gen/types.h grapheme.h src/util.h
src/word.o: src/word.c Makefile config.mk gen/word.h grapheme.h src/util.h
-test/bidirectional.o: test/bidirectional.c Makefile config.mk gen/bidirectional-test.h grapheme.h test/util.h
+test/bidirectional.o: test/bidirectional.c Makefile config.mk gen/bidirectional.h gen/bidirectional-test.h grapheme.h test/util.h
test/case.o: test/case.c Makefile config.mk grapheme.h test/util.h
test/character.o: test/character.c Makefile config.mk gen/character-test.h grapheme.h test/util.h
test/line.o: test/line.c Makefile config.mk gen/line-test.h grapheme.h test/util.h
@@ -236,7 +236,7 @@ test/word$(BINSUFFIX): test/word.o test/util.o $(ANAME)
gen/bidirectional.h: data/BidiBrackets.txt data/BidiMirroring.txt data/DerivedBidiClass.txt data/UnicodeData.txt gen/bidirectional$(BINSUFFIX)
gen/bidirectional-test.h: data/BidiCharacterTest.txt data/BidiTest.txt gen/bidirectional-test$(BINSUFFIX)
gen/case.h: data/DerivedCoreProperties.txt data/UnicodeData.txt data/SpecialCasing.txt gen/case$(BINSUFFIX)
-gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character$(BINSUFFIX)
+gen/character.h: data/DerivedCoreProperties.txt data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character$(BINSUFFIX)
gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test$(BINSUFFIX)
gen/line.h: data/emoji-data.txt data/EastAsianWidth.txt data/LineBreak.txt gen/line$(BINSUFFIX)
gen/line-test.h: data/LineBreakTest.txt gen/line-test$(BINSUFFIX)
diff --git a/gen/character.c b/gen/character.c
index 717d01e..2938444 100644
--- a/gen/character.c
+++ b/gen/character.c
@@ -1,8 +1,12 @@
/* See LICENSE file for copyright and license details. */
#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
#include "util.h"
+#define FILE_DCP "data/DerivedCoreProperties.txt"
#define FILE_EMOJI "data/emoji-data.txt"
#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt"
@@ -12,6 +16,21 @@ static const struct property_spec char_break_property[] = {
.file = NULL,
.ucdname = NULL,
},
+ {
+ .enumname = "BOTH_EXTEND_ICB_EXTEND",
+ .file = NULL,
+ .ucdname = NULL,
+ },
+ {
+ .enumname = "BOTH_EXTEND_ICB_LINKER",
+ .file = NULL,
+ .ucdname = NULL,
+ },
+ {
+ .enumname = "BOTH_ZWJ_ICB_EXTEND",
+ .file = NULL,
+ .ucdname = NULL,
+ },
{
.enumname = "CONTROL",
.file = FILE_GRAPHEME,
@@ -57,6 +76,24 @@ static const struct property_spec char_break_property[] = {
.file = FILE_GRAPHEME,
.ucdname = "LVT",
},
+ {
+ .enumname = "ICB_CONSONANT",
+ .file = FILE_DCP,
+ .ucdname = "InCB",
+ .ucdsubname = "Consonant",
+ },
+ {
+ .enumname = "ICB_EXTEND",
+ .file = FILE_DCP,
+ .ucdname = "InCB",
+ .ucdsubname = "Extend",
+ },
+ {
+ .enumname = "ICB_LINKER",
+ .file = FILE_DCP,
+ .ucdname = "InCB",
+ .ucdsubname = "Linker",
+ },
{
.enumname = "LF",
.file = FILE_GRAPHEME,
@@ -84,14 +121,75 @@ static const struct property_spec char_break_property[] = {
},
};
+/*
+ * Return the index of the property called enumname in
+ * char_break_property, or terminate the program if there is none.
+ */
+static uint_least8_t
+lookup_property(const char *enumname)
+{
+	size_t i;
+
+	for (i = 0; i < LEN(char_break_property); i++) {
+		if (!strcmp(char_break_property[i].enumname, enumname)) {
+			break;
+		}
+	}
+	if (i == LEN(char_break_property)) {
+		fprintf(stderr, "handle_conflict: Internal error.\n");
+		exit(1);
+	}
+
+	return (uint_least8_t)i;
+}
+
+/* Check if the two names are a and b, in either order. */
+static int
+is_pair(const char *name1, const char *name2, const char *a, const char *b)
+{
+	return (!strcmp(name1, a) && !strcmp(name2, b)) ||
+	       (!strcmp(name1, b) && !strcmp(name2, a));
+}
+
+/*
+ * Conflict handler for properties_generate_break_property(): map the
+ * two clashing properties prop1 and prop2 (indices into
+ * char_break_property) to their merged BOTH_* class and return its
+ * index. cp is unused. Only EXTEND/ICB_EXTEND, EXTEND/ICB_LINKER and
+ * ZWJ/ICB_EXTEND are handled, in either order; any other pair
+ * terminates the program.
+ */
+static uint_least8_t
+handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
+{
+	const char *name1 = char_break_property[prop1].enumname;
+	const char *name2 = char_break_property[prop2].enumname;
+	const char *merged;
+
+	(void)cp;
+
+	if (is_pair(name1, name2, "EXTEND", "ICB_EXTEND")) {
+		merged = "BOTH_EXTEND_ICB_EXTEND";
+	} else if (is_pair(name1, name2, "EXTEND", "ICB_LINKER")) {
+		merged = "BOTH_EXTEND_ICB_LINKER";
+	} else if (is_pair(name1, name2, "ZWJ", "ICB_EXTEND")) {
+		merged = "BOTH_ZWJ_ICB_EXTEND";
+	} else {
+		fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
+		exit(1);
+	}
+
+	return lookup_property(merged);
+}
+
int
main(int argc, char *argv[])
{
(void)argc;
- properties_generate_break_property(char_break_property,
- LEN(char_break_property), NULL, NULL,
- NULL, "char_break", argv[0]);
+ properties_generate_break_property(
+ char_break_property, LEN(char_break_property), NULL,
+ handle_conflict, NULL, "char_break", argv[0]);
return 0;
}
diff --git a/gen/util.c b/gen/util.c
index 6f21906..6b82293 100644
--- a/gen/util.c
+++ b/gen/util.c
@@ -317,7 +317,10 @@ properties_callback(const char *file, char **field, size_t nfields,
(comment != NULL &&
!strncmp(p->spec[i].ucdname, comment,
strlen(p->spec[i].ucdname)) &&
- comment[strlen(p->spec[i].ucdname)] == ' '))) {
+ comment[strlen(p->spec[i].ucdname)] == ' ')) &&
+ (p->spec[i].ucdsubname == NULL ||
+ (nfields >= 3 &&
+ !strcmp(p->spec[i].ucdsubname, field[2])))) {
/* parse range in first field */
if (range_parse(field[0], &r)) {
return 1;
diff --git a/gen/util.h b/gen/util.h
index 31542a4..f3a6ae1 100644
--- a/gen/util.h
+++ b/gen/util.h
@@ -13,6 +13,7 @@ struct property_spec {
const char *enumname;
const char *file;
const char *ucdname;
+ const char *ucdsubname;
};
struct properties {
diff --git a/src/character.c b/src/character.c
index aedcf9e..ee062a6 100644
--- a/src/character.c
+++ b/src/character.c
@@ -1,3 +1,5 @@
+#include <stdio.h>
+
/* See LICENSE file for copyright and license details. */
#include <limits.h>
#include <stdbool.h>
@@ -12,97 +14,239 @@ struct character_break_state {
bool prop_set;
bool gb11_flag;
bool gb12_13_flag;
+ uint_least8_t gb9c_level;
};
-static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
+static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_OTHER] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
- [CHAR_BREAK_PROP_CR] = UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ [CHAR_BREAK_PROP_ICB_CONSONANT] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ [CHAR_BREAK_PROP_ICB_EXTEND] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ [CHAR_BREAK_PROP_ICB_LINKER] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */
[CHAR_BREAK_PROP_EXTEND] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_L] =
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_V] =
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_T] =
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_LV] =
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_HANGUL_LVT] =
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_PREPEND] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
- (UINT16_C(0xFFFF) &
- ~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
- UINT16_C(1) << CHAR_BREAK_PROP_LF |
- UINT16_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
+ (UINT32_C(0xFFFFFFFF) &
+ ~(UINT32_C(1) << CHAR_BREAK_PROP_CR |
+ UINT32_C(1) << CHAR_BREAK_PROP_LF |
+ UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_SPACINGMARK] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
[CHAR_BREAK_PROP_ZWJ] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+ [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
+
};
-static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
+static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
+ UINT32_C(1)
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /* GB9 */
[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
+ [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
+ [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
+ [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER,
};
-static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
+static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
+ [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
};
-static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
+static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
- UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
+ UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
};
-static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
+static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
- UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
+ UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
};
static inline enum char_break_property
@@ -126,7 +270,9 @@ state_serialize(const struct character_break_state *in, uint_least16_t *out)
(uint_least16_t)(((uint_least16_t)(in->gb11_flag))
<< 9) | /* 10th bit */
(uint_least16_t)(((uint_least16_t)(in->gb12_13_flag))
- << 10); /* 11th bit */
+ << 10) | /* 11th bit */
+ (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3))
+ << 11); /* 12th and 13th bit */
}
static inline void
@@ -136,6 +282,7 @@ state_deserialize(uint_least16_t in, struct character_break_state *out)
out->prop_set = in & (UINT16_C(1) << 8);
out->gb11_flag = in & (UINT16_C(1) << 9);
out->gb12_13_flag = in & (UINT16_C(1) << 10);
+ out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3);
}
bool
@@ -164,26 +311,105 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
state.gb11_flag =
flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
state.gb11_flag] &
- UINT16_C(1) << cp1_prop;
+ UINT32_C(1) << cp1_prop;
state.gb12_13_flag =
flag_update_gb12_13[cp0_prop +
NUM_CHAR_BREAK_PROPS *
state.gb12_13_flag] &
- UINT16_C(1) << cp1_prop;
+ UINT32_C(1) << cp1_prop;
+
+ /*
+ * update GB9c state, which deals with indic conjunct breaks.
+ * We want to detect the following prefix:
+ *
+ * ICB_CONSONANT
+ * [ICB_EXTEND ICB_LINKER]*
+ * ICB_LINKER
+ * [ICB_EXTEND ICB_LINKER]*
+ *
+ * This representation is not ideal: In reality, what is
+ * meant is that the prefix is a sequence of [ICB_EXTEND
+ * ICB_LINKER]*, following an ICB_CONSONANT, that contains at
+ * least one ICB_LINKER. We thus use the following equivalent
+ * representation that allows us to store the levels 0..3 in 2
+ * bits.
+ *
+ * ICB_CONSONANT -- Level 1
+ * ICB_EXTEND* -- Level 2
+ * ICB_LINKER -- Level 3
+ * [ICB_EXTEND ICB_LINKER]* -- Level 3
+ *
+ * The following chain of if-else-blocks is a bit redundant and
+ * of course could be optimised, but this is kept as is for
+ * best readability.
+ */
+ if (state.gb9c_level == 0 &&
+ cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
+ /* the sequence has begun */
+ state.gb9c_level = 1;
+ } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
+ (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
+ cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
+ cp0_prop ==
+ CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) {
+ /*
+ * either the level is 1 and thus the ICB consonant is
+ * followed by an ICB extend, where we jump
+ * to level 2, or we are at level 2 and just witness
+ * more ICB extends, staying at level 2.
+ */
+ state.gb9c_level = 2;
+ } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
+ (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
+ cp0_prop ==
+ CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
+ /*
+ * witnessing an ICB linker directly lifts us up to
+ * level 3
+ */
+ state.gb9c_level = 3;
+ } else if (state.gb9c_level == 3 &&
+ (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
+ cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
+ cp0_prop ==
+ CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND ||
+ cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
+ cp0_prop ==
+ CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
+ /*
+ * we stay at level 3 when we observe either ICB
+ * extends or linkers
+ */
+ state.gb9c_level = 3;
+ } else {
+ /*
+ * the sequence has collapsed, but it could be
+ * that the left property is ICB consonant, which
+ * means that we jump right back to level 1 instead
+ * of 0
+ */
+ if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
+ state.gb9c_level = 1;
+ } else {
+ state.gb9c_level = 0;
+ }
+ }
/*
* Apply grapheme cluster breaking algorithm (UAX #29), see
* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
*/
- notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
+ notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
+ (state.gb9c_level == 3 &&
+ cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) ||
(dont_break_gb11[cp0_prop +
state.gb11_flag *
NUM_CHAR_BREAK_PROPS] &
- (UINT16_C(1) << cp1_prop)) ||
+ (UINT32_C(1) << cp1_prop)) ||
(dont_break_gb12_13[cp0_prop +
state.gb12_13_flag *
NUM_CHAR_BREAK_PROPS] &
- (UINT16_C(1) << cp1_prop));
+ (UINT32_C(1) << cp1_prop));
/* update or reset flags (when we have a break) */
if (likely(!notbreak)) {
@@ -202,11 +428,11 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
* Given we have no state, this behaves as if the state-booleans
* were all set to false
*/
- notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
+ notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
(dont_break_gb11[cp0_prop] &
- (UINT16_C(1) << cp1_prop)) ||
+ (UINT32_C(1) << cp1_prop)) ||
(dont_break_gb12_13[cp0_prop] &
- (UINT16_C(1) << cp1_prop));
+ (UINT32_C(1) << cp1_prop));
}
return !notbreak;
Received on Mon Sep 02 2024 - 11:20:55 CEST