commit 0e3d5f60213ba55935364c73422b373ac380f574
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Wed Dec 8 17:47:58 2021 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Wed Dec 8 17:55:56 2021 +0100
Refactor data-generation and library structure
What I never liked was that grapheme_boundary() needed two
heisenstates (one for the grapheme-proptable and one for the
emoji-proptable). This unnecessarily complicated the handling a little.
Both tables are now merged into a single one, even though there is
still room for improvement.
A new folder gen was created to contain the generation tools. The data
folder from now on only contains data files.
gen/util.c now contains all functions necessary to properly parse
property files (and test files). You merely have to create an
"order list" (e.g. in gen/grapheme.c and gen/grapheme-test.c) and are
then good to go. This doesn't remove much code duplication yet, but
will come in handy in the future.
Additionally, src/boundary.c was moved into src/grapheme.c so there's
only one object file pulling in the data-table. This separation makes
the structure of the program clearer and helps the linker discard
unused library elements.
The heisenstate was increased to 64 bits for future use.
Signed-off-by: Laslo Hunhold <dev_AT_frign.de>
diff --git a/LICENSE b/LICENSE
index 936b515..43ddeef 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
ISC-License
-Copyright 2019-2020 Laslo Hunhold <dev_AT_frign.de>
+Copyright 2019-2021 Laslo Hunhold <dev_AT_frign.de>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
diff --git a/Makefile b/Makefile
index e76ae0d..a7a6eee 100644
--- a/Makefile
+++ b/Makefile
@@ -4,52 +4,52 @@
include config.mk
-LIB = src/boundary src/codepoint src/grapheme src/util
-TEST = test/grapheme_boundary test/utf8-decode test/utf8-encode
-DATA = data/emoji data/grapheme_boundary data/grapheme_boundary_test
+DATA =\
+ data/emoji-data.txt\
+ data/GraphemeBreakProperty.txt\
+ data/GraphemeBreakTest.txt
+GEN = gen/grapheme gen/grapheme-test
+LIB = src/codepoint src/grapheme src/util
+TEST = test/grapheme test/utf8-decode test/utf8-encode
MAN3 = man/grapheme_bytelen.3
MAN7 = man/libgrapheme.7
all: libgrapheme.a libgrapheme.so
-data/emoji.h: data/emoji.txt data/emoji
-data/grapheme_boundary.h: data/grapheme_boundary.txt data/grapheme_boundary
-data/grapheme_boundary_test.h: data/grapheme_boundary_test.txt data/grapheme_boundary_test
-
-data/emoji.o: data/emoji.c config.mk data/datautil.h
-data/grapheme_boundary.o: data/grapheme_boundary.c config.mk data/datautil.h
-data/grapheme_boundary_test.o: data/grapheme_boundary_test.c config.mk data/datautil.h
-data/datautil.o: data/datautil.c config.mk data/datautil.h
-src/boundary.o: src/boundary.c config.mk data/emoji.h data/grapheme_boundary.h grapheme.h
+gen/grapheme.o: gen/grapheme.c config.mk gen/util.h
+gen/grapheme-test.o: gen/grapheme-test.c config.mk gen/util.h
+gen/util.o: gen/util.c config.mk gen/util.h
src/codepoint.o: src/codepoint.c config.mk grapheme.h
-src/grapheme.o: src/grapheme.c config.mk grapheme.h
+src/grapheme.o: src/grapheme.c config.mk gen/grapheme.h grapheme.h src/util.h
src/util.o: src/util.c config.mk src/util.h
-test/grapheme_boundary.o: test/grapheme_boundary.c config.mk data/grapheme_boundary_test.h grapheme.h
+test/grapheme.o: test/grapheme.c config.mk gen/grapheme-test.h grapheme.h
test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h
test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h
-data/emoji: data/emoji.o data/datautil.o
-data/grapheme_boundary: data/grapheme_boundary.o data/datautil.o
-data/grapheme_boundary_test: data/grapheme_boundary_test.o data/datautil.o
-test/grapheme_boundary: test/grapheme_boundary.o libgrapheme.a
+gen/grapheme: gen/grapheme.o gen/util.o
+gen/grapheme-test: gen/grapheme-test.o gen/util.o
+test/grapheme: test/grapheme.o libgrapheme.a
test/utf8-encode: test/utf8-encode.o libgrapheme.a
test/utf8-decode: test/utf8-decode.o libgrapheme.a
-data/emoji.txt:
+gen/grapheme.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/grapheme
+gen/grapheme-test.h: data/GraphemeBreakTest.txt gen/grapheme-test
+
+data/emoji-data.txt:
 wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
-data/grapheme_boundary.txt:
+data/GraphemeBreakProperty.txt:
 wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
-data/grapheme_boundary_test.txt:
+data/GraphemeBreakTest.txt:
 wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt
-$(DATA:=.h):
- $(@:.h=) < $(@:.h=.txt) > $@
+$(GEN):
+ $(CC) -o $@ $(LDFLAGS) $@.o gen/util.o
-$(DATA):
- $(CC) -o $@ $(LDFLAGS) $@.o data/datautil.o
+$(GEN:=.h):
+ $(@:.h=) > $@
$(TEST):
 $(CC) -o $@ $(LDFLAGS) $@.o libgrapheme.a
@@ -86,7 +86,7 @@ uninstall:
rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"
clean:
- rm -f $(DATA:=.h) $(DATA:=.o) data/datautil.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so
+ rm -f $(GEN:=.h) $(GEN:=.o) $(GEN) gen/util.o $(LIB:=.o) $(TEST:=.o) $(TEST) libgrapheme.a libgrapheme.so
clean-data:
- rm -f $(DATA:=.txt)
+ rm -f $(DATA)
diff --git a/data/grapheme_boundary.txt b/data/GraphemeBreakProperty.txt
similarity index 100%
rename from data/grapheme_boundary.txt
rename to data/GraphemeBreakProperty.txt
diff --git a/data/grapheme_boundary_test.txt b/data/GraphemeBreakTest.txt
similarity index 100%
rename from data/grapheme_boundary_test.txt
rename to data/GraphemeBreakTest.txt
diff --git a/data/datautil.c b/data/datautil.c
deleted file mode 100644
index 84f059e..0000000
--- a/data/datautil.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-
-#include "datautil.h"
-
-void
-parse_input(int (*process_line)(char **, size_t, char *))
-{
- char *line = NULL, **field = NULL, *comment;
- size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields;
- ssize_t len;
-
- while ((len = getline(&line, &linebufsize, stdin)) >= 0) {
- /* remove trailing newline */
- if (len > 0 && line[len - 1] == '\n') {
- line[len - 1] = '\0';
- len--;
- }
-
- /* skip empty lines and comment lines */
- if (len == 0 || line[0] == '#') {
- continue;
- }
-
- /* tokenize line into fields */
- for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
- /* extend field buffer, if necessary */
- if (++nfields > fieldbufsize) {
- if ((field = realloc(field, nfields *
- sizeof(*field))) == NULL) {
- fprintf(stderr, "realloc: %s\n", strerror(errno));
- exit(1);
- }
- fieldbufsize = nfields;
- }
-
- /* skip leading whitespace */
- while (line[i] == ' ') {
- i++;
- }
-
- /* set current position as field start */
- field[nfields - 1] = &line[i];
-
- /* continue until we reach ';' or '#' or end */
- while (line[i] != ';' && line[i] != '#' &&
- line[i] != '\0') {
- i++;
- }
- if (line [i] == '#') {
- /* set comment-variable for later */
- comment = &line[i + 1];
- }
-
- /* go back whitespace and terminate field there */
- if (i > 0) {
- for (j = i - 1; line[j] == ' '; j--)
- ;
- line[j + 1] = '\0';
- } else {
- line[i] = '\0';
- }
-
- /* if comment is set, we are done */
- if (comment != NULL) {
- break;
- }
- }
-
- /* skip leading whitespace in comment */
- while (comment != NULL && comment[0] == ' ') {
- comment++;
- }
-
- /* call line processing function */
- if (process_line(field, nfields, comment)) {
- exit(1);
- }
- }
-
- free(line);
- free(field);
-}
-
-static int
-valid_hexstring(const char *str)
-{
- const char *p = str;
-
- while ((*p >= '0' && *p <= '9') ||
- (*p >= 'a' && *p <= 'f') ||
- (*p >= 'A' && *p <= 'F')) {
- p++;
- }
-
- if (*p != '\0') {
- fprintf(stderr, "invalid code point range '%s'\n", str);
- return 0;
- }
-
- return 1;
-}
-
-int
-cp_parse(const char *str, uint32_t *cp)
-{
- if (!valid_hexstring(str)) {
- return 1;
- }
- *cp = strtol(str, NULL, 16);
-
- return 0;
-}
-
-int
-range_parse(const char *str, struct range *range)
-{
- char *p;
-
- if ((p = strstr(str, "..")) == NULL) {
- /* input has the form "XXXXXX" */
- if (!valid_hexstring(str)) {
- return 1;
- }
- range->lower = range->upper = strtol(str, NULL, 16);
- } else {
- /* input has the form "XXXXXX..XXXXXX" */
- *p = '\0';
- p += 2;
- if (!valid_hexstring(str) || !valid_hexstring(p)) {
- return 1;
- }
- range->lower = strtol(str, NULL, 16);
- range->upper = strtol(p, NULL, 16);
- }
-
- return 0;
-}
-
-void
-range_list_append(struct range **range, size_t *nranges, const struct range *new)
-{
- if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) {
- /* we can merge with previous entry */
- (*range)[*nranges - 1].upper = new->upper;
- } else {
- /* need to append new entry */
- if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) {
- fprintf(stderr, "realloc: %s\n", strerror(errno));
- exit(1);
- }
- (*range)[*nranges - 1].lower = new->lower;
- (*range)[*nranges - 1].upper = new->upper;
- }
-}
diff --git a/data/datautil.h b/data/datautil.h
deleted file mode 100644
index c64e037..0000000
--- a/data/datautil.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#ifndef DATAUTIL_H
-#define DATAUTIL_H
-
-#include <stddef.h>
-#include <stdint.h>
-
-#define LEN(x) (sizeof (x) / sizeof *(x))
-
-struct range {
- uint32_t lower;
- uint32_t upper;
-};
-
-void parse_input(int (*process_line)(char **, size_t, char *));
-int cp_parse(const char *, uint32_t *);
-int range_parse(const char *, struct range *);
-void range_list_append(struct range **, size_t *, const struct range *);
-
-#endif /* DATAUTIL_H */
diff --git a/data/emoji.txt b/data/emoji-data.txt
similarity index 100%
rename from data/emoji.txt
rename to data/emoji-data.txt
diff --git a/data/emoji.c b/data/emoji.c
deleted file mode 100644
index 8c6c3ce..0000000
--- a/data/emoji.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "datautil.h"
-
-static struct {
- char *enumname;
- char *identifier;
- struct range *table;
- size_t tablelen;
-} properties[] = {
- {
- /* extended pictographic */
- .enumname = "EMOJI_PROP_EXTPICT",
- .identifier = "Extended_Pictographic",
- },
-};
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
- size_t i;
- struct range r;
-
- (void)comment;
-
- if (nfields < 2) {
- return 1;
- }
-
- for (i = 0; i < LEN(properties); i++) {
- if (!strcmp(field[1], properties[i].identifier)) {
- if (range_parse(field[0], &r)) {
- return 1;
- }
- range_list_append(&(properties[i].table),
- &(properties[i].tablelen), &r);
- break;
- }
- }
-
- return 0;
-}
-
-int
-main(void)
-{
- size_t i, j;
-
- printf("/* Automatically generated by data/emo */\n"
- "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n");
-
- parse_input(process_line);
-
- /* output enum */
- printf("enum emoji_prop {\n");
- for (i = 0; i < LEN(properties); i++) {
- printf("\t%s,\n", properties[i].enumname);
- }
- printf("};\n\n");
-
- /* output table */
- printf("static const struct range_list emoji_prop[] = {\n");
- for (i = 0; i < LEN(properties); i++) {
- printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname);
- for (j = 0; j < properties[i].tablelen; j++) {
- printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
- properties[i].table[j].lower,
- properties[i].table[j].upper);
- }
- printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen);
- }
- printf("};\n");
-
- return 0;
-}
diff --git a/data/grapheme_boundary.c b/data/grapheme_boundary.c
deleted file mode 100644
index 068c350..0000000
--- a/data/grapheme_boundary.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "datautil.h"
-
-static struct {
- char *enumname;
- char *identifier;
- struct range *table;
- size_t tablelen;
-} properties[] = {
- {
- /* carriage return */
- .enumname = "GB_PROP_CR",
- .identifier = "CR",
- },
- {
- /* line feed */
- .enumname = "GB_PROP_LF",
- .identifier = "LF",
- },
- {
- /* control character */
- .enumname = "GB_PROP_CONTROL",
- .identifier = "Control",
- },
- {
- /* grapheme extender */
- .enumname = "GB_PROP_EXTEND",
- .identifier = "Extend",
- },
- {
- /* zero width joiner */
- .enumname = "GB_PROP_ZWJ",
- .identifier = "ZWJ",
- },
- {
- /* regional indicator */
- .enumname = "GB_PROP_REGIONAL_INDICATOR",
- .identifier = "Regional_Indicator",
- },
- {
- /* prepend character */
- .enumname = "GB_PROP_PREPEND",
- .identifier = "Prepend",
- },
- {
- /* spacing mark */
- .enumname = "GB_PROP_SPACINGMARK",
- .identifier = "SpacingMark",
- },
- {
- /* hangul syllable type L */
- .enumname = "GB_PROP_L",
- .identifier = "L",
- },
- {
- /* hangul syllable type V */
- .enumname = "GB_PROP_V",
- .identifier = "V",
- },
- {
- /* hangul syllable type T */
- .enumname = "GB_PROP_T",
- .identifier = "T",
- },
- {
- /* hangul syllable type LV */
- .enumname = "GB_PROP_LV",
- .identifier = "LV",
- },
- {
- /* hangul syllable type LVT */
- .enumname = "GB_PROP_LVT",
- .identifier = "LVT",
- },
-};
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
- size_t i;
- struct range r;
-
- (void)comment;
-
- if (nfields < 2) {
- return 1;
- }
-
- for (i = 0; i < LEN(properties); i++) {
- if (!strcmp(field[1], properties[i].identifier)) {
- if (range_parse(field[0], &r)) {
- return 1;
- }
- range_list_append(&(properties[i].table),
- &(properties[i].tablelen), &r);
- break;
- }
- }
-
- return 0;
-}
-
-int
-main(void)
-{
- size_t i, j;
-
- printf("/* Automatically generated by data/gbp */\n"
- "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n");
-
- parse_input(process_line);
-
- /* output enum */
- printf("enum gb_prop {\n");
- for (i = 0; i < LEN(properties); i++) {
- printf("\t%s,\n", properties[i].enumname);
- }
- printf("};\n\n");
-
- /* output table */
- printf("static const struct range_list gb_prop[] = {\n");
- for (i = 0; i < LEN(properties); i++) {
- printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname);
- for (j = 0; j < properties[i].tablelen; j++) {
- printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
- properties[i].table[j].lower,
- properties[i].table[j].upper);
- }
- printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen);
- }
- printf("};\n");
-
- return 0;
-}
diff --git a/data/grapheme_boundary_test.c b/data/grapheme_boundary_test.c
deleted file mode 100644
index 2f3d6b4..0000000
--- a/data/grapheme_boundary_test.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include "datautil.h"
-
-struct break_test {
- uint32_t *cp;
- size_t cplen;
- size_t *len;
- size_t lenlen;
- char *descr;
-};
-
-static struct break_test *test = NULL;
-static size_t ntests = 0;
-
-int
-process_line(char **field, size_t nfields, char *comment)
-{
- struct break_test *t;
- size_t i;
- char *token;
-
- if (nfields < 1) {
- return 1;
- }
-
- /* append new testcase and initialize with zeroes */
- if ((test = realloc(test, ++ntests * sizeof(*test))) == NULL) {
- fprintf(stderr, "realloc: %s\n", strerror(errno));
- return 1;
- }
- t = &test[ntests - 1];
- memset(t, 0, sizeof(*t));
-
- /* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
- for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
- token = strtok(NULL, " ")) {
- if (i % 2 == 0) {
- /* delimiter */
- if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
- /*
- * '÷' indicates a breakpoint,
- * the current length is done; allocate
- * a new length field and set it to 0
- */
- if ((t->len = realloc(t->len,
- ++t->lenlen * sizeof(*t->len))) == NULL) {
- fprintf(stderr, "realloc: %s\n",
- strerror(errno));
- return 1;
- }
- t->len[t->lenlen - 1] = 0;
- } else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
- /*
- * '×' indicates a non-breakpoint, do nothing
- */
- } else {
- fprintf(stderr, "malformed delimiter '%s'\n",
- token);
- return 1;
- }
- } else {
- /* add code point to cp-array */
- if ((t->cp = realloc(t->cp, ++t->cplen *
- sizeof(*t->cp))) == NULL) {
- fprintf(stderr, "realloc: %s\n", strerror(errno));
- return 1;
- }
- if (cp_parse(token, &t->cp[t->cplen - 1])) {
- return 1;
- }
- if (t->lenlen > 0) {
- t->len[t->lenlen - 1]++;
- }
- }
- }
- if (t->len[t->lenlen - 1] == 0) {
- /* we allocated one more length than we needed */
- t->lenlen--;
- }
-
- /* store comment */
- if ((test[ntests - 1].descr = strdup(comment)) == NULL) {
- fprintf(stderr, "strdup: %s\n", strerror(errno));
- return 1;
- }
-
- return 0;
-}
-
-int
-main(void)
-{
- size_t i, j;
-
- printf("/* Automatically generated by data/gbt */\n"
- "#include <stdint.h>\n#include <stddef.h>\n\n");
-
- parse_input(process_line);
-
- printf("static const struct break_test {\n\tuint32_t *cp;\n"
- "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
- "\tchar *descr;\n} t[] = {\n");
- for (i = 0; i < ntests; i++) {
- printf("\t{\n");
-
- printf("\t\t.cp = (uint32_t[]){");
- for (j = 0; j < test[i].cplen; j++) {
- printf(" UINT32_C(0x%06X)", test[i].cp[j]);
- if (j + 1 < test[i].cplen) {
- putchar(',');
- }
- }
- printf(" },\n");
- printf("\t\t.cplen = %zu,\n", test[i].cplen);
-
- printf("\t\t.len = (size_t[]){");
- for (j = 0; j < test[i].lenlen; j++) {
- printf(" %zu", test[i].len[j]);
- if (j + 1 < test[i].lenlen) {
- putchar(',');
- }
- }
- printf(" },\n");
- printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
-
- printf("\t\t.descr = \"%s\",\n", test[i].descr);
-
- printf("\t},\n");
- }
- printf("};\n");
-
- return 0;
-}
diff --git a/gen/grapheme-test.c b/gen/grapheme-test.c
new file mode 100644
index 0000000..e05dae6
--- /dev/null
+++ b/gen/grapheme-test.c
@@ -0,0 +1,18 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+#include "util.h"
+
+int
+main(int argc, char *argv[])
+{
+ struct segment_test *st = NULL;
+ size_t numsegtests = 0;
+
+ (void)argc;
+
+ segment_test_list_parse("data/GraphemeBreakTest.txt", &st, &numsegtests);
+ segment_test_list_print(st, numsegtests, "grapheme_test", argv[0]);
+
+ return 0;
+}
diff --git a/gen/grapheme.c b/gen/grapheme.c
new file mode 100644
index 0000000..232a156
--- /dev/null
+++ b/gen/grapheme.c
@@ -0,0 +1,92 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+#include "util.h"
+
+#define FILE_EMOJI "data/emoji-data.txt"
+#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt"
+
+static struct property segment_property[] = {
+ {
+ .enumname = "GRAPHEME_PROP_CONTROL",
+ .identifier = "Control",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_CR",
+ .identifier = "CR",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_EXTEND",
+ .identifier = "Extend",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC",
+ .identifier = "Extended_Pictographic",
+ .fname = FILE_EMOJI,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_HANGUL_L",
+ .identifier = "L",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_HANGUL_V",
+ .identifier = "V",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_HANGUL_T",
+ .identifier = "T",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_HANGUL_LV",
+ .identifier = "LV",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_HANGUL_LVT",
+ .identifier = "LVT",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_LF",
+ .identifier = "LF",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_PREPEND",
+ .identifier = "Prepend",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_REGIONAL_INDICATOR",
+ .identifier = "Regional_Indicator",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_SPACINGMARK",
+ .identifier = "SpacingMark",
+ .fname = FILE_GRAPHEME,
+ },
+ {
+ .enumname = "GRAPHEME_PROP_ZWJ",
+ .identifier = "ZWJ",
+ .fname = FILE_GRAPHEME,
+ },
+};
+
+int
+main(int argc, char *argv[])
+{
+ (void)argc;
+
+ property_list_parse(segment_property, LEN(segment_property));
+ property_list_print(segment_property, LEN(segment_property),
+ "grapheme_prop", argv[0]);
+
+ return 0;
+}
diff --git a/gen/util.c b/gen/util.c
new file mode 100644
index 0000000..ec5afb7
--- /dev/null
+++ b/gen/util.c
@@ -0,0 +1,384 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include "util.h"
+
+struct property_list_payload
+{
+ struct property *prop;
+ size_t numprops;
+};
+
+struct segment_test_payload
+{
+ struct segment_test **st;
+ size_t *numsegtests;
+};
+
+static int
+valid_hexstring(const char *str)
+{
+ const char *p = str;
+
+ while ((*p >= '0' && *p <= '9') ||
+ (*p >= 'a' && *p <= 'f') ||
+ (*p >= 'A' && *p <= 'F')) {
+ p++;
+ }
+
+ if (*p != '\0') {
+ fprintf(stderr, "valid_hexstring: Invalid code point range '%s'\n", str);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int
+cp_parse(const char *str, uint32_t *cp)
+{
+ if (!valid_hexstring(str)) {
+ return 1;
+ }
+ *cp = strtol(str, NULL, 16);
+
+ return 0;
+}
+
+static int
+range_parse(const char *str, struct range *range)
+{
+ char *p;
+
+ if ((p = strstr(str, "..")) == NULL) {
+ /* input has the form "XXXXXX" */
+ if (!valid_hexstring(str)) {
+ return 1;
+ }
+ range->lower = range->upper = strtol(str, NULL, 16);
+ } else {
+ /* input has the form "XXXXXX..XXXXXX" */
+ *p = '\0';
+ p += 2;
+ if (!valid_hexstring(str) || !valid_hexstring(p)) {
+ return 1;
+ }
+ range->lower = strtol(str, NULL, 16);
+ range->upper = strtol(p, NULL, 16);
+ }
+
+ return 0;
+}
+
+void
+range_list_append(struct range **range, size_t *nranges, const struct range *new)
+{
+ if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) {
+ /* we can merge with previous entry */
+ (*range)[*nranges - 1].upper = new->upper;
+ } else {
+ /* need to append new entry */
+ if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) {
+ fprintf(stderr, "realloc: %s\n", strerror(errno));
+ exit(1);
+ }
+ (*range)[*nranges - 1].lower = new->lower;
+ (*range)[*nranges - 1].upper = new->upper;
+ }
+}
+
+void parse_file_with_callback(char *fname, int (*callback)(char *, char **, size_t, char *, void *), void *payload)
+{
+ FILE *fp;
+ char *line = NULL, **field = NULL, *comment;
+ size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields;
+ ssize_t len;
+
+ /* open file */
+ if (!(fp = fopen(fname, "r"))) {
+ fprintf(stderr, "fopen '%s': %s\n", fname,
+ strerror(errno));
+ exit(1);
+ }
+
+ while ((len = getline(&line, &linebufsize, fp)) >= 0) {
+ /* remove trailing newline */
+ if (len > 0 && line[len - 1] == '\n') {
+ line[len - 1] = '\0';
+ len--;
+ }
+
+ /* skip empty lines and comment lines */
+ if (len == 0 || line[0] == '#') {
+ continue;
+ }
+
+ /* tokenize line into fields */
+ for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
+ /* extend field buffer, if necessary */
+ if (++nfields > fieldbufsize) {
+ if ((field = realloc(field, nfields *
+ sizeof(*field))) == NULL) {
+ fprintf(stderr, "realloc: %s\n", strerror(errno));
+ exit(1);
+ }
+ fieldbufsize = nfields;
+ }
+
+ /* skip leading whitespace */
+ while (line[i] == ' ') {
+ i++;
+ }
+
+ /* set current position as field start */
+ field[nfields - 1] = &line[i];
+
+ /* continue until we reach ';' or '#' or end */
+ while (line[i] != ';' && line[i] != '#' &&
+ line[i] != '\0') {
+ i++;
+ }
+ if (line[i] == '#') {
+ /* set comment-variable for later */
+ comment = &line[i + 1];
+ }
+
+ /* go back whitespace and terminate field there */
+ if (i > 0) {
+ for (j = i - 1; line[j] == ' '; j--)
+ ;
+ line[j + 1] = '\0';
+ } else {
+ line[i] = '\0';
+ }
+
+ /* if comment is set, we are done */
+ if (comment != NULL) {
+ break;
+ }
+ }
+
+ /* skip leading whitespace in comment */
+ while (comment != NULL && comment[0] == ' ') {
+ comment++;
+ }
+
+ /* call callback function */
+ if (callback(fname, field, nfields, comment, payload)) {
+ fprintf(stderr, "parse_file_with_callback: Malformed input.\n");
+ exit(1);
+ }
+ }
+}
+
+int
+property_list_callback(char *fname, char **field, size_t nfields, char *comment, void *payload)
+{
+ struct property *prop = ((struct property_list_payload *)payload)->prop;
+ struct range r;
+ size_t i, numprops = ((struct property_list_payload *)payload)->numprops;
+
+ (void)comment;
+
+ if (nfields < 2) {
+ return 1;
+ }
+
+ for (i = 0; i < numprops; i++) {
+ if (!strcmp(field[1], prop[i].identifier) &&
+ !strcmp(fname, prop[i].fname)) {
+ if (range_parse(field[0], &r)) {
+ return 1;
+ }
+ range_list_append(&(prop[i].table),
+ &(prop[i].tablelen), &r);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+void
+property_list_parse(struct property *prop, size_t numprops)
+{
+ struct property_list_payload pl = { .prop = prop, .numprops = numprops };
+ size_t i;
+
+ /* make sure to parse each file only once */
+ for (i = 0; i < numprops; i++) {
+ if (prop[i].tablelen > 0) {
+ /* property's file was already parsed */
+ continue;
+ }
+
+ parse_file_with_callback(prop[i].fname, property_list_callback, &pl);
+ }
+}
+
+void
+property_list_print(const struct property *prop, size_t numprops,
+ const char *identifier, const char *progname)
+{
+ size_t i, j;
+
+ printf("/* Automatically generated by %s */\n"
+ "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n",
+ progname);
+
+ /* print enum */
+ printf("enum %s {\n", identifier);
+ for (i = 0; i < numprops; i++) {
+ printf("\t%s,\n", prop[i].enumname);
+ }
+ printf("};\n\n");
+
+ /* print table */
+ printf("static const struct range_list %s[] = {\n", identifier);
+ for (i = 0; i < numprops; i++) {
+ printf("\t[%s] = {\n\t\t.data = (struct range[]){\n",
+ prop[i].enumname);
+ for (j = 0; j < prop[i].tablelen; j++) {
+ printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+ prop[i].table[j].lower,
+ prop[i].table[j].upper);
+ }
+ printf("\t\t},\n\t\t.len = %zu,\n\t},\n", prop[i].tablelen);
+ }
+ printf("};\n");
+}
+
+int
+segment_test_callback(char *fname, char **field, size_t nfields, char *comment, void *payload)
+{
+ struct segment_test *t, **test = ((struct segment_test_payload *)payload)->st;
+ size_t i, *ntests = ((struct segment_test_payload *)payload)->numsegtests;
+ char *token;
+
+ (void)fname;
+
+ if (nfields < 1) {
+ return 1;
+ }
+
+ /* append new testcase and initialize with zeroes */
+ if ((*test = realloc(*test, ++(*ntests) * sizeof(**test))) == NULL) {
+ fprintf(stderr, "realloc: %s\n", strerror(errno));
+ return 1;
+ }
+ t = &(*test)[*ntests - 1];
+ memset(t, 0, sizeof(*t));
+
+ /* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
+ for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
+ token = strtok(NULL, " ")) {
+ if (i % 2 == 0) {
+ /* delimiter */
+ if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
+ /*
+ * '÷' indicates a breakpoint,
+ * the current length is done; allocate
+ * a new length field and set it to 0
+ */
+ if ((t->len = realloc(t->len,
+ ++t->lenlen * sizeof(*t->len))) == NULL) {
+ fprintf(stderr, "realloc: %s\n",
+ strerror(errno));
+ return 1;
+ }
+ t->len[t->lenlen - 1] = 0;
+ } else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
+ /*
+ * '×' indicates a non-breakpoint, do nothing
+ */
+ } else {
+ fprintf(stderr, "malformed delimiter '%s'\n",
+ token);
+ return 1;
+ }
+ } else {
+ /* add code point to cp-array */
+ if ((t->cp = realloc(t->cp, ++t->cplen *
+ sizeof(*t->cp))) == NULL) {
+ fprintf(stderr, "realloc: %s\n", strerror(errno));
+ return 1;
+ }
+ if (cp_parse(token, &t->cp[t->cplen - 1])) {
+ return 1;
+ }
+ if (t->lenlen > 0) {
+ t->len[t->lenlen - 1]++;
+ }
+ }
+ }
+ if (t->len[t->lenlen - 1] == 0) {
+ /* we allocated one more length than we needed */
+ t->lenlen--;
+ }
+
+ /* store comment */
+ if (((*test)[*ntests - 1].descr = strdup(comment)) == NULL) {
+ fprintf(stderr, "strdup: %s\n", strerror(errno));
+ return 1;
+ }
+
+ return 0;
+}
+
+void
+segment_test_list_parse(char *fname, struct segment_test **st, size_t *numsegtests)
+{
+ struct segment_test_payload pl = { .st = st, .numsegtests = numsegtests };
+ *st = NULL;
+ *numsegtests = 0;
+
+ parse_file_with_callback(fname, segment_test_callback, &pl);
+}
+
+void
+segment_test_list_print(struct segment_test *st, size_t numsegtests,
+ const char *identifier, const char *progname)
+{
+ size_t i, j;
+
+ printf("/* Automatically generated by %s */\n"
+ "#include <stdint.h>\n#include <stddef.h>\n\n", progname);
+
+ printf("static const struct {\n\tuint32_t *cp;\n"
+ "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
+ "\tchar *descr;\n} %s[] = {\n", identifier);
+ for (i = 0; i < numsegtests; i++) {
+ printf("\t{\n");
+
+ printf("\t\t.cp = (uint32_t[]){");
+ for (j = 0; j < st[i].cplen; j++) {
+ printf(" UINT32_C(0x%06X)", st[i].cp[j]);
+ if (j + 1 < st[i].cplen) {
+ putchar(',');
+ }
+ }
+ printf(" },\n");
+ printf("\t\t.cplen = %zu,\n", st[i].cplen);
+
+ printf("\t\t.len = (size_t[]){");
+ for (j = 0; j < st[i].lenlen; j++) {
+ printf(" %zu", st[i].len[j]);
+ if (j + 1 < st[i].lenlen) {
+ putchar(',');
+ }
+ }
+ printf(" },\n");
+ printf("\t\t.lenlen = %zu,\n", st[i].lenlen);
+
+ printf("\t\t.descr = \"%s\",\n", st[i].descr);
+
+ printf("\t},\n");
+ }
+ printf("};\n");
+}
+
+
diff --git a/gen/util.h b/gen/util.h
new file mode 100644
index 0000000..9461416
--- /dev/null
+++ b/gen/util.h
@@ -0,0 +1,37 @@
+/* See LICENSE file for copyright and license details. */
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define LEN(x) (sizeof (x) / sizeof *(x))
+
+struct range {
+ uint32_t lower;
+ uint32_t upper;
+};
+
+struct property {
+ char *enumname;
+ char *identifier;
+ char *fname;
+ struct range *table;
+ size_t tablelen;
+};
+
+struct segment_test {
+ uint32_t *cp;
+ size_t cplen;
+ size_t *len;
+ size_t lenlen;
+ char *descr;
+};
+
+void property_list_parse(struct property *, size_t);
+void property_list_print(const struct property *, size_t, const char *, const char *);
+
+void segment_test_list_parse(char *, struct segment_test **, size_t *);
+void segment_test_list_print(struct segment_test *, size_t, const char *, const char *);
+
+#endif /* UTIL_H */
diff --git a/src/boundary.c b/src/boundary.c
deleted file mode 100644
index f1c03d2..0000000
--- a/src/boundary.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "../data/emoji.h"
-#include "../data/grapheme_boundary.h"
-
-enum {
- GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
- GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
-};
-
-static int
-cp_cmp(const void *a, const void *b)
-{
- uint32_t cp = *(uint32_t *)a;
- uint32_t *range = (uint32_t *)b;
-
- return (cp >= range[0] && cp <= range[1]) ? 0 : (cp - range[0]);
-}
-
-static int
-has_property(uint32_t cp, struct heisenstate *cpstate,
- const struct range_list *proptable, int property)
-{
- if (heisenstate_get(cpstate, property) == -1) {
- /* state undetermined, make a lookup and set it */
- heisenstate_set(cpstate, property, bsearch(&cp,
- proptable[property].data,
- proptable[property].len,
- sizeof(*proptable[property].data),
- cp_cmp) ? 1 : 0);
- }
-
- return heisenstate_get(cpstate, property);
-}
-
-int
-grapheme_boundary(uint32_t a, uint32_t b, int *state)
-{
- struct heisenstate gb[2] = { 0 }, emoji[2] = { 0 };
- int s;
-
- /* skip printable ASCII */
- if ((a >= 0x20 && a <= 0x7E) &&
- (b >= 0x20 && b <= 0x7E)) {
- return 1;
- }
-
- /* set internal state based on given state-pointer */
- s = (state != NULL) ? *state : 0;
-
- /*
- * Apply grapheme cluster breaking algorithm (UAX #29), see
- * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
- */
-
- /*
- * update state
- */
- if (has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
- if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
- /* one more RI is on the left side of the seam */
- s ^= GRAPHEME_STATE_RI_ODD;
- } else {
- /* an RI appeared on the right side but the left
- side is not an RI, reset state (0 is even) */
- s &= ~GRAPHEME_STATE_RI_ODD;
- }
- }
- if (!(*state & GRAPHEME_STATE_EMOJI) &&
- ((has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) ||
- (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) {
- s |= GRAPHEME_STATE_EMOJI;
- } else if ((*state & GRAPHEME_STATE_EMOJI) &&
- ((has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) &&
- has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) ||
- (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)) ||
- (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) ||
- (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) ||
- (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) {
- /* GRAPHEME_STATE_EMOJI remains */
- } else {
- s &= ~GRAPHEME_STATE_EMOJI;
- }
-
- /* write updated state to state-pointer, if given */
- if (state != NULL) {
- *state = s;
- }
-
- /*
- * apply rules
- */
-
- /* skip GB1 and GB2, as they are never satisfied here */
-
- /* GB3 */
- if (has_property(a, &gb[0], gb_prop, GB_PROP_CR) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
- return 0;
- }
-
- /* GB4 */
- if (has_property(a, &gb[0], gb_prop, GB_PROP_CONTROL) ||
- has_property(a, &gb[0], gb_prop, GB_PROP_CR) ||
- has_property(a, &gb[0], gb_prop, GB_PROP_LF)) {
- return 1;
- }
-
- /* GB5 */
- if (has_property(b, &gb[1], gb_prop, GB_PROP_CONTROL) ||
- has_property(b, &gb[1], gb_prop, GB_PROP_CR) ||
- has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
- return 1;
- }
-
- /* GB6 */
- if (has_property(a, &gb[0], gb_prop, GB_PROP_L) &&
- (has_property(b, &gb[1], gb_prop, GB_PROP_L) ||
- has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
- has_property(b, &gb[1], gb_prop, GB_PROP_LV) ||
- has_property(b, &gb[1], gb_prop, GB_PROP_LVT))) {
- return 0;
- }
-
- /* GB7 */
- if ((has_property(a, &gb[0], gb_prop, GB_PROP_LV) ||
- has_property(a, &gb[0], gb_prop, GB_PROP_V)) &&
- (has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
- has_property(b, &gb[1], gb_prop, GB_PROP_T))) {
- return 0;
- }
-
- /* GB8 */
- if ((has_property(a, &gb[0], gb_prop, GB_PROP_LVT) ||
- has_property(a, &gb[0], gb_prop, GB_PROP_T)) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_T)) {
- return 0;
- }
-
- /* GB9 */
- if (has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND) ||
- has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) {
- return 0;
- }
-
- /* GB9a */
- if (has_property(b, &gb[1], gb_prop, GB_PROP_SPACINGMARK)) {
- return 0;
- }
-
- /* GB9b */
- if (has_property(a, &gb[0], gb_prop, GB_PROP_PREPEND)) {
- return 0;
- }
-
- /* GB11 */
- if ((s & GRAPHEME_STATE_EMOJI) &&
- has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) &&
- has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) {
- return 0;
- }
-
- /* GB12/GB13 */
- if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
- has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
- (s & GRAPHEME_STATE_RI_ODD)) {
- return 0;
- }
-
- /* GB999 */
- return 1;
-}
diff --git a/src/grapheme.c b/src/grapheme.c
index 8577038..068f91b 100644
--- a/src/grapheme.c
+++ b/src/grapheme.c
@@ -2,8 +2,158 @@
#include <stddef.h>
#include <stdlib.h>
+#include "../gen/grapheme.h"
#include "../grapheme.h"
+/*
+ * Bit flags making up the state that grapheme_boundary() reads from
+ * and writes back through its state-pointer between successive calls.
+ */
+enum {
+	/* odd number of RI's before the seam */
+	GRAPHEME_STATE_RI_ODD = 0x1,
+	/* within emoji modifier or zwj sequence */
+	GRAPHEME_STATE_EMOJI  = 0x2,
+};
+
+/*
+ * Decide whether a grapheme cluster boundary lies between two adjacent
+ * code points, a being left and b being right of the seam.
+ *
+ * state is optional. If it is not NULL, *state is read as a set of
+ * GRAPHEME_STATE_* flags, updated for the pair (a, b) and written back.
+ * If it is NULL, the flags start out empty and nothing is written back.
+ * A pair of printable ASCII code points returns early and leaves *state
+ * untouched.
+ *
+ * Returns 1 if there is a boundary between a and b, 0 otherwise.
+ */
+int
+grapheme_boundary(uint32_t a, uint32_t b, int *state)
+{
+	/* per-call property caches: prop[0] belongs to a, prop[1] to b */
+	struct heisenstate prop[2] = { 0 };
+	int s;
+
+	/* skip printable ASCII */
+	if ((a >= 0x20 && a <= 0x7E) &&
+	    (b >= 0x20 && b <= 0x7E)) {
+		return 1;
+	}
+
+	/* set internal state based on given state-pointer */
+	s = (state != NULL) ? *state : 0;
+
+	/*
+	 * Apply grapheme cluster breaking algorithm (UAX #29), see
+	 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
+	 */
+
+	/*
+	 * update state
+	 */
+	if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
+		if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
+			/* one more RI is on the left side of the seam */
+			s ^= GRAPHEME_STATE_RI_ODD;
+		} else {
+			/* an RI appeared on the right side but the left
+			   side is not an RI, reset state (0 is even) */
+			s &= ~GRAPHEME_STATE_RI_ODD;
+		}
+	}
+
+	/*
+	 * The emoji flag is tested on the local copy s and never via
+	 * *state, because state may be NULL. The RI update above only
+	 * touches GRAPHEME_STATE_RI_ODD, so s still carries the emoji
+	 * flag exactly as it was passed in.
+	 */
+	if (!(s & GRAPHEME_STATE_EMOJI) &&
+	    ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+	      has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+	     (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+	      has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
+		/* an emoji sequence starts at this seam */
+		s |= GRAPHEME_STATE_EMOJI;
+	} else if ((s & GRAPHEME_STATE_EMOJI) &&
+	           ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
+	             has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) ||
+	            (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
+	             has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) ||
+	            (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
+	             has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+	            (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+	             has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+	            (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+	             has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
+		/* GRAPHEME_STATE_EMOJI remains */
+	} else {
+		s &= ~GRAPHEME_STATE_EMOJI;
+	}
+
+	/* write updated state to state-pointer, if given */
+	if (state != NULL) {
+		*state = s;
+	}
+
+	/*
+	 * apply rules
+	 */
+
+	/* skip GB1 and GB2, as they are never satisfied here */
+
+	/* GB3 */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) &&
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) {
+		return 0;
+	}
+
+	/* GB4 */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
+	    has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) ||
+	    has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_LF)) {
+		return 1;
+	}
+
+	/* GB5 */
+	if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CR) ||
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) {
+		return 1;
+	}
+
+	/* GB6 */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) &&
+	    (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) ||
+	     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
+	     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
+	     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) {
+		return 0;
+	}
+
+	/* GB7 */
+	if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
+	     has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) &&
+	    (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
+	     has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) {
+		return 0;
+	}
+
+	/* GB8 */
+	if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) ||
+	     has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) &&
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) {
+		return 0;
+	}
+
+	/* GB9 */
+	if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND) ||
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) {
+		return 0;
+	}
+
+	/* GB9a */
+	if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) {
+		return 0;
+	}
+
+	/* GB9b */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) {
+		return 0;
+	}
+
+	/* GB11 */
+	if ((s & GRAPHEME_STATE_EMOJI) &&
+	    has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) {
+		return 0;
+	}
+
+	/* GB12/GB13 */
+	if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
+	    has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
+	    (s & GRAPHEME_STATE_RI_ODD)) {
+		return 0;
+	}
+
+	/* GB999 */
+	return 1;
+}
+
size_t
grapheme_bytelen(const char *str)
{
diff --git a/src/util.c b/src/util.c
index 662ea98..955cdad 100644
--- a/src/util.c
+++ b/src/util.c
_AT_@ -1,10 +1,13 @@
/* See LICENSE file for copyright and license details. */
+#include <stdint.h>
+#include <stdlib.h>
+
#include "util.h"
int
heisenstate_get(struct heisenstate *h, int slot)
{
- if (h == NULL || slot >= 16 || slot < 0 ||
+ if (h == NULL || slot >= 64 || slot < 0 ||
!(h->determined & (1 << slot))) {
/* no state given, slot out of range or undetermined */
return -1;
_AT_@ -17,7 +20,7 @@ heisenstate_get(struct heisenstate *h, int slot)
int
heisenstate_set(struct heisenstate *h, int slot, int state)
{
- if (h == NULL || slot >= 16 || slot < 0) {
+ if (h == NULL || slot >= 64 || slot < 0) {
/* no state given or slot out of range */
return 1;
} else {
_AT_@ -31,3 +34,28 @@ heisenstate_set(struct heisenstate *h, int slot, int state)
return 0;
}
+
+/*
+ * bsearch() comparison function for code point lookups.
+ *
+ * a points to the uint32_t code point that is searched for, b points
+ * to a range stored as two uint32_t values { first, last }, both
+ * bounds inclusive.
+ *
+ * Returns 0 if the code point lies within the range, a negative value
+ * if it lies below the range and a positive value if it lies above.
+ */
+static int
+cp_cmp(const void *a, const void *b)
+{
+	const uint32_t cp = *(const uint32_t *)a;
+	const uint32_t *range = b;
+
+	if (cp < range[0]) {
+		return -1;
+	}
+
+	/*
+	 * Do not return the unsigned difference (cp - range[0]) here:
+	 * converted to int its sign is implementation-defined and it
+	 * comes out wrong once the distance does not fit into an int.
+	 */
+	return (cp > range[1]) ? 1 : 0;
+}
+
+/*
+ * Check whether the code point cp has the given property.
+ *
+ * cpstate caches earlier answers for this code point, one slot per
+ * property. proptable is indexed by property; each entry provides the
+ * code point ranges (data) and their number (len), which are searched
+ * with bsearch() and thus have to be sorted in ascending order.
+ *
+ * Returns 1 if cp has the property and 0 if it does not.
+ */
+int
+has_property(uint32_t cp, struct heisenstate *cpstate,
+             const struct range_list *proptable, int property)
+{
+	int found;
+
+	if (heisenstate_get(cpstate, property) == -1) {
+		/* state undetermined, make a lookup and set it */
+		found = bsearch(&cp,
+		                proptable[property].data,
+		                proptable[property].len,
+		                sizeof(*proptable[property].data),
+		                cp_cmp) ? 1 : 0;
+
+		if (heisenstate_set(cpstate, property, found)) {
+			/*
+			 * The result could not be stored (no state given
+			 * or slot out of range). Hand out the lookup
+			 * result directly, as heisenstate_get() would
+			 * yield -1 again, which callers using the return
+			 * value as a truth value would take for "has
+			 * the property".
+			 */
+			return found;
+		}
+	}
+
+	return heisenstate_get(cpstate, property);
+}
diff --git a/src/util.h b/src/util.h
index e213428..e480da0 100644
--- a/src/util.h
+++ b/src/util.h
_AT_@ -17,13 +17,16 @@ struct range_list {
size_t len;
};
-/* 16-slot (0,...,15) optionally undetermined binary state */
+/* 64-slot (0,...,63) optionally undetermined binary state */
struct heisenstate {
- uint_least16_t determined;
- uint_least16_t state;
+ uint_least64_t determined;
+ uint_least64_t state;
};
int heisenstate_get(struct heisenstate *, int);
int heisenstate_set(struct heisenstate *, int, int);
+int has_property(uint32_t, struct heisenstate *,
+ const struct range_list *, int);
+
#endif /* UTIL_H */
diff --git a/test/grapheme_boundary.c b/test/grapheme.c
similarity index 54%
rename from test/grapheme_boundary.c
rename to test/grapheme.c
index 09f5971..ff4d1f4 100644
--- a/test/grapheme_boundary.c
+++ b/test/grapheme.c
_AT_@ -5,7 +5,7 @@
#include <string.h>
#include "../grapheme.h"
-#include "../data/grapheme_boundary_test.h"
+#include "../gen/grapheme-test.h"
#define LEN(x) (sizeof(x) / sizeof(*x))
_AT_@ -16,15 +16,17 @@ main(void)
size_t i, j, k, len, failed;
/* grapheme break test */
- for (i = 0, failed = 0; i < LEN(t); i++) {
- for (j = 0, k = 0, state = 0, len = 1; j < t[i].cplen; j++) {
- if ((j + 1) == t[i].cplen ||
- grapheme_boundary(t[i].cp[j], t[i].cp[j + 1],
+ for (i = 0, failed = 0; i < LEN(grapheme_test); i++) {
+ for (j = 0, k = 0, state = 0, len = 1; j < grapheme_test[i].cplen; j++) {
+ if ((j + 1) == grapheme_test[i].cplen ||
+ grapheme_boundary(grapheme_test[i].cp[j],
+ grapheme_test[i].cp[j + 1],
&state)) {
/* check if our resulting length matches */
- if (k == t[i].lenlen || len != t[i].len[k++]) {
+ if (k == grapheme_test[i].lenlen ||
+ len != grapheme_test[i].len[k++]) {
fprintf(stderr, "Failed \"%s\"\n",
- t[i].descr);
+ grapheme_test[i].descr);
failed++;
break;
}
_AT_@ -35,7 +37,7 @@ main(void)
}
}
printf("Grapheme break test: Passed %zu out of %zu tests.\n",
- LEN(t) - failed, LEN(t));
+ LEN(grapheme_test) - failed, LEN(grapheme_test));
return (failed > 0) ? 1 : 0;
}
Received on Wed Dec 08 2021 - 17:57:05 CET