[hackers] [libgrapheme] Refactor data-tables and lookup-code to be more universal || Laslo Hunhold

From: <git_AT_suckless.org>
Date: Fri, 4 Dec 2020 13:42:36 +0100 (CET)

commit ea318c404e67e71aaf9aeb2dab671eee57ed2766
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Fri Dec 4 13:35:13 2020 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Fri Dec 4 13:35:13 2020 +0100

    Refactor data-tables and lookup-code to be more universal
    
    Previously, we had an explicit list of property-tables in boundary.c,
    but this shouldn't be the place to put them, especially if we plan to
    use the tables somewhere else, too. Instead, structure the data
    better by emitting an enum for each datatype and basing the rest of
    the code on it.
    
    For boundary.c, this turns out to require two heisenstates (see
    below), one for the emoji table and one for the grapheme-break table.
    That is more than before, but getting by with fewer was only possible
    because we had reduced the generalizability of the code.
    
    The advantage is that if the Unicode specification adds another type
    of character and implements it in the algorithm, it is as simple as
    adding an element to the properties-array in the data-generator (e.g.
    data/grapheme_boundary.c) which in turn automatically adds it to the
    enum and you are free to access it from the code.
    
    The specific changes are summarized below:
    
    - Add heisenstate-struct to handle partially-known states to prevent
      multiple identical lookups. Previously, we already kept track of it,
      but handled the states by hand.
    - Remove explicit table-listing in src/boundary.c and adapt code
    - New datatable-format that is necessary to have an enum-indexed table
    - Rename data/util.h to data/datautil.h
    
    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/Makefile b/Makefile
index 38487d8..23d5ffe 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@
 
 include config.mk
 
-LIB = src/boundary src/codepoint src/grapheme
+LIB = src/boundary src/codepoint src/grapheme src/util
 TEST = test/grapheme_boundary test/utf8-decode test/utf8-encode
 DATA = data/emoji data/grapheme_boundary data/grapheme_boundary_test
 
@@ -17,20 +17,21 @@ data/emoji.h: data/emoji.txt data/emoji
 data/grapheme_boundary.h: data/grapheme_boundary.txt data/grapheme_boundary
 data/grapheme_boundary_test.h: data/grapheme_boundary_test.txt data/grapheme_boundary_test
 
-data/emoji.o: data/emoji.c config.mk data/util.h
-data/grapheme_boundary.o: data/grapheme_boundary.c config.mk data/util.h
-data/grapheme_boundary_test.o: data/grapheme_boundary_test.c config.mk data/util.h
-data/util.o: data/util.c config.mk data/util.h
+data/emoji.o: data/emoji.c config.mk data/datautil.h
+data/grapheme_boundary.o: data/grapheme_boundary.c config.mk data/datautil.h
+data/grapheme_boundary_test.o: data/grapheme_boundary_test.c config.mk data/datautil.h
+data/datautil.o: data/datautil.c config.mk data/datautil.h
 src/boundary.o: src/boundary.c config.mk data/emoji.h data/grapheme_boundary.h grapheme.h
 src/codepoint.o: src/codepoint.c config.mk grapheme.h
 src/grapheme.o: src/grapheme.c config.mk grapheme.h
+src/util.o: src/util.c config.mk src/util.h
 test/grapheme_boundary.o: test/grapheme_boundary.c config.mk data/grapheme_boundary_test.h grapheme.h
 test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h
 test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h
 
-data/emoji: data/emoji.o data/util.o
-data/grapheme_boundary: data/grapheme_boundary.o data/util.o
-data/grapheme_boundary_test: data/grapheme_boundary_test.o data/util.o
+data/emoji: data/emoji.o data/datautil.o
+data/grapheme_boundary: data/grapheme_boundary.o data/datautil.o
+data/grapheme_boundary_test: data/grapheme_boundary_test.o data/datautil.o
 test/grapheme_boundary: test/grapheme_boundary.o libgrapheme.a
 test/utf8-encode: test/utf8-encode.o libgrapheme.a
 test/utf8-decode: test/utf8-decode.o libgrapheme.a
@@ -48,7 +49,7 @@ $(DATA:=.h):
         $(@:.h=) < $(@:.h=.txt) > $@
 
 $(DATA):
- $(CC) -o $@ $(LDFLAGS) $@.o data/util.o
+ $(CC) -o $@ $(LDFLAGS) $@.o data/datautil.o
 
 $(TEST):
         $(CC) -o $@ $(LDFLAGS) $@.o libgrapheme.a
@@ -85,7 +86,7 @@ uninstall:
         rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"
 
 clean:
- rm -f $(DATA:=.h) $(DATA:=.o) data/util.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so
+ rm -f $(DATA:=.h) $(DATA:=.o) data/datautil.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so
 
 clean-data:
         rm -f $(DATA:=.txt)
diff --git a/data/util.c b/data/datautil.c
similarity index 99%
rename from data/util.c
rename to data/datautil.c
index b950dbd..84f059e 100644
--- a/data/util.c
+++ b/data/datautil.c
@@ -5,7 +5,7 @@
 #include <string.h>
 #include <errno.h>
 
-#include "util.h"
+#include "datautil.h"
 
 void
 parse_input(int (*process_line)(char **, size_t, char *))
diff --git a/data/util.h b/data/datautil.h
similarity index 87%
rename from data/util.h
rename to data/datautil.h
index e84eb01..c64e037 100644
--- a/data/util.h
+++ b/data/datautil.h
@@ -1,6 +1,6 @@
 /* See LICENSE file for copyright and license details. */
-#ifndef UTIL_H
-#define UTIL_H
+#ifndef DATAUTIL_H
+#define DATAUTIL_H
 
 #include <stddef.h>
 #include <stdint.h>
@@ -17,4 +17,4 @@ int cp_parse(const char *, uint32_t *);
 int range_parse(const char *, struct range *);
 void range_list_append(struct range **, size_t *, const struct range *);
 
-#endif /* UTIL_H */
+#endif /* DATAUTIL_H */
diff --git a/data/emoji.c b/data/emoji.c
index a3a9e38..8c6c3ce 100644
--- a/data/emoji.c
+++ b/data/emoji.c
@@ -3,17 +3,18 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "util.h"
+#include "datautil.h"
 
 static struct {
+ char *enumname;
         char *identifier;
- char *tablename;
         struct range *table;
         size_t tablelen;
 } properties[] = {
         {
+ /* extended pictographic */
+ .enumname = "EMOJI_PROP_EXTPICT",
                 .identifier = "Extended_Pictographic",
- .tablename = "extpict_table",
         },
 };
 
@@ -49,20 +50,29 @@ main(void)
         size_t i, j;
 
         printf("/* Automatically generated by data/emo */\n"
- "#include <stdint.h>\n");
+ "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n");
 
         parse_input(process_line);
 
+ /* output enum */
+ printf("enum emoji_prop {\n");
         for (i = 0; i < LEN(properties); i++) {
- printf("\nstatic const uint32_t %s[][2] = {\n",
- properties[i].tablename);
+ printf("\t%s,\n", properties[i].enumname);
+ }
+ printf("};\n\n");
+
+ /* output table */
+ printf("static const struct range_list emoji_prop[] = {\n");
+ for (i = 0; i < LEN(properties); i++) {
+ printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname);
                 for (j = 0; j < properties[i].tablelen; j++) {
- printf("\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+ printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
                                properties[i].table[j].lower,
                                properties[i].table[j].upper);
                 }
- printf("};\n");
+ printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen);
         }
+ printf("};\n");
 
         return 0;
 }
diff --git a/data/grapheme_boundary.c b/data/grapheme_boundary.c
index 7ef8239..068c350 100644
--- a/data/grapheme_boundary.c
+++ b/data/grapheme_boundary.c
@@ -3,65 +3,78 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "util.h"
+#include "datautil.h"
 
 static struct {
+ char *enumname;
         char *identifier;
- char *tablename;
         struct range *table;
         size_t tablelen;
 } properties[] = {
         {
+ /* carriage return */
+ .enumname = "GB_PROP_CR",
                 .identifier = "CR",
- .tablename = "cr_table",
         },
         {
+ /* line feed */
+ .enumname = "GB_PROP_LF",
                 .identifier = "LF",
- .tablename = "lf_table",
         },
         {
+ /* control character */
+ .enumname = "GB_PROP_CONTROL",
                 .identifier = "Control",
- .tablename = "control_table",
         },
         {
+ /* grapheme extender */
+ .enumname = "GB_PROP_EXTEND",
                 .identifier = "Extend",
- .tablename = "extend_table",
         },
         {
+ /* zero width joiner */
+ .enumname = "GB_PROP_ZWJ",
                 .identifier = "ZWJ",
- .tablename = "zwj_table",
         },
         {
+ /* regional indicator */
+ .enumname = "GB_PROP_REGIONAL_INDICATOR",
                 .identifier = "Regional_Indicator",
- .tablename = "ri_table",
         },
         {
+ /* prepend character */
+ .enumname = "GB_PROP_PREPEND",
                 .identifier = "Prepend",
- .tablename = "prepend_table",
         },
         {
+ /* spacing mark */
+ .enumname = "GB_PROP_SPACINGMARK",
                 .identifier = "SpacingMark",
- .tablename = "spacingmark_table",
         },
         {
+ /* hangul syllable type L */
+ .enumname = "GB_PROP_L",
                 .identifier = "L",
- .tablename = "l_table",
         },
         {
+ /* hangul syllable type V */
+ .enumname = "GB_PROP_V",
                 .identifier = "V",
- .tablename = "v_table",
         },
         {
+ /* hangul syllable type T */
+ .enumname = "GB_PROP_T",
                 .identifier = "T",
- .tablename = "t_table",
         },
         {
+ /* hangul syllable type LV */
+ .enumname = "GB_PROP_LV",
                 .identifier = "LV",
- .tablename = "lv_table",
         },
         {
+ /* hangul syllable type LVT */
+ .enumname = "GB_PROP_LVT",
                 .identifier = "LVT",
- .tablename = "lvt_table",
         },
 };
 
@@ -97,20 +110,29 @@ main(void)
         size_t i, j;
 
         printf("/* Automatically generated by data/gbp */\n"
- "#include <stdint.h>\n");
+ "#include <stdint.h>\n\n#include \"../src/util.h\"\n\n");
 
         parse_input(process_line);
 
+ /* output enum */
+ printf("enum gb_prop {\n");
         for (i = 0; i < LEN(properties); i++) {
- printf("\nstatic const uint32_t %s[][2] = {\n",
- properties[i].tablename);
+ printf("\t%s,\n", properties[i].enumname);
+ }
+ printf("};\n\n");
+
+ /* output table */
+ printf("static const struct range_list gb_prop[] = {\n");
+ for (i = 0; i < LEN(properties); i++) {
+ printf("\t[%s] = {\n\t\t.data = (struct range[]){\n", properties[i].enumname);
                 for (j = 0; j < properties[i].tablelen; j++) {
- printf("\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+ printf("\t\t\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
                                properties[i].table[j].lower,
                                properties[i].table[j].upper);
                 }
- printf("};\n");
+ printf("\t\t},\n\t\t.len = %zu,\n\t},\n", properties[i].tablelen);
         }
+ printf("};\n");
 
         return 0;
 }
diff --git a/data/grapheme_boundary_test.c b/data/grapheme_boundary_test.c
index 02e71f1..2f3d6b4 100644
--- a/data/grapheme_boundary_test.c
+++ b/data/grapheme_boundary_test.c
@@ -5,7 +5,7 @@
 #include <string.h>
 #include <errno.h>
 
-#include "util.h"
+#include "datautil.h"
 
 struct break_test {
         uint32_t *cp;
diff --git a/src/boundary.c b/src/boundary.c
index b2d67d3..f1c03d2 100644
--- a/src/boundary.c
+++ b/src/boundary.c
@@ -6,98 +6,11 @@
 #include "../data/emoji.h"
 #include "../data/grapheme_boundary.h"
 
-#define LEN(x) (sizeof(x) / sizeof(*x))
-
 enum {
         GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
         GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
 };
 
-enum cp_property {
- PROP_CR, /* carriage return */
- PROP_LF, /* line feed */
- PROP_CONTROL, /* control character */
- PROP_EXTEND, /* grapheme extender (TODO Emoji_Modifier=Yes) */
- PROP_ZWJ, /* zero width joiner */
- PROP_RI, /* regional indicator */
- PROP_PREPEND, /* prepend character */
- PROP_SPACINGMARK, /* spacing mark */
- PROP_L, /* hangul syllable type L */
- PROP_V, /* hangul syllable type V */
- PROP_T, /* hangul syllable type T */
- PROP_LV, /* hangul syllable type LV */
- PROP_LVT, /* hangul syllable type LVT */
- PROP_EXTPICT, /* extended pictographic */
-};
-
-struct {
- const uint32_t (*table)[2];
- size_t tablelen;
-} cp_property_tables[] = {
- [PROP_CR] = {
- .table = cr_table,
- .tablelen = LEN(cr_table),
- },
- [PROP_LF] = {
- .table = lf_table,
- .tablelen = LEN(lf_table),
- },
- [PROP_CONTROL] = {
- .table = control_table,
- .tablelen = LEN(control_table),
- },
- [PROP_EXTEND] = {
- .table = extend_table,
- .tablelen = LEN(extend_table),
- },
- [PROP_ZWJ] = {
- .table = zwj_table,
- .tablelen = LEN(zwj_table),
- },
- [PROP_RI] = {
- .table = ri_table,
- .tablelen = LEN(ri_table),
- },
- [PROP_PREPEND] = {
- .table = prepend_table,
- .tablelen = LEN(prepend_table),
- },
- [PROP_SPACINGMARK] = {
- .table = spacingmark_table,
- .tablelen = LEN(spacingmark_table),
- },
- [PROP_L] = {
- .table = l_table,
- .tablelen = LEN(l_table),
- },
- [PROP_V] = {
- .table = v_table,
- .tablelen = LEN(v_table),
- },
- [PROP_T] = {
- .table = t_table,
- .tablelen = LEN(t_table),
- },
- [PROP_LV] = {
- .table = lv_table,
- .tablelen = LEN(lv_table),
- },
- [PROP_LVT] = {
- .table = lvt_table,
- .tablelen = LEN(lvt_table),
- },
- [PROP_EXTPICT] = {
- .table = extpict_table,
- .tablelen = LEN(extpict_table),
- },
-};
-
-struct cp_properties {
- uint32_t cp;
- int_least16_t determined;
- int_least16_t state;
-};
-
 static int
 cp_cmp(const void *a, const void *b)
 {
@@ -108,37 +21,25 @@ cp_cmp(const void *a, const void *b)
 }
 
 static int
-has_property(struct cp_properties *props, enum cp_property p)
+has_property(uint32_t cp, struct heisenstate *cpstate,
+ const struct range_list *proptable, int property)
 {
- if (!(props->determined & (1 << p))) {
- /* not determined yet, do a lookup and set the state */
- if (bsearch(&props->cp, cp_property_tables[p].table,
- cp_property_tables[p].tablelen,
- sizeof(*cp_property_tables[p].table),
- cp_cmp)) {
- props->state |= (1 << p);
- } else {
- props->state &= ~(1 << p);
- }
-
- /* now it's determined */
- props->determined |= (1 << p);
+ if (heisenstate_get(cpstate, property) == -1) {
+ /* state undetermined, make a lookup and set it */
+ heisenstate_set(cpstate, property, bsearch(&cp,
+ proptable[property].data,
+ proptable[property].len,
+ sizeof(*proptable[property].data),
+ cp_cmp) ? 1 : 0);
         }
 
- return (props->state & (1 << p));
+ return heisenstate_get(cpstate, property);
 }
 
 int
 grapheme_boundary(uint32_t a, uint32_t b, int *state)
 {
- struct cp_properties props[] = {
- {
- .cp = a,
- },
- {
- .cp = b,
- },
- };
+ struct heisenstate gb[2] = { 0 }, emoji[2] = { 0 };
         int s;
 
         /* skip printable ASCII */
@@ -158,8 +59,8 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state)
         /*
          * update state
          */
- if (has_property(&props[1], PROP_RI)) {
- if (has_property(&props[0], PROP_RI)) {
+ if (has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
+ if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR)) {
                         /* one more RI is on the left side of the seam */
                         s ^= GRAPHEME_STATE_RI_ODD;
                 } else {
@@ -169,22 +70,22 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state)
                 }
         }
         if (!(*state & GRAPHEME_STATE_EMOJI) &&
- ((has_property(&props[0], PROP_EXTPICT) &&
- has_property(&props[1], PROP_ZWJ)) ||
- (has_property(&props[0], PROP_EXTPICT) &&
- has_property(&props[1], PROP_EXTEND)))) {
+ ((has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) ||
+ (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) {
                 s |= GRAPHEME_STATE_EMOJI;
         } else if ((*state & GRAPHEME_STATE_EMOJI) &&
- ((has_property(&props[0], PROP_ZWJ) &&
- has_property(&props[1], PROP_EXTPICT)) ||
- (has_property(&props[0], PROP_EXTEND) &&
- has_property(&props[1], PROP_EXTEND)) ||
- (has_property(&props[0], PROP_EXTEND) &&
- has_property(&props[1], PROP_ZWJ)) ||
- (has_property(&props[0], PROP_EXTPICT) &&
- has_property(&props[1], PROP_ZWJ)) ||
- (has_property(&props[0], PROP_EXTPICT) &&
- has_property(&props[1], PROP_EXTEND)))) {
+ ((has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) &&
+ has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) ||
+ (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)) ||
+ (has_property(a, &gb[0], gb_prop, GB_PROP_EXTEND) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) ||
+ (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) ||
+ (has_property(a, &emoji[0], emoji_prop, EMOJI_PROP_EXTPICT) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND)))) {
                 /* GRAPHEME_STATE_EMOJI remains */
         } else {
                 s &= ~GRAPHEME_STATE_EMOJI;
@@ -202,75 +103,75 @@ grapheme_boundary(uint32_t a, uint32_t b, int *state)
         /* skip GB1 and GB2, as they are never satisfied here */
 
         /* GB3 */
- if (has_property(&props[0], PROP_CR) &&
- has_property(&props[1], PROP_LF)) {
+ if (has_property(a, &gb[0], gb_prop, GB_PROP_CR) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
                 return 0;
         }
 
         /* GB4 */
- if (has_property(&props[0], PROP_CONTROL) ||
- has_property(&props[0], PROP_CR) ||
- has_property(&props[0], PROP_LF)) {
+ if (has_property(a, &gb[0], gb_prop, GB_PROP_CONTROL) ||
+ has_property(a, &gb[0], gb_prop, GB_PROP_CR) ||
+ has_property(a, &gb[0], gb_prop, GB_PROP_LF)) {
                 return 1;
         }
 
         /* GB5 */
- if (has_property(&props[1], PROP_CONTROL) ||
- has_property(&props[1], PROP_CR) ||
- has_property(&props[1], PROP_LF)) {
+ if (has_property(b, &gb[1], gb_prop, GB_PROP_CONTROL) ||
+ has_property(b, &gb[1], gb_prop, GB_PROP_CR) ||
+ has_property(b, &gb[1], gb_prop, GB_PROP_LF)) {
                 return 1;
         }
 
         /* GB6 */
- if (has_property(&props[0], PROP_L) &&
- (has_property(&props[1], PROP_L) ||
- has_property(&props[1], PROP_V) ||
- has_property(&props[1], PROP_LV) ||
- has_property(&props[1], PROP_LVT))) {
+ if (has_property(a, &gb[0], gb_prop, GB_PROP_L) &&
+ (has_property(b, &gb[1], gb_prop, GB_PROP_L) ||
+ has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
+ has_property(b, &gb[1], gb_prop, GB_PROP_LV) ||
+ has_property(b, &gb[1], gb_prop, GB_PROP_LVT))) {
                 return 0;
         }
 
         /* GB7 */
- if ((has_property(&props[0], PROP_LV) ||
- has_property(&props[0], PROP_V)) &&
- (has_property(&props[1], PROP_V) ||
- has_property(&props[1], PROP_T))) {
+ if ((has_property(a, &gb[0], gb_prop, GB_PROP_LV) ||
+ has_property(a, &gb[0], gb_prop, GB_PROP_V)) &&
+ (has_property(b, &gb[1], gb_prop, GB_PROP_V) ||
+ has_property(b, &gb[1], gb_prop, GB_PROP_T))) {
                 return 0;
         }
 
         /* GB8 */
- if ((has_property(&props[0], PROP_LVT) ||
- has_property(&props[0], PROP_T)) &&
- has_property(&props[1], PROP_T)) {
+ if ((has_property(a, &gb[0], gb_prop, GB_PROP_LVT) ||
+ has_property(a, &gb[0], gb_prop, GB_PROP_T)) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_T)) {
                 return 0;
         }
 
         /* GB9 */
- if (has_property(&props[1], PROP_EXTEND) ||
- has_property(&props[1], PROP_ZWJ)) {
+ if (has_property(b, &gb[1], gb_prop, GB_PROP_EXTEND) ||
+ has_property(b, &gb[1], gb_prop, GB_PROP_ZWJ)) {
                 return 0;
         }
 
         /* GB9a */
- if (has_property(&props[1], PROP_SPACINGMARK)) {
+ if (has_property(b, &gb[1], gb_prop, GB_PROP_SPACINGMARK)) {
                 return 0;
         }
 
         /* GB9b */
- if (has_property(&props[0], PROP_PREPEND)) {
+ if (has_property(a, &gb[0], gb_prop, GB_PROP_PREPEND)) {
                 return 0;
         }
 
         /* GB11 */
         if ((s & GRAPHEME_STATE_EMOJI) &&
- has_property(&props[0], PROP_ZWJ) &&
- has_property(&props[1], PROP_EXTPICT)) {
+ has_property(a, &gb[0], gb_prop, GB_PROP_ZWJ) &&
+ has_property(b, &emoji[1], emoji_prop, EMOJI_PROP_EXTPICT)) {
                 return 0;
         }
 
         /* GB12/GB13 */
- if (has_property(&props[0], PROP_RI) &&
- has_property(&props[1], PROP_RI) &&
+ if (has_property(a, &gb[0], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
+ has_property(b, &gb[1], gb_prop, GB_PROP_REGIONAL_INDICATOR) &&
             (s & GRAPHEME_STATE_RI_ODD)) {
                 return 0;
         }
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000..662ea98
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,33 @@
+/* See LICENSE file for copyright and license details. */
+#include "util.h"
+
+int
+heisenstate_get(struct heisenstate *h, int slot)
+{
+ if (h == NULL || slot >= 16 || slot < 0 ||
+ !(h->determined & (1 << slot))) {
+ /* no state given, slot out of range or undetermined */
+ return -1;
+ } else {
+ /* slot determined, return state (0 or 1) */
+ return (h->state & (1 << slot)) ? 1 : 0;
+ }
+}
+
+int
+heisenstate_set(struct heisenstate *h, int slot, int state)
+{
+ if (h == NULL || slot >= 16 || slot < 0) {
+ /* no state given or slot out of range */
+ return 1;
+ } else {
+ h->determined |= (1 << slot);
+ if (state) {
+ h->state |= (1 << slot);
+ } else {
+ h->state &= ~(1 << slot);
+ }
+ }
+
+ return 0;
+}
diff --git a/src/util.h b/src/util.h
new file mode 100644
index 0000000..e213428
--- /dev/null
+++ b/src/util.h
@@ -0,0 +1,29 @@
+/* See LICENSE file for copyright and license details. */
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define LEN(x) (sizeof (x) / sizeof *(x))
+
+struct range {
+ uint32_t lower;
+ uint32_t upper;
+};
+
+struct range_list {
+ struct range *data;
+ size_t len;
+};
+
+/* 16-slot (0,...,15) optionally undetermined binary state */
+struct heisenstate {
+ uint_least16_t determined;
+ uint_least16_t state;
+};
+
+int heisenstate_get(struct heisenstate *, int);
+int heisenstate_set(struct heisenstate *, int, int);
+
+#endif /* UTIL_H */
Received on Fri Dec 04 2020 - 13:42:36 CET

This archive was generated by hypermail 2.3.0 : Fri Dec 04 2020 - 13:48:32 CET