[hackers] [libgrapheme] Replace awk-scripts with C programs for data-parsing || Laslo Hunhold from git_AT_suckless.org on 2020-10-18 (hackers mail list archive)

From: <git_AT_suckless.org>
Date: Sun, 18 Oct 2020 19:17:50 +0200 (CEST)

commit d74e91e355c37eff0ac64b8ce0e18ef587a1d333
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Sun Oct 18 19:07:17 2020 +0200
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Sun Oct 18 19:17:32 2020 +0200

    Replace awk-scripts with C programs for data-parsing

    Even though one can expect POSIX awk(1) to be present on almost all
    conceivable systems, I personally must admit that I was never
    comfortable with it and had to really bend it to support the features
    necessary for the Unicode data table parsing (most prominently,
    parsing hexadecimal numbers).

    It is common to write short awk-invocations to parse line-oriented
    data, but it hits its limits at the given scale. Much finer-grained
    control is possible in C, with the added benefit that code-reuse is
    possible and people familiar with C can now also debug the data parsing.
    All in all, it adds a few lines overall, but only marginally if you
    consider the fact that C is such a low-level language.

    As a result, libgrapheme now only needs POSIX make(1) and a C99
    compiler, while simplifying the Makefile a bit as well.

    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/Makefile b/Makefile
index af7aa66..7b4663d 100644
--- a/Makefile
+++ b/Makefile
@@ -13,18 +13,41 @@ MAN7 = man/libgrapheme.7

all: libgrapheme.a libgrapheme.so $(TEST)

+data/gbp.h: data/gbp.txt data/gbp
+data/emo.h: data/emo.txt data/emo
+data/gbt.h: data/gbt.txt data/gbt
+
+data/gbp.o: data/gbp.c config.mk data/util.h
+data/emo.o: data/emo.c config.mk data/util.h
+data/gbt.o: data/gbt.c config.mk data/util.h
+data/util.o: data/util.c config.mk data/util.h
src/boundary.o: src/boundary.c config.mk data/emo.h data/gbp.h grapheme.h
src/codepoint.o: src/codepoint.c config.mk grapheme.h
src/grapheme.o: src/grapheme.c config.mk grapheme.h
test/test.o: test/test.c config.mk data/gbt.h grapheme.h

+data/gbp: data/gbp.o data/util.o
+data/emo: data/emo.o data/util.o
+data/gbt: data/gbt.o data/util.o
test/test: test/test.o $(LIB:=.o)

-test: $(TEST)
- for m in $(TEST); do ./$$m; done
+data/gbp.txt:
+ wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
+
+data/emo.txt:
+ wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt
+
+data/gbt.txt:
+ wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest.txt
+
+$(DATA:=.h):
+ $(@:.h=) < $(@:.h=.txt) > $@
+
+$(DATA):
+ $(CC) -o $@ $(LDFLAGS) $@.o data/util.o

$(TEST):
- $(CC) -o $@ $(LDFLAGS) $< $(LIB:=.o)
+ $(CC) -o $@ $(LDFLAGS) $@.o $(LIB:=.o)

.c.o:
         $(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $<
@@ -36,34 +59,8 @@ libgrapheme.a: $(LIB:=.o)
libgrapheme.so: $(LIB:=.o)
         $(CC) -o $@ -shared $?

-data/gbp.h: data/gbp.awk data/gbp.txt
- printf "/* Automatically generated by gbp.awk */\n" > $@
- printf "#include <stdint.h>\n\n" >> $@
- awk -f data/gbp.awk data/gbp.txt >> $@
- printf "\n" >> $@
-
-data/emo.h: data/emo.awk data/emo.txt
- printf "/* Automatically generated by emo.awk */\n" > $@
- printf "#include <stdint.h>\n\n" >> $@
- awk -f data/emo.awk data/emo.txt >> $@
- printf "\n" >> $@
-
-data/gbt.h: data/gbt.awk data/gbt.txt
- printf "/* Automatically generated by gbt.awk */\n" > $@
- printf "#include <stddef.h>\n" >> $@
- printf "#include <stdint.h>\n\n" >> $@
- printf "#include \"../grapheme.h\"\n\n" >> $@
- awk -f data/gbt.awk data/gbt.txt >> $@
- printf "\n" >> $@
-
-data/gbp.txt:
- wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
-
-data/emo.txt:
- wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt
-
-data/gbt.txt:
- wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest.txt
+test: $(TEST)
+ for m in $(TEST); do ./$$m; done

install: all
         mkdir -p "$(DESTDIR)$(LIBPREFIX)"
@@ -84,7 +81,7 @@ uninstall:
         rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"

clean:
- rm -f $(DATA:=.h) $(LIB:=.o) $(TEST:=.o) $(TEST) libgrapheme.a libgrapheme.so
+ rm -f $(DATA:=.h) $(DATA:=.o) data/util.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so

clean-data:
         rm -f $(DATA:=.txt)
diff --git a/data/emo.awk b/data/emo.awk
deleted file mode 100644
index a8920d9..0000000
--- a/data/emo.awk
+++ /dev/null
@@ -1,77 +0,0 @@
-# See LICENSE file for copyright and license details.
-
-# https://www.unicode.org/Public/emoji/latest/emoji-data.txt
-BEGIN {
- FS = "[ ;]+"
-}
-
-$0 ~ /^#/ || $0 ~ /^\s*$/ { next }
-$2 == "Extended_Pictographic#" { extpicts[nextpicts++] = $1 }
-
-END {
- mktable("extpict", extpicts, nextpicts);
-}
-
-function hextonum(str) {
- str = tolower(str);
- if (substr(str, 1, 2) != "0x") {
- return -1;
- }
- str = substr(str, 3);
-
- val = 0;
- for (i = 0; i < length(str); i++) {
- dig = index("0123456789abcdef", substr(str, i + 1, 1));
-
- if (!dig) {
- return -1;
- }
-
- val = (16 * val) + (dig - 1);
- }
-
- return val;
-}
-
-function mktable(name, array, arrlen) {
- printf("\nstatic const uint32_t "name"_table[][2] = {\n");
-
- for (j = 0; j < arrlen; j++) {
- if (ind = index(array[j], "..")) {
- lower = tolower(substr(array[j], 1, ind - 1));
- upper = tolower(substr(array[j], ind + 2));
- } else {
- lower = upper = tolower(array[j]);
- }
- lower = sprintf("0x%s", lower);
- upper = sprintf("0x%s", upper);
-
- # print lower bound
- printf("\t{ UINT32_C(%s), ", lower);
-
- for (; j < arrlen - 1; j++) {
- # look ahead and check if we have adjacent arrays
- if (ind = index(array[j + 1], "..")) {
- nextlower = tolower(substr(array[j + 1],
- 1, ind - 1));
- nextupper = tolower(substr(array[j + 1],
- ind + 2));
- } else {
- nextlower = nextupper = tolower(array[j + 1]);
- }
- nextlower = sprintf("0x%s", nextlower);
- nextupper = sprintf("0x%s", nextupper);
-
- if ((hextonum(nextlower) * 1) != (hextonum(upper) + 1)) {
- break;
- } else {
- upper = nextupper;
- }
- }
-
- # print upper bound
- printf("UINT32_C(%s) },\n", upper);
- }
-
- printf("};\n");
-}
diff --git a/data/emo.c b/data/emo.c
new file mode 100644
index 0000000..a3a9e38
--- /dev/null
+++ b/data/emo.c
@@ -0,0 +1,68 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "util.h"
+
+static struct {
+ char *identifier;
+ char *tablename;
+ struct range *table;
+ size_t tablelen;
+} properties[] = {
+ {
+ .identifier = "Extended_Pictographic",
+ .tablename = "extpict_table",
+ },
+};
+
+int
+process_line(char **field, size_t nfields, char *comment)
+{
+ size_t i;
+ struct range r;
+
+ (void)comment;
+
+ if (nfields < 2) {
+ return 1;
+ }
+
+ for (i = 0; i < LEN(properties); i++) {
+ if (!strcmp(field[1], properties[i].identifier)) {
+ if (range_parse(field[0], &r)) {
+ return 1;
+ }
+ range_list_append(&(properties[i].table),
+ &(properties[i].tablelen), &r);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+int
+main(void)
+{
+ size_t i, j;
+
+ printf("/* Automatically generated by data/emo */\n"
+ "#include <stdint.h>\n");
+
+ parse_input(process_line);
+
+ for (i = 0; i < LEN(properties); i++) {
+ printf("\nstatic const uint32_t %s[][2] = {\n",
+ properties[i].tablename);
+ for (j = 0; j < properties[i].tablelen; j++) {
+ printf("\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+ properties[i].table[j].lower,
+ properties[i].table[j].upper);
+ }
+ printf("};\n");
+ }
+
+ return 0;
+}
diff --git a/data/gbp.awk b/data/gbp.awk
deleted file mode 100644
index 0847ed9..0000000
--- a/data/gbp.awk
+++ /dev/null
@@ -1,101 +0,0 @@
-# See LICENSE file for copyright and license details.
-
-# http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
-BEGIN {
- FS = "[ ;]+"
-}
-
-$0 ~ /^#/ || $0 ~ /^\s*$/ { next }
-$2 == "CR" { crs[ncrs++] = $1 }
-$2 == "LF" { lfs[nlfs++] = $1 }
-$2 == "Control" { controls[ncontrols++] = $1 }
-$2 == "Extend" { extends[nextends++] = $1 }
-$2 == "ZWJ" { zwj[nzwj++] = $1 }
-$2 == "Regional_Indicator" { ris[nris++] = $1 }
-$2 == "Prepend" { prepends[nprepends++] = $1 }
-$2 == "SpacingMark" { spacingmarks[nspacingmarks++] = $1 }
-$2 == "L" { ls[nls++] = $1 }
-$2 == "V" { vs[nvs++] = $1 }
-$2 == "T" { ts[nts++] = $1 }
-$2 == "LV" { lvs[nlvs++] = $1 }
-$2 == "LVT" { lvts[nlvts++] = $1 }
-
-END {
- mktable("cr", crs, ncrs);
- mktable("lf", lfs, nlfs);
- mktable("control", controls, ncontrols);
- mktable("extend", extends, nextends);
- mktable("zwj", zwj, nzwj);
- mktable("ri", ris, nris);
- mktable("prepend", prepends, nprepends);
- mktable("spacingmark", spacingmarks, nspacingmarks);
- mktable("l", ls, nls);
- mktable("v", vs, nvs);
- mktable("t", ts, nts);
- mktable("lv", lvs, nlvs);
- mktable("lvt", lvts, nlvts);
-}
-
-function hextonum(str) {
- str = tolower(str);
- if (substr(str, 1, 2) != "0x") {
- return -1;
- }
- str = substr(str, 3);
-
- val = 0;
- for (i = 0; i < length(str); i++) {
- dig = index("0123456789abcdef", substr(str, i + 1, 1));
-
- if (!dig) {
- return -1;
- }
-
- val = (16 * val) + (dig - 1);
- }
-
- return val;
-}
-
-function mktable(name, array, arrlen) {
- printf("static const uint32_t "name"_table[][2] = {\n");
-
- for (j = 0; j < arrlen; j++) {
- if (ind = index(array[j], "..")) {
- lower = tolower(substr(array[j], 1, ind - 1));
- upper = tolower(substr(array[j], ind + 2));
- } else {
- lower = upper = tolower(array[j]);
- }
- lower = sprintf("0x%s", lower);
- upper = sprintf("0x%s", upper);
-
- # print lower bound
- printf("\t{ UINT32_C(%s), ", lower);
-
- for (; j < arrlen - 1; j++) {
- # look ahead and check if we have adjacent arrays
- if (ind = index(array[j + 1], "..")) {
- nextlower = tolower(substr(array[j + 1],
- 1, ind - 1));
- nextupper = tolower(substr(array[j + 1],
- ind + 2));
- } else {
- nextlower = nextupper = tolower(array[j + 1]);
- }
- nextlower = sprintf("0x%s", nextlower);
- nextupper = sprintf("0x%s", nextupper);
-
- if ((hextonum(nextlower) * 1) != (hextonum(upper) + 1)) {
- break;
- } else {
- upper = nextupper;
- }
- }
-
- # print upper bound
- printf("UINT32_C(%s) },\n", upper);
- }
-
- printf("};\n");
-}
diff --git a/data/gbp.c b/data/gbp.c
new file mode 100644
index 0000000..7ef8239
--- /dev/null
+++ b/data/gbp.c
@@ -0,0 +1,116 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "util.h"
+
+static struct {
+ char *identifier;
+ char *tablename;
+ struct range *table;
+ size_t tablelen;
+} properties[] = {
+ {
+ .identifier = "CR",
+ .tablename = "cr_table",
+ },
+ {
+ .identifier = "LF",
+ .tablename = "lf_table",
+ },
+ {
+ .identifier = "Control",
+ .tablename = "control_table",
+ },
+ {
+ .identifier = "Extend",
+ .tablename = "extend_table",
+ },
+ {
+ .identifier = "ZWJ",
+ .tablename = "zwj_table",
+ },
+ {
+ .identifier = "Regional_Indicator",
+ .tablename = "ri_table",
+ },
+ {
+ .identifier = "Prepend",
+ .tablename = "prepend_table",
+ },
+ {
+ .identifier = "SpacingMark",
+ .tablename = "spacingmark_table",
+ },
+ {
+ .identifier = "L",
+ .tablename = "l_table",
+ },
+ {
+ .identifier = "V",
+ .tablename = "v_table",
+ },
+ {
+ .identifier = "T",
+ .tablename = "t_table",
+ },
+ {
+ .identifier = "LV",
+ .tablename = "lv_table",
+ },
+ {
+ .identifier = "LVT",
+ .tablename = "lvt_table",
+ },
+};
+
+int
+process_line(char **field, size_t nfields, char *comment)
+{
+ size_t i;
+ struct range r;
+
+ (void)comment;
+
+ if (nfields < 2) {
+ return 1;
+ }
+
+ for (i = 0; i < LEN(properties); i++) {
+ if (!strcmp(field[1], properties[i].identifier)) {
+ if (range_parse(field[0], &r)) {
+ return 1;
+ }
+ range_list_append(&(properties[i].table),
+ &(properties[i].tablelen), &r);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+int
+main(void)
+{
+ size_t i, j;
+
+ printf("/* Automatically generated by data/gbp */\n"
+ "#include <stdint.h>\n");
+
+ parse_input(process_line);
+
+ for (i = 0; i < LEN(properties); i++) {
+ printf("\nstatic const uint32_t %s[][2] = {\n",
+ properties[i].tablename);
+ for (j = 0; j < properties[i].tablelen; j++) {
+ printf("\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+ properties[i].table[j].lower,
+ properties[i].table[j].upper);
+ }
+ printf("};\n");
+ }
+
+ return 0;
+}
diff --git a/data/gbt.awk b/data/gbt.awk
deleted file mode 100644
index 264edd5..0000000
--- a/data/gbt.awk
+++ /dev/null
@@ -1,68 +0,0 @@
-# See LICENSE file for copyright and license details.
-
-# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
-BEGIN {
- FS = " "
-
- printf("struct test {\n\tuint32_t *cp;\n\tsize_t cplen;\n");
- printf("\tsize_t *len;\n\tsize_t lenlen;\n\tchar *descr;\n};\n\n");
- printf("static const struct test t[] = {\n");
-}
-
-$0 ~ /^#/ || $0 ~ /^\s*$/ { next }
-
-{
- ncps = 0;
- nlens = 0;
-
- curlen = 1;
- for (i = 2; i <= NF; i++) {
- if ($(i + 1) == "#") {
- break;
- }
- if (i % 2 == 0) {
- # code point
- cp[ncps++] = tolower($i);
- } else {
- # break information
- if ($i == "÷") {
- # break
- len[nlens++] = curlen;
- curlen = 1;
- } else { # $i == "×"
- # no break
- curlen++;
- }
- }
- }
- len[nlens++] = curlen;
-
- # print code points
- printf("\t{\n\t\t.cp = (uint32_t[]){ ");
- for (i = 0; i < ncps; i++) {
- printf("UINT32_C(0x%s)", cp[i]);
- if (i + 1 < ncps) {
- printf(", ");
- }
- }
- printf(" },\n\t\t.cplen = %d,\n", ncps);
-
- # print grapheme cluster lengths
- printf("\t\t.len = (size_t[]){ ");
- for (i = 0; i < nlens; i++) {
- printf("%s", len[i]);
- if (i + 1 < nlens) {
- printf(", ");
- }
- }
- printf(" },\n\t\t.lenlen = %d,\n", nlens);
-
- # print testcase description
- printf("\t\t.descr = \"%s\",\n", substr($0, index($0, "#") + 3));
-
- printf("\t},\n");
-}
-
-END {
- printf("};\n");
-}
diff --git a/data/gbt.c b/data/gbt.c
new file mode 100644
index 0000000..02e71f1
--- /dev/null
+++ b/data/gbt.c
@@ -0,0 +1,139 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "util.h"
+
+struct break_test {
+ uint32_t *cp;
+ size_t cplen;
+ size_t *len;
+ size_t lenlen;
+ char *descr;
+};
+
+static struct break_test *test = NULL;
+static size_t ntests = 0;
+
+int
+process_line(char **field, size_t nfields, char *comment)
+{
+ struct break_test *t;
+ size_t i;
+ char *token;
+
+ if (nfields < 1) {
+ return 1;
+ }
+
+ /* append new testcase and initialize with zeroes */
+ if ((test = realloc(test, ++ntests * sizeof(*test))) == NULL) {
+ fprintf(stderr, "realloc: %s\n", strerror(errno));
+ return 1;
+ }
+ t = &test[ntests - 1];
+ memset(t, 0, sizeof(*t));
+
+ /* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
+ for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
+ token = strtok(NULL, " ")) {
+ if (i % 2 == 0) {
+ /* delimiter */
+ if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
+ /*
+ * '÷' indicates a breakpoint,
+ * the current length is done; allocate
+ * a new length field and set it to 0
+ */
+ if ((t->len = realloc(t->len,
+ ++t->lenlen * sizeof(*t->len))) == NULL) {
+ fprintf(stderr, "realloc: %s\n",
+ strerror(errno));
+ return 1;
+ }
+ t->len[t->lenlen - 1] = 0;
+ } else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
+ /*
+ * '×' indicates a non-breakpoint, do nothing
+ */
+ } else {
+ fprintf(stderr, "malformed delimiter '%s'\n",
+ token);
+ return 1;
+ }
+ } else {
+ /* add code point to cp-array */
+ if ((t->cp = realloc(t->cp, ++t->cplen *
+ sizeof(*t->cp))) == NULL) {
+ fprintf(stderr, "realloc: %s\n", strerror(errno));
+ return 1;
+ }
+ if (cp_parse(token, &t->cp[t->cplen - 1])) {
+ return 1;
+ }
+ if (t->lenlen > 0) {
+ t->len[t->lenlen - 1]++;
+ }
+ }
+ }
+ if (t->len[t->lenlen - 1] == 0) {
+ /* we allocated one more length than we needed */
+ t->lenlen--;
+ }
+
+ /* store comment */
+ if ((test[ntests - 1].descr = strdup(comment)) == NULL) {
+ fprintf(stderr, "strdup: %s\n", strerror(errno));
+ return 1;
+ }
+
+ return 0;
+}
+
+int
+main(void)
+{
+ size_t i, j;
+
+ printf("/* Automatically generated by data/gbt */\n"
+ "#include <stdint.h>\n#include <stddef.h>\n\n");
+
+ parse_input(process_line);
+
+ printf("static const struct break_test {\n\tuint32_t *cp;\n"
+ "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
+ "\tchar *descr;\n} t[] = {\n");
+ for (i = 0; i < ntests; i++) {
+ printf("\t{\n");
+
+ printf("\t\t.cp = (uint32_t[]){");
+ for (j = 0; j < test[i].cplen; j++) {
+ printf(" UINT32_C(0x%06X)", test[i].cp[j]);
+ if (j + 1 < test[i].cplen) {
+ putchar(',');
+ }
+ }
+ printf(" },\n");
+ printf("\t\t.cplen = %zu,\n", test[i].cplen);
+
+ printf("\t\t.len = (size_t[]){");
+ for (j = 0; j < test[i].lenlen; j++) {
+ printf(" %zu", test[i].len[j]);
+ if (j + 1 < test[i].lenlen) {
+ putchar(',');
+ }
+ }
+ printf(" },\n");
+ printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
+
+ printf("\t\t.descr = \"%s\",\n", test[i].descr);
+
+ printf("\t},\n");
+ }
+ printf("};\n");
+
+ return 0;
+}
diff --git a/data/util.c b/data/util.c
new file mode 100644
index 0000000..b950dbd
--- /dev/null
+++ b/data/util.c
@@ -0,0 +1,159 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include "util.h"
+
+void
+parse_input(int (*process_line)(char **, size_t, char *))
+{
+ char *line = NULL, **field = NULL, *comment;
+ size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields;
+ ssize_t len;
+
+ while ((len = getline(&line, &linebufsize, stdin)) >= 0) {
+ /* remove trailing newline */
+ if (len > 0 && line[len - 1] == '\n') {
+ line[len - 1] = '\0';
+ len--;
+ }
+
+ /* skip empty lines and comment lines */
+ if (len == 0 || line[0] == '#') {
+ continue;
+ }
+
+ /* tokenize line into fields */
+ for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
+ /* extend field buffer, if necessary */
+ if (++nfields > fieldbufsize) {
+ if ((field = realloc(field, nfields *
+ sizeof(*field))) == NULL) {
+ fprintf(stderr, "realloc: %s\n", strerror(errno));
+ exit(1);
+ }
+ fieldbufsize = nfields;
+ }
+
+ /* skip leading whitespace */
+ while (line[i] == ' ') {
+ i++;
+ }
+
+ /* set current position as field start */
+ field[nfields - 1] = &line[i];
+
+ /* continue until we reach ';' or '#' or end */
+ while (line[i] != ';' && line[i] != '#' &&
+ line[i] != '\0') {
+ i++;
+ }
+ if (line [i] == '#') {
+ /* set comment-variable for later */
+ comment = &line[i + 1];
+ }
+
+ /* go back whitespace and terminate field there */
+ if (i > 0) {
+ for (j = i - 1; line[j] == ' '; j--)
+ ;
+ line[j + 1] = '\0';
+ } else {
+ line[i] = '\0';
+ }
+
+ /* if comment is set, we are done */
+ if (comment != NULL) {
+ break;
+ }
+ }
+
+ /* skip leading whitespace in comment */
+ while (comment != NULL && comment[0] == ' ') {
+ comment++;
+ }
+
+ /* call line processing function */
+ if (process_line(field, nfields, comment)) {
+ exit(1);
+ }
+ }
+
+ free(line);
+ free(field);
+}
+
+static int
+valid_hexstring(const char *str)
+{
+ const char *p = str;
+
+ while ((*p >= '0' && *p <= '9') ||
+ (*p >= 'a' && *p <= 'f') ||
+ (*p >= 'A' && *p <= 'F')) {
+ p++;
+ }
+
+ if (*p != '\0') {
+ fprintf(stderr, "invalid code point range '%s'\n", str);
+ return 0;
+ }
+
+ return 1;
+}
+
+int
+cp_parse(const char *str, uint32_t *cp)
+{
+ if (!valid_hexstring(str)) {
+ return 1;
+ }
+ *cp = strtol(str, NULL, 16);
+
+ return 0;
+}
+
+int
+range_parse(const char *str, struct range *range)
+{
+ char *p;
+
+ if ((p = strstr(str, "..")) == NULL) {
+ /* input has the form "XXXXXX" */
+ if (!valid_hexstring(str)) {
+ return 1;
+ }
+ range->lower = range->upper = strtol(str, NULL, 16);
+ } else {
+ /* input has the form "XXXXXX..XXXXXX" */
+ *p = '\0';
+ p += 2;
+ if (!valid_hexstring(str) || !valid_hexstring(p)) {
+ return 1;
+ }
+ range->lower = strtol(str, NULL, 16);
+ range->upper = strtol(p, NULL, 16);
+ }
+
+ return 0;
+}
+
+void
+range_list_append(struct range **range, size_t *nranges, const struct range *new)
+{
+ if (*nranges > 0 && (*range)[*nranges - 1].upper == new->lower) {
+ /* we can merge with previous entry */
+ (*range)[*nranges - 1].upper = new->upper;
+ } else {
+ /* need to append new entry */
+ if ((*range = realloc(*range, (++(*nranges)) * sizeof(**range))) == NULL) {
+ fprintf(stderr, "realloc: %s\n", strerror(errno));
+ exit(1);
+ }
+ (*range)[*nranges - 1].lower = new->lower;
+ (*range)[*nranges - 1].upper = new->upper;
+ }
+}
diff --git a/data/util.h b/data/util.h
new file mode 100644
index 0000000..e84eb01
--- /dev/null
+++ b/data/util.h
@@ -0,0 +1,20 @@
+/* See LICENSE file for copyright and license details. */
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define LEN(x) (sizeof (x) / sizeof *(x))
+
+struct range {
+ uint32_t lower;
+ uint32_t upper;
+};
+
+void parse_input(int (*process_line)(char **, size_t, char *));
+int cp_parse(const char *, uint32_t *);
+int range_parse(const char *, struct range *);
+void range_list_append(struct range **, size_t *, const struct range *);
+
+#endif /* UTIL_H */
Received on Sun Oct 18 2020 - 19:17:50 CEST

This archive was generated by hypermail 2.3.0 : Sun Oct 18 2020 - 19:24:34 CEST