commit 7981a5db713073992d00ee2231b88558977671aa
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Fri Dec 10 20:30:21 2021 +0100
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Fri Dec 10 20:30:21 2021 +0100
Severely speed up grapheme cluster break detection
Instead of discarding all properties we determined for two adjacent
codepoints, we use the given state (we have to have for the flags) to
shift the properties of the right character and move them to the left
character. This change leads to a speedup of ~25%.
On the API, it changes the "int *" state argument to an opaque type
"LG_SEGMENTATION_STATE *". Any normal library would just define an
internal struct and export a function "init_state" or something and
be done with it, but I didn't like the idea of having heap-allocations
in libgrapheme and wanted to make it possible to just do
LG_SEGMENTATION_STATE state;
lg_grapheme_isbreak(a, b, &state);
but have also the choice to do
LG_SEGMENTATION_STATE *state;
state = malloc(sizeof(*state));
lg_grapheme_isbreak(a, b, &state);
whatever is preferred or makes sense in the context.
Doing this is pretty difficult, because allowing C to know the storage
size of a type through the API is only possible if it knows the
implementation details. My first attempt was to define a struct with
identical structure in grapheme.h (only with "anonymous" structs
rather than the internal heisenstate-structs), but after consulting
the standard this is undefined behaviour and breaks strict aliasing
rules. So I bit the bullet and gave the definition in the header,
indicating with "internal" that these types should not be used.
In most cases, users will probably never get in contact with this data
structure, as lg_grapheme_nextbreak() is still as clean as before.
Signed-off-by: Laslo Hunhold <dev_AT_frign.de>
diff --git a/grapheme.h b/grapheme.h
index cdd2599..6a6fe4f 100644
--- a/grapheme.h
+++ b/grapheme.h
_AT_@ -5,12 +5,24 @@
#include <stddef.h>
#include <stdint.h>
+struct lg_internal_heisenstate {
+ uint_least64_t determined;
+ uint_least64_t state;
+};
+
+typedef struct lg_internal_segmentation_state {
+ struct lg_internal_heisenstate a;
+ struct lg_internal_heisenstate b;
+ uint_least16_t flags;
+} LG_SEGMENTATION_STATE;
+
#define LG_CODEPOINT_INVALID UINT32_C(0xFFFD)
+size_t lg_grapheme_nextbreak(const char *);
+
+int lg_grapheme_isbreak(uint32_t, uint32_t, LG_SEGMENTATION_STATE *);
+
size_t lg_utf8_decode(const uint8_t *, size_t, uint32_t *);
size_t lg_utf8_encode(uint32_t, uint8_t *, size_t);
-size_t lg_grapheme_nextbreak(const char *);
-int lg_grapheme_isbreak(uint32_t, uint32_t, int *);
-
#endif /* GRAPHEME_H */
diff --git a/src/grapheme.c b/src/grapheme.c
index ab487af..2feaa2f 100644
--- a/src/grapheme.c
+++ b/src/grapheme.c
_AT_@ -1,73 +1,79 @@
/* See LICENSE file for copyright and license details. */
#include <stddef.h>
#include <stdlib.h>
+#include <string.h>
#include "../gen/grapheme.h"
#include "../grapheme.h"
+#include "util.h"
enum {
- GRAPHEME_STATE_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
- GRAPHEME_STATE_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
+ GRAPHEME_FLAG_RI_ODD = 1 << 0, /* odd number of RI's before the seam */
+ GRAPHEME_FLAG_EMOJI = 1 << 1, /* within emoji modifier or zwj sequence */
};
int
-lg_grapheme_isbreak(uint32_t a, uint32_t b, int *state)
+lg_grapheme_isbreak(uint32_t a, uint32_t b, LG_SEGMENTATION_STATE *state)
{
- struct heisenstate prop[2] = { 0 };
- int s;
+ struct lg_internal_heisenstate *p[2] = { 0 };
+ int ret = 1, flags = 0;
+
+ /* set state depending on state pointer */
+ if (state != NULL) {
+ p[0] = &(state->a);
+ p[1] = &(state->b);
+ flags = state->flags;
+ }
/* skip printable ASCII */
if ((a >= 0x20 && a <= 0x7E) &&
(b >= 0x20 && b <= 0x7E)) {
- return 1;
+ goto hasbreak;
}
- /* set internal state based on given state-pointer */
- s = (state != NULL) ? *state : 0;
-
/*
* Apply grapheme cluster breaking algorithm (UAX #29), see
*
http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
*/
/*
- * update state
+ * update flags, if state-pointer given
*/
- if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
- if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
- /* one more RI is on the left side of the seam */
- s ^= GRAPHEME_STATE_RI_ODD;
+ if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
+ if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR)) {
+ /* one more RI is on the left side of the seam, flip state */
+ flags ^= GRAPHEME_FLAG_RI_ODD;
} else {
/* an RI appeared on the right side but the left
- side is not an RI, reset state (0 is even) */
- s &= ~GRAPHEME_STATE_RI_ODD;
+ side is not an RI, reset state (number 0 is even) */
+ flags &= ~GRAPHEME_FLAG_RI_ODD;
}
}
- if (!(*state & GRAPHEME_STATE_EMOJI) &&
- ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
- (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
- s |= GRAPHEME_STATE_EMOJI;
- } else if ((*state & GRAPHEME_STATE_EMOJI) &&
- ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) ||
- (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) ||
- (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
- (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
- (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
- /* GRAPHEME_STATE_EMOJI remains */
+ if (!(flags & GRAPHEME_FLAG_EMOJI) &&
+ ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+ (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
+ flags |= GRAPHEME_FLAG_EMOJI;
+ } else if ((flags & GRAPHEME_FLAG_EMOJI) &&
+ ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) ||
+ (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)) ||
+ (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTEND) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+ (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) ||
+ (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND)))) {
+ /* GRAPHEME_FLAG_EMOJI remains */
} else {
- s &= ~GRAPHEME_STATE_EMOJI;
+ flags &= ~GRAPHEME_FLAG_EMOJI;
}
- /* write updated state to state-pointer, if given */
+ /* write updated flags to state, if given */
if (state != NULL) {
- *state = s;
+ state->flags = flags;
}
/*
_AT_@ -77,81 +83,97 @@ lg_grapheme_isbreak(uint32_t a, uint32_t b, int *state)
/* skip GB1 and GB2, as they are never satisfied here */
/* GB3 */
- if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) {
- return 0;
+ if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) {
+ goto nobreak;
}
/* GB4 */
- if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
- has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_CR) ||
- has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_LF)) {
- return 1;
+ if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
+ has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_CR) ||
+ has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_LF)) {
+ goto hasbreak;
}
/* GB5 */
- if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_CR) ||
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_LF)) {
- return 1;
+ if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CONTROL) ||
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_CR) ||
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_LF)) {
+ goto hasbreak;
}
/* GB6 */
- if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) &&
- (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) ||
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) {
- return 0;
+ if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_L) &&
+ (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_L) ||
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
+
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT))) {
+ goto nobreak;
}
/* GB7 */
- if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
- has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) &&
- (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) {
- return 0;
+ if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LV) ||
+ has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_V)) &&
+ (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_V) ||
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T))) {
+ goto nobreak;
}
/* GB8 */
- if ((has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) ||
- has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) {
- return 0;
+ if ((has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_LVT) ||
+ has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_HANGUL_T)) {
+ goto nobreak;
}
/* GB9 */
- if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTEND) ||
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) {
- return 0;
+ if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTEND) ||
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_ZWJ)) {
+ goto nobreak;
}
/* GB9a */
- if (has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) {
- return 0;
+ if (has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_SPACINGMARK)) {
+ goto nobreak;
}
/* GB9b */
- if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) {
- return 0;
+ if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_PREPEND)) {
+ goto nobreak;
}
/* GB11 */
- if ((s & GRAPHEME_STATE_EMOJI) &&
- has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) {
- return 0;
+ if ((flags & GRAPHEME_FLAG_EMOJI) &&
+ has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_ZWJ) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_EXTENDED_PICTOGRAPHIC)) {
+ goto nobreak;
}
/* GB12/GB13 */
- if (has_property(a, &prop[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
- has_property(b, &prop[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
- (s & GRAPHEME_STATE_RI_ODD)) {
- return 0;
+ if (has_property(a, p[0], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
+ has_property(b, p[1], grapheme_prop, GRAPHEME_PROP_REGIONAL_INDICATOR) &&
+ (flags & GRAPHEME_FLAG_RI_ODD)) {
+ goto nobreak;
}
/* GB999 */
- return 1;
+ goto hasbreak;
+nobreak:
+ ret = 0;
+hasbreak:
+ if (state != NULL) {
+ /* move b-state to a-state, discard b-state */
+ memcpy(&(state->a), &(state->b), sizeof(state->a));
+ memset(&(state->b), 0, sizeof(state->b));
+
+ /* reset flags */
+ if (ret == 1) {
+ state->flags = 0;
+ }
+ }
+
+ return ret;
}
size_t
_AT_@ -159,7 +181,7 @@ lg_grapheme_nextbreak(const char *str)
{
uint32_t cp0, cp1;
size_t ret, len = 0;
- int state = 0;
+ LG_SEGMENTATION_STATE state = { 0 };
if (str == NULL) {
return 0;
diff --git a/src/utf8.c b/src/utf8.c
index 2d3cd28..4488359 100644
--- a/src/utf8.c
+++ b/src/utf8.c
_AT_@ -1,9 +1,10 @@
/* See LICENSE file for copyright and license details. */
-#include "../grapheme.h"
#include <stdio.h>
+#include "../grapheme.h"
+#include "util.h"
+
#define BETWEEN(c, l, u) (c >= l && c <= u)
-#define LEN(x) (sizeof(x) / sizeof(*x))
/* lookup-table for the types of sequence first bytes */
static const struct {
diff --git a/src/util.c b/src/util.c
index 955cdad..5bbe926 100644
--- a/src/util.c
+++ b/src/util.c
_AT_@ -2,10 +2,13 @@
#include <stdint.h>
#include <stdlib.h>
+#include "../grapheme.h"
#include "util.h"
+/* 64-slot (0,...,63) optionally undetermined binary state */
+
int
-heisenstate_get(struct heisenstate *h, int slot)
+heisenstate_get(struct lg_internal_heisenstate *h, int slot)
{
if (h == NULL || slot >= 64 || slot < 0 ||
!(h->determined & (1 << slot))) {
_AT_@ -18,7 +21,7 @@ heisenstate_get(struct heisenstate *h, int slot)
}
int
-heisenstate_set(struct heisenstate *h, int slot, int state)
+heisenstate_set(struct lg_internal_heisenstate *h, int slot, int state)
{
if (h == NULL || slot >= 64 || slot < 0) {
/* no state given or slot out of range */
_AT_@ -45,17 +48,23 @@ cp_cmp(const void *a, const void *b)
}
int
-has_property(uint32_t cp, struct heisenstate *cpstate,
+has_property(uint32_t cp, struct lg_internal_heisenstate *cpstate,
const struct range_list *proptable, int property)
{
- if (heisenstate_get(cpstate, property) == -1) {
- /* state undetermined, make a lookup and set it */
- heisenstate_set(cpstate, property, bsearch(&cp,
- proptable[property].data,
- proptable[property].len,
- sizeof(*proptable[property].data),
- cp_cmp) ? 1 : 0);
+ int res;
+
+ if (cpstate == NULL ||
+ (res = heisenstate_get(cpstate, property)) == -1) {
+ /* make a lookup */
+ res = bsearch(&cp, proptable[property].data,
+ proptable[property].len,
+ sizeof(*proptable[property].data), cp_cmp) ?
+ 1 : 0;
+
+ if (cpstate != NULL) {
+ heisenstate_set(cpstate, property, res);
+ }
}
- return heisenstate_get(cpstate, property);
+ return res;
}
diff --git a/src/util.h b/src/util.h
index e480da0..3a9cccd 100644
--- a/src/util.h
+++ b/src/util.h
_AT_@ -5,7 +5,9 @@
#include <stddef.h>
#include <stdint.h>
-#define LEN(x) (sizeof (x) / sizeof *(x))
+#include "../grapheme.h"
+
+#define LEN(x) (sizeof(x) / sizeof(*(x)))
struct range {
uint32_t lower;
_AT_@ -17,16 +19,10 @@ struct range_list {
size_t len;
};
-/* 64-slot (0,...,63) optionally undetermined binary state */
-struct heisenstate {
- uint_least64_t determined;
- uint_least64_t state;
-};
-
-int heisenstate_get(struct heisenstate *, int);
-int heisenstate_set(struct heisenstate *, int, int);
+int heisenstate_get(struct lg_internal_heisenstate *, int);
+int heisenstate_set(struct lg_internal_heisenstate *, int, int);
-int has_property(uint32_t, struct heisenstate *,
+int has_property(uint32_t, struct lg_internal_heisenstate *,
const struct range_list *, int);
#endif /* UTIL_H */
diff --git a/test/grapheme-performance.c b/test/grapheme-performance.c
index b03f663..30fdc3a 100644
--- a/test/grapheme-performance.c
+++ b/test/grapheme-performance.c
_AT_@ -2,12 +2,13 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <time.h>
#include "../grapheme.h"
#include "../gen/grapheme-test.h"
-#define LEN(x) (sizeof(x) / sizeof(*x))
+#define LEN(x) (sizeof(x) / sizeof(*(x)))
#define NUM_ITERATIONS 10000
int64_t time_diff(struct timespec *a, struct timespec *b)
_AT_@ -22,7 +23,7 @@ main(void)
struct timespec start, end;
size_t i, j, bufsiz, off;
uint32_t *buf;
- int state;
+ LG_SEGMENTATION_STATE state;
double cp_per_sec;
/* allocate and generate buffer */
_AT_@ -46,6 +47,7 @@ main(void)
clock_gettime(CLOCK_MONOTONIC, &start);
for (i = 0; i < NUM_ITERATIONS; i++) {
+ memset(&state, 0, sizeof(state));
for (j = 0; j < bufsiz - 1; j++) {
(void)lg_grapheme_isbreak(buf[j], buf[j+1], &state);
}
diff --git a/test/grapheme.c b/test/grapheme.c
index 1cb5ad0..8a6a68a 100644
--- a/test/grapheme.c
+++ b/test/grapheme.c
_AT_@ -7,17 +7,18 @@
#include "../grapheme.h"
#include "../gen/grapheme-test.h"
-#define LEN(x) (sizeof(x) / sizeof(*x))
+#define LEN(x) (sizeof(x) / sizeof(*(x)))
int
main(void)
{
- int state;
+ LG_SEGMENTATION_STATE state;
size_t i, j, k, len, failed;
/* grapheme break test */
for (i = 0, failed = 0; i < LEN(grapheme_test); i++) {
- for (j = 0, k = 0, state = 0, len = 1; j < grapheme_test[i].cplen; j++) {
+ memset(&state, 0, sizeof(state));
+ for (j = 0, k = 0, len = 1; j < grapheme_test[i].cplen; j++) {
if ((j + 1) == grapheme_test[i].cplen ||
lg_grapheme_isbreak(grapheme_test[i].cp[j],
grapheme_test[i].cp[j + 1],
diff --git a/test/utf8-decode.c b/test/utf8-decode.c
index a9331af..1182fb0 100644
--- a/test/utf8-decode.c
+++ b/test/utf8-decode.c
_AT_@ -6,7 +6,7 @@
#include "../grapheme.h"
-#define LEN(x) (sizeof(x) / sizeof(*x))
+#define LEN(x) (sizeof(x) / sizeof(*(x)))
static const struct {
uint8_t *arr; /* UTF-8 byte sequence */
diff --git a/test/utf8-encode.c b/test/utf8-encode.c
index 0125b2a..2f978d2 100644
--- a/test/utf8-encode.c
+++ b/test/utf8-encode.c
_AT_@ -6,7 +6,7 @@
#include "../grapheme.h"
-#define LEN(x) (sizeof(x) / sizeof(*x))
+#define LEN(x) (sizeof(x) / sizeof(*(x)))
static const struct {
uint32_t cp; /* input code point */
Received on Fri Dec 10 2021 - 20:41:29 CET