[hackers] [libgrapheme] Split bidi-level-processing into preprocessing and line step || Laslo Hunhold
commit 5aaa6b99856e3cf29864a5d2c5aecc4206d495d3
Author: Laslo Hunhold <laslo_AT_hunhold.de>
AuthorDate: Mon Nov 21 08:53:14 2022 +0100
Commit: Laslo Hunhold <laslo_AT_hunhold.de>
CommitDate: Mon Nov 21 08:53:14 2022 +0100
Split bidi-level-processing into preprocessing and line step
The bidirectional algorithm is a bit convoluted in this regard,
but the canonical choice for an implementation is to preprocess
all paragraphs first (applying all rules up to L1.3) and to apply
rule L1.4 separately.
The reason for this is that rule L1.4 requires knowledge of the
line break positions, which we don't have (yet). We could take
them as a parameter of the preprocessing function; however, line
breaks may change often (think of an ncurses context with window
resizes), making a complete reprocessing on every change very
wasteful.
Thus, the line-specific processing is put into a separate
function. This way, the user passes each individual line together
with its preprocessing data.
Rule L1.4 will be implemented in a later commit.
diff --git a/grapheme.h b/grapheme.h
index fbb8e4a..f8d39bd 100644
--- a/grapheme.h
+++ b/grapheme.h
@@ -15,16 +15,26 @@ enum grapheme_bidirectional_override {
GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL,
};
-size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
-size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
+void grapheme_bidirectional_get_line_embedding_levels(
+ const int_least32_t *, size_t, int_least8_t *);
-size_t grapheme_get_bidirectional_embedding_levels(
+size_t grapheme_bidirectional_preprocess(
const uint_least32_t *, size_t, enum grapheme_bidirectional_override,
int_least32_t *, size_t);
-size_t grapheme_get_bidirectional_embedding_levels_utf8(
+size_t grapheme_bidirectional_preprocess_utf8(
const char *, size_t, enum grapheme_bidirectional_override,
int_least32_t *, size_t);
+size_t grapheme_bidirectional_reorder_line(
+ const uint_least32_t *, const int_least8_t *, size_t,
+ uint_least32_t *, size_t);
+size_t grapheme_bidirectional_reorder_line_utf8(
+ const char *, const int_least8_t *, size_t,
+ char *, size_t);
+
+size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
+size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
+
bool grapheme_is_character_break(uint_least32_t, uint_least32_t,
uint_least16_t *);
diff --git a/src/bidirectional.c b/src/bidirectional.c
index fa46853..be19a59 100644
--- a/src/bidirectional.c
+++ b/src/bidirectional.c
@@ -385,8 +385,8 @@ ir_advance(struct isolate_runner *ir)
}
static size_t
-process_isolating_run_sequence(int_least32_t *buf, size_t buflen, size_t off,
- uint_least8_t paragraph_level)
+preprocess_isolating_run_sequence(int_least32_t *buf, size_t buflen, size_t off,
+ uint_least8_t paragraph_level)
{
enum bidi_property sequence_prop, prop;
struct isolate_runner ir, tmp;
@@ -652,8 +652,8 @@ get_paragraph_level(enum grapheme_bidirectional_override override,
}
static void
-get_paragraph_embedding_levels(enum grapheme_bidirectional_override override,
- int_least32_t *buf, size_t buflen)
+preprocess_paragraph(enum grapheme_bidirectional_override override,
+ int_least32_t *buf, size_t buflen)
{
enum bidi_property prop;
int_least8_t level;
@@ -920,7 +920,7 @@ again:
for (bufoff = 0; bufoff < buflen; bufoff++) {
if (get_state(STATE_VISITED, buf[bufoff]) == 0 &&
get_state(STATE_LEVEL, buf[bufoff]) != -1) {
- bufoff += process_isolating_run_sequence(
+ bufoff += preprocess_isolating_run_sequence(
buf, buflen, bufoff, paragraph_level);
}
}
@@ -964,6 +964,12 @@ again:
continue;
}
+ /* rules 1 and 2 */
+ if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
+ set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
+ }
+
+ /* rule 3 */
if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
prop == BIDI_PROP_PDI) {
@@ -971,8 +977,12 @@ again:
/* a new run has begun */
runsince = bufoff;
}
- } else if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
- /* L1.4 -- ignored for now, < beachten! */
+ } else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) &&
+ runsince != SIZE_MAX) {
+ /*
+ * we hit a segment or paragraph separator in a
+ * sequence, reset sequence-levels
+ */
for (i = runsince; i < bufoff; i++) {
if (get_state(STATE_LEVEL, buf[i]) != -1) {
set_state(STATE_LEVEL, paragraph_level,
@@ -984,11 +994,6 @@ again:
/* sequence ended */
runsince = SIZE_MAX;
}
-
- if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
- set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
- }
- continue;
}
if (runsince != SIZE_MAX) {
/*
@@ -1027,9 +1032,9 @@ get_bidi_bracket_off(uint_least32_t cp)
}
static size_t
-get_embedding_levels(HERODOTUS_READER *r,
- enum grapheme_bidirectional_override override,
- int_least32_t *buf, size_t buflen)
+preprocess(HERODOTUS_READER *r,
+ enum grapheme_bidirectional_override override,
+ int_least32_t *buf, size_t buflen)
{
size_t bufoff, bufsize, lastparoff;
uint_least32_t cp;
@@ -1086,16 +1091,11 @@ get_embedding_levels(HERODOTUS_READER *r,
* the terminating character or last character of the
* string respectively
*/
- get_paragraph_embedding_levels(override, buf + lastparoff,
- bufoff + 1 - lastparoff);
+ preprocess_paragraph(override, buf + lastparoff,
+ bufoff + 1 - lastparoff);
lastparoff = bufoff + 1;
}
- /* bake the levels into the buffer, discarding the metadata */
- for (bufoff = 0; bufoff < bufsize; bufoff++) {
- buf[bufoff] = get_state(STATE_LEVEL, buf[bufoff]);
- }
-
/*
* we return the number of total bytes read, as the function
* should indicate if the given level-buffer is too small
@@ -1104,7 +1104,7 @@ get_embedding_levels(HERODOTUS_READER *r,
}
size_t
-grapheme_get_bidirectional_embedding_levels(
+grapheme_bidirectional_preprocess(
const uint_least32_t *src, size_t srclen,
enum grapheme_bidirectional_override override, int_least32_t *dest,
size_t destlen)
@@ -1113,11 +1113,11 @@ grapheme_get_bidirectional_embedding_levels(
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
- return get_embedding_levels(&r, override, dest, destlen);
+ return preprocess(&r, override, dest, destlen);
}
size_t
-grapheme_get_bidirectional_embedding_levels_utf8(
+grapheme_bidirectional_preprocess_utf8(
const char *src, size_t srclen,
enum grapheme_bidirectional_override override, int_least32_t *dest,
size_t destlen)
@@ -1126,5 +1126,17 @@ grapheme_get_bidirectional_embedding_levels_utf8(
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
- return get_embedding_levels(&r, override, dest, destlen);
+ return preprocess(&r, override, dest, destlen);
+}
+
+void
+grapheme_bidirectional_get_line_embedding_levels(
+ const int_least32_t *linedata, size_t linelen, int_least8_t *linelevel)
+{
+ size_t i;
+
+ /* write the levels into the level-array */
+ for (i = 0; i < linelen; i++) {
+ linelevel[i] = get_state(STATE_LEVEL, linedata[i]);
+ }
}
diff --git a/test/bidirectional.c b/test/bidirectional.c
index 9240b9d..c32cacd 100644
--- a/test/bidirectional.c
+++ b/test/bidirectional.c
@@ -12,10 +12,11 @@
int
main(int argc, char *argv[])
{
- int_least32_t lev[512]; /* TODO iterate and get max, allocate */
- size_t i, num_tests, failed, levlen, ret, j, m;
+ int_least32_t data[512]; /* TODO iterate and get max, allocate */
+ int_least8_t lev[512];
+ size_t i, num_tests, failed, datalen, ret, j, m;
- levlen = LEN(lev);
+ datalen = LEN(data);
(void)argc;
@@ -28,13 +29,15 @@ main(int argc, char *argv[])
continue;*/
for (m = 0; m < bidirectional_test[i].modelen; m++) {
- ret = grapheme_get_bidirectional_embedding_levels(
+ ret = grapheme_bidirectional_preprocess(
bidirectional_test[i].cp,
bidirectional_test[i].cplen,
- bidirectional_test[i].mode[m], lev, levlen);
+ bidirectional_test[i].mode[m], data, datalen);
+ grapheme_bidirectional_get_line_embedding_levels(
+ data, datalen, lev);
if (ret != bidirectional_test[i].cplen ||
- ret > levlen) {
+ ret > datalen) {
goto err;
}
Received on Mon Nov 21 2022 - 09:00:34 CET
This archive was generated by hypermail 2.3.0
: Mon Nov 21 2022 - 09:00:50 CET