[hackers] [libgrapheme] Convert GRAPHEME_STATE to uint_least16_t and remove it || Laslo Hunhold

From: <git_AT_suckless.org>
Date: Mon, 3 Oct 2022 23:28:49 +0200 (CEST)

commit fc73d06fed76dd7cde37d3704949d01391ea0032
Author: Laslo Hunhold <dev_AT_frign.de>
AuthorDate: Mon Oct 3 23:13:26 2022 +0200
Commit: Laslo Hunhold <dev_AT_frign.de>
CommitDate: Mon Oct 3 23:13:26 2022 +0200

    Convert GRAPHEME_STATE to uint_least16_t and remove it
    
    I was never quite happy with the fact that the internal state-struct
    was visible in the grapheme.h-header, given the declaration of the
    fields only namely served internal purposes and were useless noise to
    the reader.
    
    To keep it in was merely a choice made because I had always hoped
    to be able to implement maybe a few more state-based pairwise
    segmentation check functions and use the GRAPHEME_STATE type in more
    places, but now after implementing the algorithms it becomes clear that
    they all do not satisfy these pairwise semantics.
    
    The first logical step was to convert the struct to an uint_least16_t,
    which provides enough space (at least 16 bits) to store all the complete
    state, and have internal deserialiation and serialization functions.
    
    The remaining question was if the
    
            typedef uint_least16_t GRAPHEME_STATE
    
    should be removed. I took inspiration from the Linux kernel coding
    style[0], which in section 5b lays out the exact case of typedeffing
    an integer that is meant to store flags (just like in our case).
    It is argued there that there needs to be a good reason to typedef an
    integer (e.g. given it might change by architecture or maybe change in
    later versions). Both cases are not given here (we will _never_ need
    more than 16 bits to store the grapheme cluster break state and you can
    even reduce more wastage, e.g. for storing the prop which never exceeds
    4 bits given NUM_CHAR_BREAK_PROPS == 14 < 15 == 2^4-1), and I must admit
    that it improves readability a bit given you finally know what you're
    dealing with.
    
    The expression
    
            GRAPHEME_STATE state = 0;
    
    admittedly looks a little fishy, given you don't really know what happens
    behind the scenes unless you look in the header, and I want all of the
    semantics to be crystal clear to the end-user.
    
    [0]:https://www.kernel.org/doc/html/latest/process/coding-style.html#typedefs
    
    Signed-off-by: Laslo Hunhold <dev_AT_frign.de>

diff --git a/benchmark/character.c b/benchmark/character.c
index 8d7eb40..854645d 100644
--- a/benchmark/character.c
+++ b/benchmark/character.c
_AT_@ -23,7 +23,7 @@ struct break_benchmark_payload {
 void
 libgrapheme(const void *payload)
 {
- GRAPHEME_STATE state = { 0 };
+ uint_least16_t state = 0;
         const struct break_benchmark_payload *p = payload;
         size_t i;
 
diff --git a/grapheme.h b/grapheme.h
index cf7b68d..4df6551 100644
--- a/grapheme.h
+++ b/grapheme.h
_AT_@ -6,19 +6,12 @@
 #include <stddef.h>
 #include <stdint.h>
 
-typedef struct grapheme_internal_segmentation_state {
- uint_least8_t prop;
- bool prop_set;
- bool gb11_flag;
- bool gb12_13_flag;
-} GRAPHEME_STATE;
-
 #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
 
 size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
 size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
 
-bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
+bool grapheme_is_character_break(uint_least32_t, uint_least32_t, uint_least16_t *);
 
 bool grapheme_is_lowercase(const uint_least32_t *, size_t, size_t *);
 bool grapheme_is_uppercase(const uint_least32_t *, size_t, size_t *);
diff --git a/man/grapheme_is_character_break.sh b/man/grapheme_is_character_break.sh
index 995fd7a..3c0fe54 100644
--- a/man/grapheme_is_character_break.sh
+++ b/man/grapheme_is_character_break.sh
_AT_@ -8,7 +8,7 @@ cat << EOF
 .Sh SYNOPSIS
 .In grapheme.h
 .Ft size_t
-.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "GRAPHEME_STATE *state"
+.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state"
 .Sh DESCRIPTION
 The
 .Fn grapheme_is_character_break
_AT_@ -52,7 +52,7 @@ if there is not.
 int
 main(void)
 {
- GRAPHEME_STATE state = { 0 };
+ uint_least16_t state = 0;
         uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
         size_t i;
 
diff --git a/src/character.c b/src/character.c
index 4d34b98..4240332 100644
--- a/src/character.c
+++ b/src/character.c
_AT_@ -7,6 +7,13 @@
 #include "../grapheme.h"
 #include "util.h"
 
+struct character_break_state {
+ uint_least8_t prop;
+ bool prop_set;
+ bool gb11_flag;
+ bool gb12_13_flag;
+};
+
 static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
         [CHAR_BREAK_PROP_OTHER] =
                 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
_AT_@ -106,38 +113,59 @@ get_break_prop(uint_least32_t cp)
 {
         if (likely(cp <= 0x10FFFF)) {
                 return (enum char_break_property)
- char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)];
+ char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)];
         } else {
                 return CHAR_BREAK_PROP_OTHER;
         }
 }
 
+static inline void
+state_serialize(const struct character_break_state *in, uint_least16_t *out)
+{
+ *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
+ (uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | /* 9th bit */
+ (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */
+ (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); /* 11th bit */
+}
+
+static inline void
+state_deserialize(uint_least16_t in, struct character_break_state *out)
+{
+ out->prop = in & UINT8_C(0xFF);
+ out->prop_set = in & (((uint_least16_t)(1)) << 8);
+ out->gb11_flag = in & (((uint_least16_t)(1)) << 9);
+ out->gb12_13_flag = in & (((uint_least16_t)(1)) << 10);
+}
+
 bool
-grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
+grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
 {
+ struct character_break_state state;
         enum char_break_property cp0_prop, cp1_prop;
         bool notbreak = false;
 
- if (likely(state)) {
- if (likely(state->prop_set)) {
- cp0_prop = state->prop;
+ if (likely(s)) {
+ state_deserialize(*s, &state);
+
+ if (likely(state.prop_set)) {
+ cp0_prop = state.prop;
                 } else {
                         cp0_prop = get_break_prop(cp0);
                 }
                 cp1_prop = get_break_prop(cp1);
 
                 /* preserve prop of right codepoint for next iteration */
- state->prop = (uint_least8_t)cp1_prop;
- state->prop_set = true;
+ state.prop = (uint_least8_t)cp1_prop;
+ state.prop_set = true;
 
                 /* update flags */
- state->gb11_flag =
+ state.gb11_flag =
                         flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
- state->gb11_flag] &
+ state.gb11_flag] &
                         UINT16_C(1) << cp1_prop;
- state->gb12_13_flag =
+ state.gb12_13_flag =
                         flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
- state->gb12_13_flag] &
+ state.gb12_13_flag] &
                         UINT16_C(1) << cp1_prop;
 
                 /*
_AT_@ -145,17 +173,19 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
                  * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
                  */
                 notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
- (dont_break_gb11[cp0_prop + state->gb11_flag *
+ (dont_break_gb11[cp0_prop + state.gb11_flag *
                                             NUM_CHAR_BREAK_PROPS] &
                             (UINT16_C(1) << cp1_prop)) ||
- (dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
+ (dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
                                                NUM_CHAR_BREAK_PROPS] &
                             (UINT16_C(1) << cp1_prop));
 
                 /* update or reset flags (when we have a break) */
                 if (likely(!notbreak)) {
- state->gb11_flag = state->gb12_13_flag = false;
+ state.gb11_flag = state.gb12_13_flag = false;
                 }
+
+ state_serialize(&state, s);
         } else {
                 cp0_prop = get_break_prop(cp0);
                 cp1_prop = get_break_prop(cp1);
_AT_@ -178,7 +208,7 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
 static size_t
 next_character_break(HERODOTUS_READER *r)
 {
- GRAPHEME_STATE state = { 0 };
+ uint_least16_t state = 0;
         uint_least32_t cp0 = 0, cp1 = 0;
 
         for (herodotus_read_codepoint(r, true, &cp0);
Received on Mon Oct 03 2022 - 23:28:49 CEST

This archive was generated by hypermail 2.3.0 : Mon Oct 03 2022 - 23:36:35 CEST