Re: [hackers] [PATCH][dmenu/libsl] render invalid utf8 bytes as U+FFFD

From: Hiltjo Posthuma <hiltjo_AT_codemadness.org>
Date: Sun, 14 Jul 2024 11:47:56 +0200

Hi NRK,

Thanks it looks good, the patches are now pushing into the dmenu repo.

I pushed a very small style patch for the recursion check also.

While reviewing I noticed: in the future we might want to change the check for
ellipsis_width and invalid_width. I think there might be issues with the
caching if we switch the fontset/fonts (but that would also currently be an
issue).

After some time of crowd-testing I will sync the drw.{c,h} changes to libsl and
dwm.

On Fri, Jul 05, 2024 at 02:40:45AM +0000, NRK wrote:
> This is a follow up to a discussion from the dev list:
> https://lists.suckless.org/dev/2407/35646.html
>
> I've separated the patch into two halves.
>
> 1. Replaces the current utf8decoder with a shorter and more direct one.
> It reports back errors and how much to advance on error (needed for the
> next patch)
> 2. Detects invalid utf8 bytes and renders them as U+FFFD instead of
> passing invalid bytes to xft which cuts it off.
>
> If keeping the current utf8 decoder is important, I can look into how to
> change it so that it can return error information back to the caller.
> But IMHO the current decoder is unnecessarily overcomplicated for
> libsl's need.
>
> The interaction between invalid utf8 sequence and ellipsis is a bit
> finicky and could use some extra eyes. But so far it has worked well in
> my testing.
>
> - NRK

> From 7208684792d36efe262d2272f84e7d6e729fd3e7 Mon Sep 17 00:00:00 2001
> From: NRK <nrk_AT_disroot.org>
> Date: Thu, 4 Jul 2024 21:25:37 +0000
> Subject: [PATCH 1/2] overhaul utf8decode()
>
> this changes the utf8decode function to:
>
> * report when an error occurs
> * report how many bytes to advance on error
>
> these will be useful in the next commit to render invalid utf8
> sequences.
>
> the new implementation is also shorter and more direct.
> ---
> drw.c | 76 ++++++++++++++++++++++++-----------------------------------
> 1 file changed, 31 insertions(+), 45 deletions(-)
>
> diff --git a/drw.c b/drw.c
> index 78a2b27..eb71da7 100644
> --- a/drw.c
> +++ b/drw.c
> _AT_@ -9,54 +9,40 @@
> #include "util.h"
>
> #define UTF_INVALID 0xFFFD
> -#define UTF_SIZ 4
>
> -static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
> -static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
> -static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
> -static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
> -
> -static long
> -utf8decodebyte(const char c, size_t *i)
> -{
> - for (*i = 0; *i < (UTF_SIZ + 1); ++(*i))
> - if (((unsigned char)c & utfmask[*i]) == utfbyte[*i])
> - return (unsigned char)c & ~utfmask[*i];
> - return 0;
> -}
> -
> -static size_t
> -utf8validate(long *u, size_t i)
> +static int
> +utf8decode(const char *s_in, long *u, int *err)
> {
> - if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
> - *u = UTF_INVALID;
> - for (i = 1; *u > utfmax[i]; ++i)
> - ;
> - return i;
> -}
> -
> -static size_t
> -utf8decode(const char *c, long *u, size_t clen)
> -{
> - size_t i, j, len, type;
> - long udecoded;
> -
> + static const unsigned char lens[] = {
> + /* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
> + /* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0, /* invalid */
> + /* 110XX */ 2, 2, 2, 2,
> + /* 1110X */ 3, 3,
> + /* 11110 */ 4,
> + /* 11111 */ 0, /* invalid */
> + };
> + static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
> + static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
> +
> + const unsigned char *s = (const unsigned char *)s_in;
> + int len = lens[*s >> 3];
> *u = UTF_INVALID;
> - if (!clen)
> - return 0;
> - udecoded = utf8decodebyte(c[0], &len);
> - if (!BETWEEN(len, 1, UTF_SIZ))
> + *err = 1;
> + if (len == 0)
> return 1;
> - for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
> - udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
> - if (type)
> - return j;
> +
> + long cp = s[0] & leading_mask[len - 1];
> + for (int i = 1; i < len; ++i) {
> + if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
> + return i;
> + cp = (cp << 6) | (s[i] & 0x3F);
> }
> - if (j < len)
> - return 0;
> - *u = udecoded;
> - utf8validate(u, len);
> + /* out of range, surrogate, overlong encoding */
> + if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
> + return len;
>
> + *err = 0;
> + *u = cp;
> return len;
> }
>
> _AT_@ -242,7 +228,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
> unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
> XftDraw *d = NULL;
> Fnt *usedfont, *curfont, *nextfont;
> - int utf8strlen, utf8charlen, render = x || y || w || h;
> + int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
> long utf8codepoint = 0;
> const char *utf8str;
> FcCharSet *fccharset;
> _AT_@ -272,11 +258,11 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
> if (!ellipsis_width && render)
> ellipsis_width = drw_fontset_getwidth(drw, "...");
> while (1) {
> - ew = ellipsis_len = utf8strlen = 0;
> + ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
> utf8str = text;
> nextfont = NULL;
> while (*text) {
> - utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ);
> + utf8charlen = utf8decode(text, &utf8codepoint, &utf8err);
> for (curfont = drw->fonts; curfont; curfont = curfont->next) {
> charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint);
> if (charexists) {
> --
> 2.42.0
>

> From 396a21976f08b3c66e521855084685ce93356756 Mon Sep 17 00:00:00 2001
> From: NRK <nrk_AT_disroot.org>
> Date: Thu, 4 Jul 2024 21:27:47 +0000
> Subject: [PATCH 2/2] render invalid utf8 sequences as U+FFFD
>
> previously drw_text would do the width calculations as if
> invalid utf8 sequences were replaced with U+FFFD but would pass
> the invalid utf8 sequence to xft to render where xft would just
> cut it off at the first invalid byte.
>
> this change makes invalid utf8 render as U+FFFD and avoids
> sending invalid sequences to xft. the following can be used to
> check the behavior before and after the patch:
>
> $ printf "0\xef1234567\ntest" | dmenu
>
> Ref: https://lists.suckless.org/dev/2407/35646.html
> ---
> drw.c | 19 +++++++++++++++----
> 1 file changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/drw.c b/drw.c
> index eb71da7..f151ae5 100644
> --- a/drw.c
> +++ b/drw.c
> _AT_@ -237,7 +237,8 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
> XftResult result;
> int charexists = 0, overflow = 0;
> /* keep track of a couple codepoints for which we have no match. */
> - static unsigned int nomatches[128], ellipsis_width;
> + static unsigned int nomatches[128], ellipsis_width, invalid_width;
> + static const char invalid[] = "�";
>
> if (!drw || (render && (!drw->scheme || !w)) || !text || !drw->fonts)
> return 0;
> _AT_@ -257,6 +258,10 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
> usedfont = drw->fonts;
> if (!ellipsis_width && render)
> ellipsis_width = drw_fontset_getwidth(drw, "...");
> + if (!invalid_width) {
> + invalid_width = -1; /* stop infinite recursion */
> + invalid_width = drw_fontset_getwidth(drw, invalid);
> + }
> while (1) {
> ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
> utf8str = text;
> _AT_@ -284,9 +289,9 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
> else
> utf8strlen = ellipsis_len;
> } else if (curfont == usedfont) {
> - utf8strlen += utf8charlen;
> text += utf8charlen;
> - ew += tmpw;
> + utf8strlen += utf8err ? 0 : utf8charlen;
> + ew += utf8err ? 0 : tmpw;
> } else {
> nextfont = curfont;
> }
> _AT_@ -294,7 +299,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
> }
> }
>
> - if (overflow || !charexists || nextfont)
> + if (overflow || !charexists || nextfont || utf8err)
> break;
> else
> charexists = 0;
> _AT_@ -309,6 +314,12 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp
> x += ew;
> w -= ew;
> }
> + if (utf8err && (!render || invalid_width < w)) {
> + if (render)
> + drw_text(drw, x, y, w, h, 0, invalid, invert);
> + x += invalid_width;
> + w -= invalid_width;
> + }
> if (render && overflow)
> drw_text(drw, ellipsis_x, y, ellipsis_w, h, 0, "...", invert);
>
> --
> 2.42.0
>


-- 
Kind regards,
Hiltjo
Received on Sun Jul 14 2024 - 11:47:56 CEST

This archive was generated by hypermail 2.3.0 : Sun Jul 14 2024 - 11:48:41 CEST