st.c | 196 +++++++++++++++++++++++++------------------------------------------ 1 file changed, 72 insertions(+), 124 deletions(-) diff --git a/st.c b/st.c index 69b2491..ce2accf 100644 --- a/st.c +++ b/st.c @@ -55,6 +55,7 @@ char *argv0; #define XEMBED_FOCUS_OUT 5 /* Arbitrary sizes */ +#define UTF_INVALID 0xFFFD #define UTF_SIZ 4 #define ESC_BUF_SIZ (128*UTF_SIZ) #define ESC_ARG_SIZ 16 @@ -442,10 +443,12 @@ static void selcopy(void); static void selscroll(int, int); static void selsnap(int, int *, int *, int); -static int utf8decode(char *, long *); -static int utf8encode(long *, char *); -static int utf8size(char *); -static int isfullutf8(char *, int); +static size_t utfdecode(char *, long *, size_t); +static long utfdecodebyte(char, size_t *); +static size_t utfencode(long, char *, size_t); +static char utfencodebyte(long, size_t); +static size_t utflen(char *); +static size_t utfvalidate(long *, size_t); static ssize_t xwrite(int, char *, size_t); static void *xmalloc(size_t); @@ -490,6 +493,11 @@ static int oldbutton = 3; /* button event on startup: 3 = release */ static char *usedfont = NULL; static double usedfontsize = 0; +static uchar utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0}; +static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8}; +static long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000}; +static long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; + /* Font Ring Cache */ enum { FRC_NORMAL, @@ -549,128 +557,69 @@ xstrdup(char *s) { return p; } -int -utf8decode(char *s, long *u) { - uchar c; - int i, n, rtn; - - rtn = 1; - c = *s; - if(~c & 0x80) { /* 0xxxxxxx */ - *u = c; - return rtn; - } else if((c & 0xE0) == 0xC0) { /* 110xxxxx */ - *u = c & 0x1F; - n = 1; - } else if((c & 0xF0) == 0xE0) { /* 1110xxxx */ - *u = c & 0x0F; - n = 2; - } else if((c & 0xF8) == 0xF0) { /* 11110xxx */ - *u = c & 0x07; - n = 3; - } else { - goto invalid; - } - - for(i = n, ++s; i > 0; --i, ++rtn, ++s) { - c = *s; - if((c & 0xC0) != 0x80) /* 10xxxxxx */ - goto invalid; - *u <<= 6; - *u |= c & 0x3F; - } - - if((n == 1 && *u < 0x80) || - (n == 2 && *u < 0x800) || - (n == 3 && *u < 0x10000) || - (*u >= 0xD800 && *u <= 0xDFFF)) { - goto invalid; - } - - return rtn; -invalid: - *u = 0xFFFD; - - return rtn; -} +size_t +utfdecode(char *c, long *u, size_t clen) { + size_t i, j, len, type; + long udecoded; -int -utf8encode(long *u, char *s) { - uchar *sp; - ulong uc; - int i, n; - - sp = (uchar *)s; - uc = *u; - if(uc < 0x80) { - *sp = uc; /* 0xxxxxxx */ + *u = UTF_INVALID; + if(!clen) + return 0; + udecoded = utfdecodebyte(c[0], &len); + if(!BETWEEN(len, 1, UTF_SIZ)) return 1; - } else if(*u < 0x800) { - *sp = (uc >> 6) | 0xC0; /* 110xxxxx */ - n = 1; - } else if(uc < 0x10000) { - *sp = (uc >> 12) | 0xE0; /* 1110xxxx */ - n = 2; - } else if(uc <= 0x10FFFF) { - *sp = (uc >> 18) | 0xF0; /* 11110xxx */ - n = 3; - } else { - goto invalid; + for(i = 1, j = 1; i < clen && j < len; ++i, ++j) { + udecoded = (udecoded << 6) | utfdecodebyte(c[i], &type); + if(type != 0) + return j; } + if(j < len) + return 0; + *u = udecoded; + utfvalidate(u, len); + return len; +} - for(i=n,++sp; i>0; --i,++sp) - *sp = ((uc >> 6*(i-1)) & 0x3F) | 0x80; /* 10xxxxxx */ - - return n+1; -invalid: - /* U+FFFD */ - *s++ = '\xEF'; - *s++ = '\xBF'; - *s = '\xBD'; - - return 3; +long +utfdecodebyte(char c, size_t *i) { + for(*i = 0; *i < LEN(utfmask); ++(*i)) + if(((uchar)c & utfmask[*i]) == utfbyte[*i]) + return (uchar)c & ~utfmask[*i]; + return 0; } -/* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode - UTF-8 otherwise return 0 */ -int -isfullutf8(char *s, int b) { - uchar *c1, *c2, *c3; +size_t +utfencode(long u, char *c, size_t clen) { + size_t len, i; - c1 = (uchar *)s; - c2 = (uchar *)++s; - c3 = (uchar *)++s; - if(b < 1) { + len = utfvalidate(&u, 0); + if(clen < len) return 0; - } else if((*c1 & 0xE0) == 0xC0 && b == 1) { - return 0; - } else if((*c1 & 0xF0) == 0xE0 && - ((b == 1) || - ((b == 2) && (*c2 & 0xC0) == 0x80))) { - return 0; - } else if((*c1 & 0xF8) == 0xF0 && - ((b == 1) || - ((b == 2) && (*c2 & 0xC0) == 0x80) || - ((b == 3) && (*c2 & 0xC0) == 0x80 && (*c3 & 0xC0) == 0x80))) { - return 0; - } else { - return 1; + for(i = len - 1; i != 0; --i) { + c[i] = utfencodebyte(u, 0); + u >>= 6; } + c[0] = utfencodebyte(u, len); + return len; } -int -utf8size(char *s) { - uchar c = *s; +char +utfencodebyte(long u, size_t i) { + return utfbyte[i] | (u & ~utfmask[i]); +} - if(~c & 0x80) { - return 1; - } else if((c & 0xE0) == 0xC0) { - return 2; - } else if((c & 0xF0) == 0xE0) { - return 3; - } else { - return 4; - } +size_t +utflen(char *c) { + return utfdecode(c, &(long){0}, UTF_SIZ); +} + +size_t +utfvalidate(long *u, size_t i) { + if(!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) + *u = UTF_INVALID; + for(i = 1; *u > utfmax[i]; ++i) + ; + return i; } static void @@ -984,7 +933,7 @@ getsel(void) { if(!selected(x, y) || (gp->mode & ATTR_WDUMMY)) continue; - size = utf8size(gp->c); + size = utflen(gp->c); memcpy(ptr, gp->c, size); ptr += size; } @@ -1308,9 +1257,8 @@ ttyread(void) { /* process every complete utf8 char */ buflen += ret; ptr = buf; - while(buflen >= UTF_SIZ || isfullutf8(ptr,buflen)) { - charsize = utf8decode(ptr, &utf8c); - utf8encode(&utf8c, s); + while(charsize = utfdecode(ptr, &utf8c, buflen)) { + utfencode(utf8c, s, UTF_SIZ); tputc(s, charsize); ptr += charsize; buflen -= charsize; @@ -2420,7 +2368,7 @@ tputc(char *c, int len) { if(len == 1) { width = 1; } else { - utf8decode(c, &u8char); + utfdecode(c, &u8char, UTF_SIZ); width = wcwidth(u8char); } @@ -3293,7 +3241,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { oneatatime = font->width != xw.cw; for(;;) { u8c = s; - u8cblen = utf8decode(s, &u8char); + u8cblen = utfdecode(s, &u8char, UTF_SIZ); s += u8cblen; bytelen -= u8cblen; @@ -3430,7 +3378,7 @@ xdrawcursor(void) { memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ); /* remove the old cursor */ - sl = utf8size(term.line[oldy][oldx].c); + sl = utflen(term.line[oldy][oldx].c); width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1; xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx, oldy, width, sl); @@ -3444,7 +3392,7 @@ xdrawcursor(void) { g.bg = defaultfg; } - sl = utf8size(g.c); + sl = utflen(g.c); width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\ ? 2 : 1; xdraws(g.c, g, term.c.x, term.c.y, width, sl); @@ -3548,7 +3496,7 @@ drawregion(int x1, int y1, int x2, int y2) { base = new; } - sl = utf8decode(new.c, &u8char); + sl = utfdecode(new.c, &u8char, UTF_SIZ); memcpy(buf+ib, new.c, sl); ib += sl; ic += (new.mode & ATTR_WIDE)? 2 : 1; @@ -3707,7 +3655,7 @@ kpress(XEvent *ev) { if(IS_SET(MODE_8BIT)) { if(*buf < 0177) { c = *buf | 0x80; - len = utf8encode(&c, buf); + len = utfencode(c, buf, UTF_SIZ); } } else { buf[1] = buf[0];