--- libutil/unescape.c | 40 +++++++++++++++++++++++++++++++++++----- printf.1 | 11 ++++++----- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/libutil/unescape.c b/libutil/unescape.c index bed2c61..5845dd4 100644 --- a/libutil/unescape.c +++ b/libutil/unescape.c @@ -7,6 +7,32 @@ #define isoctal(c) ('0' <= c && c <= '7') size_t +utf8encode(size_t cp, char *out) +{ + char head = 0, headmask = (char)0x80, buf[7] = {0, 0, 0, 0, 0, 0, 0}; + size_t n = 0; + + if (cp < 0x80) + return *out = (char)cp, 1; + if (cp > 0x7fffffffUL) + eprintf("invalid code point %X\n", cp); + while (cp) { + buf[6 - n] |= (char)0x80; + buf[6 - ++n] |= cp & 0x3f; + cp >>= 6; + head |= headmask; + headmask >>= 1; + } + if (buf[6 - n] & (head | headmask)) { + buf[6 - n] |= (char)0x80; + n++, head |= headmask; + } + buf[6 - n] |= head; + memcpy(out, buf + 6 - n, n); + return n; +} + +size_t unescape(char *s) { static const char escapes[256] = { @@ -23,8 +49,9 @@ unescape(char *s) ['t'] = '\t', ['v'] = '\v' }; + static const char hexlen[256] = {['x'] = 2, ['u'] = 4, ['U'] = 8}; size_t m, q; - char *r, *w; + char *r, *w, hex; for (r = w = s; *r;) { if (*r != '\\') { @@ -40,11 +67,14 @@ unescape(char *s) for (q = 0, m = 4; m && isoctal(*r); m--, r++) q = q * 8 + (*r & 7); *w++ = q > 255 ? 255 : q; - } else if (*r == 'x' && isxdigit(r[1])) { - r++; - for (q = 0, m = 2; m && isxdigit(*r); m--, r++) + } else if (hexlen[*r & 255] && isxdigit(r[1])) { + m = hexlen[(hex = *r++) & 255]; + for (q = 0; m && isxdigit(*r); m--, r++) q = q * 16 + (*r & 15) + 9 * !!isalpha(*r); - *w++ = q; + if (hex == 'x') + *w++ = q; + else + w += utf8encode(q, w); } else { eprintf("invalid escape sequence '\\%c'\n", *r); } diff --git a/printf.1 b/printf.1 index 78ffb1e..00fa850 100644 --- a/printf.1 +++ b/printf.1 @@ -1,4 +1,4 @@ -.Dd 2015-10-08 +.Dd 2017-02-04 .Dt PRINTF 1 .Os sbase .Sh NAME @@ -17,9 +17,9 @@ using each until drained. 
.Pp .Nm -interprets the standard escape sequences \e\e, \e', \e", \ea, \eb, \ee, -\ef, \en, \er, \et, \ev, \exH[H], \eO[OOO], the sequence \ec, which -terminates further output if it's found inside +interprets the standard escape sequences \e\e, \e', \e", \ea, \eb, +\ef, \en, \er, \et, \ev, \exH[H], \eO[OOO], the sequences, \ee, \eE, \euH[HHH], +\eUH[HHHHHHH], and \ec, which terminates further output if it's found inside .Ar format or a %b format string, the format specification %b for an unescaped string and all C .Xr printf 3 @@ -31,4 +31,5 @@ utility is compliant with the .St -p1003.1-2013 specification. .Pp -The possibility of specifying 4-digit octals is an extension to that specification. +The escape sequences \ee, \eE, \euH[HHH], \eUH[HHHHHHH], \exH[H] and possibility of +specifying 4-digit octals is an extension to that specification. -- 2.11.0 Received on Sun Feb 05 2017 - 01:00:54 CET
This archive was generated by hypermail 2.3.0 : Sun Feb 05 2017 - 01:12:19 CET