[hackers] [PATCH 2][sbase] libutil/unescape.c: add \u and \U; and correct and update printf.1

From: Mattias Andrée <maandree_AT_kth.se>
Date: Sun, 5 Feb 2017 01:00:54 +0100

Signed-off-by: Mattias Andrée <maandree_AT_kth.se>
---
 libutil/unescape.c | 40 +++++++++++++++++++++++++++++++++++-----
 printf.1           | 11 ++++++-----
 2 files changed, 41 insertions(+), 10 deletions(-)
diff --git a/libutil/unescape.c b/libutil/unescape.c
index bed2c61..5845dd4 100644
--- a/libutil/unescape.c
+++ b/libutil/unescape.c
_AT_@ -7,6 +7,32 @@
 #define isoctal(c)  ('0' <= c && c <= '7')
 
 size_t
+utf8encode(size_t cp, char *out)
+{
+	/* Encode cp as (up to 6-byte) UTF-8 into out; return the number of bytes written. */
+	unsigned char head = 0, headmask = 0x80, buf[7] = {0};
+	size_t n = 0;
+
+	if (cp < 0x80)
+		return *out = (char)cp, 1;
+	if (cp > 0x7fffffffUL)
+		eprintf("invalid code point %zX\n", cp); /* %zX: cp is a size_t */
+	while (cp) { /* build right-to-left in buf[0..5]; buf[6] is scratch */
+		buf[6 - n] |= 0x80; /* mark previous byte as continuation */
+		buf[6 - ++n] |= cp & 0x3f; /* next 6 payload bits */
+		cp >>= 6;
+		head |= headmask, headmask >>= 1; /* one more leading 1 in the lead byte */
+	}
+	if (buf[6 - n] & (head | headmask)) { /* payload overlaps prefix: prepend a byte */
+		buf[6 - n] |= 0x80;
+		n++, head |= headmask;
+	}
+	buf[6 - n] |= head;
+	memcpy(out, buf + 6 - n, n);
+	return n;
+}
+
+size_t
 unescape(char *s)
 {
 	static const char escapes[256] = {
_AT_@ -23,8 +49,9 @@ unescape(char *s)
 		['t'] = '\t',
 		['v'] = '\v'
 	};
+	static const char hexlen[256] = {['x'] = 2, ['u'] = 4, ['U'] = 8};
 	size_t m, q;
-	char *r, *w;
+	char *r, *w, hex;
 
 	for (r = w = s; *r;) {
 		if (*r != '\\') {
_AT_@ -40,11 +67,14 @@ unescape(char *s)
 			for (q = 0, m = 4; m && isoctal(*r); m--, r++)
 				q = q * 8 + (*r & 7);
 			*w++ = q > 255 ? 255 : q;
-		} else if (*r == 'x' && isxdigit(r[1])) {
-			r++;
-			for (q = 0, m = 2; m && isxdigit(*r); m--, r++)
+		} else if (hexlen[*r & 255] && isxdigit(r[1])) {
+			m = hexlen[(hex = *r++) & 255];
+			for (q = 0; m && isxdigit(*r); m--, r++)
 				q = q * 16 + (*r & 15) + 9 * !!isalpha(*r);
-			*w++ = q;
+			if (hex == 'x')
+				*w++ = q;
+			else
+				w += utf8encode(q, w);
 		} else {
 			eprintf("invalid escape sequence '\\%c'\n", *r);
 		}
diff --git a/printf.1 b/printf.1
index 78ffb1e..00fa850 100644
--- a/printf.1
+++ b/printf.1
_AT_@ -1,4 +1,4 @@
-.Dd 2015-10-08
+.Dd 2017-02-04
 .Dt PRINTF 1
 .Os sbase
 .Sh NAME
_AT_@ -17,9 +17,9 @@ using each
 until drained.
 .Pp
 .Nm
-interprets the standard escape sequences \e\e, \e', \e", \ea, \eb, \ee,
-\ef, \en, \er, \et, \ev, \exH[H], \eO[OOO], the sequence \ec, which
-terminates further output if it's found inside
+interprets the standard escape sequences \e\e, \e', \e", \ea, \eb,
+\ef, \en, \er, \et, \ev, \exH[H], \eO[OOO], the sequences \ee, \eE, \euH[HHH],
+\eUH[HHHHHHH], and \ec, which terminates further output if it's found inside
 .Ar format
 or a %b format string, the format specification %b for an unescaped string and all C
 .Xr printf 3
_AT_@ -31,4 +31,5 @@ utility is compliant with the
 .St -p1003.1-2013
 specification.
 .Pp
-The possibility of specifying 4-digit octals is an extension to that specification.
+The escape sequences \ee, \eE, \euH[HHH], \eUH[HHHHHHH], \exH[H] and the possibility of
+specifying 4-digit octals are extensions to that specification.
-- 
2.11.0
Received on Sun Feb 05 2017 - 01:00:54 CET

This archive was generated by hypermail 2.3.0 : Sun Feb 05 2017 - 01:12:19 CET