Re: Re: [hackers] [PATCH v3][sbase] libutil/unescape.c: simplify and add \E

From: Evan Gates <evan.gates_AT_gmail.com>
Date: Mon, 06 Feb 2017 15:50:04 -0800

Mattias Andrée <maandree_AT_kth.se> wrote:

> On Mon, 06 Feb 2017 15:05:32 -0800
> evan.gates_AT_gmail.com (Evan Gates) wrote:
>
> > Mattias Andrée <maandree_AT_kth.se> wrote:
> >
> > > + } else if (escapes[*r & 255]) {
> > > + *w++ = escapes[*r++ & 255];
> >
> > Why do you & 255 here? I think a cast to unsigned char
> > would accomplish what you are trying to do and be more
> > correct (as char can default to either signed or
> > unsigned). Although I may misunderstand what is going on
> > here.
>
> Yes. I used &255 because it's does clutter as much.

OK. I'm going to change to a cast to be on the safe side due to the
"implementation-defined and undefined aspects" mentioned in C99 6.5p4:

Some operators (the unary operator ~, and the binary operators <<, >>, &, ^, and |,
collectively described as bitwise operators) are required to have operands that have
integer type. These operators yield values that depend on the internal representations of
integers, and have implementation-defined and undefined aspects for signed types.

> > I think this is clearer, even though it adds a
> > conditional:
> >
> > q = q * 16 + isdigit(*r) ? *r - '0' : tolower(*r) - 'a' + 10;
>
> I think
>
> if (isdigit(*r))
> q = q * 16 + (*r - '0');
> else
> q = q * 16 + (tolower(*r) - 'a') + 10;
>
> is clearer, or at least brackets around the ternary.

You're right, the line was getting a bit long and convoluted with the
ternary, bad habit of mine.

Another thought just came to mind, isoctal is a reserved name. 7.26p1:

The following names are grouped under individual headers for convenience. All external
names described below are reserved no matter what headers are included by the program.

7.26.2p1:

Function names that begin with either is or to, and a lowercase letter may be added to
the declarations in the <ctype.h> header.

I don't know if it's worth being anal about that, especially as we have
already limited ourselves to C99 do we need to worry about future
directions? For now I've changed isoctal() to is_odigit() but I'm open
to discussion.

Updated patch for consideration:

----- 8< ----- 8< ----- 8< -----
From 30fd43d7f3b8716054eb9867c835aadc423f652c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mattias=20Andr=C3=A9e?= <maandree_AT_kth.se>
Date: Sun, 5 Feb 2017 00:44:35 +0100
Subject: [PATCH] libutil/unescape.c: simplify and add \E
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Mattias Andrée <maandree_AT_kth.se>
---
 libutil/unescape.c | 101 ++++++++++++++++++++++-------------------------------
 1 file changed, 42 insertions(+), 59 deletions(-)
diff --git a/libutil/unescape.c b/libutil/unescape.c
index d1503e6..7523db3 100644
--- a/libutil/unescape.c
+++ b/libutil/unescape.c
_AT_@ -1,74 +1,57 @@
 /* See LICENSE file for copyright and license details. */
+#include <ctype.h>
 #include <string.h>
 
 #include "../util.h"
 
+#define is_odigit(c)  ('0' <= c && c <= '7')
+
 size_t
 unescape(char *s)
 {
-	size_t len, i, off, m, factor, q;
-
-	len = strlen(s);
+	static const char escapes[256] = {
+		['"'] = '"',
+		['\''] = '\'',
+		['\\'] = '\\',
+		['a'] = '\a',
+		['b'] = '\b',
+		['E'] = 033,
+		['e'] = 033,
+		['f'] = '\f',
+		['n'] = '\n',
+		['r'] = '\r',
+		['t'] = '\t',
+		['v'] = '\v'
+	};
+	size_t m, q;
+	char *r, *w;
 
-	for (i = 0; i < len; i++) {
-		if (s[i] != '\\')
+	for (r = w = s; *r;) {
+		if (*r != '\\') {
+			*w++ = *r++;
 			continue;
-		off = 0;
-
-		switch (s[i + 1]) {
-		case '\\': s[i] = '\\'; off++; break;
-		case '\'': s[i] = '\'', off++; break;
-		case '"':  s[i] =  '"', off++; break;
-		case 'a':  s[i] = '\a'; off++; break;
-		case 'b':  s[i] = '\b'; off++; break;
-		case 'e':  s[i] =  033; off++; break;
-		case 'f':  s[i] = '\f'; off++; break;
-		case 'n':  s[i] = '\n'; off++; break;
-		case 'r':  s[i] = '\r'; off++; break;
-		case 't':  s[i] = '\t'; off++; break;
-		case 'v':  s[i] = '\v'; off++; break;
-		case 'x':
-			/* "\xH[H]" hexadecimal escape */
-			for (m = i + 2; m < i + 1 + 3 && m < len; m++)
-				if ((s[m] < '0' && s[m] > '9') &&
-				    (s[m] < 'A' && s[m] > 'F') &&
-				    (s[m] < 'a' && s[m] > 'f'))
-					break;
-			if (m == i + 2)
-				eprintf("invalid escape sequence '\\%c'\n", s[i + 1]);
-			off += m - i - 1;
-			for (--m, q = 0, factor = 1; m > i + 1; m--) {
-				if (s[m] >= '0' && s[m] <= '9')
-					q += (s[m] - '0') * factor;
-				else if (s[m] >= 'A' && s[m] <= 'F')
-					q += ((s[m] - 'A') + 10) * factor;
-				else if (s[m] >= 'a' && s[m] <= 'f')
-					q += ((s[m] - 'a') + 10) * factor;
-				factor *= 16;
-			}
-			s[i] = q;
-			break;
-		case '\0':
+		}
+		r++;
+		if (!*r) {
 			eprintf("null escape sequence\n");
-		default:
-			/* "\O[OOO]" octal escape */
-			for (m = i + 1; m < i + 1 + 4 && m < len; m++)
-				if (s[m] < '0' || s[m] > '7')
-					break;
-			if (m == i + 1)
-				eprintf("invalid escape sequence '\\%c'\n", s[i + 1]);
-			off += m - i - 1;
-			for (--m, q = 0, factor = 1; m > i; m--) {
-				q += (s[m] - '0') * factor;
-				factor *= 8;
-			}
-			s[i] = (q > 255) ? 255 : q;
+		} else if (escapes[(unsigned char)*r]) {
+			*w++ = escapes[(unsigned char)*r++];
+		} else if (is_odigit(*r)) {
+			for (q = 0, m = 4; m && is_odigit(*r); m--, r++)
+				q = q * 8 + (*r - '0');
+			*w++ = MIN(q, 255);
+		} else if (*r == 'x' && isxdigit(r[1])) {
+			r++;
+			for (q = 0, m = 2; m && isxdigit(*r); m--, r++)
+				if (isdigit(*r))
+					q = q * 16 + (*r - '0');
+				else
+					q = q * 16 + (tolower(*r) - 'a' + 10);
+			*w++ = q;
+		} else {
+			eprintf("invalid escape sequence '\\%c'\n", *r);
 		}
-
-		for (m = i + 1; m <= len - off; m++)
-			s[m] = s[m + off];
-		len -= off;
 	}
 
-	return len;
+	return w - s;
 }
-- 
2.11.0
Received on Tue Feb 07 2017 - 00:50:04 CET

This archive was generated by hypermail 2.3.0 : Tue Feb 07 2017 - 01:00:19 CET