Re: [dev] [PATCH][RFC] Add a basic version of tr

From: Silvan Jegen <s.jegen_AT_gmail.com>
Date: Sat, 18 Jan 2014 16:29:10 +0100

On Wed, Jan 15, 2014 at 07:53:25PM +0000, Dimitris Papastamos wrote:
> On Wed, Jan 15, 2014 at 08:43:54PM +0100, Silvan Jegen wrote:
> > I will start writing a man page (possibly based on the GNU one) as soon
> > as I find the time (hopefully in the next few days).
>
> Consider looking at the OpenBSD manpage for tr.

I had a look but it used a lot of formatting which seemed quite gaudy to
me.


> I will apply this as soon as you send in a manpage for it.

Find the code (including a few more escapes) and the manpage below. The
manpage is very terse at the moment so if you think we should flesh the
text out more, or change the formatting, I am open to suggestions. Also,
I am not a native English speaker which means somebody should probably
check my writing.

What I realized is that mbtowc returns a negative int if it encounters
an illegal byte sequence (the definition of which is dependent on your
locale). The current implementation does not handle this case and will
probably choke horribly. At least it won't eat any data, even if you
feed it illegal bytes, I suppose...

Should we try to solve that problem now or later? :-)


--->8---

From: Silvan Jegen <s.jegen_AT_gmail.com>
Date: Fri, 15 Nov 2013 17:25:10 +0100
Subject: [PATCH] Add the tr program including man page

---
 Makefile |   1 +
 tr.1     |  50 ++++++++++++++++++++
 tr.c     | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 207 insertions(+)
 create mode 100644 tr.1
 create mode 100644 tr.c
diff --git a/Makefile b/Makefile
index 81dfaf6..ee84221 100644
--- a/Makefile
+++ b/Makefile
@@ -81,6 +81,7 @@ SRC = \
 	tee.c      \
 	test.c     \
 	touch.c    \
+	tr.c       \
 	true.c     \
 	tty.c      \
 	uname.c    \
diff --git a/tr.1 b/tr.1
new file mode 100644
index 0000000..7a81c0c
--- /dev/null
+++ b/tr.1
@@ -0,0 +1,50 @@
+.TH TR 1 sbase\-VERSION
+.SH NAME
+tr \- translate characters
+.SH SYNOPSIS
+.B tr
+.RB set1
+.RI [ set2 ]
+.SH DESCRIPTION
+.B tr
+reads input from stdin replacing every character in 
+.B set1
+with the character at the same index in
+.B set2.
+If set2 is not given
+.B tr
+deletes the characters in set1 from the input.
+
+Sets are specified as strings of characters. Almost all represent themselves. The following ones will be interpreted:
+.TP
+\e\e
+backslash
+.TP
+\ea
+audible BEL
+.TP
+\ef
+form feed
+.TP
+\en
+new line
+.TP
+\er
+return
+.TP
+\et
+horizontal tab
+.TP
+\ev
+vertical tab
+.PP
+If set1 is longer than set2
+.B tr
+will map all the remaining characters to the last one in set2. In case set2 is longer than set1, the remaining characters from set2 will be ignored.
+.B
+.SH NOTES
+.B tr
+is Unicode-aware but does not yet handle character classes (e.g. [:alnum:] or [:digit:]).
+.SH SEE ALSO
+.IR sed (1),
+.IR awk (1)
diff --git a/tr.c b/tr.c
new file mode 100644
index 0000000..4fdc28f
--- /dev/null
+++ b/tr.c
@@ -0,0 +1,156 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <locale.h>
+#include <wchar.h>
+#include "text.h"
+#include "util.h"
+
+static void
+usage(void) /* report the expected command line through eprintf() */
+{
+	eprintf("usage: %s set1 [set2]\n", argv0); /* argv0 is not declared in this file; presumably provided by util.h - confirm */
+}
+
+void
+handleescapes(char *s) /* rewrite *s in place: an escape letter becomes the control character it names */
+{
+	switch(*s) { /* parsemapping() calls this with s pointing just past a backslash */
+	case 'n':
+		*s = '\n';
+		break;
+	case 't':
+		*s = '\t';
+		break;
+	case '\\':
+		*s = '\\'; /* an escaped backslash stays a backslash */
+		break;
+	case 'r':
+		*s = '\r';
+		break;
+	case 'f':
+		*s = '\f';
+		break;
+	case 'a':
+		*s = '\a';
+		break;
+	case 'b':
+		*s = '\b';
+		break;
+	case 'v':
+		*s = '\v';
+		break;
+	} /* no default case: any other character is left untouched */
+}
+
+void
+parsemapping(const char *set1, const char *set2, wchar_t *mappings) /* record in mappings[c] a value for every character c of set1 */
+{
+	char *s; /* cursor over set1 */
+	wchar_t runeleft; /* character decoded from set1 */
+	wchar_t runeright; /* NOTE(review): never assigned when set2 is "", yet stored into mappings below */
+	int leftbytes;
+	int rightbytes;
+	size_t n = 0; /* byte offset of the current character in set2 */
+	size_t lset2; /* byte length of set2, or 0 when set2 is NULL */
+
+	if(set2) {
+		lset2 = strnlen(set2, 255 * sizeof(wchar_t)); /* length is capped at 255 * sizeof(wchar_t) bytes */
+	} else {
+		set2 = &set1[0]; /* no set2: set1 itself supplies the values stored in the table */
+		lset2 = 0; /* n never advances, so every entry receives the character decoded at offset 0 of set1 */
+	}
+
+	s = (char *) set1; /* const is cast away because handleescapes() writes through s */
+	while(*s) {
+		if(*s == '\\') {
+			handleescapes(++s); /* step over the backslash, then translate the escape letter in place */
+		}
+
+		leftbytes = mbtowc(&runeleft, s, 4); /* NOTE(review): a negative return (invalid byte sequence) is not handled */
+		if(set2[n] != '\0') /* once set2 is exhausted, runeright keeps its previous value */
+			rightbytes = mbtowc(&runeright, set2 + n, 4); /* NOTE(review): backslash escapes in set2 are not translated */
+		mappings[runeleft] = runeright;
+
+		s += leftbytes;
+		if(n < lset2) /* stop advancing once the (capped) length of set2 has been reached */
+			n += rightbytes;
+	}
+}
+
+void
+maptonull(const wchar_t *mappings, char *in) /* write in with putwchar(), dropping characters whose mappings entry is non-zero */
+{
+	const char *s; /* cursor over the multibyte input string */
+	wchar_t runeleft;
+	int leftbytes = 0;
+
+	s = in;
+	while(*s) {
+		leftbytes = mbtowc(&runeleft, s, 4); /* decode one character; NOTE(review): a negative return is not checked */
+		if(!mappings[runeleft]) /* zero entry: nothing was recorded for this character, so it is kept */
+			putwchar(runeleft);
+		s += leftbytes;
+	}
+}
+
+void
+maptoset(const wchar_t *mappings, char *in) /* write in with putwchar(), substituting characters that have a non-zero mappings entry */
+{
+	const char *s; /* cursor over the multibyte input string */
+	wchar_t runeleft;
+	int leftbytes = 0;
+
+	s = in;
+	while(*s) {
+		leftbytes = mbtowc(&runeleft, s, 4); /* decode one character; NOTE(review): a negative return is not checked */
+		if(!mappings[runeleft]) { /* zero entry: pass the character through unchanged */
+			putwchar(runeleft);
+		} else {
+			putwchar(mappings[runeleft]); /* NOTE(review): a character mapped to NUL is indistinguishable from an unmapped one */
+		}
+		s += leftbytes;
+	}
+}
+
+int
+main(int argc, char *argv[]) /* tr set1 [set2]: build the mapping table from the arguments, then filter stdin */
+{
+	wchar_t *mappings; /* table indexed by wide character value */
+	char *buf = NULL; /* buffer handed to afgets(); released with free() after the loop */
+	size_t size = 0;
+	void (*mapfunc) (const wchar_t*, char*); /* maptoset or maptonull, selected from argc below */
+
+	setlocale(LC_ALL, ""); /* adopt the environment's locale, which governs the mbtowc() conversions */
+
+	mappings = (wchar_t *) mmap(NULL, 0x110000 * sizeof(wchar_t), /* 0x110000 entries, presumably one per Unicode code point - confirm */
+					PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); /* NOTE(review): result is not compared with MAP_FAILED */
+
+	ARGBEGIN {
+	default:
+		usage(); /* presumably reached for any option flag - confirm against the ARGBEGIN definition */
+	} ARGEND;
+
+	if(argc == 0) /* assumes ARGBEGIN/ARGEND left argv[0] pointing at set1 - confirm */
+		usage();
+
+	if(argc >= 2) {
+		parsemapping(argv[0], argv[1], mappings);
+		mapfunc = maptoset; /* set2 given: translate */
+	} else {
+		parsemapping(argv[0], NULL, mappings);
+		mapfunc = maptonull; /* set1 only: delete */
+	}
+
+	while(afgets(&buf, &size, stdin)) /* afgets() is declared elsewhere; presumably yields one input line per call - confirm */
+		mapfunc(mappings, buf);
+	free(buf);
+
+	if(ferror(stdin))
+		eprintf("<stdin>: read error:");
+
+	return EXIT_SUCCESS;
+}
-- 
1.8.5.3
Received on Sat Jan 18 2014 - 16:29:10 CET

This archive was generated by hypermail 2.3.0 : Sat Jan 18 2014 - 16:36:14 CET