[dev] [PATCH][RFC] Add a basic version of tr

From: Silvan Jegen <s.jegen_AT_gmail.com>
Date: Mon, 13 Jan 2014 11:19:49 -0800 (PST)

Hi

I have rewritten "tr" to use mmap and the wchar.h functions. It seems
to be quite slow but as far as I can tell it works reasonably well (at
least when using a UTF-8 locale). Comments/review and testing welcome
(I am relatively new to C so beware)!

If you think adding this version of "tr" to sbase makes sense, I can
prepare a man page that points out all the shortcomings (e.g. no
character classes) of this implementation.


Cheers,

Silvan


--->8---

Add a basic version of tr that is Unicode-aware but does not yet support
character classes.

---
 Makefile |   1 +
 tr.c     | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 tr.c
diff --git a/Makefile b/Makefile
index 81dfaf6..ee84221 100644
--- a/Makefile
+++ b/Makefile
@@ -81,6 +81,7 @@ SRC = \
 	tee.c      \
 	test.c     \
 	touch.c    \
+	tr.c       \
 	true.c     \
 	tty.c      \
 	uname.c    \
diff --git a/tr.c b/tr.c
new file mode 100644
index 0000000..869dbfa
--- /dev/null
+++ b/tr.c
@@ -0,0 +1,142 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <locale.h>
+#include <wchar.h>
+#include "text.h"
+#include "util.h"
+
+static void
+usage(void) /* print the synopsis via eprintf(); main() expects no return (TODO confirm) */
+{
+	eprintf("usage: %s set1 [set2]\n", argv0);
+}
+
+/* Replace *s, the character that follows a backslash, by the control character
+ * it names (\a \b \f \n \r \t \v); anything else, a backslash included, stays. */
+void
+handle_escapes(char *s)
+{
+	switch(*s) {
+	case 'a': *s = '\a'; break;
+	case 'b': *s = '\b'; break;
+	case 'f': *s = '\f'; break;
+	case 'n': *s = '\n'; break;
+	case 'r': *s = '\r'; break;
+	case 't': *s = '\t'; break;
+	case 'v': *s = '\v'; break;
+	}
+}
+
+/* Decode the character at p (not the terminating NUL) into *wc and return its
+ * byte length, >= 1: a byte mbtowc() rejects counts as a character of its own. */
+static int
+decode(wchar_t *wc, const char *p)
+{
+	int len = mbtowc(wc, p, MB_CUR_MAX);
+	if(len < 1) {
+		mbtowc(NULL, NULL, 0); /* return to the initial shift state */
+		*wc = (unsigned char) *p;
+		len = 1;
+	}
+	return len;
+}
+
+/* Map the i-th character of set1 to the i-th character of set2 in mappings[],
+ * reusing set2's last character once set2 runs out. A NULL set2 flags every
+ * set1 character with a non-zero value; an empty set2 maps nothing. Escapes
+ * are expanded in set1 only; a trailing backslash is kept literally. */
+void
+parse_mapping(char *set1, char *set2, wchar_t *mappings)
+{
+	const char *r = set2 ? set2 : "";
+	wchar_t runeleft, runeright = set2 ? L'\0' : L'\1';
+	char *s;
+
+	for(s = set1; *s; ) {
+		if(s[0] == '\\' && s[1] != '\0')
+			handle_escapes(++s);
+		s += decode(&runeleft, s);
+		if(*r)
+			r += decode(&runeright, r);
+		mappings[runeleft] = runeright;
+	}
+}
+
+/* Print every character of in whose mappings[] entry is zero; skip the rest. */
+void
+map_to_null(const wchar_t *mappings, char *in)
+{
+	wchar_t runeleft;
+	int leftbytes;
+
+	for(; *in; in += leftbytes < 1 ? 1 : leftbytes) {
+		leftbytes = mbtowc(&runeleft, in, MB_CUR_MAX);
+		if(leftbytes < 1) /* invalid byte: treat it as a character of its own */
+			runeleft = (unsigned char) *in;
+		if(!mappings[runeleft])
+			putwchar(runeleft);
+	}
+}
+
+/* Print every character of in, replaced by its mappings[] entry when that entry
+ * is non-zero and unchanged otherwise. Output goes to stdout via putwchar(). */
+void
+map_to_set(const wchar_t *mappings, char *in)
+{
+	wchar_t runeleft;
+	int leftbytes;
+
+	for(; *in; in += leftbytes) {
+		leftbytes = mbtowc(&runeleft, in, MB_CUR_MAX);
+		if(leftbytes < 1) {
+			/* invalid byte: treat it as a character of its own */
+			runeleft = (unsigned char) *in;
+			leftbytes = 1;
+		}
+		putwchar(mappings[runeleft] ? mappings[runeleft] : runeleft);
+	}
+}
+
+/* Copy stdin to stdout line by line, turning characters of set1 into those of
+ * set2, or deleting them when set2 is absent. The table has 0x110000 slots. */
+int
+main(int argc, char *argv[])
+{
+	wchar_t *mappings;
+	char *buf = NULL;
+	size_t size = 0;
+	void (*mapfunc) (const wchar_t*, char*);
+
+	setlocale(LC_ALL, "");
+
+	ARGBEGIN {
+	default:
+		usage();
+	} ARGEND;
+
+	if(!argc)
+		usage();
+
+	mappings = mmap(NULL, 0x110000 * sizeof(wchar_t), PROT_READ|PROT_WRITE,
+	                MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); /* starts zero-filled */
+	if(mappings == MAP_FAILED)
+		eprintf("mmap:"); /* presumably exits, as the !argc check assumes */
+
+	mapfunc = argc >= 2 ? map_to_set : map_to_null;
+	parse_mapping(argv[0], argc >= 2 ? argv[1] : NULL, mappings);
+
+	while(afgets(&buf, &size, stdin))
+		mapfunc(mappings, buf);
+	free(buf);
+
+	if (ferror(stdin)) {
+		eprintf("<stdin>: read error:");
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
-- 
1.8.5.2
Received on Mon Jan 13 2014 - 20:19:49 CET

This archive was generated by hypermail 2.3.0 : Mon Jan 13 2014 - 20:24:07 CET