[hackers] [sbase][PATCH] Add implementation of shuf(1)

From: Elie Le Vaillant <eolien55_AT_disroot.org>
Date: Sun, 3 Mar 2024 00:36:35 +0100

---
 .gitignore       |  1 +
 Makefile         |  2 ++
 README           |  1 +
 libutil/random.c | 46 +++++++++++++++++++++++++++++
 shuf.1           | 43 +++++++++++++++++++++++++++
 shuf.c           | 77 ++++++++++++++++++++++++++++++++++++++++++++++++
 util.h           |  3 ++
 7 files changed, 173 insertions(+)
 create mode 100644 libutil/random.c
 create mode 100644 shuf.1
 create mode 100644 shuf.c
diff --git a/.gitignore b/.gitignore
index 001b055..e789e24 100644
--- a/.gitignore
+++ b/.gitignore
_AT_@ -71,6 +71,7 @@
 /sha512sum
 /sha512-224sum
 /sha512-256sum
+/shuf
 /sleep
 /sort
 /split
diff --git a/Makefile b/Makefile
index 7bd9626..8e8732a 100644
--- a/Makefile
+++ b/Makefile
_AT_@ -86,6 +86,7 @@ LIBUTILOBJ =\
 	libutil/strsep.o\
 	libutil/strnsubst.o\
 	libutil/strtonum.o\
+	libutil/random.o\
 	libutil/unescape.o\
 	libutil/writeall.o
 
_AT_@ -160,6 +161,7 @@ BIN =\
 	sha512sum\
 	sha512-224sum\
 	sha512-256sum\
+	shuf\
 	sleep\
 	sort\
 	split\
diff --git a/README b/README
index 5555f8b..9b287d9 100644
--- a/README
+++ b/README
_AT_@ -112,6 +112,7 @@ The following tools are implemented:
 0=*|x sha512sum       .
 0=* x sha512-224sum   .
 0=* x sha512-256sum   .
+0=* x shuf            (-o, -z, -e, -i)
 0=*|o sleep           .
 0#*|o sort            .
 0=*|o split           .
diff --git a/libutil/random.c b/libutil/random.c
new file mode 100644
index 0000000..48eeb79
--- /dev/null
+++ b/libutil/random.c
_AT_@ -0,0 +1,46 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <time.h>
+
+/*
+ * Uniformity is achieved by generating new random numbers until the one
+ * returned is outside the range [0, 2**32 % upper_bound).  This
+ * guarantees the selected random number will be inside
+ * [2**32 % upper_bound, 2**32) which maps back to [0, upper_bound)
+ * after reduction modulo upper_bound.
+ *
+ * Copied off OpenBSD (original is arc4random_uniform)
+ */
+uint32_t
+random_uniform(uint32_t upper_bound)
+{
+	uint32_t r, min;
+
+	if (upper_bound < 2)
+		return 0;
+
+	/* 2**32 % x == (2**32 - x) % x */
+	min = -upper_bound % upper_bound;
+
+	/*
+	 * This could theoretically loop forever but each retry has
+	 * p > 0.5 (worst case, usually far better) of selecting a
+	 * number inside the range we need, so it should rarely need
+	 * to re-roll.
+	 */
+	for (;;) {
+		r = random(); /* arc4random() is better, but we don't always have it */
+		if (r >= min)
+			break;
+	}
+
+	return r % upper_bound;
+}
+
+void
+random_seed(void)
+{
+	struct timespec ts;
+	clock_gettime(CLOCK_REALTIME, &ts);
+	srandom(ts.tv_nsec); /* not a good source of randomness, but eh */
+}
diff --git a/shuf.1 b/shuf.1
new file mode 100644
index 0000000..1f744a7
--- /dev/null
+++ b/shuf.1
_AT_@ -0,0 +1,43 @@
+.Dd 2024-02-25
+.Dt SHUF 1
+.Os
+.Sh NAME
+.Nm shuf
+.Nd randomly permute input lines
+.Sh SYNOPSIS
+.Nm
+.Op Fl r
+.Op Ar file ...
+.Sh DESCRIPTION
+.Nm
+is a utility that outputs a random permutation of its input lines.
+By default,
+.Nm
+reads from stdin and outputs to stdout.
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl r
+Do not permute.
+Instead, choose lines at random, with replacement.
+By default, it repeats indefinitely.
+.El
+.Sh NOTES
+This
+.Nm
+misses a few options.
+.Pp
+.Fl n
+can be reproduced using
+.Xr head 1 .
+.Pp
+.Fl i
+can be reproduced using
+.Xr seq 1 .
+.Pp
+.Fl e
+can be reproduced using
+.Xr printf 1 .
+.Sh SEE ALSO
+.Xr sort 1 ,
+.Xr seq 1 ,
+.Xr head 1
diff --git a/shuf.c b/shuf.c
new file mode 100644
index 0000000..36b3898
--- /dev/null
+++ b/shuf.c
_AT_@ -0,0 +1,77 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "text.h"
+#include "util.h"
+
+static void
+usage(void)
+{
+	eprintf("usage: %s [-r] [file ...]\n", argv0);
+}
+
+int
+main(int argc, char *argv[])
+{
+	FILE *fp;
+	int i, j, rflag = 0, ret = 0;
+	struct linebuf buf = EMPTY_LINEBUF;
+	struct line line;
+
+	ARGBEGIN {
+	case 'r':
+		rflag = 1;
+		break;
+	default:
+		usage();
+	} ARGEND
+
+	if (!argc) {
+		getlines(stdin, &buf);
+	} else {
+		for (; *argv; argc--, argv++) {
+			if (!strcmp(*argv, "-")) {
+				*argv = "<stdin>";
+				fp = stdin;
+			} else if (!(fp = fopen(*argv, "r"))) {
+				weprintf("fopen %s:", *argv);
+				ret = 1;
+				continue;
+			}
+			getlines(fp, &buf);
+			if (fp != stdin && fshut(fp, *argv))
+				ret = 1;
+		}
+	}
+
+	random_seed();
+
+	if (!rflag) {
+		for (i = buf.nlines - 1; i > 0; i--) {
+			j = random_uniform(i+1);
+			line = buf.lines[j];
+			buf.lines[j] = buf.lines[i];
+			buf.lines[i] = line;
+		}
+
+		for (i = 0; i < buf.nlines; i++) {
+			line = buf.lines[i];
+			fwrite(line.data, 1, line.len, stdout);
+		}
+	} else {
+		for (;;) {
+			j = random_uniform(buf.nlines);
+			line = buf.lines[j];
+			fwrite(line.data, 1, line.len, stdout);
+		}
+	}
+	for (i = 0; i < buf.nlines; i++)
+		free(buf.lines[i].data);
+
+	ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");
+
+	return ret;
+}
diff --git a/util.h b/util.h
index 346f6ca..4c9b5b2 100644
--- a/util.h
+++ b/util.h
_AT_@ -3,6 +3,7 @@
 
 #include <regex.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <stdio.h>
 
 #include "arg.h"
_AT_@ -91,3 +92,5 @@ int mkdirp(const char *, mode_t, mode_t);
 #undef memmem
 #define memmem xmemmem
 void *memmem(const void *, size_t, const void *, size_t);
+void random_seed(void);
+uint32_t random_uniform(uint32_t);
-- 
2.44.0
Received on Sun Mar 03 2024 - 00:36:35 CET

This archive was generated by hypermail 2.3.0 : Sun Mar 03 2024 - 00:48:37 CET