[dev] [sbase][PATCH v3] Add diff(1) and bdiff(1)

From: Mattias Andrée <maandree_AT_kth.se>
Date: Wed, 24 Feb 2016 11:26:44 +0100

Signed-off-by: Mattias Andrée <maandree_AT_kth.se>
---
 Makefile           |   12 +-
 README             |    2 +
 TODO               |    1 -
 bdiff.1            |   84 ++++
 diff.1             |  107 ++++
 diff.c             | 1397 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 libutil/ncprintf.c |   50 ++
 util.h             |    1 +
 8 files changed, 1649 insertions(+), 5 deletions(-)
 create mode 100644 bdiff.1
 create mode 100644 diff.1
 create mode 100644 diff.c
 create mode 100644 libutil/ncprintf.c
diff --git a/Makefile b/Makefile
index 754f4d2..2ebf48a 100644
--- a/Makefile
+++ b/Makefile
_AT_@ -56,6 +56,7 @@ LIBUTILSRC =\
 	libutil/md5.c\
 	libutil/mkdirp.c\
 	libutil/mode.c\
+	libutil/ncprintf.c\
 	libutil/parseoffset.c\
 	libutil/putword.c\
 	libutil/reallocarray.c\
_AT_@ -89,6 +90,7 @@ BIN =\
 	cron\
 	cut\
 	date\
+	diff\
 	dirname\
 	du\
 	echo\
_AT_@ -169,7 +171,7 @@ LIBUTFOBJ = $(LIBUTFSRC:.c=.o)
 LIBUTILOBJ = $(LIBUTILSRC:.c=.o)
 OBJ = $(BIN:=.o) $(LIBUTFOBJ) $(LIBUTILOBJ)
 SRC = $(BIN:=.c)
-MAN = $(BIN:=.1)
+MAN = $(BIN:=.1) bdiff.1
 
 all: $(BIN)
 
_AT_@ -199,7 +201,7 @@ confstr_l.h limits_l.h sysconf_l.h pathconf_l.h: getconf.sh
 install: all
 	mkdir -p $(DESTDIR)$(PREFIX)/bin
 	cp -f $(BIN) $(DESTDIR)$(PREFIX)/bin
-	cd $(DESTDIR)$(PREFIX)/bin && ln -f test [ && chmod 755 $(BIN)
+	cd $(DESTDIR)$(PREFIX)/bin && ln -f test [ && ln -f diff bdiff && chmod 755 $(BIN)
 	mv -f $(DESTDIR)$(PREFIX)/bin/xinstall $(DESTDIR)$(PREFIX)/bin/install
 	mkdir -p $(DESTDIR)$(MANPREFIX)/man1
 	for m in $(MAN); do sed "s/^\.Os sbase/.Os sbase $(VERSION)/g" < "$$m" > $(DESTDIR)$(MANPREFIX)/man1/"$$m"; done
_AT_@ -207,8 +209,8 @@ install: all
 	mv -f $(DESTDIR)$(MANPREFIX)/man1/xinstall.1 $(DESTDIR)$(MANPREFIX)/man1/install.1
 
 uninstall:
-	cd $(DESTDIR)$(PREFIX)/bin && rm -f $(BIN) [ install
-	cd $(DESTDIR)$(MANPREFIX)/man1 && rm -f $(MAN)
+	cd $(DESTDIR)$(PREFIX)/bin && rm -f $(BIN) [ install bdiff
+	cd $(DESTDIR)$(MANPREFIX)/man1 && rm -f $(MAN) install.1
 
 dist: clean
 	mkdir -p sbase-$(VERSION)
_AT_@ -231,6 +233,7 @@ sbase-box: $(LIB) $(SRC)
 	echo 'int main(int argc, char *argv[]) { char *s = basename(argv[0]);'                               >> build/$_AT_.c
 	echo 'if(!strcmp(s,"sbase-box")) { argc--; argv++; s = basename(argv[0]); } if(0) ;'                 >> build/$_AT_.c
 	echo "else if (!strcmp(s, \"install\")) return xinstall_main(argc, argv);"                           >> build/$_AT_.c
+	echo "else if (!strcmp(s, \"bdiff\")) return diff_main(argc, argv);"                                     >> build/$_AT_.c
 	echo "else if (!strcmp(s, \"[\")) return test_main(argc, argv);"                                     >> build/$_AT_.c
 	for f in $(SRC); do echo "else if(!strcmp(s, \"$${f%.c}\")) return $${f%.c}_main(argc, argv);"; done >> build/$_AT_.c
 	echo 'else { fputs("[ ", stdout);'                                                                   >> build/$_AT_.c
_AT_@ -245,6 +248,7 @@ sbase-box-install: sbase-box
 	chmod 755 $(DESTDIR)$(PREFIX)/bin/sbase-box
 	for f in $(BIN); do ln -sf sbase-box $(DESTDIR)$(PREFIX)/bin/"$$f"; done
 	ln -sf sbase-box $(DESTDIR)$(PREFIX)/bin/[
+	ln -sf sbase-box $(DESTDIR)$(PREFIX)/bin/bdiff
 	mv -f $(DESTDIR)$(PREFIX)/bin/xinstall $(DESTDIR)$(PREFIX)/bin/install
 	mkdir -p $(DESTDIR)$(MANPREFIX)/man1
 	for m in $(MAN); do sed "s/^\.Os sbase/.Os sbase $(VERSION)/g" < "$$m" > $(DESTDIR)$(MANPREFIX)/man1/"$$m"; done
diff --git a/README b/README
index cddf485..4f246cf 100644
--- a/README
+++ b/README
_AT_@ -13,6 +13,7 @@ The following tools are implemented:
      UTILITY     MISSING
      -------     -------
 =*|o basename    .
+=* x bdiff       .
 =*|o cal         .
 =*|o cat         .
 =*|o chgrp       .
_AT_@ -27,6 +28,7 @@ The following tools are implemented:
 =*|x cron        .
 #*|o cut         .
 =*|o date        .
+=* o diff        .
 =*|o dirname     .
 =*|o du          .
 =*|o echo        .
diff --git a/TODO b/TODO
index fc0730d..3d5474f 100644
--- a/TODO
+++ b/TODO
_AT_@ -6,7 +6,6 @@ sbase such as vi and sh are also not listed here.
 at
 awk
 bc
-diff
 ed manpage
 patch
 stty
diff --git a/bdiff.1 b/bdiff.1
new file mode 100644
index 0000000..1e114d6
--- /dev/null
+++ b/bdiff.1
_AT_@ -0,0 +1,84 @@
+.Dd 2016-02-21
+.Dt BDIFF 1
+.Os sbase
+.Sh NAME
+.Nm bdiff
+.Nd compare two big files
+.Sh SYNOPSIS
+.Nm
+.Op Fl c | C Ar n | Fl e | f | u | U Ar n
+.Op Fl bdDr
+.Ar file1 file2
+.Sh DESCRIPTION
+.Nm
+compares the contents of two files and prints
+the set of changes necessary to modify
+.Ar file1
+into
+.Ar file2 .
+.Pp
+Unlike
+.Xr diff 1 ,
+.Nm
+is designed to be able to compare huge files
+and files with vast about of differences. This
+comes at the cost of the result seldom being
+minimal.
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl b
+Ignore changes in trailing whitespace.
+.It Fl c
+Produce output with three lines of copied context.
+.It Fl C Ar n
+Produce output with
+.Ar n
+lines of copied context. If
+.Ar n
+is -1, use the entire file for context.
+.It Fl d
+Ignored. Exists for symmetry with
+.Xr diff 1 .
+.It Fl D
+Produce colored output if stdout, is a terminal.
+Always produce colored output if used twice or more.
+.It Fl e
+Produce output suitable as input to
+.Xr ed 1 .
+.It Fl f
+Produce output similar to the format of
+.Fl e ,
+but intended for reading by a human.
+.It Fl r
+Apply
+.Nm
+recursively to subdirectories of
+.Ar file1
+and
+.Ar file2 .
+.It Fl u
+Produce output with three lines of unified context.
+.It Fl U Ar n
+Produce output with
+.Ar n
+lines of unified context. If
+.Ar n
+is -1, use the entire file for context.
+.El
+.Sh EXIT STATUS
+.Bl -tag -width Ds
+.It 0
+The files are identical.
+.It 1
+The files are different.
+.It 3
+Binary files are different.
+.It > 3
+An error occurred.
+.El
+.Sh SEE ALSO
+.Xr cmp 1 ,
+.Xr comm 1 ,
+.Xr diff 1 ,
+.Xr ed 1 ,
+.Xr patch 1
diff --git a/diff.1 b/diff.1
new file mode 100644
index 0000000..1898c84
--- /dev/null
+++ b/diff.1
_AT_@ -0,0 +1,107 @@
+.Dd 2016-02-18
+.Dt DIFF 1
+.Os sbase
+.Sh NAME
+.Nm diff
+.Nd compare two files
+.Sh SYNOPSIS
+.Nm
+.Op Fl c | C Ar n | Fl e | f | u | U Ar n
+.Op Fl bdDr
+.Ar file1 file2
+.Sh DESCRIPTION
+.Nm
+compares the contents of two files and prints
+the minimal set of changes necessary to modify
+.Pp
+If
+.Nm
+detemines that it is too difficult to calculate
+the minimal set of changes fast enough and without
+usage too much memory,
+.Nm
+will fall back to behave like
+.Xr bdiff 1 ,
+unless
+.Fl d
+is used, and the exit status will not be 2
+instead of 1.
+.Ar file1
+into
+.Ar file2 .
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl b
+Ignore changes in trailing whitespace.
+.It Fl c
+Produce output with three lines of copied context.
+.It Fl C Ar n
+Produce output with
+.Ar n
+lines of copied context. If
+.Ar n
+is -1, use the entire file for context.
+.It Fl d
+Do not fall back to behave like
+.Xr bdiff 1 ,
+even if its appears to be impossible to calculate
+the minimal set of changes.
+.It Fl D
+Produce colored output if stdout, is a terminal.
+Always produce colored output if used twice or more.
+.It Fl e
+Produce output suitable as input to
+.Xr ed 1 .
+.It Fl f
+Produce output similar to the format of
+.Fl e ,
+but intended for reading by a human.
+.It Fl r
+Apply
+.Nm
+recursively to subdirectories of
+.Ar file1
+and
+.Ar file2 .
+.It Fl u
+Produce output with three lines of unified context.
+.It Fl U Ar n
+Produce output with
+.Ar n
+lines of unified context. If
+.Ar n
+is -1, use the entire file for context.
+.El
+.Sh EXIT STATUS
+.Bl -tag -width Ds
+.It 0
+The files are identical.
+.It 1
+The files are different, output is minimal.
+.It 2
+The files are different, output is not necessarily minimal.
+.It 3
+Binary files are different.
+.It > 3
+An error occurred.
+.El
+.Sh SEE ALSO
+.Xr bdiff 1 ,
+.Xr cmp 1 ,
+.Xr comm 1 ,
+.Xr ed 1 ,
+.Xr patch 1
+.Sh STANDARDS
+The
+.Nm
+utility is compliant with the
+.St -p1003.1-2013
+specification, except exit status 1 has been split to
+exit status 1, 2, and 3, and exit status > 1 has been
+shifted to > 3, and the output is only minimal if
+determined to be feasible to calculate with generally
+exceptable time and memory usage.
+.Pp
+The
+.Op Fl dD
+flags is an extension to that specification.
diff --git a/diff.c b/diff.c
new file mode 100644
index 0000000..f425f10
--- /dev/null
+++ b/diff.c
_AT_@ -0,0 +1,1397 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include <time.h>
+#include <errno.h>
+#include <libgen.h>
+#include <dirent.h>
+#include <sys/stat.h>
+
+#include "arg.h"
+#include "util.h"
+
+/*
+ * Lines that only appear in file1 are marked 1.
+ * Lines that only appear in file2 are marked 2.
+ * Lines that appear in both files are marked 0.
+ */
+
+/*
+ * Excluded optimisations.
+ * 
+ * POSIX mandates that the output change set is minimal
+ * (of course, this is not always possibly.) It does
+ * however not mandate that the total output is minimal,
+ * and I see no compelling reason to minimise anything
+ * but the change set. There the size of the hunks are
+ * not minimalised, which could be done by shifting
+ * lumps of changes without the same hunk.
+ * 
+ * After compressing the data set, it is possible to
+ * divide and “conquer” the problem. Find any lump
+ * that is at least as large as the sum of all other
+ * lumps. Solver the problem recursively on both sides
+ * of that lump. Ther the base case is not solved
+ * directly, but independently.
+ */
+
+#define END_OF_PATH          127
+#define NO_LF_MARK           "\n\\ No newline at end of file"
+#define COLOURED_NO_LF_MARK  "\n\033[7m\\ No newline at end of file\033[27m"
+
+#undef EXIT_FAILURE
+#define EXIT_FILES_DIFFER  1
+#define EXIT_NOT_MINIMAL  2
+#define EXIT_BINARIES_DIFFER  3
+#define EXIT_FAILURE  4
+
+#define emalloc(...)   enmalloc(EXIT_FAILURE, __VA_ARGS__)
+#define erealloc(...)  enrealloc(EXIT_FAILURE, __VA_ARGS__)
+#define eprintf(...)   enprintf(EXIT_FAILURE, __VA_ARGS__)
+#define eperror(...)   (perror(__VA_ARGS__), exit(EXIT_FAILURE))
+
+#define line_eq(a, b)         ((a)->hash == (b)->hash && !strcmp((a)->line, (b)->line))
+#define sort(base, nel, cmp)  qsort(base, nel, sizeof(*base), cmp)
+
+
+struct file_data {
+	char **lines;
+	size_t line_count; /* used as length of `lines[0]` if `is_binary` */
+	unsigned lf_terminated : 1;
+	unsigned is_binary : 1;
+	unsigned is_empty : 1;
+	struct stat attr;
+	const char *path;
+};
+
+struct trace {
+	char f;
+	int ch;
+	size_t d;
+	size_t a_len;
+	size_t b_len;
+};
+
+struct chunk {
+	size_t ai;
+	size_t bi;
+	unsigned have_a : 1;
+	unsigned have_b : 1;
+	struct trace *chunk;
+};
+
+struct line {
+	char *line;
+	char *end;
+	char old_end_char;
+	unsigned uncommon : 1;
+	size_t hash;
+	size_t compression;
+	size_t number;
+};
+
+
+static int bflag = 0;
+static int cflag = 0;
+static int eflag = 0;
+static int fflag = 0;
+static int uflag = 0;
+static int rflag = 0;
+static size_t n_context = 0;
+
+/* Minimal output or cheap comparsion? */
+static int bdiff = 0;
+static int dflag = 0;
+static int cheap_algorithm_used = 0;
+
+static int use_colour = 0;
+static int (*cprintf)(const char *, ...);
+
+
+static void
+usage(void)
+{
+	eprintf("usage: %s [-c | -C n | -e | -f | -u | -U n] [-bdDr] file1 file2\n", argv0);
+}
+
+static int
+is_minimal_feasible(size_t a, size_t b)
+{
+	size_t max = a > b ? a : b;
+
+	if (max > (SIZE_MAX / max) / 2)
+		return 0;
+
+	/* Keep in mind, this is not the size of the files, but rather the size of the
+	 * set that is compared after reduction and compression. It is important to
+	 * remember that the user may want to run diff one a large set of files, not
+	 * just one file. */
+	return (a * b) < 1000UL * 1000UL;
+}
+
+#ifdef DEBUG
+# include <time.h>
+static void
+measure_time(const char *label)
+{
+	static clock_t clocks[10];
+	static int i = 0, j = 0;
+	clock_t duration;
+	double seconds;
+	if (!label) {
+		clocks[i++] = clock();
+	} else {
+		duration = clock() - clocks[--i];
+		seconds = duration;
+		seconds /= CLOCKS_PER_SEC;
+		fprintf(stderr, "\033[1;3%imTIME MEASUREMENT [%*s%s]: %lu clocks = %lf seconds\033[m\n",
+		        (j ^= 1) ? 1 : 5, 2 * i, "", label, (unsigned long)duration, seconds);
+	}
+}
+#else
+# define measure_time(x)  ((void)0)
+#endif
+
+
+/* Functions for diff:ing directories. { */
+
+static char *
+join_paths(const char *a, const char *b)
+{
+	char *rc = emalloc(strlen(a) + strlen(b) + sizeof("/"));
+	sprintf(rc, "%s/%s", a, b);
+	return rc;
+}
+
+static const char *
+classify(struct file_data *f)
+{
+	return !f ? "directory" : f->is_empty ? "regular empty file" : "regular file";
+}
+
+static int
+is_incommensurable(mode_t mode)
+{
+	/* POSIX specifies that if a and b refer to the same special device,
+	 * there should be no comparision. This seems unnecessary since it
+	 * also specifies that special devices and FIFO:s shall not be compared.
+	 * We extend this to not compare sockets either. POSIX says that it
+	 * is implementation-specified for other types than special files,
+	 * FIFO:s, regular files and directories. */
+	return S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || S_ISSOCK(mode);
+}
+
+/* } */
+
+
+/* Comparsion functions for lines and pointers to lines. { */
+
+#define LPCMP(NAME, CMPS)\
+  static int NAME(const void *a_, const void *b_)\
+  { const struct line *const *ap = a_, *const *bp = b_; CMPS; return 0; }
+#define LCMP(NAME, CMPS)\
+  static int NAME(const void *a_, const void *b_)\
+  { const struct line *a = a_, *b = b_; CMPS; return 0; }
+#define DO(...)          do { __VA_ARGS__ } while (0)
+#define LCMP_INT_(ATTR)  DO(if (a->ATTR != b->ATTR) return (a->ATTR > b->ATTR) * 2 - 1;)
+#define LCMP_LINE        DO(int cmp = strcmp(a->line, b->line); if (cmp) return cmp;)
+#define LCMP_HASH        LCMP_INT_(hash)
+#define LCMP_NO          LCMP_INT_(number)
+
+#define a  (*ap)
+#define b  (*bp)
+LPCMP(linepcmp_hash_line_stable, LCMP_HASH; LCMP_LINE; LCMP_NO)
+#undef a
+#undef b
+
+LCMP(linecmp_hash_line_stable, LCMP_HASH; LCMP_LINE; LCMP_NO)
+LCMP(linecmp_hash_line, LCMP_HASH; LCMP_LINE)
+
+/* } */
+
+
+/* Functions for supporting -b. { */
+
+static char *
+rstrip(char *text, char *removed)
+{
+	char *end = strchr(text, '\0');
+	while ((end != text) && isspace(*--end));
+	*removed = *end, *end = '\0';
+	return end;
+}
+
+static void
+rstrip_lines(struct line *lines, size_t n)
+{
+	size_t i;
+	for (i = 0; bflag && i < n; i++)
+		lines[i].end = rstrip(lines[i].line, &(lines[i].old_end_char));
+}
+
+static void
+unrstrip_lines(struct line *lines, size_t n)
+{
+	size_t i;
+	for (i = 0; bflag && i < n; i++)
+		*(lines[i].end) = lines[i].old_end_char;
+}
+
+/* } */
+
+
+/* Auxiliary functions used by the input reducers and the differs. { */
+
+static void
+hash_lines(struct line *lines, size_t n)
+{
+	size_t hash;
+	char *str;
+	while (n--) {
+		for (hash = 0, str = lines[n].line; *str; str++)
+			hash = 31 * hash + (size_t)*str;
+		lines[n].hash = hash;
+	}
+}
+
+static void
+retain_longest_increasing_subsequence(size_t *map, size_t n)
+{
+	size_t best_length = 0, low, high, mid, p, i;
+	ssize_t j;
+
+	/* Map from i to largest j such that j < i, where j
+	 * is an included index in the same subsequence. */
+	size_t *previous = emalloc(n * sizeof(*previous));
+	/* Map from a length of a subsequence to the last
+	 * included index in such subsequence. */
+	size_t *end_of = emalloc((n + 1) * sizeof(*end_of));
+
+	/* SIZE_MAX marks unset. We do this so that
+	 * the last part of the algorithm terminates. */
+	memset(end_of, ~0, (n + 1) * sizeof(*end_of));
+
+	for (i = 0; i < n; i++) {
+		/* Find longest subsequence that can be extended. */
+		low = 1;
+		high = best_length;
+		while (low <= high) {
+			mid = (low + high + 1) / 2; /* Cannot overflow in our case. */
+			if (map[end_of[mid]] < map[i])
+				low = mid + 1;
+			else
+				high = mid - 1;
+		}
+
+		/* Extend that subsequence. */
+		previous[i] = end_of[low - 1];
+		end_of[low] = i;
+
+		/* Get the length of the longest, yet known, subsequence. */
+		if (low > best_length)
+			best_length = low;
+	  }
+
+	/* Erase elements that where not selected, so that only
+	 * the longest increasing subsequence is present. */
+	p = end_of[best_length];
+	j = (ssize_t)n - 1;
+	while (p < SIZE_MAX) {
+		while ((ssize_t)p < j)
+			map[j--] = SIZE_MAX;
+		j--;
+		p = previous[p];
+	}
+	while (j >= 0)
+		map[j--] = SIZE_MAX;
+
+	free(previous);
+	free(end_of);
+}
+
+/* } */
+
+
+/* Diff algorithm for large input. { */
+
+/* This algorithm runs in O(n log n) rather than O(n²) time, and
+ * O(n) rather than O(n²) space, and is used in the event that O(n²)
+ * is deemed too expensive. The output is far from optimal, but
+ * it does not matter too much, the program reduces the problem
+ * enough that this will probably not be used too often. */
+static char *
+diff2_cheap(struct line *a_, struct line *b_, size_t an, size_t bn)
+{
+	/*
+	 * The idea behind the algorithm.
+	 * 
+	 * For lines that appear in both files, create a map from line numbers in the old file to
+	 * line numbers in the new file. For duplicate lines that do not appear the same number of
+	 * times in both files, do not care too much.
+	 * 
+	 * The map is not ordered, as find the longest increasing subsequence (there are gaps in
+	 * subsequences) and erase the rest from the map.
+	 * 
+	 * Removes lines from the old files are indicated by missing entries in the map
+	 * (marked with SIZE_MAX in this implementation) and added lines can be inferred
+	 * from gaps in the map.
+	 */
+
+	size_t ai, bi, ri;
+	struct line *a = emalloc(an * sizeof(*a));
+	struct line *b = emalloc(bn * sizeof(*b));
+	size_t *map = emalloc(an * sizeof(*map));
+	char *rc;
+
+	memcpy(a, a_, an * sizeof(*a));
+	memcpy(b, b_, bn * sizeof(*b));
+
+	/* Remember the original position _and_ make stable sort possible. */
+	for (ai = 0; ai < an; ai++)  a[ai].number = ai;
+	for (bi = 0; bi < bn; bi++)  b[bi].number = bi;
+
+	/* Perform stable total ordering (not really proper sort) of line content. */
+	sort(a, an, linecmp_hash_line_stable);
+	sort(b, bn, linecmp_hash_line_stable);
+
+	/* Get new line numbers, and mark removed lines with SIZE_MAX. */
+	memset(map, ~0, an * sizeof(*map));
+	for (ai = bi = 0; ai < an && bi < bn;) {
+		int cmp = linecmp_hash_line(a + ai, b + bi);
+		if (!cmp)
+			map[a[ai].number] = b[bi].number;
+		ai += (cmp <= 0);
+		bi += (cmp >= 0);
+	}
+	free(a);
+	free(b);
+
+	/* Find the longest increasing subsequence of map. The map is not monotonic
+	 * unless we do this. This is done in O(n log n), just like sorting, so we
+	 * get a better result (compare to selecting naïvely) without any significant
+	 * performance penalty. */
+	retain_longest_increasing_subsequence(map, an);
+
+	/* Create change set from removed lines and gaps in new line numbers. */
+	rc = emalloc(an + bn + 1);
+	ri = an + bn;
+	for (ai = bi = 0; ai < an; ai++) {
+		if (map[ai] == SIZE_MAX) {
+			rc[ri--] = 1;
+		} else {
+			while (map[ai] > bi++)
+				rc[ri--] = 2;
+			rc[ri--] = 0;
+		}
+	}
+	free(map);
+	while (bi++ < bn)
+		rc[ri--] = 2;
+	rc[ri] = END_OF_PATH;
+	memmove(rc, rc + ri, an + bn - ri + 1);
+
+	return rc + an + bn - ri + 1;
+}
+
+/* } */
+
+
+/* Diff algorithm for minimal output. { */
+
+static char *
+diff2_minimal(struct line *a, struct line *b, size_t an, size_t bn)
+{
+	/*
+	 * This function calculates the minimal edit set for getting from `a`
+	 * to `b`. But rather than using an algorithm that calculates the
+	 * smallest changes set, this function uses an algorithm for calculating
+	 * the longest common (non-consecutive) subsequence, which is an identical
+	 * (with the allowed change operations) except for the value returned
+	 * at the end, which is not inspected anyways. This algorithms has the
+	 * advantages of requiring simpler initialisation.
+	 */
+
+	char (*map)[an + 1][bn + 1] = emalloc(sizeof(char[an + 1][bn + 1]));
+	size_t (*matrix)[2][bn + 1] = ecalloc(1, sizeof(size_t[2][bn + 1]));
+	char *rc;
+	size_t ai, bi, ri = 0, mi = 0;
+
+	memset((*map)[0], 2, bn + 1);
+
+	a--, b--;
+	for (ai = 1; ai <= an; ai++) {
+		size_t *last = (*matrix)[mi];
+		size_t *this = (*matrix)[mi ^= 1];
+		(*map)[ai][0] = 1;
+		for (bi = 1; bi <= bn; bi++) {
+			size_t u = last[bi];
+			size_t d = last[bi - 1] + 1;
+			size_t l = this[bi - 1];
+			size_t lu = l >= u ? l : u;
+			if (line_eq(a + ai, b + bi) && d > lu) {
+				/* The > in this comparison makes changes happen as late
+				 * as possible. >= would make the changes happen as early
+				 * as possible. */
+				this[bi] = d;
+				(*map)[ai][bi] = 0;
+			} else {
+				this[bi] = lu;
+				(*map)[ai][bi] = 1 + (l >= u);
+			}
+		}
+	}
+	free(matrix);
+
+	rc = emalloc(an + bn + 1);
+	rc[ri++] = END_OF_PATH;
+	for (ai = an, bi = bn; ai + bi; ri++) {
+		rc[ri] = (*map)[ai][bi];
+		ai -= rc[ri] != 2;
+		bi -= rc[ri] != 1;
+	}
+	free(map);
+
+	return rc + ri;
+}
+
+static char *
+diff2_minimal_maybe_transpose(struct line *a, struct line *b, size_t an, size_t bn)
+{
+	/*
+	 * The minimise memory usage in diff2_minimal, swap the files if it helps.
+	 */
+
+	int transpose = bn < an;
+	char trace, *path, *rc = !transpose ? diff2_minimal(a, b, an, bn) : diff2_minimal(b, a, bn, an);
+	for (path = rc; transpose && (trace = *--path) != END_OF_PATH;)
+		if (trace) *path = 3 - trace;
+	return rc;
+}
+
+/* } */
+
+
+/* Result enhancements, better output and more details for the output functions. { */
+
+static struct trace *
+enhance_path(char *path)
+{
+	char *p = path;
+	size_t len, a_len = 0, b_len = 0, i = 0, d = 0, a = 0, b = 0, j = 0;
+	int have_d = 0, ch = 0, differs = 0;
+	struct trace *rc;
+
+	while (*--p != END_OF_PATH);
+	len = (size_t)(path - p);
+	rc = ecalloc(len, sizeof(*rc));
+
+	/* Find distance from edits, and mark exchanges. (left-to-right) */
+	for (--len; i < len; i++) {
+		rc[i].f = *--path;
+		if (rc[i].f) {
+			d = 0, have_d = 1;
+			ch |= ch ? ch : (3 - rc[i].f);
+			if (rc[i].f == ch)
+				rc[i].ch = 1;
+		} else {
+			ch = 0;
+			rc[i].d = (have_d ? ++d : SIZE_MAX);
+		}
+	}
+	rc[i].f = END_OF_PATH;
+
+	/* Find distance from edits, mark exchanges, and get chunk lengths. (right-to-left) */
+	for (i = len, d = 0, ch = have_d = 0; i-- > 0;) {
+		rc[i].a_len = a_len += (rc[i].f != 2);
+		rc[i].b_len = b_len += (rc[i].f != 1);
+		if (rc[i].f) {
+			d = 0, have_d = 1;
+			ch |= ch ? ch : (3 - rc[i].f);
+			if (rc[i].f == ch)
+				rc[i].ch = 1;
+		} else {
+			ch = 0;
+			if (have_d && (d + 1) < rc[i].d)
+				rc[i].d = ++d;
+			if (rc[i].d > n_context)
+				a_len = b_len = 0;
+		}
+	}
+
+	/* Put removals before additions. */
+	for (i = 0; i < len; i++) {
+		if (rc[i].f == 0) {
+			while (a--)  rc[j++].f = 1;
+			while (b--)  rc[j++].f = 2;
+			j = i + 1, a = b = 0;
+		} else if (rc[i].f == 1) {
+			a++, differs = 1;
+		} else {
+			b++, differs = 1;
+		}
+	}
+	while (a--)  rc[j++].f = 1;
+	while (b--)  rc[j++].f = 2;
+
+	free(p);
+	if (!differs) {
+		free(rc);
+		return 0;
+	}
+	return rc;
+}
+
+/* } */
+
+
+/* When you cannot reduce the time or space complexity, reduce the input. { */
+
+/*
+ * Anything that can be executed in O(n log n) time and space is fine.
+ * If it is above that, it is probably not a good idea. Anything that
+ * is above o(n²) time or space is pointless.
+ */
+
+static void
+reduce_problem(struct line **ap, struct line **bp, size_t *anp, size_t *bnp, size_t *start, size_t *end)
+{
+	size_t skip_start = 0, skip_end = 0, an = *anp, bn = *bnp;
+	struct line *a = *ap, *b = *bp;
+
+	/* Reduce problem set, by skipping identical head. */
+	for (skip_start = 0; skip_start < an && skip_start < bn; skip_start++)
+		if (strcmp(a[skip_start].line, b[skip_start].line))
+			break;
+
+	a += skip_start, an -= skip_start;
+	b += skip_start, bn -= skip_start;
+
+	/* Reduce problem set, by skipping identical tail. */
+	for (skip_end = 0; an && bn; an--, bn--, skip_end++)
+		if (strcmp(a[an - 1].line, b[bn - 1].line))
+			break;
+
+	*ap = a, *anp = an, *start = skip_start;
+	*bp = b, *bnp = bn, *end = skip_end;
+}
+
+static void
+unreduce_problem(char **pathp, struct line **ap, struct line **bp, size_t skip_start, size_t skip_end)
+{
+	char *path = *pathp;
+	size_t path_len;
+
+	if (!skip_start && !skip_end)
+		return;
+
+	while (*--path != END_OF_PATH);
+	path_len = (size_t)(*pathp - path);
+	path = erealloc(path, skip_end + path_len + skip_start);
+	if (skip_end) {
+		memmove(path + skip_end + 1, path + 1, path_len - 1);
+		memset(path + 1, 0, skip_end);
+	}
+	memset(path + skip_end + path_len, 0, skip_start);
+	*pathp = path + skip_end + path_len + skip_start;
+
+	*ap -= skip_start;
+	*bp -= skip_start;
+}
+
+
+static void
+compress_problem_1(struct line *a, struct line *b, size_t an, size_t bn,
+                   struct line **acp, struct line **bcp, size_t *acnp, size_t *bcnp)
+{
+	/*
+	 * The idea behind the algorithm.
+	 * 
+	 * For lines that appear in both files the same number of times, create a
+	 * map from line numbers in the old file to line numbers in the new file.
+	 * 
+	 * The map is not ordered, as find the longest increasing subsequence
+	 * (there are gaps in subsequences) and erase the rest from the map.
+	 * 
+	 * Inspect the map for gaps in the old line numbers and the new line numbers.
+	 * The gaps can be lumped together, but non-unique lines have to divide the
+	 * lumps. Parts between gaps can be lumped together too.
+	 */
+
+	size_t ai, bi, ao, bo;
+	struct line **a_sorted = emalloc(an * sizeof(*a_sorted));
+	struct line **b_sorted = emalloc(bn * sizeof(*b_sorted));
+	struct line *ac, *bc;
+	size_t *map = emalloc(an * sizeof(*map));
+
+	for (ai = 0; ai < an; ai++)  (a_sorted[ai] = a + ai)->number = ai;
+	for (bi = 0; bi < bn; bi++)  (b_sorted[bi] = b + bi)->number = bi;
+
+	sort(a_sorted, an, linepcmp_hash_line_stable);
+	sort(b_sorted, bn, linepcmp_hash_line_stable);
+
+	memset(map, ~0, an * sizeof(*map));
+	for (ai = bi = 0; ai < an && bi < bn;) {
+		int cmp = linecmp_hash_line(a_sorted[ai], b_sorted[bi]);
+		if (cmp < 0) {
+			a_sorted[ai++]->uncommon = 1;
+		} else if (cmp > 0) {
+			b_sorted[bi++]->uncommon = 1;
+		} else {
+			ao = ai, bo = bi;
+			while (++ai < an && line_eq(a_sorted[ai], a_sorted[ai - 1]));
+			while (++bi < bn && line_eq(b_sorted[bi], b_sorted[bi - 1]));
+			if (ai - ao == bi - bo)
+				while (ao < ai)
+					map[a_sorted[ao++]->number] = b_sorted[bo++]->number;
+		}
+	}
+	while (ai < an)  a_sorted[ai++]->uncommon = 1;
+	while (bi < bn)  b_sorted[bi++]->uncommon = 1;
+
+	free(a_sorted);
+	free(b_sorted);
+
+	retain_longest_increasing_subsequence(map, an);
+
+	for (ai = bi = ao = bo = 0; ai < an; ai++) {
+		if (map[ai] == SIZE_MAX) {
+			if (ai > 0 && a[ai - 1].uncommon && a[ai].uncommon) {
+				a[ao].compression += 1 + a[ai].compression;
+			} else if (a[ai].uncommon) {
+				ao = ai;
+			}
+		} else if (ai > 0 && map[ai] > 0 && map[ai - 1] == map[ai] - 1) {
+			a[ao].compression += 1 + a[ai].compression;
+			b[bo].compression += 1 + b[bi].compression;
+			bi++;
+		} else {
+			for (bo = SIZE_MAX; bi < map[ai]; bi++)
+				if (!b[bi].uncommon)
+					bo = SIZE_MAX;
+				else if (bo == SIZE_MAX)
+					bo = bi;
+				else
+					b[bo].compression += 1 + b[bi].compression;
+			ao = ai, bo = bi;
+		}
+	}
+	for (bo = SIZE_MAX; bi < bn; bi++)
+		if (!b[bi].uncommon)
+			bo = SIZE_MAX;
+		else if (bo == SIZE_MAX)
+			bo = bi;
+		else
+			b[bo].compression += 1 + b[bi].compression;
+
+	free(map);
+
+	*acp = ac = emalloc(an * sizeof(*ac));
+	*bcp = bc = emalloc(bn * sizeof(*bc));
+
+	for (ai = ao = 0; ai < an; ai++, ao++)  ai += (ac[ao] = a[ai]).compression;
+	for (bi = bo = 0; bi < bn; bi++, bo++)  bi += (bc[bo] = b[bi]).compression;
+
+	*acnp = ao;
+	*bcnp = bo;
+}
+
+static void
+compress_problem(struct line *a, struct line *b, size_t an, size_t bn,
+                 struct line **acp, struct line **bcp, size_t *acnp, size_t *bcnp)
+{
+	if (bdiff) {
+		*acp = a, *acnp = an;
+		*bcp = b, *bcnp = bn;
+		return;
+	}
+
+	compress_problem_1(a, b, an, bn, acp, bcp, acnp, bcnp);
+}
+
+static void
+decompress_problem(size_t an, size_t bn, struct line *a_compressed, struct line *b_compressed, char **pathp)
+{
+	char *path = *pathp;
+	char *new_path;
+	char trace;
+	size_t ai = 0, bi = 0, pi = an + bn;
+
+	if (bdiff)
+		return;
+
+	new_path = emalloc(an + bn + 1);
+
+	while ((trace = *--path) != END_OF_PATH) {
+		size_t dups = (trace == 1 ? (a_compressed + ai) : (b_compressed + bi))->compression;
+		do
+			new_path[pi--] = trace;
+		while (dups-- > 0);
+		ai += trace != 2;
+		bi += trace != 1;
+	}
+	new_path[pi] = END_OF_PATH;
+	memmove(new_path, new_path + pi, an + bn - pi + 1);
+
+	*pathp = new_path + an + bn - pi + 1;
+
+	free(path);
+	free(a_compressed);
+	free(b_compressed);
+}
+
+/* } */
+
+
+/* diff implementation for files. {  */
+
+static struct trace *
+diff2(char **a, char **b, size_t an, size_t bn)
+{
+	size_t skip_start, skip_end, i, an_compressed, bn_compressed;
+	char *path;
+	struct line *a_compressed, *b_compressed;
+	struct line *a_lines = ecalloc(an, sizeof(*a_lines));
+	struct line *b_lines = ecalloc(bn, sizeof(*b_lines));
+	char *(*algorithm)(struct line *, struct line *, size_t, size_t);
+
+	measure_time(0);
+
+	for (i = 0; i < an; i++)  a_lines[i].line = a[i];
+	for (i = 0; i < bn; i++)  b_lines[i].line = b[i];
+
+	rstrip_lines(a_lines, an);
+	rstrip_lines(b_lines, bn);
+
+	reduce_problem(&a_lines, &b_lines, &an, &bn, &skip_start, &skip_end);
+
+	hash_lines(a_lines, an);
+	hash_lines(b_lines, bn);
+
+	measure_time(0);
+	compress_problem(a_lines, b_lines, an, bn, &a_compressed, &b_compressed, &an_compressed, &bn_compressed);
+	measure_time("compress_problem");
+
+	if (bdiff) {
+		algorithm = diff2_cheap;
+	} else if (dflag || is_minimal_feasible(an_compressed, bn_compressed)) {
+		algorithm = diff2_minimal_maybe_transpose;
+	} else {
+		algorithm = diff2_cheap;
+		cheap_algorithm_used = 1;
+	}
+	measure_time(0);
+	path = algorithm(a_compressed, b_compressed, an_compressed, bn_compressed);
+	measure_time("diff");
+
+	decompress_problem(an, bn, a_compressed, b_compressed, &path);
+
+	unreduce_problem(&path, &a_lines, &b_lines, skip_start, skip_end);
+
+	unrstrip_lines(a_lines, an);
+	unrstrip_lines(b_lines, bn);
+
+	measure_time("diff2");
+
+	free(a_lines);
+	free(b_lines);
+	return enhance_path(path);
+}
+
+/* } */
+
+
+/* Functions for producing diff output. { */
+
+static char *
+get_time_string(const struct stat *attr)
+{
+	static char buf[sizeof("0000-00-00 00:00:00.000000000 +0000")];
+	struct tm *tm;
+
+	tm = localtime(&(attr->st_mtime));
+	if (tm == NULL)
+		eperror("localtime");
+
+#ifdef st_mtime
+	strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S.000000000 %z", tm);
+	sprintf(buf + (sizeof("0000-00-00 00:00:00.") - 1), "%09lu", attr->st_mtim.tv_nsec);
+	buf[sizeof("0000-00-00 00:00:00.") - 1 + 9] = ' ';
+#else
+	strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S %z", tm);
+#endif
+	return buf;
+}
+
+static void
+get_diff_chunks(struct trace *path, size_t an, size_t bn, struct chunk **headp, struct chunk **tailp)
+{
+	struct trace trace;
+	size_t ai, bi;
+	int suppressed = 1, have_a = 0, have_b = 0;
+	struct chunk *head = *headp;
+
+	head = ecalloc(an + bn + 1, sizeof(*head));
+	*tailp = head++;
+
+	for (ai = bi = 0; (trace = *path++).f != END_OF_PATH;) {
+		if (trace.d > n_context) {
+			suppressed = 1;
+			if (head->chunk) {
+				head->have_a = have_a;
+				head->have_b = have_b;
+				head++;
+			}
+			have_a = have_b = 0;
+			goto next;
+		}
+		if (suppressed) {
+			head->ai = ai;
+			head->bi = bi;
+			head->chunk = path - 1;
+		}
+		have_a |= trace.f == 1;
+		have_b |= trace.f == 2;
+		suppressed = 0;
+	next:
+		ai += trace.f != 2;
+		bi += trace.f != 1;
+	}
+	if (head->chunk) {
+		head->have_a = have_a;
+		head->have_b = have_b;
+		head++;
+	}
+
+	*headp = head;
+}
+
+#define OUTPUT_BEGIN\
+	struct trace *path;\
+	size_t ai, bi;\
+	int have_a = 0, have_b = 0;\
+	struct trace *chunk;\
+	struct trace *chunk_old;\
+	struct chunk *head;\
+	struct chunk *tail;\
+	char **a = old->lines;\
+	char **b = new->lines\
+
+#define OUTPUT_DIFF\
+	path = diff2(old->lines, new->lines, old->line_count, new->line_count);\
+	if (!path)\
+		return 0
+
+#define OUTPUT_HEAD(A, B)\
+	cprintf("\033[1m"A" %s\033[21m\t%s\033[0m\n", old->path, get_time_string(&(old->attr)));\
+	cprintf("\033[1m"B" %s\033[21m\t%s\033[0m\n", new->path, get_time_string(&(new->attr)))
+
+#define OUTPUT_QUEUE\
+	get_diff_chunks(path, old->line_count, new->line_count, &head, &tail);\
+	(void) chunk_old, (void) a, (void) b;\
+	for (head = tail;;) {\
+		head++;\
+		ai = head->ai;\
+		bi = head->bi;\
+		have_a = head->have_a;\
+		have_b = head->have_b;\
+		chunk = head->chunk;\
+		if (!chunk)\
+			break
+
+#define OUTPUT_STACK\
+	get_diff_chunks(path, old->line_count, new->line_count, &head, &tail);\
+	(void) chunk_old, (void) a, (void) b;\
+	for (;;) {\
+		head--;\
+		ai = head->ai;\
+		bi = head->bi;\
+		have_a = head->have_a;\
+		have_b = head->have_b;\
+		chunk = head->chunk;\
+		if (!chunk)\
+			break
+
+#define OUTPUT_END\
+	}\
+	free(tail);\
+	free(path);\
+	return EXIT_FILES_DIFFER
+
+static int
+output_unified(struct file_data *old, struct file_data *new)
+{
+	struct trace *path;
+	struct trace *path_;
+	struct trace trace;
+	size_t ai, bi;
+	char **a;
+	char **b;
+	int suppressed = 1;
+
+	OUTPUT_DIFF;
+	OUTPUT_HEAD("---", "+++");
+
+	a = old->lines, b = new->lines, path_ = path;
+	for (ai = bi = 0; (trace = *path++).f != END_OF_PATH;) {
+		char f = trace.f;
+		if (trace.d > n_context) {
+			suppressed = 1;
+			goto next;
+		}
+		if (suppressed) {
+			suppressed = 0;
+			cprintf("\033[1;34m_AT_@ -%zu", ai + 1 - !trace.a_len);
+			if (trace.a_len != 1)
+				printf(",%zu", trace.a_len);
+			printf(" +%zu", bi + 1 - !trace.b_len);
+			if (trace.b_len != 1)
+				printf(",%zu", trace.b_len);
+			cprintf(" _AT_@\033[m\n");
+		}
+		if (f == 0)
+			printf(" %s\n", a[ai]);
+		else
+			cprintf("\033[3%im%c%s\033[m\n", (int)f, " -+"[(int)f], f == 1 ? a[ai] : b[bi]);
+	next:
+		ai += f != 2;
+		bi += f != 1;
+	}
+
+	free(path_);
+	return EXIT_FILES_DIFFER;
+}
+
+static int
+output_copied(struct file_data *old, struct file_data *new)
+{
+#define PRINT_PART(L, C, S, A)\
+	cprintf("\033[1;3"#C"m"A" %zu", L##i + 1 - (!have_##L));\
+	if (chunk->L##_len > 1)\
+		printf(",%zu", L##i + chunk->L##_len);\
+	cprintf(" "A"\033[m\n");\
+	for (; have_##L && chunk->f != END_OF_PATH && chunk->d <= n_context; chunk++) {\
+		if (chunk->f == 0)\
+			printf("  %s\n", L[L##i]);\
+		else if (chunk->f != (3 - C))\
+			cprintf("\033[3%im%c %s\033[m\n", chunk->ch ? 3 : C, S"!"[chunk->ch], L[L##i]);\
+		L##i += chunk->f != (3 - C);\
+	}
+
+	OUTPUT_BEGIN;
+	OUTPUT_DIFF;
+	OUTPUT_HEAD("***", "---");
+	OUTPUT_QUEUE;
+
+	cprintf("\033[1;34m***************\033[m\n");
+	chunk_old = chunk;
+	PRINT_PART(a, 1, "-", "***");
+	chunk = chunk_old;
+	PRINT_PART(b, 2, "+", "---");
+
+	OUTPUT_END;
+#undef PRINT_PART
+}
+
+static int
+output_default(struct file_data *old, struct file_data *new)
+{
+#define PRINT_PART(L, C, S)\
+	for (; have_##L && chunk->f != END_OF_PATH && chunk->d <= n_context; chunk++) {\
+		if (chunk->f == 0)\
+			printf("  %s\n", L[L##i]);\
+		else if (chunk->f != (3 - C))\
+			cprintf("\033[3"#C"m"S" %s\033[m\n", L[L##i]);\
+		L##i += chunk->f != (3 - C);\
+	}
+
+	OUTPUT_BEGIN;
+	OUTPUT_DIFF;
+	OUTPUT_QUEUE;
+
+	cprintf("\033[1;34m%zu", ai + 1 - !have_a);
+	if (chunk->a_len > 1)
+		printf(",%zu", ai + chunk->a_len);
+	printf("%c", " dac"[have_a + 2 * have_b]);
+	printf("%zu", bi + 1 - !have_b);
+	if (chunk->b_len > 1)
+		printf(",%zu", bi + chunk->b_len);
+	cprintf("\033[m\n");
+
+	chunk_old = chunk;
+	PRINT_PART(a, 1, "<");
+	if (have_a && have_b)
+		cprintf("\033[34m---\033[m\n");
+	chunk = chunk_old;
+	PRINT_PART(b, 2, ">");
+
+	OUTPUT_END;
+#undef PRINT_PART
+}
+
+static int
+output_ed(struct file_data *old, struct file_data *new)
+{
+	OUTPUT_BEGIN;
+	OUTPUT_DIFF;
+	OUTPUT_STACK;
+	if (!have_b) {
+		printf("%zud\n", ai + 1);
+	} else {
+		int have_dot = 0;
+		printf("%zu", ai + 1 - !have_a);
+		if (chunk->a_len > 1)
+			printf(",%zu", ai + chunk->a_len);
+		printf("%c\n", "ac"[chunk->ch]);
+		for (; chunk->f != END_OF_PATH && chunk->d <= n_context; chunk++) {
+			if (chunk->f != 1)
+				cprintf("\033[3%im%s%s\033[m\n", chunk->ch ? 3 : 2,
+				        b[bi][0] == '.' ? "." : "", b[bi]);
+			have_dot = (chunk->f == 2 && b[bi][0] == '.');
+			if (have_dot)
+				printf(".\ns/.//\na\n");
+			bi += chunk->f != 1;
+		}
+		if (!have_dot)
+			printf(".\n");
+	}
+	OUTPUT_END;
+}
+
+static int
+output_ed_alternative(struct file_data *old, struct file_data *new)
+{
+	OUTPUT_BEGIN;
+	OUTPUT_DIFF;
+	OUTPUT_QUEUE;
+	if (!have_b) {
+		printf("d%zu\n", ai + 1);
+	} else {
+		printf("%c%zu", "ac"[chunk->ch], ai + 1 - !have_a);
+		if (chunk->a_len > 1)
+			printf(" %zu", ai + chunk->a_len);
+		printf("\n");
+		for (; chunk->f != END_OF_PATH && chunk->d <= n_context; chunk++) {
+			if (chunk->f != 1)
+				cprintf("\033[3%im%s\033[m\n", chunk->ch ? 3 : 2, b[bi]);
+			bi += chunk->f != 1;
+		}
+		printf(".\n");
+	}
+	OUTPUT_END;
+}
+
+/* } */
+
+
+static struct file_data *
+load_lines(const char *pathname)
+{
+	int fd, bin = 0;
+	char *buffer;
+	char *p;
+	char *end;
+	size_t ptr, size, n;
+	ssize_t m;
+	struct file_data* rc;
+	struct stat attr;
+
+	p = strrchr(pathname, '/');
+	if (p && !p[1])
+		return 0;
+
+	fd = strcmp(pathname, "-") ? open(pathname, O_RDONLY) : STDIN_FILENO;
+	if (fd == -1) {
+		if (errno == EISDIR)
+			return 0;
+		eperror(pathname);
+	}
+
+	if (fstat(fd, &attr))
+		eperror(pathname);
+	if (S_ISDIR(attr.st_mode))
+		return 0;
+
+	ptr = 0;
+	size = attr.st_blksize ? attr.st_blksize : 8096;
+	buffer = emalloc(size + 1);
+	for (;;) {
+		if (ptr == size)
+			buffer = erealloc(buffer, (size <<= 1) + 1);
+		m = read(fd, buffer + ptr, size - ptr);
+		if (m < 0)
+			eperror(pathname);
+		if (m == 0)
+			break;
+		ptr += (size_t)m;
+	}
+	buffer[ptr] = 0;
+
+	for (n = 1, p = buffer;; n += 1) {
+		char *lf = strchr(p, '\n');
+		if (!lf)
+			break;
+		p = lf + 1;
+	}
+	bin = (strchr(p, '\0') != buffer + ptr);
+
+	/* This is a bit ugly, if not done this way, it would require unnecessarily many
+	 * malloc:s to create rc and unnecessarily many free:s to destroy it. */
+	rc = erealloc(buffer, sizeof(*rc) + (n + 1) * sizeof(char *) + (ptr + 1 + sizeof(COLOURED_NO_LF_MARK)));
+	buffer = ((char *)rc) + sizeof(*rc) + (n + 1) * sizeof(char *);
+	memmove(buffer, rc, ptr);
+	rc->lines = (char **)((char *)rc + sizeof(*rc));
+	rc->lf_terminated = ptr && buffer[ptr - 1] == '\n';
+	rc->line_count = bin ? ptr : (n -= rc->lf_terminated);
+	buffer[ptr - rc->lf_terminated] = 0;
+	rc->attr = attr;
+	rc->path = pathname;
+	rc->is_binary = bin;
+	rc->is_empty = (ptr == 0);
+
+	close(fd);
+
+	n = bin ? n : 1;
+	rc->lines[n] = 0;
+	if (bin) {
+		rc->lines[0] = buffer;
+	} else {
+		for (ptr = 0, p = buffer; p; p = end) {
+			end = strchr(p, '\n');
+			if (end)
+				*end++ = 0;
+			rc->lines[ptr++] = p;
+		}
+	}
+
+	return rc;
+}
+
+static int
+do_binaries_differ(struct file_data *old, struct file_data *new)
+{
+	struct file_data *f = old;
+	do if (!f->is_binary) {
+		char **lines = f->lines;
+		size_t len = 0, part_len;
+		for (; *lines; lines++) {
+			len += 1 + (part_len = strlen(*lines));
+			(*lines)[part_len] = '\n';
+		}
+		f->line_count = len - !f->lf_terminated;
+	} while (f == old ? (f = new) : (void*)0);
+
+	if (old->line_count != new->line_count)
+		return 1;
+
+	return memcmp(old->lines[0], new->lines[0], old->line_count);
+}
+
+static int
+compare_files(struct file_data *old, struct file_data *new)
+{
+	int ret;
+
+	if (old->is_binary || new->is_binary) {
+		if (do_binaries_differ(old, new)) {
+			cprintf("\033[1mBinary files %s and %s differ\033[m\n", old->path, new->path);
+			return EXIT_BINARIES_DIFFER;
+		}
+		return 0;
+	}
+
+	if (!eflag && !fflag) {
+		if (!old->lf_terminated)
+			strcat(old->lines[old->line_count - 1], use_colour ? COLOURED_NO_LF_MARK : NO_LF_MARK);
+		if (!new->lf_terminated)
+			strcat(new->lines[new->line_count - 1], use_colour ? COLOURED_NO_LF_MARK : NO_LF_MARK);
+	}
+
+	ret = (uflag ? output_unified :
+	       cflag ? output_copied :
+	       eflag ? output_ed :
+	       fflag ? output_ed_alternative :
+	               output_default)(old, new);
+
+	if ((eflag || fflag) && 1 <= ret && ret <= 2) {
+		if (!old->lf_terminated)
+			ret = 2, fprintf(stderr, "%s: %s: No newline at end of file\n\n", argv0, old->path);
+		if (!new->lf_terminated)
+			ret = 2, fprintf(stderr, "%s: %s: No newline at end of file\n\n", argv0, new->path);
+	}
+
+	return ret;
+}
+
+static int
+compare_directories(const char *old, const char *new, const char *diff_line)
+{
+	int ret = 0, r, i = 0, j = 1;
+	DIR *dir;
+	const char *paths[2] = { old, new };
+	struct dirent *file;
+	struct file_data *a;
+	struct file_data *b;
+	char *b_path;
+	char *a_path;
+	struct stat a_attr;
+	struct stat b_attr;
+
+again:
+	dir = opendir(paths[i]);
+	if (!dir)
+		eperror(paths[i]);
+	while ((errno = 0, file = readdir(dir))) {
+		if (!strcmp(file->d_name, ".") || !strcmp(file->d_name, ".."))
+			continue;
+		b_path = join_paths(paths[j], file->d_name);
+		if (access(b_path, F_OK)) {
+			cprintf("\033[1mOnly in %s: %s\033[m\n", paths[i], file->d_name);
+			ret = ret > EXIT_FILES_DIFFER ? ret : EXIT_FILES_DIFFER;
+			goto next;
+		} else if (i == 1) {
+			goto next;
+		}
+		a_path = join_paths(paths[i], file->d_name);
+
+		if (stat(a_path, &a_attr))  eperror(a_path);
+		if (stat(b_path, &b_attr))  eperror(b_path);
+
+		if (a_attr.st_dev == b_attr.st_dev && a_attr.st_ino == b_attr.st_ino)
+			goto skip;
+		if (is_incommensurable(a_attr.st_mode) || is_incommensurable(b_attr.st_mode))
+			goto skip;
+
+		a = load_lines(a_path);
+		b = load_lines(b_path);
+
+		if (!a ^ !b) {
+			printf("\033[1mFile %s is a %s while file %s is a %s\033[m\n",
+			       a_path, classify(a), b_path, classify(b));
+			r = EXIT_FILES_DIFFER;
+		} else if (!a && !b && !rflag) {
+			printf("\033[1mCommon subdirectories: %s and %s\033[m\n", a_path, b_path);
+			r = 0;
+		} else if (!a && !b) {
+			r = compare_directories(a_path, b_path, diff_line);
+		} else {
+			printf("\033[1m%s %s %s\033[m\n", diff_line, a_path, b_path);
+			r = compare_files(a, b);
+		}
+		ret = ret > r ? ret : r;
+
+		free(a);
+		free(b);
+	skip:
+		free(a_path);
+	next:
+		free(b_path);
+	}
+	if (errno)
+		eperror("readdir");
+	closedir(dir);
+
+
+	if (i)
+		return ret;
+	i = 1, j = 0;
+	goto again;
+}
+
+int
+main(int argc, char *argv[])
+{
+	struct file_data *old;
+	struct file_data *new;
+	char *old_proper = 0;
+	char *new_proper = 0;
+	int ret;
+	char *diff_line = 0;
+	char *p;
+
+	/* Construct the 'diff options file1 file2' line used diff:ing directories. */
+	if (argc > 2) {
+		size_t len = 0;
+		int i;
+		p = strrchr(argv[0], '/');
+		if (p)
+			argv[0] = p + 1;
+		for (i = 0; i < argc - 2; i++)
+			len += strlen(argv[i]) + 1;
+		p = diff_line = emalloc(len + 1);
+		for (i = 0; i < argc - 2; i++)
+			p += sprintf(p, "%s ", argv[i]);
+		p[-1] = 0;
+	}
+
+	ARGBEGIN {
+	case 'b':  bflag++;  break;
+	case 'c':  cflag++;  n_context = 3;                     break;
+	case 'C':  cflag++;  n_context = atol(EARGF(usage()));  break;
+	case 'e':  eflag++;  break;
+	case 'f':  fflag++;  break;
+	case 'u':  uflag++;  n_context = 3;                     break;
+	case 'U':  uflag++;  n_context = atol(EARGF(usage()));  break;
+	case 'r':  rflag++;  break;
+	case 'd':  dflag++;  break;
+	case 'D':  use_colour++;  break;
+	default:
+		usage();
+	} ARGEND;
+	/* Use of `atol` is intentional, '-U -1' and '-C -1' shall display the entire file.
+	 * This is a not specified in POSIX, but appears in other implementations and is
+	 * useful whilst removing complexity. */
+
+	if (argc != 2 || (bflag | rflag) > 1 || cflag + eflag + fflag + uflag > 1)
+		usage();
+
+	if (!strcmp(argv0, "bdiff")) {
+		bdiff = 1;
+	} else {
+		p = strrchr(argv0, '/');
+		if (p && !strcmp(p, "/bdiff"))
+			bdiff = 1;
+	}
+
+	use_colour = use_colour == 1 ? isatty(STDOUT_FILENO) : use_colour;
+	cprintf = use_colour ? printf : ncprintf;
+
+redo:
+	old = load_lines(old_proper ? old_proper : argv[0]);
+	new = load_lines(new_proper ? new_proper : argv[1]);
+
+	if ((old_proper || new_proper) && (!old || !new)) {
+		printf("\033[1mFile %s is a %s while file %s is a %s\033[m\n",
+		       old_proper ? old_proper : argv[0], classify(old),
+		       new_proper ? new_proper : argv[1], classify(new));
+		ret = 1;
+	} else if (!old && new) {
+		old_proper = join_paths(argv[0], basename(argv[1]));
+		goto redo;
+	} else if (old && !new) {
+		new_proper = join_paths(argv[1], basename(argv[0]));
+		goto redo;
+	} else if (!old && !new) {
+		ret = compare_directories(argv[0], argv[1], diff_line);
+	} else {
+		ret = compare_files(old, new);
+	}
+
+	if (fshut(stdout, "<stdout>"))
+		ret = EXIT_FAILURE;
+
+	free(old);
+	free(new);
+	free(old_proper);
+	free(new_proper);
+	free(diff_line);
+
+	if (!bdiff && cheap_algorithm_used && ret == EXIT_FILES_DIFFER)
+		ret = EXIT_NOT_MINIMAL;
+
+	return ret;
+}
diff --git a/libutil/ncprintf.c b/libutil/ncprintf.c
new file mode 100644
index 0000000..f745cfe
--- /dev/null
+++ b/libutil/ncprintf.c
_AT_@ -0,0 +1,50 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include "../util.h"
+
+/* Variant of printf that removes colors from the format string. Only CSI m is
+ * recognized, no other appearances of ESC may occour. %i, %u and %c, but not
+ * other %-codes, may be used inside the CSI m sequence, but only if they refer
+ * to the very first arguments after the format string. Not thread-safe. */
+int
+ncprintf(const char *format, ...)
+{
+	static const char *prev_format = 0;
+	static size_t skips = 0;
+	static char fmt[128];
+	size_t r = 0, w = 0;
+	va_list args;
+	int rc, escape = 0;
+
+	va_start(args, format);
+
+	if (format == prev_format)
+		goto print;
+
+	prev_format = format;
+
+	skips = 0;
+	for (;; r++) {
+		if (escape) {
+			escape = (format[r] != 'm');
+			skips += (format[r] == '%');
+		} else if (format[r] == '\033') {
+			escape = 1;
+		} else {
+			if (!(fmt[w++] = format[r]))
+				break;
+		}
+	}
+	fmt[w] = 0;
+
+print:
+	for (r = skips; r--;)
+		(void) va_arg(args, int);
+
+	rc = vprintf(fmt, args);
+	va_end(args);
+	return rc;
+}
diff --git a/util.h b/util.h
index 4c973d1..62fd9ea 100644
--- a/util.h
+++ b/util.h
_AT_@ -76,3 +76,4 @@ long long enstrtonum(int, const char *, long long, long long);
 long long estrtonum(const char *, long long, long long);
 size_t unescape(char *);
 int mkdirp(const char *);
+int ncprintf(const char *, ...);
-- 
2.7.1
Received on Wed Feb 24 2016 - 11:26:44 CET

This archive was generated by hypermail 2.3.0 : Wed Feb 24 2016 - 11:36:09 CET