[hackers] [sbase][PATCH] du: Dedup hardlinks

From: remph <lhr_AT_disroot.org>
Date: Sun, 2 Mar 2025 18:47:44 +0000

Conform to POSIX, which says `Files with multiple links shall be counted
and written for only one entry,' in the 2008[1] and 2013[2] editions, and
uses more words to say the same thing in the 2017[3] and 2024[4] editions.
This patch also keeps inodes between operands and dedups symlinks if
applicable, which are implementation-defined in 2017 and required in 2024.
See also the `RATIONALE' section in the 2024 edition.

[1] https://pubs.opengroup.org/onlinepubs/9699919799.2008edition/utilities/du.html
[2] https://pubs.opengroup.org/onlinepubs/9699919799.2013edition/utilities/du.html
[3] https://pubs.opengroup.org/onlinepubs/9699919799/utilities/du.html
[4] https://pubs.opengroup.org/onlinepubs/9799919799/utilities/du.html

---
 du.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
diff --git a/du.c b/du.c
index 1815a02..be073a9 100644
--- a/du.c
+++ b/du.c
_AT_@ -5,6 +5,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <limits.h>
+#include <search.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
_AT_@ -20,6 +21,13 @@ static int aflag = 0;
 static int sflag = 0;
 static int hflag = 0;
 
+static void * devtree = NULL;
+
+struct device {
+	dev_t devno;
+	void * inodetree;
+};
+
 static void
 printpath(off_t n, const char *path)
 {
_AT_@ -35,6 +43,58 @@ nblks(blkcnt_t blocks)
 	return (512 * blocks + blksize - 1) / blksize;
 }
 
+static int
+compare_dev(const void *a, const void *b)
+{
+	const struct device *A = a, *B = b;
+	if (A->devno == B->devno)
+		return 0;
+	if (A->devno > B->devno)
+		return -1;
+	else /* A->devno < B->devno */
+		return 1;
+}
+static int
+compare_ino(const void *a, const void *b)
+{
+	ino_t A = *(ino_t*)a, B = *(ino_t*)b;
+	return A == B ? 0 : A > B ? -1 : 1;
+}
+
+static int
+isduplicate(dev_t dev, ino_t ino)
+{
+	struct device dev_branch = { dev, NULL };
+	struct device ** found = tsearch(&dev_branch, &devtree, compare_dev);
+
+	if (!found)
+		eprintf("%s:", argv0);
+
+	if (*found == &dev_branch) {
+		ino_t *ino_p;
+		/* new device added */
+		*found = emalloc(sizeof **found);
+		**found = dev_branch;
+		/* init with this inode */
+		ino_p = emalloc(sizeof *ino_p);
+		*ino_p = ino;
+		if (!tsearch(ino_p, &(**found).inodetree, compare_ino))
+			eprintf("%s:", argv0);
+	} else {
+		/* not new; see if the inode is here */
+		ino_t **found_ino = tsearch(&ino, &(**found).inodetree, compare_ino);
+		if (!found_ino)
+			eprintf("%s:", argv0);
+		if (*found_ino == &ino) {
+			*found_ino = emalloc(sizeof *found_ino);
+			**found_ino = ino;
+		} else
+			return 1;
+	}
+
+	return 0;
+}
+
 static void
 du(int dirfd, const char *path, struct stat *st, void *data, struct recursor *r)
 {
_AT_@ -43,8 +103,13 @@ du(int dirfd, const char *path, struct stat *st, void *data, struct recursor *r)
 	subtotal = nblks(st->st_blocks);
 	if (S_ISDIR(st->st_mode))
 		recurse(dirfd, path, &subtotal, r);
+	else if (r->follow != 'P' || st->st_nlink > 1)
+		if (isduplicate(st->st_dev, st->st_ino))
+			goto print;
+
 	*total += subtotal;
 
+print:
 	if (!r->depth)
 		printpath(*total, r->path);
 	else if (!sflag && r->depth <= maxdepth && (S_ISDIR(st->st_mode) || aflag))
-- 
2.48.1
Received on Sun Mar 02 2025 - 19:47:44 CET

This archive was generated by hypermail 2.3.0 : Sun Mar 02 2025 - 19:48:56 CET