Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

From: Michael Forney <mforney@mforney.org>
Date: Sat, 9 Sep 2017 10:29:21 -0700

On Sat, Sep 9, 2017 at 2:08 AM, Silvan Jegen <s.jegen@gmail.com> wrote:
> From: Jim Beveridge <jimbe@chromium.org>
>
> The original code is by Jim Beveridge working on Fuchsia. I merged it
> with slight changes.
>
> Time to tar two 1GB files:
>
> Before patch:
>
> real 0m6.428s
> user 0m0.245s
> sys 0m4.881s
>
> real 0m6.454s
> user 0m0.239s
> sys 0m4.883s
>
> real 0m6.515s
> user 0m0.259s
> sys 0m4.839s
>
> After patch:
>
> real 0m4.755s
> user 0m0.026s
> sys 0m1.598s
>
> real 0m4.788s
> user 0m0.063s
> sys 0m1.578s
>
> real 0m4.822s
> user 0m0.007s
> sys 0m1.662s
>
> A similar speedup can be observed for untarring.
>
> In addition to the increased buffer size, we change the code to create
> directories only for non-compliant tar files, and we check whether st
> is NULL in the recursive copy function.

He also sent me a pull request on my GitHub branch for oasis:
https://github.com/michaelforney/sbase/pull/2

I think we should work on fixing the correctness of tar before trying
to optimize it. Currently it does not handle short reads or writes at
all (which occur when working with pipes). I was thinking we should add
a readall helper to libutil, analogous to writeall, and then make use
of it.
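
Something along these lines, perhaps (untested sketch only; the file
name, the include path and the exact error handling are just how I
would mirror libutil/writeall.c, not a finished patch):

/* libutil/readall.c - hypothetical counterpart to writeall */
#include <unistd.h>

#include "../util.h"

ssize_t
readall(int fd, void *buf, size_t len)
{
	char *p = buf;
	ssize_t n;

	while (len) {
		n = read(fd, p, len);
		if (n < 0)
			return n;   /* read error: let the caller report it */
		if (n == 0)
			break;      /* EOF: return the short count */
		p += n;
		len -= n;
	}

	return p - (char *)buf;
}

tar's extraction loop could then call readall() for each chunk and
treat anything short of the requested size as a truncated archive.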

Regarding COPY_CHUNK_SIZE, it is probably a good idea to put that in
util.h (perhaps under a different name). concat has the same problem
with its small BUFSIZ buffer (musl's BUFSIZ is only 1024).
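
Roughly something like this (the name IOBUFSIZ is only a placeholder;
keeping it a multiple of 512 keeps tar's blocking math simple):

/* util.h - shared I/O chunk size for tar, concat, ... (placeholder name) */
#define IOBUFSIZ 8192	/* multiple of 512 so tar can still pad to BLKSIZ */

concat would then declare its stack buffer as char buf[IOBUFSIZ] instead
of char buf[BUFSIZ], and tar would drop its private COPY_CHUNK_SIZE in
favour of the shared constant.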

> ---
> tar.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 55 insertions(+), 17 deletions(-)
>
> diff --git a/tar.c b/tar.c
> index 53a737c..8cd1abe 100644
> --- a/tar.c
> +++ b/tar.c
> @@ -16,6 +16,8 @@
> #include "util.h"
>
> #define BLKSIZ 512
> +// COPY_CHUNK_SIZE must be a power of 2
> +#define COPY_CHUNK_SIZE 8192
>
> enum Type {
> REG = '0',
> @@ -236,10 +238,13 @@ archive(const char *path)
> ewrite(tarfd, b, BLKSIZ);
>
> if (fd != -1) {
> - while ((l = eread(fd, b, BLKSIZ)) > 0) {
> - if (l < BLKSIZ)
> - memset(b + l, 0, BLKSIZ - l);
> - ewrite(tarfd, b, BLKSIZ);
> + char chunk[COPY_CHUNK_SIZE];
> + while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
> + // Ceiling to BLKSIZ boundary
> + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> + if (l < ceilsize)
> + memset(chunk + l, 0, ceilsize - l);
> + ewrite(tarfd, chunk, ceilsize);
> }
> close(fd);
> }
> @@ -250,7 +255,7 @@ archive(const char *path)
> static int
> unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> {
> - char lname[101], *tmp, *p;
> + char lname[101], *p;
> long mode, major, minor, type, mtime, uid, gid;
> struct header *h = (struct header *)b;
> int fd = -1;
> @@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> if (remove(fname) < 0 && errno != ENOENT)
> weprintf("remove %s:", fname);
>
> - tmp = estrdup(fname);
> - mkdirp(dirname(tmp), 0777, 0777);
> - free(tmp);
> + // tar files normally create the directory chain. This is a fallback
> + // for noncompliant tar files.
> + if (h->type != DIRECTORY) {
> + char* tmp = estrdup(fname);
> + mkdirp(dirname(tmp), 0777, 0777);
> + free(tmp);
> + }

If you tar a file that sits inside another directory, the archive ends
up with just that one entry and no separate entry for the directory
itself. With this check in place, the parent directory is never created
when extracting such a file, so the extraction breaks, while other tar
implementations handle this kind of archive without problems.

>
> switch (h->type) {
> case REG:
> @@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> eprintf("strtol %s: invalid number\n", h->gid);
>
> if (fd != -1) {
> - for (; l > 0; l -= BLKSIZ)
> - if (eread(tarfd, b, BLKSIZ) > 0)
> - ewrite(fd, b, MIN(l, BLKSIZ));
> + // Ceiling to BLKSIZ boundary
> + int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> + char chunk[COPY_CHUNK_SIZE];
> + int lastread = 0;
> +
> + for (; readsize > 0; l -= lastread, readsize -= lastread) {
> + int chunk_size = MIN(readsize, COPY_CHUNK_SIZE);
> + // Short reads are legal, so don't expect to read
> + // everything that was requested.
> + lastread = eread(tarfd, chunk, chunk_size);
> + if (lastread == 0) {
> + close(fd);
> + remove(fname);
> + eprintf("unexpected end of file reading %s.\n",
> + fname);
> + }
> +
> + ewrite(fd, chunk, MIN(l, lastread));
> + }
> close(fd);
> }
>
> @@ -331,7 +356,7 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> times[0].tv_sec = times[1].tv_sec = mtime;
> times[0].tv_nsec = times[1].tv_nsec = 0;
> if (!mflag && utimensat(AT_FDCWD, fname, times, AT_SYMLINK_NOFOLLOW) < 0)
> - weprintf("utimensat %s:\n", fname);
> + weprintf("utimensat %s %d:\n", fname, errno);
> if (h->type == SYMLINK) {
> if (!getuid() && lchown(fname, uid, gid))
> weprintf("lchown %s:\n", fname);
> @@ -349,10 +374,23 @@ static void
> skipblk(ssize_t l)
> {
> char b[BLKSIZ];
> -
> - for (; l > 0; l -= BLKSIZ)
> - if (!eread(tarfd, b, BLKSIZ))
> - break;
> + int lastread = 0;
> + // Ceiling to BLKSIZ boundary
> + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> +
> + off_t offset = lseek(tarfd, ceilsize, SEEK_CUR);
> + if (offset >= ceilsize)
> + return;
> + if (errno != ESPIPE)
> + eprintf("unexpected end of file.\n");
> +
> + // This is a pipe, socket or FIFO. Fall back to a sequential read.
> + for (; ceilsize > 0; ceilsize -= lastread) {
> + int chunk_size = MIN(ceilsize, BLKSIZ);
> + lastread = eread(tarfd, b, chunk_size);
> + if (lastread == 0)
> + eprintf("unexpected end of file %d.\n", errno);
> + }
> }
>
> static int
> @@ -370,7 +408,7 @@ c(const char *path, struct stat *st, void *data, struct recursor *r)
> if (vflag)
> puts(path);
>
> - if (S_ISDIR(st->st_mode))
> + if (st && S_ISDIR(st->st_mode))
> recurse(path, NULL, r);
> }

I don't understand this change; can you explain it? Perhaps it is just
the result of a bad merge of this commit:
https://git.suckless.org/sbase/commit/?id=a5612b0d08b9abb4e65acaa3d322581af2fd7a39

> --
> 2.7.4
>
>