Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

From: Silvan Jegen <s.jegen_AT_gmail.com>
Date: Sun, 10 Sep 2017 19:08:04 +0200

Hi

On Sat, Sep 09, 2017 at 10:29:21AM -0700, Michael Forney wrote:
> On Sat, Sep 9, 2017 at 2:08 AM, Silvan Jegen <s.jegen_AT_gmail.com> wrote:
> > From: Jim Beveridge <jimbe_AT_chromium.org>
> >
> > The original code is by Jim Beveridge working on Fuchsia. I merged it
> > with slight changes.
> >
> > Time to tar two 1GB files:
> >
> > Before patch:
> >
> > real 0m6.428s
> > user 0m0.245s
> > sys 0m4.881s
> >
> > real 0m6.454s
> > user 0m0.239s
> > sys 0m4.883s
> >
> > real 0m6.515s
> > user 0m0.259s
> > sys 0m4.839s
> >
> > After patch:
> >
> > real 0m4.755s
> > user 0m0.026s
> > sys 0m1.598s
> >
> > real 0m4.788s
> > user 0m0.063s
> > sys 0m1.578s
> >
> > real 0m4.822s
> > user 0m0.007s
> > sys 0m1.662s
> >
> > A similar speedup can be observed for untaring.
> >
> > In addition to the buffer size increase we change the code to only create
> > directories for non-compliant tar files and we check for st to be NULL
> > in the recursive copy function.
>
> He also sent me a pull request on my github branch for oasis:
> https://github.com/michaelforney/sbase/pull/2

Ah, I did not see that one.
 

> I think we should work on fixing correctness of tar before trying to
> optimize it. Currently it does not handle short reads or writes at all
> (when working with pipes). I was thinking we should add a readall in
> libutil analogous to writeall and then make use of that.

That sounds good to me.
 

> Regarding COPY_CHUNK_SIZE, it is probably a good idea to put that in
> util.h (perhaps with a different name). concat has the same problem
> with a small BUFSIZ (musl's is only 1024).

What do you think of Hiltjo's idea of querying for the page size with

long sz = sysconf(_SC_PAGESIZE);

or similar? Such code could still be put inot util.h of course.
 

> > ---
> > tar.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------
> > 1 file changed, 55 insertions(+), 17 deletions(-)
> >
> > diff --git a/tar.c b/tar.c
> > index 53a737c..8cd1abe 100644
> > --- a/tar.c
> > +++ b/tar.c
> > _AT_@ -16,6 +16,8 @@
> > #include "util.h"
> >
> > #define BLKSIZ 512
> > +// COPY_CHUNK_SIZE must be a power of 2
> > +#define COPY_CHUNK_SIZE 8192
> >
> > enum Type {
> > REG = '0',
> > _AT_@ -236,10 +238,13 @@ archive(const char *path)
> > ewrite(tarfd, b, BLKSIZ);
> >
> > if (fd != -1) {
> > - while ((l = eread(fd, b, BLKSIZ)) > 0) {
> > - if (l < BLKSIZ)
> > - memset(b + l, 0, BLKSIZ - l);
> > - ewrite(tarfd, b, BLKSIZ);
> > + char chunk[COPY_CHUNK_SIZE];
> > + while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
> > + // Ceiling to BLKSIZ boundary
> > + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > + if (l < ceilsize)
> > + memset(chunk + l, 0, ceilsize - l);
> > + ewrite(tarfd, chunk, ceilsize);
> > }
> > close(fd);
> > }
> > _AT_@ -250,7 +255,7 @@ archive(const char *path)
> > static int
> > unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > {
> > - char lname[101], *tmp, *p;
> > + char lname[101], *p;
> > long mode, major, minor, type, mtime, uid, gid;
> > struct header *h = (struct header *)b;
> > int fd = -1;
> > _AT_@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > if (remove(fname) < 0 && errno != ENOENT)
> > weprintf("remove %s:", fname);
> >
> > - tmp = estrdup(fname);
> > - mkdirp(dirname(tmp), 0777, 0777);
> > - free(tmp);
> > + // tar files normally create the directory chain. This is a fallback
> > + // for noncompliant tar files.
> > + if (h->type != DIRECTORY) {
> > + char* tmp = estrdup(fname);
> > + mkdirp(dirname(tmp), 0777, 0777);
> > + free(tmp);
> > + }
>
> If you tar a file within another directory, you end up with just one
> entry. This check means that the parent directory won't be created
> when trying to extract this file. Other tar implementations are able
> to extract such an archive.
>
> >
> > switch (h->type) {
> > case REG:
> > _AT_@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > eprintf("strtol %s: invalid number\n", h->gid);
> >
> > if (fd != -1) {
> > - for (; l > 0; l -= BLKSIZ)
> > - if (eread(tarfd, b, BLKSIZ) > 0)
> > - ewrite(fd, b, MIN(l, BLKSIZ));
> > + // Ceiling to BLKSIZ boundary
> > + int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > + char chunk[COPY_CHUNK_SIZE];
> > + int lastread = 0;
> > +
> > + for (; readsize > 0; l -= lastread, readsize -= lastread) {
> > + int chunk_size = MIN(readsize, COPY_CHUNK_SIZE);
> > + // Short reads are legal, so don't expect to read
> > + // everything that was requested.
> > + lastread = eread(tarfd, chunk, chunk_size);
> > + if (lastread == 0) {
> > + close(fd);
> > + remove(fname);
> > + eprintf("unexpected end of file reading %s.\n",
> > + fname);
> > + }
> > +
> > + ewrite(fd, chunk, MIN(l, lastread));
> > + }
> > close(fd);
> > }
> >
> > _AT_@ -331,7 +356,7 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > times[0].tv_sec = times[1].tv_sec = mtime;
> > times[0].tv_nsec = times[1].tv_nsec = 0;
> > if (!mflag && utimensat(AT_FDCWD, fname, times, AT_SYMLINK_NOFOLLOW) < 0)
> > - weprintf("utimensat %s:\n", fname);
> > + weprintf("utimensat %s %d:\n", fname, errno);
> > if (h->type == SYMLINK) {
> > if (!getuid() && lchown(fname, uid, gid))
> > weprintf("lchown %s:\n", fname);
> > _AT_@ -349,10 +374,23 @@ static void
> > skipblk(ssize_t l)
> > {
> > char b[BLKSIZ];
> > -
> > - for (; l > 0; l -= BLKSIZ)
> > - if (!eread(tarfd, b, BLKSIZ))
> > - break;
> > + int lastread = 0;
> > + // Ceiling to BLKSIZ boundary
> > + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > +
> > + off_t offset = lseek(tarfd, ceilsize, SEEK_CUR);
> > + if (offset >= ceilsize)
> > + return;
> > + if (errno != ESPIPE)
> > + eprintf("unexpected end of file.\n");
> > +
> > + // This is a pipe, socket or FIFO. Fall back to a sequential read.
> > + for (; ceilsize > 0; ceilsize -= lastread) {
> > + int chunk_size = MIN(ceilsize, BLKSIZ);
> > + lastread = eread(tarfd, b, chunk_size);
> > + if (lastread == 0)
> > + eprintf("unexpected end of file %d.\n", errno);
> > + }
> > }
> >
> > static int
> > _AT_@ -370,7 +408,7 @@ c(const char *path, struct stat *st, void *data, struct recursor *r)
> > if (vflag)
> > puts(path);
> >
> > - if (S_ISDIR(st->st_mode))
> > + if (st && S_ISDIR(st->st_mode))
> > recurse(path, NULL, r);
> > }
>
> I don't understand this. Can you explain? Perhaps this was just a bad
> merge of https://git.suckless.org/sbase/commit/?id=a5612b0d08b9abb4e65acaa3d322581af2fd7a39?

Ah, I faintly remembered something like this but then thought I could
keep it in just in case. According to that commit, st can never be NULL
when these functions are called so we can remove that check.


Cheers,

Silvan
Received on Sun Sep 10 2017 - 19:08:04 CEST

This archive was generated by hypermail 2.3.0 : Sun Sep 10 2017 - 19:12:26 CEST