Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

From: Silvan Jegen <s.jegen_AT_gmail.com>
Date: Sun, 10 Sep 2017 18:58:27 +0200

Hi Hiltjo

Thanks for the review!

On Sat, Sep 09, 2017 at 01:06:21PM +0200, Hiltjo Posthuma wrote:
> On Sat, Sep 09, 2017 at 11:08:42AM +0200, Silvan Jegen wrote:
> > From: Jim Beveridge <jimbe_AT_chromium.org>
> >
> > The original code is by Jim Beveridge working on Fuchsia. I merged it
> > with slight changes.
> >
>
> To be clear: is it under the sbase LICENSE?

Yes, from what I can tell the License for this code is the same as for
the rest of sbase.

> > #define BLKSIZ 512
> > +// COPY_CHUNK_SIZE must be a power of 2
> > +#define COPY_CHUNK_SIZE 8192
> >
>
> Instead of COPY_CHUNK_SIZE is might be worthwhile to query the pagesize, but
> i've not tested it.

Yes, I will have a look.

 
> > enum Type {
> > REG = '0',
> > _AT_@ -236,10 +238,13 @@ archive(const char *path)
> > ewrite(tarfd, b, BLKSIZ);
> >
> > if (fd != -1) {
> > - while ((l = eread(fd, b, BLKSIZ)) > 0) {
> > - if (l < BLKSIZ)
> > - memset(b + l, 0, BLKSIZ - l);
> > - ewrite(tarfd, b, BLKSIZ);
> > + char chunk[COPY_CHUNK_SIZE];
> > + while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
> > + // Ceiling to BLKSIZ boundary
> > + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > + if (l < ceilsize)
> > + memset(chunk + l, 0, ceilsize - l);
> > + ewrite(tarfd, chunk, ceilsize);
> > }
> > close(fd);
> > }
> > _AT_@ -250,7 +255,7 @@ archive(const char *path)
> > static int
> > unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > {
> > - char lname[101], *tmp, *p;
> > + char lname[101], *p;
> > long mode, major, minor, type, mtime, uid, gid;
> > struct header *h = (struct header *)b;
> > int fd = -1;
> > _AT_@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > if (remove(fname) < 0 && errno != ENOENT)
> > weprintf("remove %s:", fname);
> >
> > - tmp = estrdup(fname);
> > - mkdirp(dirname(tmp), 0777, 0777);
> > - free(tmp);
> > + // tar files normally create the directory chain. This is a fallback
> > + // for noncompliant tar files.
> > + if (h->type != DIRECTORY) {
> > + char* tmp = estrdup(fname);
> > + mkdirp(dirname(tmp), 0777, 0777);
> > + free(tmp);
> > + }
> >
> > switch (h->type) {
> > case REG:
> > _AT_@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > eprintf("strtol %s: invalid number\n", h->gid);
> >
> > if (fd != -1) {
> > - for (; l > 0; l -= BLKSIZ)
> > - if (eread(tarfd, b, BLKSIZ) > 0)
> > - ewrite(fd, b, MIN(l, BLKSIZ));
> > + // Ceiling to BLKSIZ boundary
> > + int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > + char chunk[COPY_CHUNK_SIZE];
> > + int lastread = 0;
> > +
> > + for (; readsize > 0; l -= lastread, readsize -= lastread) {
> > + int chunk_size = MIN(readsize, COPY_CHUNK_SIZE);
> > + // Short reads are legal, so don't expect to read
> > + // everything that was requested.
> > + lastread = eread(tarfd, chunk, chunk_size);
> > + if (lastread == 0) {
> > + close(fd);
> > + remove(fname);
>
> Do all the tar tools remove the file in this case? It might be better to not
> remove it.

You are right, this code should probably be removed if we use a
ReadAll-like function.


> > + eprintf("unexpected end of file reading %s.\n",
> > + fname);
> > + }
> > +
> > + ewrite(fd, chunk, MIN(l, lastread));
> > + }
> > close(fd);
> > }
> >
> > _AT_@ -331,7 +356,7 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > times[0].tv_sec = times[1].tv_sec = mtime;
> > times[0].tv_nsec = times[1].tv_nsec = 0;
> > if (!mflag && utimensat(AT_FDCWD, fname, times, AT_SYMLINK_NOFOLLOW) < 0)
> > - weprintf("utimensat %s:\n", fname);
> > + weprintf("utimensat %s %d:\n", fname, errno);
> > if (h->type == SYMLINK) {
> > if (!getuid() && lchown(fname, uid, gid))
> > weprintf("lchown %s:\n", fname);
> > _AT_@ -349,10 +374,23 @@ static void
> > skipblk(ssize_t l)
> > {
> > char b[BLKSIZ];
> > -
> > - for (; l > 0; l -= BLKSIZ)
> > - if (!eread(tarfd, b, BLKSIZ))
> > - break;
> > + int lastread = 0;
> > + // Ceiling to BLKSIZ boundary
> > + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > +
> > + off_t offset = lseek(tarfd, ceilsize, SEEK_CUR);
> > + if (offset >= ceilsize)
> > + return;
> > + if (errno != ESPIPE)
> > + eprintf("unexpected end of file.\n");
> > +
> > + // This is a pipe, socket or FIFO. Fall back to a sequential read.
> > + for (; ceilsize > 0; ceilsize -= lastread) {
> > + int chunk_size = MIN(ceilsize, BLKSIZ);
> > + lastread = eread(tarfd, b, chunk_size);
> > + if (lastread == 0)
> > + eprintf("unexpected end of file %d.\n", errno);
> > + }
> > }
> >
> > static int
> > _AT_@ -370,7 +408,7 @@ c(const char *path, struct stat *st, void *data, struct recursor *r)
> > if (vflag)
> > puts(path);
> >
> > - if (S_ISDIR(st->st_mode))
> > + if (st && S_ISDIR(st->st_mode))
> > recurse(path, NULL, r);
> > }
> >
> > --
> > 2.7.4
> >
> >
>
> The C++ comments should be changed to /* */. I'd also prefer all variable
> declarations at the top and the other notes above.

I can change that.


> Nice work though :)

Sadly, it's not really mine :P

I will discuss with Michael and Jim to see what changes would get this
patch (or one based on it) merged.


Cheers,

Silvan
Received on Sun Sep 10 2017 - 18:58:27 CEST

This archive was generated by hypermail 2.3.0 : Sun Sep 10 2017 - 19:00:27 CEST