Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

From: Hiltjo Posthuma <hiltjo_AT_codemadness.org>
Date: Sat, 9 Sep 2017 13:06:21 +0200

On Sat, Sep 09, 2017 at 11:08:42AM +0200, Silvan Jegen wrote:
> From: Jim Beveridge <jimbe_AT_chromium.org>
>
> The original code is by Jim Beveridge working on Fuchsia. I merged it
> with slight changes.
>

To be clear: is it under the sbase LICENSE?

> Time to tar two 1GB files:
>
> Before patch:
>
> real 0m6.428s
> user 0m0.245s
> sys 0m4.881s
>
> real 0m6.454s
> user 0m0.239s
> sys 0m4.883s
>
> real 0m6.515s
> user 0m0.259s
> sys 0m4.839s
>
> After patch:
>
> real 0m4.755s
> user 0m0.026s
> sys 0m1.598s
>
> real 0m4.788s
> user 0m0.063s
> sys 0m1.578s
>
> real 0m4.822s
> user 0m0.007s
> sys 0m1.662s
>
> A similar speedup can be observed for untaring.
>
> In addition to the buffer size increase we change the code to only create
> directories for non-compliant tar files and we check for st to be NULL
> in the recursive copy function.
> ---
> tar.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 55 insertions(+), 17 deletions(-)
>
> diff --git a/tar.c b/tar.c
> index 53a737c..8cd1abe 100644
> --- a/tar.c
> +++ b/tar.c
> _AT_@ -16,6 +16,8 @@
> #include "util.h"
>
> #define BLKSIZ 512
> +// COPY_CHUNK_SIZE must be a power of 2
> +#define COPY_CHUNK_SIZE 8192
>

Instead of COPY_CHUNK_SIZE is might be worthwhile to query the pagesize, but
i've not tested it.

> enum Type {
> REG = '0',
> _AT_@ -236,10 +238,13 @@ archive(const char *path)
> ewrite(tarfd, b, BLKSIZ);
>
> if (fd != -1) {
> - while ((l = eread(fd, b, BLKSIZ)) > 0) {
> - if (l < BLKSIZ)
> - memset(b + l, 0, BLKSIZ - l);
> - ewrite(tarfd, b, BLKSIZ);
> + char chunk[COPY_CHUNK_SIZE];
> + while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
> + // Ceiling to BLKSIZ boundary
> + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> + if (l < ceilsize)
> + memset(chunk + l, 0, ceilsize - l);
> + ewrite(tarfd, chunk, ceilsize);
> }
> close(fd);
> }
> _AT_@ -250,7 +255,7 @@ archive(const char *path)
> static int
> unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> {
> - char lname[101], *tmp, *p;
> + char lname[101], *p;
> long mode, major, minor, type, mtime, uid, gid;
> struct header *h = (struct header *)b;
> int fd = -1;
> _AT_@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> if (remove(fname) < 0 && errno != ENOENT)
> weprintf("remove %s:", fname);
>
> - tmp = estrdup(fname);
> - mkdirp(dirname(tmp), 0777, 0777);
> - free(tmp);
> + // tar files normally create the directory chain. This is a fallback
> + // for noncompliant tar files.
> + if (h->type != DIRECTORY) {
> + char* tmp = estrdup(fname);
> + mkdirp(dirname(tmp), 0777, 0777);
> + free(tmp);
> + }
>
> switch (h->type) {
> case REG:
> _AT_@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> eprintf("strtol %s: invalid number\n", h->gid);
>
> if (fd != -1) {
> - for (; l > 0; l -= BLKSIZ)
> - if (eread(tarfd, b, BLKSIZ) > 0)
> - ewrite(fd, b, MIN(l, BLKSIZ));
> + // Ceiling to BLKSIZ boundary
> + int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> + char chunk[COPY_CHUNK_SIZE];
> + int lastread = 0;
> +
> + for (; readsize > 0; l -= lastread, readsize -= lastread) {
> + int chunk_size = MIN(readsize, COPY_CHUNK_SIZE);
> + // Short reads are legal, so don't expect to read
> + // everything that was requested.
> + lastread = eread(tarfd, chunk, chunk_size);
> + if (lastread == 0) {
> + close(fd);
> + remove(fname);

Do all the tar tools remove the file in this case? It might be better to not
remove it.

> + eprintf("unexpected end of file reading %s.\n",
> + fname);
> + }
> +
> + ewrite(fd, chunk, MIN(l, lastread));
> + }
> close(fd);
> }
>
> _AT_@ -331,7 +356,7 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> times[0].tv_sec = times[1].tv_sec = mtime;
> times[0].tv_nsec = times[1].tv_nsec = 0;
> if (!mflag && utimensat(AT_FDCWD, fname, times, AT_SYMLINK_NOFOLLOW) < 0)
> - weprintf("utimensat %s:\n", fname);
> + weprintf("utimensat %s %d:\n", fname, errno);
> if (h->type == SYMLINK) {
> if (!getuid() && lchown(fname, uid, gid))
> weprintf("lchown %s:\n", fname);
> _AT_@ -349,10 +374,23 @@ static void
> skipblk(ssize_t l)
> {
> char b[BLKSIZ];
> -
> - for (; l > 0; l -= BLKSIZ)
> - if (!eread(tarfd, b, BLKSIZ))
> - break;
> + int lastread = 0;
> + // Ceiling to BLKSIZ boundary
> + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> +
> + off_t offset = lseek(tarfd, ceilsize, SEEK_CUR);
> + if (offset >= ceilsize)
> + return;
> + if (errno != ESPIPE)
> + eprintf("unexpected end of file.\n");
> +
> + // This is a pipe, socket or FIFO. Fall back to a sequential read.
> + for (; ceilsize > 0; ceilsize -= lastread) {
> + int chunk_size = MIN(ceilsize, BLKSIZ);
> + lastread = eread(tarfd, b, chunk_size);
> + if (lastread == 0)
> + eprintf("unexpected end of file %d.\n", errno);
> + }
> }
>
> static int
> _AT_@ -370,7 +408,7 @@ c(const char *path, struct stat *st, void *data, struct recursor *r)
> if (vflag)
> puts(path);
>
> - if (S_ISDIR(st->st_mode))
> + if (st && S_ISDIR(st->st_mode))
> recurse(path, NULL, r);
> }
>
> --
> 2.7.4
>
>

The C++ comments should be changed to /* */. I'd also prefer all variable
declarations at the top and the other notes above.

Nice work though :)

-- 
Kind regards,
Hiltjo
Received on Sat Sep 09 2017 - 13:06:21 CEST

This archive was generated by hypermail 2.3.0 : Sat Sep 09 2017 - 13:12:23 CEST