[hackers] [flate] deflate fixes: set history and block size correctly || nsz

From: <hg_AT_suckless.org>
Date: Fri, 21 Aug 2009 13:50:12 +0000 (UTC)

changeset: 100:7d681e91154a
user: nsz <nszabolcs_AT_gmail.com>
date: Sun Aug 16 11:03:32 2009 +0200
files: TODO deflate.c
description:
deflate fixes: set history and block size correctly

diff -r c7cc7828da68 -r 7d681e91154a TODO
--- a/TODO Tue Aug 11 14:48:57 2009 +0200
+++ b/TODO Sun Aug 16 11:03:32 2009 +0200
@@ -34,8 +34,9 @@
         bisect len/distbase -> lookup table
         rolling hash
 better compression:
- block split heuristics with lfreq, dfreq
+ block split heuristics with lfreq, dfreq (optimal block size?)
         (zlib huffcode trick: if same freq then shorter code goes to the one with smaller subtree)
+ last block can be allowed to be larger
 code cleanups:
         bounds on the compressend size
         verify dstsize
diff -r c7cc7828da68 -r 7d681e91154a deflate.c
--- a/deflate.c Tue Aug 11 14:48:57 2009 +0200
+++ b/deflate.c Sun Aug 16 11:03:32 2009 +0200
@@ -2,6 +2,8 @@
 #include <string.h>
 #include "flate.h"
 
+#include <stdio.h>
+
 enum {
         CodeBits = 16, /* max number of bits in a code + 1 */
         EOB = 256, /* end of block symbol */
@@ -12,14 +14,17 @@
         Nclen = 19, /* number of code length codes */
         MinMatch = 3, /* min match length */
         MaxMatch = 258, /* max match length */
- MaxDist = 1 << 15, /* max match distance */
+ WinSize = 1 << 15, /* max match distance (sliding window size) */
 
         MaxChainLen = 256, /* max length of hash chain */
         HashBits = 13,
         HashSize = 1 << HashBits, /* hash table size */
         BigDist = 1 << 12, /* max match distance for short match length */
- WinSize = 2*MaxDist + MaxMatch, /* input window size */
- DstSize = MaxDist + 2*MaxMatch + 6, /* output window size (worst case compressed block size) */
+ HistSize = 2*WinSize, /* history buffer size (64K, indexed with ushort) */
+ HistEnd = MaxMatch + MinMatch - 1, /* reserved space at the end of hist buffer */
+ HistSafe = HistSize - HistEnd - WinSize, /* worst case available history */
+ BlockSize = HistSafe, /* block size (should be <= HistSafe) */
+ DstSize = BlockSize + MaxMatch + 6, /* output buffer size (worst case compressed block size + 1) */
         LzSize = 1 << 13, /* lz buffer size */
         LzLitFlag = 1 << 15 /* marks literal run length in lz buffer */
 };
@@ -41,15 +46,15 @@
         int startpos; /* block start pos in input win */
         int avail; /* available in input win */
         Match prevm; /* previous (deferred) match */
- uchar *src; /* input data (not yet in win) */
+ uchar *src; /* input data (not yet in hist) */
         uchar *srcend;
         uchar *dst; /* compressed output (position in dstbuf) */
         uchar *dstbegin; /* start position of unflushed data in dstbuf */
         uint bits;
         int nbits;
- uchar win[WinSize]; /* input window */
+ uchar hist[HistSize]; /* history (input buffer) */
         ushort head[HashSize]; /* position of hash chain heads */
- ushort chain[MaxDist]; /* hash chain */
+ ushort chain[WinSize]; /* hash chain */
         LzCode lzbuf[LzSize]; /* literal run length, match len, match dist */
         LzCode *lz; /* current pos in lzbuf */
         int nlit; /* literal run length */
@@ -293,14 +298,14 @@
 static void putblock(State *s, ushort *lcode, uchar *llen, ushort *dcode, uchar *dlen) {
         int n;
         LzCode *lz;
- uchar *pc;
+ uchar *h;
 
- for (lz = s->lzbuf, pc = s->win + s->startpos; lz != s->lz; lz++)
+ for (lz = s->lzbuf, h = s->hist + s->startpos; lz != s->lz; lz++)
                 if (lz->bits & LzLitFlag)
- for (n = lz->n; n > 0; n--, pc++)
- putbits(s, lcode[*pc], llen[*pc]);
+ for (n = lz->n; n > 0; n--, h++)
+ putbits(s, lcode[*h], llen[*h]);
                 else {
- pc += lenbase[lz->n] + lz->bits;
+ h += lenbase[lz->n] + lz->bits;
                         putbits(s, lcode[Nlit + lz->n + 1], llen[Nlit + lz->n + 1]);
                         putbits(s, lz->bits, lenbits[lz->n]);
                         lz++;
@@ -320,10 +325,10 @@
         int i, c, n, ncodes;
         int nlit, ndist, nclen;
         LzCode *lz;
- uchar *pc;
+ uchar *h;
         int dynsize, fixsize, uncsize;
         int blocklen = s->pos - s->startpos;
-/*int dyntree;*/
+int dyntree;
 
         /* calc dynamic codes */
         hufflens(llen, s->lfreq, Nlitlen, CodeBits-1);
@@ -354,15 +359,15 @@
                 if (c == 18)
                         dynsize += 7;
         }
-/*dyntree = dynsize - 3;*/
- for (lz = s->lzbuf, pc = s->win + s->startpos; lz != s->lz; lz++)
+dyntree = dynsize - 3;
+ for (lz = s->lzbuf, h = s->hist + s->startpos; lz != s->lz; lz++)
                 if (lz->bits & LzLitFlag)
- for (n = lz->n; n > 0; n--, pc++) {
- fixsize += fixllen[*pc];
- dynsize += llen[*pc];
+ for (n = lz->n; n > 0; n--, h++) {
+ fixsize += fixllen[*h];
+ dynsize += llen[*h];
                         }
                 else {
- pc += lenbase[lz->n] + lz->bits;
+ h += lenbase[lz->n] + lz->bits;
                         fixsize += fixllen[Nlit + lz->n + 1];
                         dynsize += llen[Nlit + lz->n + 1];
                         fixsize += lenbits[lz->n];
@@ -408,14 +413,14 @@
                 s->nbits = 0;
                 putbits(s, blocklen, 16);
                 putbits(s, ~blocklen & 0xffff, 16);
- memcpy(s->dst, s->win + s->startpos, blocklen);
+ memcpy(s->dst, s->hist + s->startpos, blocklen);
                 s->dst += blocklen;
         }
-/*
-fprintf(stderr, "blen: %d [%d,%d] lzlen: %d dynlen: %d (tree: %d, rate: %.3f) fixlen: %d (rate: %.3f) unclen: %d (rate: %.3f) avail %d\n",
+
+fprintf(stderr, "blen:%d [%d,%d] lzlen:%d dynlen:%d (tree:%d rate:%.3f) fixlen:%d (rate:%.3f) unclen:%d (rate:%.3f) avail:%d\n",
         blocklen, s->startpos, s->pos, s->lz - s->lzbuf, dynsize, dyntree, dynsize/(float)blocklen,
         fixsize, fixsize/(float)blocklen, uncsize, uncsize/(float)blocklen, s->avail);
-*/
+
 }
 
 /* find n in base */
@@ -476,14 +481,14 @@
 static int updatechain(State *s) {
         int h, i;
 
- if (s->avail < MinMatch)
+ if (s->avail < MinMatch) /* assert(s->eof) */
                 return 0;
- h = gethash(s->win + s->pos);
+ h = gethash(s->hist + s->pos);
         i = s->head[h];
         s->head[h] = s->pos;
- if (i >= s->pos || s->pos - i >= MaxDist)
+ if (i >= s->pos || s->pos - i >= HistSafe)
                 i = 0;
- s->chain[s->pos % MaxDist] = i;
+ s->chain[s->pos % WinSize] = i;
         return i;
 }
 
@@ -491,14 +496,14 @@
 static Match getmatch(State *s, int mpos) {
         Match m = {0, MinMatch-1};
         int len;
- int limit = s->pos - MaxDist;
+ int limit = s->pos - HistSafe;
         int chainlen = MaxChainLen;
         uchar *q;
- uchar *p = s->win + s->pos;
+ uchar *p = s->hist + s->pos;
         uchar *end = p + MaxMatch;
 
         do {
- q = s->win + mpos;
+ q = s->hist + mpos;
                 /* at least m.len+1 long */
                 if (q[m.len] != p[m.len] || q[m.len-1] != p[m.len-1] || q[0] != p[0])
                         continue;
@@ -513,7 +518,7 @@
                         }
                         m.len = len;
                 }
- } while ((mpos = s->chain[mpos % MaxDist]) > limit && --chainlen);
+ } while ((mpos = s->chain[mpos % WinSize]) > limit && --chainlen);
         if (m.len < MinMatch || (m.len == MinMatch && m.dist > BigDist))
                 m.len = 0;
         return m;
@@ -530,43 +535,52 @@
         s->lfreq[EOB]++;
 }
 
-/* shift and fill win from s->src */
-static int fillwin(State *s) {
+/*
+correctness assertions:
+ - pos < 64K in updatechain (head and chain is ushort)
+ - updatechain has at least MinMatch input (except at eof)
+ - getmatch has at least MaxMatch input
+ - getmatch has at least HistSafe history
+ - lzbuf does not overflow before endblock
+ - dstbuf does not overflow before endblock
+ - updatehist only fails if src == srcend
+ - startpos does not underflow during shift hist
+*/
+
+/* shift and fill s->hist */
+static int updatehist(State *s) {
         int n, k;
 
- if (s->pos >= 2*MaxDist) {
- /* shift win */
- memmove(s->win, s->win + MaxDist, MaxDist + MaxMatch);
- for (n = HashSize; n--;)
- s->head[n] = s->head[n] > MaxDist ? s->head[n] - MaxDist : 0;
- for (n = MaxDist; n--;)
- s->chain[n] = s->chain[n] > MaxDist ? s->chain[n] - MaxDist : 0;
- s->startpos -= MaxDist;
- s->pos -= MaxDist;
+ if (s->avail > HistEnd)
+ return 1;
+ if (s->pos >= HistSize - HistEnd) {
+ /* shift hist */
+ memcpy(s->hist, s->hist + WinSize, HistSize - WinSize);
+ for (n = 0; n < HashSize; n++)
+ s->head[n] = s->head[n] > WinSize ? s->head[n] - WinSize : 0;
+ for (n = 0; n < WinSize; n++)
+ s->chain[n] = s->chain[n] > WinSize ? s->chain[n] - WinSize : 0;
+ s->startpos -= WinSize;
+ s->pos -= WinSize;
         }
- if (s->avail < MaxMatch && !s->eof) {
- /* fill win */
- n = WinSize - s->pos - s->avail;
+ if (!s->eof) {
+ /* fill hist */
                 k = s->srcend - s->src;
+ n = HistSize - s->pos - s->avail; /* s->avail + n >= MaxMatch */
                 if (k > n)
                         k = n;
- if (k == 0)
- return 0;
- memcpy(s->win + s->pos + s->avail, s->src, k);
+ memcpy(s->hist + s->pos + s->avail, s->src, k);
                 s->src += k;
                 s->avail += k;
+ if (s->avail <= HistEnd) /* s->srcend == s->src */
+ return 0;
         }
         return 1;
 }
 
 /* check if the current block should be ended */
 static int endofblock(State *s) {
- if (s->avail == 0)
- return 1;
- if ((s->pos - s->startpos >= MaxDist ||
- s->lz - s->lzbuf > LzSize - 3) && !s->eof)
- return 1;
- return 0;
+ return s->avail == 0 || s->pos - s->startpos >= BlockSize || s->lz - s->lzbuf > LzSize - 3;
 }
 
 /* deflate from s->src into s->dstbuf */
@@ -588,14 +602,23 @@
                 return s->state;
 
         for (;;) {
- if (!fillwin(s))
+ if (!updatehist(s))
                         return FlateIn;
+ if (endofblock(s)) {
+ flushlit(s);
+ if (s->prevm.len)
+ s->pos--;
+ deflate_block(s);
+ if (s->eof)
+ putbits(s, 0, 7);
+ return FlateOut;
+ }
                 head = updatechain(s);
                 if (head)
                         m = getmatch(s, head);
                 if (head && m.len > s->prevm.len) {
                         if (s->prevm.len)
- recordlit(s, s->win[s->pos-1]);
+ recordlit(s, s->hist[s->pos-1]);
                         s->prevm = m;
                 } else if (s->prevm.len) {
                         recordmatch(s, s->prevm);
@@ -606,18 +629,9 @@
                                 updatechain(s);
                         } while (--s->prevm.len);
                 } else
- recordlit(s, s->win[s->pos]);
+ recordlit(s, s->hist[s->pos]);
                 s->pos++;
                 s->avail--;
- if (endofblock(s)) {
- flushlit(s);
- if (s->prevm.len)
- s->pos--;
- deflate_block(s);
- if (s->eof)
- putbits(s, 0, 7);
- return FlateOut;
- }
         }
 }
 
@@ -644,7 +658,7 @@
         huffcodes(fixlcode, fixllen, Nlitlen);
         huffcodes(fixdcode, fixdlen, Ndist);
         s->dst = s->dstbegin = s->dstbuf;
- s->pos = s->startpos = MaxDist;
+ s->pos = s->startpos = WinSize;
         s->eof = 0;
         s->avail = 0;
         s->prevm.len = 0;
Received on Fri Aug 21 2009 - 13:50:12 UTC

This archive was generated by hypermail 2.2.0 : Fri Aug 21 2009 - 14:00:06 UTC