summaryrefslogtreecommitdiff
path: root/src/compression/lz77.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/compression/lz77.c')
-rw-r--r--src/compression/lz77.c347
1 files changed, 347 insertions, 0 deletions
diff --git a/src/compression/lz77.c b/src/compression/lz77.c
new file mode 100644
index 0000000..705e89d
--- /dev/null
+++ b/src/compression/lz77.c
@@ -0,0 +1,347 @@
+/* This code is part of the tng compression routines.
+ *
+ * Written by Daniel Spangberg
+ * Copyright (c) 2010, 2013, The GROMACS development team.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "warnmalloc.h"
+#include "bwt.h"
+#include "lz77.h"
+
+/* This is a simple Lempel-Ziv-77 compressor. It has not been set up
+ to remove every possible repeating pattern, but it might be better
+ than simple RLE.
+
+ Lempel-Ziv 77 with separate outputs for length, data, and offsets.
+ */
+
+#if 0
+/* Sort the strings (similar to BWT) to find similar patterns in the
+ input data. The output is an array with two values for each
+ input value. The sorted index value is the first in each doublet.
+ The second value is the "inverse" of the first value and can be
+ used to find the locations of similar strings. */
+static void sort_strings(unsigned int *vals, int nvals,
+ unsigned int *output)
+{
+ int i;
+ int *indices=warnmalloc(2*nvals*sizeof *indices);
+ unsigned int *nrepeat=warnmalloc(nvals*sizeof *nrepeat);
+ int *warr=indices+nvals;
+
+ if (nvals>0xFFFFFF)
+ {
+ fprintf(stderr,"BWT cannot pack more than %d values.\n",0xFFFFFF);
+ exit(1);
+ }
+
+ /* Also note that repeat pattern k (kmax) cannot be larger than 255. */
+ for (i=0; i<nvals; i++)
+ indices[i]=i;
+ /* Find the length of the initial repeating pattern for the strings. */
+ /* First mark that the index does not have a found repeating string. */
+ for (i=0; i<nvals; i++)
+ nrepeat[i]=0U;
+ for (i=0; i<nvals; i++)
+ {
+ /* If we have not already found a repeating string we must find
+ it. */
+ if (!nrepeat[i])
+ {
+ int maxrepeat=nvals*2;
+ int j,k,m;
+ int good_j=-1, good_k=0;
+ int kmax=16;
+ /* Track repeating patterns.
+ k=1 corresponds to AAAAA...
+ k=2 corresponds to ABABAB...
+ k=3 corresponds to ABCABCABCABC...
+ k=4 corresponds to ABCDABCDABCD...
+ etc. */
+ for (k=kmax; k>=1; k--)
+ {
+ try_next_k:
+ if (k>=1)
+ {
+ for (j=k; j<maxrepeat; j+=k)
+ {
+ int is_equal=1;
+ for (m=0; m<k; m++)
+ if (vals[(i+m)%nvals]!=vals[(i+j+m)%nvals])
+ {
+ is_equal=0;
+ break;
+ }
+ if (is_equal)
+ {
+ int new_j=j+k;
+ if (new_j>maxrepeat)
+ new_j=j;
+ if ((new_j>good_j) || ((new_j==good_j) && (k<good_k)))
+ {
+ good_j=new_j; /* We have found that
+ the strings repeat
+ for this length... */
+ good_k=k; /* ...and with this
+ length of the
+ repeating
+ pattern. */
+ }
+ }
+ else
+ {
+ /* We know that it is no point in trying
+ with more than m */
+ if (j==0)
+ {
+ k=m;
+ }
+ else
+ k--;
+ goto try_next_k;
+ }
+ }
+ }
+ }
+ /* From good_j and good_k we know the repeat for a large
+ number of strings. The very last repeat length should not
+ be assigned, since it can be much longer if a new test is
+ done. */
+ for (m=0; (m+good_k<good_j) && (i+m<nvals); m+=good_k)
+ {
+ int repeat=good_j-m;
+ if (repeat>nvals)
+ repeat=nvals;
+ nrepeat[i+m]=((unsigned int) (good_k)) | (((unsigned int) (repeat))<<8);
+ }
+ /* If no repetition was found for this value signal that here. */
+ if (!nrepeat[i])
+ nrepeat[i+m]=257U; /* This is 1<<8 | 1 */
+ }
+ }
+
+ /* Sort cyclic shift matrix. */
+ Ptngc_bwt_merge_sort_inner(indices,nvals,vals,0,nvals,nrepeat,warr);
+
+ /* Form output. */
+ for (i=0; i<nvals; i++)
+ {
+ output[i*2]=indices[i];
+ output[indices[i]*2+1]=i;
+ }
+ free(nrepeat);
+ free(indices);
+}
+#endif
+
+#define NUM_PREVIOUS 4
+#define MAX_LEN 0xFFFF
+#define MAX_OFFSET 0xFFFF
+#define MAX_STRING_SEARCH 8
+
+static void add_circular(int *previous,int v, int i)
+{
+ if (previous[(NUM_PREVIOUS+3)*v+2]!=i-1)
+ {
+ previous[(NUM_PREVIOUS+3)*v]++;
+ if (previous[(NUM_PREVIOUS+3)*v]>NUM_PREVIOUS)
+ previous[(NUM_PREVIOUS+3)*v]=NUM_PREVIOUS;
+ previous[(NUM_PREVIOUS+3)*v+3+previous[(NUM_PREVIOUS+3)*v+1]]=i;
+ previous[(NUM_PREVIOUS+3)*v+1]++;
+ if (previous[(NUM_PREVIOUS+3)*v+1]>=NUM_PREVIOUS)
+ previous[(NUM_PREVIOUS+3)*v+1]=0;
+ }
+ previous[(NUM_PREVIOUS+3)*v+2]=i;
+}
+
+void Ptngc_comp_to_lz77(unsigned int *vals, int nvals,
+ unsigned int *data, int *ndata,
+ unsigned int *len, int *nlens,
+ unsigned int *offsets, int *noffsets)
+{
+ int noff=0;
+ int ndat=0;
+ int nlen=0;
+ int i,j;
+ int *previous=warnmalloc(0x20000*(NUM_PREVIOUS+3)*sizeof *previous);
+#if 0
+ unsigned int *info=warnmalloc(2*nvals*sizeof *info);
+ sort_strings(vals,nvals,info);
+#endif
+ for (i=0; i<0x20000; i++)
+ {
+ previous[(NUM_PREVIOUS+3)*i]=0; /* Number of items in a circular buffer */
+ previous[(NUM_PREVIOUS+3)*i+1]=0; /* Pointer to beginning of circular buffer. */
+ previous[(NUM_PREVIOUS+3)*i+2]=-2; /* Last offset that had this value. -2 is really never... */
+ }
+ for (i=0; i<nvals; i++)
+ {
+ int k;
+#if 0
+ int kmin,kmax;
+#endif
+ int firstoffset=i-MAX_OFFSET;
+ if (firstoffset<0)
+ firstoffset=0;
+ if (i!=0)
+ {
+ int largest_len=0;
+ int largest_offset=0;
+ int icirc, ncirc;
+ /* Is this identical to a previous offset? Prefer close
+ values for offset. Search through circular buffer for the
+ possible values for the start of this string. */
+ ncirc=previous[(NUM_PREVIOUS+3)*vals[i]];
+ for (icirc=0; icirc<ncirc; icirc++)
+ {
+ int iptr=previous[(NUM_PREVIOUS+3)*vals[i]+1]-icirc-1;
+ if (iptr<0)
+ iptr+=NUM_PREVIOUS;
+ j=previous[(NUM_PREVIOUS+3)*vals[i]+3+iptr];
+ if (j<firstoffset)
+ break;
+#if 0
+ fprintf(stderr,"Starting search for %d at %d. Found %d\n",vals[i],j,vals[j]);
+#endif
+ while ((j<i) && (vals[j]==vals[i]))
+ {
+ if (j>=firstoffset)
+ {
+ for (k=0; i+k<nvals; k++)
+ if (vals[j+k]!=vals[i+k])
+ break;
+ if ((k>largest_len) && ((k>=(i-j)+16) || ((k>4) && (i-j==1))))
+ {
+ largest_len=k;
+ largest_offset=j;
+ }
+ }
+ j++;
+ }
+ }
+#if 0
+ /* Search in sorted string buffer too. */
+ kmin=info[i*2+1]-MAX_STRING_SEARCH;
+ kmax=info[i*2+1]+MAX_STRING_SEARCH;
+ if (kmin<0)
+ kmin=0;
+ if (kmax>=nvals)
+ kmax=nvals;
+ for (k=kmin; k<kmax; k++)
+ {
+ int m;
+ int s=info[k*2];
+ if ((s<i) && (s+MAX_OFFSET>=i))
+ {
+ for (m=0; i+m<nvals; m++)
+ if (vals[i+m]!=vals[(s+m)%nvals])
+ break;
+ if ((m>largest_len) && (m>4) && (m+2>=(i-s)))
+ {
+ largest_len=m;
+ largest_offset=s;
+#if 0
+ fprintf(stderr,"Offset: %d %d\n",m,i-s);
+#endif
+ }
+ }
+ }
+#endif
+ /* Check how to write this info. */
+ if (largest_len>MAX_LEN)
+ largest_len=MAX_LEN;
+ if (largest_len)
+ {
+ if (i-largest_offset==1)
+ {
+ data[ndat++]=0;
+ }
+ else
+ {
+ data[ndat++]=1;
+ offsets[noff++]=i-largest_offset;
+ }
+ len[nlen++]=largest_len;
+#if 0
+ fprintf(stderr,"l:o: %d:%d data=%d i=%d\n",largest_len,i-largest_offset,ndat,i);
+ fflush(stderr);
+#endif
+
+#if 0
+ fprintf(stderr,"Found largest len %d at %d.\n",largest_len,i-largest_offset);
+#endif
+ /* Add these values to the circular buffer. */
+ for (k=0; k<largest_len; k++)
+ add_circular(previous,vals[i+k],i+k);
+ i+=largest_len-1;
+ }
+ else
+ {
+ data[ndat++]=vals[i]+2;
+ /* Add this value to circular buffer. */
+ add_circular(previous,vals[i],i);
+ }
+ }
+ else
+ {
+ data[ndat++]=vals[i]+2;
+ /* Add this value to circular buffer. */
+ add_circular(previous,vals[i],i);
+ }
+ }
+ *noffsets=noff;
+ *ndata=ndat;
+ *nlens=nlen;
+#if 0
+ free(info);
+#endif
+ free(previous);
+}
+
+void Ptngc_comp_from_lz77(unsigned int *data, int ndata,
+ unsigned int *len, int nlens,
+ unsigned int *offsets, int noffsets,
+ unsigned int *vals, int nvals)
+{
+ int i=0;
+ int joff=0;
+ int jdat=0;
+ int jlen=0;
+ while (i<nvals)
+ {
+ unsigned int v=data[jdat++];
+ if (v<2)
+ {
+ int offset=1;
+ int k;
+ int length=(int)len[jlen++];
+#if 0
+ fprintf(stderr,"len=%d off=%d i=%d\n",length,offset,i);
+#endif
+ if (v==1)
+ offset=offsets[joff++];
+ for (k=0; k<length; k++)
+ {
+ vals[i]=vals[i-offset];
+ if (i>=nvals)
+ {
+ fprintf(stderr,"too many vals.\n");
+ exit(EXIT_FAILURE);
+ }
+ i++;
+ }
+ }
+ else
+ vals[i++]=v-2;
+ }
+}
contact: Jan Huwald // Impressum