summaryrefslogtreecommitdiff
path: root/src/compression/bwt.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/compression/bwt.c')
-rw-r--r--src/compression/bwt.c337
1 files changed, 337 insertions, 0 deletions
diff --git a/src/compression/bwt.c b/src/compression/bwt.c
new file mode 100644
index 0000000..ea75199
--- /dev/null
+++ b/src/compression/bwt.c
@@ -0,0 +1,337 @@
+/* This code is part of the tng compression routines.
+ *
+ * Written by Daniel Spangberg
+ * Copyright (c) 2010, 2013, The GROMACS development team.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "warnmalloc.h"
+#include "bwt.h"
+
+#if 0
+#define SHOWIT
+#endif
+#if 0
+#define SHOWIT2
+#endif
+
+static int compare_index(int i1,int i2,int nvals,unsigned int *vals,unsigned int *nrepeat)
+{
+ int i,j;
+ for (i=0; i<nvals; i++)
+ {
+ /* If we have repeating patterns, we might be able to start the
+ comparison later in the string. */
+ /* Do we have a repeating pattern? If so are
+ the repeating patterns the same length? */
+ int repeat1=(int)(nrepeat[i1]>>8);
+ int k1=(int)(nrepeat[i1]&0xFFU);
+ int repeat2=(int)(nrepeat[i2]>>8);
+ int k2=(int)(nrepeat[i2]&0xFFU);
+
+ if ((repeat1>1) && (repeat1>1) && (k1==k2))
+ {
+ int maxskip=0;
+ /* Yes. Compare the repeating patterns. */
+ for (j=0; j<k1; j++)
+ {
+ unsigned int v1=vals[(i1+j)%nvals];
+ unsigned int v2=vals[(i2+j)%nvals];
+ if (v1<v2)
+ return -1;
+ else if (v1>v2)
+ return 1;
+ }
+ /* The repeating patterns are equal. Skip as far as we can
+ before continuing. */
+ maxskip=repeat1;
+ if (repeat2<repeat1)
+ maxskip=repeat2;
+ i1=(i1+maxskip)%nvals;
+ i2=(i2+maxskip)%nvals;
+ i+=maxskip-1;
+ }
+ else
+ {
+ if (vals[i1]<vals[i2])
+ return -1;
+ else if (vals[i1]>vals[i2])
+ return 1;
+ i1++;
+ if (i1>=nvals)
+ i1=0;
+ i2++;
+ if (i2>=nvals)
+ i2=0;
+ }
+ }
+ return 0;
+}
+
+void Ptngc_bwt_merge_sort_inner(int *indices, int nvals,unsigned int *vals,
+ int start, int end,
+ unsigned int *nrepeat,
+ int *workarray)
+{
+ int middle;
+ if ((end-start)>1)
+ {
+ middle=start+(end-start)/2;
+#if 0
+ printf("For start %d end %d obtained new middle: %d\n",start,end,middle);
+#endif
+ Ptngc_bwt_merge_sort_inner(indices,nvals,vals,
+ start,middle,
+ nrepeat,
+ workarray);
+ Ptngc_bwt_merge_sort_inner(indices,nvals,vals,
+ middle,end,
+ nrepeat,
+ workarray);
+#if 0
+ printf("For start %d end %d Before merge: Comparing element %d with %d\n",start,end,middle-1,middle);
+#endif
+ if (compare_index(indices[middle-1],indices[middle],nvals,vals,nrepeat)>0)
+ {
+ /* Merge to work array. */
+ int i, n=end-start;
+ int ileft=start;
+ int iright=middle;
+ for (i=0; i<n; i++)
+ {
+ if (ileft==middle)
+ {
+ workarray[i]=indices[iright];
+ iright++;
+ }
+ else if (iright==end)
+ {
+ workarray[i]=indices[ileft];
+ ileft++;
+ }
+ else
+ {
+#if 0
+ printf("For start %d end %d In merge: Comparing element %d with %d\n",start,end,ileft,iright);
+#endif
+ if (compare_index(indices[ileft],indices[iright],nvals,vals,nrepeat)>0)
+ {
+ workarray[i]=indices[iright];
+ iright++;
+ }
+ else
+ {
+ workarray[i]=indices[ileft];
+ ileft++;
+ }
+ }
+ }
+ /* Copy result back. */
+ memcpy(indices+start,workarray,(end-start)*sizeof(int));
+ }
+ }
+}
+
+/* Burrows-Wheeler transform. */
+void Ptngc_comp_to_bwt(unsigned int *vals, int nvals,
+ unsigned int *output, int *index)
+{
+ int i;
+ int *indices=warnmalloc(2*nvals*sizeof *indices);
+ unsigned int *nrepeat=warnmalloc(nvals*sizeof *nrepeat);
+ int *warr=indices+nvals;
+
+ if (nvals>0xFFFFFF)
+ {
+ fprintf(stderr,"BWT cannot pack more than %d values.\n",0xFFFFFF);
+ exit(1);
+ }
+
+ /* Also note that repeat pattern k (kmax) cannot be larger than 255. */
+#if 0
+ printf("Size of arrays is %.2f M\n",4*nvals*sizeof *indices/1024./1024.);
+#endif
+ for (i=0; i<nvals; i++)
+ indices[i]=i;
+ /* Find the length of the initial repeating pattern for the strings. */
+ /* First mark that the index does not have a found repeating string. */
+ for (i=0; i<nvals; i++)
+ nrepeat[i]=0U;
+#ifdef SHOWIT
+ printf("nvals is %d\n",nvals);
+#endif
+ for (i=0; i<nvals; i++)
+ {
+ /* If we have not already found a repeating string we must find
+ it. */
+ if (!nrepeat[i])
+ {
+ int maxrepeat=nvals*2;
+ int j,k,m;
+ int good_j=-1, good_k=0;
+ int kmax=16;
+ /* Track repeating patterns.
+ k=1 corresponds to AAAAA...
+ k=2 corresponds to ABABAB...
+ k=3 corresponds to ABCABCABCABC...
+ k=4 corresponds to ABCDABCDABCD...
+ etc. */
+ for (k=kmax; k>=1; k--)
+ {
+ try_next_k:
+ if (k>=1)
+ {
+#ifdef SHOWIT
+ printf("Trying k=%d at i=%d\n",k,i);
+#endif
+ for (j=k; j<maxrepeat; j+=k)
+ {
+ int is_equal=1;
+#ifdef SHOWIT
+ printf("Trying j=%d at i=%d for k %d\n",j,i,k);
+#endif
+ for (m=0; m<k; m++)
+ if (vals[(i+m)%nvals]!=vals[(i+j+m)%nvals])
+ {
+ is_equal=0;
+ break;
+ }
+ if (is_equal)
+ {
+ int new_j=j+k;
+ if (new_j>maxrepeat)
+ new_j=j;
+ if ((new_j>good_j) || ((new_j==good_j) && (k<good_k)))
+ {
+ good_j=new_j; /* We have found that
+ the strings repeat
+ for this length... */
+ good_k=k; /* ...and with this
+ length of the
+ repeating
+ pattern. */
+#ifdef SHOWIT
+ printf("Best j and k is now %d and %d\n",good_j,good_k);
+#endif
+ }
+ }
+ else
+ {
+ /* We know that it is no point in trying
+ with more than m */
+ if (j==0)
+ {
+ k=m;
+#ifdef SHOWIT
+ printf("Setting new k to m: %d\n",k);
+#endif
+ }
+ else
+ k--;
+#ifdef SHOWIT
+ printf("Trying next k\n");
+#endif
+ goto try_next_k;
+ }
+ }
+ }
+ }
+ /* From good_j and good_k we know the repeat for a large
+ number of strings. The very last repeat length should not
+ be assigned, since it can be much longer if a new test is
+ done. */
+ for (m=0; (m+good_k<good_j) && (i+m<nvals); m+=good_k)
+ {
+ int repeat=good_j-m;
+ if (repeat>nvals)
+ repeat=nvals;
+ nrepeat[i+m]=((unsigned int) (good_k)) | (((unsigned int) (repeat))<<8);
+ }
+ /* If no repetition was found for this value signal that here. */
+ if (!nrepeat[i])
+ nrepeat[i+m]=257U; /* This is 1<<8 | 1 */
+ }
+ }
+#ifdef SHOWIT
+ for (i=0; i<nvals; i++)
+ if ((nrepeat[i]>>8)!=1)
+ printf("String repeats at %d: %d %d\n",i,nrepeat[i]>>8,nrepeat[i]&0xFFU);
+#endif
+
+ /* Sort cyclic shift matrix. */
+ Ptngc_bwt_merge_sort_inner(indices,nvals,vals,0,nvals,nrepeat,warr);
+
+#if 0
+ /* Test that it really is sorted. */
+ for (i=0; i<nvals-1; i++)
+ if (compare_index(indices[i],indices[i+1],nvals,vals,nrepeat)!=-1)
+ fprintf(stderr,"SORTING NOT COMPLETED AT %d. Index %d against %d\n",i,indices[i],indices[i+1]);
+
+#endif
+
+
+#ifdef SHOWIT2
+ for (i=0; i<nvals; i++)
+ {
+ int j;
+ for (j=0; j<nvals; j++)
+ printf("%c",vals[(indices[i]+j)%nvals]);
+ printf("\n");
+ }
+#endif
+ /* Which one is the original string? */
+ for (i=0; i<nvals; i++)
+ if (indices[i]==0)
+ break;
+ *index=i;
+ /* Form output. */
+ for (i=0; i<nvals; i++)
+ {
+ int lastchar=indices[i]-1;
+ if (lastchar<0)
+ lastchar=nvals-1;
+ output[i]=vals[lastchar];
+ }
+ free(nrepeat);
+ free(indices);
+}
+
+/* Burrows-Wheeler inverse transform. */
+void Ptngc_comp_from_bwt(unsigned int *input, int nvals, int index,
+ unsigned int *vals)
+{
+ /* Straightforward from the Burrows-Wheeler paper (page 13). */
+ int i;
+ unsigned int *c=warnmalloc(0x10000*sizeof *c);
+ unsigned int *p=warnmalloc(nvals*sizeof *p);
+ unsigned int sum=0;
+ for (i=0; i<0x10000; i++)
+ c[i]=0;
+ for (i=0; i<nvals; i++)
+ {
+ p[i]=c[input[i]];
+ c[input[i]]++;
+ }
+ for (i=0; i<0x10000; i++)
+ {
+ sum+=c[i];
+ c[i]=sum-c[i];
+ }
+ for (i=nvals-1; i>=0; i--)
+ {
+ vals[i]=input[index];
+ index=p[index]+c[input[index]];
+ }
+ free(p);
+ free(c);
+}
+
contact: Jan Huwald // Impressum