From 1a09f345860834fb1c2005e65948fa296b201e8b Mon Sep 17 00:00:00 2001
From: Magnus Lundborg <lundborg.magnus@gmail.com>
Date: Mon, 21 Jul 2014 15:56:54 +0200
Subject: Improved TNG compression speed.

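Replace element-wise zeroing and copy loops with memset()/memcpy(),
operate directly on the coder state in Ptngc_out8bits(), stop the
dictionary scan in Ptngc_comp_make_dict_hist() once nvals distinct
values have been found, start the linear search in
Ptngc_find_magic_index() at the quarter or half mark of the magic
table, peel the first/last iterations out of the large-integer loops,
and mark small helpers TNG_INLINE.

Also register the tng_io_gen_versioned_output example in
src/tests/CMakeLists.txt.

Change-Id: I71c66c6b534cb402048dcd75e008d3db4bd3fb71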

diff --git a/src/compression/bwt.c b/src/compression/bwt.c
index 66d3ecf..681d66b 100644
--- a/src/compression/bwt.c
+++ b/src/compression/bwt.c
@@ -1,7 +1,7 @@
 /* This code is part of the tng compression routines.
  *
- * Written by Daniel Spangberg
- * Copyright (c) 2010, 2013, The GROMACS development team.
+ * Written by Daniel Spangberg and Magnus Lundborg
+ * Copyright (c) 2010, 2013-2014 The GROMACS development team.
  *
  *
  * This program is free software; you can redistribute it and/or
@@ -162,8 +162,9 @@ void Ptngc_comp_to_bwt(unsigned int *vals, int nvals,
     indices[i]=i;
   /* Find the length of the initial repeating pattern for the strings. */
   /* First mark that the index does not have a found repeating string. */
-  for (i=0; i<nvals; i++)
-    nrepeat[i]=0U;
+
+  memset(nrepeat, 0U, sizeof(unsigned int) * nvals);
+
 #ifdef SHOWIT
   printf("nvals is %d\n",nvals);
 #endif
@@ -312,8 +313,9 @@ void Ptngc_comp_from_bwt(unsigned int *input, int nvals, int index,
   unsigned int *c=warnmalloc(0x10000*sizeof *c);
   unsigned int *p=warnmalloc(nvals*sizeof *p);
   unsigned int sum=0;
-  for (i=0; i<0x10000; i++)
-    c[i]=0;
+
+  memset(c, 0, sizeof(unsigned int) * 0x10000);
+
   for (i=0; i<nvals; i++)
     {
       p[i]=c[input[i]];
diff --git a/src/compression/coder.c b/src/compression/coder.c
index cf89140..d51e8cd 100644
--- a/src/compression/coder.c
+++ b/src/compression/coder.c
@@ -1,7 +1,7 @@
 /* This code is part of the tng compression routines.
  *
- * Written by Daniel Spangberg
- * Copyright (c) 2010, 2013, The GROMACS development team.
+ * Written by Daniel Spangberg and Magnus Lundborg
+ * Copyright (c) 2010, 2013-2014 The GROMACS development team.
  *
  *
  * This program is free software; you can redistribute it and/or
@@ -44,19 +44,17 @@ void DECLSPECDLLEXPORT Ptngc_coder_deinit(struct coder *coder_inst)
 
 TNG_INLINE void DECLSPECDLLEXPORT Ptngc_out8bits(struct coder *coder_inst, unsigned char **output)
 {
-  int pack_temporary_bits=coder_inst->pack_temporary_bits;
-  unsigned int pack_temporary=coder_inst->pack_temporary;
-  while (pack_temporary_bits>=8)
+  while (coder_inst->pack_temporary_bits>=8)
     {
-      unsigned int mask=~(0xFFU<<(pack_temporary_bits-8));
-      unsigned char out=(unsigned char)(pack_temporary>>(pack_temporary_bits-8));
+      unsigned int mask;
+      unsigned char out;
+      coder_inst->pack_temporary_bits-=8;
+      mask=~(0xFFU<<(coder_inst->pack_temporary_bits));
+      out=(unsigned char)(coder_inst->pack_temporary>>(coder_inst->pack_temporary_bits));
       **output=out;
       (*output)++;
-      pack_temporary_bits-=8;
-      pack_temporary&=mask;
+      coder_inst->pack_temporary&=mask;
     }
-  coder_inst->pack_temporary_bits=pack_temporary_bits;
-  coder_inst->pack_temporary=pack_temporary;
 }
 
 void DECLSPECDLLEXPORT Ptngc_write_pattern(struct coder *coder_inst, unsigned int pattern,
@@ -102,11 +100,11 @@ void DECLSPECDLLEXPORT Ptngc_write32bits(struct coder *coder_inst,unsigned int v
   while (nbits>8)
     {
       /* Make room for the bits. */
+      nbits-=8;
       coder_inst->pack_temporary<<=8;
       coder_inst->pack_temporary_bits+=8;
-      coder_inst->pack_temporary|=(value&mask)>>(nbits-8);
+      coder_inst->pack_temporary|=(value&mask)>>(nbits);
       Ptngc_out8bits(coder_inst,output_ptr);
-      nbits-=8;
       mask>>=8;
     }
   if (nbits)
@@ -246,7 +244,6 @@ unsigned char DECLSPECDLLEXPORT *Ptngc_pack_array(struct coder *coder_inst,
             {
               int item=input[k*3*natoms+i*3+j];
               pval[cnt++]=(unsigned int)(item+most_negative);
-
             }
       if (speed>=5)
         bwlzh_compress(pval,n,output+4,length);
diff --git a/src/compression/dict.c b/src/compression/dict.c
index 24e6ae7..fdfe7a0 100644
--- a/src/compression/dict.c
+++ b/src/compression/dict.c
@@ -1,7 +1,7 @@
 /* This code is part of the tng compression routines.
  *
- * Written by Daniel Spangberg
- * Copyright (c) 2010, 2013, The GROMACS development team.
+ * Written by Daniel Spangberg and Magnus Lundborg
+ * Copyright (c) 2010, 2013-2014 The GROMACS development team.
  *
  *
  * This program is free software; you can redistribute it and/or
@@ -17,6 +17,7 @@ void Ptngc_comp_canonical_dict(unsigned int *dict, int *ndict)
   int i;
   for (i=0; i<0x20004; i++)
     dict[i]=i;
+
   *ndict=0x20004;
 }
 
@@ -26,18 +27,19 @@ void Ptngc_comp_make_dict_hist(unsigned int *vals, int nvals,
 {
   int i;
   int j=0;
-  for (i=0; i<0x20004; i++)
-    hist[i]=0;
-  for (i=0; i<0x20004; i++)
-    dict[i]=i;
+
+  memset(hist, 0, sizeof(unsigned int)*0x20004);
+
   for (i=0; i<nvals; i++)
     hist[vals[i]]++;
   for (i=0; i<0x20004; i++)
     if (hist[i]!=0)
       {
         hist[j]=hist[i];
-        dict[j]=dict[i];
+        dict[j]=i;
         j++;
+        if(j==nvals)
+          break;
       }
   *ndict=j;
 }
diff --git a/src/compression/mtf.c b/src/compression/mtf.c
index d6eaf30..bfb0e19 100644
--- a/src/compression/mtf.c
+++ b/src/compression/mtf.c
@@ -1,7 +1,7 @@
 /* This code is part of the tng compression routines.
  *
- * Written by Daniel Spangberg
- * Copyright (c) 2010, 2013, The GROMACS development team.
+ * Written by Daniel Spangberg and Magnus Lundborg
+ * Copyright (c) 2010, 2013-2014 The GROMACS development team.
  *
  *
  * This program is free software; you can redistribute it and/or
@@ -10,6 +10,7 @@
 
 
 #include <stdlib.h>
+#include <string.h>
 #include "../../include/compression/warnmalloc.h"
 #include "../../include/compression/mtf.h"
 
@@ -63,8 +64,9 @@ void Ptngc_comp_conv_to_mtf_partial(unsigned int *vals, int nvals,
 {
   unsigned char *tmp=warnmalloc(nvals*2);
   int i, j;
-  for (i=0; i<nvals; i++)
-    valsmtf[i]=0U;
+
+  memset(valsmtf, 0U, sizeof(unsigned int) * nvals);
+
   for (j=0; j<3; j++)
     {
       for (i=0; i<nvals; i++)
@@ -138,8 +140,9 @@ void Ptngc_comp_conv_from_mtf_partial(unsigned int *valsmtf, int nvals,
 {
   unsigned char *tmp=warnmalloc(nvals*2);
   int i, j;
-  for (i=0; i<nvals; i++)
-    vals[i]=0U;
+
+  memset(vals, 0U, sizeof(unsigned int) * nvals);
+
   for (j=0; j<3; j++)
     {
       for (i=0; i<nvals; i++)
@@ -156,8 +159,9 @@ void Ptngc_comp_conv_from_mtf_partial3(unsigned char *valsmtf, int nvals,
 {
   unsigned char *tmp=warnmalloc(nvals);
   int i, j;
-  for (i=0; i<nvals; i++)
-    vals[i]=0U;
+
+  memset(vals, 0U, sizeof(unsigned int) * nvals);
+
   for (j=0; j<3; j++)
     {
       comp_conv_from_mtf_byte(valsmtf+j*nvals,nvals,tmp);
diff --git a/src/compression/widemuldiv.c b/src/compression/widemuldiv.c
index 7d03c57..63056f9 100644
--- a/src/compression/widemuldiv.c
+++ b/src/compression/widemuldiv.c
@@ -1,7 +1,7 @@
 /* This code is part of the tng compression routines.
  *
- * Written by Daniel Spangberg
- * Copyright (c) 2010, 2013, The GROMACS development team.
+ * Written by Daniel Spangberg and Magnus Lundborg
+ * Copyright (c) 2010, 2013-2014 The GROMACS development team.
  *
  *
  * This program is free software; you can redistribute it and/or
@@ -11,6 +11,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "../../include/compression/tng_compress.h"
 
@@ -31,8 +32,14 @@
 #endif /* gcc & x86_64 */
 #endif /* TRAJNG X86 GCC INLINE MULDIV */
 
+#ifdef USE_WINDOWS
+#define TNG_INLINE __inline
+#else
+#define TNG_INLINE inline
+#endif
+
 /* Multiply two 32 bit unsigned integers returning a 64 bit unsigned value (in two integers) */
-void Ptngc_widemul(unsigned int i1, unsigned int i2, unsigned int *ohi, unsigned int *olo)
+TNG_INLINE void Ptngc_widemul(unsigned int i1, unsigned int i2, unsigned int *ohi, unsigned int *olo)
 {
 #if defined(TRAJNG_X86_GCC_INLINE_MULDIV)
   __asm__ __volatile__ ("mull %%edx\n\t"
@@ -99,7 +106,7 @@ void Ptngc_widemul(unsigned int i1, unsigned int i2, unsigned int *ohi, unsigned
 
 /* Divide a 64 bit unsigned value in hi:lo with the 32 bit value i and
    return the result in result and the remainder in remainder */
-void Ptngc_widediv(unsigned int hi, unsigned int lo, unsigned int i, unsigned int *result, unsigned int *remainder)
+TNG_INLINE void Ptngc_widediv(unsigned int hi, unsigned int lo, unsigned int i, unsigned int *result, unsigned int *remainder)
 {
 #if defined(TRAJNG_X86_GCC_INLINE_MULDIV)
   __asm__ __volatile__ ("divl %%ecx\n\t"
@@ -163,7 +170,7 @@ void Ptngc_widediv(unsigned int hi, unsigned int lo, unsigned int i, unsigned in
 
 /* Add a unsigned int to a largeint. j determines which value in the
    largeint to add v1 to. */
-static void largeint_add_gen(unsigned int v1, unsigned int *largeint, int n, int j)
+TNG_INLINE static void largeint_add_gen(const unsigned int v1, unsigned int *largeint, const int n, int j)
 {
   /* Add with carry. unsigned ints in C wrap modulo 2**bits when "overflowed". */
   unsigned int v2=(v1+largeint[j])&0xFFFFFFFFU; /* Add and cap at 32 bits */
@@ -184,46 +191,50 @@ static void largeint_add_gen(unsigned int v1, unsigned int *largeint, int n, int
 }
 
 /* Add a unsigned int to a largeint. */
-void Ptngc_largeint_add(unsigned int v1, unsigned int *largeint, int n)
+void Ptngc_largeint_add(const unsigned int v1, unsigned int *largeint, const int n)
 {
   largeint_add_gen(v1,largeint,n,0);
 }
 
 /* Multiply v1 with largeint_in and return result in largeint_out */
-void Ptngc_largeint_mul(unsigned int v1, unsigned int *largeint_in, unsigned int *largeint_out, int n)
+TNG_INLINE void Ptngc_largeint_mul(const unsigned int v1, unsigned int *largeint_in, unsigned int *largeint_out, const int n)
 {
   int i;
-  for (i=0; i<n; i++)
-    largeint_out[i]=0U;
-  for (i=0; i<n; i++)
+  unsigned int lo,hi;
+
+  if (n>0) memset(largeint_out, 0U, sizeof(unsigned int) * n); /* n<=0: nothing to clear, as with the old loop */
+
+  for (i=0; i<n-1; i++)
     {
       if (largeint_in[i]!=0U)
         {
-          unsigned int lo,hi;
           Ptngc_widemul(v1,largeint_in[i],&hi,&lo); /* 32x32->64 mul */
           largeint_add_gen(lo,largeint_out,n,i);
-          if (i+1<n)
-            largeint_add_gen(hi,largeint_out,n,i+1);
+          largeint_add_gen(hi,largeint_out,n,i+1);
         }
     }
+  if (n>0 && largeint_in[i]!=0U) /* i==n-1 here: top word, its high half is dropped; never read when n<=0 */
+    {
+      Ptngc_widemul(v1,largeint_in[i],&hi,&lo); /* 32x32->64 mul */
+      largeint_add_gen(lo,largeint_out,n,i);
+    }
 }
 
 /* Return the remainder from dividing largeint_in with v1. Result of the division is returned in largeint_out */
-unsigned int Ptngc_largeint_div(unsigned int v1, unsigned int *largeint_in, unsigned int *largeint_out, int n)
+TNG_INLINE unsigned int Ptngc_largeint_div(const unsigned int v1, unsigned int *largeint_in, unsigned int *largeint_out, const int n)
 {
   unsigned int result,remainder=0;
   int i;
-  unsigned int hi, lo;
+  unsigned int hi;
   /* Boot */
   hi=0U;
   i=n;
   while (i)
     {
-      lo=largeint_in[i-1];
-      Ptngc_widediv(hi,lo,v1,&result,&remainder);
-      largeint_out[i-1]=result;
-      hi=remainder;
       i--;
+      Ptngc_widediv(hi,largeint_in[i],v1,&result,&remainder);
+      largeint_out[i]=result;
+      hi=remainder;
     }
   return remainder;
 }
diff --git a/src/compression/xtc2.c b/src/compression/xtc2.c
index bbd45e9..e5b688a 100644
--- a/src/compression/xtc2.c
+++ b/src/compression/xtc2.c
@@ -1,7 +1,7 @@
 /* This code is part of the tng compression routines.
  *
- * Written by Daniel Spangberg
- * Copyright (c) 2010, 2013, The GROMACS development team.
+ * Written by Daniel Spangberg and Magnus Lundborg
+ * Copyright (c) 2010, 2013-2014 The GROMACS development team.
  *
  *
  * This program is free software; you can redistribute it and/or
@@ -26,6 +26,12 @@
 /* Generated by gen_magic.py */
 #define MAX_MAGIC  92
 
+#ifdef USE_WINDOWS
+#define TNG_INLINE __inline
+#else
+#define TNG_INLINE inline
+#endif
+
 static unsigned int magic[MAX_MAGIC]={
 2U,  3U,  4U,  5U,
 6U,  8U,  10U,  12U,
@@ -159,20 +165,37 @@ static const double iflipgaincheck=0.89089871814033927; /*  1./(2**(1./6)) */
 #define SHOWIT
 #endif
 
 int Ptngc_magic(unsigned int i)
 {
   return magic[i];
 }
 
-int Ptngc_find_magic_index(unsigned int maxval)
+int Ptngc_find_magic_index(const unsigned int maxval) /* external linkage (declared for use in xtc3.c): a plain C99 inline definition would emit no symbol */
 {
-  int i=0;
+  int i;
+
+  if(maxval > magic[MAX_MAGIC/4]) /* skip ahead before the linear scan; assumes magic[] is ascending */
+  {
+      if(maxval > magic[MAX_MAGIC/2])
+      {
+          i = MAX_MAGIC/2 + 1;
+      }
+      else
+      {
+          i = MAX_MAGIC/4 + 1;
+      }
+  }
+  else
+  {
+      i = 0;
+  }
+
   while (magic[i]<=maxval)
     i++;
   return i;
 }
 
-static unsigned int positive_int(int item)
+TNG_INLINE static unsigned int positive_int(const int item)
 {
   int s=0;
   if (item>0)
@@ -182,7 +211,7 @@ static unsigned int positive_int(int item)
   return s;
 }
 
-static int unpositive_int(int val)
+TNG_INLINE static int unpositive_int(const int val)
 {
   int s=(val+1)/2;
   if ((val%2)==0)
@@ -438,23 +467,26 @@ static int compute_magic_bits(int *index)
 /* Convert a sequence of (hopefully) small positive integers
    using the base pointed to by index (x base, y base and z base can be different).
    The largest number of integers supported is 18 (29 to handle/detect overflow) */
-static void trajcoder_base_compress(int *input, int n, int *index, unsigned char *result)
+static void trajcoder_base_compress(int *input, const int n, int *index, unsigned char *result)
 {
   unsigned int largeint[19];
   unsigned int largeint_tmp[19];
-  int i,j;
-  for (i=0; i<19; i++)
-    largeint[i]=0U;
+  int i, j;
+
+  memset(largeint, 0U, sizeof(unsigned int) * 19);
 
-  for (i=0; i<n; i++)
+  if(n > 0)
     {
-      if (i!=0)
-        {
-          /* We must do the multiplication of the largeint with the integer base */
-          Ptngc_largeint_mul(magic[index[i%3]],largeint,largeint_tmp,19);
-          for (j=0; j<19; j++)
-            largeint[j]=largeint_tmp[j];
-        }
+      Ptngc_largeint_add(input[0],largeint,19);
+    }
+
+  for (i=1; i<n; i++)
+    {
+      /* We must do the multiplication of the largeint with the integer base */
+      Ptngc_largeint_mul(magic[index[i%3]],largeint,largeint_tmp,19);
+
+      memcpy(largeint,largeint_tmp,19*sizeof *largeint);
+
       Ptngc_largeint_add(input[i],largeint,19);
     }
   if (largeint[18])
@@ -481,7 +513,7 @@ static void trajcoder_base_compress(int *input, int n, int *index, unsigned char
 }
 
 /* The opposite of base_compress. */
-static void trajcoder_base_decompress(unsigned char *input, int n, int *index, int *output)
+static void trajcoder_base_decompress(unsigned char *input, const int n, int *index, int *output)
 {
   unsigned int largeint[19];
   unsigned int largeint_tmp[19];
@@ -524,7 +556,7 @@ static void trajcoder_base_decompress(unsigned char *input, int n, int *index, i
 /* It is "large" if we have to increase the small index quite a
    bit. Not so much to be rejected by the not very large check
    later. */
-static int is_quite_large(int *input, int small_index, int max_large_index)
+static int is_quite_large(int *input, const int small_index, const int max_large_index)
 {
   int is=0;
   int i;
@@ -547,7 +579,7 @@ int nbits_sum;
 int nvalues_sum;
 #endif
 
-static void write_three_large(struct coder *coder, int *encode_ints, int *large_index, int nbits, unsigned char *compress_buffer, unsigned char **output_ptr)
+static void write_three_large(struct coder *coder, int *encode_ints, int *large_index, const int nbits, unsigned char *compress_buffer, unsigned char **output_ptr)
 {
   trajcoder_base_compress(encode_ints,3,large_index,compress_buffer);
   Ptngc_writemanybits(coder,compress_buffer,nbits,output_ptr);
@@ -559,7 +591,7 @@ static void write_three_large(struct coder *coder, int *encode_ints, int *large_
 #endif
 }
 
-static void insert_batch(int *input_ptr, int ntriplets_left, int *prevcoord,int *minint, int *encode_ints, int startenc, int *nenc)
+static void insert_batch(int *input_ptr, int ntriplets_left, const int *prevcoord,int *minint, int *encode_ints, const int startenc, int *nenc)
 {
   int nencode=startenc*3;
   int tmp_prevcoord[3];
@@ -617,8 +649,8 @@ static void insert_batch(int *input_ptr, int ntriplets_left, int *prevcoord,int
   *nenc=nencode;
 }
 
-static void flush_large(struct coder *coder, int *has_large, int *has_large_ints, int n,
-                        int *large_index, int large_nbits, unsigned char *compress_buffer,
+static void flush_large(struct coder *coder, int *has_large, int *has_large_ints, const int n,
+                        int *large_index, const int large_nbits, unsigned char *compress_buffer,
                         unsigned char **output_ptr)
 {
   int i;
@@ -651,7 +683,7 @@ static void flush_large(struct coder *coder, int *has_large, int *has_large_ints
 }
 
 static void buffer_large(struct coder *coder, int *has_large, int *has_large_ints, int *new_large_ints,
-                        int *large_index, int large_nbits, unsigned char *compress_buffer,
+                        int *large_index, const int large_nbits, unsigned char *compress_buffer,
                         unsigned char **output_ptr)
 {
   /* If it is full we must write them all. */
@@ -709,13 +741,15 @@ unsigned char *Ptngc_pack_array_xtc2(struct coder *coder,int *input, int *length
   maxint[2]=minint[2]=input[2];
 
   for (i=1; i<ntriplets; i++)
-    for (j=0; j<3; j++)
-      {
-        if (input[i*3+j]>maxint[j])
-          maxint[j]=input[i*3+j];
-        if (input[i*3+j]<minint[j])
-          minint[j]=input[i*3+j];
-      }
+    {
+      for (j=0; j<3; j++)
+        {
+          if (input[i*3+j]>maxint[j])
+            maxint[j]=input[i*3+j];
+          if (input[i*3+j]<minint[j])
+            minint[j]=input[i*3+j];
+        }
+    }
 
   large_index[0]=Ptngc_find_magic_index(maxint[0]-minint[0]+1);
   large_index[1]=Ptngc_find_magic_index(maxint[1]-minint[1]+1);
@@ -794,9 +828,7 @@ unsigned char *Ptngc_pack_array_xtc2(struct coder *coder,int *input, int *length
 #endif
 
   /* Initial prevcoord is the minimum integers. */
-  prevcoord[0]=minint[0];
-  prevcoord[1]=minint[1];
-  prevcoord[2]=minint[2];
+  memcpy(prevcoord, minint, 3*sizeof *prevcoord);
 
   while (ntriplets_left)
     {
@@ -1304,9 +1336,7 @@ int Ptngc_unpack_array_xtc2(struct coder *coder,unsigned char *packed,int *outpu
 #endif
 
   /* Initial prevcoord is the minimum integers. */
-  prevcoord[0]=minint[0];
-  prevcoord[1]=minint[1];
-  prevcoord[2]=minint[2];
+  memcpy(prevcoord, minint, 3*sizeof *prevcoord);
 
   while (ntriplets_left)
     {
@@ -1329,9 +1359,7 @@ int Ptngc_unpack_array_xtc2(struct coder *coder,unsigned char *packed,int *outpu
               /* Get the large value. */
               readmanybits(&ptr,&bitptr,large_nbits,compress_buffer);
               trajcoder_base_decompress(compress_buffer,3,large_index,encode_ints);
-              large_ints[0]=encode_ints[0];
-              large_ints[1]=encode_ints[1];
-              large_ints[2]=encode_ints[2];
+              memcpy(large_ints, encode_ints, 3*sizeof *large_ints);
 #ifdef SHOWIT
               fprintf(stderr,"large ints: %d %d %d\n",large_ints[0],large_ints[1],large_ints[2]);
 #endif
@@ -1444,16 +1472,12 @@ int Ptngc_unpack_array_xtc2(struct coder *coder,unsigned char *packed,int *outpu
               /* Get the large value. */
               readmanybits(&ptr,&bitptr,large_nbits,compress_buffer);
               trajcoder_base_decompress(compress_buffer,3,large_index,encode_ints);
-              large_ints[0]=encode_ints[0];
-              large_ints[1]=encode_ints[1];
-              large_ints[2]=encode_ints[2];
+              memcpy(large_ints, encode_ints, 3*sizeof *large_ints);
               /* Output large value */
               *output++=large_ints[0]+minint[0];
               *output++=large_ints[1]+minint[1];
               *output++=large_ints[2]+minint[2];
-              prevcoord[0]=large_ints[0];
-              prevcoord[1]=large_ints[1];
-              prevcoord[2]=large_ints[2];
+              memcpy(prevcoord, large_ints, 3*sizeof *prevcoord);
             }
           ntriplets_left-=n;
         }
diff --git a/src/compression/xtc3.c b/src/compression/xtc3.c
index 95483b3..673b321 100644
--- a/src/compression/xtc3.c
+++ b/src/compression/xtc3.c
@@ -1,7 +1,7 @@
 /* This code is part of the tng compression routines.
  *
- * Written by Daniel Spangberg
- * Copyright (c) 2010, 2013, The GROMACS development team.
+ * Written by Daniel Spangberg and Magnus Lundborg
+ * Copyright (c) 2010, 2013-2014 The GROMACS development team.
  *
  *
  * This program is free software; you can redistribute it and/or
@@ -60,11 +60,17 @@ static const double iflipgaincheck=0.89089871814033927; /*  1./(2**(1./6)) */
 #define SHOWIT_LIGHT
 #endif
 
+#ifdef USE_WINDOWS
+#define TNG_INLINE __inline
+#else
+#define TNG_INLINE inline
+#endif
+
 /* These routines are in xtc2.c */
 int Ptngc_magic(unsigned int i);
 int Ptngc_find_magic_index(unsigned int maxval);
 
-static unsigned int positive_int(int item)
+TNG_INLINE static unsigned int positive_int(int item)
 {
   int s=0;
   if (item>0)
@@ -74,7 +80,7 @@ static unsigned int positive_int(int item)
   return s;
 }
 
-static int unpositive_int(int val)
+TNG_INLINE static int unpositive_int(int val)
 {
   int s=(val+1)/2;
   if ((val%2)==0)
@@ -574,15 +580,15 @@ static int base_bytes(unsigned int base, int n)
   unsigned int largeint[MAXMAXBASEVALS+1];
   unsigned int largeint_tmp[MAXMAXBASEVALS+1];
   int numbytes=0;
-  for (i=0; i<n+1; i++)
-    largeint[i]=0U;
+
+  memset(largeint, 0U, sizeof(unsigned int) * (n+1));
+
   for (i=0; i<n; i++)
     {
       if (i!=0)
         {
           Ptngc_largeint_mul(base,largeint,largeint_tmp,n+1);
-          for (j=0; j<n+1; j++)
-            largeint[j]=largeint_tmp[j];
+          memcpy(largeint, largeint_tmp, (n+1)*sizeof *largeint);
         }
       Ptngc_largeint_add(base-1U,largeint,n+1);
     }
@@ -612,8 +618,9 @@ static void base_compress(unsigned int *data, int len, unsigned char *output, in
       unsigned int base=0U;
       int nvals=0;
       int basegiven=0;
-      for (j=0; j<MAXBASEVALS+1; j++)
-        largeint[j]=0U;
+
+      memset(largeint, 0U, sizeof(unsigned int) * (MAXBASEVALS+1));
+
       for (i=ixyz; i<len; i+=3)
         {
          if (nvals==0)
@@ -679,8 +686,8 @@ static void base_compress(unsigned int *data, int len, unsigned char *output, in
               fprintf(stderr,"\n");
 #endif
               nvals=0;
-              for (j=0; j<MAXBASEVALS+1; j++)
-                largeint[j]=0U;
+
+              memset(largeint, 0U, sizeof(unsigned int) * (MAXBASEVALS+1));
             }
         }
       if (nvals)
@@ -747,8 +754,7 @@ static void base_decompress(unsigned char *input, int len, unsigned int *output)
               fprintf(stderr,"Base for %d is %u. I need %d bytes for %d values.\n",ixyz,base,numbytes,nvals_left);
 #endif
             }
-          for (j=0; j<maxbasevals+1; j++)
-            largeint[j]=0U;
+          memset(largeint, 0U, sizeof(unsigned int) * (maxbasevals+1));
 #ifdef SHOWIT
           fprintf(stderr,"Reading largeint: ");
 #endif
@@ -840,9 +846,8 @@ unsigned char *Ptngc_pack_array_xtc3(int *input, int *length, int natoms, int sp
   struct xtc3_context xtc3_context;
   init_xtc3_context(&xtc3_context);
 
-  xtc3_context.maxint[0]=xtc3_context.minint[0]=input[0];
-  xtc3_context.maxint[1]=xtc3_context.minint[1]=input[1];
-  xtc3_context.maxint[2]=xtc3_context.minint[2]=input[2];
+  memcpy(xtc3_context.maxint, input, 3*sizeof *xtc3_context.maxint);
+  memcpy(xtc3_context.minint, input, 3*sizeof *xtc3_context.minint);
 
   /* Values of speed should be sane. */
   if (speed<1)
@@ -1722,9 +1728,7 @@ static void unpack_one_large(struct xtc3_context *xtc3_context,
         +output[outdata-natoms*3+2+didswap*3];
       (*ilargeinter)+=3;
     }
-  prevcoord[0]=large_ints[0];
-  prevcoord[1]=large_ints[1];
-  prevcoord[2]=large_ints[2];
+  memcpy(prevcoord, large_ints, 3*sizeof *prevcoord);
   output[outdata]=large_ints[0];
   output[outdata+1]=large_ints[1];
   output[outdata+2]=large_ints[2];
@@ -1833,9 +1837,7 @@ int Ptngc_unpack_array_xtc3(unsigned char *packed,int *output, int length, int n
     }
 
   /* Initial prevcoord is the minimum integers. */
-  prevcoord[0]=minint[0];
-  prevcoord[1]=minint[1];
-  prevcoord[2]=minint[2];
+  memcpy(prevcoord, minint, 3*sizeof *prevcoord);
 
   while (ntriplets_left>0 && iinstr<xtc3_context.ninstr)
     {
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index f465b6b..cd2d7fd 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -74,6 +74,13 @@ if(TNG_BUILD_EXAMPLES)
     endif()
     set_property(TARGET tng_io_read_pos_util PROPERTY RUNTIME_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/examples)
 
+    add_executable(tng_io_gen_versioned_output tng_io_gen_versioned_output.c)
+    target_link_libraries(tng_io_gen_versioned_output tng_io)
+    if(HAVE_INTTYPES_H)
+      set_property(TARGET tng_io_gen_versioned_output APPEND PROPERTY COMPILE_DEFINITIONS USE_STD_INTTYPES_H=1)
+    endif()
+    set_property(TARGET tng_io_gen_versioned_output PROPERTY RUNTIME_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/examples)
+
     if(TNG_BUILD_FORTRAN)
         # This does not work due to a bug in CMake. Remove lines below if no fortran compiler is found.
         enable_language(Fortran OPTIONAL)
-- 
cgit v0.10.1