1 files changed, 144 insertions, 0 deletions
diff --git a/keccak.cu b/keccak.cu
new file mode 100644
index 0000000..8614aa5
--- /dev/null
+++ b/keccak.cu
@@ -0,0 +1,144 @@
+#include "keccak.cuh"
+
+namespace keccak {
+
+// CUDA helper functions
+__device__ uint thread_id()     { return threadIdx.x; }
+__device__ uint warp_num()      { return thread_id() / 32; }
+__device__ uint warp_id()       { return thread_id() % 32; }
+
+__device__ bool warp_leader()   { return thread_id() % 32 == 0; }
+__device__ bool leading_warp()  { return warp_num() == 0; }
+
+#define PAR_FOR(count)				\
+  if (thread_id() < (count))
+
+// unsigned version of shuffle intrinsic
+__device__ uint32_t shuffle(uint32_t val, uint32_t lane) {
+  return __shfl((int) val, lane);
+}
+
+
+// KECCAK constants
+ __constant__ uint32_t roundConsts[22] = {
+   0x00000001,
+   0x00008082,
+   0x0000808a,
+   0x80008000,
+   0x0000808b,
+   0x80000001,
+   0x80008081,
+   0x00008009,
+   0x0000008a,
+   0x00000088,
+   0x80008009,
+   0x8000000a,
+   0x8000808b,
+   0x0000008b,
+   0x00008089,
+   0x00008003,
+   0x00008002,
+   0x00000080,
+   0x0000800a,
+   0x8000000a,
+   0x80008081,
+   0x00008080
+};
+
+// TEST: use smaller type (8b/16b) for performance
+__constant__ uint32_t rotateConsts[25] = {
+//y0  y1  y2  y3  y4
+  0, 36,  3, 41, 18, // x0
+   1, 44, 10, 45,  2,// x1
+  62,  6, 43, 15, 61,// x2
+  27, 20, 39, 14,  8,// x3
+  28, 55, 25, 21, 55 // x4
+};
+
+// TEST: use smaller type (8b/16b) for performance
+__constant__ uint32_t shuffleConsts[25] = {
+   0, 15,  5, 20, 10,
+   6, 21, 11,  1, 16,
+  12,  2, 17,  7, 22,
+  18,  8, 23, 13,  3,
+  24, 14, 4,  19,  9
+};
+
+
+// KECCAK functions
+__device__ uint32_t rot(uint32_t w, uint32_t offset) {
+  return (w << offset) 
+       ^ (w >> 32-offset);
+}
+
+__device__ void round(RegisterState &s, uint8_t round_no) {
+  uint32_t  &a = s.a;
+
+  // \Theta: parity is stored in threads 0..4 of each warp
+  uint32_t p = 0;
+  for (int i=0; i<5; i++)
+    p ^= shuffle(a, 5*warp_id() + i);
+  p =     shuffle(p, (warp_id() - 1) % 5)
+    ^ rot(shuffle(p, (warp_id() + 1) % 5), 1);
+  a ^= shuffle(p, warp_id() / 5);
+
+  // \rho+\pi
+  a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]);
+
+  // \Chi
+  a ^= (~ shuffle(a, (warp_id() + 5 ) % 25))
+        & shuffle(a, (warp_id() + 10) % 25);
+
+  // \iota: round constant
+  if (warp_leader())
+    a ^= roundConsts[round_no];
+}
+
+__device__ void init(RegisterState &s) {
+  s.a = 0;
+}
+
+__device__ void add_l(RegisterState &s, uint32_t data) {
+  // sponge: xor data into registers
+  s.a ^= data;
+
+  // round function: 22 iterations (12 + 2*log(sizeof(word)))
+  #pragma unroll
+  for (int i=0; i<22; i++)
+    round(s, i);
+}
+
+__device__ void add(RegisterState &s, uint32_t data) {
+  // phase 1: use the data value from the own thread
+  PAR_FOR(25)
+    add_l(s, data);
+
+  // phase 2:
+  // - use the data value from the disabled threads with warp_id >= 25
+  // - threads 8-24 add 10*1 padding to full block
+  data = shuffle(data, (warp_id() + 25) % 32);
+  if (warp_id() >= 8)  data = 0;
+  if (warp_id() == 8)  data = 0xA00000;
+  if (warp_id() == 24) data = 0x000001;
+  PAR_FOR(25)
+    add_l(s, data);
+}
+
+__device__ void finish(RegisterState &s, SharedState &ss, Result &dst) {
+  // store the resulting hash from all warps in shared memory (768
+  // bytes of shared memory could be saved by streaming them one word
+  // at a time instead)
+  if (warp_id() < digest_words)
+    ss.digests[warp_num()].digest[warp_id()] = s.a;
+  __syncthreads();
+
+  if (leading_warp()) {
+    init(s);
+    for (int i=0; i<digest_words; i++)
+      add(s, ss.digests[thread_id()].digest[i]);
+  }
+  PAR_FOR(digest_words)
+    dst.digest[thread_id()] = s.a;
+}
+
+} // namespace keccak