From ac11c3b784234d93cc10ca931a178bf50ee26faf Mon Sep 17 00:00:00 2001
From: Jan Huwald <jh@sotun.de>
Date: Wed, 26 Jun 2013 15:23:07 +0200
Subject: Initial commit

Throughput: 330 MB/s on one multiprocessor of a GTX680

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8e18520
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+SDK       := /usr/local/cuda-5.0
+
+NVCC      := $(SDK)/bin/nvcc 
+NVLINK    := g++ -fPIC
+NVCCFLAGS := -O2 --compiler-options -fno-strict-aliasing \
+	     -gencode=arch=compute_30,code=\"sm_30,compute_30\" \
+	     --ptxas-options="-v" \
+	     -I.
+
+NVLIB     := -L$(SDK)/lib64 -lcudart $(LIB)
+
+ENV       := PATH=/home/mit/csb/biosys/CUDA/gcc-4.6:$$PATH:$(SDK)/bin \
+             LD_LIBRARY_PATH=$$LD_LIBRARY_PATH:$(SDK)/lib64:$(SDK)/lib
+SHELL     := /bin/bash
+
+keccak_bench: keccak.cu keccak_bench.cu keccak.cuh keccak_bench.cuh
+	$(ENV) $(NVCC) $(NVCCFLAGS) $(NVLIB) -o $@ keccak_bench.cu
+
+clean:
+	rm keccak_bench *{~,.{o,ptx,ptxas_info}}
diff --git a/keccak.cu b/keccak.cu
new file mode 100644
index 0000000..8614aa5
--- /dev/null
+++ b/keccak.cu
@@ -0,0 +1,144 @@
+#include "keccak.cuh"
+
+namespace keccak {
+
+// CUDA helper functions
+__device__ uint thread_id()     { return threadIdx.x; }
+__device__ uint warp_num()      { return thread_id() / 32; }
+__device__ uint warp_id()       { return thread_id() % 32; }
+
+__device__ bool warp_leader()   { return thread_id() % 32 == 0; }
+__device__ bool leading_warp()  { return warp_num() == 0; }
+
+#define PAR_FOR(count)				\
+  if (thread_id() < (count))
+
+// unsigned version of shuffle intrinsic
+__device__ uint32_t shuffle(uint32_t val, uint32_t lane) {
+  return __shfl((int) val, lane);
+}
+
+
+// KECCAK constants
+ __constant__ uint32_t roundConsts[22] = {
+   0x00000001,
+   0x00008082,
+   0x0000808a,
+   0x80008000,
+   0x0000808b,
+   0x80000001,
+   0x80008081,
+   0x00008009,
+   0x0000008a,
+   0x00000088,
+   0x80008009,
+   0x8000000a,
+   0x8000808b,
+   0x0000008b,
+   0x00008089,
+   0x00008003,
+   0x00008002,
+   0x00000080,
+   0x0000800a,
+   0x8000000a,
+   0x80008081,
+   0x00008080
+};
+
+// TEST: use smaller type (8b/16b) for performance
+__constant__ uint32_t rotateConsts[25] = {
+//y0  y1  y2  y3  y4
+  0, 36,  3, 41, 18, // x0
+   1, 44, 10, 45,  2,// x1
+  62,  6, 43, 15, 61,// x2
+  27, 20, 39, 14,  8,// x3
+  28, 55, 25, 21, 55 // x4
+};
+
+// TEST: use smaller type (8b/16b) for performance
+__constant__ uint32_t shuffleConsts[25] = {
+   0, 15,  5, 20, 10,
+   6, 21, 11,  1, 16,
+  12,  2, 17,  7, 22,
+  18,  8, 23, 13,  3,
+  24, 14, 4,  19,  9
+};
+
+
+// KECCAK functions
+__device__ uint32_t rot(uint32_t w, uint32_t offset) {
+  return (w << offset) 
+       ^ (w >> 32-offset);
+}
+
+__device__ void round(RegisterState &s, uint8_t round_no) {
+  uint32_t  &a = s.a;
+
+  // \Theta: parity is stored in threads 0..4 of each warp
+  uint32_t p = 0;
+  for (int i=0; i<5; i++)
+    p ^= shuffle(a, 5*warp_id() + i);
+  p =     shuffle(p, (warp_id() - 1) % 5)
+    ^ rot(shuffle(p, (warp_id() + 1) % 5), 1);
+  a ^= shuffle(p, warp_id() / 5);
+
+  // \rho+\pi
+  a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]);
+
+  // \Chi
+  a ^= (~ shuffle(a, (warp_id() + 5 ) % 25))
+        & shuffle(a, (warp_id() + 10) % 25);
+
+  // \iota: round constant
+  if (warp_leader())
+    a ^= roundConsts[round_no];
+}
+
+__device__ void init(RegisterState &s) {
+  s.a = 0;
+}
+
+__device__ void add_l(RegisterState &s, uint32_t data) {
+  // sponge: xor data into registers
+  s.a ^= data;
+
+  // round function: 22 iterations (12 + 2*log(sizeof(word)))
+  #pragma unroll
+  for (int i=0; i<22; i++)
+    round(s, i);
+}
+
+__device__ void add(RegisterState &s, uint32_t data) {
+  // phase 1: use the data value from the own thread
+  PAR_FOR(25)
+    add_l(s, data);
+
+  // phase 2:
+  // - use the data value from the disabled threads with warp_id >= 25
+  // - threads 8-24 add 10*1 padding to full block
+  data = shuffle(data, (warp_id() + 25) % 32);
+  if (warp_id() >= 8)  data = 0;
+  if (warp_id() == 8)  data = 0xA00000;
+  if (warp_id() == 24) data = 0x000001;
+  PAR_FOR(25)
+    add_l(s, data);
+}
+
+__device__ void finish(RegisterState &s, SharedState &ss, Result &dst) {
+  // store the resulting hash from all warps in shared memory (768
+  // bytes of shared memory could be saved by streaming them one word
+  // at a time instead)
+  if (warp_id() < digest_words)
+    ss.digests[warp_num()].digest[warp_id()] = s.a;
+  __syncthreads();
+
+  if (leading_warp()) {
+    init(s);
+    for (int i=0; i<digest_words; i++)
+      add(s, ss.digests[thread_id()].digest[i]);
+  }
+  PAR_FOR(digest_words)
+    dst.digest[thread_id()] = s.a;
+}
+
+} // namespace keccak
diff --git a/keccak.cuh b/keccak.cuh
new file mode 100644
index 0000000..68dbde1
--- /dev/null
+++ b/keccak.cuh
@@ -0,0 +1,43 @@
+#pragma once
+
+/* Compute a keccak-like hash in 1024 threads via values supplied
+   locally in each thread (never leaving register memory). The input
+   stream given by each warp has to be synchronised: the same amount of
+   data has to be hashed at the same time).
+
+   Approach:
+   1. Compute a keccak-hash of an input stream of the local warp until
+      end of data is reached. 25 (of the 32 threads) store one 32-bit
+      word each.
+   2. Compute the Keccak-Hash of all those hashes by streaming them to
+      one warp via shared memory.
+*/
+
+#include <inttypes.h>
+
+namespace keccak {
+
+const uint32_t digest_words = 256 / 32;
+struct Result {
+  uint32_t digest[digest_words];
+};
+
+union State {
+  uint32_t
+    a[5][5], 
+    s[25];
+};
+
+struct SharedState {
+  Result digests[1024 / 32]; // digest from 32 warps
+};
+
+union RegisterState {
+  uint32_t a;
+};
+
+__device__ void init(RegisterState &s);
+__device__ void add(RegisterState &s, uint32_t data);
+__device__ void finish(RegisterState &s, SharedState &ss, Result &dst);
+
+} // namespace keccak
diff --git a/keccak.py b/keccak.py
new file mode 100755
index 0000000..f55e8b7
--- /dev/null
+++ b/keccak.py
@@ -0,0 +1,8 @@
+#!/usr/bin/python
+
+a={}
+for x in range(5):
+    for y in range(5):
+        a[(y*5)+((2*x+3*y)%5)] = x*5+y
+for k in range(25):
+    print a[k]
diff --git a/keccak_bench.cu b/keccak_bench.cu
new file mode 100644
index 0000000..962d69a
--- /dev/null
+++ b/keccak_bench.cu
@@ -0,0 +1,25 @@
+#include "keccak.cu"
+
+const int count = 100000;
+
+__device__ keccak::Result result;
+
+__global__ void bench_kernel() {
+  __shared__ keccak::SharedState s;
+  keccak::RegisterState r;
+  keccak::init(r);
+  for (int i=0; i<count; i++) {
+    int val = i * 1024 + threadIdx.x;
+    keccak::add(r, val);
+  }
+  keccak::finish(r, s, result);
+}
+
+int main(int argc, char **argv) {
+  dim3
+    dimGrid(1),
+    dimBlock(1024);
+  long count = (argc == 2) ? atol(argv[1]) : 1;
+  bench_kernel<<<dimGrid, dimBlock, 0>>>();
+  return cudaPeekAtLastError() != cudaSuccess;
+}
diff --git a/keccak_bench.cuh b/keccak_bench.cuh
new file mode 100644
index 0000000..e69de29
-- 
cgit v0.10.1