summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJan Huwald <jh@sotun.de>2013-06-26 13:23:07 (GMT)
committerJan Huwald <jh@sotun.de>2013-06-26 13:23:07 (GMT)
commitac11c3b784234d93cc10ca931a178bf50ee26faf (patch)
tree4b8bc9fc15a92a8828ea3abeb79d711a75eb4070
Initial commitHEADmaster
Throughput: 330 MB/s on one multiprocessor of a GTX680
-rw-r--r--Makefile20
-rw-r--r--keccak.cu144
-rw-r--r--keccak.cuh43
-rwxr-xr-xkeccak.py8
-rw-r--r--keccak_bench.cu25
-rw-r--r--keccak_bench.cuh0
6 files changed, 240 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8e18520
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+SDK := /usr/local/cuda-5.0
+
+NVCC := $(SDK)/bin/nvcc
+NVLINK := g++ -fPIC
+NVCCFLAGS := -O2 --compiler-options -fno-strict-aliasing \
+ -gencode=arch=compute_30,code=\"sm_30,compute_30\" \
+ --ptxas-options="-v" \
+ -I.
+
+NVLIB := -L$(SDK)/lib64 -lcudart $(LIB)
+
+ENV := PATH=/home/mit/csb/biosys/CUDA/gcc-4.6:$$PATH:$(SDK)/bin \
+ LD_LIBRARY_PATH=$$LD_LIBRARY_PATH:$(SDK)/lib64:$(SDK)/lib
+SHELL := /bin/bash
+
+keccak_bench: keccak.cu keccak_bench.cu keccak.cuh keccak_bench.cuh
+ $(ENV) $(NVCC) $(NVCCFLAGS) $(NVLIB) -o $@ keccak_bench.cu
+
+clean:
+ rm keccak_bench *{~,.{o,ptx,ptxas_info}}
diff --git a/keccak.cu b/keccak.cu
new file mode 100644
index 0000000..8614aa5
--- /dev/null
+++ b/keccak.cu
@@ -0,0 +1,144 @@
+#include "keccak.cuh"
+
+namespace keccak {
+
+// CUDA helper functions
+__device__ uint thread_id() { return threadIdx.x; }
+__device__ uint warp_num() { return thread_id() / 32; }
+__device__ uint warp_id() { return thread_id() % 32; }
+
+__device__ bool warp_leader() { return thread_id() % 32 == 0; }
+__device__ bool leading_warp() { return warp_num() == 0; }
+
+#define PAR_FOR(count) \
+ if (thread_id() < (count))
+
+// unsigned version of shuffle intrinsic
+__device__ uint32_t shuffle(uint32_t val, uint32_t lane) {
+ return __shfl((int) val, lane);
+}
+
+
+// KECCAK constants
+ __constant__ uint32_t roundConsts[22] = {
+ 0x00000001,
+ 0x00008082,
+ 0x0000808a,
+ 0x80008000,
+ 0x0000808b,
+ 0x80000001,
+ 0x80008081,
+ 0x00008009,
+ 0x0000008a,
+ 0x00000088,
+ 0x80008009,
+ 0x8000000a,
+ 0x8000808b,
+ 0x0000008b,
+ 0x00008089,
+ 0x00008003,
+ 0x00008002,
+ 0x00000080,
+ 0x0000800a,
+ 0x8000000a,
+ 0x80008081,
+ 0x00008080
+};
+
+// TEST: use smaller type (8b/16b) for performance
+__constant__ uint32_t rotateConsts[25] = {
+//y0 y1 y2 y3 y4
+ 0, 36, 3, 41, 18, // x0
+ 1, 44, 10, 45, 2,// x1
+ 62, 6, 43, 15, 61,// x2
+ 27, 20, 39, 14, 8,// x3
+ 28, 55, 25, 21, 55 // x4
+};
+
+// TEST: use smaller type (8b/16b) for performance
+__constant__ uint32_t shuffleConsts[25] = {
+ 0, 15, 5, 20, 10,
+ 6, 21, 11, 1, 16,
+ 12, 2, 17, 7, 22,
+ 18, 8, 23, 13, 3,
+ 24, 14, 4, 19, 9
+};
+
+
+// KECCAK functions
+__device__ uint32_t rot(uint32_t w, uint32_t offset) {
+ return (w << offset)
+ ^ (w >> 32-offset);
+}
+
+__device__ void round(RegisterState &s, uint8_t round_no) {
+ uint32_t &a = s.a;
+
+ // \Theta: parity is stored in threads 0..4 of each warp
+ uint32_t p = 0;
+ for (int i=0; i<5; i++)
+ p ^= shuffle(a, 5*warp_id() + i);
+ p = shuffle(p, (warp_id() - 1) % 5)
+ ^ rot(shuffle(p, (warp_id() + 1) % 5), 1);
+ a ^= shuffle(p, warp_id() / 5);
+
+ // \rho+\pi
+ a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]);
+
+ // \Chi
+ a ^= (~ shuffle(a, (warp_id() + 5 ) % 25))
+ & shuffle(a, (warp_id() + 10) % 25);
+
+ // \iota: round constant
+ if (warp_leader())
+ a ^= roundConsts[round_no];
+}
+
+__device__ void init(RegisterState &s) {
+ s.a = 0;
+}
+
+__device__ void add_l(RegisterState &s, uint32_t data) {
+ // sponge: xor data into registers
+ s.a ^= data;
+
+ // round function: 22 iterations (12 + 2*log(sizeof(word)))
+ #pragma unroll
+ for (int i=0; i<22; i++)
+ round(s, i);
+}
+
+__device__ void add(RegisterState &s, uint32_t data) {
+ // phase 1: use the data value from the own thread
+ PAR_FOR(25)
+ add_l(s, data);
+
+ // phase 2:
+ // - use the data value from the disabled threads with warp_id >= 25
+ // - threads 8-24 add 10*1 padding to full block
+ data = shuffle(data, (warp_id() + 25) % 32);
+ if (warp_id() >= 8) data = 0;
+ if (warp_id() == 8) data = 0xA00000;
+ if (warp_id() == 24) data = 0x000001;
+ PAR_FOR(25)
+ add_l(s, data);
+}
+
+__device__ void finish(RegisterState &s, SharedState &ss, Result &dst) {
+ // store the resulting hash from all warps in shared memory (768
+ // bytes of shared memory could be saved by streaming them one word
+ // at a time instead)
+ if (warp_id() < digest_words)
+ ss.digests[warp_num()].digest[warp_id()] = s.a;
+ __syncthreads();
+
+ if (leading_warp()) {
+ init(s);
+ for (int i=0; i<digest_words; i++)
+ add(s, ss.digests[thread_id()].digest[i]);
+ }
+ PAR_FOR(digest_words)
+ dst.digest[thread_id()] = s.a;
+}
+
+} // namespace keccak
diff --git a/keccak.cuh b/keccak.cuh
new file mode 100644
index 0000000..68dbde1
--- /dev/null
+++ b/keccak.cuh
@@ -0,0 +1,43 @@
+#pragma once
+
+/* Compute a keccak-like hash in 1024 threads via values supplied
+ locally in each thread (never leaving register memory). The input
+ stream given by each warp has to be synchronised: the same amount of
+ data has to be hashed at the same time).
+
+ Approach:
+ 1. Compute a keccak-hash of an input stream of the local warp until
+ end of data is reached. 25 (of the 32 threads) store one 32-bit
+ word each.
+ 2. Compute the Keccak-Hash of all those hashes by streaming them to
+ one warp via shared memory.
+*/
+
+#include <inttypes.h>
+
+namespace keccak {
+
+const uint32_t digest_words = 256 / 32;
+struct Result {
+ uint32_t digest[digest_words];
+};
+
+union State {
+ uint32_t
+ a[5][5],
+ s[25];
+};
+
+struct SharedState {
+ Result digests[1024 / 32]; // digest from 32 warps
+};
+
+union RegisterState {
+ uint32_t a;
+};
+
+__device__ void init(RegisterState &s);
+__device__ void add(RegisterState &s, uint32_t data);
+__device__ void finish(RegisterState &s, SharedState &ss, Result &dst);
+
+} // namespace keccak
diff --git a/keccak.py b/keccak.py
new file mode 100755
index 0000000..f55e8b7
--- /dev/null
+++ b/keccak.py
@@ -0,0 +1,8 @@
+#!/usr/bin/python
+
+a={}
+for x in range(5):
+ for y in range(5):
+ a[(y*5)+((2*x+3*y)%5)] = x*5+y
+for k in range(25):
+ print a[k]
diff --git a/keccak_bench.cu b/keccak_bench.cu
new file mode 100644
index 0000000..962d69a
--- /dev/null
+++ b/keccak_bench.cu
@@ -0,0 +1,25 @@
+#include "keccak.cu"
+
+const int count = 100000;
+
+__device__ keccak::Result result;
+
+__global__ void bench_kernel() {
+ __shared__ keccak::SharedState s;
+ keccak::RegisterState r;
+ keccak::init(r);
+ for (int i=0; i<count; i++) {
+ int val = i * 1024 + threadIdx.x;
+ keccak::add(r, val);
+ }
+ keccak::finish(r, s, result);
+}
+
+int main(int argc, char **argv) {
+ dim3
+ dimGrid(1),
+ dimBlock(1024);
+ long count = (argc == 2) ? atol(argv[1]) : 1;
+ bench_kernel<<<dimGrid, dimBlock, 0>>>();
+ return cudaPeekAtLastError() != cudaSuccess;
+}
diff --git a/keccak_bench.cuh b/keccak_bench.cuh
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/keccak_bench.cuh
contact: Jan Huwald // Impressum