From ac11c3b784234d93cc10ca931a178bf50ee26faf Mon Sep 17 00:00:00 2001 From: Jan Huwald Date: Wed, 26 Jun 2013 15:23:07 +0200 Subject: Initial commit Throughput: 330 MB/s on one multiprocessor of a GTX680 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8e18520 --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +SDK := /usr/local/cuda-5.0 + +NVCC := $(SDK)/bin/nvcc +NVLINK := g++ -fPIC +NVCCFLAGS := -O2 --compiler-options -fno-strict-aliasing \ + -gencode=arch=compute_30,code=\"sm_30,compute_30\" \ + --ptxas-options="-v" \ + -I. + +NVLIB := -L$(SDK)/lib64 -lcudart $(LIB) + +ENV := PATH=/home/mit/csb/biosys/CUDA/gcc-4.6:$$PATH:$(SDK)/bin \ + LD_LIBRARY_PATH=$$LD_LIBRARY_PATH:$(SDK)/lib64:$(SDK)/lib +SHELL := /bin/bash + +keccak_bench: keccak.cu keccak_bench.cu keccak.cuh keccak_bench.cuh + $(ENV) $(NVCC) $(NVCCFLAGS) $(NVLIB) -o $@ keccak_bench.cu + +clean: + rm keccak_bench *{~,.{o,ptx,ptxas_info}} diff --git a/keccak.cu b/keccak.cu new file mode 100644 index 0000000..8614aa5 --- /dev/null +++ b/keccak.cu @@ -0,0 +1,144 @@ +#include "keccak.cuh" + +namespace keccak { + +// CUDA helper functions +__device__ uint thread_id() { return threadIdx.x; } +__device__ uint warp_num() { return thread_id() / 32; } +__device__ uint warp_id() { return thread_id() % 32; } + +__device__ bool warp_leader() { return thread_id() % 32 == 0; } +__device__ bool leading_warp() { return warp_num() == 0; } + +#define PAR_FOR(count) \ + if (thread_id() < (count)) + +// unsigned version of shuffle intrinsic +__device__ uint32_t shuffle(uint32_t val, uint32_t lane) { + return __shfl((int) val, lane); +} + + +// KECCAK constants + __constant__ uint32_t roundConsts[22] = { + 0x00000001, + 0x00008082, + 0x0000808a, + 0x80008000, + 0x0000808b, + 0x80000001, + 0x80008081, + 0x00008009, + 0x0000008a, + 0x00000088, + 0x80008009, + 0x8000000a, + 0x8000808b, + 0x0000008b, + 0x00008089, + 0x00008003, + 0x00008002, + 0x00000080, + 0x0000800a, + 0x8000000a, + 0x80008081, + 0x00008080 +}; + +// TEST: use smaller type (8b/16b) for performance +__constant__ uint32_t rotateConsts[25] = { +//y0 y1 y2 y3 y4 + 0, 36, 3, 41, 18, // x0 + 1, 44, 10, 45, 2,// x1 + 62, 6, 43, 15, 61,// x2 + 27, 20, 39, 14, 8,// x3 + 28, 55, 25, 21, 55 // x4 +}; + +// TEST: use smaller type (8b/16b) for performance +__constant__ uint32_t shuffleConsts[25] = { + 0, 15, 5, 20, 10, + 6, 21, 11, 1, 16, + 12, 2, 17, 7, 22, + 18, 8, 23, 13, 3, + 24, 14, 4, 19, 9 +}; + + +// KECCAK functions +__device__ uint32_t rot(uint32_t w, uint32_t offset) { + return (w << offset) + ^ (w >> 32-offset); +} + +__device__ void round(RegisterState &s, uint8_t round_no) { + uint32_t &a = s.a; + + // \Theta: parity is stored in threads 0..4 of each warp + uint32_t p = 0; + for (int i=0; i<5; i++) + p ^= shuffle(a, 5*warp_id() + i); + p = shuffle(p, (warp_id() - 1) % 5) + ^ rot(shuffle(p, (warp_id() + 1) % 5), 1); + a ^= shuffle(p, warp_id() / 5); + + // \rho+\pi + a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]); + + // \Chi + a ^= (~ shuffle(a, (warp_id() + 5 ) % 25)) + & shuffle(a, (warp_id() + 10) % 25); + + // \iota: round constant + if (warp_leader()) + a ^= roundConsts[round_no]; +} + +__device__ void init(RegisterState &s) { + s.a = 0; +} + +__device__ void add_l(RegisterState &s, uint32_t data) { + // sponge: xor data into registers + s.a ^= data; + + // round function: 22 iterations (12 + 2*log(sizeof(word))) + #pragma unroll + for (int i=0; i<22; i++) + round(s, i); +} + +__device__ void add(RegisterState &s, uint32_t data) { + // phase 1: use the data value from the own thread + PAR_FOR(25) + add_l(s, data); + + // phase 2: + // - use the data value from the disabled threads with warp_id >= 25 + // - threads 8-24 add 10*1 padding to full block + data = shuffle(data, (warp_id() + 25) % 32); + if (warp_id() >= 8) data = 0; + if (warp_id() == 8) data = 0xA00000; + if (warp_id() == 24) data = 0x000001; + PAR_FOR(25) + add_l(s, data); +} + +__device__ void finish(RegisterState &s, SharedState &ss, Result &dst) { + // store the resulting hash from all warps in shared memory (768 + // bytes of shared memory could be saved by streaming them one word + // at a time instead) + if (warp_id() < digest_words) + ss.digests[warp_num()].digest[warp_id()] = s.a; + __syncthreads(); + + if (leading_warp()) { + init(s); + for (int i=0; i + +namespace keccak { + +const uint32_t digest_words = 256 / 32; +struct Result { + uint32_t digest[digest_words]; +}; + +union State { + uint32_t + a[5][5], + s[25]; +}; + +struct SharedState { + Result digests[1024 / 32]; // digest from 32 warps +}; + +union RegisterState { + uint32_t a; +}; + +__device__ void init(RegisterState &s); +__device__ void add(RegisterState &s, uint32_t data); +__device__ void finish(RegisterState &s, SharedState &ss, Result &dst); + +} // namespace keccak diff --git a/keccak.py b/keccak.py new file mode 100755 index 0000000..f55e8b7 --- /dev/null +++ b/keccak.py @@ -0,0 +1,8 @@ +#!/usr/bin/python + +a={} +for x in range(5): + for y in range(5): + a[(y*5)+((2*x+3*y)%5)] = x*5+y +for k in range(25): + print a[k] diff --git a/keccak_bench.cu b/keccak_bench.cu new file mode 100644 index 0000000..962d69a --- /dev/null +++ b/keccak_bench.cu @@ -0,0 +1,25 @@ +#include "keccak.cu" + +const int count = 100000; + +__device__ keccak::Result result; + +__global__ void bench_kernel() { + __shared__ keccak::SharedState s; + keccak::RegisterState r; + keccak::init(r); + for (int i=0; i>>(); + return cudaPeekAtLastError() != cudaSuccess; +} diff --git a/keccak_bench.cuh b/keccak_bench.cuh new file mode 100644 index 0000000..e69de29 -- cgit v0.10.1