diff options
Diffstat (limited to 'keccak.cu')
-rw-r--r-- | keccak.cu | 144 |
1 files changed, 144 insertions, 0 deletions
diff --git a/keccak.cu b/keccak.cu new file mode 100644 index 0000000..8614aa5 --- /dev/null +++ b/keccak.cu @@ -0,0 +1,144 @@ +#include "keccak.cuh" + +namespace keccak { + +// CUDA helper functions +__device__ uint thread_id() { return threadIdx.x; } +__device__ uint warp_num() { return thread_id() / 32; } +__device__ uint warp_id() { return thread_id() % 32; } + +__device__ bool warp_leader() { return thread_id() % 32 == 0; } +__device__ bool leading_warp() { return warp_num() == 0; } + +#define PAR_FOR(count) \ + if (thread_id() < (count)) + +// unsigned version of shuffle intrinsic +__device__ uint32_t shuffle(uint32_t val, uint32_t lane) { + return __shfl((int) val, lane); +} + + +// KECCAK constants + __constant__ uint32_t roundConsts[22] = { + 0x00000001, + 0x00008082, + 0x0000808a, + 0x80008000, + 0x0000808b, + 0x80000001, + 0x80008081, + 0x00008009, + 0x0000008a, + 0x00000088, + 0x80008009, + 0x8000000a, + 0x8000808b, + 0x0000008b, + 0x00008089, + 0x00008003, + 0x00008002, + 0x00000080, + 0x0000800a, + 0x8000000a, + 0x80008081, + 0x00008080 +}; + +// TEST: use smaller type (8b/16b) for performance +__constant__ uint32_t rotateConsts[25] = { +//y0 y1 y2 y3 y4 + 0, 36, 3, 41, 18, // x0 + 1, 44, 10, 45, 2,// x1 + 62, 6, 43, 15, 61,// x2 + 27, 20, 39, 14, 8,// x3 + 28, 55, 25, 21, 55 // x4 +}; + +// TEST: use smaller type (8b/16b) for performance +__constant__ uint32_t shuffleConsts[25] = { + 0, 15, 5, 20, 10, + 6, 21, 11, 1, 16, + 12, 2, 17, 7, 22, + 18, 8, 23, 13, 3, + 24, 14, 4, 19, 9 +}; + + +// KECCAK functions +__device__ uint32_t rot(uint32_t w, uint32_t offset) { + return (w << offset) + ^ (w >> 32-offset); +} + +__device__ void round(RegisterState &s, uint8_t round_no) { + uint32_t &a = s.a; + + // \Theta: parity is stored in threads 0..4 of each warp + uint32_t p = 0; + for (int i=0; i<5; i++) + p ^= shuffle(a, 5*warp_id() + i); + p = shuffle(p, (warp_id() - 1) % 5) + ^ rot(shuffle(p, (warp_id() + 1) % 5), 1); + a ^= shuffle(p, warp_id() / 5); + + // \rho+\pi + a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]); + + // \Chi + a ^= (~ shuffle(a, (warp_id() + 5 ) % 25)) + & shuffle(a, (warp_id() + 10) % 25); + + // \iota: round constant + if (warp_leader()) + a ^= roundConsts[round_no]; +} + +__device__ void init(RegisterState &s) { + s.a = 0; +} + +__device__ void add_l(RegisterState &s, uint32_t data) { + // sponge: xor data into registers + s.a ^= data; + + // round function: 22 iterations (12 + 2*log(sizeof(word))) + #pragma unroll + for (int i=0; i<22; i++) + round(s, i); +} + +__device__ void add(RegisterState &s, uint32_t data) { + // phase 1: use the data value from the own thread + PAR_FOR(25) + add_l(s, data); + + // phase 2: + // - use the data value from the disabled threads with warp_id >= 25 + // - threads 8-24 add 10*1 padding to full block + data = shuffle(data, (warp_id() + 25) % 32); + if (warp_id() >= 8) data = 0; + if (warp_id() == 8) data = 0xA00000; + if (warp_id() == 24) data = 0x000001; + PAR_FOR(25) + add_l(s, data); +} + +__device__ void finish(RegisterState &s, SharedState &ss, Result &dst) { + // store the resulting hash from all warps in shared memory (768 + // bytes of shared memory could be saved by streaming them one word + // at a time instead) + if (warp_id() < digest_words) + ss.digests[warp_num()].digest[warp_id()] = s.a; + __syncthreads(); + + if (leading_warp()) { + init(s); + for (int i=0; i<digest_words; i++) + add(s, ss.digests[thread_id()].digest[i]); + } + PAR_FOR(digest_words) + dst.digest[thread_id()] = s.a; +} + +} // namespace keccak |