#include "keccak.cuh" namespace keccak { // CUDA helper functions __device__ uint thread_id() { return threadIdx.x; } __device__ uint warp_num() { return thread_id() / 32; } __device__ uint warp_id() { return thread_id() % 32; } __device__ bool warp_leader() { return thread_id() % 32 == 0; } __device__ bool leading_warp() { return warp_num() == 0; } #define PAR_FOR(count) \ if (thread_id() < (count)) // unsigned version of shuffle intrinsic __device__ uint32_t shuffle(uint32_t val, uint32_t lane) { return __shfl((int) val, lane); } // KECCAK constants __constant__ uint32_t roundConsts[22] = { 0x00000001, 0x00008082, 0x0000808a, 0x80008000, 0x0000808b, 0x80000001, 0x80008081, 0x00008009, 0x0000008a, 0x00000088, 0x80008009, 0x8000000a, 0x8000808b, 0x0000008b, 0x00008089, 0x00008003, 0x00008002, 0x00000080, 0x0000800a, 0x8000000a, 0x80008081, 0x00008080 }; // TEST: use smaller type (8b/16b) for performance __constant__ uint32_t rotateConsts[25] = { //y0 y1 y2 y3 y4 0, 36, 3, 41, 18, // x0 1, 44, 10, 45, 2,// x1 62, 6, 43, 15, 61,// x2 27, 20, 39, 14, 8,// x3 28, 55, 25, 21, 55 // x4 }; // TEST: use smaller type (8b/16b) for performance __constant__ uint32_t shuffleConsts[25] = { 0, 15, 5, 20, 10, 6, 21, 11, 1, 16, 12, 2, 17, 7, 22, 18, 8, 23, 13, 3, 24, 14, 4, 19, 9 }; // KECCAK functions __device__ uint32_t rot(uint32_t w, uint32_t offset) { return (w << offset) ^ (w >> 32-offset); } __device__ void round(RegisterState &s, uint8_t round_no) { uint32_t &a = s.a; // \Theta: parity is stored in threads 0..4 of each warp uint32_t p = 0; for (int i=0; i<5; i++) p ^= shuffle(a, 5*warp_id() + i); p = shuffle(p, (warp_id() - 1) % 5) ^ rot(shuffle(p, (warp_id() + 1) % 5), 1); a ^= shuffle(p, warp_id() / 5); // \rho+\pi a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]); // \Chi a ^= (~ shuffle(a, (warp_id() + 5 ) % 25)) & shuffle(a, (warp_id() + 10) % 25); // \iota: round constant if (warp_leader()) a ^= roundConsts[round_no]; } __device__ void init(RegisterState &s) { s.a = 0; } __device__ void add_l(RegisterState &s, uint32_t data) { // sponge: xor data into registers s.a ^= data; // round function: 22 iterations (12 + 2*log(sizeof(word))) #pragma unroll for (int i=0; i<22; i++) round(s, i); } __device__ void add(RegisterState &s, uint32_t data) { // phase 1: use the data value from the own thread PAR_FOR(25) add_l(s, data); // phase 2: // - use the data value from the disabled threads with warp_id >= 25 // - threads 8-24 add 10*1 padding to full block data = shuffle(data, (warp_id() + 25) % 32); if (warp_id() >= 8) data = 0; if (warp_id() == 8) data = 0xA00000; if (warp_id() == 24) data = 0x000001; PAR_FOR(25) add_l(s, data); } __device__ void finish(RegisterState &s, SharedState &ss, Result &dst) { // store the resulting hash from all warps in shared memory (768 // bytes of shared memory could be saved by streaming them one word // at a time instead) if (warp_id() < digest_words) ss.digests[warp_num()].digest[warp_id()] = s.a; __syncthreads(); if (leading_warp()) { init(s); for (int i=0; i