summaryrefslogtreecommitdiff
path: root/keccak.cu
diff options
context:
space:
mode:
Diffstat (limited to 'keccak.cu')
-rw-r--r--keccak.cu144
1 files changed, 144 insertions, 0 deletions
diff --git a/keccak.cu b/keccak.cu
new file mode 100644
index 0000000..8614aa5
--- /dev/null
+++ b/keccak.cu
@@ -0,0 +1,144 @@
+#include "keccak.cuh"
+
+namespace keccak {
+
+// CUDA helper functions
+__device__ uint thread_id() { return threadIdx.x; }
+__device__ uint warp_num() { return thread_id() / 32; }
+__device__ uint warp_id() { return thread_id() % 32; }
+
+__device__ bool warp_leader() { return thread_id() % 32 == 0; }
+__device__ bool leading_warp() { return warp_num() == 0; }
+
+#define PAR_FOR(count) \
+ if (thread_id() < (count))
+
+// unsigned version of shuffle intrinsic
+__device__ uint32_t shuffle(uint32_t val, uint32_t lane) {
+ return __shfl((int) val, lane);
+}
+
+
+// KECCAK constants
+ __constant__ uint32_t roundConsts[22] = {
+ 0x00000001,
+ 0x00008082,
+ 0x0000808a,
+ 0x80008000,
+ 0x0000808b,
+ 0x80000001,
+ 0x80008081,
+ 0x00008009,
+ 0x0000008a,
+ 0x00000088,
+ 0x80008009,
+ 0x8000000a,
+ 0x8000808b,
+ 0x0000008b,
+ 0x00008089,
+ 0x00008003,
+ 0x00008002,
+ 0x00000080,
+ 0x0000800a,
+ 0x8000000a,
+ 0x80008081,
+ 0x00008080
+};
+
+// TEST: use smaller type (8b/16b) for performance
+__constant__ uint32_t rotateConsts[25] = {
+//y0 y1 y2 y3 y4
+ 0, 36, 3, 41, 18, // x0
+ 1, 44, 10, 45, 2,// x1
+ 62, 6, 43, 15, 61,// x2
+ 27, 20, 39, 14, 8,// x3
+ 28, 55, 25, 21, 55 // x4
+};
+
+// TEST: use smaller type (8b/16b) for performance
+__constant__ uint32_t shuffleConsts[25] = {
+ 0, 15, 5, 20, 10,
+ 6, 21, 11, 1, 16,
+ 12, 2, 17, 7, 22,
+ 18, 8, 23, 13, 3,
+ 24, 14, 4, 19, 9
+};
+
+
+// KECCAK functions
+__device__ uint32_t rot(uint32_t w, uint32_t offset) {
+ return (w << offset)
+ ^ (w >> 32-offset);
+}
+
+__device__ void round(RegisterState &s, uint8_t round_no) {
+ uint32_t &a = s.a;
+
+ // \Theta: parity is stored in threads 0..4 of each warp
+ uint32_t p = 0;
+ for (int i=0; i<5; i++)
+ p ^= shuffle(a, 5*warp_id() + i);
+ p = shuffle(p, (warp_id() - 1) % 5)
+ ^ rot(shuffle(p, (warp_id() + 1) % 5), 1);
+ a ^= shuffle(p, warp_id() / 5);
+
+ // \rho+\pi
+ a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]);
+
+ // \Chi
+ a ^= (~ shuffle(a, (warp_id() + 5 ) % 25))
+ & shuffle(a, (warp_id() + 10) % 25);
+
+ // \iota: round constant
+ if (warp_leader())
+ a ^= roundConsts[round_no];
+}
+
+__device__ void init(RegisterState &s) {
+ s.a = 0;
+}
+
+__device__ void add_l(RegisterState &s, uint32_t data) {
+ // sponge: xor data into registers
+ s.a ^= data;
+
+ // round function: 22 iterations (12 + 2*log(sizeof(word)))
+ #pragma unroll
+ for (int i=0; i<22; i++)
+ round(s, i);
+}
+
+__device__ void add(RegisterState &s, uint32_t data) {
+ // phase 1: use the data value from the own thread
+ PAR_FOR(25)
+ add_l(s, data);
+
+ // phase 2:
+ // - use the data value from the disabled threads with warp_id >= 25
+ // - threads 8-24 add 10*1 padding to full block
+ data = shuffle(data, (warp_id() + 25) % 32);
+ if (warp_id() >= 8) data = 0;
+ if (warp_id() == 8) data = 0xA00000;
+ if (warp_id() == 24) data = 0x000001;
+ PAR_FOR(25)
+ add_l(s, data);
+}
+
+__device__ void finish(RegisterState &s, SharedState &ss, Result &dst) {
+ // store the resulting hash from all warps in shared memory (768
+ // bytes of shared memory could be saved by streaming them one word
+ // at a time instead)
+ if (warp_id() < digest_words)
+ ss.digests[warp_num()].digest[warp_id()] = s.a;
+ __syncthreads();
+
+ if (leading_warp()) {
+ init(s);
+ for (int i=0; i<digest_words; i++)
+ add(s, ss.digests[thread_id()].digest[i]);
+ }
+ PAR_FOR(digest_words)
+ dst.digest[thread_id()] = s.a;
+}
+
+} // namespace keccak
contact: Jan Huwald // Impressum