summaryrefslogtreecommitdiff
path: root/keccak.cu
blob: 8614aa5a70ac411e6e97b3812419ae1b8f9bd4eb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#include "keccak.cuh"

namespace keccak {

// CUDA helper functions
__device__ uint thread_id()     { return threadIdx.x; }
__device__ uint warp_num()      { return thread_id() / 32; }
__device__ uint warp_id()       { return thread_id() % 32; }

__device__ bool warp_leader()   { return thread_id() % 32 == 0; }
__device__ bool leading_warp()  { return warp_num() == 0; }

#define PAR_FOR(count)				\
  if (thread_id() < (count))

// unsigned version of shuffle intrinsic
__device__ uint32_t shuffle(uint32_t val, uint32_t lane) {
  return __shfl((int) val, lane);
}


// KECCAK constants
 __constant__ uint32_t roundConsts[22] = {
   0x00000001,
   0x00008082,
   0x0000808a,
   0x80008000,
   0x0000808b,
   0x80000001,
   0x80008081,
   0x00008009,
   0x0000008a,
   0x00000088,
   0x80008009,
   0x8000000a,
   0x8000808b,
   0x0000008b,
   0x00008089,
   0x00008003,
   0x00008002,
   0x00000080,
   0x0000800a,
   0x8000000a,
   0x80008081,
   0x00008080
};

// TEST: use smaller type (8b/16b) for performance
__constant__ uint32_t rotateConsts[25] = {
//y0  y1  y2  y3  y4
  0, 36,  3, 41, 18, // x0
   1, 44, 10, 45,  2,// x1
  62,  6, 43, 15, 61,// x2
  27, 20, 39, 14,  8,// x3
  28, 55, 25, 21, 55 // x4
};

// TEST: use smaller type (8b/16b) for performance
__constant__ uint32_t shuffleConsts[25] = {
   0, 15,  5, 20, 10,
   6, 21, 11,  1, 16,
  12,  2, 17,  7, 22,
  18,  8, 23, 13,  3,
  24, 14, 4,  19,  9
};


// KECCAK functions
__device__ uint32_t rot(uint32_t w, uint32_t offset) {
  return (w << offset) 
       ^ (w >> 32-offset);
}

__device__ void round(RegisterState &s, uint8_t round_no) {
  uint32_t  &a = s.a;

  // \Theta: parity is stored in threads 0..4 of each warp
  uint32_t p = 0;
  for (int i=0; i<5; i++)
    p ^= shuffle(a, 5*warp_id() + i);
  p =     shuffle(p, (warp_id() - 1) % 5)
    ^ rot(shuffle(p, (warp_id() + 1) % 5), 1);
  a ^= shuffle(p, warp_id() / 5);

  // \rho+\pi
  a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]);

  // \Chi
  a ^= (~ shuffle(a, (warp_id() + 5 ) % 25))
        & shuffle(a, (warp_id() + 10) % 25);

  // \iota: round constant
  if (warp_leader())
    a ^= roundConsts[round_no];
}

__device__ void init(RegisterState &s) {
  s.a = 0;
}

__device__ void add_l(RegisterState &s, uint32_t data) {
  // sponge: xor data into registers
  s.a ^= data;

  // round function: 22 iterations (12 + 2*log(sizeof(word)))
  #pragma unroll
  for (int i=0; i<22; i++)
    round(s, i);
}

__device__ void add(RegisterState &s, uint32_t data) {
  // phase 1: use the data value from the own thread
  PAR_FOR(25)
    add_l(s, data);

  // phase 2:
  // - use the data value from the disabled threads with warp_id >= 25
  // - threads 8-24 add 10*1 padding to full block
  data = shuffle(data, (warp_id() + 25) % 32);
  if (warp_id() >= 8)  data = 0;
  if (warp_id() == 8)  data = 0xA00000;
  if (warp_id() == 24) data = 0x000001;
  PAR_FOR(25)
    add_l(s, data);
}

__device__ void finish(RegisterState &s, SharedState &ss, Result &dst) {
  // store the resulting hash from all warps in shared memory (768
  // bytes of shared memory could be saved by streaming them one word
  // at a time instead)
  if (warp_id() < digest_words)
    ss.digests[warp_num()].digest[warp_id()] = s.a;
  __syncthreads();

  if (leading_warp()) {
    init(s);
    for (int i=0; i<digest_words; i++)
      add(s, ss.digests[thread_id()].digest[i]);
  }
  PAR_FOR(digest_words)
    dst.digest[thread_id()] = s.a;
}

} // namespace keccak
contact: Jan Huwald // Impressum