#include "keccak.cuh"

namespace keccak {

// CUDA helper functions
__device__ uint thread_id()     { return threadIdx.x; }
__device__ uint warp_num()      { return thread_id() / 32; }
__device__ uint warp_id()       { return thread_id() % 32; }

__device__ bool warp_leader()   { return thread_id() % 32 == 0; }
__device__ bool leading_warp()  { return warp_num() == 0; }

#define PAR_FOR(count)				\
  if (thread_id() < (count))

// unsigned version of shuffle intrinsic
__device__ uint32_t shuffle(uint32_t val, uint32_t lane) {
  return __shfl((int) val, lane);
}


// KECCAK constants
 __constant__ uint32_t roundConsts[22] = {
   0x00000001,
   0x00008082,
   0x0000808a,
   0x80008000,
   0x0000808b,
   0x80000001,
   0x80008081,
   0x00008009,
   0x0000008a,
   0x00000088,
   0x80008009,
   0x8000000a,
   0x8000808b,
   0x0000008b,
   0x00008089,
   0x00008003,
   0x00008002,
   0x00000080,
   0x0000800a,
   0x8000000a,
   0x80008081,
   0x00008080
};

// TEST: use smaller type (8b/16b) for performance
__constant__ uint32_t rotateConsts[25] = {
//y0  y1  y2  y3  y4
  0, 36,  3, 41, 18, // x0
   1, 44, 10, 45,  2,// x1
  62,  6, 43, 15, 61,// x2
  27, 20, 39, 14,  8,// x3
  28, 55, 25, 21, 55 // x4
};

// TEST: use smaller type (8b/16b) for performance
__constant__ uint32_t shuffleConsts[25] = {
   0, 15,  5, 20, 10,
   6, 21, 11,  1, 16,
  12,  2, 17,  7, 22,
  18,  8, 23, 13,  3,
  24, 14, 4,  19,  9
};


// KECCAK functions
__device__ uint32_t rot(uint32_t w, uint32_t offset) {
  return (w << offset) 
       ^ (w >> 32-offset);
}

__device__ void round(RegisterState &s, uint8_t round_no) {
  uint32_t  &a = s.a;

  // \Theta: parity is stored in threads 0..4 of each warp
  uint32_t p = 0;
  for (int i=0; i<5; i++)
    p ^= shuffle(a, 5*warp_id() + i);
  p =     shuffle(p, (warp_id() - 1) % 5)
    ^ rot(shuffle(p, (warp_id() + 1) % 5), 1);
  a ^= shuffle(p, warp_id() / 5);

  // \rho+\pi
  a = rot(shuffle(a, shuffleConsts[warp_id()]), rotateConsts[warp_id()]);

  // \Chi
  a ^= (~ shuffle(a, (warp_id() + 5 ) % 25))
        & shuffle(a, (warp_id() + 10) % 25);

  // \iota: round constant
  if (warp_leader())
    a ^= roundConsts[round_no];
}

__device__ void init(RegisterState &s) {
  s.a = 0;
}

__device__ void add_l(RegisterState &s, uint32_t data) {
  // sponge: xor data into registers
  s.a ^= data;

  // round function: 22 iterations (12 + 2*log(sizeof(word)))
  #pragma unroll
  for (int i=0; i<22; i++)
    round(s, i);
}

__device__ void add(RegisterState &s, uint32_t data) {
  // phase 1: use the data value from the own thread
  PAR_FOR(25)
    add_l(s, data);

  // phase 2:
  // - use the data value from the disabled threads with warp_id >= 25
  // - threads 8-24 add 10*1 padding to full block
  data = shuffle(data, (warp_id() + 25) % 32);
  if (warp_id() >= 8)  data = 0;
  if (warp_id() == 8)  data = 0xA00000;
  if (warp_id() == 24) data = 0x000001;
  PAR_FOR(25)
    add_l(s, data);
}

__device__ void finish(RegisterState &s, SharedState &ss, Result &dst) {
  // store the resulting hash from all warps in shared memory (768
  // bytes of shared memory could be saved by streaming them one word
  // at a time instead)
  if (warp_id() < digest_words)
    ss.digests[warp_num()].digest[warp_id()] = s.a;
  __syncthreads();

  if (leading_warp()) {
    init(s);
    for (int i=0; i<digest_words; i++)
      add(s, ss.digests[thread_id()].digest[i]);
  }
  PAR_FOR(digest_words)
    dst.digest[thread_id()] = s.a;
}

} // namespace keccak