#pragma once

/* Compute a Keccak-like hash in 1024 threads via values supplied
   locally in each thread (never leaving register memory). The input
   streams given by the warps have to be synchronised: the same amount
   of data has to be hashed by every warp at the same time.

   Approach:
   1. Compute a Keccak hash of the input stream of the local warp until
      end of data is reached. 25 (of the 32) threads store one 32-bit
      word each.
   2. Compute the Keccak hash of all those hashes by streaming them to
      one warp via shared memory.
*/

#include <inttypes.h>

namespace keccak {

// Length of the final digest in 32-bit words: 256 bits / 32 bits per word = 8.
const uint32_t digest_words = 256 / 32;

// A finished digest: digest_words consecutive 32-bit words.
struct Result {
  uint32_t digest[digest_words];
};

// Two views onto the same 25 32-bit words (100 bytes): a 5x5 grid `a` and a
// flat array `s`. Being union members, both occupy the same storage.
// NOTE(review): no declaration in this header uses State; presumably the
// implementation does -- confirm before removing or changing it.
union State {
  uint32_t a[5][5];
  uint32_t s[25];
};

// Staging area with one digest slot per warp of a thread block.
// NOTE(review): the name and the file header comment suggest instances are
// meant to live in __shared__ memory, but this header does not enforce it;
// the caller of finish() supplies the object -- confirm at the call site.
struct SharedState {
  // Block geometry this layout is sized for. Enumerators are compile-time
  // constants that need no storage, so they can be used as array bounds and
  // referenced from both host and device code.
  enum {
    threads_per_block = 1024,
    threads_per_warp = 32,
    warps_per_block = threads_per_block / threads_per_warp // = 32
  };

  Result digests[warps_per_block]; // one digest per warp
};

// Per-thread hashing state: a single 32-bit word. The file header comment
// describes each participating thread as storing one 32-bit word.
// Kept as a union (not a struct) so existing forward declarations and users
// of the type stay valid.
union RegisterState {
  uint32_t a;
};

// Hashing interface. Only the prototypes are visible in this header; the
// descriptions below are derived from the signatures and the file header
// comment and should be confirmed against the implementation.
// NOTE(review): the file header comment implies a 1024-thread block in which
// all warps call add() the same number of times -- confirm the exact launch
// and synchronisation requirements with the implementation.

// Prepares `s` for hashing a new input stream (presumably resets it).
__device__ void init(RegisterState &s);

// Feeds one 32-bit value `data`, supplied by the calling thread, into `s`.
__device__ void add(RegisterState &s, uint32_t data);

// Ends the input stream and produces the digest in `dst`. `ss` is the
// per-warp digest staging area; presumably the per-warp results are combined
// through it into the final hash -- confirm which threads receive a valid
// `dst` and whether every thread of the block must make this call.
__device__ void finish(RegisterState &s, SharedState &ss, Result &dst);

} // namespace keccak