#pragma once

/*
 * Compute a Keccak-like hash in 1024 threads via values supplied locally in
 * each thread (never leaving register memory).
 *
 * The input stream given by each warp has to be synchronised: the same amount
 * of data has to be hashed at the same time.
 *
 * Approach:
 *   1. Compute a Keccak hash of the input stream of the local warp until the
 *      end of the data is reached. 25 (of the 32) threads store one 32-bit
 *      word each.
 *   2. Compute the Keccak hash of all those hashes by streaming them to one
 *      warp via shared memory.
 */

#include <stdint.h>  // uint32_t

namespace keccak {

// Number of 32-bit words in one digest (256 bits / 32 bits per word = 8).
const uint32_t digest_words = 256 / 32;

// One digest, stored as `digest_words` 32-bit words.
struct Result {
    uint32_t digest[digest_words];
};

// 25 32-bit words, addressable either as a 5x5 matrix (`a`) or as a flat
// array (`s`). Both members occupy the same storage.
union State {
    uint32_t a[5][5], s[25];
};

// One digest slot per warp: 1024 threads / 32 threads per warp = 32 slots.
// NOTE(review): per the file header this is meant to be placed in shared
// memory by the caller; nothing in this declaration enforces that.
struct SharedState {
    Result digests[1024 / 32];  // digest from 32 warps
};

// Per-thread part of the hash state: a single 32-bit word.
// NOTE(review): per the file header, 25 of the 32 threads of a warp hold one
// state word each; the role of the remaining threads is not visible here.
union RegisterState {
    uint32_t a;
};

// Declarations only; the definitions live outside this header.
// NOTE(review): the descriptions below are inferred from the names and the
// file header and should be confirmed against the implementation.

// Presumably resets the calling thread's state word before hashing starts.
__device__ void init(RegisterState &s);

// Presumably absorbs one 32-bit word of input from the calling thread into
// the hash; per the file header, all warps must feed data in step.
__device__ void add(RegisterState &s, uint32_t data);

// Presumably finalises the per-warp hashes, combines them through `ss`, and
// writes the resulting digest to `dst`.
__device__ void finish(RegisterState &s, SharedState &ss, Result &dst);

}  // namespace keccak