84 lines
4.1 KiB
Plaintext
84 lines
4.1 KiB
Plaintext
//
|
|
// Tomás Oliveira e Silva, September 2025
|
|
//
|
|
// Arquiteturas de Alto Desempenho 2025/2026
|
|
//
|
|
|
|
|
|
//
|
|
// sha1_cuda_kernel() --- each CUDA thread computes the SHA1 secure hash of one message
|
|
//
|
|
// this kernel should only be used to validate the SHA1 secure hash code in CUDA
|
|
//
|
|
|
|
#include "aad_sha1.h"
|
|
#include "aad_data_types.h"
|
|
|
|
//
|
|
// the nvcc compiler stores w[] in registers (constant indices!)
|
|
//
|
|
// global thread number: n = threadIdx.x + blockDim.x * blockIdx.x
|
|
// global warp number: n >> 5
|
|
// warp thread number: n & 31 -- the lane
|
|
//
|
|
|
|
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
|
|
void sha1_cuda_kernel(u32_t *interleaved32_data,u32_t *interleaved32_hash)
|
|
{
|
|
u32_t n;
|
|
|
|
//
|
|
// get the global thread number (to make things easier, only the x dimension is used)
|
|
//
|
|
n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;
|
|
//
|
|
// adjust data and hash pointers; together with the DATA and HASH macros below, these pointer adjustments ensure that
|
|
// the 32 threads of a warp access consecutive memory addresses; for one warp addresses grow from the left to the
|
|
// right, and then from top to bottom
|
|
// +----------------------+----------------------+- ... -+----------------------+----------------------+
|
|
// | | | | | |
|
|
// | data[ 0] for lane 0 | data[ 0] for lane 1 | ..... | data[ 0] for lane 30 | data[ 0] for lane 31 |
|
|
// | | | | | |
|
|
// +----------------------+----------------------+- ... -+----------------------+----------------------+
|
|
// | | | | | |
|
|
// | data[ 1] for lane 0 | data[ 1] for lane 1 | ..... | data[ 1] for lane 30 | data[ 1] for lane 31 |
|
|
// | | | | | |
|
|
// +----------------------+----------------------+- ... -+----------------------+----------------------+
|
|
// ...
|
|
// +----------------------+----------------------+- ... -+----------------------+----------------------+
|
|
// | | | | | |
|
|
// | data[13] for lane 0 | data[13] for lane 1 | ..... | data[13] for lane 30 | data[13] for lane 31 |
|
|
// | | | | | |
|
|
// +----------------------+----------------------+- ... -+----------------------+----------------------+
|
|
// this is followed by the data for the next warp
|
|
// +----------------------+----------------------+- ... -+----------------------+----------------------+
|
|
// | | | | | |
|
|
// | data[ 0] for lane 0 | data[ 0] for lane 1 | ..... | data[ 0] for lane 30 | data[ 0] for lane 31 |
|
|
// | | | | | |
|
|
// +----------------------+----------------------+- ... -+----------------------+----------------------+
|
|
// ...
|
|
// And so on. The interleaved32_data is CONCEPTUALLY organized in the following way
|
|
// interleaved32_data[number_of_warps] [14] [32]
|
|
// [warp_number] [idx] [lane]
|
|
// for the same warp number and the same idx, the data for the 32 lanes (warp thread number) are in consecutive addresses
|
|
//
|
|
// the same happens for the interleaved32_hash, but the indices go only from 0 to 4
|
|
//
|
|
interleaved32_data = &interleaved32_data[(n >> 5u) * (32u * 14u) + (n & 31u)];
|
|
interleaved32_hash = &interleaved32_hash[(n >> 5u) * (32u * 5u) + (n & 31u)];
|
|
//
|
|
// compute the SHA1 secure hash
|
|
//
|
|
# define T u32_t
|
|
# define C(c) (c)
|
|
# define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
|
|
# define DATA(idx) interleaved32_data[32u * (idx)]
|
|
# define HASH(idx) interleaved32_hash[32u * (idx)]
|
|
CUSTOM_SHA1_CODE();
|
|
# undef T
|
|
# undef C
|
|
# undef ROTATE
|
|
# undef DATA
|
|
# undef HASH
|
|
}
|