// // Tomás Oliveira e Silva, September 2025 // // Arquiteturas de Alto Desempenho 2025/2026 // // // sha1_cuda_kernel() --- each CUDA thread computes the SHA1 secure hash of one message // // this kernel should only be used to validate the SHA1 secure hash code in CUDA // #include "aad_sha1.h" #include "aad_data_types.h" // // the nvcc compiler stores w[] in registers (constant indices!) // // global thread number: n = threadIdx.x + blockDim.x * blockIdx.x // global warp number: n >> 5 // warp thread number: n & 31 -- the lane // extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1) void sha1_cuda_kernel(u32_t *interleaved32_data,u32_t *interleaved32_hash) { u32_t n; // // get the global thread number (to make things easier, only the x dimension is used) // n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x; // // adjust data and hash pointers; together with the DATA and HASH macros below, these pointer adjustments ensure that // the 32 threads of a warp access consecutive memory addresses; for one warp addresses grow from the left to the // right, and then from top to bottom // +----------------------+----------------------+- ... -+----------------------+----------------------+ // | | | | | | // | data[ 0] for lane 0 | data[ 0] for lane 1 | ..... | data[ 0] for lane 30 | data[ 0] for lane 31 | // | | | | | | // +----------------------+----------------------+- ... -+----------------------+----------------------+ // | | | | | | // | data[ 1] for lane 0 | data[ 1] for lane 1 | ..... | data[ 1] for lane 30 | data[ 1] for lane 31 | // | | | | | | // +----------------------+----------------------+- ... -+----------------------+----------------------+ // ... // +----------------------+----------------------+- ... -+----------------------+----------------------+ // | | | | | | // | data[13] for lane 0 | data[13] for lane 1 | ..... | data[13] for lane 30 | data[13] for lane 31 | // | | | | | | // +----------------------+----------------------+- ... -+----------------------+----------------------+ // this is followed by the data for the next warp // +----------------------+----------------------+- ... -+----------------------+----------------------+ // | | | | | | // | data[ 0] for lane 0 | data[ 0] for lane 1 | ..... | data[ 0] for lane 30 | data[ 0] for lane 31 | // | | | | | | // +----------------------+----------------------+- ... -+----------------------+----------------------+ // ... // And so on. The interleaved32_data is CONCEPTUALLY organized in the following way // interleaved32_data[number_of_warps] [14] [32] // [warp_number] [idx] [lane] // for the same warp number and the same idx, the data for the 32 lanes (warp thread number) are in consecutive addresses // // the same happens for the interleaved32_hash, but the indices go only from 0 to 4 // interleaved32_data = &interleaved32_data[(n >> 5u) * (32u * 14u) + (n & 31u)]; interleaved32_hash = &interleaved32_hash[(n >> 5u) * (32u * 5u) + (n & 31u)]; // // compute the SHA1 secure hash // # define T u32_t # define C(c) (c) # define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n)))) # define DATA(idx) interleaved32_data[32u * (idx)] # define HASH(idx) interleaved32_hash[32u * (idx)] CUSTOM_SHA1_CODE(); # undef T # undef C # undef ROTATE # undef DATA # undef HASH }