// File: aad-assignment-1/aad_sha1_ocl_kernel.cl
// (OpenCL C source; the file host's "Common Lisp" language label is incorrect)
//
// Arquiteturas de Alto Desempenho 2025/2026
//
// SHA-1 OpenCL kernel - validation and testing
//
// 32-bit left rotation; n must be in 1..31 (a shift by 0 or 32 would be UB).
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
// SHA-1 round functions and additive constants (FIPS 180-4, sections 4.1.1 and 4.2.1).
// Every macro argument is fully parenthesized so callers may pass arbitrary
// expressions without operator-precedence surprises.
// Rounds 0-19: Ch(x,y,z) — bitwise "choose": y where x is set, z elsewhere
#define SHA1_F1(x,y,z) (((x) & (y)) | (~(x) & (z)))
#define SHA1_K1 0x5A827999u
// Rounds 20-39: Parity(x,y,z)
#define SHA1_F2(x,y,z) ((x) ^ (y) ^ (z))
#define SHA1_K2 0x6ED9EBA1u
// Rounds 40-59: Maj(x,y,z) — bitwise majority vote
#define SHA1_F3(x,y,z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SHA1_K3 0x8F1BBCDCu
// Rounds 60-79: Parity(x,y,z) again
#define SHA1_F4(x,y,z) ((x) ^ (y) ^ (z))
#define SHA1_K4 0xCA62C1D6u
//
// sha1_ocl_kernel() --- each OpenCL work item computes the SHA-1 secure hash of one message
//
// This kernel should only be used to validate the SHA-1 secure hash code in OpenCL
// It matches the structure of the CUDA sha1_cuda_kernel
//
// The data is organized in an interleaved fashion for optimal memory access:
// For work group size W, the layout is:
// data[work_group_num][14][W]
// where data for the same index across all work items in a group are consecutive
//
__kernel void sha1_ocl_kernel(__global uint *interleaved_data, __global uint *interleaved_hash)
{
    // Each work item hashes one single-block message. w[15] is hard-coded to 440
    // bits, i.e. a 55-byte message; the host is assumed to have packed the
    // message (with its 0x80 padding byte) into the 14 data words — TODO confirm
    // against the host-side packing code.
    uint local_id = get_local_id(0);
    uint local_size = get_local_size(0);
    uint group_id = get_group_id(0);

    // Adjust pointers for the interleaved access pattern.
    // Each work group processes local_size messages; within a group, word i of
    // every work item's message is stored consecutively, so this work item's
    // word i lives at data[i * local_size] (likewise hash[i * local_size]
    // for the 5-word digest).
    __global uint *data = &interleaved_data[group_id * (local_size * 14) + local_id];
    __global uint *hash = &interleaved_hash[group_id * (local_size * 5) + local_id];

    // Working state and 16-word circular message schedule
    uint a, b, c, d, e, w[16];
    uint tmp;

    // SHA-1 initial hash values (FIPS 180-4, section 5.3.1)
    a = 0x67452301u;
    b = 0xEFCDAB89u;
    c = 0x98BADCFEu;
    d = 0x10325476u;
    e = 0xC3D2E1F0u;

    // Load the first 14 schedule words from the interleaved buffer
    for(int i = 0; i < 14; i++)
        w[i] = data[local_size * i];
    w[14] = 0;
    w[15] = 440; // message length in bits: 55 bytes * 8

    // SHA-1 compression - 80 rounds.
    // Rounds 0-15: schedule words come straight from the message block.
    // The macro is wrapped in do/while(0) so it behaves as a single statement.
#define SHA1_STEP(F, K, t) \
    do { \
        tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t] + K; \
        e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp; \
    } while(0)
    SHA1_STEP(SHA1_F1, SHA1_K1, 0);
    SHA1_STEP(SHA1_F1, SHA1_K1, 1);
    SHA1_STEP(SHA1_F1, SHA1_K1, 2);
    SHA1_STEP(SHA1_F1, SHA1_K1, 3);
    SHA1_STEP(SHA1_F1, SHA1_K1, 4);
    SHA1_STEP(SHA1_F1, SHA1_K1, 5);
    SHA1_STEP(SHA1_F1, SHA1_K1, 6);
    SHA1_STEP(SHA1_F1, SHA1_K1, 7);
    SHA1_STEP(SHA1_F1, SHA1_K1, 8);
    SHA1_STEP(SHA1_F1, SHA1_K1, 9);
    SHA1_STEP(SHA1_F1, SHA1_K1, 10);
    SHA1_STEP(SHA1_F1, SHA1_K1, 11);
    SHA1_STEP(SHA1_F1, SHA1_K1, 12);
    SHA1_STEP(SHA1_F1, SHA1_K1, 13);
    SHA1_STEP(SHA1_F1, SHA1_K1, 14);
    SHA1_STEP(SHA1_F1, SHA1_K1, 15);
#undef SHA1_STEP
    // Rounds 16-79: expand the schedule in place, treating w[] as a 16-entry
    // ring buffer ("& 15" reduces the round index modulo 16).
#define SHA1_EXPAND_STEP(F, K, t) \
    do { \
        tmp = w[(t-3) & 15] ^ w[(t-8) & 15] ^ w[(t-14) & 15] ^ w[(t-16) & 15]; \
        w[t & 15] = ROTATE_LEFT(tmp, 1); \
        tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t & 15] + K; \
        e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp; \
    } while(0)
    // Rounds 16-19 (still using F1)
    SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 16);
    SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 17);
    SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 18);
    SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 19);
    // Rounds 20-39 (F2)
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 20);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 21);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 22);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 23);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 24);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 25);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 26);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 27);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 28);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 29);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 30);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 31);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 32);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 33);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 34);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 35);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 36);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 37);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 38);
    SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 39);
    // Rounds 40-59 (F3)
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 40);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 41);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 42);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 43);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 44);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 45);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 46);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 47);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 48);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 49);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 50);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 51);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 52);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 53);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 54);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 55);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 56);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 57);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 58);
    SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 59);
    // Rounds 60-79 (F4)
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 60);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 61);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 62);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 63);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 64);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 65);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 66);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 67);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 68);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 69);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 70);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 71);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 72);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 73);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 74);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 75);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 76);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 77);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 78);
    SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 79);
#undef SHA1_EXPAND_STEP
    // Add the compressed chunk to the initial hash values and store the
    // 5-word digest in the same interleaved layout as the input.
    hash[local_size * 0] = a + 0x67452301u;
    hash[local_size * 1] = b + 0xEFCDAB89u;
    hash[local_size * 2] = c + 0x98BADCFEu;
    hash[local_size * 3] = d + 0x10325476u;
    hash[local_size * 4] = e + 0xC3D2E1F0u;
}