//
// Arquiteturas de Alto Desempenho 2025/2026
//
// SHA-1 OpenCL kernel - validation and testing
//

// Rotate the 32-bit word x left by n bits; n must be in [1,31] so both shifts are well defined
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

// SHA-1 round functions and round constants (FIPS 180-4, sections 4.1.1 and 4.2.1)
// All macro arguments are parenthesized so any expression can be passed safely
#define SHA1_F1(x, y, z) (((x) & (y)) | (~(x) & (z)))
#define SHA1_K1 0x5A827999u
#define SHA1_F2(x, y, z) ((x) ^ (y) ^ (z))
#define SHA1_K2 0x6ED9EBA1u
#define SHA1_F3(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define SHA1_K3 0x8F1BBCDCu
#define SHA1_F4(x, y, z) ((x) ^ (y) ^ (z))
#define SHA1_K4 0xCA62C1D6u

//
// sha1_ocl_kernel() --- each OpenCL work item computes the SHA-1 secure hash of one message
//
// This kernel should only be used to validate the SHA-1 secure hash code in OpenCL.
// It matches the structure of the CUDA sha1_cuda_kernel.
//
// Each message occupies 14 uint words. The length words of the SHA-1 block
// (w[14], w[15]) are fixed here to a 440-bit (55-byte) message, so the host is
// expected to place the mandatory 0x80 padding byte in the final byte of word 13
// --- NOTE(review): confirm against the host-side message packing code.
//
// The data is organized in an interleaved fashion for coalesced memory access:
// for work group size W the layout is
//   data[work_group_num][14][W]
// i.e. the word with the same index for all work items of a group is stored
// consecutively. The hash output uses the same layout with 5 words per message.
//
__kernel void sha1_ocl_kernel(__global uint *interleaved_data, __global uint *interleaved_hash)
{
  uint local_id = get_local_id(0);
  uint local_size = get_local_size(0);
  uint group_id = get_group_id(0);

  // Base pointers for this work item inside its group's interleaved region;
  // word i of this work item's message lives at data[i * local_size]
  __global uint *data = &interleaved_data[group_id * (local_size * 14u) + local_id];
  __global uint *hash = &interleaved_hash[group_id * (local_size * 5u) + local_id];

  // Working state (a..e) and 16-word circular message schedule
  uint a, b, c, d, e, w[16];
  uint tmp;

  // Initial hash values (FIPS 180-4, section 5.3.1)
  a = 0x67452301u;
  b = 0xEFCDAB89u;
  c = 0x98BADCFEu;
  d = 0x10325476u;
  e = 0xC3D2E1F0u;

  // Load the 14 message words from the interleaved layout
  for (int i = 0; i < 14; i++)
    w[i] = data[local_size * i];
  w[14] = 0u;
  w[15] = 440u; // message bit length: 55 bytes * 8 bits

  // One SHA-1 round for t in 0..15 (schedule word already loaded)
#define SHA1_STEP(F, K, t)                                    \
  do {                                                        \
    tmp = ROTATE_LEFT(a, 5) + F(b, c, d) + e + w[(t)] + (K);  \
    e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp;     \
  } while (0)

  // Rounds 0-15 (F1/K1)
  SHA1_STEP(SHA1_F1, SHA1_K1, 0);
  SHA1_STEP(SHA1_F1, SHA1_K1, 1);
  SHA1_STEP(SHA1_F1, SHA1_K1, 2);
  SHA1_STEP(SHA1_F1, SHA1_K1, 3);
  SHA1_STEP(SHA1_F1, SHA1_K1, 4);
  SHA1_STEP(SHA1_F1, SHA1_K1, 5);
  SHA1_STEP(SHA1_F1, SHA1_K1, 6);
  SHA1_STEP(SHA1_F1, SHA1_K1, 7);
  SHA1_STEP(SHA1_F1, SHA1_K1, 8);
  SHA1_STEP(SHA1_F1, SHA1_K1, 9);
  SHA1_STEP(SHA1_F1, SHA1_K1, 10);
  SHA1_STEP(SHA1_F1, SHA1_K1, 11);
  SHA1_STEP(SHA1_F1, SHA1_K1, 12);
  SHA1_STEP(SHA1_F1, SHA1_K1, 13);
  SHA1_STEP(SHA1_F1, SHA1_K1, 14);
  SHA1_STEP(SHA1_F1, SHA1_K1, 15);
#undef SHA1_STEP

  // One SHA-1 round for t in 16..79, expanding the message schedule in place
  // (w is used as a 16-word circular buffer, indexed modulo 16)
#define SHA1_EXPAND_STEP(F, K, t)                                                        \
  do {                                                                                   \
    tmp = w[((t) - 3) & 15] ^ w[((t) - 8) & 15] ^ w[((t) - 14) & 15] ^ w[((t) - 16) & 15]; \
    w[(t) & 15] = ROTATE_LEFT(tmp, 1);                                                   \
    tmp = ROTATE_LEFT(a, 5) + F(b, c, d) + e + w[(t) & 15] + (K);                        \
    e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp;                                \
  } while (0)

  // Rounds 16-19 (still F1/K1)
  SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 16);
  SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 17);
  SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 18);
  SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 19);

  // Rounds 20-39 (F2/K2)
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 20);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 21);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 22);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 23);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 24);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 25);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 26);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 27);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 28);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 29);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 30);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 31);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 32);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 33);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 34);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 35);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 36);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 37);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 38);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 39);

  // Rounds 40-59 (F3/K3)
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 40);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 41);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 42);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 43);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 44);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 45);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 46);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 47);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 48);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 49);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 50);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 51);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 52);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 53);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 54);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 55);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 56);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 57);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 58);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 59);

  // Rounds 60-79 (F4/K4)
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 60);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 61);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 62);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 63);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 64);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 65);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 66);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 67);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 68);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 69);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 70);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 71);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 72);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 73);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 74);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 75);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 76);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 77);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 78);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 79);
#undef SHA1_EXPAND_STEP

  // Add the compressed chunk to the initial hash value and store interleaved:
  // word i of this work item's digest goes to hash[i * local_size]
  hash[local_size * 0] = a + 0x67452301u;
  hash[local_size * 1] = b + 0xEFCDAB89u;
  hash[local_size * 2] = c + 0x98BADCFEu;
  hash[local_size * 3] = d + 0x10325476u;
  hash[local_size * 4] = e + 0xC3D2E1F0u;
}