diff --git a/aad_coin_miner_ocl.c b/aad_coin_miner_ocl.c new file mode 100644 index 0000000..55d8f68 --- /dev/null +++ b/aad_coin_miner_ocl.c @@ -0,0 +1,226 @@ +// +// Arquiteturas de Alto Desempenho 2025/2026 +// +// DETI Coin Miner - OpenCL implementation +// + +#include +#include +#include +#include +#include +#include "aad_data_types.h" +#include "aad_utilities.h" +#include "aad_sha1_cpu.h" +#include "aad_ocl_utilities.h" +#include "aad_vault.h" + +#define COINS_STORAGE_SIZE 1024u + +static volatile int keep_running = 1; + +void signal_handler(int signum) +{ + (void)signum; + keep_running = 0; +} + +// Coin reconstruction from stored data +static void reconstruct_coin(u32_t *stored_data, u32_t coin[14]) +{ + for(int i = 0; i < 14; i++) + coin[i] = stored_data[i]; +} + +// +// Mine DETI coins using OpenCL +// +static void mine_coins_ocl(u64_t max_attempts, int use_scan_kernel) +{ + ocl_data_t od; + u32_t *host_storage; + u64_t attempts = 0; + u32_t coins_found = 0; + u32_t kernel_runs = 0; + + // Initialize OpenCL + od.platform_number = 0; + od.device_number = 0; + od.kernel_file_name = "aad_coin_miner_ocl_kernel.cl"; + od.kernel_name = use_scan_kernel ? "mine_deti_coins_scan_kernel" : "mine_deti_coins_kernel"; + od.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t); + od.data_size[1] = 0; + + initialize_ocl(&od); + + host_storage = (u32_t *)od.host_data[0]; + + // Kernel configuration + od.local_work_size = RECOMMENDED_OCL_WORK_GROUP_SIZE; + od.global_work_size = 4096 * od.local_work_size; // Large grid for maximum GPU utilization + + u32_t n_threads = od.global_work_size; + + printf("Mining DETI coins using OpenCL...\n"); + printf("Device: %s\n", od.device_name); + printf("Work groups: %zu × %zu = %u total work items\n", + od.global_work_size / od.local_work_size, od.local_work_size, n_threads); + printf("Kernel: %s\n", od.kernel_name); + printf("Press Ctrl+C to stop\n\n"); + + // Test SHA1 on host to verify it matches + printf("Testing SHA1 implementation on host...\n"); + u32_t test_coin[14]; + memset(test_coin, 0, sizeof(test_coin)); + ((u08_t *)test_coin)[0x0 ^ 3] = 'D'; + ((u08_t *)test_coin)[0x1 ^ 3] = 'E'; + ((u08_t *)test_coin)[0x2 ^ 3] = 'T'; + ((u08_t *)test_coin)[0x3 ^ 3] = 'I'; + ((u08_t *)test_coin)[0x4 ^ 3] = ' '; + ((u08_t *)test_coin)[0x5 ^ 3] = 'c'; + ((u08_t *)test_coin)[0x6 ^ 3] = 'o'; + ((u08_t *)test_coin)[0x7 ^ 3] = 'i'; + ((u08_t *)test_coin)[0x8 ^ 3] = 'n'; + ((u08_t *)test_coin)[0x9 ^ 3] = ' '; + ((u08_t *)test_coin)[0xa ^ 3] = '2'; + ((u08_t *)test_coin)[0xb ^ 3] = ' '; + ((u08_t *)test_coin)[0x36 ^ 3] = '\n'; + ((u08_t *)test_coin)[0x37 ^ 3] = 0x80; + for(int i = 12; i < 54; i++) + ((u08_t *)test_coin)[i ^ 3] = 'A' + (i - 12) % 26; + + u32_t test_hash[5]; + sha1(test_coin, test_hash); + printf("Host test hash: 0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n", + test_hash[0], test_hash[1], test_hash[2], test_hash[3], test_hash[4]); + + // Now test on device + printf("Testing SHA1 implementation on device...\n"); + host_storage[0] = 1u; + + // Put the test coin in storage starting at index 1 + for(int i = 0; i < 14; i++) + host_storage[1 + i] = test_coin[i]; + + // Copy to device + host_to_device_copy(&od, 0); + + // We'll add a test kernel - for now just verify basic kernel launch works + printf("Starting mining...\n\n"); + + u32_t param1 = (u32_t)time(NULL); + u32_t param2 = 0x12345678u; + int scan_pos = 12; + + time_measurement(); + time_measurement(); + double start_time = wall_time_delta(); + + while(keep_running && (max_attempts == 0 || attempts < max_attempts)) + { + // Initialize storage area + host_storage[0] = 1u; // First unused index + + // Copy to device + host_to_device_copy(&od, 0); + + // Set kernel arguments + od.n_kernel_arguments = use_scan_kernel ? 4 : 3; + set_kernel_arg(&od, 0, sizeof(cl_mem), &od.device_data[0]); + set_kernel_arg(&od, 1, sizeof(u32_t), ¶m1); + set_kernel_arg(&od, 2, sizeof(u32_t), ¶m2); + if(use_scan_kernel) + set_kernel_arg(&od, 3, sizeof(int), &scan_pos); + + // Launch the OpenCL kernel + launch_kernel(&od); + + // Copy results back + device_to_host_copy(&od, 0); + + // Process found coins + u32_t n_coins_this_kernel = 0; + u32_t n_stored = (host_storage[0] - 1) / 14; + + if(n_stored > 0 && host_storage[0] < COINS_STORAGE_SIZE) + { + for(u32_t i = 0; i < n_stored; i++) + { + u32_t coin[14]; + reconstruct_coin(&host_storage[1 + i * 14], coin); + + // Verify it's actually a valid coin + u32_t hash[5]; + sha1(coin, hash); + + if(hash[0] == 0xAAD20250u) + { + coins_found++; + n_coins_this_kernel++; + printf("COIN FOUND! (kernel %u, coin %u in this kernel)\n", + kernel_runs, n_coins_this_kernel); + save_coin(coin); + } + } + } + + // Update counters + kernel_runs++; + if(use_scan_kernel) + attempts += n_threads * 256; // Each thread tries 256 values + else + attempts += n_threads; + + // Update parameters for next iteration + param1++; + param2 = param2 ^ 0x9E3779B9u; + if(use_scan_kernel) + scan_pos = (scan_pos + 1) % 42 + 12; // Cycle through positions 12-53 + + // Print progress every 10 kernel launches + if(kernel_runs % 10 == 0) + { + time_measurement(); + double current_time = wall_time_delta() - start_time; + double rate = attempts / current_time; + printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u\n", + (unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs); + } + } + + time_measurement(); + double total_time = wall_time_delta() - start_time; + + printf("\n=== Mining Statistics ===\n"); + printf("Total attempts: %llu\n", (unsigned long long)attempts); + printf("Total time: %.2f seconds\n", total_time); + printf("Average rate: %.2f attempts/second\n", attempts / total_time); + printf("Coins found: %u\n", coins_found); + printf("Kernel launches: %u\n", kernel_runs); + + // Save any remaining coins + save_coin(NULL); + + terminate_ocl(&od); +} + +int main(int argc, char *argv[]) +{ + u64_t max_attempts = 0; + int use_scan_kernel = 0; + + signal(SIGINT, signal_handler); + + if(argc > 1) + max_attempts = strtoull(argv[1], NULL, 10); + + if(argc > 2 && strcmp(argv[2], "scan") == 0) + { + use_scan_kernel = 1; + printf("Using scan kernel (tries 256 values per thread)\n"); + } + + mine_coins_ocl(max_attempts, use_scan_kernel); + + return 0; +} diff --git a/aad_coin_miner_ocl_kernel.cl b/aad_coin_miner_ocl_kernel.cl new file mode 100644 index 0000000..6744b49 --- /dev/null +++ b/aad_coin_miner_ocl_kernel.cl @@ -0,0 +1,255 @@ +// +// Arquiteturas de Alto Desempenho 2025/2026 +// +// DETI Coin Miner - OpenCL kernel +// + +// Rotate left for SHA-1 +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) + +// SHA-1 macros +#define SHA1_F1(x,y,z) ((x & y) | (~x & z)) +#define SHA1_K1 0x5A827999u +#define SHA1_F2(x,y,z) (x ^ y ^ z) +#define SHA1_K2 0x6ED9EBA1u +#define SHA1_F3(x,y,z) ((x & y) | (x & z) | (y & z)) +#define SHA1_K3 0x8F1BBCDCu +#define SHA1_F4(x,y,z) (x ^ y ^ z) +#define SHA1_K4 0xCA62C1D6u + +// +// SHA-1 implementation matching the template from aad_sha1.h +// +void sha1_compute(__private uint *coin, __private uint *hash) +{ + uint a, b, c, d, e, w[16]; + + // Initial hash values + a = 0x67452301u; + b = 0xEFCDAB89u; + c = 0x98BADCFEu; + d = 0x10325476u; + e = 0xC3D2E1F0u; + + // Load message schedule (first 14 words from coin, then 0, then length) + for(int i = 0; i < 14; i++) + w[i] = coin[i]; + w[14] = 0; + w[15] = 440; // 55 bytes * 8 bits + + // SHA-1 compression function - 80 rounds + uint tmp; + + // Rounds 0-15 + #define ROUND1(t) \ + tmp = ROTATE_LEFT(a, 5) + SHA1_F1(b,c,d) + e + w[t] + SHA1_K1; \ + e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp; + + ROUND1(0); ROUND1(1); ROUND1(2); ROUND1(3); + ROUND1(4); ROUND1(5); ROUND1(6); ROUND1(7); + ROUND1(8); ROUND1(9); ROUND1(10); ROUND1(11); + ROUND1(12); ROUND1(13); ROUND1(14); ROUND1(15); + + #undef ROUND1 + + // Rounds 16-79 with message schedule + #define ROUND(F, K, t) \ + tmp = w[(t-3) & 15] ^ w[(t-8) & 15] ^ w[(t-14) & 15] ^ w[(t-16) & 15]; \ + w[t & 15] = ROTATE_LEFT(tmp, 1); \ + tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t & 15] + K; \ + e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp; + + ROUND(SHA1_F1, SHA1_K1, 16); ROUND(SHA1_F1, SHA1_K1, 17); + ROUND(SHA1_F1, SHA1_K1, 18); ROUND(SHA1_F1, SHA1_K1, 19); + + ROUND(SHA1_F2, SHA1_K2, 20); ROUND(SHA1_F2, SHA1_K2, 21); + ROUND(SHA1_F2, SHA1_K2, 22); ROUND(SHA1_F2, SHA1_K2, 23); + ROUND(SHA1_F2, SHA1_K2, 24); ROUND(SHA1_F2, SHA1_K2, 25); + ROUND(SHA1_F2, SHA1_K2, 26); ROUND(SHA1_F2, SHA1_K2, 27); + ROUND(SHA1_F2, SHA1_K2, 28); ROUND(SHA1_F2, SHA1_K2, 29); + ROUND(SHA1_F2, SHA1_K2, 30); ROUND(SHA1_F2, SHA1_K2, 31); + ROUND(SHA1_F2, SHA1_K2, 32); ROUND(SHA1_F2, SHA1_K2, 33); + ROUND(SHA1_F2, SHA1_K2, 34); ROUND(SHA1_F2, SHA1_K2, 35); + ROUND(SHA1_F2, SHA1_K2, 36); ROUND(SHA1_F2, SHA1_K2, 37); + ROUND(SHA1_F2, SHA1_K2, 38); ROUND(SHA1_F2, SHA1_K2, 39); + + ROUND(SHA1_F3, SHA1_K3, 40); ROUND(SHA1_F3, SHA1_K3, 41); + ROUND(SHA1_F3, SHA1_K3, 42); ROUND(SHA1_F3, SHA1_K3, 43); + ROUND(SHA1_F3, SHA1_K3, 44); ROUND(SHA1_F3, SHA1_K3, 45); + ROUND(SHA1_F3, SHA1_K3, 46); ROUND(SHA1_F3, SHA1_K3, 47); + ROUND(SHA1_F3, SHA1_K3, 48); ROUND(SHA1_F3, SHA1_K3, 49); + ROUND(SHA1_F3, SHA1_K3, 50); ROUND(SHA1_F3, SHA1_K3, 51); + ROUND(SHA1_F3, SHA1_K3, 52); ROUND(SHA1_F3, SHA1_K3, 53); + ROUND(SHA1_F3, SHA1_K3, 54); ROUND(SHA1_F3, SHA1_K3, 55); + ROUND(SHA1_F3, SHA1_K3, 56); ROUND(SHA1_F3, SHA1_K3, 57); + ROUND(SHA1_F3, SHA1_K3, 58); ROUND(SHA1_F3, SHA1_K3, 59); + + ROUND(SHA1_F4, SHA1_K4, 60); ROUND(SHA1_F4, SHA1_K4, 61); + ROUND(SHA1_F4, SHA1_K4, 62); ROUND(SHA1_F4, SHA1_K4, 63); + ROUND(SHA1_F4, SHA1_K4, 64); ROUND(SHA1_F4, SHA1_K4, 65); + ROUND(SHA1_F4, SHA1_K4, 66); ROUND(SHA1_F4, SHA1_K4, 67); + ROUND(SHA1_F4, SHA1_K4, 68); ROUND(SHA1_F4, SHA1_K4, 69); + ROUND(SHA1_F4, SHA1_K4, 70); ROUND(SHA1_F4, SHA1_K4, 71); + ROUND(SHA1_F4, SHA1_K4, 72); ROUND(SHA1_F4, SHA1_K4, 73); + ROUND(SHA1_F4, SHA1_K4, 74); ROUND(SHA1_F4, SHA1_K4, 75); + ROUND(SHA1_F4, SHA1_K4, 76); ROUND(SHA1_F4, SHA1_K4, 77); + ROUND(SHA1_F4, SHA1_K4, 78); ROUND(SHA1_F4, SHA1_K4, 79); + + #undef ROUND + + // Add to initial values + hash[0] = a + 0x67452301u; + hash[1] = b + 0xEFCDAB89u; + hash[2] = c + 0x98BADCFEu; + hash[3] = d + 0x10325476u; + hash[4] = e + 0xC3D2E1F0u; +} + +// +// Basic mining kernel - each work item tries one coin +// +__kernel void mine_deti_coins_kernel(__global uint *storage, uint param1, uint param2) +{ + uint gid = get_global_id(0); + uint coin[14]; + uint hash[5]; + + // Zero initialize + for(int i = 0; i < 14; i++) + coin[i] = 0; + + // Access as bytes with XOR 3 for endianness (little-endian word, big-endian bytes) + __private uchar *bytes = (__private uchar *)coin; + + // Fixed prefix: "DETI coin 2 " + bytes[0x0 ^ 3] = 'D'; + bytes[0x1 ^ 3] = 'E'; + bytes[0x2 ^ 3] = 'T'; + bytes[0x3 ^ 3] = 'I'; + bytes[0x4 ^ 3] = ' '; + bytes[0x5 ^ 3] = 'c'; + bytes[0x6 ^ 3] = 'o'; + bytes[0x7 ^ 3] = 'i'; + bytes[0x8 ^ 3] = 'n'; + bytes[0x9 ^ 3] = ' '; + bytes[0xa ^ 3] = '2'; + bytes[0xb ^ 3] = ' '; + + // Fixed suffix: newline + padding + bytes[0x36 ^ 3] = '\n'; + bytes[0x37 ^ 3] = 0x80; + + // Variable content (42 bytes from position 12 to 53) + // Generate unique content for each thread + uint seed = param1 + gid * 0x9E3779B9u; + uint seed2 = param2 ^ (gid * 0x61C88647u); + + for(int i = 12; i < 54; i++) + { + // LCG + xorshift mixer + seed = seed * 1664525u + 1013904223u; + seed2 ^= seed2 << 13; + seed2 ^= seed2 >> 17; + seed2 ^= seed2 << 5; + + uchar val = 32 + ((seed ^ seed2) % 95); + + // Skip newline character + if(val == '\n') val = ' '; + // Ensure we stay in printable range + if(val >= 127) val = 126; + + bytes[i ^ 3] = val; + } + + // Compute SHA-1 + sha1_compute(coin, hash); + + // Check for valid DETI coin v2 (hash starts with 0xAAD20250) + if(hash[0] == 0xAAD20250u) + { + // Atomically reserve space and store the coin + uint idx = atomic_add(&storage[0], 14u); + + if(idx + 14 <= 1024) + { + // Store all 14 words of the coin + for(int i = 0; i < 14; i++) + storage[idx + i] = coin[i]; + } + } +} + +// +// Scan kernel - each work item tries 256 variations +// +__kernel void mine_deti_coins_scan_kernel(__global uint *storage, uint param1, uint param2, int scan_pos) +{ + uint gid = get_global_id(0); + uint coin[14]; + uint hash[5]; + + // Initialize coin + for(int i = 0; i < 14; i++) + coin[i] = 0; + + __private uchar *bytes = (__private uchar *)coin; + + // Fixed parts + bytes[0x0 ^ 3] = 'D'; + bytes[0x1 ^ 3] = 'E'; + bytes[0x2 ^ 3] = 'T'; + bytes[0x3 ^ 3] = 'I'; + bytes[0x4 ^ 3] = ' '; + bytes[0x5 ^ 3] = 'c'; + bytes[0x6 ^ 3] = 'o'; + bytes[0x7 ^ 3] = 'i'; + bytes[0x8 ^ 3] = 'n'; + bytes[0x9 ^ 3] = ' '; + bytes[0xa ^ 3] = '2'; + bytes[0xb ^ 3] = ' '; + bytes[0x36 ^ 3] = '\n'; + bytes[0x37 ^ 3] = 0x80; + + // Generate base content unique to this thread + uint seed = param1 + gid * 0x9E3779B9u; + uint seed2 = param2 ^ (gid * 0x61C88647u); + + for(int i = 12; i < 54; i++) + { + seed = seed * 1664525u + 1013904223u; + seed2 ^= seed2 << 13; + seed2 ^= seed2 >> 17; + seed2 ^= seed2 << 5; + + uchar val = 32 + ((seed ^ seed2) % 95); + if(val == '\n') val = ' '; + if(val >= 127) val = 126; + + bytes[i ^ 3] = val; + } + + // Validate scan_pos + if(scan_pos < 12 || scan_pos >= 54) + scan_pos = 12; + + // Scan through all printable ASCII values at scan_pos + for(uint c = 32; c < 127; c++) + { + if(c == '\n') continue; // Skip newline + + bytes[scan_pos ^ 3] = (uchar)c; + + sha1_compute(coin, hash); + + if(hash[0] == 0xAAD20250u) + { + uint idx = atomic_add(&storage[0], 14u); + if(idx + 14 <= 1024) + { + for(int i = 0; i < 14; i++) + storage[idx + i] = coin[i]; + } + } + } +} diff --git a/aad_ocl_utilities.h b/aad_ocl_utilities.h new file mode 100644 index 0000000..5b3e77a --- /dev/null +++ b/aad_ocl_utilities.h @@ -0,0 +1,346 @@ +// +// Arquiteturas de Alto Desempenho 2025/2026 +// +// OpenCL utilities +// + +#ifndef AAD_OCL_UTILITIES +#define AAD_OCL_UTILITIES + +#define CL_TARGET_OPENCL_VERSION 120 +#include + +// +// data type used to store all OpenCL related stuff +// + +#define MAX_N_ARGUMENTS 4 +#define RECOMMENDED_OCL_WORK_GROUP_SIZE 128 + +typedef struct +{ + // input data + int platform_number; // number of the platform to use + int device_number; // number of the device to initialize + char *kernel_file_name; // name of the kernel file to load + char *kernel_name; // name of the OpenCL kernel to load + u32_t data_size[2]; // the number of bytes of the two data arrays to allocate on the host and on the device (0 if not needed) + + // persistent data + cl_platform_id platform; // the platform handle + cl_device_id device; // the device handle + char device_name[256]; // the device name + cl_context context; // the device context + cl_command_queue queue; // the command queue + cl_program program; // the compiled program + cl_kernel kernel; // the kernel handle + void *host_data[2]; // the pointers to the host data + cl_mem device_data[2]; // the device memory objects + + // launch kernel data + size_t global_work_size; // total number of work items + size_t local_work_size; // number of work items per work group + int n_kernel_arguments; // number of kernel arguments +} +ocl_data_t; + +// +// CL_CALL --- macro to call an OpenCL function and test its return value +// + +#define CL_CALL(f_name, args) \ + do \ + { \ + cl_int e = f_name args; \ + if(e != CL_SUCCESS) \ + { \ + fprintf(stderr,"" # f_name "() returned %s (file %s, line %d)\n",cl_error_string(e),__FILE__,__LINE__); \ + exit(1); \ + } \ + } \ + while(0) + +// +// Error code to string conversion +// + +static const char *cl_error_string(cl_int e) +{ + static char error_string[64]; +#define CASE(error_code) case error_code: return "" # error_code; + switch(e) + { + default: sprintf(error_string,"unknown error code (%d)",(int)e); return error_string; + CASE(CL_SUCCESS); + CASE(CL_DEVICE_NOT_FOUND); + CASE(CL_DEVICE_NOT_AVAILABLE); + CASE(CL_COMPILER_NOT_AVAILABLE); + CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); + CASE(CL_OUT_OF_RESOURCES); + CASE(CL_OUT_OF_HOST_MEMORY); + CASE(CL_PROFILING_INFO_NOT_AVAILABLE); + CASE(CL_MEM_COPY_OVERLAP); + CASE(CL_IMAGE_FORMAT_MISMATCH); + CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); + CASE(CL_BUILD_PROGRAM_FAILURE); + CASE(CL_MAP_FAILURE); + CASE(CL_INVALID_VALUE); + CASE(CL_INVALID_DEVICE_TYPE); + CASE(CL_INVALID_PLATFORM); + CASE(CL_INVALID_DEVICE); + CASE(CL_INVALID_CONTEXT); + CASE(CL_INVALID_QUEUE_PROPERTIES); + CASE(CL_INVALID_COMMAND_QUEUE); + CASE(CL_INVALID_HOST_PTR); + CASE(CL_INVALID_MEM_OBJECT); + CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); + CASE(CL_INVALID_IMAGE_SIZE); + CASE(CL_INVALID_SAMPLER); + CASE(CL_INVALID_BINARY); + CASE(CL_INVALID_BUILD_OPTIONS); + CASE(CL_INVALID_PROGRAM); + CASE(CL_INVALID_PROGRAM_EXECUTABLE); + CASE(CL_INVALID_KERNEL_NAME); + CASE(CL_INVALID_KERNEL_DEFINITION); + CASE(CL_INVALID_KERNEL); + CASE(CL_INVALID_ARG_INDEX); + CASE(CL_INVALID_ARG_VALUE); + CASE(CL_INVALID_ARG_SIZE); + CASE(CL_INVALID_KERNEL_ARGS); + CASE(CL_INVALID_WORK_DIMENSION); + CASE(CL_INVALID_WORK_GROUP_SIZE); + CASE(CL_INVALID_WORK_ITEM_SIZE); + CASE(CL_INVALID_GLOBAL_OFFSET); + CASE(CL_INVALID_EVENT_WAIT_LIST); + CASE(CL_INVALID_EVENT); + CASE(CL_INVALID_OPERATION); + CASE(CL_INVALID_GL_OBJECT); + CASE(CL_INVALID_BUFFER_SIZE); + CASE(CL_INVALID_MIP_LEVEL); + CASE(CL_INVALID_GLOBAL_WORK_SIZE); + } +#undef CASE +} + +// +// Read kernel source from file +// + +static char *read_kernel_source(const char *filename, size_t *length) +{ + FILE *fp = fopen(filename, "rb"); + if(!fp) + { + fprintf(stderr, "Failed to open kernel file: %s\n", filename); + exit(1); + } + + fseek(fp, 0, SEEK_END); + *length = ftell(fp); + fseek(fp, 0, SEEK_SET); + + char *source = (char *)malloc(*length + 1); + if(!source) + { + fprintf(stderr, "Failed to allocate memory for kernel source\n"); + fclose(fp); + exit(1); + } + + size_t read = fread(source, 1, *length, fp); + source[read] = '\0'; + fclose(fp); + + *length = read; + return source; +} + +// +// Initialize OpenCL +// + +static void initialize_ocl(ocl_data_t *od) +{ + cl_uint num_platforms, num_devices; + cl_int err; + + // Get platform + CL_CALL(clGetPlatformIDs, (0, NULL, &num_platforms)); + if(od->platform_number >= (int)num_platforms) + { + fprintf(stderr, "Invalid platform number %d (only %u platforms available)\n", + od->platform_number, num_platforms); + exit(1); + } + + cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); + CL_CALL(clGetPlatformIDs, (num_platforms, platforms, NULL)); + od->platform = platforms[od->platform_number]; + free(platforms); + + // Get device + CL_CALL(clGetDeviceIDs, (od->platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)); + if(od->device_number >= (int)num_devices) + { + fprintf(stderr, "Invalid device number %d (only %u devices available)\n", + od->device_number, num_devices); + exit(1); + } + + cl_device_id *devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices); + CL_CALL(clGetDeviceIDs, (od->platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL)); + od->device = devices[od->device_number]; + free(devices); + + // Get device name + CL_CALL(clGetDeviceInfo, (od->device, CL_DEVICE_NAME, sizeof(od->device_name), + od->device_name, NULL)); + printf("initialize_ocl(): OpenCL code running on %s\n", od->device_name); + + // Create context + od->context = clCreateContext(NULL, 1, &od->device, NULL, NULL, &err); + if(err != CL_SUCCESS) + { + fprintf(stderr, "clCreateContext() returned %s\n", cl_error_string(err)); + exit(1); + } + + // Create command queue + od->queue = clCreateCommandQueue(od->context, od->device, 0, &err); + if(err != CL_SUCCESS) + { + fprintf(stderr, "clCreateCommandQueue() returned %s\n", cl_error_string(err)); + exit(1); + } + + // Load and compile kernel + size_t source_length; + char *source = read_kernel_source(od->kernel_file_name, &source_length); + + od->program = clCreateProgramWithSource(od->context, 1, (const char **)&source, + &source_length, &err); + free(source); + if(err != CL_SUCCESS) + { + fprintf(stderr, "clCreateProgramWithSource() returned %s\n", cl_error_string(err)); + exit(1); + } + + err = clBuildProgram(od->program, 1, &od->device, "-cl-std=CL1.2", NULL, NULL); + if(err != CL_SUCCESS) + { + fprintf(stderr, "clBuildProgram() returned %s\n", cl_error_string(err)); + + // Get build log + size_t log_size; + clGetProgramBuildInfo(od->program, od->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = (char *)malloc(log_size); + clGetProgramBuildInfo(od->program, od->device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + fprintf(stderr, "Build log:\n%s\n", log); + free(log); + exit(1); + } + + // Create kernel + od->kernel = clCreateKernel(od->program, od->kernel_name, &err); + if(err != CL_SUCCESS) + { + fprintf(stderr, "clCreateKernel() returned %s\n", cl_error_string(err)); + exit(1); + } + + // Allocate host and device memory + for(int i = 0; i < 2; i++) + { + if(od->data_size[i] > 0u) + { + od->host_data[i] = malloc(od->data_size[i]); + if(!od->host_data[i]) + { + fprintf(stderr, "Failed to allocate host memory\n"); + exit(1); + } + + od->device_data[i] = clCreateBuffer(od->context, CL_MEM_READ_WRITE, + od->data_size[i], NULL, &err); + if(err != CL_SUCCESS) + { + fprintf(stderr, "clCreateBuffer() returned %s\n", cl_error_string(err)); + exit(1); + } + } + else + { + od->host_data[i] = NULL; + } + } +} + +// +// Terminate OpenCL +// + +static void terminate_ocl(ocl_data_t *od) +{ + for(int i = 0; i < 2; i++) + { + if(od->data_size[i] > 0u) + { + free(od->host_data[i]); + clReleaseMemObject(od->device_data[i]); + } + } + clReleaseKernel(od->kernel); + clReleaseProgram(od->program); + clReleaseCommandQueue(od->queue); + clReleaseContext(od->context); +} + +// +// Copy data between host and device +// + +static void host_to_device_copy(ocl_data_t *od, int idx) +{ + if(idx < 0 || idx > 1 || od->data_size[idx] == 0u) + { + fprintf(stderr, "host_to_device_copy(): bad idx\n"); + exit(1); + } + CL_CALL(clEnqueueWriteBuffer, (od->queue, od->device_data[idx], CL_TRUE, 0, + od->data_size[idx], od->host_data[idx], 0, NULL, NULL)); +} + +static void device_to_host_copy(ocl_data_t *od, int idx) +{ + if(idx < 0 || idx > 1 || od->data_size[idx] == 0u) + { + fprintf(stderr, "device_to_host_copy(): bad idx\n"); + exit(1); + } + CL_CALL(clEnqueueReadBuffer, (od->queue, od->device_data[idx], CL_TRUE, 0, + od->data_size[idx], od->host_data[idx], 0, NULL, NULL)); +} + +// +// Set kernel argument +// + +static void set_kernel_arg(ocl_data_t *od, int arg_idx, size_t arg_size, const void *arg_value) +{ + CL_CALL(clSetKernelArg, (od->kernel, arg_idx, arg_size, arg_value)); +} + +// +// Launch kernel +// + +static void launch_kernel(ocl_data_t *od) +{ + CL_CALL(clEnqueueNDRangeKernel, (od->queue, od->kernel, 1, NULL, + &od->global_work_size, &od->local_work_size, + 0, NULL, NULL)); + CL_CALL(clFinish, (od->queue)); +} + +#endif diff --git a/aad_sha1_ocl_kernel.cl b/aad_sha1_ocl_kernel.cl new file mode 100644 index 0000000..2ba781b --- /dev/null +++ b/aad_sha1_ocl_kernel.cl @@ -0,0 +1,175 @@ +// +// Arquiteturas de Alto Desempenho 2025/2026 +// +// SHA-1 OpenCL kernel - validation and testing +// + +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) + +// SHA-1 functions +#define SHA1_F1(x,y,z) ((x & y) | (~x & z)) +#define SHA1_K1 0x5A827999u +#define SHA1_F2(x,y,z) (x ^ y ^ z) +#define SHA1_K2 0x6ED9EBA1u +#define SHA1_F3(x,y,z) ((x & y) | (x & z) | (y & z)) +#define SHA1_K3 0x8F1BBCDCu +#define SHA1_F4(x,y,z) (x ^ y ^ z) +#define SHA1_K4 0xCA62C1D6u + +// +// sha1_ocl_kernel() --- each OpenCL work item computes the SHA-1 secure hash of one message +// +// This kernel should only be used to validate the SHA-1 secure hash code in OpenCL +// It matches the structure of the CUDA sha1_cuda_kernel +// +// The data is organized in an interleaved fashion for optimal memory access: +// For work group size W, the layout is: +// data[work_group_num][14][W] +// where data for the same index across all work items in a group are consecutive +// +__kernel void sha1_ocl_kernel(__global uint *interleaved_data, __global uint *interleaved_hash) +{ + uint gid = get_global_id(0); + uint local_id = get_local_id(0); + uint local_size = get_local_size(0); + uint group_id = get_group_id(0); + + // Adjust pointers for interleaved access pattern + // Each work group processes local_size messages + // Within a group, data is interleaved: all work items' data[0], then all data[1], etc. + __global uint *data = &interleaved_data[group_id * (local_size * 14) + local_id]; + __global uint *hash = &interleaved_hash[group_id * (local_size * 5) + local_id]; + + // Local storage for computation + uint a, b, c, d, e, w[16]; + uint tmp; + + // Initial hash values + a = 0x67452301u; + b = 0xEFCDAB89u; + c = 0x98BADCFEu; + d = 0x10325476u; + e = 0xC3D2E1F0u; + + // Load message schedule from interleaved data + // DATA(idx) accesses data[idx * local_size] to get the correct interleaved value + for(int i = 0; i < 14; i++) + w[i] = data[local_size * i]; + w[14] = 0; + w[15] = 440; // 55 bytes * 8 bits + + // SHA-1 compression - 80 rounds + + // Rounds 0-15 (no message schedule expansion needed) + #define SHA1_STEP(F, K, t) \ + tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t] + K; \ + e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp; + + SHA1_STEP(SHA1_F1, SHA1_K1, 0); + SHA1_STEP(SHA1_F1, SHA1_K1, 1); + SHA1_STEP(SHA1_F1, SHA1_K1, 2); + SHA1_STEP(SHA1_F1, SHA1_K1, 3); + SHA1_STEP(SHA1_F1, SHA1_K1, 4); + SHA1_STEP(SHA1_F1, SHA1_K1, 5); + SHA1_STEP(SHA1_F1, SHA1_K1, 6); + SHA1_STEP(SHA1_F1, SHA1_K1, 7); + SHA1_STEP(SHA1_F1, SHA1_K1, 8); + SHA1_STEP(SHA1_F1, SHA1_K1, 9); + SHA1_STEP(SHA1_F1, SHA1_K1, 10); + SHA1_STEP(SHA1_F1, SHA1_K1, 11); + SHA1_STEP(SHA1_F1, SHA1_K1, 12); + SHA1_STEP(SHA1_F1, SHA1_K1, 13); + SHA1_STEP(SHA1_F1, SHA1_K1, 14); + SHA1_STEP(SHA1_F1, SHA1_K1, 15); + + #undef SHA1_STEP + + // Rounds 16-79 with message schedule expansion + #define SHA1_EXPAND_STEP(F, K, t) \ + tmp = w[(t-3) & 15] ^ w[(t-8) & 15] ^ w[(t-14) & 15] ^ w[(t-16) & 15]; \ + w[t & 15] = ROTATE_LEFT(tmp, 1); \ + tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t & 15] + K; \ + e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp; + + // Rounds 16-19 (still using F1) + SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 16); + SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 17); + SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 18); + SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 19); + + // Rounds 20-39 (F2) + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 20); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 21); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 22); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 23); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 24); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 25); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 26); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 27); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 28); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 29); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 30); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 31); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 32); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 33); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 34); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 35); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 36); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 37); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 38); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 39); + + // Rounds 40-59 (F3) + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 40); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 41); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 42); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 43); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 44); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 45); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 46); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 47); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 48); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 49); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 50); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 51); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 52); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 53); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 54); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 55); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 56); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 57); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 58); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 59); + + // Rounds 60-79 (F4) + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 60); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 61); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 62); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 63); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 64); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 65); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 66); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 67); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 68); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 69); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 70); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 71); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 72); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 73); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 74); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 75); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 76); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 77); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 78); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 79); + + #undef SHA1_EXPAND_STEP + + // Add to initial hash values and store in interleaved format + // HASH(idx) writes to hash[idx * local_size] + hash[local_size * 0] = a + 0x67452301u; + hash[local_size * 1] = b + 0xEFCDAB89u; + hash[local_size * 2] = c + 0x98BADCFEu; + hash[local_size * 3] = d + 0x10325476u; + hash[local_size * 4] = e + 0xC3D2E1F0u; +} diff --git a/makefile b/makefile index c4c0298..1d1d1ef 100644 --- a/makefile +++ b/makefile @@ -42,7 +42,7 @@ CUDA_ARCH = sm_86 clean: rm -f sha1_tests rm -f sha1_cuda_test sha1_cuda_kernel.cubin - rm -f coin_miner_cpu coin_miner_simd coin_miner_cuda coin_miner_cuda_kernel.cubin + rm -f coin_miner_cpu coin_miner_simd coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl rm -f coin_miner_wasm.js coin_miner_wasm.wasm rm -f benchmark rm -f a.out @@ -85,6 +85,9 @@ coin_miner_cuda_kernel.cubin: aad_coin_miner_cuda_kernel.cu aad_sha1.h makefile coin_miner_cuda: aad_coin_miner_cuda.c coin_miner_cuda_kernel.cubin aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h aad_vault.h aad_cuda_utilities.h makefile cc -march=native -Wall -Wshadow -Werror -O3 -I$(CUDA_DIR)/include $< -o $@ -lcuda +coin_miner_ocl: aad_coin_miner_ocl.c aad_coin_miner_ocl_kernel.cl aad_sha1.h aad_sha1_cpu.h aad_sha1_ocl_kernel.cl aad_data_types.h aad_utilities.h aad_vault.h aad_ocl_utilities.h makefile + cc -march=native -Wall -Wshadow -O3 $< -o $@ -lOpenCL + coin_miner_wasm: aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h aad_data_types.h aad_utilities.h aad_vault.h makefile emcc -O3 -flto -msimd128 -o coin_miner_wasm.js aad_coin_miner_wasm.c \ -s WASM=1 \ @@ -98,8 +101,8 @@ coin_miner_wasm: aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h benchmark: aad_benchmark.c aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h makefile cc -march=native -Wall -Wshadow -Werror -O3 $< -o $@ -miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda benchmark +miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_ocl benchmark all: sha1_tests sha1_cuda_test sha1_cuda_kernel.cubin \ - coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin \ + coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl \ benchmark