OpenCL implementation (optimized for Intel Iris XE)

Signed-off-by: Tiago Garcia <tiago.rgarcia@ua.pt>
2025-11-22 17:48:07 +00:00 · 2025-11-22 17:48:07 +00:00 · 5febf93e65
parent 4d2bbe4f9b
commit 5febf93e65
5 changed files with 1008 additions and 3 deletions
--- a/aad_coin_miner_ocl.c
+++ b/aad_coin_miner_ocl.c
@ -0,0 +1,226 @@
 //
 // Arquiteturas de Alto Desempenho 2025/2026
 //
 // DETI Coin Miner - OpenCL implementation
 //
 #include <time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <signal.h>
 #include "aad_data_types.h"
 #include "aad_utilities.h"
 #include "aad_sha1_cpu.h"
 #include "aad_ocl_utilities.h"
 #include "aad_vault.h"
 #define COINS_STORAGE_SIZE  1024u
 static volatile int keep_running = 1;
 void signal_handler(int signum)
 {
  (void)signum;
  keep_running = 0;
 }
 // Coin reconstruction from stored data
 static void reconstruct_coin(u32_t *stored_data, u32_t coin[14])
 {
  for(int i = 0; i < 14; i++)
    coin[i] = stored_data[i];
 }
 //
 // Mine DETI coins using OpenCL
 //
 static void mine_coins_ocl(u64_t max_attempts, int use_scan_kernel)
 {
  ocl_data_t od;
  u32_t *host_storage;
  u64_t attempts = 0;
  u32_t coins_found = 0;
  u32_t kernel_runs = 0;
  // Initialize OpenCL
  od.platform_number = 0;
  od.device_number = 0;
  od.kernel_file_name = "aad_coin_miner_ocl_kernel.cl";
  od.kernel_name = use_scan_kernel ? "mine_deti_coins_scan_kernel" : "mine_deti_coins_kernel";
  od.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t);
  od.data_size[1] = 0;
  initialize_ocl(&od);
  host_storage = (u32_t *)od.host_data[0];
  // Kernel configuration
  od.local_work_size = RECOMMENDED_OCL_WORK_GROUP_SIZE;
  od.global_work_size = 4096 * od.local_work_size;  // Large grid for maximum GPU utilization
  u32_t n_threads = od.global_work_size;
  printf("Mining DETI coins using OpenCL...\n");
  printf("Device: %s\n", od.device_name);
  printf("Work groups: %zu × %zu = %u total work items\n",
         od.global_work_size / od.local_work_size, od.local_work_size, n_threads);
  printf("Kernel: %s\n", od.kernel_name);
  printf("Press Ctrl+C to stop\n\n");
  // Test SHA1 on host to verify it matches
  printf("Testing SHA1 implementation on host...\n");
  u32_t test_coin[14];
  memset(test_coin, 0, sizeof(test_coin));
  ((u08_t *)test_coin)[0x0 ^ 3] = 'D';
  ((u08_t *)test_coin)[0x1 ^ 3] = 'E';
  ((u08_t *)test_coin)[0x2 ^ 3] = 'T';
  ((u08_t *)test_coin)[0x3 ^ 3] = 'I';
  ((u08_t *)test_coin)[0x4 ^ 3] = ' ';
  ((u08_t *)test_coin)[0x5 ^ 3] = 'c';
  ((u08_t *)test_coin)[0x6 ^ 3] = 'o';
  ((u08_t *)test_coin)[0x7 ^ 3] = 'i';
  ((u08_t *)test_coin)[0x8 ^ 3] = 'n';
  ((u08_t *)test_coin)[0x9 ^ 3] = ' ';
  ((u08_t *)test_coin)[0xa ^ 3] = '2';
  ((u08_t *)test_coin)[0xb ^ 3] = ' ';
  ((u08_t *)test_coin)[0x36 ^ 3] = '\n';
  ((u08_t *)test_coin)[0x37 ^ 3] = 0x80;
  for(int i = 12; i < 54; i++)
    ((u08_t *)test_coin)[i ^ 3] = 'A' + (i - 12) % 26;
  u32_t test_hash[5];
  sha1(test_coin, test_hash);
  printf("Host test hash: 0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n",
         test_hash[0], test_hash[1], test_hash[2], test_hash[3], test_hash[4]);
  // Now test on device
  printf("Testing SHA1 implementation on device...\n");
  host_storage[0] = 1u;
  // Put the test coin in storage starting at index 1
  for(int i = 0; i < 14; i++)
    host_storage[1 + i] = test_coin[i];
  // Copy to device
  host_to_device_copy(&od, 0);
  // We'll add a test kernel - for now just verify basic kernel launch works
  printf("Starting mining...\n\n");
  u32_t param1 = (u32_t)time(NULL);
  u32_t param2 = 0x12345678u;
  int scan_pos = 12;
  time_measurement();
  time_measurement();
  double start_time = wall_time_delta();
  while(keep_running && (max_attempts == 0 || attempts < max_attempts))
  {
    // Initialize storage area
    host_storage[0] = 1u; // First unused index
    // Copy to device
    host_to_device_copy(&od, 0);
    // Set kernel arguments
    od.n_kernel_arguments = use_scan_kernel ? 4 : 3;
    set_kernel_arg(&od, 0, sizeof(cl_mem), &od.device_data[0]);
    set_kernel_arg(&od, 1, sizeof(u32_t), &param1);
    set_kernel_arg(&od, 2, sizeof(u32_t), &param2);
    if(use_scan_kernel)
      set_kernel_arg(&od, 3, sizeof(int), &scan_pos);
    // Launch the OpenCL kernel
    launch_kernel(&od);
    // Copy results back
    device_to_host_copy(&od, 0);
    // Process found coins
    u32_t n_coins_this_kernel = 0;
    u32_t n_stored = (host_storage[0] - 1) / 14;
    if(n_stored > 0 && host_storage[0] < COINS_STORAGE_SIZE)
    {
      for(u32_t i = 0; i < n_stored; i++)
      {
        u32_t coin[14];
        reconstruct_coin(&host_storage[1 + i * 14], coin);
        // Verify it's actually a valid coin
        u32_t hash[5];
        sha1(coin, hash);
        if(hash[0] == 0xAAD20250u)
        {
          coins_found++;
          n_coins_this_kernel++;
          printf("COIN FOUND! (kernel %u, coin %u in this kernel)\n",
                 kernel_runs, n_coins_this_kernel);
          save_coin(coin);
        }
      }
    }
    // Update counters
    kernel_runs++;
    if(use_scan_kernel)
      attempts += n_threads * 256; // Each thread tries 256 values
    else
      attempts += n_threads;
    // Update parameters for next iteration
    param1++;
    param2 = param2 ^ 0x9E3779B9u;
    if(use_scan_kernel)
      scan_pos = (scan_pos + 1) % 42 + 12; // Cycle through positions 12-53
    // Print progress every 10 kernel launches
    if(kernel_runs % 10 == 0)
    {
      time_measurement();
      double current_time = wall_time_delta() - start_time;
      double rate = attempts / current_time;
      printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u\n",
             (unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs);
    }
  }
  time_measurement();
  double total_time = wall_time_delta() - start_time;
  printf("\n=== Mining Statistics ===\n");
  printf("Total attempts: %llu\n", (unsigned long long)attempts);
  printf("Total time: %.2f seconds\n", total_time);
  printf("Average rate: %.2f attempts/second\n", attempts / total_time);
  printf("Coins found: %u\n", coins_found);
  printf("Kernel launches: %u\n", kernel_runs);
  // Save any remaining coins
  save_coin(NULL);
  terminate_ocl(&od);
 }
 int main(int argc, char *argv[])
 {
  u64_t max_attempts = 0;
  int use_scan_kernel = 0;
  signal(SIGINT, signal_handler);
  if(argc > 1)
    max_attempts = strtoull(argv[1], NULL, 10);
  if(argc > 2 && strcmp(argv[2], "scan") == 0)
  {
    use_scan_kernel = 1;
    printf("Using scan kernel (tries 256 values per thread)\n");
  }
  mine_coins_ocl(max_attempts, use_scan_kernel);
  return 0;
 }
--- a/aad_coin_miner_ocl_kernel.cl
+++ b/aad_coin_miner_ocl_kernel.cl
@ -0,0 +1,255 @@
 //
 // Arquiteturas de Alto Desempenho 2025/2026
 //
 // DETI Coin Miner - OpenCL kernel
 //
 // Rotate left for SHA-1
 #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
 // SHA-1 macros
 #define SHA1_F1(x,y,z) ((x & y) | (~x & z))
 #define SHA1_K1 0x5A827999u
 #define SHA1_F2(x,y,z) (x ^ y ^ z)
 #define SHA1_K2 0x6ED9EBA1u
 #define SHA1_F3(x,y,z) ((x & y) | (x & z) | (y & z))
 #define SHA1_K3 0x8F1BBCDCu
 #define SHA1_F4(x,y,z) (x ^ y ^ z)
 #define SHA1_K4 0xCA62C1D6u
 //
 // SHA-1 implementation matching the template from aad_sha1.h
 //
 void sha1_compute(__private uint *coin, __private uint *hash)
 {
  uint a, b, c, d, e, w[16];
  // Initial hash values
  a = 0x67452301u;
  b = 0xEFCDAB89u;
  c = 0x98BADCFEu;
  d = 0x10325476u;
  e = 0xC3D2E1F0u;
  // Load message schedule (first 14 words from coin, then 0, then length)
  for(int i = 0; i < 14; i++)
    w[i] = coin[i];
  w[14] = 0;
  w[15] = 440; // 55 bytes * 8 bits
  // SHA-1 compression function - 80 rounds
  uint tmp;
  // Rounds 0-15
  #define ROUND1(t) \
    tmp = ROTATE_LEFT(a, 5) + SHA1_F1(b,c,d) + e + w[t] + SHA1_K1; \
    e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp;
  ROUND1(0); ROUND1(1); ROUND1(2); ROUND1(3);
  ROUND1(4); ROUND1(5); ROUND1(6); ROUND1(7);
  ROUND1(8); ROUND1(9); ROUND1(10); ROUND1(11);
  ROUND1(12); ROUND1(13); ROUND1(14); ROUND1(15);
  #undef ROUND1
  // Rounds 16-79 with message schedule
  #define ROUND(F, K, t) \
    tmp = w[(t-3) & 15] ^ w[(t-8) & 15] ^ w[(t-14) & 15] ^ w[(t-16) & 15]; \
    w[t & 15] = ROTATE_LEFT(tmp, 1); \
    tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t & 15] + K; \
    e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp;
  ROUND(SHA1_F1, SHA1_K1, 16); ROUND(SHA1_F1, SHA1_K1, 17);
  ROUND(SHA1_F1, SHA1_K1, 18); ROUND(SHA1_F1, SHA1_K1, 19);
  ROUND(SHA1_F2, SHA1_K2, 20); ROUND(SHA1_F2, SHA1_K2, 21);
  ROUND(SHA1_F2, SHA1_K2, 22); ROUND(SHA1_F2, SHA1_K2, 23);
  ROUND(SHA1_F2, SHA1_K2, 24); ROUND(SHA1_F2, SHA1_K2, 25);
  ROUND(SHA1_F2, SHA1_K2, 26); ROUND(SHA1_F2, SHA1_K2, 27);
  ROUND(SHA1_F2, SHA1_K2, 28); ROUND(SHA1_F2, SHA1_K2, 29);
  ROUND(SHA1_F2, SHA1_K2, 30); ROUND(SHA1_F2, SHA1_K2, 31);
  ROUND(SHA1_F2, SHA1_K2, 32); ROUND(SHA1_F2, SHA1_K2, 33);
  ROUND(SHA1_F2, SHA1_K2, 34); ROUND(SHA1_F2, SHA1_K2, 35);
  ROUND(SHA1_F2, SHA1_K2, 36); ROUND(SHA1_F2, SHA1_K2, 37);
  ROUND(SHA1_F2, SHA1_K2, 38); ROUND(SHA1_F2, SHA1_K2, 39);
  ROUND(SHA1_F3, SHA1_K3, 40); ROUND(SHA1_F3, SHA1_K3, 41);
  ROUND(SHA1_F3, SHA1_K3, 42); ROUND(SHA1_F3, SHA1_K3, 43);
  ROUND(SHA1_F3, SHA1_K3, 44); ROUND(SHA1_F3, SHA1_K3, 45);
  ROUND(SHA1_F3, SHA1_K3, 46); ROUND(SHA1_F3, SHA1_K3, 47);
  ROUND(SHA1_F3, SHA1_K3, 48); ROUND(SHA1_F3, SHA1_K3, 49);
  ROUND(SHA1_F3, SHA1_K3, 50); ROUND(SHA1_F3, SHA1_K3, 51);
  ROUND(SHA1_F3, SHA1_K3, 52); ROUND(SHA1_F3, SHA1_K3, 53);
  ROUND(SHA1_F3, SHA1_K3, 54); ROUND(SHA1_F3, SHA1_K3, 55);
  ROUND(SHA1_F3, SHA1_K3, 56); ROUND(SHA1_F3, SHA1_K3, 57);
  ROUND(SHA1_F3, SHA1_K3, 58); ROUND(SHA1_F3, SHA1_K3, 59);
  ROUND(SHA1_F4, SHA1_K4, 60); ROUND(SHA1_F4, SHA1_K4, 61);
  ROUND(SHA1_F4, SHA1_K4, 62); ROUND(SHA1_F4, SHA1_K4, 63);
  ROUND(SHA1_F4, SHA1_K4, 64); ROUND(SHA1_F4, SHA1_K4, 65);
  ROUND(SHA1_F4, SHA1_K4, 66); ROUND(SHA1_F4, SHA1_K4, 67);
  ROUND(SHA1_F4, SHA1_K4, 68); ROUND(SHA1_F4, SHA1_K4, 69);
  ROUND(SHA1_F4, SHA1_K4, 70); ROUND(SHA1_F4, SHA1_K4, 71);
  ROUND(SHA1_F4, SHA1_K4, 72); ROUND(SHA1_F4, SHA1_K4, 73);
  ROUND(SHA1_F4, SHA1_K4, 74); ROUND(SHA1_F4, SHA1_K4, 75);
  ROUND(SHA1_F4, SHA1_K4, 76); ROUND(SHA1_F4, SHA1_K4, 77);
  ROUND(SHA1_F4, SHA1_K4, 78); ROUND(SHA1_F4, SHA1_K4, 79);
  #undef ROUND
  // Add to initial values
  hash[0] = a + 0x67452301u;
  hash[1] = b + 0xEFCDAB89u;
  hash[2] = c + 0x98BADCFEu;
  hash[3] = d + 0x10325476u;
  hash[4] = e + 0xC3D2E1F0u;
 }
 //
 // Basic mining kernel - each work item tries one coin
 //
 __kernel void mine_deti_coins_kernel(__global uint *storage, uint param1, uint param2)
 {
  uint gid = get_global_id(0);
  uint coin[14];
  uint hash[5];
  // Zero initialize
  for(int i = 0; i < 14; i++)
    coin[i] = 0;
  // Access as bytes with XOR 3 for endianness (little-endian word, big-endian bytes)
  __private uchar *bytes = (__private uchar *)coin;
  // Fixed prefix: "DETI coin 2 "
  bytes[0x0 ^ 3] = 'D';
  bytes[0x1 ^ 3] = 'E';
  bytes[0x2 ^ 3] = 'T';
  bytes[0x3 ^ 3] = 'I';
  bytes[0x4 ^ 3] = ' ';
  bytes[0x5 ^ 3] = 'c';
  bytes[0x6 ^ 3] = 'o';
  bytes[0x7 ^ 3] = 'i';
  bytes[0x8 ^ 3] = 'n';
  bytes[0x9 ^ 3] = ' ';
  bytes[0xa ^ 3] = '2';
  bytes[0xb ^ 3] = ' ';
  // Fixed suffix: newline + padding
  bytes[0x36 ^ 3] = '\n';
  bytes[0x37 ^ 3] = 0x80;
  // Variable content (42 bytes from position 12 to 53)
  // Generate unique content for each thread
  uint seed = param1 + gid * 0x9E3779B9u;
  uint seed2 = param2 ^ (gid * 0x61C88647u);
  for(int i = 12; i < 54; i++)
  {
    // LCG + xorshift mixer
    seed = seed * 1664525u + 1013904223u;
    seed2 ^= seed2 << 13;
    seed2 ^= seed2 >> 17;
    seed2 ^= seed2 << 5;
    uchar val = 32 + ((seed ^ seed2) % 95);
    // Skip newline character
    if(val == '\n') val = ' ';
    // Ensure we stay in printable range
    if(val >= 127) val = 126;
    bytes[i ^ 3] = val;
  }
  // Compute SHA-1
  sha1_compute(coin, hash);
  // Check for valid DETI coin v2 (hash starts with 0xAAD20250)
  if(hash[0] == 0xAAD20250u)
  {
    // Atomically reserve space and store the coin
    uint idx = atomic_add(&storage[0], 14u);
    if(idx + 14 <= 1024)
    {
      // Store all 14 words of the coin
      for(int i = 0; i < 14; i++)
        storage[idx + i] = coin[i];
    }
  }
 }
 //
 // Scan kernel - each work item tries 256 variations
 //
 __kernel void mine_deti_coins_scan_kernel(__global uint *storage, uint param1, uint param2, int scan_pos)
 {
  uint gid = get_global_id(0);
  uint coin[14];
  uint hash[5];
  // Initialize coin
  for(int i = 0; i < 14; i++)
    coin[i] = 0;
  __private uchar *bytes = (__private uchar *)coin;
  // Fixed parts
  bytes[0x0 ^ 3] = 'D';
  bytes[0x1 ^ 3] = 'E';
  bytes[0x2 ^ 3] = 'T';
  bytes[0x3 ^ 3] = 'I';
  bytes[0x4 ^ 3] = ' ';
  bytes[0x5 ^ 3] = 'c';
  bytes[0x6 ^ 3] = 'o';
  bytes[0x7 ^ 3] = 'i';
  bytes[0x8 ^ 3] = 'n';
  bytes[0x9 ^ 3] = ' ';
  bytes[0xa ^ 3] = '2';
  bytes[0xb ^ 3] = ' ';
  bytes[0x36 ^ 3] = '\n';
  bytes[0x37 ^ 3] = 0x80;
  // Generate base content unique to this thread
  uint seed = param1 + gid * 0x9E3779B9u;
  uint seed2 = param2 ^ (gid * 0x61C88647u);
  for(int i = 12; i < 54; i++)
  {
    seed = seed * 1664525u + 1013904223u;
    seed2 ^= seed2 << 13;
    seed2 ^= seed2 >> 17;
    seed2 ^= seed2 << 5;
    uchar val = 32 + ((seed ^ seed2) % 95);
    if(val == '\n') val = ' ';
    if(val >= 127) val = 126;
    bytes[i ^ 3] = val;
  }
  // Validate scan_pos
  if(scan_pos < 12 || scan_pos >= 54)
    scan_pos = 12;
  // Scan through all printable ASCII values at scan_pos
  for(uint c = 32; c < 127; c++)
  {
    if(c == '\n') continue; // Skip newline
    bytes[scan_pos ^ 3] = (uchar)c;
    sha1_compute(coin, hash);
    if(hash[0] == 0xAAD20250u)
    {
      uint idx = atomic_add(&storage[0], 14u);
      if(idx + 14 <= 1024)
      {
        for(int i = 0; i < 14; i++)
          storage[idx + i] = coin[i];
      }
    }
  }
 }
--- a/aad_ocl_utilities.h
+++ b/aad_ocl_utilities.h
@ -0,0 +1,346 @@
 //
 // Arquiteturas de Alto Desempenho 2025/2026
 //
 // OpenCL utilities
 //
 #ifndef AAD_OCL_UTILITIES
 #define AAD_OCL_UTILITIES
 #define CL_TARGET_OPENCL_VERSION 120
 #include <CL/cl.h>
 //
 // data type used to store all OpenCL related stuff
 //
 #define MAX_N_ARGUMENTS  4
 #define RECOMMENDED_OCL_WORK_GROUP_SIZE  128
 typedef struct
 {
  // input data
  int platform_number;           // number of the platform to use
  int device_number;             // number of the device to initialize
  char *kernel_file_name;        // name of the kernel file to load
  char *kernel_name;             // name of the OpenCL kernel to load
  u32_t data_size[2];            // the number of bytes of the two data arrays to allocate on the host and on the device (0 if not needed)
  // persistent data
  cl_platform_id   platform;     // the platform handle
  cl_device_id     device;       // the device handle
  char             device_name[256]; // the device name
  cl_context       context;      // the device context
  cl_command_queue queue;        // the command queue
  cl_program       program;      // the compiled program
  cl_kernel        kernel;       // the kernel handle
  void            *host_data[2]; // the pointers to the host data
  cl_mem           device_data[2]; // the device memory objects
  // launch kernel data
  size_t global_work_size;       // total number of work items
  size_t local_work_size;        // number of work items per work group
  int n_kernel_arguments;        // number of kernel arguments
 }
 ocl_data_t;
 //
 // CL_CALL --- macro to call an OpenCL function and test its return value
 //
 #define CL_CALL(f_name, args)                                                                                \
  do                                                                                                         \
  {                                                                                                          \
    cl_int e = f_name args;                                                                                  \
    if(e != CL_SUCCESS)                                                                                      \
    {                                                                                                        \
      fprintf(stderr,"" # f_name "() returned %s (file %s, line %d)\n",cl_error_string(e),__FILE__,__LINE__); \
      exit(1);                                                                                               \
    }                                                                                                        \
  }                                                                                                          \
  while(0)
 //
 // Error code to string conversion
 //
 static const char *cl_error_string(cl_int e)
 {
  static char error_string[64];
 #define CASE(error_code)  case error_code: return "" # error_code;
  switch(e)
  {
    default: sprintf(error_string,"unknown error code (%d)",(int)e); return error_string;
    CASE(CL_SUCCESS);
    CASE(CL_DEVICE_NOT_FOUND);
    CASE(CL_DEVICE_NOT_AVAILABLE);
    CASE(CL_COMPILER_NOT_AVAILABLE);
    CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
    CASE(CL_OUT_OF_RESOURCES);
    CASE(CL_OUT_OF_HOST_MEMORY);
    CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
    CASE(CL_MEM_COPY_OVERLAP);
    CASE(CL_IMAGE_FORMAT_MISMATCH);
    CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
    CASE(CL_BUILD_PROGRAM_FAILURE);
    CASE(CL_MAP_FAILURE);
    CASE(CL_INVALID_VALUE);
    CASE(CL_INVALID_DEVICE_TYPE);
    CASE(CL_INVALID_PLATFORM);
    CASE(CL_INVALID_DEVICE);
    CASE(CL_INVALID_CONTEXT);
    CASE(CL_INVALID_QUEUE_PROPERTIES);
    CASE(CL_INVALID_COMMAND_QUEUE);
    CASE(CL_INVALID_HOST_PTR);
    CASE(CL_INVALID_MEM_OBJECT);
    CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
    CASE(CL_INVALID_IMAGE_SIZE);
    CASE(CL_INVALID_SAMPLER);
    CASE(CL_INVALID_BINARY);
    CASE(CL_INVALID_BUILD_OPTIONS);
    CASE(CL_INVALID_PROGRAM);
    CASE(CL_INVALID_PROGRAM_EXECUTABLE);
    CASE(CL_INVALID_KERNEL_NAME);
    CASE(CL_INVALID_KERNEL_DEFINITION);
    CASE(CL_INVALID_KERNEL);
    CASE(CL_INVALID_ARG_INDEX);
    CASE(CL_INVALID_ARG_VALUE);
    CASE(CL_INVALID_ARG_SIZE);
    CASE(CL_INVALID_KERNEL_ARGS);
    CASE(CL_INVALID_WORK_DIMENSION);
    CASE(CL_INVALID_WORK_GROUP_SIZE);
    CASE(CL_INVALID_WORK_ITEM_SIZE);
    CASE(CL_INVALID_GLOBAL_OFFSET);
    CASE(CL_INVALID_EVENT_WAIT_LIST);
    CASE(CL_INVALID_EVENT);
    CASE(CL_INVALID_OPERATION);
    CASE(CL_INVALID_GL_OBJECT);
    CASE(CL_INVALID_BUFFER_SIZE);
    CASE(CL_INVALID_MIP_LEVEL);
    CASE(CL_INVALID_GLOBAL_WORK_SIZE);
  }
 #undef CASE
 }
 //
 // Read kernel source from file
 //
 static char *read_kernel_source(const char *filename, size_t *length)
 {
  FILE *fp = fopen(filename, "rb");
  if(!fp)
  {
    fprintf(stderr, "Failed to open kernel file: %s\n", filename);
    exit(1);
  }
  fseek(fp, 0, SEEK_END);
  *length = ftell(fp);
  fseek(fp, 0, SEEK_SET);
  char *source = (char *)malloc(*length + 1);
  if(!source)
  {
    fprintf(stderr, "Failed to allocate memory for kernel source\n");
    fclose(fp);
    exit(1);
  }
  size_t read = fread(source, 1, *length, fp);
  source[read] = '\0';
  fclose(fp);
  *length = read;
  return source;
 }
 //
 // Initialize OpenCL
 //
 static void initialize_ocl(ocl_data_t *od)
 {
  cl_uint num_platforms, num_devices;
  cl_int err;
  // Get platform
  CL_CALL(clGetPlatformIDs, (0, NULL, &num_platforms));
  if(od->platform_number >= (int)num_platforms)
  {
    fprintf(stderr, "Invalid platform number %d (only %u platforms available)\n",
            od->platform_number, num_platforms);
    exit(1);
  }
  cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
  CL_CALL(clGetPlatformIDs, (num_platforms, platforms, NULL));
  od->platform = platforms[od->platform_number];
  free(platforms);
  // Get device
  CL_CALL(clGetDeviceIDs, (od->platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));
  if(od->device_number >= (int)num_devices)
  {
    fprintf(stderr, "Invalid device number %d (only %u devices available)\n",
            od->device_number, num_devices);
    exit(1);
  }
  cl_device_id *devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
  CL_CALL(clGetDeviceIDs, (od->platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL));
  od->device = devices[od->device_number];
  free(devices);
  // Get device name
  CL_CALL(clGetDeviceInfo, (od->device, CL_DEVICE_NAME, sizeof(od->device_name),
                            od->device_name, NULL));
  printf("initialize_ocl(): OpenCL code running on %s\n", od->device_name);
  // Create context
  od->context = clCreateContext(NULL, 1, &od->device, NULL, NULL, &err);
  if(err != CL_SUCCESS)
  {
    fprintf(stderr, "clCreateContext() returned %s\n", cl_error_string(err));
    exit(1);
  }
  // Create command queue
  od->queue = clCreateCommandQueue(od->context, od->device, 0, &err);
  if(err != CL_SUCCESS)
  {
    fprintf(stderr, "clCreateCommandQueue() returned %s\n", cl_error_string(err));
    exit(1);
  }
  // Load and compile kernel
  size_t source_length;
  char *source = read_kernel_source(od->kernel_file_name, &source_length);
  od->program = clCreateProgramWithSource(od->context, 1, (const char **)&source,
                                          &source_length, &err);
  free(source);
  if(err != CL_SUCCESS)
  {
    fprintf(stderr, "clCreateProgramWithSource() returned %s\n", cl_error_string(err));
    exit(1);
  }
  err = clBuildProgram(od->program, 1, &od->device, "-cl-std=CL1.2", NULL, NULL);
  if(err != CL_SUCCESS)
  {
    fprintf(stderr, "clBuildProgram() returned %s\n", cl_error_string(err));
    // Get build log
    size_t log_size;
    clGetProgramBuildInfo(od->program, od->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    char *log = (char *)malloc(log_size);
    clGetProgramBuildInfo(od->program, od->device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
    fprintf(stderr, "Build log:\n%s\n", log);
    free(log);
    exit(1);
  }
  // Create kernel
  od->kernel = clCreateKernel(od->program, od->kernel_name, &err);
  if(err != CL_SUCCESS)
  {
    fprintf(stderr, "clCreateKernel() returned %s\n", cl_error_string(err));
    exit(1);
  }
  // Allocate host and device memory
  for(int i = 0; i < 2; i++)
  {
    if(od->data_size[i] > 0u)
    {
      od->host_data[i] = malloc(od->data_size[i]);
      if(!od->host_data[i])
      {
        fprintf(stderr, "Failed to allocate host memory\n");
        exit(1);
      }
      od->device_data[i] = clCreateBuffer(od->context, CL_MEM_READ_WRITE,
                                          od->data_size[i], NULL, &err);
      if(err != CL_SUCCESS)
      {
        fprintf(stderr, "clCreateBuffer() returned %s\n", cl_error_string(err));
        exit(1);
      }
    }
    else
    {
      od->host_data[i] = NULL;
    }
  }
 }
 //
 // Terminate OpenCL
 //
 static void terminate_ocl(ocl_data_t *od)
 {
  for(int i = 0; i < 2; i++)
  {
    if(od->data_size[i] > 0u)
    {
      free(od->host_data[i]);
      clReleaseMemObject(od->device_data[i]);
    }
  }
  clReleaseKernel(od->kernel);
  clReleaseProgram(od->program);
  clReleaseCommandQueue(od->queue);
  clReleaseContext(od->context);
 }
 //
 // Copy data between host and device
 //
 static void host_to_device_copy(ocl_data_t *od, int idx)
 {
  if(idx < 0 || idx > 1 || od->data_size[idx] == 0u)
  {
    fprintf(stderr, "host_to_device_copy(): bad idx\n");
    exit(1);
  }
  CL_CALL(clEnqueueWriteBuffer, (od->queue, od->device_data[idx], CL_TRUE, 0,
                                 od->data_size[idx], od->host_data[idx], 0, NULL, NULL));
 }
 static void device_to_host_copy(ocl_data_t *od, int idx)
 {
  if(idx < 0 || idx > 1 || od->data_size[idx] == 0u)
  {
    fprintf(stderr, "device_to_host_copy(): bad idx\n");
    exit(1);
  }
  CL_CALL(clEnqueueReadBuffer, (od->queue, od->device_data[idx], CL_TRUE, 0,
                                od->data_size[idx], od->host_data[idx], 0, NULL, NULL));
 }
 //
 // Set kernel argument
 //
 static void set_kernel_arg(ocl_data_t *od, int arg_idx, size_t arg_size, const void *arg_value)
 {
  CL_CALL(clSetKernelArg, (od->kernel, arg_idx, arg_size, arg_value));
 }
 //
 // Launch kernel
 //
 static void launch_kernel(ocl_data_t *od)
 {
  CL_CALL(clEnqueueNDRangeKernel, (od->queue, od->kernel, 1, NULL,
                                   &od->global_work_size, &od->local_work_size,
                                   0, NULL, NULL));
  CL_CALL(clFinish, (od->queue));
 }
 #endif
--- a/aad_sha1_ocl_kernel.cl
+++ b/aad_sha1_ocl_kernel.cl
@ -0,0 +1,175 @@
 //
 // Arquiteturas de Alto Desempenho 2025/2026
 //
 // SHA-1 OpenCL kernel - validation and testing
 //
 #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
 // SHA-1 functions
 #define SHA1_F1(x,y,z) ((x & y) | (~x & z))
 #define SHA1_K1 0x5A827999u
 #define SHA1_F2(x,y,z) (x ^ y ^ z)
 #define SHA1_K2 0x6ED9EBA1u
 #define SHA1_F3(x,y,z) ((x & y) | (x & z) | (y & z))
 #define SHA1_K3 0x8F1BBCDCu
 #define SHA1_F4(x,y,z) (x ^ y ^ z)
 #define SHA1_K4 0xCA62C1D6u
 //
 // sha1_ocl_kernel() --- each OpenCL work item computes the SHA-1 secure hash of one message
 //
 // This kernel should only be used to validate the SHA-1 secure hash code in OpenCL
 // It matches the structure of the CUDA sha1_cuda_kernel
 //
 // The data is organized in an interleaved fashion for optimal memory access:
 // For work group size W, the layout is:
 //   data[work_group_num][14][W]
 //   where data for the same index across all work items in a group are consecutive
 //
 __kernel void sha1_ocl_kernel(__global uint *interleaved_data, __global uint *interleaved_hash)
 {
  uint gid = get_global_id(0);
  uint local_id = get_local_id(0);
  uint local_size = get_local_size(0);
  uint group_id = get_group_id(0);
  // Adjust pointers for interleaved access pattern
  // Each work group processes local_size messages
  // Within a group, data is interleaved: all work items' data[0], then all data[1], etc.
  __global uint *data = &interleaved_data[group_id * (local_size * 14) + local_id];
  __global uint *hash = &interleaved_hash[group_id * (local_size * 5) + local_id];
  // Local storage for computation
  uint a, b, c, d, e, w[16];
  uint tmp;
  // Initial hash values
  a = 0x67452301u;
  b = 0xEFCDAB89u;
  c = 0x98BADCFEu;
  d = 0x10325476u;
  e = 0xC3D2E1F0u;
  // Load message schedule from interleaved data
  // DATA(idx) accesses data[idx * local_size] to get the correct interleaved value
  for(int i = 0; i < 14; i++)
    w[i] = data[local_size * i];
  w[14] = 0;
  w[15] = 440; // 55 bytes * 8 bits
  // SHA-1 compression - 80 rounds
  // Rounds 0-15 (no message schedule expansion needed)
  #define SHA1_STEP(F, K, t) \
    tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t] + K; \
    e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp;
  SHA1_STEP(SHA1_F1, SHA1_K1, 0);
  SHA1_STEP(SHA1_F1, SHA1_K1, 1);
  SHA1_STEP(SHA1_F1, SHA1_K1, 2);
  SHA1_STEP(SHA1_F1, SHA1_K1, 3);
  SHA1_STEP(SHA1_F1, SHA1_K1, 4);
  SHA1_STEP(SHA1_F1, SHA1_K1, 5);
  SHA1_STEP(SHA1_F1, SHA1_K1, 6);
  SHA1_STEP(SHA1_F1, SHA1_K1, 7);
  SHA1_STEP(SHA1_F1, SHA1_K1, 8);
  SHA1_STEP(SHA1_F1, SHA1_K1, 9);
  SHA1_STEP(SHA1_F1, SHA1_K1, 10);
  SHA1_STEP(SHA1_F1, SHA1_K1, 11);
  SHA1_STEP(SHA1_F1, SHA1_K1, 12);
  SHA1_STEP(SHA1_F1, SHA1_K1, 13);
  SHA1_STEP(SHA1_F1, SHA1_K1, 14);
  SHA1_STEP(SHA1_F1, SHA1_K1, 15);
  #undef SHA1_STEP
  // Rounds 16-79 with message schedule expansion
  #define SHA1_EXPAND_STEP(F, K, t) \
    tmp = w[(t-3) & 15] ^ w[(t-8) & 15] ^ w[(t-14) & 15] ^ w[(t-16) & 15]; \
    w[t & 15] = ROTATE_LEFT(tmp, 1); \
    tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t & 15] + K; \
    e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp;
  // Rounds 16-19 (still using F1)
  SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 16);
  SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 17);
  SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 18);
  SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 19);
  // Rounds 20-39 (F2)
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 20);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 21);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 22);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 23);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 24);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 25);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 26);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 27);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 28);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 29);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 30);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 31);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 32);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 33);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 34);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 35);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 36);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 37);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 38);
  SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 39);
  // Rounds 40-59 (F3)
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 40);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 41);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 42);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 43);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 44);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 45);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 46);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 47);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 48);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 49);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 50);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 51);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 52);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 53);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 54);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 55);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 56);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 57);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 58);
  SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 59);
  // Rounds 60-79 (F4)
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 60);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 61);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 62);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 63);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 64);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 65);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 66);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 67);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 68);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 69);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 70);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 71);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 72);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 73);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 74);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 75);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 76);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 77);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 78);
  SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 79);
  #undef SHA1_EXPAND_STEP
  // Add to initial hash values and store in interleaved format
  // HASH(idx) writes to hash[idx * local_size]
  hash[local_size * 0] = a + 0x67452301u;
  hash[local_size * 1] = b + 0xEFCDAB89u;
  hash[local_size * 2] = c + 0x98BADCFEu;
  hash[local_size * 3] = d + 0x10325476u;
  hash[local_size * 4] = e + 0xC3D2E1F0u;
 }
--- a/9
+++ b/9
@ -42,7 +42,7 @@ CUDA_ARCH = sm_86
 clean:
 	rm -f sha1_tests
 	rm -f sha1_cuda_test sha1_cuda_kernel.cubin
-	rm -f coin_miner_cpu coin_miner_simd coin_miner_cuda coin_miner_cuda_kernel.cubin
+	rm -f coin_miner_cpu coin_miner_simd coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl
 	rm -f coin_miner_wasm.js coin_miner_wasm.wasm
 	rm -f benchmark
 	rm -f a.out
@ -85,6 +85,9 @@ coin_miner_cuda_kernel.cubin:	aad_coin_miner_cuda_kernel.cu aad_sha1.h makefile
 coin_miner_cuda:	aad_coin_miner_cuda.c coin_miner_cuda_kernel.cubin aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h aad_vault.h aad_cuda_utilities.h makefile
 	cc -march=native -Wall -Wshadow -Werror -O3 -I$(CUDA_DIR)/include $< -o $@ -lcuda
 coin_miner_ocl:	aad_coin_miner_ocl.c aad_coin_miner_ocl_kernel.cl aad_sha1.h aad_sha1_cpu.h aad_sha1_ocl_kernel.cl aad_data_types.h aad_utilities.h aad_vault.h aad_ocl_utilities.h makefile
 	cc -march=native -Wall -Wshadow -O3 $< -o $@ -lOpenCL
 coin_miner_wasm:	aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h aad_data_types.h aad_utilities.h aad_vault.h makefile
 	emcc -O3 -flto -msimd128 -o coin_miner_wasm.js aad_coin_miner_wasm.c \
 		-s WASM=1 \
@ -98,8 +101,8 @@ coin_miner_wasm:	aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h
 benchmark:	aad_benchmark.c aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h makefile
 	cc -march=native -Wall -Wshadow -Werror -O3 $< -o $@
-miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda benchmark
+miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_ocl benchmark
 all: 	sha1_tests sha1_cuda_test sha1_cuda_kernel.cubin \
-		coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin \
+		coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl \
 		benchmark