7 changed files with 225 additions and 443 deletions
--- a/.gitignore
+++ b/.gitignore
@ -68,8 +68,6 @@ coin_miner_cuda
 coin_miner_ocl
 coin_miner_wasm.js
 coin_miner_wasm.wasm
 coin_miner_cpu_threads
 coin_miner_dna_shape_cuda
 # Vault
 deti_coins*_vault.txt
--- a/aad_coin_miner_cuda.c
+++ b/aad_coin_miner_cuda.c
@ -1,7 +1,7 @@
 //
 // Arquiteturas de Alto Desempenho 2025/2026
 //
-// DETI Coin Miner - Host Code
+// DETI Coin Miner - CUDA implementation with histograms
 //
 #include <time.h>
@ -11,123 +11,173 @@
 #include <signal.h>
 #include <getopt.h>
 #include "aad_data_types.h"
 #include "aad_utilities.h"
 #include "aad_sha1_cpu.h"
 #include "aad_cuda_utilities.h"
 #include "aad_vault.h"
-#define COINS_STORAGE_SIZE  2048u // Increased buffer slightly
+#define COINS_STORAGE_SIZE  1024u
 #define MAX_HISTOGRAM_BINS  100
 static volatile int keep_running = 1;
-void signal_handler(int signum) {
+void signal_handler(int signum)
 {
  (void)signum;
  keep_running = 0;
 }
-static double get_wall_time(void) {
+// Get current wall time in seconds
 static double get_wall_time(void)
 {
  // Read the monotonic clock; callers only ever subtract two readings
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  // Whole seconds plus the nanosecond remainder scaled to seconds
  const double whole_seconds = (double)now.tv_sec;
  const double nanoseconds = (double)now.tv_nsec;
  return whole_seconds + nanoseconds * 1.0e-9;
 }
 // Coin reconstruction from stored data
 static void reconstruct_coin(u32_t *stored_data, u32_t coin[14])
 {
  // The storage area already holds the coin verbatim, so transfer its
  // 14 words to the caller's array in order, one word at a time.
  u32_t *src = stored_data;
  u32_t *dst = coin;
  u32_t *const src_end = stored_data + 14;
  while(src != src_end)
    *dst++ = *src++;
 }
 //
 // Mine DETI coins using CUDA
 //
 static void mine_coins_cuda(u64_t max_attempts, double max_time)
 {
  cuda_data_t cd;
  u32_t *host_storage;
  u64_t attempts = 0;
-  u32_t coins_found_total = 0;
+  u32_t coins_found = 0;
  u32_t kernel_runs = 0;
  // Initialize CUDA
  memset(&cd, 0, sizeof(cd));
  cd.device_number = 0;
  cd.cubin_file_name = "coin_miner_cuda_kernel.cubin";
  cd.kernel_name = "mine_deti_coins_kernel";
  // Allocate memory for results [ Counter (1 u32) | Data ... ]
  cd.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t);
  cd.data_size[1] = 0;
  initialize_cuda(&cd);
  host_storage = (u32_t *)cd.host_data[0];
-  // Configure Launch Dimensions
+  // Kernel configuration
-  // Maximizing occupancy:
+  cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE;
-  cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE; // Usually 128 or 256
+  cd.grid_dim_x = 4096;  // Large grid for maximum GPU utilization
  cd.grid_dim_x = 80 * 4; // High number of blocks to hide latency
-  u32_t total_threads = cd.grid_dim_x * cd.block_dim_x;
+  u32_t n_threads = cd.grid_dim_x * cd.block_dim_x;
  u32_t attempts_per_thread = 4096; // Work per kernel launch
-  printf("Starting CUDA Miner on %s\n", cd.device_name);
+  printf("Mining DETI coins using CUDA...\n");
-  printf("Threads: %u, Attempts/Thread: %u\n", total_threads, attempts_per_thread);
+  printf("Grid: %u blocks × %u threads = %u total threads\n",
         cd.grid_dim_x, cd.block_dim_x, n_threads);
  printf("Kernel: %s\n", cd.kernel_name);
  if(max_attempts > 0 && max_time > 0)
    printf("Will stop after %llu attempts OR %.2f seconds (whichever comes first)\n",
           (unsigned long long)max_attempts, max_time);
  else if(max_attempts > 0)
    printf("Will stop after %llu attempts\n", (unsigned long long)max_attempts);
  else if(max_time > 0)
    printf("Will stop after %.2f seconds\n", max_time);
  else
    printf("Running indefinitely until Ctrl+C...\n");
  printf("Press Ctrl+C to stop\n\n");
  u64_t base_nonce = 0;
-  double start_time = get_wall_time();
+  u32_t attempts_per_thread = 1024 * 8; // Increased attempts per thread
-  // Arguments pointers
+  double start_time = get_wall_time();
-  cd.n_kernel_arguments = 3;
+  time_measurement();
  cd.arg[0] = &cd.device_data[0];
  cd.arg[1] = &base_nonce;
  cd.arg[2] = &attempts_per_thread;
  while(keep_running)
  {
-    // 1. Reset storage counter
+    // Check stopping conditions
-    host_storage[0] = 1u; // Index 0 is the atomic counter. Start data at index 1.
+    if(max_attempts > 0 && attempts >= max_attempts)
      break;
    double elapsed = get_wall_time() - start_time;
    if(max_time > 0 && elapsed >= max_time)
      break;
    // Initialize storage area
    host_storage[0] = 1u; // First unused index
    // Copy to device
    host_to_device_copy(&cd, 0);
-    // 2. Launch Kernel
+    // Set kernel arguments
    cd.n_kernel_arguments = 2;
    cd.arg[0] = &cd.device_data[0];
    cd.arg[1] = &base_nonce;
    cd.arg[2] = &attempts_per_thread;
    // Launch the CUDA kernel
    launch_kernel(&cd);
-    // 3. Retrieve Results
+    // Copy results back
    device_to_host_copy(&cd, 0);
-    // 4. Process Found Coins
+    // Process found coins
-    u32_t next_write_idx = host_storage[0];
+    u32_t n_coins_this_kernel = 0;
-    u32_t num_u32_written = next_write_idx - 1;
+    u32_t n_stored = (host_storage[0] - 1) / 14;
-    // Each coin is 14 u32 words
+    if(n_stored > 0 && host_storage[0] < COINS_STORAGE_SIZE)
    if(num_u32_written >= 14)
    {
-       int coins_in_batch = num_u32_written / 14;
+      for(u32_t i = 0; i < n_stored; i++)
-       for(int c = 0; c < coins_in_batch; c++)
+      {
-       {
+        u32_t coin[14];
-           u32_t found_coin[14];
+        reconstruct_coin(&host_storage[1 + i * 14], coin);
           // Copy from host buffer to temp array
           for(int w=0; w<14; w++) {
               found_coin[w] = host_storage[1 + (c * 14) + w];
           }
-           // Verify/Save using required function
+        coins_found++;
-           save_coin(found_coin);
+        n_coins_this_kernel++;
-           coins_found_total++;
+        printf("COIN FOUND! (kernel %u, coin %u in this kernel). Total coins:%u\n",
-           printf("Coin Found! Total: %u\n", coins_found_total);
+               kernel_runs, n_coins_this_kernel, coins_found);
-       }
+        save_coin(coin);
      }
    }
-    // 5. Update Progress
+    // Update counters
-    u64_t batch_attempts = (u64_t)total_threads * attempts_per_thread;
+    kernel_runs++;
-    attempts += batch_attempts;
+    u64_t attempts_this_launch = (u64_t)n_threads * attempts_per_thread;
-    base_nonce += batch_attempts; // Ensure next kernel uses new nonces
+    attempts += attempts_this_launch;
-
+    base_nonce += attempts_this_launch;
    // 6. Check Limits
    if((max_attempts > 0 && attempts >= max_attempts) ||
       (max_time > 0 && (get_wall_time() - start_time) >= max_time)) {
        break;
    }
  }
-  // Cleanup
+  time_measurement();
-  double total_time = get_wall_time() - start_time;
+  double total_time = cpu_time_delta();
-  printf("\nMining Finished.\n");
+
-  printf("Attempts: %llu\n", (unsigned long long)attempts);
+  printf("\n=== Mining Statistics ===\n");
-  printf("Time: %.4fs\n", total_time);
+  printf("Total attempts: %llu\n", (unsigned long long)attempts);
-  printf("Hashrate: %.2f MH/s\n", (attempts / total_time) / 1000000.0);
+  printf("Total time: %.2f seconds\n", total_time);
  printf("Average rate: %.2f attempts/second\n", attempts / total_time);
  printf("Coins found: %u\n", coins_found);
  printf("Kernel launches: %u\n", kernel_runs);
  // Save any remaining coins
  save_coin(NULL);
  save_coin(NULL); // Flush vault
  terminate_cuda(&cd);
 }
 void print_usage(const char *prog_name)
 {
  // Synopsis followed by the option summary (a single write of the fixed text)
  printf("Usage: %s [OPTIONS]\n"
         "Options:\n"
         "  -a <attempts>    Maximum number of attempts\n"
         "  -t <seconds>     Maximum time in seconds\n"
         "  -h               Show this help message\n",
         prog_name);
  // Worked examples; each line repeats the program name it was invoked with
  printf("\nExamples:\n"
         "  %s -a 1000000000           # Run for 1B attempts\n"
         "  %s -t 60                   # Run for 60 seconds\n"
         "  %s -a 1000000000 -t 60     # Stop at 1B attempts OR 60s (whichever first)\n"
         "  %s                         # Run indefinitely until Ctrl+C\n",
         prog_name, prog_name, prog_name, prog_name);
 }
 int main(int argc, char *argv[])
 {
  u64_t max_attempts = 0;
@ -136,17 +186,27 @@ int main(int argc, char *argv[])
  signal(SIGINT, signal_handler);
-  while((opt = getopt(argc, argv, "a:t:")) != -1)
+  // Parse command line options
  while((opt = getopt(argc, argv, "a:t:h")) != -1)
  {
-    switch(opt) {
+    switch(opt)
-      case 'a': max_attempts = strtoull(optarg, NULL, 10); break;
+    {
-      case 't': max_time = atof(optarg); break;
+      case 'a':
        max_attempts = strtoull(optarg, NULL, 10);
        break;
      case 't':
        max_time = atof(optarg);
        break;
      case 'h':
        print_usage(argv[0]);
        return 0;
      default:
-        fprintf(stderr, "Usage: %s -a <attempts> -t <seconds>\n", argv[0]);
+        print_usage(argv[0]);
        return 1;
    }
  }
  mine_coins_cuda(max_attempts, max_time);
  return 0;
 }
--- a/aad_coin_miner_cuda_kernel.cu
+++ b/aad_coin_miner_cuda_kernel.cu
@ -1,192 +1,96 @@
 //
 // Arquiteturas de Alto Desempenho 2025/2026
 //
-// DETI Coin Miner - CUDA kernel (Optimized)
+// DETI Coin Miner - CUDA kernel (optimized for mining)
 //
 #include "aad_sha1.h"
 #include "aad_data_types.h"
 //
-// Optimized CUDA kernel
+// Optimized CUDA kernel for DETI coin mining
 // Each thread generates coins using the same approach as CPU/SIMD miners
 //
-extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE, 1)
+
 extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
 void mine_deti_coins_kernel(u32_t *coins_storage_area, u64_t base_nonce, u32_t attempts_per_thread)
 {
-  u32_t coin[16]; // SHA1 requires 16 words (64 bytes)
+  u32_t coin[14];
  u32_t hash[5];
  u32_t n;
  u08_t *bytes = (u08_t *)coin;
-  // 1. Initialize Fixed Prefix: "DETI coin 2 " (12 bytes)
+  // Get thread index (used as offset from base counter)
-  coin[0] = (u32_t)'D' << 24 | (u32_t)'E' << 16 | (u32_t)'T' << 8 | (u32_t)'I';
+  n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;
  coin[1] = (u32_t)' ' << 24 | (u32_t)'c' << 16 | (u32_t)'o' << 8 | (u32_t)'i';
  coin[2] = (u32_t)'n' << 24 | (u32_t)' ' << 16 | (u32_t)'2' << 8 | (u32_t)' ';
-  // 2. Initialize Variable Part (Bytes 12 to 53)
+  // Initialize coin template: "DETI coin 2 " + variable + "\n\x80"
-  // Fill with a safe printable char ' ' (0x20)
+  // Use byte-swapped format to match host expectations (idx ^ 3)
-  #pragma unroll
+  coin[0] = ('D' << 24) + ('E' << 16) + ('T' << 8) + 'I';
-  for(int i = 3; i <= 12; i++) {
+  coin[1] = (' ' << 24) + ('c' << 16) + ('o' << 8) + 'i';
-    coin[i] = 0x20202020;
+  coin[2] = ('n' << 24) + (' ' << 16) + ('2' << 8) + ' ';
  }
  // Word 13 is partial variable + suffix
  // Bytes 52, 53 are variable. Byte 54 is '\n', Byte 55 is 0x80 (Padding)
  coin[13] = 0x20200A80;
-  // 3. Initialize SHA1 Length Padding
+  // Fill the variable part of the coin with a pattern
-  // Message is 55 bytes. Length in bits = 55 * 8 = 440.
+  for(int i = 3; i < 14; i++)
-  // SHA1 puts length at the very end (Word 15).
+    coin[i] = 0x41414141; // 'AAAA'
  coin[14] = 0x00000000;
  coin[15] = 440;
-  // 4. Thread Unique Initialization
+  // End with newline and padding
-  // Uses thread ID to set the initial state of the variable bytes
+  bytes[0x36 ^ 3] = '\n';   // Position 54
-  // to ensure every thread starts at a different point.
+  bytes[0x37 ^ 3] = 0x80;   // Position 55
  u64_t thread_id = (u64_t)blockIdx.x * blockDim.x + threadIdx.x;
  u64_t nonce_offset = base_nonce + thread_id * attempts_per_thread;
-  // Seeding the message with the nonce (Fast update of specific bytes)
+  for(u32_t i = 0; i < attempts_per_thread; ++i) {
-  u08_t *byte_ptr = (u08_t*)coin;
+    // Initialize variable part (positions 12-53, 42 bytes)
    // Start with A-Z pattern like CPU/SIMD miners
    for(int j = 12; j < 54; j++)
      bytes[j ^ 3] = 'A' + ((j - 12) % 26);
-  // Apply the nonce offset to the message structure
+    // Calculate offset based on thread index and parameters
-  u64_t temp_nonce = nonce_offset;
+    // This creates a unique starting point for each thread
-  for (int k = 12; k < 54 && temp_nonce > 0; k++) {
+    u64_t offset = base_nonce + n + (u64_t)i * gridDim.x * blockDim.x;
      u32_t val = byte_ptr[k ^ 3] + (temp_nonce % 95); // mod 95 to stay in printable ASCII
      temp_nonce /= 95;
-      if (val > 0x7E) { // Wrap around printable range
+    // Apply offset to variable part (increment the coin counter)
-          val -= 95;
+    for(int pos = 53; pos >= 12 && offset > 0; pos--)
          temp_nonce++; // Carry
      }
      byte_ptr[k ^ 3] = (u08_t)val;
  }
  // 5. Mining Loop
  for(u32_t attempt = 0; attempt < attempts_per_thread; attempt++)
  {
    // --- SHA1 HASH CALCULATION ---
    #define T            u32_t
    #define C(c)         (c)
    #define ROTATE(x,n)  (((x) << (n)) | ((x) >> (32 - (n))))
    #define DATA(idx)    coin[idx]
    #define HASH(idx)    hash[idx]
    CUSTOM_SHA1_CODE();
    #undef T
    #undef C
    #undef ROTATE
    #undef DATA
    #undef HASH
    // --- CHECK RESULT ---
    // Check for "aad20250" prefix (AAD20250 hex)
    if(hash[0] == 0xAAD20250u)
    {
-       // Found a candidate! Save it.
+      u08_t *byte = &bytes[pos ^ 3];
-       u32_t idx = atomicAdd(&coins_storage_area[0], 14u);
+      u64_t add = offset % 127;
      offset /= 127;
-       // Boundary check (first word is count, data starts at index 1)
+      u32_t val = *byte + add;
-       // We normalize the index to be relative to storage start
+      u08_t new_val = val % 127;
-       if(idx < 1024u - 15u) // Ensure space
+
-       {
+      // Skip newline character (ASCII 10) in the variable part
-          // Store valid coin (14 words = 56 bytes, covers the 55 byte content)
+      if(new_val == '\n')
-          // Adjust idx because coins_storage_area[0] is the counter
+        new_val++;
-          for(int w=0; w<14; w++) {
+
-              coins_storage_area[idx + w] = coin[w];
+      *byte = new_val;
-          }
+      offset += val / 127;  // Carry
       }
    }
-    // --- UPDATE MESSAGE (ODOMETER) ---
+  // Compute SHA1 hash
-    // Increment the message string for the next attempt
+# define T            u32_t
-    // Start at byte 53 (just before the \n) and work backwards if carry needed.
+# define C(c)         (c)
-
+# define ROTATE(x,n)  (((x) << (n)) | ((x) >> (32 - (n))))
-    int pos = 53;
+# define DATA(idx)    coin[idx]
-    while (pos >= 12) {
+# define HASH(idx)    hash[idx]
        u08_t *b = &byte_ptr[pos ^ 3];
        (*b)++;
        if (*b <= 0x7E) {
            break; // No carry, done incrementing
        }
        // Overflow printable range, reset to start of range (0x20) and carry
        *b = 0x20;
        pos--;
    }
  }
 }
 //
 // Kernel: Mines a coin where the first 48 bytes are FIXED (the visual pattern)
 // and only the 6 bytes of the mining area (bytes 48-53) are mutated to find the hash.
 //
 extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE, 1)
 void mine_visual_row_kernel(u32_t *coins_storage_area, u32_t *row_template, u64_t base_nonce)
 {
  u32_t coin[16]; // SHA1 working buffer
  u32_t hash[5];
  // 1. Load the template
  #pragma unroll
  for(int i = 0; i < 12; i++) {
      coin[i] = row_template[i];
  }
  // 2. Setup the "Mining Area" (Bytes 48-53)
  // Template provided by host: [ ... visual ... ] [ mining_space ] \n 0x80
  coin[12] = 0x41414141; // Initialize mining space with 'AAAA'
  coin[13] = row_template[13]; // This contains the \n (byte 54) and 0x80 (byte 55)
  // SHA1 Length padding (55 bytes = 440 bits)
  coin[14] = 0;
  coin[15] = 440;
  // 3. Thread unique nonce calculation
  u64_t thread_id = (u64_t)blockIdx.x * blockDim.x + threadIdx.x;
  u64_t nonce = base_nonce + thread_id; // Simple linear nonce
  // 4. Map nonce to the "Mining Area" (Bytes 48-53)
  // Change bytes from 48 to 53
  u08_t *bytes = (u08_t*)coin;
  u64_t temp_nonce = nonce;
  for(int k = 48; k <= 53; k++)
  {
      // Map to printable ASCII (0x21 to 0x7E) to avoid forbidden \n
      u32_t val = (bytes[k^3] + (temp_nonce % 90));
      temp_nonce /= 90;
      if(val > 0x7E) {
         val = 0x21 + (val - 0x7E); // Wrap
         temp_nonce++; // Carry
      }
      bytes[k^3] = (u08_t)val;
  }
  // 5. SHA1 Computation
  #define T            u32_t
  #define C(c)         (c)
  #define ROTATE(x,n)  (((x) << (n)) | ((x) >> (32 - (n))))
  #define DATA(idx)    coin[idx]
  #define HASH(idx)    hash[idx]
  CUSTOM_SHA1_CODE();
 # undef T
 # undef C
 # undef ROTATE
 # undef DATA
 # undef HASH
-  #undef T
+  // Check if this is a valid DETI coin
  #undef C
  #undef ROTATE
  #undef DATA
  #undef HASH
  // 6. Check Result
  if(hash[0] == 0xAAD20250u)
  {
-    u32_t idx = atomicAdd(&coins_storage_area[0], 14u);
+    // Found a coin! Store it atomically
-    if(idx < 1024u - 15u)
+    u32_t idx = atomicAdd(coins_storage_area, 14u);
-    {
+
-       // Save the found coin
+      // Make sure we don't write outside buffer
-       for(int w=0; w<14; w++) {
+      if(idx < 1024u - 14u)
-           coins_storage_area[idx + w] = coin[w];
+      {
-       }
+        // Store the complete coin data
        for(int k = 0; k < 14; k++)
          coins_storage_area[idx + k] = coin[k];
      }
    }
  }
 }
--- a/aad_coin_miner_dna_shape_cuda.c
+++ b/aad_coin_miner_dna_shape_cuda.c
@ -1,180 +0,0 @@
 //
 // Arquiteturas de Alto Desempenho 2025/2026
 //
 // DETI Coin Miner - DNA Helix Generator
 //
 #include <math.h>
 #include <time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <signal.h>
 #include "aad_data_types.h"
 #include "aad_sha1_cpu.h"
 #include "aad_cuda_utilities.h"
 #include "aad_vault.h"
 #define PI 3.14159265
 #define COINS_STORAGE_SIZE 1024u
 static volatile int keep_running = 1;
 void signal_handler(int signum)
 {
  // Clear the run flag; the loops in main() test it before each row and each launch
  keep_running = 0;
  // The signal number itself is not needed
  (void)signum;
 }
 //
 // Visual Generator: Creates one line of the DNA Helix
 //
 void generate_dna_row(int row_idx, u32_t *template_buffer) {
    // Builds the 56-byte coin template for helix row `row_idx` and packs it
    // into template_buffer[0..13] (14 words, first text byte in the most
    // significant byte of each word).
    //
    // Byte layout of the text before packing:
    //   bytes  0-11 : fixed header "DETI coin 2 "
    //   bytes 12-47 : 36-column drawing area (two strands plus optional rung)
    //   bytes 48-53 : mining area, filled with '.' placeholders
    //   byte  54    : '\n'
    //   byte  55    : 0x80 padding byte
    //
    // Parameters:
    //   row_idx         - row number; selects the phase of the two sine waves
    //   template_buffer - output, must have room for at least 14 u32_t words
    char line_str[64];
    memset(line_str, 0, sizeof(line_str));
    // 1. Standard Header (12 bytes)
    memcpy(line_str, "DETI coin 2 ", 12);
    // 2. The Visual Area (Bytes 12 to 47 -> 36 chars wide)
    // We draw two sine waves.
    // Center is roughly at relative index 18.
    int width = 36;
    int center = width / 2;
    double amplitude = 14.0;
    double frequency = 0.3;
    // Fill background with space
    for(int i=12; i < 48; i++) line_str[i] = ' ';
    // Calculate positions
    int pos1 = center + (int)(amplitude * sin(row_idx * frequency));
    int pos2 = center + (int)(amplitude * sin(row_idx * frequency + PI)); // 180 deg out of phase
    // Draw the helix strands
    // Valid visual range is index 12 to 47
    if(pos1 >= 0 && pos1 < width) line_str[12 + pos1] = '(';
    if(pos2 >= 0 && pos2 < width) line_str[12 + pos2] = ')';
    // Draw the "rungs" connecting the DNA strands
    int left = (pos1 < pos2) ? pos1 : pos2;
    int right = (pos1 < pos2) ? pos2 : pos1;
    // Add some "biology" chars in the middle (even rows only).
    // NOTE: when the strands coincide or are adjacent, the midpoint falls on
    // the left strand's cell and the rung character replaces it.
    if (row_idx % 2 == 0) {
        int mid = 12 + (left + right) / 2;
        line_str[mid] = (row_idx % 4 == 0) ? '-' : '+';
    }
    // 3. The Mining Area (Bytes 48-53)
    // Initialize with placeholders (the mining kernel is expected to replace
    // or mutate these bytes while searching)
    for(int i=48; i<54; i++) line_str[i] = '.';
    // 4. Mandatory Suffix
    line_str[54] = '\n';
    line_str[55] = (char)0x80; // Padding
    // 5. Convert char buffer to u32 array.
    // We pack 14 words (56 bytes) with explicit shifts so the word value does
    // not depend on the host byte order: text byte 4*i lands in bits 31..24.
    // The bytes are read as unsigned so that 0x80 is not sign-extended.
    for(int i=0; i<14; i++) {
        const u08_t *src = (const u08_t*)&line_str[i*4];
        template_buffer[i] = ((u32_t)src[0] << 24)
                           | ((u32_t)src[1] << 16)
                           | ((u32_t)src[2] << 8)
                           |  (u32_t)src[3];
    }
 }
 int main(int argc, char *argv[])
 {
  cuda_data_t cd;
  u32_t *host_storage;
  u32_t *host_template;
  u64_t base_nonce = 0;
  int current_row = 0;
  signal(SIGINT, signal_handler);
  // Initialize CUDA
  memset(&cd, 0, sizeof(cd));
  cd.device_number = 0;
  cd.cubin_file_name = "coin_miner_cuda_kernel.cubin";
  cd.kernel_name = "mine_visual_row_kernel"; // Note the new kernel name
  // Allocations
  cd.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t); // Storage for found coins
  cd.data_size[1] = 16 * sizeof(u32_t); // Storage for the Row Template
  initialize_cuda(&cd);
  host_storage = (u32_t *)cd.host_data[0];
  host_template = (u32_t *)cd.host_data[1];
  // Configure Kernel
  cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE;
  cd.grid_dim_x = 128; // Smaller grid is fine since we stop as soon as we find ONE coin
  printf("Generating DNA Helix Blockchain...\n");
  printf("Press Ctrl+C to stop.\n\n");
  cd.n_kernel_arguments = 3;
  cd.arg[0] = &cd.device_data[0]; // Storage
  cd.arg[1] = &cd.device_data[1]; // Template
  cd.arg[2] = &base_nonce;        // Nonce
  while(keep_running)
  {
      // 1. Generate the visual template for this specific row
      generate_dna_row(current_row, host_template);
      // 2. Reset storage counter
      host_storage[0] = 1u;
      // 3. Copy Template and Reset Counter to GPU
      // We copy both buffers (idx 0 and 1)
      host_to_device_copy(&cd, 0);
      host_to_device_copy(&cd, 1);
      int coin_found = 0;
      // 4. Loop until we find a coin for THIS row
      while(!coin_found && keep_running) {
          cd.arg[2] = &base_nonce; // Update nonce pointer arg
          launch_kernel(&cd);
          // Check if we found something
          device_to_host_copy(&cd, 0);
          u32_t count = host_storage[0];
          if(count > 1) {
              // Coin found!
              u32_t coin[14];
              // Extract the first found coin
              for(int i=0; i<14; i++) coin[i] = host_storage[1+i];
              save_coin(coin); // Save to disk
              // Visual Feedback to Console (Reconstruct char string for display)
              char debug_str[56];
              for(int i=0; i<14; i++) {
                 u32_t w = coin[i];
                 debug_str[i*4+0] = (w >> 24) & 0xFF;
                 debug_str[i*4+1] = (w >> 16) & 0xFF;
                 debug_str[i*4+2] = (w >> 8) & 0xFF;
                 debug_str[i*4+3] = w & 0xFF;
              }
              // Only print the visual part (hide the ugly mining bits at the end)
              printf("%.54s\n", debug_str);
              coin_found = 1;
              current_row++; // Advance to next visual row
          }
          base_nonce += (cd.grid_dim_x * cd.block_dim_x);
      }
  }
  save_coin(NULL);
  terminate_cuda(&cd);
  return 0;
 }
--- a/aad_coin_miner_simd.c
+++ b/aad_coin_miner_simd.c
@ -211,13 +211,13 @@ static void mine_coins_avx(u64_t max_attempts, double max_time)
    }
    // Print progress every 1M attempts
-    // if(attempts % 1000000 < SIMD_WIDTH)
+    if(attempts % 1000000 < SIMD_WIDTH)
-    // {
+    {
-    //   elapsed = get_wall_time() - start_time;
+      elapsed = get_wall_time() - start_time;
-    //   double rate = attempts / elapsed;
+      double rate = attempts / elapsed;
-    //   printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Elapsed: %.2fs\n",
+      printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Elapsed: %.2fs\n",
-    //          (unsigned long long)attempts, rate / 1e6, coins_found, elapsed);
+             (unsigned long long)attempts, rate / 1e6, coins_found, elapsed);
-    // }
+    }
  }
  double total_time = get_wall_time() - start_time;
@ -313,13 +313,13 @@ static void mine_coins_avx2(u64_t max_attempts, double max_time)
      }
    }
-    // if(attempts % 1000000 < SIMD_WIDTH)
+    if(attempts % 1000000 < SIMD_WIDTH)
-    // {
+    {
-    //   elapsed = get_wall_time() - start_time;
+      elapsed = get_wall_time() - start_time;
-    //   double rate = attempts / elapsed;
+      double rate = attempts / elapsed;
-    //   printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Elapsed: %.2fs\n",
+      printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Elapsed: %.2fs\n",
-    //          (unsigned long long)attempts, rate / 1e6, coins_found, elapsed);
+             (unsigned long long)attempts, rate / 1e6, coins_found, elapsed);
-    // }
+    }
  }
  double total_time = get_wall_time() - start_time;
@ -452,14 +452,14 @@ static void mine_coins_avx2_omp(u64_t max_attempts, double max_time)
          #pragma omp atomic read
          current_attempts = attempts;
-          // if(current_attempts - last_reported_attempts >= 1000000)
+          if(current_attempts - last_reported_attempts >= 1000000)
-          // {
+          {
-          //   double elapsed = get_wall_time() - start_time;
+            double elapsed = get_wall_time() - start_time;
-          //   double rate = current_attempts / elapsed;
+            double rate = current_attempts / elapsed;
-          //   printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Elapsed: %.2fs\n",
+            printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Elapsed: %.2fs\n",
-          //          (unsigned long long)current_attempts, rate / 1e6, coins_found, elapsed);
+                   (unsigned long long)current_attempts, rate / 1e6, coins_found, elapsed);
-          //   last_reported_attempts = current_attempts;
+            last_reported_attempts = current_attempts;
-          // }
+          }
        }
      }
    }
--- a/aad_sha1.h
+++ b/aad_sha1.h
@ -28,7 +28,7 @@
 //
 // we place this here to simplify things (aad_sha1_cuda_kernel.cu includes this file...)
 //
-#define RECOMMENDED_CUDA_BLOCK_SIZE  256
+#define RECOMMENDED_CUDA_BLOCK_SIZE  128
 //
--- a/14
+++ b/14
@ -42,7 +42,7 @@ CUDA_ARCH = sm_86
 clean:
 	rm -f sha1_tests
 	rm -f sha1_cuda_test sha1_cuda_kernel.cubin
-	rm -f coin_miner_cpu coin_miner_simd coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl coin_miner_dna_shape_cuda
+	rm -f coin_miner_cpu coin_miner_simd coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl
 	rm -f coin_miner_wasm.js coin_miner_wasm.wasm
 	rm -f benchmark
 	rm -f a.out
@ -77,7 +77,7 @@ coin_miner_cpu:	aad_coin_miner_cpu.c aad_sha1.h aad_sha1_cpu.h aad_data_types.h
 	cc -march=native -Wall -Wshadow -Werror -O3 $< -o $@
 coin_miner_simd:	aad_coin_miner_simd.c aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h aad_vault.h makefile
-	cc -march=native -Wall -Wshadow -fopenmp -mavx2 -O3 $< -o $@
+	cc -march=native -Wall -Wshadow -Werror -fopenmp -mavx2 -O3 $< -o $@
 coin_miner_cuda_kernel.cubin:	aad_coin_miner_cuda_kernel.cu aad_sha1.h makefile
 	nvcc -arch=$(CUDA_ARCH) --compiler-options -O2,-Wall -I$(CUDA_DIR)/include --cubin $< -o $@
@ -88,9 +88,6 @@ coin_miner_cuda:	aad_coin_miner_cuda.c coin_miner_cuda_kernel.cubin aad_sha1.h a
 coin_miner_ocl:	aad_coin_miner_ocl.c aad_coin_miner_ocl_kernel.cl aad_sha1.h aad_sha1_cpu.h aad_sha1_ocl_kernel.cl aad_data_types.h aad_utilities.h aad_vault.h aad_ocl_utilities.h makefile
 	cc -march=native -Wall -Wshadow -O3 $< -o $@ -lOpenCL
 coin_miner_dna_shape_cuda:	aad_coin_miner_dna_shape_cuda.c coin_miner_cuda_kernel.cubin aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h aad_vault.h aad_cuda_utilities.h makefile
 	cc -march=native -Wall -Wshadow -Werror -O3 -I$(CUDA_DIR)/include $< -o $@ -lcuda -lm
 coin_miner_wasm:	aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h aad_data_types.h aad_utilities.h aad_vault.h makefile
 	emcc -O3 -flto -msimd128 -o coin_miner_wasm.js aad_coin_miner_wasm.c \
 		-s WASM=1 \
@ -101,8 +98,11 @@ coin_miner_wasm:	aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h
 		-s EXPORT_NAME='CoinMinerModule' \
 		-s INITIAL_MEMORY=67108864
 benchmark:	aad_benchmark.c aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h makefile
 	cc -march=native -Wall -Wshadow -Werror -O3 $< -o $@
-miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_ocl
+miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_ocl benchmark
 all: 	sha1_tests sha1_cuda_test sha1_cuda_kernel.cubin \
-		coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl
+		coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl \
 		benchmark