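2 typos fixed (RECOMENDED -> RECOMMENDED, lauch_kernel -> launch_kernel) + mine_deti_coins_kernel now uses byte values 0-126 (mod 127, newline skipped) + histogram/per-kernel timing statistics removed from the CUDA miner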

Signed-off-by: RubenCGomes <rlcg@ua.pt>
2025-11-11 00:15:42 +00:00 · 2025-11-11 00:15:42 +00:00 · cbb14ce858
parent bd06cb1133
commit cbb14ce858
6 changed files with 39 additions and 133 deletions
--- a/aad_coin_miner_cuda.c
+++ b/aad_coin_miner_cuda.c
@ -26,47 +26,6 @@ void signal_handler(int signum)
  keep_running = 0;
 }

-// Histogram data structures
-typedef struct {
-  u32_t bins[MAX_HISTOGRAM_BINS];
-  u32_t count;
-  double min_value;
-  double max_value;
-} histogram_t;
-
-static void histogram_init(histogram_t *h)
-{
-  memset(h->bins, 0, sizeof(h->bins));
-  h->count = 0;
-  h->min_value = 1e99;
-  h->max_value = 0.0;
-}
-
-static void histogram_add(histogram_t *h, double value)
-{
-  if(value < h->min_value)
-    h->min_value = value;
-  if(value > h->max_value)
-    h->max_value = value;
-  h->count++;
-
-  // For now, just count - we'll bin them later when printing
-}
-
-static void histogram_print(histogram_t *h, const char *title, int n_bins)
-{
-  if(h->count == 0)
-  {
-    printf("%s: No data\n", title);
-    return;
-  }
-
-  printf("\n%s:\n", title);
-  printf("  Count: %u\n", h->count);
-  printf("  Min: %.6f\n", h->min_value);
-  printf("  Max: %.6f\n", h->max_value);
-  printf("  Avg: %.6f\n", (h->min_value + h->max_value) / 2.0);
-}

 // Coin reconstruction from stored data
 static void reconstruct_coin(u32_t *stored_data, u32_t coin[14])
@ -87,19 +46,6 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
  u32_t coins_found = 0;
  u32_t kernel_runs = 0;

-  // Histograms
-  histogram_t time_histogram;
-  histogram_t coins_histogram;
-  double *kernel_times = NULL;
-  u32_t *kernel_coin_counts = NULL;
-  u32_t histogram_capacity = 10000;
-
-  histogram_init(&time_histogram);
-  histogram_init(&coins_histogram);
-
-  kernel_times = (double *)malloc(histogram_capacity * sizeof(double));
-  kernel_coin_counts = (u32_t *)malloc(histogram_capacity * sizeof(u32_t));
-
  // Initialize CUDA
  cd.device_number = 0;
  cd.cubin_file_name = "coin_miner_cuda_kernel.cubin";
@ -112,7 +58,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
  host_storage = (u32_t *)cd.host_data[0];

  // Kernel configuration
-  cd.block_dim_x = RECOMENDED_CUDA_BLOCK_SIZE;
+  cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE;
  cd.grid_dim_x = 4096;  // Large grid for maximum GPU utilization

  u32_t n_threads = cd.grid_dim_x * cd.block_dim_x;
@ -128,9 +74,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
  int scan_pos = 12;

  time_measurement();
-  time_measurement();
-  double start_time = wall_time_delta();
-  double last_report_time = 0.0;
+  // double start_time = wall_time_delta();

  while(keep_running && (max_attempts == 0 || attempts < max_attempts))
  {
@ -148,13 +92,8 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
    if(use_scan_kernel)
      cd.arg[3] = &scan_pos;

-    // Launch kernel and measure time
-    time_measurement();
-    double kernel_start = cpu_time_delta();
-    lauch_kernel(&cd);
-    time_measurement();
-    double kernel_end = cpu_time_delta();
-    double kernel_time = kernel_end - kernel_start;
+    // Launch the CUDA kernel
+    launch_kernel(&cd);

    // Copy results back
    device_to_host_copy(&cd, 0);
@ -203,16 +142,6 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
      }
    }

-    // Update histograms
-    if(kernel_runs < histogram_capacity)
-    {
-      kernel_times[kernel_runs] = kernel_time;
-      kernel_coin_counts[kernel_runs] = n_coins_this_kernel;
-    }
-
-    histogram_add(&time_histogram, kernel_time);
-    histogram_add(&coins_histogram, (double)n_coins_this_kernel);
-
    // Update counters
    kernel_runs++;
    if(use_scan_kernel)
@ -226,21 +155,11 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
    if(use_scan_kernel)
      scan_pos = (scan_pos + 1) % 42 + 12; // Cycle through positions 12-53

-    // Print progress every second
-    time_measurement();
-    double current_time = wall_time_delta() - start_time;
-    if(current_time - last_report_time >= 1.0)
-    {
-      double rate = attempts / current_time;
-      printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u, Avg time: %.6f s\n",
-             (unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs,
-             current_time / kernel_runs);
-      last_report_time = current_time;
-    }
+
  }

  time_measurement();
-  double total_time = wall_time_delta() - start_time;
+  double total_time = cpu_time_delta();

  printf("\n=== Mining Statistics ===\n");
  printf("Total attempts: %llu\n", (unsigned long long)attempts);
@ -249,30 +168,10 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
  printf("Coins found: %u\n", coins_found);
  printf("Kernel launches: %u\n", kernel_runs);

-  // Print histograms
-  histogram_print(&time_histogram, "Kernel Execution Time Histogram", 20);
-  histogram_print(&coins_histogram, "Coins Found Per Kernel Histogram", 10);
-
-  // Save detailed histogram data
-  FILE *fp = fopen("cuda_kernel_stats.csv", "w");
-  if(fp != NULL)
-  {
-    fprintf(fp, "kernel_id,time_seconds,coins_found\n");
-    u32_t n_to_save = (kernel_runs < histogram_capacity) ? kernel_runs : histogram_capacity;
-    for(u32_t i = 0; i < n_to_save; i++)
-    {
-      fprintf(fp, "%u,%.9f,%u\n", i, kernel_times[i], kernel_coin_counts[i]);
-    }
-    fclose(fp);
-    printf("\nDetailed statistics saved to cuda_kernel_stats.csv\n");
-  }

  // Save any remaining coins
  save_coin(NULL);

-  // Cleanup
-  free(kernel_times);
-  free(kernel_coin_counts);
  terminate_cuda(&cd);
 }

--- a/aad_coin_miner_cuda_kernel.cu
+++ b/aad_coin_miner_cuda_kernel.cu
@ -5,17 +5,14 @@
 //

 #include "aad_sha1.h"
-
-typedef unsigned int u32_t;
-typedef unsigned char u08_t;
-typedef unsigned long long u64_t;
+#include "aad_data_types.h"

 //
 // Optimized CUDA kernel for DETI coin mining
 // Each thread generates coins using the same approach as CPU/SIMD miners
 //

-extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
+extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
 void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2)
 {
  u32_t coin[14];
@ -47,16 +44,21 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
  offset += (u64_t)n;

  // Apply offset to variable part (increment the coin counter)
-  // Use the same carry logic as CPU/SIMD miners
  for(int pos = 53; pos >= 12 && offset > 0; pos--)
  {
    u08_t *byte = &bytes[pos ^ 3];
-    u64_t add = offset % 95;  // Range: 32-126 (95 values)
-    offset /= 95;
+    u64_t add = offset % 127;
+    offset /= 127;

-    u32_t val = (*byte - 32 + add);
-    *byte = 32 + (val % 95);
-    offset += val / 95;  // Carry
+    u32_t val = *byte + add;
+    u08_t new_val = val % 127;
+
+    // Skip newline character (ASCII 10) in the variable part
+    if(new_val == '\n')
+      new_val++;
+
+    *byte = new_val;
+    offset += val / 127;  // Carry
  }

  // Compute SHA1 hash
@ -92,7 +94,7 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
 // Kernel that tries all possible values for one character position
 //

-extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
+extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
 void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2, int scan_position)
 {
  u32_t coin[14];
@ -130,7 +132,13 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t
    offset /= 95;
    
    u32_t val = (*byte - 32 + add);
-    *byte = 32 + (val % 95);
+    u08_t new_val = 32 + (val % 95);
+
+    // Skip newline character (ASCII 10) in the variable part
+    if(new_val == '\n')
+      new_val++;
+
+    *byte = new_val;
    offset += val / 95;
  }

--- a/aad_cuda_utilities.h
+++ b/aad_cuda_utilities.h
@ -36,7 +36,7 @@ typedef struct
  CUdeviceptr  device_data[2];   // the pointers to the device data
  // launch kernel data
  unsigned int grid_dim_x;       // the number of grid blocks (in the X dimension, the only one we will use here)
-  unsigned int block_dim_x;      // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMENDED_CUDA_BLOCK_SIZE)
+  unsigned int block_dim_x;      // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMMENDED_CUDA_BLOCK_SIZE)
  int n_kernel_arguments;        // number of kernel arguments
  void *arg[MAX_N_ARGUMENTS];    // pointers to the kernel argument data

@ -260,10 +260,10 @@ static void device_to_host_copy(cuda_data_t *cd,int idx)
 // launch a CUDA kernel (with 0 bytes of shared memory and no extra options)
 //

-static void lauch_kernel(cuda_data_t *cd)
+static void launch_kernel(cuda_data_t *cd)
 {
-  if(cd->block_dim_x != (unsigned int)RECOMENDED_CUDA_BLOCK_SIZE)
-    fprintf(stderr,"lauch_kernel(): block_dim_x should be equal to %d\n",RECOMENDED_CUDA_BLOCK_SIZE);
+  if(cd->block_dim_x != (unsigned int)RECOMMENDED_CUDA_BLOCK_SIZE)
+    fprintf(stderr,"launch_kernel(): block_dim_x should be equal to %d\n",RECOMMENDED_CUDA_BLOCK_SIZE);
  CU_CALL( cuLaunchKernel , (cd->cu_kernel,cd->grid_dim_x,1u,1u,cd->block_dim_x,1u,1u,0u,cd->cu_stream,&cd->arg[0],NULL) );
  synchronize_cuda(cd);
 }
--- a/aad_sha1.h
+++ b/aad_sha1.h
@ -28,7 +28,7 @@
 //
 // we place this here to simplify things (aad_sha1_cuda_kernel.cu includes this file...)
 //
-#define RECOMENDED_CUDA_BLOCK_SIZE  128
+#define RECOMMENDED_CUDA_BLOCK_SIZE  128


 //
--- a/aad_sha1_cuda_kernel.cu
+++ b/aad_sha1_cuda_kernel.cu
@ -12,8 +12,7 @@
 //

 #include "aad_sha1.h"
-
-typedef unsigned int u32_t;
+#include "aad_data_types.h"

 //
 // the nvcc compiler stores w[] in registers (constant indices!)
@ -23,7 +22,7 @@ typedef unsigned int u32_t;
 // warp thread number: n & 31 -- the lane
 //

-extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
+extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
 void sha1_cuda_kernel(u32_t *interleaved32_data,u32_t *interleaved32_hash)
 {
  u32_t n;
--- a/aad_sha1_cuda_test.c
+++ b/aad_sha1_cuda_test.c
@ -19,7 +19,7 @@ static void test_sha1_cuda(int n_tests)
  double host_to_device_time,kernel_time,device_to_host_time,hashes_per_second;
  cuda_data_t cd;

-  if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMENDED_CUDA_BLOCK_SIZE != 0)
+  if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMMENDED_CUDA_BLOCK_SIZE != 0)
  {
    fprintf(stderr,"test_sha1_cuda(): bad number of tests\n");
    exit(1);
@ -44,13 +44,13 @@ static void test_sha1_cuda(int n_tests)
  host_to_device_copy(&cd,0); // idx=0 means that the interleaved32_data is copied to the CUDA device
  time_measurement();
  host_to_device_time = wall_time_delta();
-  cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMENDED_CUDA_BLOCK_SIZE;
-  cd.block_dim_x = (u32_t)RECOMENDED_CUDA_BLOCK_SIZE;
+  cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE;
+  cd.block_dim_x = (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE;
  cd.n_kernel_arguments = 2;
  cd.arg[0] = &cd.device_data[0]; // interleaved32_data
  cd.arg[1] = &cd.device_data[1]; // interleaved32_hash
  time_measurement();
-  lauch_kernel(&cd);
+  launch_kernel(&cd);
  time_measurement();
  kernel_time = wall_time_delta();
  time_measurement();