Fix 2 typos (RECOMENDED -> RECOMMENDED, lauch_kernel -> launch_kernel); CUDA kernel now uses byte values 0-126 (modulo 127), skipping newline

Signed-off-by: RubenCGomes <rlcg@ua.pt>
This commit is contained in:
RubenCGomes 2025-11-11 00:15:42 +00:00
parent bd06cb1133
commit cbb14ce858
No known key found for this signature in database
GPG Key ID: 0D213021197E3EE0
6 changed files with 39 additions and 133 deletions

View File

@ -26,47 +26,6 @@ void signal_handler(int signum)
keep_running = 0;
}
// Histogram data structures
//
// Running summary of a stream of samples: a sample counter, the observed extremes,
// and an array of per-bin counters.
typedef struct {
// per-bin counters (MAX_HISTOGRAM_BINS of them); histogram_add() does not write to them
u32_t bins[MAX_HISTOGRAM_BINS];
// number of samples recorded so far
u32_t count;
// smallest sample recorded so far
double min_value;
// largest sample recorded so far
double max_value;
} histogram_t;
// Reset a histogram to its empty state.
//
// h: histogram to reset (must not be NULL). Every bin and the sample count are set to zero.
//
// The extremes are seeded with symmetric sentinels (+1e99 / -1e99) so that the first
// sample of magnitude below 1e99 replaces both of them. Seeding max_value with 0.0
// instead would make a series of negative samples report a maximum of 0.0 that was
// never actually observed.
static void histogram_init(histogram_t *h)
{
  memset(h->bins, 0, sizeof(h->bins));
  h->count = 0;
  h->min_value = 1e99;
  h->max_value = -1e99;
}
// Record one sample.
//
// h:     histogram to update (must not be NULL)
// value: the sample
//
// Bumps the sample counter and widens the [min_value, max_value] range when the sample
// falls outside it. The bins[] array is deliberately left alone here: binning is
// deferred (it is meant to happen later, at print time).
static void histogram_add(histogram_t *h, double value)
{
  h->count += 1;
  h->max_value = (value > h->max_value) ? value : h->max_value;
  h->min_value = (value < h->min_value) ? value : h->min_value;
}
// Print a short summary of a histogram to stdout.
//
// h:      histogram to summarise (must not be NULL)
// title:  heading printed before the figures
// n_bins: currently unused (no bins are printed); kept so existing callers compile unchanged
//
// Prints "<title>: No data" when no sample has been recorded. Otherwise prints the
// sample count, the minimum, the maximum and the midpoint of [min, max].
// That last figure is labelled "Midrange" rather than "Avg": the structure keeps no
// running sum, so a true mean cannot be computed here, and (min + max) / 2 is only
// equal to the mean for symmetric data.
static void histogram_print(histogram_t *h, const char *title, int n_bins)
{
  (void)n_bins; // accepted for interface compatibility only
  if(h->count == 0)
  {
    printf("%s: No data\n", title);
    return;
  }
  printf("\n%s:\n", title);
  printf(" Count: %u\n", h->count);
  printf(" Min: %.6f\n", h->min_value);
  printf(" Max: %.6f\n", h->max_value);
  printf(" Midrange: %.6f\n", (h->min_value + h->max_value) / 2.0);
}
// Coin reconstruction from stored data
static void reconstruct_coin(u32_t *stored_data, u32_t coin[14])
@ -87,19 +46,6 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
u32_t coins_found = 0;
u32_t kernel_runs = 0;
// Histograms
histogram_t time_histogram;
histogram_t coins_histogram;
double *kernel_times = NULL;
u32_t *kernel_coin_counts = NULL;
u32_t histogram_capacity = 10000;
histogram_init(&time_histogram);
histogram_init(&coins_histogram);
kernel_times = (double *)malloc(histogram_capacity * sizeof(double));
kernel_coin_counts = (u32_t *)malloc(histogram_capacity * sizeof(u32_t));
// Initialize CUDA
cd.device_number = 0;
cd.cubin_file_name = "coin_miner_cuda_kernel.cubin";
@ -112,7 +58,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
host_storage = (u32_t *)cd.host_data[0];
// Kernel configuration
cd.block_dim_x = RECOMENDED_CUDA_BLOCK_SIZE;
cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE;
cd.grid_dim_x = 4096; // Large grid for maximum GPU utilization
u32_t n_threads = cd.grid_dim_x * cd.block_dim_x;
@ -128,9 +74,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
int scan_pos = 12;
time_measurement();
time_measurement();
double start_time = wall_time_delta();
double last_report_time = 0.0;
// double start_time = wall_time_delta();
while(keep_running && (max_attempts == 0 || attempts < max_attempts))
{
@ -148,13 +92,8 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
if(use_scan_kernel)
cd.arg[3] = &scan_pos;
// Launch kernel and measure time
time_measurement();
double kernel_start = cpu_time_delta();
lauch_kernel(&cd);
time_measurement();
double kernel_end = cpu_time_delta();
double kernel_time = kernel_end - kernel_start;
// Launch the CUDA kernel
launch_kernel(&cd);
// Copy results back
device_to_host_copy(&cd, 0);
@ -203,16 +142,6 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
}
}
// Update histograms
if(kernel_runs < histogram_capacity)
{
kernel_times[kernel_runs] = kernel_time;
kernel_coin_counts[kernel_runs] = n_coins_this_kernel;
}
histogram_add(&time_histogram, kernel_time);
histogram_add(&coins_histogram, (double)n_coins_this_kernel);
// Update counters
kernel_runs++;
if(use_scan_kernel)
@ -226,21 +155,11 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
if(use_scan_kernel)
scan_pos = (scan_pos + 1) % 42 + 12; // Cycle through positions 12-53
// Print progress every second
time_measurement();
double current_time = wall_time_delta() - start_time;
if(current_time - last_report_time >= 1.0)
{
double rate = attempts / current_time;
printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u, Avg time: %.6f s\n",
(unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs,
current_time / kernel_runs);
last_report_time = current_time;
}
}
time_measurement();
double total_time = wall_time_delta() - start_time;
double total_time = cpu_time_delta();
printf("\n=== Mining Statistics ===\n");
printf("Total attempts: %llu\n", (unsigned long long)attempts);
@ -249,30 +168,10 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
printf("Coins found: %u\n", coins_found);
printf("Kernel launches: %u\n", kernel_runs);
// Print histograms
histogram_print(&time_histogram, "Kernel Execution Time Histogram", 20);
histogram_print(&coins_histogram, "Coins Found Per Kernel Histogram", 10);
// Save detailed histogram data
FILE *fp = fopen("cuda_kernel_stats.csv", "w");
if(fp != NULL)
{
fprintf(fp, "kernel_id,time_seconds,coins_found\n");
u32_t n_to_save = (kernel_runs < histogram_capacity) ? kernel_runs : histogram_capacity;
for(u32_t i = 0; i < n_to_save; i++)
{
fprintf(fp, "%u,%.9f,%u\n", i, kernel_times[i], kernel_coin_counts[i]);
}
fclose(fp);
printf("\nDetailed statistics saved to cuda_kernel_stats.csv\n");
}
// Save any remaining coins
save_coin(NULL);
// Cleanup
free(kernel_times);
free(kernel_coin_counts);
terminate_cuda(&cd);
}

View File

@ -5,17 +5,14 @@
//
#include "aad_sha1.h"
typedef unsigned int u32_t;
typedef unsigned char u08_t;
typedef unsigned long long u64_t;
#include "aad_data_types.h"
//
// Optimized CUDA kernel for DETI coin mining
// Each thread generates coins using the same approach as CPU/SIMD miners
//
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2)
{
u32_t coin[14];
@ -47,16 +44,21 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
offset += (u64_t)n;
// Apply offset to variable part (increment the coin counter)
// Use the same carry logic as CPU/SIMD miners
for(int pos = 53; pos >= 12 && offset > 0; pos--)
{
u08_t *byte = &bytes[pos ^ 3];
u64_t add = offset % 95; // Range: 32-126 (95 values)
offset /= 95;
u32_t val = (*byte - 32 + add);
*byte = 32 + (val % 95);
offset += val / 95; // Carry
u64_t add = offset % 127;
offset /= 127;
u32_t val = *byte + add;
u08_t new_val = val % 127;
// Skip newline character (ASCII 10) in the variable part
if(new_val == '\n')
new_val++;
*byte = new_val;
offset += val / 127; // Carry
}
// Compute SHA1 hash
@ -92,7 +94,7 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
// Kernel that tries all possible values for one character position
//
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2, int scan_position)
{
u32_t coin[14];
@ -130,7 +132,13 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t
offset /= 95;
u32_t val = (*byte - 32 + add);
*byte = 32 + (val % 95);
u08_t new_val = 32 + (val % 95);
// Skip newline character (ASCII 10) in the variable part
if(new_val == '\n')
new_val++;
*byte = new_val;
offset += val / 95;
}

View File

@ -36,7 +36,7 @@ typedef struct
CUdeviceptr device_data[2]; // the pointers to the device data
// launch kernel data
unsigned int grid_dim_x; // the number of grid blocks (in the X dimension, the only one we will use here)
unsigned int block_dim_x; // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMENDED_CUDA_BLOCK_SIZE)
unsigned int block_dim_x; // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMMENDED_CUDA_BLOCK_SIZE)
int n_kernel_arguments; // number of kernel arguments
void *arg[MAX_N_ARGUMENTS]; // pointers to the kernel argument data
@ -260,10 +260,10 @@ static void device_to_host_copy(cuda_data_t *cd,int idx)
// launch a CUDA kernel (with 0 bytes of shared memory and no extra options)
//
static void lauch_kernel(cuda_data_t *cd)
static void launch_kernel(cuda_data_t *cd)
{
if(cd->block_dim_x != (unsigned int)RECOMENDED_CUDA_BLOCK_SIZE)
fprintf(stderr,"lauch_kernel(): block_dim_x should be equal to %d\n",RECOMENDED_CUDA_BLOCK_SIZE);
if(cd->block_dim_x != (unsigned int)RECOMMENDED_CUDA_BLOCK_SIZE)
fprintf(stderr,"launch_kernel(): block_dim_x should be equal to %d\n",RECOMMENDED_CUDA_BLOCK_SIZE);
CU_CALL( cuLaunchKernel , (cd->cu_kernel,cd->grid_dim_x,1u,1u,cd->block_dim_x,1u,1u,0u,cd->cu_stream,&cd->arg[0],NULL) );
synchronize_cuda(cd);
}

View File

@ -28,7 +28,7 @@
//
// we place this here to simplify things (aad_sha1_cuda_kernel.cu includes this file...)
//
#define RECOMENDED_CUDA_BLOCK_SIZE 128
#define RECOMMENDED_CUDA_BLOCK_SIZE 128
//

View File

@ -12,8 +12,7 @@
//
#include "aad_sha1.h"
typedef unsigned int u32_t;
#include "aad_data_types.h"
//
// the nvcc compiler stores w[] in registers (constant indices!)
@ -23,7 +22,7 @@ typedef unsigned int u32_t;
// warp thread number: n & 31 -- the lane
//
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
void sha1_cuda_kernel(u32_t *interleaved32_data,u32_t *interleaved32_hash)
{
u32_t n;

View File

@ -19,7 +19,7 @@ static void test_sha1_cuda(int n_tests)
double host_to_device_time,kernel_time,device_to_host_time,hashes_per_second;
cuda_data_t cd;
if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMENDED_CUDA_BLOCK_SIZE != 0)
if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMMENDED_CUDA_BLOCK_SIZE != 0)
{
fprintf(stderr,"test_sha1_cuda(): bad number of tests\n");
exit(1);
@ -44,13 +44,13 @@ static void test_sha1_cuda(int n_tests)
host_to_device_copy(&cd,0); // idx=0 means that the interleaved32_data is copied to the CUDA device
time_measurement();
host_to_device_time = wall_time_delta();
cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMENDED_CUDA_BLOCK_SIZE;
cd.block_dim_x = (u32_t)RECOMENDED_CUDA_BLOCK_SIZE;
cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE;
cd.block_dim_x = (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE;
cd.n_kernel_arguments = 2;
cd.arg[0] = &cd.device_data[0]; // interleaved32_data
cd.arg[1] = &cd.device_data[1]; // interleaved32_hash
time_measurement();
lauch_kernel(&cd);
launch_kernel(&cd);
time_measurement();
kernel_time = wall_time_delta();
time_measurement();