Fix 2 typos (RECOMENDED -> RECOMMENDED, lauch -> launch) + CUDA kernel now uses byte values across 0-126 (modulo 127), skipping the newline character
Signed-off-by: RubenCGomes <rlcg@ua.pt>
This commit is contained in:
parent
bd06cb1133
commit
cbb14ce858
|
|
@ -26,47 +26,6 @@ void signal_handler(int signum)
|
|||
keep_running = 0;
|
||||
}
|
||||
|
||||
// Histogram data structures
|
||||
typedef struct {
|
||||
u32_t bins[MAX_HISTOGRAM_BINS];
|
||||
u32_t count;
|
||||
double min_value;
|
||||
double max_value;
|
||||
} histogram_t;
|
||||
|
||||
static void histogram_init(histogram_t *h)
|
||||
{
|
||||
memset(h->bins, 0, sizeof(h->bins));
|
||||
h->count = 0;
|
||||
h->min_value = 1e99;
|
||||
h->max_value = 0.0;
|
||||
}
|
||||
|
||||
static void histogram_add(histogram_t *h, double value)
|
||||
{
|
||||
if(value < h->min_value)
|
||||
h->min_value = value;
|
||||
if(value > h->max_value)
|
||||
h->max_value = value;
|
||||
h->count++;
|
||||
|
||||
// For now, just count - we'll bin them later when printing
|
||||
}
|
||||
|
||||
static void histogram_print(histogram_t *h, const char *title, int n_bins)
|
||||
{
|
||||
if(h->count == 0)
|
||||
{
|
||||
printf("%s: No data\n", title);
|
||||
return;
|
||||
}
|
||||
|
||||
printf("\n%s:\n", title);
|
||||
printf(" Count: %u\n", h->count);
|
||||
printf(" Min: %.6f\n", h->min_value);
|
||||
printf(" Max: %.6f\n", h->max_value);
|
||||
printf(" Avg: %.6f\n", (h->min_value + h->max_value) / 2.0);
|
||||
}
|
||||
|
||||
// Coin reconstruction from stored data
|
||||
static void reconstruct_coin(u32_t *stored_data, u32_t coin[14])
|
||||
|
|
@ -87,19 +46,6 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
|
|||
u32_t coins_found = 0;
|
||||
u32_t kernel_runs = 0;
|
||||
|
||||
// Histograms
|
||||
histogram_t time_histogram;
|
||||
histogram_t coins_histogram;
|
||||
double *kernel_times = NULL;
|
||||
u32_t *kernel_coin_counts = NULL;
|
||||
u32_t histogram_capacity = 10000;
|
||||
|
||||
histogram_init(&time_histogram);
|
||||
histogram_init(&coins_histogram);
|
||||
|
||||
kernel_times = (double *)malloc(histogram_capacity * sizeof(double));
|
||||
kernel_coin_counts = (u32_t *)malloc(histogram_capacity * sizeof(u32_t));
|
||||
|
||||
// Initialize CUDA
|
||||
cd.device_number = 0;
|
||||
cd.cubin_file_name = "coin_miner_cuda_kernel.cubin";
|
||||
|
|
@ -112,7 +58,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
|
|||
host_storage = (u32_t *)cd.host_data[0];
|
||||
|
||||
// Kernel configuration
|
||||
cd.block_dim_x = RECOMENDED_CUDA_BLOCK_SIZE;
|
||||
cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE;
|
||||
cd.grid_dim_x = 4096; // Large grid for maximum GPU utilization
|
||||
|
||||
u32_t n_threads = cd.grid_dim_x * cd.block_dim_x;
|
||||
|
|
@ -128,9 +74,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
|
|||
int scan_pos = 12;
|
||||
|
||||
time_measurement();
|
||||
time_measurement();
|
||||
double start_time = wall_time_delta();
|
||||
double last_report_time = 0.0;
|
||||
// double start_time = wall_time_delta();
|
||||
|
||||
while(keep_running && (max_attempts == 0 || attempts < max_attempts))
|
||||
{
|
||||
|
|
@ -148,13 +92,8 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
|
|||
if(use_scan_kernel)
|
||||
cd.arg[3] = &scan_pos;
|
||||
|
||||
// Launch kernel and measure time
|
||||
time_measurement();
|
||||
double kernel_start = cpu_time_delta();
|
||||
lauch_kernel(&cd);
|
||||
time_measurement();
|
||||
double kernel_end = cpu_time_delta();
|
||||
double kernel_time = kernel_end - kernel_start;
|
||||
// Launch the CUDA kernel
|
||||
launch_kernel(&cd);
|
||||
|
||||
// Copy results back
|
||||
device_to_host_copy(&cd, 0);
|
||||
|
|
@ -203,16 +142,6 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
|
|||
}
|
||||
}
|
||||
|
||||
// Update histograms
|
||||
if(kernel_runs < histogram_capacity)
|
||||
{
|
||||
kernel_times[kernel_runs] = kernel_time;
|
||||
kernel_coin_counts[kernel_runs] = n_coins_this_kernel;
|
||||
}
|
||||
|
||||
histogram_add(&time_histogram, kernel_time);
|
||||
histogram_add(&coins_histogram, (double)n_coins_this_kernel);
|
||||
|
||||
// Update counters
|
||||
kernel_runs++;
|
||||
if(use_scan_kernel)
|
||||
|
|
@ -226,21 +155,11 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
|
|||
if(use_scan_kernel)
|
||||
scan_pos = (scan_pos + 1) % 42 + 12; // Cycle through positions 12-53
|
||||
|
||||
// Print progress every second
|
||||
time_measurement();
|
||||
double current_time = wall_time_delta() - start_time;
|
||||
if(current_time - last_report_time >= 1.0)
|
||||
{
|
||||
double rate = attempts / current_time;
|
||||
printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u, Avg time: %.6f s\n",
|
||||
(unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs,
|
||||
current_time / kernel_runs);
|
||||
last_report_time = current_time;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
time_measurement();
|
||||
double total_time = wall_time_delta() - start_time;
|
||||
double total_time = cpu_time_delta();
|
||||
|
||||
printf("\n=== Mining Statistics ===\n");
|
||||
printf("Total attempts: %llu\n", (unsigned long long)attempts);
|
||||
|
|
@ -249,30 +168,10 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
|
|||
printf("Coins found: %u\n", coins_found);
|
||||
printf("Kernel launches: %u\n", kernel_runs);
|
||||
|
||||
// Print histograms
|
||||
histogram_print(&time_histogram, "Kernel Execution Time Histogram", 20);
|
||||
histogram_print(&coins_histogram, "Coins Found Per Kernel Histogram", 10);
|
||||
|
||||
// Save detailed histogram data
|
||||
FILE *fp = fopen("cuda_kernel_stats.csv", "w");
|
||||
if(fp != NULL)
|
||||
{
|
||||
fprintf(fp, "kernel_id,time_seconds,coins_found\n");
|
||||
u32_t n_to_save = (kernel_runs < histogram_capacity) ? kernel_runs : histogram_capacity;
|
||||
for(u32_t i = 0; i < n_to_save; i++)
|
||||
{
|
||||
fprintf(fp, "%u,%.9f,%u\n", i, kernel_times[i], kernel_coin_counts[i]);
|
||||
}
|
||||
fclose(fp);
|
||||
printf("\nDetailed statistics saved to cuda_kernel_stats.csv\n");
|
||||
}
|
||||
|
||||
// Save any remaining coins
|
||||
save_coin(NULL);
|
||||
|
||||
// Cleanup
|
||||
free(kernel_times);
|
||||
free(kernel_coin_counts);
|
||||
terminate_cuda(&cd);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,17 +5,14 @@
|
|||
//
|
||||
|
||||
#include "aad_sha1.h"
|
||||
|
||||
typedef unsigned int u32_t;
|
||||
typedef unsigned char u08_t;
|
||||
typedef unsigned long long u64_t;
|
||||
#include "aad_data_types.h"
|
||||
|
||||
//
|
||||
// Optimized CUDA kernel for DETI coin mining
|
||||
// Each thread generates coins using the same approach as CPU/SIMD miners
|
||||
//
|
||||
|
||||
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
|
||||
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
|
||||
void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2)
|
||||
{
|
||||
u32_t coin[14];
|
||||
|
|
@ -47,16 +44,21 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
|
|||
offset += (u64_t)n;
|
||||
|
||||
// Apply offset to variable part (increment the coin counter)
|
||||
// Use the same carry logic as CPU/SIMD miners
|
||||
for(int pos = 53; pos >= 12 && offset > 0; pos--)
|
||||
{
|
||||
u08_t *byte = &bytes[pos ^ 3];
|
||||
u64_t add = offset % 95; // Range: 32-126 (95 values)
|
||||
offset /= 95;
|
||||
u64_t add = offset % 127;
|
||||
offset /= 127;
|
||||
|
||||
u32_t val = (*byte - 32 + add);
|
||||
*byte = 32 + (val % 95);
|
||||
offset += val / 95; // Carry
|
||||
u32_t val = *byte + add;
|
||||
u08_t new_val = val % 127;
|
||||
|
||||
// Skip newline character (ASCII 10) in the variable part
|
||||
if(new_val == '\n')
|
||||
new_val++;
|
||||
|
||||
*byte = new_val;
|
||||
offset += val / 127; // Carry
|
||||
}
|
||||
|
||||
// Compute SHA1 hash
|
||||
|
|
@ -92,7 +94,7 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
|
|||
// Kernel that tries all possible values for one character position
|
||||
//
|
||||
|
||||
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
|
||||
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
|
||||
void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2, int scan_position)
|
||||
{
|
||||
u32_t coin[14];
|
||||
|
|
@ -130,7 +132,13 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t
|
|||
offset /= 95;
|
||||
|
||||
u32_t val = (*byte - 32 + add);
|
||||
*byte = 32 + (val % 95);
|
||||
u08_t new_val = 32 + (val % 95);
|
||||
|
||||
// Skip newline character (ASCII 10) in the variable part
|
||||
if(new_val == '\n')
|
||||
new_val++;
|
||||
|
||||
*byte = new_val;
|
||||
offset += val / 95;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ typedef struct
|
|||
CUdeviceptr device_data[2]; // the pointers to the device data
|
||||
// launch kernel data
|
||||
unsigned int grid_dim_x; // the number of grid blocks (in the X dimension, the only one we will use here)
|
||||
unsigned int block_dim_x; // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMENDED_CUDA_BLOCK_SIZE)
|
||||
unsigned int block_dim_x; // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMMENDED_CUDA_BLOCK_SIZE)
|
||||
int n_kernel_arguments; // number of kernel arguments
|
||||
void *arg[MAX_N_ARGUMENTS]; // pointers to the kernel argument data
|
||||
|
||||
|
|
@ -260,10 +260,10 @@ static void device_to_host_copy(cuda_data_t *cd,int idx)
|
|||
// launch a CUDA kernel (with 0 bytes of shared memory and no extra options)
|
||||
//
|
||||
|
||||
static void lauch_kernel(cuda_data_t *cd)
|
||||
static void launch_kernel(cuda_data_t *cd)
|
||||
{
|
||||
if(cd->block_dim_x != (unsigned int)RECOMENDED_CUDA_BLOCK_SIZE)
|
||||
fprintf(stderr,"lauch_kernel(): block_dim_x should be equal to %d\n",RECOMENDED_CUDA_BLOCK_SIZE);
|
||||
if(cd->block_dim_x != (unsigned int)RECOMMENDED_CUDA_BLOCK_SIZE)
|
||||
fprintf(stderr,"launch_kernel(): block_dim_x should be equal to %d\n",RECOMMENDED_CUDA_BLOCK_SIZE);
|
||||
CU_CALL( cuLaunchKernel , (cd->cu_kernel,cd->grid_dim_x,1u,1u,cd->block_dim_x,1u,1u,0u,cd->cu_stream,&cd->arg[0],NULL) );
|
||||
synchronize_cuda(cd);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@
|
|||
//
|
||||
// we place this here to simplify things (aad_sha1_cuda_kernel.cu includes this file...)
|
||||
//
|
||||
#define RECOMENDED_CUDA_BLOCK_SIZE 128
|
||||
#define RECOMMENDED_CUDA_BLOCK_SIZE 128
|
||||
|
||||
|
||||
//
|
||||
|
|
|
|||
|
|
@ -12,8 +12,7 @@
|
|||
//
|
||||
|
||||
#include "aad_sha1.h"
|
||||
|
||||
typedef unsigned int u32_t;
|
||||
#include "aad_data_types.h"
|
||||
|
||||
//
|
||||
// the nvcc compiler stores w[] in registers (constant indices!)
|
||||
|
|
@ -23,7 +22,7 @@ typedef unsigned int u32_t;
|
|||
// warp thread number: n & 31 -- the lane
|
||||
//
|
||||
|
||||
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
|
||||
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
|
||||
void sha1_cuda_kernel(u32_t *interleaved32_data,u32_t *interleaved32_hash)
|
||||
{
|
||||
u32_t n;
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ static void test_sha1_cuda(int n_tests)
|
|||
double host_to_device_time,kernel_time,device_to_host_time,hashes_per_second;
|
||||
cuda_data_t cd;
|
||||
|
||||
if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMENDED_CUDA_BLOCK_SIZE != 0)
|
||||
if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMMENDED_CUDA_BLOCK_SIZE != 0)
|
||||
{
|
||||
fprintf(stderr,"test_sha1_cuda(): bad number of tests\n");
|
||||
exit(1);
|
||||
|
|
@ -44,13 +44,13 @@ static void test_sha1_cuda(int n_tests)
|
|||
host_to_device_copy(&cd,0); // idx=0 means that the interleaved32_data is copied to the CUDA device
|
||||
time_measurement();
|
||||
host_to_device_time = wall_time_delta();
|
||||
cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMENDED_CUDA_BLOCK_SIZE;
|
||||
cd.block_dim_x = (u32_t)RECOMENDED_CUDA_BLOCK_SIZE;
|
||||
cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE;
|
||||
cd.block_dim_x = (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE;
|
||||
cd.n_kernel_arguments = 2;
|
||||
cd.arg[0] = &cd.device_data[0]; // interleaved32_data
|
||||
cd.arg[1] = &cd.device_data[1]; // interleaved32_hash
|
||||
time_measurement();
|
||||
lauch_kernel(&cd);
|
||||
launch_kernel(&cd);
|
||||
time_measurement();
|
||||
kernel_time = wall_time_delta();
|
||||
time_measurement();
|
||||
|
|
|
|||
Loading…
Reference in New Issue