diff --git a/aad_coin_miner_cuda.c b/aad_coin_miner_cuda.c index 8080b0f..7056b9b 100644 --- a/aad_coin_miner_cuda.c +++ b/aad_coin_miner_cuda.c @@ -26,47 +26,6 @@ void signal_handler(int signum) keep_running = 0; } -// Histogram data structures -typedef struct { - u32_t bins[MAX_HISTOGRAM_BINS]; - u32_t count; - double min_value; - double max_value; -} histogram_t; - -static void histogram_init(histogram_t *h) -{ - memset(h->bins, 0, sizeof(h->bins)); - h->count = 0; - h->min_value = 1e99; - h->max_value = 0.0; -} - -static void histogram_add(histogram_t *h, double value) -{ - if(value < h->min_value) - h->min_value = value; - if(value > h->max_value) - h->max_value = value; - h->count++; - - // For now, just count - we'll bin them later when printing -} - -static void histogram_print(histogram_t *h, const char *title, int n_bins) -{ - if(h->count == 0) - { - printf("%s: No data\n", title); - return; - } - - printf("\n%s:\n", title); - printf(" Count: %u\n", h->count); - printf(" Min: %.6f\n", h->min_value); - printf(" Max: %.6f\n", h->max_value); - printf(" Avg: %.6f\n", (h->min_value + h->max_value) / 2.0); -} // Coin reconstruction from stored data static void reconstruct_coin(u32_t *stored_data, u32_t coin[14]) @@ -87,19 +46,6 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel) u32_t coins_found = 0; u32_t kernel_runs = 0; - // Histograms - histogram_t time_histogram; - histogram_t coins_histogram; - double *kernel_times = NULL; - u32_t *kernel_coin_counts = NULL; - u32_t histogram_capacity = 10000; - - histogram_init(&time_histogram); - histogram_init(&coins_histogram); - - kernel_times = (double *)malloc(histogram_capacity * sizeof(double)); - kernel_coin_counts = (u32_t *)malloc(histogram_capacity * sizeof(u32_t)); - // Initialize CUDA cd.device_number = 0; cd.cubin_file_name = "coin_miner_cuda_kernel.cubin"; @@ -112,7 +58,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel) host_storage = (u32_t *)cd.host_data[0]; // Kernel configuration - cd.block_dim_x = RECOMENDED_CUDA_BLOCK_SIZE; + cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE; cd.grid_dim_x = 4096; // Large grid for maximum GPU utilization u32_t n_threads = cd.grid_dim_x * cd.block_dim_x; @@ -128,9 +74,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel) int scan_pos = 12; time_measurement(); - time_measurement(); - double start_time = wall_time_delta(); - double last_report_time = 0.0; + // double start_time = wall_time_delta(); while(keep_running && (max_attempts == 0 || attempts < max_attempts)) { @@ -148,13 +92,8 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel) if(use_scan_kernel) cd.arg[3] = &scan_pos; - // Launch kernel and measure time - time_measurement(); - double kernel_start = cpu_time_delta(); - lauch_kernel(&cd); - time_measurement(); - double kernel_end = cpu_time_delta(); - double kernel_time = kernel_end - kernel_start; + // Launch the CUDA kernel + launch_kernel(&cd); // Copy results back device_to_host_copy(&cd, 0); @@ -203,16 +142,6 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel) } } - // Update histograms - if(kernel_runs < histogram_capacity) - { - kernel_times[kernel_runs] = kernel_time; - kernel_coin_counts[kernel_runs] = n_coins_this_kernel; - } - - histogram_add(&time_histogram, kernel_time); - histogram_add(&coins_histogram, (double)n_coins_this_kernel); - // Update counters kernel_runs++; if(use_scan_kernel) @@ -226,21 +155,11 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel) if(use_scan_kernel) scan_pos = (scan_pos + 1) % 42 + 12; // Cycle through positions 12-53 - // Print progress every second - time_measurement(); - double current_time = wall_time_delta() - start_time; - if(current_time - last_report_time >= 1.0) - { - double rate = attempts / current_time; - printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u, Avg time: %.6f s\n", - (unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs, - current_time / kernel_runs); - last_report_time = current_time; - } + } time_measurement(); - double total_time = wall_time_delta() - start_time; + double total_time = cpu_time_delta(); printf("\n=== Mining Statistics ===\n"); printf("Total attempts: %llu\n", (unsigned long long)attempts); @@ -249,30 +168,10 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel) printf("Coins found: %u\n", coins_found); printf("Kernel launches: %u\n", kernel_runs); - // Print histograms - histogram_print(&time_histogram, "Kernel Execution Time Histogram", 20); - histogram_print(&coins_histogram, "Coins Found Per Kernel Histogram", 10); - - // Save detailed histogram data - FILE *fp = fopen("cuda_kernel_stats.csv", "w"); - if(fp != NULL) - { - fprintf(fp, "kernel_id,time_seconds,coins_found\n"); - u32_t n_to_save = (kernel_runs < histogram_capacity) ? kernel_runs : histogram_capacity; - for(u32_t i = 0; i < n_to_save; i++) - { - fprintf(fp, "%u,%.9f,%u\n", i, kernel_times[i], kernel_coin_counts[i]); - } - fclose(fp); - printf("\nDetailed statistics saved to cuda_kernel_stats.csv\n"); - } // Save any remaining coins save_coin(NULL); - // Cleanup - free(kernel_times); - free(kernel_coin_counts); terminate_cuda(&cd); } diff --git a/aad_coin_miner_cuda_kernel.cu b/aad_coin_miner_cuda_kernel.cu index 0683c51..f64869f 100644 --- a/aad_coin_miner_cuda_kernel.cu +++ b/aad_coin_miner_cuda_kernel.cu @@ -5,17 +5,14 @@ // #include "aad_sha1.h" - -typedef unsigned int u32_t; -typedef unsigned char u08_t; -typedef unsigned long long u64_t; +#include "aad_data_types.h" // // Optimized CUDA kernel for DETI coin mining // Each thread generates coins using the same approach as CPU/SIMD miners // -extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1) +extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1) void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2) { u32_t coin[14]; @@ -47,16 +44,21 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param offset += (u64_t)n; // Apply offset to variable part (increment the coin counter) - // Use the same carry logic as CPU/SIMD miners for(int pos = 53; pos >= 12 && offset > 0; pos--) { u08_t *byte = &bytes[pos ^ 3]; - u64_t add = offset % 95; // Range: 32-126 (95 values) - offset /= 95; - - u32_t val = (*byte - 32 + add); - *byte = 32 + (val % 95); - offset += val / 95; // Carry + u64_t add = offset % 127; + offset /= 127; + + u32_t val = *byte + add; + u08_t new_val = val % 127; + + // Skip newline character (ASCII 10) in the variable part + if(new_val == '\n') + new_val++; + + *byte = new_val; + offset += val / 127; // Carry } // Compute SHA1 hash @@ -92,7 +94,7 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param // Kernel that tries all possible values for one character position // -extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1) +extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1) void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2, int scan_position) { u32_t coin[14]; @@ -130,7 +132,13 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t offset /= 95; u32_t val = (*byte - 32 + add); - *byte = 32 + (val % 95); + u08_t new_val = 32 + (val % 95); + + // Skip newline character (ASCII 10) in the variable part + if(new_val == '\n') + new_val++; + + *byte = new_val; offset += val / 95; } diff --git a/aad_cuda_utilities.h b/aad_cuda_utilities.h index 290894c..1ed5bd7 100644 --- a/aad_cuda_utilities.h +++ b/aad_cuda_utilities.h @@ -36,7 +36,7 @@ typedef struct CUdeviceptr device_data[2]; // the pointers to the device data // launch kernel data unsigned int grid_dim_x; // the number of grid blocks (in the X dimension, the only one we will use here) - unsigned int block_dim_x; // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMENDED_CUDA_BLOCK_SIZE) + unsigned int block_dim_x; // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMMENDED_CUDA_BLOCK_SIZE) int n_kernel_arguments; // number of kernel arguments void *arg[MAX_N_ARGUMENTS]; // pointers to the kernel argument data @@ -260,10 +260,10 @@ static void device_to_host_copy(cuda_data_t *cd,int idx) // launch a CUDA kernel (with 0 bytes of shared memory and no extra options) // -static void lauch_kernel(cuda_data_t *cd) +static void launch_kernel(cuda_data_t *cd) { - if(cd->block_dim_x != (unsigned int)RECOMENDED_CUDA_BLOCK_SIZE) - fprintf(stderr,"lauch_kernel(): block_dim_x should be equal to %d\n",RECOMENDED_CUDA_BLOCK_SIZE); + if(cd->block_dim_x != (unsigned int)RECOMMENDED_CUDA_BLOCK_SIZE) + fprintf(stderr,"launch_kernel(): block_dim_x should be equal to %d\n",RECOMMENDED_CUDA_BLOCK_SIZE); CU_CALL( cuLaunchKernel , (cd->cu_kernel,cd->grid_dim_x,1u,1u,cd->block_dim_x,1u,1u,0u,cd->cu_stream,&cd->arg[0],NULL) ); synchronize_cuda(cd); } diff --git a/aad_sha1.h b/aad_sha1.h index 54c4fe3..5f7e1aa 100644 --- a/aad_sha1.h +++ b/aad_sha1.h @@ -28,7 +28,7 @@ // // we place this here to simplify things (aad_sha1_cuda_kernel.cu includes this file...) // -#define RECOMENDED_CUDA_BLOCK_SIZE 128 +#define RECOMMENDED_CUDA_BLOCK_SIZE 128 // diff --git a/aad_sha1_cuda_kernel.cu b/aad_sha1_cuda_kernel.cu index 9d67b94..ab29bc7 100644 --- a/aad_sha1_cuda_kernel.cu +++ b/aad_sha1_cuda_kernel.cu @@ -12,8 +12,7 @@ // #include "aad_sha1.h" - -typedef unsigned int u32_t; +#include "aad_data_types.h" // // the nvcc compiler stores w[] in registers (constant indices!) @@ -23,7 +22,7 @@ typedef unsigned int u32_t; // warp thread number: n & 31 -- the lane // -extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1) +extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1) void sha1_cuda_kernel(u32_t *interleaved32_data,u32_t *interleaved32_hash) { u32_t n; diff --git a/aad_sha1_cuda_test.c b/aad_sha1_cuda_test.c index 4a73c47..0d4bdce 100644 --- a/aad_sha1_cuda_test.c +++ b/aad_sha1_cuda_test.c @@ -19,7 +19,7 @@ static void test_sha1_cuda(int n_tests) double host_to_device_time,kernel_time,device_to_host_time,hashes_per_second; cuda_data_t cd; - if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMENDED_CUDA_BLOCK_SIZE != 0) + if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMMENDED_CUDA_BLOCK_SIZE != 0) { fprintf(stderr,"test_sha1_cuda(): bad number of tests\n"); exit(1); @@ -44,13 +44,13 @@ static void test_sha1_cuda(int n_tests) host_to_device_copy(&cd,0); // idx=0 means that the interleaved32_data is copied to the CUDA device time_measurement(); host_to_device_time = wall_time_delta(); - cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMENDED_CUDA_BLOCK_SIZE; - cd.block_dim_x = (u32_t)RECOMENDED_CUDA_BLOCK_SIZE; + cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE; + cd.block_dim_x = (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE; cd.n_kernel_arguments = 2; cd.arg[0] = &cd.device_data[0]; // interleaved32_data cd.arg[1] = &cd.device_data[1]; // interleaved32_hash time_measurement(); - lauch_kernel(&cd); + launch_kernel(&cd); time_measurement(); kernel_time = wall_time_delta(); time_measurement();