From 0c2c534f7c8cb4fd16ab975c914d42cbb003cb35 Mon Sep 17 00:00:00 2001
From: RubenCGomes
Date: Sun, 2 Nov 2025 22:39:53 +0000
Subject: [PATCH] added CUDA implementation, changes needed

---
 aad_coin_miner_cuda.c         | 352 ++++++++++++++++++++++++++++++++++
 aad_coin_miner_cuda_kernel.cu | 185 +++++++++++++++++++++
 2 files changed, 537 insertions(+)
 create mode 100644 aad_coin_miner_cuda.c
 create mode 100644 aad_coin_miner_cuda_kernel.cu

diff --git a/aad_coin_miner_cuda.c b/aad_coin_miner_cuda.c
new file mode 100644
index 0000000..167f37a
--- /dev/null
+++ b/aad_coin_miner_cuda.c
@@ -0,0 +1,352 @@
+//
+// Arquiteturas de Alto Desempenho 2025/2026
+//
+// DETI Coin Miner - CUDA implementation with histograms
+//
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "aad_data_types.h"
+#include "aad_utilities.h"
+#include "aad_sha1_cpu.h"
+#include "aad_cuda_utilities.h"
+#include "aad_vault.h"
+
+#define COINS_STORAGE_SIZE 1024u
+#define COIN_RECORD_WORDS  14u
+#define MAX_STORED_COINS   ((COINS_STORAGE_SIZE - 1u) / COIN_RECORD_WORDS)
+#define MAX_HISTOGRAM_BINS 100
+
+// Cleared by the SIGINT handler to make the mining loop stop
+static volatile sig_atomic_t keep_running = 1;
+
+void signal_handler(int signum)
+{
+  (void)signum;
+  keep_running = 0;
+}
+
+// Seconds read from a monotonic clock; only differences between two calls are meaningful
+static double monotonic_seconds(void)
+{
+  struct timespec ts;
+
+  if(clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
+    return 0.0;
+  return (double)ts.tv_sec + 1.0e-9 * (double)ts.tv_nsec;
+}
+
+// Histogram data structures
+typedef struct {
+  u32_t bins[MAX_HISTOGRAM_BINS]; // reserved for binning, which is not implemented yet
+  u32_t count;                    // number of samples added
+  double sum;                     // sum of all samples (for the average)
+  double min_value;
+  double max_value;
+} histogram_t;
+
+// Reset a histogram to the "no samples" state
+static void histogram_init(histogram_t *h)
+{
+  memset(h->bins, 0, sizeof(h->bins));
+  h->count = 0;
+  h->sum = 0.0;
+  h->min_value = 1e99;
+  h->max_value = 0.0;
+}
+
+// Account for one sample (samples are expected to be non-negative)
+static void histogram_add(histogram_t *h, double value)
+{
+  if(value < h->min_value)
+    h->min_value = value;
+  if(value > h->max_value)
+    h->max_value = value;
+  h->sum += value;
+  h->count++;
+}
+
+// Print count, minimum, maximum and arithmetic mean of the samples
+static void histogram_print(const histogram_t *h, const char *title, int n_bins)
+{
+  (void)n_bins; // no binning yet, only summary statistics are printed
+
+  if(h->count == 0)
+  {
+    printf("%s: No data\n", title);
+    return;
+  }
+
+  printf("\n%s:\n", title);
+  printf(" Count: %u\n", h->count);
+  printf(" Min: %.6f\n", h->min_value);
+  printf(" Max: %.6f\n", h->max_value);
+  printf(" Avg: %.6f\n", h->sum / (double)h->count);
+}
+
+// Coin reconstruction from stored data
+static void reconstruct_coin(const u32_t *stored_data, u32_t coin[14])
+{
+  // Fixed parts (must match kernel byte order)
+  coin[0] = 0x44455449u; // "DETI" with byte swap (idx ^ 3)
+  coin[1] = 0x20636F69u; // " coi" with byte swap (idx ^ 3)
+  coin[2] = 0x6E203220u; // "n 2 " with byte swap (idx ^ 3)
+
+  // Variable parts (restore from storage)
+  for(int i = 0; i < 11; i++)
+    coin[3 + i] = stored_data[i];
+}
+
+//
+// Mine DETI coins using CUDA
+//
+// max_attempts    - stop once this many attempts were made (0 means run until Ctrl+C)
+// use_scan_kernel - non-zero selects mine_deti_coins_scan_kernel instead of mine_deti_coins_kernel
+//
+static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
+{
+  cuda_data_t cd;
+  u32_t *host_storage;
+  u64_t attempts = 0;
+  u32_t coins_found = 0;
+  u32_t kernel_runs = 0;
+
+  // Histograms
+  histogram_t time_histogram;
+  histogram_t coins_histogram;
+  double *kernel_times = NULL;
+  u32_t *kernel_coin_counts = NULL;
+  u32_t histogram_capacity = 10000;
+
+  histogram_init(&time_histogram);
+  histogram_init(&coins_histogram);
+
+  kernel_times = (double *)malloc(histogram_capacity * sizeof(double));
+  kernel_coin_counts = (u32_t *)malloc(histogram_capacity * sizeof(u32_t));
+  if(kernel_times == NULL || kernel_coin_counts == NULL)
+  {
+    fprintf(stderr, "mine_coins_cuda: out of memory\n");
+    free(kernel_times);
+    free(kernel_coin_counts);
+    return;
+  }
+
+  // Initialize CUDA
+  cd.device_number = 0;
+  cd.cubin_file_name = "coin_miner_cuda_kernel.cubin";
+  cd.kernel_name = use_scan_kernel ? "mine_deti_coins_scan_kernel" : "mine_deti_coins_kernel";
+  cd.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t);
+  cd.data_size[1] = 0;
+
+  initialize_cuda(&cd);
+
+  host_storage = (u32_t *)cd.host_data[0];
+
+  // Kernel configuration
+  cd.block_dim_x = RECOMENDED_CUDA_BLOCK_SIZE;
+  cd.grid_dim_x = 4096; // Large grid for maximum GPU utilization
+
+  u32_t n_threads = cd.grid_dim_x * cd.block_dim_x;
+  // The scan kernel hashes every byte value except the newline, i.e. 255 candidates per thread
+  u64_t attempts_per_kernel = (u64_t)n_threads * (use_scan_kernel ? 255u : 1u);
+
+  printf("Mining DETI coins using CUDA...\n");
+  printf("Grid: %u blocks × %u threads = %u total threads\n",
+         cd.grid_dim_x, cd.block_dim_x, n_threads);
+  printf("Kernel: %s\n", cd.kernel_name);
+  printf("Press Ctrl+C to stop\n\n");
+
+  u32_t param1 = (u32_t)time(NULL);
+  u32_t param2 = 0x12345678u;
+  int scan_pos = 12;
+
+  // Elapsed times come from an absolute monotonic clock.
+  // NOTE(review): judging by their names, the *_delta() helpers of aad_utilities.h
+  // return the interval between the two latest time_measurement() calls, so the
+  // difference of two of their return values is not an elapsed time - confirm.
+  double start_time = monotonic_seconds();
+  double last_report_time = 0.0;
+
+  while(keep_running && (max_attempts == 0 || attempts < max_attempts))
+  {
+    // Initialize storage area
+    host_storage[0] = 1u; // First unused index
+
+    // Copy to device
+    host_to_device_copy(&cd, 0);
+
+    // Set kernel arguments
+    cd.n_kernel_arguments = use_scan_kernel ? 4 : 3;
+    cd.arg[0] = &cd.device_data[0];
+    cd.arg[1] = &param1;
+    cd.arg[2] = &param2;
+    if(use_scan_kernel)
+      cd.arg[3] = &scan_pos;
+
+    // Launch kernel and measure time (assumes lauch_kernel() returns only after
+    // the kernel has finished - TODO confirm in aad_cuda_utilities.h)
+    double kernel_start = monotonic_seconds();
+    lauch_kernel(&cd);
+    double kernel_time = monotonic_seconds() - kernel_start;
+
+    // Copy results back
+    device_to_host_copy(&cd, 0);
+
+    // Process found coins
+    u32_t n_coins_this_kernel = 0;
+    u32_t n_stored = (host_storage[0] - 1) / COIN_RECORD_WORDS;
+
+    // The kernel keeps advancing the index even when a record no longer fits, so
+    // an index past the end only means that the excess coins were dropped
+    if(n_stored > MAX_STORED_COINS)
+    {
+      printf("WARNING: storage area full, %u coin(s) dropped\n", n_stored - MAX_STORED_COINS);
+      n_stored = MAX_STORED_COINS;
+    }
+
+    if(n_stored > 0)
+    {
+      printf("DEBUG: host_storage[0] = %u, n_stored = %u\n", host_storage[0], n_stored);
+
+      for(u32_t i = 0; i < n_stored; i++)
+      {
+        u32_t coin[14];
+        reconstruct_coin(&host_storage[1 + i * COIN_RECORD_WORDS], coin);
+
+        // Verify it's actually a valid coin
+        u32_t hash[5];
+        sha1(coin, hash);
+
+        printf("DEBUG: Coin %u - hash[0] = 0x%08X (expected 0xAAD20250)\n", i, hash[0]);
+
+        // Print the coin as string
+        if(i == 0) {
+          printf("DEBUG: First coin content: ");
+          u08_t *bytes = (u08_t *)coin;
+          for(int j = 0; j < 55; j++) {
+            u08_t c = bytes[j ^ 3];
+            if(c >= 32 && c <= 126)
+              printf("%c", c);
+            else
+              printf("[0x%02X]", (unsigned)c);
+          }
+          printf("\n");
+        }
+
+        if(hash[0] == 0xAAD20250u)
+        {
+          coins_found++;
+          n_coins_this_kernel++;
+          printf("COIN FOUND! (kernel %u, coin %u in this kernel)\n",
+                 kernel_runs, n_coins_this_kernel);
+          save_coin(coin);
+        }
+      }
+    }
+
+    // Update histograms
+    if(kernel_runs < histogram_capacity)
+    {
+      kernel_times[kernel_runs] = kernel_time;
+      kernel_coin_counts[kernel_runs] = n_coins_this_kernel;
+    }
+
+    histogram_add(&time_histogram, kernel_time);
+    histogram_add(&coins_histogram, (double)n_coins_this_kernel);
+
+    // Update counters
+    kernel_runs++;
+    attempts += attempts_per_kernel;
+
+    // Update parameters for next iteration
+    param1++;
+    param2 = param2 ^ 0x9E3779B9u;
+    if(use_scan_kernel)
+      scan_pos = (scan_pos + 1) % 42 + 12; // Visits every position in 12-53 (in steps of 13, modulo 42)
+
+    // Print progress every second
+    double current_time = monotonic_seconds() - start_time;
+    if(current_time - last_report_time >= 1.0)
+    {
+      double rate = (double)attempts / current_time;
+      printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u, Avg time: %.6f s\n",
+             (unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs,
+             current_time / kernel_runs);
+      last_report_time = current_time;
+    }
+  }
+
+  double total_time = monotonic_seconds() - start_time;
+
+  printf("\n=== Mining Statistics ===\n");
+  printf("Total attempts: %llu\n", (unsigned long long)attempts);
+  printf("Total time: %.2f seconds\n", total_time);
+  printf("Average rate: %.2f attempts/second\n",
+         (total_time > 0.0) ? (double)attempts / total_time : 0.0);
+  printf("Coins found: %u\n", coins_found);
+  printf("Kernel launches: %u\n", kernel_runs);
+
+  // Print histograms
+  histogram_print(&time_histogram, "Kernel Execution Time Histogram", 20);
+  histogram_print(&coins_histogram, "Coins Found Per Kernel Histogram", 10);
+
+  // Save detailed histogram data
+  FILE *fp = fopen("cuda_kernel_stats.csv", "w");
+  if(fp != NULL)
+  {
+    fprintf(fp, "kernel_id,time_seconds,coins_found\n");
+    u32_t n_to_save = (kernel_runs < histogram_capacity) ? kernel_runs : histogram_capacity;
+    for(u32_t i = 0; i < n_to_save; i++)
+    {
+      fprintf(fp, "%u,%.9f,%u\n", i, kernel_times[i], kernel_coin_counts[i]);
+    }
+    fclose(fp);
+    printf("\nDetailed statistics saved to cuda_kernel_stats.csv\n");
+  }
+  else
+  {
+    perror("cuda_kernel_stats.csv");
+  }
+
+  // Save any remaining coins
+  save_coin(NULL);
+
+  // Cleanup
+  free(kernel_times);
+  free(kernel_coin_counts);
+  terminate_cuda(&cd);
+}
+
+int main(int argc, char *argv[])
+{
+  u64_t max_attempts = 0;
+  int use_scan_kernel = 0;
+
+  signal(SIGINT, signal_handler);
+
+  if(argc > 1)
+  {
+    char *end;
+
+    max_attempts = strtoull(argv[1], &end, 10);
+    if(end == argv[1] || *end != '\0')
+    {
+      fprintf(stderr, "usage: %s [max_attempts [scan]]\n", argv[0]);
+      return 1;
+    }
+  }
+
+  if(argc > 2 && strcmp(argv[2], "scan") == 0)
+  {
+    use_scan_kernel = 1;
+    printf("Using scan kernel (tries 255 values per thread)\n");
+  }
+
+  mine_coins_cuda(max_attempts, use_scan_kernel);
+
+  return 0;
+}
+
diff --git a/aad_coin_miner_cuda_kernel.cu b/aad_coin_miner_cuda_kernel.cu
new file mode 100644
index 0000000..57a2268
--- /dev/null
+++ b/aad_coin_miner_cuda_kernel.cu
@@ -0,0 +1,185 @@
+//
+// Arquiteturas de Alto Desempenho 2025/2026
+//
+// DETI Coin Miner - CUDA kernel (optimized for mining)
+//
+
+#include "aad_sha1.h"
+
+typedef unsigned int u32_t;
+typedef unsigned char u08_t;
+
+// Storage area layout (must match aad_coin_miner_cuda.c): word 0 holds the index of the
+// first unused word and is followed by records of COIN_RECORD_WORDS words, one per coin
+#define COINS_STORAGE_SIZE 1024u
+#define COIN_RECORD_WORDS  14u
+
+//
+// Reserve one record of the storage area and, when it fits, fill it with the 11
+// variable words of the coin (coin[3..13]) followed by hash[1..3]
+//
+static __device__ __forceinline__
+void store_coin(u32_t *coins_storage_area, const u32_t *coin, const u32_t *hash)
+{
+  // Found a coin! Store it atomically
+  u32_t idx = atomicAdd(coins_storage_area, COIN_RECORD_WORDS);
+
+  // Make sure we don't write outside buffer
+  if(idx < COINS_STORAGE_SIZE - COIN_RECORD_WORDS)
+  {
+    // Store the coin data (only variable parts needed)
+    coins_storage_area[idx + 0] = coin[ 3];
+    coins_storage_area[idx + 1] = coin[ 4];
+    coins_storage_area[idx + 2] = coin[ 5];
+    coins_storage_area[idx + 3] = coin[ 6];
+    coins_storage_area[idx + 4] = coin[ 7];
+    coins_storage_area[idx + 5] = coin[ 8];
+    coins_storage_area[idx + 6] = coin[ 9];
+    coins_storage_area[idx + 7] = coin[10];
+    coins_storage_area[idx + 8] = coin[11];
+    coins_storage_area[idx + 9] = coin[12];
+    coins_storage_area[idx + 10] = coin[13];
+    // Store hash value for verification
+    coins_storage_area[idx + 11] = hash[1];
+    coins_storage_area[idx + 12] = hash[2];
+    coins_storage_area[idx + 13] = hash[3];
+  }
+}
+
+//
+// Optimized CUDA kernel for DETI coin mining
+// Each thread generates its own message based on thread coordinates and external parameters
+//
+
+extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
+void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2)
+{
+  u32_t coin[14];
+  u32_t hash[5];
+  u32_t n, warp_id, lane_id;
+
+  // Get thread coordinates
+  n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;
+  warp_id = n >> 5u;
+  lane_id = n & 31u;
+
+  // Initialize coin template: "DETI coin 2 " + variable + "\n\x80"
+  // Use byte-swapped format to match host expectations (idx ^ 3)
+  coin[0] = 0x44455449u; // "DETI" with byte swap
+  coin[1] = 0x20636F69u; // " coi" with byte swap
+  coin[2] = 0x6E203220u; // "n 2 " with byte swap
+
+  // Variable part: encode thread ID and parameters
+  // This ensures each thread works on a different message
+  // NOTE(review): these words are raw binary, so any message byte before position 54
+  // may be 0x0A (newline) - confirm that save_coin() accepts such coins
+  coin[3] = n; // Global thread ID
+  coin[4] = param1; // External parameter 1
+  coin[5] = param2; // External parameter 2
+  coin[6] = blockIdx.x; // Block index
+  coin[7] = threadIdx.x; // Thread index
+  coin[8] = warp_id; // Warp ID
+  coin[9] = lane_id; // Lane ID
+  coin[10] = n ^ param1; // XOR combination
+  coin[11] = n ^ param2; // XOR combination
+  coin[12] = (n * 0x9E3779B9u); // Hash-like mixing
+
+  // Last word: bytes 52-55
+  // Memory layout: coin[13]=0xAABBCCDD -> mem[52]=DD, [53]=CC, [54]=BB, [55]=AA
+  // With idx^3: bytes[52^3]=bytes[55]=AA, bytes[53^3]=bytes[54]=BB, bytes[54^3]=bytes[53]=CC, bytes[55^3]=bytes[52]=DD
+  // We want: bytes[54^3]=0x0A (newline), bytes[55^3]=0x80 (padding)
+  // So: bytes[53]=0x0A, bytes[52]=0x80 -> coin[13]=0x????0A80
+  coin[13] = ((n & 0xFFFFu) << 16) | 0x0A80u; // Top 2 bytes: variable, bottom: 0x80 0x0A
+
+  // Compute SHA1 hash
+# define T u32_t
+# define C(c) (c)
+# define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
+# define DATA(idx) coin[idx]
+# define HASH(idx) hash[idx]
+  CUSTOM_SHA1_CODE();
+# undef T
+# undef C
+# undef ROTATE
+# undef DATA
+# undef HASH
+
+  // Check if this is a valid DETI coin
+  if(hash[0] == 0xAAD20250u)
+    store_coin(coins_storage_area, coin, hash);
+}
+
+//
+// Kernel that tries all possible values for one character position
+//
+// scan_position is a message byte position; only 12..53 can be scanned (0..11 hold the
+// fixed prefix, 54 the newline and 55 the padding byte), other values make the kernel return
+//
+
+extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
+void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2, int scan_position)
+{
+  u32_t coin[14];
+  u32_t hash[5];
+  u32_t n;
+
+  // Without a byte to vary every iteration would hash (and store) the same message
+  if(scan_position < 12 || scan_position > 53)
+    return;
+
+  n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;
+
+  // Initialize coin template (with byte swap for idx ^ 3 convention)
+  coin[0] = 0x44455449u; // "DETI" with byte swap
+  coin[1] = 0x20636F69u; // " coi" with byte swap
+  coin[2] = 0x6E203220u; // "n 2 " with byte swap
+
+  // Variable part
+  coin[3] = param1;
+  coin[4] = param2;
+  coin[5] = n >> 8; // High bits of n
+  coin[6] = blockIdx.x;
+  coin[7] = threadIdx.x;
+  coin[8] = param1 ^ param2;
+  coin[9] = n & 0xFFu; // Low 8 bits of n
+  coin[10] = param1 + n;
+  coin[11] = param2 - n;
+  coin[12] = (n * 0x9E3779B9u);
+  coin[13] = ((n & 0xFFFFu) << 16) | 0x0A80u; // Top 2 bytes: variable, bottom: 0x80 0x0A
+
+  // With the idx ^ 3 convention message byte p is memory byte p ^ 3, i.e. it sits in
+  // word p / 4 at bit offset 8 * (3 - p % 4)
+  const u32_t word_idx = (u32_t)scan_position / 4u;
+  const u32_t shift = (3u - (u32_t)scan_position % 4u) * 8u;
+  const u32_t mask = ~(0xFFu << shift);
+
+  // Try all possible values for the scan position (0-255)
+  // This allows exploring a full byte range in a single kernel launch
+  for(u32_t val = 0; val < 256u; val++)
+  {
+    // Make sure we don't use newline in the middle
+    if(val == 0x0Au)
+      continue;
+
+    // Insert the test value at the scan position
+    coin[word_idx] = (coin[word_idx] & mask) | (val << shift);
+
+    // Compute SHA1 hash
+# define T u32_t
+# define C(c) (c)
+# define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
+# define DATA(idx) coin[idx]
+# define HASH(idx) hash[idx]
+    CUSTOM_SHA1_CODE();
+# undef T
+# undef C
+# undef ROTATE
+# undef DATA
+# undef HASH
+
+    // Check if valid coin
+    if(hash[0] == 0xAAD20250u)
+      store_coin(coins_storage_area, coin, hash);
+  }
+}
+