From 7350d6be95ee7c12ef11d4fbd553d74099c4fec6 Mon Sep 17 00:00:00 2001 From: RubenCGomes Date: Wed, 26 Nov 2025 18:09:13 +0000 Subject: [PATCH] Refactor CUDA miner: optimize kernel and improve coin storage handling Signed-off-by: RubenCGomes --- aad_coin_miner_cuda.c | 194 ++++++++++++---------------------- aad_coin_miner_cuda_kernel.cu | 169 +++++++++++++++++------------ makefile | 7 +- 3 files changed, 169 insertions(+), 201 deletions(-) diff --git a/aad_coin_miner_cuda.c b/aad_coin_miner_cuda.c index 4e7c374..fdd52f4 100644 --- a/aad_coin_miner_cuda.c +++ b/aad_coin_miner_cuda.c @@ -1,7 +1,7 @@ // // Arquiteturas de Alto Desempenho 2025/2026 // -// DETI Coin Miner - CUDA implementation with histograms +// DETI Coin Miner - Host Code // #include @@ -11,173 +11,123 @@ #include #include #include "aad_data_types.h" -#include "aad_utilities.h" #include "aad_sha1_cpu.h" #include "aad_cuda_utilities.h" #include "aad_vault.h" -#define COINS_STORAGE_SIZE 1024u -#define MAX_HISTOGRAM_BINS 100 +#define COINS_STORAGE_SIZE 2048u // Increased buffer slightly static volatile int keep_running = 1; -void signal_handler(int signum) -{ +void signal_handler(int signum) { (void)signum; keep_running = 0; } -// Get current wall time in seconds -static double get_wall_time(void) -{ +static double get_wall_time(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9; } -// Coin reconstruction from stored data -static void reconstruct_coin(u32_t *stored_data, u32_t coin[14]) -{ - // Simply copy the complete coin data from storage - for(int i = 0; i < 14; i++) - coin[i] = stored_data[i]; -} - -// -// Mine DETI coins using CUDA -// static void mine_coins_cuda(u64_t max_attempts, double max_time) { cuda_data_t cd; u32_t *host_storage; u64_t attempts = 0; - u32_t coins_found = 0; - u32_t kernel_runs = 0; + u32_t coins_found_total = 0; // Initialize CUDA + memset(&cd, 0, sizeof(cd)); cd.device_number = 0; cd.cubin_file_name = "coin_miner_cuda_kernel.cubin"; cd.kernel_name = "mine_deti_coins_kernel"; + + // Allocate memory for results [ Counter (1 u32) | Data ... ] cd.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t); cd.data_size[1] = 0; initialize_cuda(&cd); - host_storage = (u32_t *)cd.host_data[0]; - // Kernel configuration - cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE; - cd.grid_dim_x = 400; // Large grid for maximum GPU utilization + // Configure Launch Dimensions + // Maximizing occupancy: + cd.block_dim_x = RECOMMENDED_CUDA_BLOCK_SIZE; // Usually 128 or 256 + cd.grid_dim_x = 80 * 4; // High number of blocks to hide latency - u32_t n_threads = cd.grid_dim_x * cd.block_dim_x; + u32_t total_threads = cd.grid_dim_x * cd.block_dim_x; + u32_t attempts_per_thread = 4096; // Work per kernel launch - printf("Mining DETI coins using CUDA...\n"); - printf("Grid: %u blocks × %u threads = %u total threads\n", - cd.grid_dim_x, cd.block_dim_x, n_threads); - printf("Kernel: %s\n", cd.kernel_name); - - if(max_attempts > 0 && max_time > 0) - printf("Will stop after %llu attempts OR %.2f seconds (whichever comes first)\n", - (unsigned long long)max_attempts, max_time); - else if(max_attempts > 0) - printf("Will stop after %llu attempts\n", (unsigned long long)max_attempts); - else if(max_time > 0) - printf("Will stop after %.2f seconds\n", max_time); - else - printf("Running indefinitely until Ctrl+C...\n"); - - printf("Press Ctrl+C to stop\n\n"); + printf("Starting CUDA Miner on %s\n", cd.device_name); + printf("Threads: %u, Attempts/Thread: %u\n", total_threads, attempts_per_thread); u64_t base_nonce = 0; - u32_t attempts_per_thread = 1024 * 16; // Increased attempts per thread - double start_time = get_wall_time(); - time_measurement(); + + // Arguments pointers + cd.n_kernel_arguments = 3; + cd.arg[0] = &cd.device_data[0]; + cd.arg[1] = &base_nonce; + cd.arg[2] = &attempts_per_thread; while(keep_running) { - // Check stopping conditions - if(max_attempts > 0 && attempts >= max_attempts) - break; - - double elapsed = get_wall_time() - start_time; - if(max_time > 0 && elapsed >= max_time) - break; - - // Initialize storage area - host_storage[0] = 1u; // First unused index - - // Copy to device + // 1. Reset storage counter + host_storage[0] = 1u; // Index 0 is the atomic counter. Start data at index 1. host_to_device_copy(&cd, 0); - // Set kernel arguments - cd.n_kernel_arguments = 2; - cd.arg[0] = &cd.device_data[0]; - cd.arg[1] = &base_nonce; - cd.arg[2] = &attempts_per_thread; - - // Launch the CUDA kernel + // 2. Launch Kernel launch_kernel(&cd); - // Copy results back + // 3. Retrieve Results device_to_host_copy(&cd, 0); - // Process found coins - u32_t n_coins_this_kernel = 0; - u32_t n_stored = (host_storage[0] - 1) / 14; + // 4. Process Found Coins + u32_t next_write_idx = host_storage[0]; + u32_t num_u32_written = next_write_idx - 1; - if(n_stored > 0 && host_storage[0] < COINS_STORAGE_SIZE) + // Each coin is 14 u32 words + if(num_u32_written >= 14) { - for(u32_t i = 0; i < n_stored; i++) - { - u32_t coin[14]; - reconstruct_coin(&host_storage[1 + i * 14], coin); + int coins_in_batch = num_u32_written / 14; + for(int c = 0; c < coins_in_batch; c++) + { + u32_t found_coin[14]; + // Copy from host buffer to temp array + for(int w=0; w<14; w++) { + found_coin[w] = host_storage[1 + (c * 14) + w]; + } - coins_found++; - n_coins_this_kernel++; - printf("COIN FOUND! (kernel %u, coin %u in this kernel). Total coins:%u\n", - kernel_runs, n_coins_this_kernel, coins_found); - save_coin(coin); - } + // Verify/Save using required function + save_coin(found_coin); + coins_found_total++; + printf("Coin Found! Total: %u\n", coins_found_total); + } } - // Update counters - kernel_runs++; - u64_t attempts_this_launch = (u64_t)n_threads * attempts_per_thread; - attempts += attempts_this_launch; - base_nonce += attempts_this_launch; + // 5. Update Progress + u64_t batch_attempts = (u64_t)total_threads * attempts_per_thread; + attempts += batch_attempts; + base_nonce += batch_attempts; // Ensure next kernel uses new nonces + + // 6. Check Limits + if((max_attempts > 0 && attempts >= max_attempts) || + (max_time > 0 && (get_wall_time() - start_time) >= max_time)) { + break; + } } - time_measurement(); - double total_time = cpu_time_delta(); - - printf("\n=== Mining Statistics ===\n"); - printf("Total attempts: %llu\n", (unsigned long long)attempts); - printf("Total time: %.2f seconds\n", total_time); - printf("Average rate: %.2f attempts/second\n", attempts / total_time); - printf("Coins found: %u\n", coins_found); - printf("Kernel launches: %u\n", kernel_runs); - - // Save any remaining coins - save_coin(NULL); + // Cleanup + double total_time = get_wall_time() - start_time; + printf("\nMining Finished.\n"); + printf("Attempts: %llu\n", (unsigned long long)attempts); + printf("Time: %.4fs\n", total_time); + printf("Hashrate: %.2f MH/s\n", (attempts / total_time) / 1000000.0); + save_coin(NULL); // Flush vault terminate_cuda(&cd); } -void print_usage(const char *prog_name) -{ - printf("Usage: %s [OPTIONS]\n", prog_name); - printf("Options:\n"); - printf(" -a Maximum number of attempts\n"); - printf(" -t Maximum time in seconds\n"); - printf(" -h Show this help message\n"); - printf("\nExamples:\n"); - printf(" %s -a 1000000000 # Run for 1B attempts\n", prog_name); - printf(" %s -t 60 # Run for 60 seconds\n", prog_name); - printf(" %s -a 1000000000 -t 60 # Stop at 1B attempts OR 60s (whichever first)\n", prog_name); - printf(" %s # Run indefinitely until Ctrl+C\n", prog_name); -} - int main(int argc, char *argv[]) { u64_t max_attempts = 0; @@ -186,27 +136,17 @@ int main(int argc, char *argv[]) signal(SIGINT, signal_handler); - // Parse command line options - while((opt = getopt(argc, argv, "a:t:h")) != -1) + while((opt = getopt(argc, argv, "a:t:")) != -1) { - switch(opt) - { - case 'a': - max_attempts = strtoull(optarg, NULL, 10); - break; - case 't': - max_time = atof(optarg); - break; - case 'h': - print_usage(argv[0]); - return 0; + switch(opt) { + case 'a': max_attempts = strtoull(optarg, NULL, 10); break; + case 't': max_time = atof(optarg); break; default: - print_usage(argv[0]); + fprintf(stderr, "Usage: %s -a -t \n", argv[0]); return 1; } } mine_coins_cuda(max_attempts, max_time); - return 0; -} +} \ No newline at end of file diff --git a/aad_coin_miner_cuda_kernel.cu b/aad_coin_miner_cuda_kernel.cu index 7c0c873..0d16421 100644 --- a/aad_coin_miner_cuda_kernel.cu +++ b/aad_coin_miner_cuda_kernel.cu @@ -1,96 +1,127 @@ // // Arquiteturas de Alto Desempenho 2025/2026 // -// DETI Coin Miner - CUDA kernel (optimized for mining) +// DETI Coin Miner - CUDA kernel (Optimized) // #include "aad_sha1.h" #include "aad_data_types.h" // -// Optimized CUDA kernel for DETI coin mining -// Each thread generates coins using the same approach as CPU/SIMD miners +// Optimized CUDA kernel // - -extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1) +extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE, 1) void mine_deti_coins_kernel(u32_t *coins_storage_area, u64_t base_nonce, u32_t attempts_per_thread) { - u32_t coin[14]; + u32_t coin[16]; // SHA1 requires 16 words (64 bytes) u32_t hash[5]; - u32_t n; - u08_t *bytes = (u08_t *)coin; - // Get thread index (used as offset from base counter) - n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x; + // 1. Initialize Fixed Prefix: "DETI coin 2 " (12 bytes) + // We construct this directly into the u32 array. + // Note: We assume the system is Little Endian, but SHA1 input via macro usually handles bytes. + // Ideally, we pack bytes: 'D','E','T','I' -> 0x44455449 - // Initialize coin template: "DETI coin 2 " + variable + "\n\x80" - // Use byte-swapped format to match host expectations (idx ^ 3) - coin[0] = ('D' << 24) + ('E' << 16) + ('T' << 8) + 'I'; - coin[1] = (' ' << 24) + ('c' << 16) + ('o' << 8) + 'i'; - coin[2] = ('n' << 24) + (' ' << 16) + ('2' << 8) + ' '; + // Word 0: "DETI" + coin[0] = (u32_t)'D' << 24 | (u32_t)'E' << 16 | (u32_t)'T' << 8 | (u32_t)'I'; + // Word 1: " coi" + coin[1] = (u32_t)' ' << 24 | (u32_t)'c' << 16 | (u32_t)'o' << 8 | (u32_t)'i'; + // Word 2: "n 2 " + coin[2] = (u32_t)'n' << 24 | (u32_t)' ' << 16 | (u32_t)'2' << 8 | (u32_t)' '; - // Fill the variable part of the coin with a pattern - for(int i = 3; i < 14; i++) - coin[i] = 0x41414141; // 'AAAA' + // 2. Initialize Variable Part (Bytes 12 to 53) + // Fill with a safe printable char ' ' (0x20) + #pragma unroll + for(int i = 3; i <= 12; i++) { + coin[i] = 0x20202020; + } + // Word 13 is partial variable + suffix + // Bytes 52, 53 are variable. Byte 54 is '\n', Byte 55 is 0x80 (Padding) + coin[13] = 0x20200A80; - // End with newline and padding - bytes[0x36 ^ 3] = '\n'; // Position 54 - bytes[0x37 ^ 3] = 0x80; // Position 55 + // 3. Initialize SHA1 Length Padding + // Message is 55 bytes. Length in bits = 55 * 8 = 440. + // SHA1 puts length at the very end (Word 15). + coin[14] = 0x00000000; + coin[15] = 440; - for(u32_t i = 0; i < attempts_per_thread; ++i) { - // Initialize variable part (positions 12-53, 42 bytes) - // Start with A-Z pattern like CPU/SIMD miners - for(int j = 12; j < 54; j++) - bytes[j ^ 3] = 'A' + ((j - 12) % 26); + // 4. Thread Unique Initialization + // We use the thread ID to set the initial state of the variable bytes + // to ensure every thread starts at a different point. + u64_t thread_id = (u64_t)blockIdx.x * blockDim.x + threadIdx.x; + u64_t nonce_offset = base_nonce + thread_id * attempts_per_thread; - // Calculate offset based on thread index and parameters - // This creates a unique starting point for each thread - u64_t offset = base_nonce + n + (u64_t)i * gridDim.x * blockDim.x; + // "Seeding" the message with the nonce (Fast update of specific bytes) + // We modify the bytes in words 3 through 12. + // Accessing as byte pointer for easier manipulation + u08_t *byte_ptr = (u08_t*)coin; - // Apply offset to variable part (increment the coin counter) - for(int pos = 53; pos >= 12 && offset > 0; pos--) + // Apply the nonce offset to the message structure (Odometer setup) + // Start modifying from byte 12 + u64_t temp_nonce = nonce_offset; + for (int k = 12; k < 54 && temp_nonce > 0; k++) { + u32_t val = byte_ptr[k ^ 3] + (temp_nonce % 95); // mod 95 to stay in printable ASCII + temp_nonce /= 95; + + if (val > 0x7E) { // Wrap around printable range + val -= 95; + temp_nonce++; // Carry + } + byte_ptr[k ^ 3] = (u08_t)val; + } + + // 5. Mining Loop + for(u32_t attempt = 0; attempt < attempts_per_thread; attempt++) + { + // --- SHA1 HASH CALCULATION --- + #define T u32_t + #define C(c) (c) + #define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n)))) + #define DATA(idx) coin[idx] + #define HASH(idx) hash[idx] + + CUSTOM_SHA1_CODE(); + + #undef T + #undef C + #undef ROTATE + #undef DATA + #undef HASH + + // --- CHECK RESULT --- + // Check for "aad20250" prefix (AAD20250 hex) + if(hash[0] == 0xAAD20250u) { - u08_t *byte = &bytes[pos ^ 3]; - u64_t add = offset % 127; - offset /= 127; + // Found a candidate! Save it. + u32_t idx = atomicAdd(&coins_storage_area[0], 14u); - u32_t val = *byte + add; - u08_t new_val = val % 127; - - // Skip newline character (ASCII 10) in the variable part - if(new_val == '\n') - new_val++; - - *byte = new_val; - offset += val / 127; // Carry + // Boundary check (first word is count, data starts at index 1) + // We normalize the index to be relative to storage start + if(idx < 1024u - 15u) // Ensure space + { + // Store valid coin (14 words = 56 bytes, covers the 55 byte content) + // Adjust idx because coins_storage_area[0] is the counter + for(int w=0; w<14; w++) { + coins_storage_area[idx + w] = coin[w]; + } + } } - // Compute SHA1 hash -# define T u32_t -# define C(c) (c) -# define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n)))) -# define DATA(idx) coin[idx] -# define HASH(idx) hash[idx] - CUSTOM_SHA1_CODE(); -# undef T -# undef C -# undef ROTATE -# undef DATA -# undef HASH + // --- UPDATE MESSAGE (ODOMETER) --- + // Increment the message string for the next attempt + // We only touch the variable bytes. + // Start at byte 53 (just before the \n) and work backwards if carry needed. + // Note: byte_ptr access needs XOR 3 for Endianness correction on arrays treated as words - // Check if this is a valid DETI coin - if(hash[0] == 0xAAD20250u) - { - // Found a coin! Store it atomically - u32_t idx = atomicAdd(coins_storage_area, 14u); - - // Make sure we don't write outside buffer - if(idx < 1024u - 14u) - { - // Store the complete coin data - for(int k = 0; k < 14; k++) - coins_storage_area[idx + k] = coin[k]; - } + int pos = 53; + while (pos >= 12) { + u08_t *b = &byte_ptr[pos ^ 3]; + (*b)++; + if (*b <= 0x7E) { + break; // No carry, done incrementing + } + // Overflow printable range, reset to start of range (0x20) and carry + *b = 0x20; + pos--; } } -} +} \ No newline at end of file diff --git a/makefile b/makefile index 1d1d1ef..261530a 100644 --- a/makefile +++ b/makefile @@ -98,11 +98,8 @@ coin_miner_wasm: aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h -s EXPORT_NAME='CoinMinerModule' \ -s INITIAL_MEMORY=67108864 -benchmark: aad_benchmark.c aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h makefile - cc -march=native -Wall -Wshadow -Werror -O3 $< -o $@ -miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_ocl benchmark +miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_ocl all: sha1_tests sha1_cuda_test sha1_cuda_kernel.cubin \ - coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl \ - benchmark + coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl