added CUDA implementation, changes needed

This commit is contained in:
RubenCGomes 2025-11-02 22:39:53 +00:00
parent 845884a11c
commit 0c2c534f7c
No known key found for this signature in database
GPG Key ID: 0D213021197E3EE0
2 changed files with 489 additions and 0 deletions

304
aad_coin_miner_cuda.c Normal file
View File

@ -0,0 +1,304 @@
//
// Arquiteturas de Alto Desempenho 2025/2026
//
// DETI Coin Miner - CUDA implementation with histograms
//
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include "aad_data_types.h"
#include "aad_utilities.h"
#include "aad_sha1_cpu.h"
#include "aad_cuda_utilities.h"
#include "aad_vault.h"
#define COINS_STORAGE_SIZE 1024u
#define MAX_HISTOGRAM_BINS 100
// Stop flag: starts at 1 and is cleared by signal_handler(); the mining loop
// polls it once per kernel launch. It is a volatile sig_atomic_t because that
// is the only object type the C standard guarantees can be written safely
// from an asynchronous signal handler.
static volatile sig_atomic_t keep_running = 1;

//
// Signal handler: request a graceful stop of the mining loop
//
// signum is ignored; every signal routed here has the same effect.
//
void signal_handler(int signum)
{
  (void)signum;
  keep_running = 0;
}
// Histogram data structures
//
// Running summary of a stream of samples. Only aggregate statistics are kept
// (count, minimum, maximum and sum); the bins[] array is zeroed by
// histogram_init() and is not populated by any function below.
typedef struct {
  u32_t bins[MAX_HISTOGRAM_BINS]; // reserved for a binned view; always zero for now
  u32_t count;                    // number of samples added so far
  double min_value;               // smallest sample seen (meaningful when count > 0)
  double max_value;               // largest sample seen (meaningful when count > 0)
  double sum;                     // sum of all samples; sum / count is the mean
} histogram_t;

//
// Reset a histogram to the empty state
//
static void histogram_init(histogram_t *h)
{
  memset(h->bins, 0, sizeof(h->bins));
  h->count = 0;
  h->min_value = 1e99;
  h->max_value = 0.0;
  h->sum = 0.0;
}

//
// Record one sample: updates count, minimum, maximum and running sum
//
static void histogram_add(histogram_t *h, double value)
{
  if(h->count == 0)
  {
    // the first sample defines both extremes, so negative samples are handled too
    h->min_value = value;
    h->max_value = value;
  }
  else
  {
    if(value < h->min_value)
      h->min_value = value;
    if(value > h->max_value)
      h->max_value = value;
  }
  h->sum += value;
  h->count++;
}

//
// Print the summary statistics of a histogram to stdout
//
// title  - heading printed before the statistics
// n_bins - requested number of bins; accepted for interface compatibility but
//          not used, because individual samples are not retained
//
static void histogram_print(histogram_t *h, const char *title, int n_bins)
{
  (void)n_bins;
  if(h->count == 0)
  {
    printf("%s: No data\n", title);
    return;
  }
  printf("\n%s:\n", title);
  printf(" Count: %u\n", h->count);
  printf(" Min: %.6f\n", h->min_value);
  printf(" Max: %.6f\n", h->max_value);
  // true arithmetic mean (the midpoint of min and max is not an average)
  printf(" Avg: %.6f\n", h->sum / (double)h->count);
}
//
// Rebuild a complete 14-word coin from one record of the storage area
//
// stored_data - the 11 variable words saved for the coin; they become coin[3..13]
// coin        - output; coin[0..2] receive the constant "DETI coin 2 " prefix,
//               packed so that message byte i is read at byte offset (i ^ 3)
//               (the values must match the ones used by the kernel)
//
static void reconstruct_coin(u32_t *stored_data, u32_t coin[14])
{
  u32_t *dst = coin;
  u32_t *src = stored_data;
  int remaining;

  // constant prefix
  *dst++ = 0x44455449u; // "DETI"
  *dst++ = 0x20636F69u; // " coi"
  *dst++ = 0x6E203220u; // "n 2 "
  // variable words, copied verbatim in storage order
  for(remaining = 11; remaining > 0; remaining--)
    *dst++ = *src++;
}
//
// Mine DETI coins using CUDA
//
static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
{
cuda_data_t cd;
u32_t *host_storage;
u64_t attempts = 0;
u32_t coins_found = 0;
u32_t kernel_runs = 0;
// Histograms
histogram_t time_histogram;
histogram_t coins_histogram;
double *kernel_times = NULL;
u32_t *kernel_coin_counts = NULL;
u32_t histogram_capacity = 10000;
histogram_init(&time_histogram);
histogram_init(&coins_histogram);
kernel_times = (double *)malloc(histogram_capacity * sizeof(double));
kernel_coin_counts = (u32_t *)malloc(histogram_capacity * sizeof(u32_t));
// Initialize CUDA
cd.device_number = 0;
cd.cubin_file_name = "coin_miner_cuda_kernel.cubin";
cd.kernel_name = use_scan_kernel ? "mine_deti_coins_scan_kernel" : "mine_deti_coins_kernel";
cd.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t);
cd.data_size[1] = 0;
initialize_cuda(&cd);
host_storage = (u32_t *)cd.host_data[0];
// Kernel configuration
cd.block_dim_x = RECOMENDED_CUDA_BLOCK_SIZE;
cd.grid_dim_x = 4096; // Large grid for maximum GPU utilization
u32_t n_threads = cd.grid_dim_x * cd.block_dim_x;
printf("Mining DETI coins using CUDA...\n");
printf("Grid: %u blocks × %u threads = %u total threads\n",
cd.grid_dim_x, cd.block_dim_x, n_threads);
printf("Kernel: %s\n", cd.kernel_name);
printf("Press Ctrl+C to stop\n\n");
u32_t param1 = (u32_t)time(NULL);
u32_t param2 = 0x12345678u;
int scan_pos = 12;
time_measurement();
time_measurement();
double start_time = wall_time_delta();
double last_report_time = 0.0;
while(keep_running && (max_attempts == 0 || attempts < max_attempts))
{
// Initialize storage area
host_storage[0] = 1u; // First unused index
// Copy to device
host_to_device_copy(&cd, 0);
// Set kernel arguments
cd.n_kernel_arguments = use_scan_kernel ? 4 : 3;
cd.arg[0] = &cd.device_data[0];
cd.arg[1] = &param1;
cd.arg[2] = &param2;
if(use_scan_kernel)
cd.arg[3] = &scan_pos;
// Launch kernel and measure time
time_measurement();
double kernel_start = cpu_time_delta();
lauch_kernel(&cd);
time_measurement();
double kernel_end = cpu_time_delta();
double kernel_time = kernel_end - kernel_start;
// Copy results back
device_to_host_copy(&cd, 0);
// Process found coins
u32_t n_coins_this_kernel = 0;
u32_t n_stored = (host_storage[0] - 1) / 14;
if(n_stored > 0 && host_storage[0] < COINS_STORAGE_SIZE)
{
printf("DEBUG: host_storage[0] = %u, n_stored = %u\n", host_storage[0], n_stored);
for(u32_t i = 0; i < n_stored; i++)
{
u32_t coin[14];
reconstruct_coin(&host_storage[1 + i * 14], coin);
// Verify it's actually a valid coin
u32_t hash[5];
sha1(coin, hash);
printf("DEBUG: Coin %u - hash[0] = 0x%08X (expected 0xAAD20250)\n", i, hash[0]);
// Print the coin as string
if(i == 0) {
printf("DEBUG: First coin content: ");
u08_t *bytes = (u08_t *)coin;
for(int j = 0; j < 55; j++) {
char c = bytes[j ^ 3];
if(c >= 32 && c <= 126)
printf("%c", c);
else
printf("[0x%02X]", (u08_t)c);
}
printf("\n");
}
if(hash[0] == 0xAAD20250u)
{
coins_found++;
n_coins_this_kernel++;
printf("COIN FOUND! (kernel %u, coin %u in this kernel)\n",
kernel_runs, n_coins_this_kernel);
save_coin(coin);
}
}
}
// Update histograms
if(kernel_runs < histogram_capacity)
{
kernel_times[kernel_runs] = kernel_time;
kernel_coin_counts[kernel_runs] = n_coins_this_kernel;
}
histogram_add(&time_histogram, kernel_time);
histogram_add(&coins_histogram, (double)n_coins_this_kernel);
// Update counters
kernel_runs++;
if(use_scan_kernel)
attempts += n_threads * 256; // Each thread tries 256 values
else
attempts += n_threads;
// Update parameters for next iteration
param1++;
param2 = param2 ^ 0x9E3779B9u;
if(use_scan_kernel)
scan_pos = (scan_pos + 1) % 42 + 12; // Cycle through positions 12-53
// Print progress every second
time_measurement();
double current_time = wall_time_delta() - start_time;
if(current_time - last_report_time >= 1.0)
{
double rate = attempts / current_time;
printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u, Avg time: %.6f s\n",
(unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs,
current_time / kernel_runs);
last_report_time = current_time;
}
}
time_measurement();
double total_time = wall_time_delta() - start_time;
printf("\n=== Mining Statistics ===\n");
printf("Total attempts: %llu\n", (unsigned long long)attempts);
printf("Total time: %.2f seconds\n", total_time);
printf("Average rate: %.2f attempts/second\n", attempts / total_time);
printf("Coins found: %u\n", coins_found);
printf("Kernel launches: %u\n", kernel_runs);
// Print histograms
histogram_print(&time_histogram, "Kernel Execution Time Histogram", 20);
histogram_print(&coins_histogram, "Coins Found Per Kernel Histogram", 10);
// Save detailed histogram data
FILE *fp = fopen("cuda_kernel_stats.csv", "w");
if(fp != NULL)
{
fprintf(fp, "kernel_id,time_seconds,coins_found\n");
u32_t n_to_save = (kernel_runs < histogram_capacity) ? kernel_runs : histogram_capacity;
for(u32_t i = 0; i < n_to_save; i++)
{
fprintf(fp, "%u,%.9f,%u\n", i, kernel_times[i], kernel_coin_counts[i]);
}
fclose(fp);
printf("\nDetailed statistics saved to cuda_kernel_stats.csv\n");
}
// Save any remaining coins
save_coin(NULL);
// Cleanup
free(kernel_times);
free(kernel_coin_counts);
terminate_cuda(&cd);
}
//
// Program entry point
//
// usage: program [max_attempts [scan]]
//   max_attempts - decimal number of candidates to try; 0 or absent means
//                  "run until interrupted with Ctrl+C"
//   scan         - the literal word "scan" selects the scan kernel
//
// Returns 0 after mining finishes, 1 when max_attempts is not a valid number.
//
int main(int argc, char *argv[])
{
  u64_t max_attempts = 0;
  int use_scan_kernel = 0;

  signal(SIGINT, signal_handler);
  if(argc > 1)
  {
    char *end = NULL;

    max_attempts = strtoull(argv[1], &end, 10);
    // strtoull() quietly yields 0 for garbage and wraps negative input around;
    // either would silently turn a typo into an (almost) endless run
    if(end == argv[1] || *end != '\0' || argv[1][0] == '-')
    {
      fprintf(stderr, "%s: invalid number of attempts '%s'\n", argv[0], argv[1]);
      fprintf(stderr, "usage: %s [max_attempts [scan]]\n", argv[0]);
      return 1;
    }
  }
  if(argc > 2)
  {
    if(strcmp(argv[2], "scan") == 0)
    {
      use_scan_kernel = 1;
      printf("Using scan kernel (tries 256 values per thread)\n");
    }
    else
    {
      fprintf(stderr, "%s: ignoring unknown option '%s' (expected 'scan')\n", argv[0], argv[2]);
    }
  }
  mine_coins_cuda(max_attempts, use_scan_kernel);
  return 0;
}

View File

@ -0,0 +1,185 @@
//
// Arquiteturas de Alto Desempenho 2025/2026
//
// DETI Coin Miner - CUDA kernel (optimized for mining)
//
#include "aad_sha1.h"
typedef unsigned int u32_t;
typedef unsigned char u08_t;
//
// Optimized CUDA kernel for DETI coin mining
// Each thread generates its own message based on thread coordinates and external parameters
//
// Launch layout: one-dimensional grid of one-dimensional blocks (only the .x
// coordinates are read); __launch_bounds__ caps the block size at
// RECOMENDED_CUDA_BLOCK_SIZE. No shared memory and no barriers are used.
//
// coins_storage_area - 1024 words of device memory; word 0 holds the index of the
//                      first unused word (the host sets it to 1 before the launch)
//                      and each hit appends a 14-word record: coin[3..13], hash[1..3]
// param1, param2     - values mixed into the message so successive launches differ
//
// Byte convention: message byte i is the byte at offset (i ^ 3) of coin[], i.e.
// the most significant byte of each 32-bit word comes first in the message.
//
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2)
{
  // NOTE(review): 1024 must stay equal to the host's COINS_STORAGE_SIZE
  const u32_t storage_words = 1024u;
  const u32_t record_words = 14u;
  u32_t coin[14];
  u32_t hash[5];
  u32_t n, warp_id, lane_id;

  // Get thread coordinates
  n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;
  warp_id = n >> 5u;
  lane_id = n & 31u;

  // Initialize coin template: "DETI coin 2 " + variable + "\n\x80"
  // Use byte-swapped format to match host expectations (idx ^ 3)
  coin[0] = 0x44455449u; // "DETI" with byte swap
  coin[1] = 0x20636F69u; // " coi" with byte swap
  coin[2] = 0x6E203220u; // "n 2 " with byte swap

  // Variable part: encode thread ID and parameters
  // This ensures each thread works on a different message
  coin[3] = n;                  // Global thread ID
  coin[4] = param1;             // External parameter 1
  coin[5] = param2;             // External parameter 2
  coin[6] = blockIdx.x;         // Block index
  coin[7] = threadIdx.x;        // Thread index
  coin[8] = warp_id;            // Warp ID
  coin[9] = lane_id;            // Lane ID
  coin[10] = n ^ param1;        // XOR combination
  coin[11] = n ^ param2;        // XOR combination
  coin[12] = (n * 0x9E3779B9u); // Hash-like mixing

  // Last word holds message bytes 52..55, most significant byte first:
  //   bits 31..16 -> message bytes 52 and 53 (variable)
  //   bits 15..8  -> message byte 54 = 0x0A (the terminating newline)
  //   bits  7..0  -> message byte 55 = 0x80 (SHA1 padding)
  coin[13] = ((n & 0xFFFFu) << 16) | 0x0A80u;

  // Compute SHA1 hash
# define T            u32_t
# define C(c)         (c)
# define ROTATE(x,n)  (((x) << (n)) | ((x) >> (32 - (n))))
# define DATA(idx)    coin[idx]
# define HASH(idx)    hash[idx]
  CUSTOM_SHA1_CODE();
# undef T
# undef C
# undef ROTATE
# undef DATA
# undef HASH

  // Check if this is a valid DETI coin
  if(hash[0] != 0xAAD20250u)
    return;

  // The payload words are raw binary, so they may contain 0x0A. The only
  // newline of a coin must be the terminator at message byte 54 (the scan
  // kernel enforces the same rule), so such candidates are dropped. This runs
  // only on a hash match, which is rare, so it costs nothing in practice.
  bool newline_in_payload = false;
  for(u32_t w = 3u; w < 14u; w++)
  {
    // in word 13 only the two most significant bytes belong to the payload
    for(u32_t b = (w == 13u) ? 2u : 0u; b < 4u; b++)
      if(((coin[w] >> (8u * b)) & 0xFFu) == 0x0Au)
        newline_in_payload = true;
  }
  if(newline_in_payload)
    return;

  // Found a coin! Reserve a record atomically
  u32_t idx = atomicAdd(coins_storage_area, record_words);
  // Make sure we don't write outside buffer (word 0 keeps growing regardless,
  // which lets the host detect that records were lost)
  if(idx < storage_words - record_words)
  {
    // Store the coin data (only variable parts needed)
    for(u32_t k = 0u; k < 11u; k++)
      coins_storage_area[idx + k] = coin[3u + k];
    // Store hash value for verification
    coins_storage_area[idx + 11u] = hash[1];
    coins_storage_area[idx + 12u] = hash[2];
    coins_storage_area[idx + 13u] = hash[3];
  }
}
//
// Kernel that tries all possible values for one character position
//
// Launch layout: one-dimensional grid of one-dimensional blocks (only the .x
// coordinates are read); __launch_bounds__ caps the block size at
// RECOMENDED_CUDA_BLOCK_SIZE. No shared memory and no barriers are used.
//
// coins_storage_area - 1024 words of device memory; word 0 holds the index of the
//                      first unused word (the host sets it to 1 before the launch)
//                      and each hit appends a 14-word record: coin[3..13], hash[1..3]
// param1, param2     - values mixed into the message so successive launches differ
// scan_position      - message byte (12..53) that takes every value 0..255 except
//                      0x0A; for any other position the thread hashes its
//                      unmodified message exactly once
//
// Byte convention: message byte i is the byte at offset (i ^ 3) of coin[], i.e.
// it sits in word i / 4 at bit offset 8 * (3 - i % 4).
//
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2, int scan_position)
{
  // NOTE(review): 1024 must stay equal to the host's COINS_STORAGE_SIZE
  const u32_t storage_words = 1024u;
  const u32_t record_words = 14u;
  u32_t coin[14];
  u32_t hash[5];
  u32_t n;

  n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;

  // Initialize coin template (with byte swap for idx ^ 3 convention)
  coin[0] = 0x44455449u; // "DETI" with byte swap
  coin[1] = 0x20636F69u; // " coi" with byte swap
  coin[2] = 0x6E203220u; // "n 2 " with byte swap

  // Variable part
  coin[3] = param1;
  coin[4] = param2;
  coin[5] = n >> 8;      // High bits of n
  coin[6] = blockIdx.x;
  coin[7] = threadIdx.x;
  coin[8] = param1 ^ param2;
  coin[9] = n & 0xFFu;   // Low 8 bits of n
  coin[10] = param1 + n;
  coin[11] = param2 - n;
  coin[12] = (n * 0x9E3779B9u);
  // bits 31..16: message bytes 52, 53 (variable); 0x0A: newline at byte 54; 0x80: padding at byte 55
  coin[13] = ((n & 0xFFFFu) << 16) | 0x0A80u;

  // Loop-invariant description of the scanned byte. Positions 52 and 53 live
  // in the upper half of coin[13] and are valid too; byte 54 (newline) and
  // byte 55 (padding) must never be touched.
  const bool scan_enabled = (scan_position >= 12 && scan_position <= 53);
  const u32_t word_idx = scan_enabled ? ((u32_t)scan_position >> 2) : 0u;
  const u32_t shift = scan_enabled ? 8u * (3u - ((u32_t)scan_position & 3u)) : 0u;
  const u32_t keep_mask = ~(0xFFu << shift);
  // without a valid position every iteration would hash the same message, so do it once
  const u32_t n_values = scan_enabled ? 256u : 1u;

  // Try all possible values for the scan position (0-255)
  // This allows exploring a full byte range in a single kernel launch
  for(u32_t val = 0u; val < n_values; val++)
  {
    if(scan_enabled)
    {
      // Make sure we don't use newline in the middle: test the value being
      // inserted, not some other byte of the word
      if(val == 0x0Au)
        continue;
      // Insert the test value at the scan position
      coin[word_idx] = (coin[word_idx] & keep_mask) | (val << shift);
    }

    // Compute SHA1 hash
#   define T            u32_t
#   define C(c)         (c)
#   define ROTATE(x,n)  (((x) << (n)) | ((x) >> (32 - (n))))
#   define DATA(idx)    coin[idx]
#   define HASH(idx)    hash[idx]
    CUSTOM_SHA1_CODE();
#   undef T
#   undef C
#   undef ROTATE
#   undef DATA
#   undef HASH

    // Check if valid coin
    if(hash[0] != 0xAAD20250u)
      continue;

    // The remaining payload bytes are raw binary and may also contain 0x0A;
    // reject such candidates (checked only on the rare hash match)
    bool newline_in_payload = false;
    for(u32_t w = 3u; w < 14u; w++)
    {
      // in word 13 only the two most significant bytes belong to the payload
      for(u32_t b = (w == 13u) ? 2u : 0u; b < 4u; b++)
        if(((coin[w] >> (8u * b)) & 0xFFu) == 0x0Au)
          newline_in_payload = true;
    }
    if(newline_in_payload)
      continue;

    // Reserve a record atomically; word 0 keeps growing even when the area is
    // full, which lets the host detect that records were lost
    u32_t idx = atomicAdd(coins_storage_area, record_words);
    if(idx < storage_words - record_words)
    {
      // variable part of the coin
      for(u32_t k = 0u; k < 11u; k++)
        coins_storage_area[idx + k] = coin[3u + k];
      // part of the hash, for verification
      coins_storage_area[idx + 11u] = hash[1];
      coins_storage_area[idx + 12u] = hash[2];
      coins_storage_area[idx + 13u] = hash[3];
    }
  }
}