Reduce the number of kernel launches to improve efficiency (each thread now makes multiple attempts per launch)

Signed-off-by: RubenCGomes <rlcg@ua.pt>
This commit is contained in:
RubenCGomes 2025-11-22 18:00:30 +00:00
parent a906816cd4
commit 111aa0fa74
No known key found for this signature in database
GPG Key ID: 0D213021197E3EE0
2 changed files with 50 additions and 182 deletions

View File

@ -38,7 +38,7 @@ static void reconstruct_coin(u32_t *stored_data, u32_t coin[14])
//
// Mine DETI coins using CUDA
//
static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
static void mine_coins_cuda(u64_t max_attempts)
{
cuda_data_t cd;
u32_t *host_storage;
@ -49,7 +49,7 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
// Initialize CUDA
cd.device_number = 0;
cd.cubin_file_name = "coin_miner_cuda_kernel.cubin";
cd.kernel_name = use_scan_kernel ? "mine_deti_coins_scan_kernel" : "mine_deti_coins_kernel";
cd.kernel_name = "mine_deti_coins_kernel";
cd.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t);
cd.data_size[1] = 0;
@ -69,12 +69,10 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
printf("Kernel: %s\n", cd.kernel_name);
printf("Press Ctrl+C to stop\n\n");
u32_t param1 = (u32_t)time(NULL);
u32_t param2 = 0x12345678u;
int scan_pos = 12;
u64_t base_nonce = 0;
u32_t attempts_per_thread = 1024 * 8; // Increased attempts per thread
time_measurement();
// double start_time = wall_time_delta();
while(keep_running && (max_attempts == 0 || attempts < max_attempts))
{
@ -85,12 +83,10 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
host_to_device_copy(&cd, 0);
// Set kernel arguments
cd.n_kernel_arguments = use_scan_kernel ? 4 : 3;
cd.n_kernel_arguments = 2;
cd.arg[0] = &cd.device_data[0];
cd.arg[1] = &param1;
cd.arg[2] = &param2;
if(use_scan_kernel)
cd.arg[3] = &scan_pos;
cd.arg[1] = &base_nonce;
cd.arg[2] = &attempts_per_thread;
// Launch the CUDA kernel
launch_kernel(&cd);
@ -104,58 +100,24 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
if(n_stored > 0 && host_storage[0] < COINS_STORAGE_SIZE)
{
printf("DEBUG: host_storage[0] = %u, n_stored = %u\n", host_storage[0], n_stored);
for(u32_t i = 0; i < n_stored; i++)
{
u32_t coin[14];
reconstruct_coin(&host_storage[1 + i * 14], coin);
// Verify it's actually a valid coin
u32_t hash[5];
sha1(coin, hash);
printf("DEBUG: Coin %u - hash[0] = 0x%08X (expected 0xAAD20250)\n", i, hash[0]);
// Print the coin as string
if(i == 0) {
printf("DEBUG: First coin content: ");
u08_t *bytes = (u08_t *)coin;
for(int j = 0; j < 55; j++) {
char c = bytes[j ^ 3];
if(c >= 32 && c <= 126)
printf("%c", c);
else
printf("[0x%02X]", (u08_t)c);
}
printf("\n");
}
if(hash[0] == 0xAAD20250u)
{
coins_found++;
n_coins_this_kernel++;
printf("COIN FOUND! (kernel %u, coin %u in this kernel)\n",
kernel_runs, n_coins_this_kernel);
save_coin(coin);
}
coins_found++;
n_coins_this_kernel++;
printf("COIN FOUND! (kernel %u, coin %u in this kernel). Total coins:%u\n",
kernel_runs, n_coins_this_kernel, coins_found);
save_coin(coin);
}
}
// Update counters
kernel_runs++;
if(use_scan_kernel)
attempts += n_threads * 256; // Each thread tries 256 values
else
attempts += n_threads;
// Update parameters for next iteration
param1++;
param2 = param2 ^ 0x9E3779B9u;
if(use_scan_kernel)
scan_pos = (scan_pos + 1) % 42 + 12; // Cycle through positions 12-53
u64_t attempts_this_launch = (u64_t)n_threads * attempts_per_thread;
attempts += attempts_this_launch;
base_nonce += attempts_this_launch;
}
time_measurement();
@ -178,20 +140,13 @@ static void mine_coins_cuda(u64_t max_attempts, int use_scan_kernel)
//
// Program entry point: installs the SIGINT handler, reads an optional attempt
// limit from the command line, and starts the CUDA miner.
//
//   argv[1] (optional)  maximum number of attempts, parsed as a base-10 unsigned integer
//   argv[2] (optional)  the literal string "scan" sets use_scan_kernel
//
// Always returns 0.
//
// NOTE(review): this listing appears to show both sides of a diff without +/- markers;
// the use_scan_kernel lines and the two-argument mine_coins_cuda() call look like the
// removed (pre-commit) side, the one-argument call like the added side -- confirm
// against the actual file before relying on this block.
//
int main(int argc, char *argv[])
{
// 0 is the default; the mining loop treats max_attempts == 0 as "no limit"
u64_t max_attempts = 0;
int use_scan_kernel = 0;
// register signal_handler for Ctrl+C (SIGINT)
signal(SIGINT, signal_handler);
// no validation of argv[1] is performed here (strtoull's end pointer is not inspected)
if(argc > 1)
max_attempts = strtoull(argv[1], NULL, 10);
if(argc > 2 && strcmp(argv[2], "scan") == 0)
{
use_scan_kernel = 1;
// NOTE(review): the message says 256 values per thread, but the scan kernel shown in
// this listing loops over values 32..126 (95 values) -- confirm which is intended
printf("Using scan kernel (tries 256 values per thread)\n");
}
mine_coins_cuda(max_attempts, use_scan_kernel);
mine_coins_cuda(max_attempts);
return 0;
}

View File

@ -13,7 +13,7 @@
//
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2)
void mine_deti_coins_kernel(u32_t *coins_storage_area, u64_t base_nonce, u32_t attempts_per_thread)
{
u32_t coin[14];
u32_t hash[5];
@ -25,41 +25,45 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
// Initialize coin template: "DETI coin 2 " + variable + "\n\x80"
// Use byte-swapped format to match host expectations (idx ^ 3)
coin[0] = 0x44455449u; // "DETI" with byte swap
coin[1] = 0x20636F69u; // " coi" with byte swap
coin[2] = 0x6E203220u; // "n 2 " with byte swap
coin[0] = ('D' << 24) + ('E' << 16) + ('T' << 8) + 'I';
coin[1] = (' ' << 24) + ('c' << 16) + ('o' << 8) + 'i';
coin[2] = ('n' << 24) + (' ' << 16) + ('2' << 8) + ' ';
// Initialize variable part (positions 12-53, 42 bytes)
// Start with A-Z pattern like CPU/SIMD miners
for(int i = 12; i < 54; i++)
bytes[i ^ 3] = 'A' + ((i - 12) % 26);
// Fill the variable part of the coin with a pattern
for(int i = 3; i < 14; i++)
coin[i] = 0x41414141; // 'AAAA'
// End with newline and padding
bytes[0x36 ^ 3] = '\n'; // Position 54
bytes[0x37 ^ 3] = 0x80; // Position 55
// Calculate offset based on thread index and parameters
// This creates a unique starting point for each thread
u64_t offset = ((u64_t)param1 << 32) | param2;
offset += (u64_t)n;
for(u32_t i = 0; i < attempts_per_thread; ++i) {
// Initialize variable part (positions 12-53, 42 bytes)
// Start with A-Z pattern like CPU/SIMD miners
for(int j = 12; j < 54; j++)
bytes[j ^ 3] = 'A' + ((j - 12) % 26);
// Apply offset to variable part (increment the coin counter)
for(int pos = 53; pos >= 12 && offset > 0; pos--)
{
u08_t *byte = &bytes[pos ^ 3];
u64_t add = offset % 127;
offset /= 127;
// Calculate offset based on thread index and parameters
// This creates a unique starting point for each thread
u64_t offset = base_nonce + n + (u64_t)i * gridDim.x * blockDim.x;
u32_t val = *byte + add;
u08_t new_val = val % 127;
// Apply offset to variable part (increment the coin counter)
for(int pos = 53; pos >= 12 && offset > 0; pos--)
{
u08_t *byte = &bytes[pos ^ 3];
u64_t add = offset % 127;
offset /= 127;
// Skip newline character (ASCII 10) in the variable part
if(new_val == '\n')
new_val++;
u32_t val = *byte + add;
u08_t new_val = val % 127;
*byte = new_val;
offset += val / 127; // Carry
}
// Skip newline character (ASCII 10) in the variable part
if(new_val == '\n')
new_val++;
*byte = new_val;
offset += val / 127; // Carry
}
// Compute SHA1 hash
# define T u32_t
@ -80,104 +84,13 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
// Found a coin! Store it atomically
u32_t idx = atomicAdd(coins_storage_area, 14u);
// Make sure we don't write outside buffer
if(idx < 1024u - 14u)
{
// Store the complete coin data
for(int i = 0; i < 14; i++)
coins_storage_area[idx + i] = coin[i];
}
}
}
//
// Kernel that tries all possible values for one character position
//
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE,1)
void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param2, int scan_position)
{
u32_t coin[14];
u32_t hash[5];
u32_t n;
u08_t *bytes = (u08_t *)coin;
n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;
// Initialize coin template (with byte swap for idx ^ 3 convention)
coin[0] = 0x44455449u; // "DETI" with byte swap
coin[1] = 0x20636F69u; // " coi" with byte swap
coin[2] = 0x6E203220u; // "n 2 " with byte swap
// Initialize variable part with A-Z pattern
for(int i = 12; i < 54; i++)
bytes[i ^ 3] = 'A' + ((i - 12) % 26);
// End with newline and padding
bytes[0x36 ^ 3] = '\n'; // Position 54
bytes[0x37 ^ 3] = 0x80; // Position 55
// Apply base offset from parameters (similar to main kernel)
u64_t offset = ((u64_t)param1 << 32) | param2;
offset += (u64_t)n;
// Apply offset to all positions except the scan position
for(int pos = 53; pos >= 12 && offset > 0; pos--)
{
if(pos == scan_position)
continue; // Skip the scan position
u08_t *byte = &bytes[pos ^ 3];
u64_t add = offset % 95;
offset /= 95;
u32_t val = (*byte - 32 + add);
u08_t new_val = 32 + (val % 95);
// Skip newline character (ASCII 10) in the variable part
if(new_val == '\n')
new_val++;
*byte = new_val;
offset += val / 95;
}
// Try all possible printable ASCII values for the scan position (32-126)
for(u32_t val = 32; val < 127; val++)
{
// Set the test value at the scan position
if(scan_position >= 12 && scan_position < 54)
{
bytes[scan_position ^ 3] = (u08_t)val;
// Skip newline in the middle (it's only valid at position 54)
if(scan_position != 54 && val == '\n')
continue;
}
// Compute SHA1 hash
# define T u32_t
# define C(c) (c)
# define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
# define DATA(idx) coin[idx]
# define HASH(idx) hash[idx]
CUSTOM_SHA1_CODE();
# undef T
# undef C
# undef ROTATE
# undef DATA
# undef HASH
// Check if valid coin
if(hash[0] == 0xAAD20250u)
{
u32_t idx = atomicAdd(coins_storage_area, 14u);
// Make sure we don't write outside buffer
if(idx < 1024u - 14u)
{
for(int i = 0; i < 14; i++)
coins_storage_area[idx + i] = coin[i];
// Store the complete coin data
for(int k = 0; k < 14; k++)
coins_storage_area[idx + k] = coin[k];
}
}
}
}