diff --git a/aad_coin_miner_cuda.c b/aad_coin_miner_cuda.c index 167f37a..8080b0f 100644 --- a/aad_coin_miner_cuda.c +++ b/aad_coin_miner_cuda.c @@ -71,14 +71,9 @@ static void histogram_print(histogram_t *h, const char *title, int n_bins) // Coin reconstruction from stored data static void reconstruct_coin(u32_t *stored_data, u32_t coin[14]) { - // Fixed parts (must match kernel byte order) - coin[0] = 0x44455449u; // "DETI" with byte swap (idx ^ 3) - coin[1] = 0x20636F69u; // " coi" with byte swap (idx ^ 3) - coin[2] = 0x6E203220u; // "n 2 " with byte swap (idx ^ 3) - - // Variable parts (restore from storage) - for(int i = 0; i < 11; i++) - coin[3 + i] = stored_data[i]; + // Simply copy the complete coin data from storage + for(int i = 0; i < 14; i++) + coin[i] = stored_data[i]; } // diff --git a/aad_coin_miner_cuda_kernel.cu b/aad_coin_miner_cuda_kernel.cu index 57a2268..0683c51 100644 --- a/aad_coin_miner_cuda_kernel.cu +++ b/aad_coin_miner_cuda_kernel.cu @@ -8,10 +8,11 @@ typedef unsigned int u32_t; typedef unsigned char u08_t; +typedef unsigned long long u64_t; // // Optimized CUDA kernel for DETI coin mining -// Each thread generates its own message based on thread coordinates and external parameters +// Each thread generates coins using the same approach as CPU/SIMD miners // extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1) @@ -19,12 +20,11 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param { u32_t coin[14]; u32_t hash[5]; - u32_t n, warp_id, lane_id; + u32_t n; + u08_t *bytes = (u08_t *)coin; - // Get thread coordinates + // Get thread index (used as offset from base counter) n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x; - warp_id = n >> 5u; - lane_id = n & 31u; // Initialize coin template: "DETI coin 2 " + variable + "\n\x80" // Use byte-swapped format to match host expectations (idx ^ 3) @@ -32,25 +32,32 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param coin[1] = 0x20636F69u; // " coi" with byte swap coin[2] = 0x6E203220u; // "n 2 " with byte swap - // Variable part: encode thread ID and parameters - // This ensures each thread works on a different message - coin[3] = n; // Global thread ID - coin[4] = param1; // External parameter 1 - coin[5] = param2; // External parameter 2 - coin[6] = blockIdx.x; // Block index - coin[7] = threadIdx.x; // Thread index - coin[8] = warp_id; // Warp ID - coin[9] = lane_id; // Lane ID - coin[10] = n ^ param1; // XOR combination - coin[11] = n ^ param2; // XOR combination - coin[12] = (n * 0x9E3779B9u); // Hash-like mixing + // Initialize variable part (positions 12-53, 42 bytes) + // Start with A-Z pattern like CPU/SIMD miners + for(int i = 12; i < 54; i++) + bytes[i ^ 3] = 'A' + ((i - 12) % 26); - // Last word: bytes 52-55 - // Memory layout: coin[13]=0xAABBCCDD -> mem[52]=DD, [53]=CC, [54]=BB, [55]=AA - // With idx^3: bytes[52^3]=bytes[55]=AA, bytes[53^3]=bytes[54]=BB, bytes[54^3]=bytes[53]=CC, bytes[55^3]=bytes[52]=DD - // We want: bytes[54^3]=0x0A (newline), bytes[55^3]=0x80 (padding) - // So: bytes[53]=0x0A, bytes[52]=0x80 -> coin[13]=0x????0A80 - coin[13] = ((n & 0xFFFFu) << 16) | 0x0A80u; // Top 2 bytes: variable, bottom: 0x80 0x0A + // End with newline and padding + bytes[0x36 ^ 3] = '\n'; // Position 54 + bytes[0x37 ^ 3] = 0x80; // Position 55 + + // Calculate offset based on thread index and parameters + // This creates a unique starting point for each thread + u64_t offset = ((u64_t)param1 << 32) | param2; + offset += (u64_t)n; + + // Apply offset to variable part (increment the coin counter) + // Use the same carry logic as CPU/SIMD miners + for(int pos = 53; pos >= 12 && offset > 0; pos--) + { + u08_t *byte = &bytes[pos ^ 3]; + u64_t add = offset % 95; // Range: 32-126 (95 values) + offset /= 95; + + u32_t val = (*byte - 32 + add); + *byte = 32 + (val % 95); + offset += val / 95; // Carry + } // Compute SHA1 hash # define T u32_t @@ -74,22 +81,9 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param // Make sure we don't write outside buffer if(idx < 1024u - 14u) { - // Store the coin data (only variable parts needed) - coins_storage_area[idx + 0] = coin[ 3]; - coins_storage_area[idx + 1] = coin[ 4]; - coins_storage_area[idx + 2] = coin[ 5]; - coins_storage_area[idx + 3] = coin[ 6]; - coins_storage_area[idx + 4] = coin[ 7]; - coins_storage_area[idx + 5] = coin[ 8]; - coins_storage_area[idx + 6] = coin[ 9]; - coins_storage_area[idx + 7] = coin[10]; - coins_storage_area[idx + 8] = coin[11]; - coins_storage_area[idx + 9] = coin[12]; - coins_storage_area[idx + 10] = coin[13]; - // Store hash value for verification - coins_storage_area[idx + 11] = hash[1]; - coins_storage_area[idx + 12] = hash[2]; - coins_storage_area[idx + 13] = hash[3]; + // Store the complete coin data + for(int i = 0; i < 14; i++) + coins_storage_area[idx + i] = coin[i]; } } } @@ -104,6 +98,7 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t u32_t coin[14]; u32_t hash[5]; u32_t n; + u08_t *bytes = (u08_t *)coin; n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x; @@ -112,36 +107,43 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t coin[1] = 0x20636F69u; // " coi" with byte swap coin[2] = 0x6E203220u; // "n 2 " with byte swap - // Variable part - coin[3] = param1; - coin[4] = param2; - coin[5] = n >> 8; // High bits of n - coin[6] = blockIdx.x; - coin[7] = threadIdx.x; - coin[8] = param1 ^ param2; - coin[9] = n & 0xFFu; // Low 8 bits of n - coin[10] = param1 + n; - coin[11] = param2 - n; - coin[12] = (n * 0x9E3779B9u); - coin[13] = ((n & 0xFFFFu) << 16) | 0x0A80u; // Top 2 bytes: variable, bottom: 0x80 0x0A + // Initialize variable part with A-Z pattern + for(int i = 12; i < 54; i++) + bytes[i ^ 3] = 'A' + ((i - 12) % 26); - // Try all possible values for the scan position (0-255) - // This allows exploring a full byte range in a single kernel launch - for(u32_t val = 0; val < 256u; val++) + // End with newline and padding + bytes[0x36 ^ 3] = '\n'; // Position 54 + bytes[0x37 ^ 3] = 0x80; // Position 55 + + // Apply base offset from parameters (similar to main kernel) + u64_t offset = ((u64_t)param1 << 32) | param2; + offset += (u64_t)n; + + // Apply offset to all positions except the scan position + for(int pos = 53; pos >= 12 && offset > 0; pos--) { - // Insert the test value at the scan position - u32_t word_idx = scan_position / 4; - u32_t byte_pos = scan_position % 4; - u32_t shift = byte_pos * 8; + if(pos == scan_position) + continue; // Skip the scan position + + u08_t *byte = &bytes[pos ^ 3]; + u64_t add = offset % 95; + offset /= 95; + + u32_t val = (*byte - 32 + add); + *byte = 32 + (val % 95); + offset += val / 95; + } - if(word_idx >= 3 && word_idx < 13) + // Try all possible printable ASCII values for the scan position (32-126) + for(u32_t val = 32; val < 127; val++) + { + // Set the test value at the scan position + if(scan_position >= 12 && scan_position < 54) { - u32_t mask = ~(0xFFu << shift); - coin[word_idx] = (coin[word_idx] & mask) | (val << shift); - - // Make sure we don't use newline in the middle - u08_t *bytes = (u08_t *)coin; - if(scan_position < 54 && bytes[scan_position ^ 3] == 0x0A) + bytes[scan_position ^ 3] = (u08_t)val; + + // Skip newline in the middle (it's only valid at position 54) + if(scan_position != 54 && val == '\n') continue; } @@ -164,20 +166,8 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t u32_t idx = atomicAdd(coins_storage_area, 14u); if(idx < 1024u - 14u) { - coins_storage_area[idx + 0] = coin[ 3]; - coins_storage_area[idx + 1] = coin[ 4]; - coins_storage_area[idx + 2] = coin[ 5]; - coins_storage_area[idx + 3] = coin[ 6]; - coins_storage_area[idx + 4] = coin[ 7]; - coins_storage_area[idx + 5] = coin[ 8]; - coins_storage_area[idx + 6] = coin[ 9]; - coins_storage_area[idx + 7] = coin[10]; - coins_storage_area[idx + 8] = coin[11]; - coins_storage_area[idx + 9] = coin[12]; - coins_storage_area[idx + 10] = coin[13]; - coins_storage_area[idx + 11] = hash[1]; - coins_storage_area[idx + 12] = hash[2]; - coins_storage_area[idx + 13] = hash[3]; + for(int i = 0; i < 14; i++) + coins_storage_area[idx + i] = coin[i]; } } }