Make the CUDA version mine coins the same way as the other miners so that they can be compared
Signed-off-by: RubenCGomes <rlcg@ua.pt>
This commit is contained in:
parent
4ca44a528f
commit
bd06cb1133
|
|
@ -71,14 +71,9 @@ static void histogram_print(histogram_t *h, const char *title, int n_bins)
|
|||
// Coin reconstruction from stored data
|
||||
static void reconstruct_coin(u32_t *stored_data, u32_t coin[14])
|
||||
{
|
||||
// Fixed parts (must match kernel byte order)
|
||||
coin[0] = 0x44455449u; // "DETI" with byte swap (idx ^ 3)
|
||||
coin[1] = 0x20636F69u; // " coi" with byte swap (idx ^ 3)
|
||||
coin[2] = 0x6E203220u; // "n 2 " with byte swap (idx ^ 3)
|
||||
|
||||
// Variable parts (restore from storage)
|
||||
for(int i = 0; i < 11; i++)
|
||||
coin[3 + i] = stored_data[i];
|
||||
// Simply copy the complete coin data from storage
|
||||
for(int i = 0; i < 14; i++)
|
||||
coin[i] = stored_data[i];
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
|||
|
|
@ -8,10 +8,11 @@
|
|||
|
||||
typedef unsigned int u32_t;
|
||||
typedef unsigned char u08_t;
|
||||
typedef unsigned long long u64_t;
|
||||
|
||||
//
|
||||
// Optimized CUDA kernel for DETI coin mining
|
||||
// Each thread generates its own message based on thread coordinates and external parameters
|
||||
// Each thread generates coins using the same approach as CPU/SIMD miners
|
||||
//
|
||||
|
||||
extern "C" __global__ __launch_bounds__(RECOMENDED_CUDA_BLOCK_SIZE,1)
|
||||
|
|
@ -19,12 +20,11 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
|
|||
{
|
||||
u32_t coin[14];
|
||||
u32_t hash[5];
|
||||
u32_t n, warp_id, lane_id;
|
||||
u32_t n;
|
||||
u08_t *bytes = (u08_t *)coin;
|
||||
|
||||
// Get thread coordinates
|
||||
// Get thread index (used as offset from base counter)
|
||||
n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;
|
||||
warp_id = n >> 5u;
|
||||
lane_id = n & 31u;
|
||||
|
||||
// Initialize coin template: "DETI coin 2 " + variable + "\n\x80"
|
||||
// Use byte-swapped format to match host expectations (idx ^ 3)
|
||||
|
|
@ -32,25 +32,32 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
|
|||
coin[1] = 0x20636F69u; // " coi" with byte swap
|
||||
coin[2] = 0x6E203220u; // "n 2 " with byte swap
|
||||
|
||||
// Variable part: encode thread ID and parameters
|
||||
// This ensures each thread works on a different message
|
||||
coin[3] = n; // Global thread ID
|
||||
coin[4] = param1; // External parameter 1
|
||||
coin[5] = param2; // External parameter 2
|
||||
coin[6] = blockIdx.x; // Block index
|
||||
coin[7] = threadIdx.x; // Thread index
|
||||
coin[8] = warp_id; // Warp ID
|
||||
coin[9] = lane_id; // Lane ID
|
||||
coin[10] = n ^ param1; // XOR combination
|
||||
coin[11] = n ^ param2; // XOR combination
|
||||
coin[12] = (n * 0x9E3779B9u); // Hash-like mixing
|
||||
// Initialize variable part (positions 12-53, 42 bytes)
|
||||
// Start with A-Z pattern like CPU/SIMD miners
|
||||
for(int i = 12; i < 54; i++)
|
||||
bytes[i ^ 3] = 'A' + ((i - 12) % 26);
|
||||
|
||||
// Last word: bytes 52-55
|
||||
// Memory layout: coin[13]=0xAABBCCDD -> mem[52]=DD, [53]=CC, [54]=BB, [55]=AA
|
||||
// With idx^3: bytes[52^3]=bytes[55]=AA, bytes[53^3]=bytes[54]=BB, bytes[54^3]=bytes[53]=CC, bytes[55^3]=bytes[52]=DD
|
||||
// We want: bytes[54^3]=0x0A (newline), bytes[55^3]=0x80 (padding)
|
||||
// So: bytes[53]=0x0A, bytes[52]=0x80 -> coin[13]=0x????0A80
|
||||
coin[13] = ((n & 0xFFFFu) << 16) | 0x0A80u; // Top 2 bytes: variable, bottom: 0x80 0x0A
|
||||
// End with newline and padding
|
||||
bytes[0x36 ^ 3] = '\n'; // Position 54
|
||||
bytes[0x37 ^ 3] = 0x80; // Position 55
|
||||
|
||||
// Calculate offset based on thread index and parameters
|
||||
// This creates a unique starting point for each thread
|
||||
u64_t offset = ((u64_t)param1 << 32) | param2;
|
||||
offset += (u64_t)n;
|
||||
|
||||
// Apply offset to variable part (increment the coin counter)
|
||||
// Use the same carry logic as CPU/SIMD miners
|
||||
for(int pos = 53; pos >= 12 && offset > 0; pos--)
|
||||
{
|
||||
u08_t *byte = &bytes[pos ^ 3];
|
||||
u64_t add = offset % 95; // Range: 32-126 (95 values)
|
||||
offset /= 95;
|
||||
|
||||
u32_t val = (*byte - 32 + add);
|
||||
*byte = 32 + (val % 95);
|
||||
offset += val / 95; // Carry
|
||||
}
|
||||
|
||||
// Compute SHA1 hash
|
||||
# define T u32_t
|
||||
|
|
@ -74,22 +81,9 @@ void mine_deti_coins_kernel(u32_t *coins_storage_area, u32_t param1, u32_t param
|
|||
// Make sure we don't write outside buffer
|
||||
if(idx < 1024u - 14u)
|
||||
{
|
||||
// Store the coin data (only variable parts needed)
|
||||
coins_storage_area[idx + 0] = coin[ 3];
|
||||
coins_storage_area[idx + 1] = coin[ 4];
|
||||
coins_storage_area[idx + 2] = coin[ 5];
|
||||
coins_storage_area[idx + 3] = coin[ 6];
|
||||
coins_storage_area[idx + 4] = coin[ 7];
|
||||
coins_storage_area[idx + 5] = coin[ 8];
|
||||
coins_storage_area[idx + 6] = coin[ 9];
|
||||
coins_storage_area[idx + 7] = coin[10];
|
||||
coins_storage_area[idx + 8] = coin[11];
|
||||
coins_storage_area[idx + 9] = coin[12];
|
||||
coins_storage_area[idx + 10] = coin[13];
|
||||
// Store hash value for verification
|
||||
coins_storage_area[idx + 11] = hash[1];
|
||||
coins_storage_area[idx + 12] = hash[2];
|
||||
coins_storage_area[idx + 13] = hash[3];
|
||||
// Store the complete coin data
|
||||
for(int i = 0; i < 14; i++)
|
||||
coins_storage_area[idx + i] = coin[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -104,6 +98,7 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t
|
|||
u32_t coin[14];
|
||||
u32_t hash[5];
|
||||
u32_t n;
|
||||
u08_t *bytes = (u08_t *)coin;
|
||||
|
||||
n = (u32_t)threadIdx.x + (u32_t)blockDim.x * (u32_t)blockIdx.x;
|
||||
|
||||
|
|
@ -112,36 +107,43 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t
|
|||
coin[1] = 0x20636F69u; // " coi" with byte swap
|
||||
coin[2] = 0x6E203220u; // "n 2 " with byte swap
|
||||
|
||||
// Variable part
|
||||
coin[3] = param1;
|
||||
coin[4] = param2;
|
||||
coin[5] = n >> 8; // High bits of n
|
||||
coin[6] = blockIdx.x;
|
||||
coin[7] = threadIdx.x;
|
||||
coin[8] = param1 ^ param2;
|
||||
coin[9] = n & 0xFFu; // Low 8 bits of n
|
||||
coin[10] = param1 + n;
|
||||
coin[11] = param2 - n;
|
||||
coin[12] = (n * 0x9E3779B9u);
|
||||
coin[13] = ((n & 0xFFFFu) << 16) | 0x0A80u; // Top 2 bytes: variable, bottom: 0x80 0x0A
|
||||
// Initialize variable part with A-Z pattern
|
||||
for(int i = 12; i < 54; i++)
|
||||
bytes[i ^ 3] = 'A' + ((i - 12) % 26);
|
||||
|
||||
// Try all possible values for the scan position (0-255)
|
||||
// This allows exploring a full byte range in a single kernel launch
|
||||
for(u32_t val = 0; val < 256u; val++)
|
||||
// End with newline and padding
|
||||
bytes[0x36 ^ 3] = '\n'; // Position 54
|
||||
bytes[0x37 ^ 3] = 0x80; // Position 55
|
||||
|
||||
// Apply base offset from parameters (similar to main kernel)
|
||||
u64_t offset = ((u64_t)param1 << 32) | param2;
|
||||
offset += (u64_t)n;
|
||||
|
||||
// Apply offset to all positions except the scan position
|
||||
for(int pos = 53; pos >= 12 && offset > 0; pos--)
|
||||
{
|
||||
// Insert the test value at the scan position
|
||||
u32_t word_idx = scan_position / 4;
|
||||
u32_t byte_pos = scan_position % 4;
|
||||
u32_t shift = byte_pos * 8;
|
||||
if(pos == scan_position)
|
||||
continue; // Skip the scan position
|
||||
|
||||
if(word_idx >= 3 && word_idx < 13)
|
||||
u08_t *byte = &bytes[pos ^ 3];
|
||||
u64_t add = offset % 95;
|
||||
offset /= 95;
|
||||
|
||||
u32_t val = (*byte - 32 + add);
|
||||
*byte = 32 + (val % 95);
|
||||
offset += val / 95;
|
||||
}
|
||||
|
||||
// Try all possible printable ASCII values for the scan position (32-126)
|
||||
for(u32_t val = 32; val < 127; val++)
|
||||
{
|
||||
// Set the test value at the scan position
|
||||
if(scan_position >= 12 && scan_position < 54)
|
||||
{
|
||||
u32_t mask = ~(0xFFu << shift);
|
||||
coin[word_idx] = (coin[word_idx] & mask) | (val << shift);
|
||||
bytes[scan_position ^ 3] = (u08_t)val;
|
||||
|
||||
// Make sure we don't use newline in the middle
|
||||
u08_t *bytes = (u08_t *)coin;
|
||||
if(scan_position < 54 && bytes[scan_position ^ 3] == 0x0A)
|
||||
// Skip newline in the middle (it's only valid at position 54)
|
||||
if(scan_position != 54 && val == '\n')
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -164,20 +166,8 @@ void mine_deti_coins_scan_kernel(u32_t *coins_storage_area, u32_t param1, u32_t
|
|||
u32_t idx = atomicAdd(coins_storage_area, 14u);
|
||||
if(idx < 1024u - 14u)
|
||||
{
|
||||
coins_storage_area[idx + 0] = coin[ 3];
|
||||
coins_storage_area[idx + 1] = coin[ 4];
|
||||
coins_storage_area[idx + 2] = coin[ 5];
|
||||
coins_storage_area[idx + 3] = coin[ 6];
|
||||
coins_storage_area[idx + 4] = coin[ 7];
|
||||
coins_storage_area[idx + 5] = coin[ 8];
|
||||
coins_storage_area[idx + 6] = coin[ 9];
|
||||
coins_storage_area[idx + 7] = coin[10];
|
||||
coins_storage_area[idx + 8] = coin[11];
|
||||
coins_storage_area[idx + 9] = coin[12];
|
||||
coins_storage_area[idx + 10] = coin[13];
|
||||
coins_storage_area[idx + 11] = hash[1];
|
||||
coins_storage_area[idx + 12] = hash[2];
|
||||
coins_storage_area[idx + 13] = hash[3];
|
||||
for(int i = 0; i < 14; i++)
|
||||
coins_storage_area[idx + i] = coin[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue