// aad-assignment-1/aad_coin_miner_cuda_kernel.cu
//
// 210 lines
// 6.6 KiB
// Plaintext

//
// Arquiteturas de Alto Desempenho 2025/2026
//
// DETI Coin Miner - CUDA kernel (Optimized)
//
#include "aad_sha1.h"
#include "aad_data_types.h"
//
// Optimized CUDA kernel
//
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE, 1)
void mine_deti_coins_kernel(u32_t *coins_storage_area, u64_t base_nonce, u32_t attempts_per_thread)
{
  //
  // Every thread hashes attempts_per_thread candidate messages and records the
  // ones whose first SHA1 word equals 0xAAD20250.
  //
  // Parameters:
  //   coins_storage_area  - output buffer; word 0 is advanced atomically by 14 for each hit
  //                         and its previous value is the offset where the coin is written
  //   base_nonce          - starting nonce of this launch
  //   attempts_per_thread - number of candidates hashed by each thread
  //
  // NOTE(review): a coin would overwrite word 0 if the counter ever read 0; presumably the
  // host initialises the counter to 1 -- confirm against the launcher.
  //
  u32_t coin[16]; // one 64-byte SHA1 block held as 16 words
  u32_t hash[5];  // digest of the current candidate

  // Byte-wise view of the block: message byte k is found at index (k ^ 3).
  // (assumes the words are stored little-endian in memory)
  u08_t *const coin_bytes = (u08_t *)coin;

  // Fixed 12-byte header "DETI coin 2 ", first character in the top byte of each word.
  coin[0] = 0x44455449u; // "DETI"
  coin[1] = 0x20636F69u; // " coi"
  coin[2] = 0x6E203220u; // "n 2 "

  // Message bytes 12..51 start out as spaces.
  #pragma unroll
  for (int word = 12; word >= 3; --word)
  {
    coin[word] = 0x20202020u;
  }

  // Bytes 52 and 53 are spaces as well, byte 54 is '\n' and byte 55 is the 0x80 padding marker.
  coin[13] = 0x20200A80u;

  // Tail of the block: the message length in bits (55 * 8 = 440) goes in the last word.
  coin[14] = 0u;
  coin[15] = 440u;

  // Per-thread seed: consecutive threads are attempts_per_thread nonces apart.
  const u64_t global_thread = (u64_t)blockDim.x * (u64_t)blockIdx.x + (u64_t)threadIdx.x;
  u64_t remaining = base_nonce + global_thread * attempts_per_thread;

  // Add the seed to the variable bytes as base-95 digits, least significant digit at
  // byte 12 and moving towards byte 53. With the all-space fill above a digit sum cannot
  // exceed 0x7E, so the carry branch is purely defensive.
  int seed_pos = 12;
  while (remaining != 0 && seed_pos < 54)
  {
    u08_t *cell = coin_bytes + (seed_pos ^ 3);
    u32_t sum = (u32_t)*cell + (u32_t)(remaining % 95);
    remaining = remaining / 95;
    if (sum >= 0x7Fu)
    {
      // left the 0x20..0x7E range: wrap and propagate a carry
      sum -= 95u;
      remaining += 1;
    }
    *cell = (u08_t)sum;
    ++seed_pos;
  }

  // Hash, test, advance -- once per attempt.
  for (u32_t attempts_left = attempts_per_thread; attempts_left != 0u; --attempts_left)
  {
    // SHA1 of the block; the macro reads coin[] through DATA and writes hash[] through HASH.
    #define HASH(idx) hash[idx]
    #define DATA(idx) coin[idx]
    #define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
    #define C(c) (c)
    #define T u32_t
    CUSTOM_SHA1_CODE();
    #undef HASH
    #undef DATA
    #undef ROTATE
    #undef C
    #undef T

    // A coin is a message whose digest starts with the word 0xAAD20250.
    if (hash[0] == 0xAAD20250u)
    {
      // Reserve 14 words; the reservation is made even when the coin ends up not being stored.
      const u32_t slot = atomicAdd(coins_storage_area, 14u);
      if (slot < 1009u) // 1024 - 15, same limit as before
      {
        // 14 words = 56 bytes: the 55 message bytes plus the padding marker.
        u32_t *dst = coins_storage_area + slot;
        for (int word = 0; word != 14; ++word)
        {
          dst[word] = coin[word];
        }
      }
    }

    // Next candidate: increment the variable bytes like an odometer, starting at byte 53
    // (the one just before '\n') and moving towards byte 12 while a digit overflows.
    for (int pos = 53; pos >= 12; --pos)
    {
      u08_t *cell = coin_bytes + (pos ^ 3);
      const u08_t bumped = (u08_t)(*cell + 1);
      if (bumped <= 0x7E)
      {
        *cell = bumped; // still printable, no carry needed
        break;
      }
      *cell = 0x20; // wrapped past '~': restart at ' ' and carry into the previous byte
    }
  }
}
//
// Kernel: Mines a coin where the first 48 bytes are FIXED (the visual pattern)
// and only the 6 bytes at offsets 48-53 are mutated to find the hash.
//
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE, 1)
void mine_visual_row_kernel(u32_t *coins_storage_area, u32_t *row_template, u64_t base_nonce)
{
  //
  // Each thread builds ONE candidate message from the host template plus its own nonce,
  // hashes it once and records it when the first SHA1 word equals 0xAAD20250.
  //
  // Parameters:
  //   coins_storage_area - output buffer; word 0 is advanced atomically by 14 for each hit
  //                        and its previous value is the offset where the coin is written
  //   row_template       - words 0..11 and word 13 are read; word 12 is not used
  //   base_nonce         - nonce of thread 0; thread t uses base_nonce + t
  //
  // NOTE(review): a coin would overwrite word 0 if the counter ever read 0; presumably the
  // host initialises the counter to 1 -- confirm against the launcher.
  //
  u32_t coin[16]; // one 64-byte SHA1 block held as 16 words
  u32_t hash[5];  // digest of the candidate

  // Tail of the block: the message length in bits (55 * 8 = 440) goes in the last word.
  coin[15] = 440u;
  coin[14] = 0u;

  // Word 13 is taken verbatim from the template.
  // (presumably two filler bytes, '\n' and 0x80 -- verify against the host code)
  coin[13] = row_template[13];

  // Word 12 always starts as "AAAA", whatever the template holds there.
  coin[12] = 0x41414141u;

  // The first 48 bytes (12 words) are the fixed part supplied by the host.
  #pragma unroll
  for (int word = 11; word >= 0; --word)
  {
    coin[word] = row_template[word];
  }

  // One nonce per thread.
  const u64_t global_thread = (u64_t)blockDim.x * (u64_t)blockIdx.x + (u64_t)threadIdx.x;
  u64_t digits_left = base_nonce + global_thread;

  // Spread the nonce over message bytes 48..53 as base-90 digits, least significant
  // digit first. Message byte k is found at index (k ^ 3) of the byte view
  // (assumes the words are stored little-endian in memory).
  u08_t *const coin_bytes = (u08_t *)coin;
  for (int offset = 0; offset < 6; ++offset)
  {
    u08_t *cell = coin_bytes + ((48 + offset) ^ 3);
    u32_t digit_char = (u32_t)*cell + (u32_t)(digits_left % 90);
    digits_left = digits_left / 90;
    if (digit_char >= 0x7Fu)
    {
      // Past '~': fold back, 0x7F becomes 0x22, 0x80 becomes 0x23, and so on
      // (same as 0x21 + (value - 0x7E)), then carry into the next digit.
      digit_char -= 0x5Du;
      digits_left += 1;
    }
    *cell = (u08_t)digit_char;
  }

  // SHA1 of the block; the macro reads coin[] through DATA and writes hash[] through HASH.
  #define HASH(idx) hash[idx]
  #define DATA(idx) coin[idx]
  #define ROTATE(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
  #define C(c) (c)
  #define T u32_t
  CUSTOM_SHA1_CODE();
  #undef HASH
  #undef DATA
  #undef ROTATE
  #undef C
  #undef T

  // Not a coin unless the digest starts with the word 0xAAD20250.
  if (hash[0] != 0xAAD20250u)
  {
    return;
  }

  // Reserve 14 words; the reservation is made even when the coin ends up not being stored.
  const u32_t slot = atomicAdd(coins_storage_area, 14u);
  if (slot >= 1009u) // 1024 - 15, same limit as before
  {
    return;
  }

  // 14 words = 56 bytes: the 55 message bytes plus the padding marker.
  u32_t *dst = coins_storage_area + slot;
  for (int word = 0; word != 14; ++word)
  {
    dst[word] = coin[word];
  }
}