//
// Arquiteturas de Alto Desempenho 2025/2026
//
// DETI Coin Miner - CUDA kernel (Optimized)
//

#include "aad_sha1.h"
#include "aad_data_types.h"

//
// Brute-force DETI coin search kernel
//
// Every thread builds a private 55-byte candidate coin
//   bytes  0..11   the fixed text "DETI coin 2 "
//   bytes 12..53   variable text, always kept in the range 0x20..0x7E
//   byte  54       '\n'
// followed by the SHA1 padding (0x80 at byte 55, message length of 440 bits in word 15).
// The candidate is hashed, tested, and then stepped to the next text; this is repeated
// attempts_per_thread times.
//
// The message is held as 16 big-endian 32-bit words, so message byte k is byte (k ^ 3) of the
// word array when it is addressed through a byte pointer (little-endian device byte order).
//
// Parameters:
//   coins_storage_area   word 0 is advanced atomically by 14 for every candidate found and its
//                        previous value is used as the index where the 14 coin words are written.
//                        NOTE(review): this only works if the host sets word 0 to a value >= 1
//                        before the launch (otherwise the first coin lands on the counter itself)
//                        and the buffer holds at least 1024 words -- confirm against the host code.
//   base_nonce           starting point of this launch in the search space
//   attempts_per_thread  number of candidates hashed by each thread
//
// Launch layout: only the x dimensions of the grid and of the block are used; the block size
// must not exceed RECOMMENDED_CUDA_BLOCK_SIZE (launch bounds).
//
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE, 1)
void mine_deti_coins_kernel(u32_t *coins_storage_area, u64_t base_nonce, u32_t attempts_per_thread)
{
  u32_t coin[16]; // one complete SHA1 block (64 bytes)
  u32_t hash[5];  // SHA1 digest of the block

  // fixed prefix "DETI coin 2 " (bytes 0..11), packed most significant byte first
  coin[0] = (u32_t)'D' << 24 | (u32_t)'E' << 16 | (u32_t)'T' << 8 | (u32_t)'I';
  coin[1] = (u32_t)' ' << 24 | (u32_t)'c' << 16 | (u32_t)'o' << 8 | (u32_t)'i';
  coin[2] = (u32_t)'n' << 24 | (u32_t)' ' << 16 | (u32_t)'2' << 8 | (u32_t)' ';

  // variable text (bytes 12..51) starts as all spaces
  #pragma unroll
  for(int w = 3; w <= 12; w++)
    coin[w] = 0x20202020;

  // bytes 52 and 53 (spaces, still variable), byte 54 = '\n', byte 55 = 0x80 (SHA1 padding marker)
  coin[13] = 0x20200A80;

  // SHA1 length field: 55 bytes = 440 bits, stored in the last word of the block
  coin[14] = 0x00000000;
  coin[15] = 440;

  // each thread owns the nonce range that starts at first_nonce
  u64_t thread_id = (u64_t)blockIdx.x * blockDim.x + threadIdx.x;
  u64_t first_nonce = base_nonce + thread_id * attempts_per_thread;

  u08_t *message = (u08_t *)coin;

  // write first_nonce in base 95, least significant digit at byte 12, one digit per byte
  // every byte touched here is still 0x20 and a digit is at most 94, so the result never exceeds
  //   0x7E and no wrap-around or carry handling is needed
  u64_t remaining = first_nonce;
  for(int k = 12; k < 54 && remaining > 0; k++)
  {
    message[k ^ 3] = (u08_t)(message[k ^ 3] + (u32_t)(remaining % 95));
    remaining /= 95;
  }

  for(u32_t attempt = 0; attempt < attempts_per_thread; attempt++)
  {
    //
    // SHA1 of the current candidate (the macros are the customization points of CUSTOM_SHA1_CODE)
    //
    #define T            u32_t
    #define C(c)         (c)
    #define ROTATE(x,n)  (((x) << (n)) | ((x) >> (32 - (n))))
    #define DATA(idx)    coin[idx]
    #define HASH(idx)    hash[idx]

    CUSTOM_SHA1_CODE();

    #undef T
    #undef C
    #undef ROTATE
    #undef DATA
    #undef HASH

    //
    // a candidate is accepted when the first digest word is 0xAAD20250
    //
    if(hash[0] == 0xAAD20250u)
    {
      // reserve 14 words; the cursor is advanced even when the coin turns out not to fit
      u32_t idx = atomicAdd(&coins_storage_area[0], 14u);

      // drop the coin if the reserved slot is too close to the end of the 1024-word area
      if(idx < 1024u - 15u)
      {
        // words 0..13 hold the 55 coin bytes plus the 0x80 padding byte
        for(int w = 0; w < 14; w++)
          coins_storage_area[idx + w] = coin[w];
      }
    }

    //
    // next candidate: odometer over bytes 53 down to 12, each digit running from 0x20 to 0x7E
    //
    for(int pos = 53; pos >= 12; pos--)
    {
      u08_t next = (u08_t)(message[pos ^ 3] + 1);
      if(next <= 0x7E)
      {
        message[pos ^ 3] = next; // no carry, the odometer step is complete
        break;
      }
      message[pos ^ 3] = 0x20;   // digit overflowed: reset it and carry into the byte to the left
    }
  }
}

//
// Visual-row mining kernel
//
// Hashes ONE candidate coin per thread. The first 48 bytes of the coin (words 0..11) are copied
// unchanged from row_template; only the 6 bytes 48..53 are derived from the thread's nonce.
//
// Parameters:
//   coins_storage_area   word 0 is advanced atomically by 14 for every candidate found and its
//                        previous value is used as the index where the 14 coin words are written.
//                        NOTE(review): this only works if the host sets word 0 to a value >= 1
//                        before the launch and the buffer holds at least 1024 words -- confirm
//                        against the host code.
//   row_template         coin template; words 0..11 and word 13 are read, word 12 is ignored.
//                        Word 13 supplies the start values of bytes 52..53 and is expected to end
//                        with '\n' (byte 54) and 0x80 (byte 55); the kernel does not verify the
//                        last two bytes.
//   base_nonce           nonce of thread 0; thread t uses base_nonce + t
//
// Launch layout: only the x dimensions of the grid and of the block are used; the block size
// must not exceed RECOMMENDED_CUDA_BLOCK_SIZE (launch bounds).
//
extern "C" __global__ __launch_bounds__(RECOMMENDED_CUDA_BLOCK_SIZE, 1)
void mine_visual_row_kernel(u32_t *coins_storage_area, u32_t *row_template, u64_t base_nonce)
{
  u32_t coin[16]; // one complete SHA1 block (64 bytes), big-endian words
  u32_t hash[5];  // SHA1 digest of the block

  // fixed part of the coin (bytes 0..47)
  #pragma unroll
  for(int w = 0; w < 12; w++)
    coin[w] = row_template[w];

  // mining area: bytes 48..51 start as "AAAA", bytes 52..53 start with the template values
  coin[12] = 0x41414141;
  coin[13] = row_template[13]; // also carries byte 54 and byte 55

  // SHA1 length field: 55 bytes = 440 bits
  coin[14] = 0;
  coin[15] = 440;

  // one nonce per thread
  u64_t thread_id = (u64_t)blockIdx.x * blockDim.x + threadIdx.x;
  u64_t nonce = base_nonce + thread_id;

  // message byte k is byte (k ^ 3) of the word array (little-endian device byte order)
  u08_t *bytes = (u08_t *)coin;

  // spread the nonce over bytes 48..53, one base-90 digit per byte, added to the start value
  u64_t remaining = nonce;
  for(int k = 48; k <= 53; k++)
  {
    u32_t start = bytes[k ^ 3];

    // bytes 52..53 come from the host template; a start value outside 0x20..0x7E would let
    //   start + digit become a control character (even '\n') or a non-ASCII byte, so fall back
    //   to 'A', the start value used for bytes 48..51
    if(start < 0x20u || start > 0x7Eu)
      start = 0x41u;

    u32_t val = start + (u32_t)(remaining % 90);
    remaining /= 90;

    if(val > 0x7E)
    {
      // past the last printable character: continue from 0x22 and bump the remaining nonce
      val = 0x21 + (val - 0x7E);
      remaining++;
    }
    bytes[k ^ 3] = (u08_t)val; // always in 0x20..0x7E because start is in that range
  }

  //
  // SHA1 of the candidate (the macros are the customization points of CUSTOM_SHA1_CODE)
  //
  #define T            u32_t
  #define C(c)         (c)
  #define ROTATE(x,n)  (((x) << (n)) | ((x) >> (32 - (n))))
  #define DATA(idx)    coin[idx]
  #define HASH(idx)    hash[idx]

  CUSTOM_SHA1_CODE();

  #undef T
  #undef C
  #undef ROTATE
  #undef DATA
  #undef HASH

  //
  // a candidate is accepted when the first digest word is 0xAAD20250
  //
  if(hash[0] == 0xAAD20250u)
  {
    // reserve 14 words; the cursor is advanced even when the coin turns out not to fit
    u32_t idx = atomicAdd(&coins_storage_area[0], 14u);

    // drop the coin if the reserved slot is too close to the end of the 1024-word area
    if(idx < 1024u - 15u)
    {
      // words 0..13 hold the 55 coin bytes plus the 0x80 padding byte
      for(int w = 0; w < 14; w++)
        coins_storage_area[idx + w] = coin[w];
    }
  }
}