diff --git a/aad_coin_miner_ocl.c b/aad_coin_miner_ocl.c
new file mode 100644
index 0000000..55d8f68
--- /dev/null
+++ b/aad_coin_miner_ocl.c
@@ -0,0 +1,239 @@
+//
+// Arquiteturas de Alto Desempenho 2025/2026
+//
+// DETI Coin Miner - OpenCL implementation
+//
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "aad_data_types.h"
+#include "aad_utilities.h"
+#include "aad_sha1_cpu.h"
+#include "aad_ocl_utilities.h"
+#include "aad_vault.h"
+
+#define COINS_STORAGE_SIZE 1024u
+#define SCAN_VALUES_PER_THREAD 95u // the scan kernel loops over the byte values 32..126
+
+// Cleared by signal_handler(); polled by the mining loop between kernel launches
+static volatile sig_atomic_t keep_running = 1;
+
+// SIGINT handler: only clears keep_running, so the current kernel launch is allowed to finish
+void signal_handler(int signum)
+{
+  (void)signum;
+  keep_running = 0;
+}
+
+// Copy the 14 words of one coin out of the storage area that was read back from the device
+static void reconstruct_coin(const u32_t *stored_data, u32_t coin[14])
+{
+  for(int i = 0; i < 14; i++)
+    coin[i] = stored_data[i];
+}
+
+//
+// Mine DETI coins using OpenCL
+//
+// max_attempts    --- stop once at least this many candidates were tried (0 means run until Ctrl+C)
+// use_scan_kernel --- non-zero selects mine_deti_coins_scan_kernel, zero selects mine_deti_coins_kernel
+//
+// Every candidate reported by the device is hashed again on the host before it is passed to save_coin()
+//
+static void mine_coins_ocl(u64_t max_attempts, int use_scan_kernel)
+{
+  ocl_data_t od;
+  u32_t *host_storage;
+  u64_t attempts = 0;
+  u32_t coins_found = 0;
+  u32_t kernel_runs = 0;
+
+  // Initialize OpenCL
+  od.platform_number = 0;
+  od.device_number = 0;
+  od.kernel_file_name = "aad_coin_miner_ocl_kernel.cl";
+  od.kernel_name = use_scan_kernel ? "mine_deti_coins_scan_kernel" : "mine_deti_coins_kernel";
+  od.data_size[0] = COINS_STORAGE_SIZE * sizeof(u32_t);
+  od.data_size[1] = 0;
+
+  initialize_ocl(&od);
+
+  host_storage = (u32_t *)od.host_data[0];
+
+  // Kernel configuration
+  od.local_work_size = RECOMMENDED_OCL_WORK_GROUP_SIZE;
+  od.global_work_size = 4096 * od.local_work_size; // Large grid for maximum GPU utilization
+
+  u32_t n_threads = (u32_t)od.global_work_size;
+
+  printf("Mining DETI coins using OpenCL...\n");
+  printf("Device: %s\n", od.device_name);
+  printf("Work groups: %zu × %zu = %u total work items\n",
+         od.global_work_size / od.local_work_size, od.local_work_size, n_threads);
+  printf("Kernel: %s\n", od.kernel_name);
+  printf("Press Ctrl+C to stop\n\n");
+
+  // Hash a fixed test coin on the host (byte index ^ 3 selects the byte inside each 32-bit word)
+  printf("Testing SHA1 implementation on host...\n");
+  u32_t test_coin[14];
+  memset(test_coin, 0, sizeof(test_coin));
+  ((u08_t *)test_coin)[0x0 ^ 3] = 'D';
+  ((u08_t *)test_coin)[0x1 ^ 3] = 'E';
+  ((u08_t *)test_coin)[0x2 ^ 3] = 'T';
+  ((u08_t *)test_coin)[0x3 ^ 3] = 'I';
+  ((u08_t *)test_coin)[0x4 ^ 3] = ' ';
+  ((u08_t *)test_coin)[0x5 ^ 3] = 'c';
+  ((u08_t *)test_coin)[0x6 ^ 3] = 'o';
+  ((u08_t *)test_coin)[0x7 ^ 3] = 'i';
+  ((u08_t *)test_coin)[0x8 ^ 3] = 'n';
+  ((u08_t *)test_coin)[0x9 ^ 3] = ' ';
+  ((u08_t *)test_coin)[0xa ^ 3] = '2';
+  ((u08_t *)test_coin)[0xb ^ 3] = ' ';
+  ((u08_t *)test_coin)[0x36 ^ 3] = '\n';
+  ((u08_t *)test_coin)[0x37 ^ 3] = 0x80;
+  for(int i = 12; i < 54; i++)
+    ((u08_t *)test_coin)[i ^ 3] = 'A' + (i - 12) % 26;
+
+  u32_t test_hash[5];
+  sha1(test_coin, test_hash);
+  printf("Host test hash: 0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n",
+         test_hash[0], test_hash[1], test_hash[2], test_hash[3], test_hash[4]);
+
+  // Upload the test coin (NOTE: no kernel reads it back yet, so nothing is verified on the device)
+  printf("Testing SHA1 implementation on device...\n");
+  host_storage[0] = 1u;
+
+  // Put the test coin in storage starting at index 1
+  for(int i = 0; i < 14; i++)
+    host_storage[1 + i] = test_coin[i];
+
+  // Copy to device
+  host_to_device_copy(&od, 0);
+
+  // A device-side test kernel is still missing; this only exercises the host-to-device copy
+  printf("Starting mining...\n\n");
+
+  u32_t param1 = (u32_t)time(NULL);
+  u32_t param2 = 0x12345678u;
+  int scan_pos = 12;
+
+  // NOTE(review): the elapsed-time arithmetic below assumes wall_time_delta() grows from a fixed origin --- confirm against aad_utilities.h
+  time_measurement();
+  time_measurement();
+  double start_time = wall_time_delta();
+
+  while(keep_running && (max_attempts == 0 || attempts < max_attempts))
+  {
+    // Initialize storage area
+    host_storage[0] = 1u; // First unused index
+
+    // Copy to device
+    host_to_device_copy(&od, 0);
+
+    // Set kernel arguments
+    od.n_kernel_arguments = use_scan_kernel ? 4 : 3;
+    set_kernel_arg(&od, 0, sizeof(cl_mem), &od.device_data[0]);
+    set_kernel_arg(&od, 1, sizeof(u32_t), &param1);
+    set_kernel_arg(&od, 2, sizeof(u32_t), &param2);
+    if(use_scan_kernel)
+      set_kernel_arg(&od, 3, sizeof(int), &scan_pos);
+
+    // Launch the OpenCL kernel
+    launch_kernel(&od);
+
+    // Copy results back
+    device_to_host_copy(&od, 0);
+
+    // Process found coins
+    u32_t n_coins_this_kernel = 0;
+    u32_t n_stored = (host_storage[0] - 1) / 14;
+    if(n_stored > (COINS_STORAGE_SIZE - 1u) / 14u) n_stored = (COINS_STORAGE_SIZE - 1u) / 14u; // the device counter overshoots once the buffer is full; keep the coins that did fit
+    if(n_stored > 0)
+    {
+      for(u32_t i = 0; i < n_stored; i++)
+      {
+        u32_t coin[14];
+        reconstruct_coin(&host_storage[1 + i * 14], coin);
+
+        // Verify it's actually a valid coin
+        u32_t hash[5];
+        sha1(coin, hash);
+
+        if(hash[0] == 0xAAD20250u)
+        {
+          coins_found++;
+          n_coins_this_kernel++;
+          printf("COIN FOUND! (kernel %u, coin %u in this kernel)\n",
+                 kernel_runs, n_coins_this_kernel);
+          save_coin(coin);
+        }
+      }
+    }
+
+    // Update counters
+    kernel_runs++;
+    if(use_scan_kernel)
+      attempts += (u64_t)n_threads * SCAN_VALUES_PER_THREAD; // one SHA-1 per scanned byte value
+    else
+      attempts += n_threads;
+
+    // Update parameters for next iteration
+    param1++;
+    param2 = param2 ^ 0x9E3779B9u;
+    if(use_scan_kernel)
+      scan_pos = 12 + (scan_pos - 12 + 1) % 42; // Cycle through positions 12-53
+
+    // Print progress every 10 kernel launches
+    if(kernel_runs % 10 == 0)
+    {
+      time_measurement();
+      double current_time = wall_time_delta() - start_time;
+      double rate = attempts / current_time;
+      printf("Attempts: %llu, Rate: %.2f MH/s, Coins: %u, Kernels: %u\n",
+             (unsigned long long)attempts, rate / 1e6, coins_found, kernel_runs);
+    }
+  }
+
+  time_measurement();
+  double total_time = wall_time_delta() - start_time;
+
+  printf("\n=== Mining Statistics ===\n");
+  printf("Total attempts: %llu\n", (unsigned long long)attempts);
+  printf("Total time: %.2f seconds\n", total_time);
+  printf("Average rate: %.2f attempts/second\n", attempts / total_time);
+  printf("Coins found: %u\n", coins_found);
+  printf("Kernel launches: %u\n", kernel_runs);
+
+  // Save any remaining coins
+  save_coin(NULL);
+
+  terminate_ocl(&od);
+}
+
+//
+// Main program: argv[1] is the maximum number of attempts (0 or absent means unlimited),
+// argv[2] equal to "scan" selects the scan kernel
+//
+int main(int argc, char *argv[])
+{
+  u64_t max_attempts = 0;
+  int use_scan_kernel = 0;
+
+  signal(SIGINT, signal_handler);
+
+  if(argc > 1)
+    max_attempts = strtoull(argv[1], NULL, 10);
+
+  if(argc > 2 && strcmp(argv[2], "scan") == 0)
+  {
+    use_scan_kernel = 1;
+    printf("Using scan kernel (tries 95 values per thread)\n");
+  }
+
+  mine_coins_ocl(max_attempts, use_scan_kernel);
+
+  return 0;
+}
diff --git a/aad_coin_miner_ocl_kernel.cl b/aad_coin_miner_ocl_kernel.cl
new file mode 100644
index 0000000..6744b49
--- /dev/null
+++ b/aad_coin_miner_ocl_kernel.cl
@@ -0,0 +1,255 @@
+//
+// Arquiteturas de Alto Desempenho 2025/2026
+//
+// DETI Coin Miner - OpenCL kernel
+//
+
+// Rotate left for SHA-1
+#define
ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+// SHA-1 round functions and constants (macro parameters are parenthesized so any expression can be passed)
+#define SHA1_F1(x,y,z) (((x) & (y)) | (~(x) & (z)))
+#define SHA1_K1 0x5A827999u
+#define SHA1_F2(x,y,z) ((x) ^ (y) ^ (z))
+#define SHA1_K2 0x6ED9EBA1u
+#define SHA1_F3(x,y,z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+#define SHA1_K3 0x8F1BBCDCu
+#define SHA1_F4(x,y,z) ((x) ^ (y) ^ (z))
+#define SHA1_K4 0xCA62C1D6u
+
+// SHA-1 of one 55-byte message: coin[0..13] are the first 14 message words (the caller must already
+// have placed the 0x80 marker in byte 55), words 14 and 15 are supplied here (zero and the bit length);
+// the five 32-bit words of the digest are written to hash[0..4]
+void sha1_compute(__private uint *coin, __private uint *hash)
+{
+  uint a, b, c, d, e, w[16];
+
+  // Initial hash values
+  a = 0x67452301u;
+  b = 0xEFCDAB89u;
+  c = 0x98BADCFEu;
+  d = 0x10325476u;
+  e = 0xC3D2E1F0u;
+
+  // Load message schedule (first 14 words from coin, then 0, then length)
+  for(int i = 0; i < 14; i++)
+    w[i] = coin[i];
+  w[14] = 0;
+  w[15] = 440; // 55 bytes * 8 bits
+
+  // SHA-1 compression function - 80 rounds
+  uint tmp;
+
+  // Rounds 0-15: the schedule words are used as loaded
+  #define ROUND1(t) \
+    tmp = ROTATE_LEFT(a, 5) + SHA1_F1(b,c,d) + e + w[t] + SHA1_K1; \
+    e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp;
+
+  ROUND1(0); ROUND1(1); ROUND1(2); ROUND1(3);
+  ROUND1(4); ROUND1(5); ROUND1(6); ROUND1(7);
+  ROUND1(8); ROUND1(9); ROUND1(10); ROUND1(11);
+  ROUND1(12); ROUND1(13); ROUND1(14); ROUND1(15);
+
+  #undef ROUND1
+
+  // Rounds 16-79: w[] is a 16-word circular buffer, each word is recomputed in place before it is used
+  #define ROUND(F, K, t) \
+    tmp = w[((t)-3) & 15] ^ w[((t)-8) & 15] ^ w[((t)-14) & 15] ^ w[((t)-16) & 15]; \
+    w[(t) & 15] = ROTATE_LEFT(tmp, 1); \
+    tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[(t) & 15] + K; \
+    e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp;
+
+  ROUND(SHA1_F1, SHA1_K1, 16); ROUND(SHA1_F1, SHA1_K1, 17);
+  ROUND(SHA1_F1, SHA1_K1, 18); ROUND(SHA1_F1, SHA1_K1, 19);
+
+  ROUND(SHA1_F2, SHA1_K2, 20); ROUND(SHA1_F2, SHA1_K2, 21);
+  ROUND(SHA1_F2, SHA1_K2, 22); ROUND(SHA1_F2, SHA1_K2, 23);
+  ROUND(SHA1_F2, SHA1_K2, 24); ROUND(SHA1_F2, SHA1_K2, 25);
+  ROUND(SHA1_F2, SHA1_K2, 26); ROUND(SHA1_F2, SHA1_K2, 27);
+  ROUND(SHA1_F2, SHA1_K2, 28); ROUND(SHA1_F2, SHA1_K2, 29);
+  ROUND(SHA1_F2, SHA1_K2, 30); ROUND(SHA1_F2, SHA1_K2, 31);
+  ROUND(SHA1_F2, SHA1_K2, 32); ROUND(SHA1_F2, SHA1_K2, 33);
+  ROUND(SHA1_F2, SHA1_K2, 34); ROUND(SHA1_F2, SHA1_K2, 35);
+  ROUND(SHA1_F2, SHA1_K2, 36); ROUND(SHA1_F2, SHA1_K2, 37);
+  ROUND(SHA1_F2, SHA1_K2, 38); ROUND(SHA1_F2, SHA1_K2, 39);
+
+  ROUND(SHA1_F3, SHA1_K3, 40); ROUND(SHA1_F3, SHA1_K3, 41);
+  ROUND(SHA1_F3, SHA1_K3, 42); ROUND(SHA1_F3, SHA1_K3, 43);
+  ROUND(SHA1_F3, SHA1_K3, 44); ROUND(SHA1_F3, SHA1_K3, 45);
+  ROUND(SHA1_F3, SHA1_K3, 46); ROUND(SHA1_F3, SHA1_K3, 47);
+  ROUND(SHA1_F3, SHA1_K3, 48); ROUND(SHA1_F3, SHA1_K3, 49);
+  ROUND(SHA1_F3, SHA1_K3, 50); ROUND(SHA1_F3, SHA1_K3, 51);
+  ROUND(SHA1_F3, SHA1_K3, 52); ROUND(SHA1_F3, SHA1_K3, 53);
+  ROUND(SHA1_F3, SHA1_K3, 54); ROUND(SHA1_F3, SHA1_K3, 55);
+  ROUND(SHA1_F3, SHA1_K3, 56); ROUND(SHA1_F3, SHA1_K3, 57);
+  ROUND(SHA1_F3, SHA1_K3, 58); ROUND(SHA1_F3, SHA1_K3, 59);
+
+  ROUND(SHA1_F4, SHA1_K4, 60); ROUND(SHA1_F4, SHA1_K4, 61);
+  ROUND(SHA1_F4, SHA1_K4, 62); ROUND(SHA1_F4, SHA1_K4, 63);
+  ROUND(SHA1_F4, SHA1_K4, 64); ROUND(SHA1_F4, SHA1_K4, 65);
+  ROUND(SHA1_F4, SHA1_K4, 66); ROUND(SHA1_F4, SHA1_K4, 67);
+  ROUND(SHA1_F4, SHA1_K4, 68); ROUND(SHA1_F4, SHA1_K4, 69);
+  ROUND(SHA1_F4, SHA1_K4, 70); ROUND(SHA1_F4, SHA1_K4, 71);
+  ROUND(SHA1_F4, SHA1_K4, 72); ROUND(SHA1_F4, SHA1_K4, 73);
+  ROUND(SHA1_F4, SHA1_K4, 74); ROUND(SHA1_F4, SHA1_K4, 75);
+  ROUND(SHA1_F4, SHA1_K4, 76); ROUND(SHA1_F4, SHA1_K4, 77);
+  ROUND(SHA1_F4, SHA1_K4, 78); ROUND(SHA1_F4, SHA1_K4, 79);
+
+  #undef ROUND
+
+  // Add to initial values
+  hash[0] = a + 0x67452301u;
+  hash[1] = b + 0xEFCDAB89u;
+  hash[2] = c + 0x98BADCFEu;
+  hash[3] = d + 0x10325476u;
+  hash[4] = e + 0xC3D2E1F0u;
+}
+
+// Basic mining kernel - each work item builds and hashes one candidate coin.
+// storage[0] holds the next free word index; a hit reserves 14 words with atomic_add() and is copied
+// there only when it fits in the 1024-word buffer. param1 and param2 seed the per-work-item content.
+__kernel void mine_deti_coins_kernel(__global uint *storage, uint param1, uint param2)
+{
+  uint gid = get_global_id(0);
+  uint coin[14];
+  uint hash[5];
+
+  // Zero initialize
+  for(int i = 0; i < 14; i++)
+    coin[i] = 0;
+
+  // Access as bytes with XOR 3 for endianness (little-endian word, big-endian bytes)
+  __private uchar *bytes = (__private uchar *)coin;
+
+  // Fixed prefix: "DETI coin 2 "
+  bytes[0x0 ^ 3] = 'D';
+  bytes[0x1 ^ 3] = 'E';
+  bytes[0x2 ^ 3] = 'T';
+  bytes[0x3 ^ 3] = 'I';
+  bytes[0x4 ^ 3] = ' ';
+  bytes[0x5 ^ 3] = 'c';
+  bytes[0x6 ^ 3] = 'o';
+  bytes[0x7 ^ 3] = 'i';
+  bytes[0x8 ^ 3] = 'n';
+  bytes[0x9 ^ 3] = ' ';
+  bytes[0xa ^ 3] = '2';
+  bytes[0xb ^ 3] = ' ';
+
+  // Fixed suffix: newline + padding
+  bytes[0x36 ^ 3] = '\n';
+  bytes[0x37 ^ 3] = 0x80;
+
+  // Variable content (42 bytes from position 12 to 53)
+  // Two generator states derived from the launch parameters and the global id
+  uint seed = param1 + gid * 0x9E3779B9u;
+  uint seed2 = param2 ^ (gid * 0x61C88647u);
+
+  for(int i = 12; i < 54; i++)
+  {
+    // LCG + xorshift mixer
+    seed = seed * 1664525u + 1013904223u;
+    seed2 ^= seed2 << 13;
+    seed2 ^= seed2 >> 17;
+    seed2 ^= seed2 << 5;
+
+    uchar val = 32 + ((seed ^ seed2) % 95); // always in 32..126
+
+    // Defensive only: with val in 32..126 this test is never true
+    if(val == '\n') val = ' ';
+    // Defensive only: with val in 32..126 this test is never true
+    if(val >= 127) val = 126;
+
+    bytes[i ^ 3] = val;
+  }
+
+  // Compute SHA-1
+  sha1_compute(coin, hash);
+
+  // Check for valid DETI coin v2 (hash starts with 0xAAD20250)
+  if(hash[0] == 0xAAD20250u)
+  {
+    // Atomically reserve space; the counter advances even when the coin does not fit
+    uint idx = atomic_add(&storage[0], 14u);
+
+    if(idx + 14 <= 1024)
+    {
+      // Store all 14 words of the coin
+      for(int i = 0; i < 14; i++)
+        storage[idx + i] = coin[i];
+    }
+  }
+}
+
+// Scan kernel - each work item builds one base coin and then hashes it once for every byte value
+// 32..126 (95 values) written at byte position scan_pos; positions outside 12..53 fall back to 12.
+// Hits are stored exactly as in mine_deti_coins_kernel.
+__kernel void mine_deti_coins_scan_kernel(__global uint *storage, uint param1, uint param2, int scan_pos)
+{
+  uint gid = get_global_id(0);
+  uint coin[14];
+  uint hash[5];
+
+  // Initialize coin
+  for(int i = 0; i < 14; i++)
+    coin[i] = 0;
+
+  __private uchar *bytes = (__private uchar *)coin;
+
+  // Fixed parts
+  bytes[0x0 ^ 3] = 'D';
+  bytes[0x1 ^ 3] = 'E';
+  bytes[0x2 ^ 3] = 'T';
+  bytes[0x3 ^ 3] = 'I';
+  bytes[0x4 ^ 3] = ' ';
+  bytes[0x5 ^ 3] = 'c';
+  bytes[0x6 ^ 3] = 'o';
+  bytes[0x7 ^ 3] = 'i';
+  bytes[0x8 ^ 3] = 'n';
+  bytes[0x9 ^ 3] = ' ';
+  bytes[0xa ^ 3] = '2';
+  bytes[0xb ^ 3] = ' ';
+  bytes[0x36 ^ 3] = '\n';
+  bytes[0x37 ^ 3] = 0x80;
+
+  // Generate base content unique to this thread
+  uint seed = param1 + gid * 0x9E3779B9u;
+  uint seed2 = param2 ^ (gid * 0x61C88647u);
+
+  for(int i = 12; i < 54; i++)
+  {
+    seed = seed * 1664525u + 1013904223u;
+    seed2 ^= seed2 << 13;
+    seed2 ^= seed2 >> 17;
+    seed2 ^= seed2 << 5;
+
+    uchar val = 32 + ((seed ^ seed2) % 95); // always in 32..126
+    if(val == '\n') val = ' ';  // never true for 32..126
+    if(val >= 127) val = 126;   // never true for 32..126
+
+    bytes[i ^ 3] = val;
+  }
+
+  // Validate scan_pos
+  if(scan_pos < 12 || scan_pos >= 54)
+    scan_pos = 12;
+
+  // Scan through all printable ASCII values at scan_pos
+  for(uint c = 32; c < 127; c++)
+  {
+    if(c == '\n') continue; // never true here: '\n' is 10, below the loop range
+
+    bytes[scan_pos ^ 3] = (uchar)c;
+
+    sha1_compute(coin, hash);
+
+    if(hash[0] == 0xAAD20250u)
+    {
+      uint idx = atomic_add(&storage[0], 14u);
+      if(idx + 14 <= 1024)
+      {
+        for(int i = 0; i < 14; i++)
+          storage[idx + i] = coin[i];
+      }
+    }
+  }
+}
diff --git a/aad_coin_miner_wasm.c b/aad_coin_miner_wasm.c
index e2bbf07..035e15b 100644
--- a/aad_coin_miner_wasm.c
+++ b/aad_coin_miner_wasm.c
@@ -1,7 +1,7 @@
 //
 // Arquiteturas de Alto Desempenho 2025/2026
 //
-// DETI Coin Miner - WebAssembly implementation
+// DETI Coin Miner - WebAssembly implementation with SIMD support
 //
 
 #include
@@ -19,14 +19,20 @@
 #include "aad_vault.h"
 #endif
 
+// WASM SIMD support
+#if defined(__wasm_simd128__)
+#include <wasm_simd128.h>
+#endif
+
 // Global mining state
 static volatile int keep_running = 1;
+static volatile int use_simd = 0;
 static u64_t total_attempts = 0;
 static u32_t coins_found = 0;
 static double mining_start_time = 0;
-static double pause_time_offset = 0; // Track paused time
-static double last_pause_time = 0; // When mining was paused
-static u32_t found_coins[1024][14]; // Store up to 1024 found coins
+static double pause_time_offset = 0;
+static double last_pause_time = 0;
+static u32_t
found_coins[1024][14];
 static u32_t found_coins_count = 0;
 
 //
@@ -82,6 +88,185 @@ static double get_time()
 #endif
 }
 
+#if defined(__wasm_simd128__)
+//
+// Left rotate of every 32-bit lane by n bits (0 < n < 32); note that x is evaluated twice
+// The right shift must be the unsigned one (wasm_u32x4_shr) so that the vacated bits are zero
+//
+#define SIMD_ROTL(x, n) wasm_v128_or(wasm_i32x4_shl((x), (n)), wasm_u32x4_shr((x), 32 - (n)))
+
+//
+// Build one 4-lane batch: lane L receives base_coin after L calls to increment_coin() (base_coin ends 4 steps ahead)
+//
+static void prepare_coins_simd(u32_t base_coin[14], u32_t *interleaved_data)
+{
+  enum { SIMD_WIDTH = 4 }; // true compile-time constant
+  for(int lane = 0; lane < SIMD_WIDTH; lane++)
+  {
+    u32_t coin[14];
+    memcpy(coin, base_coin, sizeof(coin));
+
+    for(int idx = 0; idx < 14; idx++)
+    {
+      interleaved_data[idx * SIMD_WIDTH + lane] = coin[idx]; // word idx of all lanes is contiguous
+    }
+
+    increment_coin(base_coin);
+  }
+}
+
+static void extract_hashes_simd(u32_t *interleaved_hash, u32_t hashes[][5])
+{
+  enum { SIMD_WIDTH = 4 }; // must match the layout written by sha1_wasm_simd()
+  for(int lane = 0; lane < SIMD_WIDTH; lane++)
+  {
+    for(int idx = 0; idx < 5; idx++)
+    {
+      hashes[lane][idx] = interleaved_hash[idx * SIMD_WIDTH + lane];
+    }
+  }
+}
+
+static void extract_coins_simd(u32_t *interleaved_data, u32_t coins[][14])
+{
+  enum { SIMD_WIDTH = 4 }; // must match the layout written by prepare_coins_simd()
+  for(int lane = 0; lane < SIMD_WIDTH; lane++)
+  {
+    for(int idx = 0; idx < 14; idx++)
+    {
+      coins[lane][idx] = interleaved_data[idx * SIMD_WIDTH + lane];
+    }
+  }
+}
+
+static void sha1_wasm_simd(u32_t *interleaved_data, u32_t *interleaved_hash)
+{
+  enum { SIMD_WIDTH = 4 }; // four messages are hashed at once, one per 32-bit lane
+
+  // SHA1 State vectors
+  v128_t a, b, c, d, e;
+  v128_t w[80]; // Message schedule
+
+  // Initial SHA1 constants
+  a = wasm_i32x4_splat(0x67452301u);
+  b = wasm_i32x4_splat(0xEFCDAB89u);
+  c = wasm_i32x4_splat(0x98BADCFEu);
+  d = wasm_i32x4_splat(0x10325476u);
+  e = wasm_i32x4_splat(0xC3D2E1F0u);
+
+  // 1. Prepare Message Schedule (Interleaved loads)
+  // Load first 14 words
+  for(int i = 0; i < 14; i++)
+  {
+    w[i] = wasm_v128_load(&interleaved_data[i * SIMD_WIDTH]);
+  }
+  // Words 14 and 15 are set here; any padding marker inside words 0-13 must already be present in the input
+  w[14] = wasm_i32x4_splat(0);
+  w[15] = wasm_i32x4_splat(440); // Length in bits (55 bytes * 8)
+
+  // Expand message schedule from 16 to 80
+  for (int i = 16; i < 80; i++) {
+    v128_t temp = wasm_v128_xor(w[i-3], w[i-8]);
+    temp = wasm_v128_xor(temp, w[i-14]);
+    temp = wasm_v128_xor(temp, w[i-16]);
+    w[i] = SIMD_ROTL(temp, 1);
+  }
+
+  // 2. Main Loop (80 rounds)
+  for (int i = 0; i < 80; i++) {
+    v128_t f, k;
+
+    if (i < 20) {
+      // F1: (b & c) | (~b & d)
+      f = wasm_v128_or(wasm_v128_and(b, c), wasm_v128_and(wasm_v128_not(b), d));
+      k = wasm_i32x4_splat(0x5A827999u);
+    } else if (i < 40) {
+      // F2: b ^ c ^ d
+      f = wasm_v128_xor(wasm_v128_xor(b, c), d);
+      k = wasm_i32x4_splat(0x6ED9EBA1u);
+    } else if (i < 60) {
+      // F3: (b & c) | (b & d) | (c & d)
+      f = wasm_v128_or(wasm_v128_or(wasm_v128_and(b, c), wasm_v128_and(b, d)), wasm_v128_and(c, d));
+      k = wasm_i32x4_splat(0x8F1BBCDCu);
+    } else {
+      // F4: b ^ c ^ d
+      f = wasm_v128_xor(wasm_v128_xor(b, c), d);
+      k = wasm_i32x4_splat(0xCA62C1D6u);
+    }
+
+    // temp = ROTL(a, 5) + f + e + k + w[i]
+    v128_t temp = wasm_i32x4_add(SIMD_ROTL(a, 5), f);
+    temp = wasm_i32x4_add(temp, e);
+    temp = wasm_i32x4_add(temp, k);
+    temp = wasm_i32x4_add(temp, w[i]);
+
+    e = d;
+    d = c;
+    c = SIMD_ROTL(b, 30);
+    b = a;
+    a = temp;
+  }
+
+  // 3. Add to initial state and store
+  v128_t h0 = wasm_i32x4_add(a, wasm_i32x4_splat(0x67452301u));
+  v128_t h1 = wasm_i32x4_add(b, wasm_i32x4_splat(0xEFCDAB89u));
+  v128_t h2 = wasm_i32x4_add(c, wasm_i32x4_splat(0x98BADCFEu));
+  v128_t h3 = wasm_i32x4_add(d, wasm_i32x4_splat(0x10325476u));
+  v128_t h4 = wasm_i32x4_add(e, wasm_i32x4_splat(0xC3D2E1F0u));
+
+  // Store results back to interleaved_hash
+  wasm_v128_store(&interleaved_hash[0 * SIMD_WIDTH], h0);
+  wasm_v128_store(&interleaved_hash[1 * SIMD_WIDTH], h1);
+  wasm_v128_store(&interleaved_hash[2 * SIMD_WIDTH], h2);
+  wasm_v128_store(&interleaved_hash[3 * SIMD_WIDTH], h3);
+  wasm_v128_store(&interleaved_hash[4 * SIMD_WIDTH], h4);
+}
+
+static int mine_coins_wasm_simd_internal(u32_t iterations_per_call, u32_t coin[14])
+{
+  enum { SIMD_WIDTH = 4 }; // constant expression, so the arrays below are ordinary fixed-size arrays
+  u32_t interleaved_data[14 * SIMD_WIDTH] __attribute__((aligned(16)));
+  u32_t interleaved_hash[5 * SIMD_WIDTH] __attribute__((aligned(16)));
+
+  u32_t batches = (iterations_per_call + SIMD_WIDTH - 1) / SIMD_WIDTH; // rounded up: may try up to 3 extra coins
+
+  for(u32_t batch = 0; batch < batches && keep_running; batch++)
+  {
+    prepare_coins_simd(coin, interleaved_data);
+
+    // Hash the four candidates of this batch in one pass
+    sha1_wasm_simd(interleaved_data, interleaved_hash);
+
+    total_attempts += SIMD_WIDTH;
+
+    u32_t hashes[SIMD_WIDTH][5];
+    extract_hashes_simd(interleaved_hash, hashes);
+
+    for(int lane = 0; lane < SIMD_WIDTH; lane++)
+    {
+      if(is_valid_coin(hashes[lane]))
+      {
+        if(found_coins_count < 1024)
+        {
+          u32_t coins[SIMD_WIDTH][14];
+          extract_coins_simd(interleaved_data, coins);
+          memcpy(found_coins[found_coins_count], coins[lane], sizeof(coins[lane]));
+          found_coins_count++;
+        }
+        coins_found++;
+
+#ifndef __EMSCRIPTEN__
+        printf("COIN FOUND! (attempt %llu, lane %d)\n",
+               (unsigned long long)(total_attempts - SIMD_WIDTH + lane), lane);
+#endif
+      }
+    }
+  }
+
+  return keep_running;
+}
+#endif
+
 //
 // Main mining iteration (called from JavaScript)
 //
@@ -92,6 +277,8 @@ int mine_coins_wasm(u32_t iterations_per_call)
 {
   static u32_t coin[14];
   static int initialized = 0;
+  static int last_logged_mode = -1; // -1: nothing logged yet, 0: scalar, 1: SIMD
+
   u32_t hash[5];
 
   // Initialize coin template on first call
@@ -123,7 +310,23 @@ int mine_coins_wasm(u32_t iterations_per_call)
   if(!keep_running)
     return 0;
 
-  // Mine for the specified number of iterations
+#if defined(__wasm_simd128__)
+  if(use_simd)
+  {
+    if (last_logged_mode != 1) {
+      printf("C: Running in SIMD Mode (Vectorized)\n");
+      last_logged_mode = 1;
+    }
+    return mine_coins_wasm_simd_internal(iterations_per_call, coin);
+  }
+#endif
+
+  if (last_logged_mode != 0) {
+    printf("C: Running in Scalar Mode\n");
+    last_logged_mode = 0;
+  }
+
+  // Scalar implementation
   for(u32_t i = 0; i < iterations_per_call && keep_running; i++)
   {
     sha1(coin, hash);
@@ -165,10 +368,8 @@ void get_statistics(u64_t *attempts, u32_t *coins, double *hash_rate, double *el
 
   double actual_elapsed;
   if(!keep_running && last_pause_time > 0) {
-    // If paused, use the paused time
     actual_elapsed = last_pause_time - mining_start_time - pause_time_offset;
   } else {
-    // If running, calculate normally
     actual_elapsed = current_time - mining_start_time - pause_time_offset;
   }
 
@@ -176,6 +377,44 @@ void get_statistics(u64_t *attempts, u32_t *coins, double *hash_rate, double *el
   *hash_rate = (actual_elapsed > 0) ?
(total_attempts / actual_elapsed) : 0;
 }
 
+//
+// Enable/disable SIMD (NOTE: the flag is stored as given, even in a build without SIMD support)
+//
+#ifdef __EMSCRIPTEN__
+EMSCRIPTEN_KEEPALIVE
+#endif
+void set_simd_enabled(int enabled)
+{
+  use_simd = enabled;
+  printf("C: SIMD mode set to: %d\n", use_simd);
+}
+
+//
+// Check if SIMD is available (1 when compiled with __wasm_simd128__ defined, 0 otherwise)
+//
+#ifdef __EMSCRIPTEN__
+EMSCRIPTEN_KEEPALIVE
+#endif
+int is_simd_available()
+{
+#if defined(__wasm_simd128__)
+  return 1;
+#else
+  return 0;
+#endif
+}
+
+//
+// Get current SIMD state (the value last stored in use_simd)
+//
+#ifdef __EMSCRIPTEN__
+EMSCRIPTEN_KEEPALIVE
+#endif
+int is_simd_enabled()
+{
+  return use_simd;
+}
+
 //
 // Stop mining
 //
@@ -207,7 +446,7 @@ void resume_mining()
 }
 
 //
-// Get found coin data (returns pointer to coin array)
+// Get found coin data
 //
 #ifdef __EMSCRIPTEN__
 EMSCRIPTEN_KEEPALIVE
@@ -259,6 +498,12 @@ int main(int argc, char *argv[])
     max_attempts = strtoull(argv[1], NULL, 10);
 
   printf("Mining DETI coins using WebAssembly implementation (standalone mode)...\n");
+#if defined(__wasm_simd128__)
+  printf("SIMD support: available\n");
+  use_simd = 1;
+#else
+  printf("SIMD support: not available\n");
+#endif
   printf("Press Ctrl+C to stop\n\n");
 
   time_measurement();
diff --git a/aad_ocl_utilities.h b/aad_ocl_utilities.h
new file mode 100644
index 0000000..5b3e77a
--- /dev/null
+++ b/aad_ocl_utilities.h
@@ -0,0 +1,362 @@
+//
+// Arquiteturas de Alto Desempenho 2025/2026
+//
+// OpenCL utilities (the including file must provide u32_t, <stdio.h> and <stdlib.h> before this header)
+//
+
+#ifndef AAD_OCL_UTILITIES
+#define AAD_OCL_UTILITIES
+
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>
+
+//
+// data type used to store all OpenCL related stuff
+//
+
+#define MAX_N_ARGUMENTS 4
+#define RECOMMENDED_OCL_WORK_GROUP_SIZE 128
+
+typedef struct
+{
+  // input data
+  int platform_number; // number of the platform to use
+  int device_number; // number of the device to initialize
+  char *kernel_file_name; // name of the kernel file to load
+  char *kernel_name; // name of the OpenCL kernel to load
+  u32_t data_size[2]; // the number of bytes of the two data arrays to allocate on the host and on the device (0 if not needed)
+
+  // persistent data
+  cl_platform_id platform; // the platform handle
+  cl_device_id device; // the device handle
+  char device_name[256]; // the device name
+  cl_context context; // the device context
+  cl_command_queue queue; // the command queue
+  cl_program program; // the compiled program
+  cl_kernel kernel; // the kernel handle
+  void *host_data[2]; // the pointers to the host data
+  cl_mem device_data[2]; // the device memory objects
+
+  // launch kernel data
+  size_t global_work_size; // total number of work items
+  size_t local_work_size; // number of work items per work group
+  int n_kernel_arguments; // number of kernel arguments
+}
+ocl_data_t;
+
+//
+// CL_CALL --- macro to call an OpenCL function and test its return value (prints the error and exits on failure)
+//
+
+#define CL_CALL(f_name, args) \
+  do \
+  { \
+    cl_int e = f_name args; \
+    if(e != CL_SUCCESS) \
+    { \
+      fprintf(stderr,"" # f_name "() returned %s (file %s, line %d)\n",cl_error_string(e),__FILE__,__LINE__); \
+      exit(1); \
+    } \
+  } \
+  while(0)
+
+//
+// Error code to string conversion (unknown codes are formatted into a static buffer, so not reentrant)
+//
+
+static const char *cl_error_string(cl_int e)
+{
+  static char error_string[64];
+#define CASE(error_code) case error_code: return "" # error_code;
+  switch(e)
+  {
+    default: snprintf(error_string,sizeof(error_string),"unknown error code (%d)",(int)e); return error_string;
+    CASE(CL_SUCCESS);
+    CASE(CL_DEVICE_NOT_FOUND);
+    CASE(CL_DEVICE_NOT_AVAILABLE);
+    CASE(CL_COMPILER_NOT_AVAILABLE);
+    CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
+    CASE(CL_OUT_OF_RESOURCES);
+    CASE(CL_OUT_OF_HOST_MEMORY);
+    CASE(CL_PROFILING_INFO_NOT_AVAILABLE);
+    CASE(CL_MEM_COPY_OVERLAP);
+    CASE(CL_IMAGE_FORMAT_MISMATCH);
+    CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
+    CASE(CL_BUILD_PROGRAM_FAILURE);
+    CASE(CL_MAP_FAILURE);
+    CASE(CL_INVALID_VALUE);
+    CASE(CL_INVALID_DEVICE_TYPE);
+    CASE(CL_INVALID_PLATFORM);
+    CASE(CL_INVALID_DEVICE);
+    CASE(CL_INVALID_CONTEXT);
+    CASE(CL_INVALID_QUEUE_PROPERTIES);
+    CASE(CL_INVALID_COMMAND_QUEUE);
+    CASE(CL_INVALID_HOST_PTR);
+    CASE(CL_INVALID_MEM_OBJECT);
+    CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+    CASE(CL_INVALID_IMAGE_SIZE);
+    CASE(CL_INVALID_SAMPLER);
+    CASE(CL_INVALID_BINARY);
+    CASE(CL_INVALID_BUILD_OPTIONS);
+    CASE(CL_INVALID_PROGRAM);
+    CASE(CL_INVALID_PROGRAM_EXECUTABLE);
+    CASE(CL_INVALID_KERNEL_NAME);
+    CASE(CL_INVALID_KERNEL_DEFINITION);
+    CASE(CL_INVALID_KERNEL);
+    CASE(CL_INVALID_ARG_INDEX);
+    CASE(CL_INVALID_ARG_VALUE);
+    CASE(CL_INVALID_ARG_SIZE);
+    CASE(CL_INVALID_KERNEL_ARGS);
+    CASE(CL_INVALID_WORK_DIMENSION);
+    CASE(CL_INVALID_WORK_GROUP_SIZE);
+    CASE(CL_INVALID_WORK_ITEM_SIZE);
+    CASE(CL_INVALID_GLOBAL_OFFSET);
+    CASE(CL_INVALID_EVENT_WAIT_LIST);
+    CASE(CL_INVALID_EVENT);
+    CASE(CL_INVALID_OPERATION);
+    CASE(CL_INVALID_GL_OBJECT);
+    CASE(CL_INVALID_BUFFER_SIZE);
+    CASE(CL_INVALID_MIP_LEVEL);
+    CASE(CL_INVALID_GLOBAL_WORK_SIZE);
+  }
+#undef CASE
+}
+
+// Read a whole kernel source file into a malloc()ed, NUL-terminated buffer that the caller must free().
+// The number of bytes actually read is stored in *length.
+// Failing to open, size or allocate prints a message and terminates the program.
+
+static char *read_kernel_source(const char *filename, size_t *length)
+{
+  FILE *fp = fopen(filename, "rb");
+  if(!fp)
+  {
+    fprintf(stderr, "Failed to open kernel file: %s\n", filename);
+    exit(1);
+  }
+
+  // Determine the file size; a failing ftell() returns -1, which must never reach malloc()
+  long file_size = (fseek(fp, 0, SEEK_END) == 0) ? ftell(fp) : -1L;
+  if(file_size < 0 || fseek(fp, 0, SEEK_SET) != 0)
+  {
+    fprintf(stderr, "Failed to determine the size of kernel file: %s\n", filename);
+    fclose(fp);
+    exit(1);
+  }
+  *length = (size_t)file_size;
+
+  char *source = (char *)malloc(*length + 1);
+  if(!source)
+  {
+    fprintf(stderr, "Failed to allocate memory for kernel source\n");
+    fclose(fp);
+    exit(1);
+  }
+
+  size_t read = fread(source, 1, *length, fp);
+  source[read] = '\0';
+  fclose(fp);
+
+  *length = read;
+  return source;
+}
+
+// Initialize OpenCL: select the GPU device od->device_number of platform od->platform_number, create the
+// context and command queue, build od->kernel_file_name, create the kernel od->kernel_name and allocate
+// the host/device buffers requested in od->data_size[]. Every failure prints a message and exits.
+
+static void initialize_ocl(ocl_data_t *od)
+{
+  cl_uint num_platforms, num_devices;
+  cl_int err;
+
+  // Get platform
+  CL_CALL(clGetPlatformIDs, (0, NULL, &num_platforms));
+  if(od->platform_number < 0 || od->platform_number >= (int)num_platforms)
+  {
+    fprintf(stderr, "Invalid platform number %d (only %u platforms available)\n",
+            od->platform_number, num_platforms);
+    exit(1);
+  }
+
+  cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
+  if(!platforms)
+  {
+    fprintf(stderr, "Failed to allocate the platform list\n");
+    exit(1);
+  }
+  CL_CALL(clGetPlatformIDs, (num_platforms, platforms, NULL));
+  od->platform = platforms[od->platform_number];
+  free(platforms);
+
+  // Get device
+  CL_CALL(clGetDeviceIDs, (od->platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices));
+  if(od->device_number < 0 || od->device_number >= (int)num_devices)
+  {
+    fprintf(stderr, "Invalid device number %d (only %u devices available)\n",
+            od->device_number, num_devices);
+    exit(1);
+  }
+
+  cl_device_id *devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
+  if(!devices)
+  {
+    fprintf(stderr, "Failed to allocate the device list\n");
+    exit(1);
+  }
+  CL_CALL(clGetDeviceIDs, (od->platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL));
+  od->device = devices[od->device_number];
+  free(devices);
+
+  // Get device name
+  CL_CALL(clGetDeviceInfo, (od->device, CL_DEVICE_NAME, sizeof(od->device_name),
+                            od->device_name, NULL));
+  printf("initialize_ocl(): OpenCL code running on %s\n", od->device_name);
+
+  // Create context
+  od->context = clCreateContext(NULL, 1, &od->device, NULL, NULL, &err);
+  if(err != CL_SUCCESS)
+  {
+    fprintf(stderr, "clCreateContext() returned %s\n", cl_error_string(err));
+    exit(1);
+  }
+
+  // Create command queue
+  od->queue = clCreateCommandQueue(od->context, od->device, 0, &err);
+  if(err != CL_SUCCESS)
+  {
+    fprintf(stderr, "clCreateCommandQueue() returned %s\n", cl_error_string(err));
+    exit(1);
+  }
+
+  // Load and compile kernel
+  size_t source_length;
+  char *source = read_kernel_source(od->kernel_file_name, &source_length);
+
+  od->program = clCreateProgramWithSource(od->context, 1, (const char **)&source,
+                                          &source_length, &err);
+  free(source);
+  if(err != CL_SUCCESS)
+  {
+    fprintf(stderr, "clCreateProgramWithSource() returned %s\n", cl_error_string(err));
+    exit(1);
+  }
+
+  err = clBuildProgram(od->program, 1, &od->device, "-cl-std=CL1.2", NULL, NULL);
+  if(err != CL_SUCCESS)
+  {
+    fprintf(stderr, "clBuildProgram() returned %s\n", cl_error_string(err));
+
+    // Get build log (best effort: the log is skipped when it cannot be allocated)
+    size_t log_size = 0;
+    clGetProgramBuildInfo(od->program, od->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+    char *log = (char *)calloc(log_size + 1, 1); // one spare zero byte keeps the log NUL-terminated
+    if(log != NULL) clGetProgramBuildInfo(od->program, od->device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
+    fprintf(stderr, "Build log:\n%s\n", (log != NULL) ? log : "(not available)");
+    free(log);
+    exit(1);
+  }
+
+  // Create kernel
+  od->kernel = clCreateKernel(od->program, od->kernel_name, &err);
+  if(err != CL_SUCCESS)
+  {
+    fprintf(stderr, "clCreateKernel() returned %s\n", cl_error_string(err));
+    exit(1);
+  }
+
+  // Allocate host and device memory
+  for(int i = 0; i < 2; i++)
+  {
+    if(od->data_size[i] > 0u)
+    {
+      od->host_data[i] = malloc(od->data_size[i]);
+      if(!od->host_data[i])
+      {
+        fprintf(stderr, "Failed to allocate host memory\n");
+        exit(1);
+      }
+
+      od->device_data[i] = clCreateBuffer(od->context, CL_MEM_READ_WRITE,
+                                          od->data_size[i], NULL, &err);
+      if(err != CL_SUCCESS)
+      {
+        fprintf(stderr, "clCreateBuffer() returned %s\n", cl_error_string(err));
+        exit(1);
+      }
+    }
+    else
+    {
+      od->host_data[i] = NULL;
+    }
+  }
+}
+
+//
+// Terminate OpenCL (frees the host buffers and releases the OpenCL objects created by initialize_ocl())
+//
+
+static void terminate_ocl(ocl_data_t *od)
+{
+  for(int i = 0; i < 2; i++)
+  {
+    if(od->data_size[i] > 0u)
+    {
+      free(od->host_data[i]);
+      clReleaseMemObject(od->device_data[i]);
+    }
+  }
+  clReleaseKernel(od->kernel);
+  clReleaseProgram(od->program);
+  clReleaseCommandQueue(od->queue);
+  clReleaseContext(od->context);
+}
+
+//
+// Copy data between host and device (blocking; idx selects buffer 0 or 1, which must have been allocated)
+//
+
+static void host_to_device_copy(ocl_data_t *od, int idx)
+{
+  if(idx < 0 || idx > 1 || od->data_size[idx] == 0u)
+  {
+    fprintf(stderr, "host_to_device_copy(): bad idx\n");
+    exit(1);
+  }
+  CL_CALL(clEnqueueWriteBuffer, (od->queue, od->device_data[idx], CL_TRUE, 0,
+                                 od->data_size[idx], od->host_data[idx], 0, NULL, NULL));
+}
+
+static void device_to_host_copy(ocl_data_t *od, int idx)
+{
+  if(idx < 0 || idx > 1 || od->data_size[idx] == 0u)
+  {
+    fprintf(stderr, "device_to_host_copy(): bad idx\n");
+    exit(1);
+  }
+  CL_CALL(clEnqueueReadBuffer, (od->queue, od->device_data[idx], CL_TRUE, 0,
+                                od->data_size[idx], od->host_data[idx], 0, NULL, NULL));
+}
+
+//
+// Set kernel argument (arg_value points to arg_size bytes that OpenCL copies during the call)
+//
+
+static void set_kernel_arg(ocl_data_t *od, int arg_idx, size_t arg_size, const void *arg_value)
+{
+  CL_CALL(clSetKernelArg, (od->kernel, (cl_uint)arg_idx, arg_size, arg_value));
+}
+
+//
+// Launch kernel (one-dimensional range; returns only after clFinish() reports that the queue is idle)
+//
+
+static void launch_kernel(ocl_data_t *od)
+{
+  CL_CALL(clEnqueueNDRangeKernel, (od->queue, od->kernel, 1, NULL,
+                                   &od->global_work_size, &od->local_work_size,
+                                   0, NULL, NULL));
+  CL_CALL(clFinish, (od->queue));
+}
+
+#endif
diff --git a/aad_sha1_ocl_kernel.cl b/aad_sha1_ocl_kernel.cl
new file mode 100644
index 0000000..2ba781b
--- /dev/null
+++ b/aad_sha1_ocl_kernel.cl
@@ -0,0 +1,175 @@
+//
+// Arquiteturas de Alto Desempenho 2025/2026
+//
+// SHA-1 OpenCL kernel - validation and testing
+//
+
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+// SHA-1 functions
+#define SHA1_F1(x,y,z) ((x & y) | (~x & z))
+#define SHA1_K1 0x5A827999u
+#define SHA1_F2(x,y,z) (x ^ y ^ z)
+#define SHA1_K2 0x6ED9EBA1u
+#define SHA1_F3(x,y,z) ((x & y) | (x & z) | (y & z))
+#define SHA1_K3 0x8F1BBCDCu
+#define SHA1_F4(x,y,z) (x ^ y ^ z)
+#define SHA1_K4 0xCA62C1D6u
+
+//
+// sha1_ocl_kernel() --- each OpenCL work item computes the SHA-1 secure hash of one message
+//
+// This kernel should only be used to validate the SHA-1 secure hash code in OpenCL
+// It matches the structure of the CUDA sha1_cuda_kernel
+//
+// The data is organized in an interleaved fashion for optimal memory access:
+// For work group size W, the layout is:
+// data[work_group_num][14][W]
+// where data for the same index across all work items in a group are consecutive
+//
+__kernel void sha1_ocl_kernel(__global uint *interleaved_data, __global uint *interleaved_hash)
+{
+  uint gid = get_global_id(0);
+  uint local_id = get_local_id(0);
+  uint local_size = get_local_size(0);
+  uint group_id = get_group_id(0);
+
+  // Adjust pointers for interleaved access pattern
+  // Each work group processes local_size messages
+  // Within a group, data is interleaved: all work items' data[0],
then all data[1], etc. + __global uint *data = &interleaved_data[group_id * (local_size * 14) + local_id]; + __global uint *hash = &interleaved_hash[group_id * (local_size * 5) + local_id]; + + // Local storage for computation + uint a, b, c, d, e, w[16]; + uint tmp; + + // Initial hash values + a = 0x67452301u; + b = 0xEFCDAB89u; + c = 0x98BADCFEu; + d = 0x10325476u; + e = 0xC3D2E1F0u; + + // Load message schedule from interleaved data + // DATA(idx) accesses data[idx * local_size] to get the correct interleaved value + for(int i = 0; i < 14; i++) + w[i] = data[local_size * i]; + w[14] = 0; + w[15] = 440; // 55 bytes * 8 bits + + // SHA-1 compression - 80 rounds + + // Rounds 0-15 (no message schedule expansion needed) + #define SHA1_STEP(F, K, t) \ + tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t] + K; \ + e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp; + + SHA1_STEP(SHA1_F1, SHA1_K1, 0); + SHA1_STEP(SHA1_F1, SHA1_K1, 1); + SHA1_STEP(SHA1_F1, SHA1_K1, 2); + SHA1_STEP(SHA1_F1, SHA1_K1, 3); + SHA1_STEP(SHA1_F1, SHA1_K1, 4); + SHA1_STEP(SHA1_F1, SHA1_K1, 5); + SHA1_STEP(SHA1_F1, SHA1_K1, 6); + SHA1_STEP(SHA1_F1, SHA1_K1, 7); + SHA1_STEP(SHA1_F1, SHA1_K1, 8); + SHA1_STEP(SHA1_F1, SHA1_K1, 9); + SHA1_STEP(SHA1_F1, SHA1_K1, 10); + SHA1_STEP(SHA1_F1, SHA1_K1, 11); + SHA1_STEP(SHA1_F1, SHA1_K1, 12); + SHA1_STEP(SHA1_F1, SHA1_K1, 13); + SHA1_STEP(SHA1_F1, SHA1_K1, 14); + SHA1_STEP(SHA1_F1, SHA1_K1, 15); + + #undef SHA1_STEP + + // Rounds 16-79 with message schedule expansion + #define SHA1_EXPAND_STEP(F, K, t) \ + tmp = w[(t-3) & 15] ^ w[(t-8) & 15] ^ w[(t-14) & 15] ^ w[(t-16) & 15]; \ + w[t & 15] = ROTATE_LEFT(tmp, 1); \ + tmp = ROTATE_LEFT(a, 5) + F(b,c,d) + e + w[t & 15] + K; \ + e = d; d = c; c = ROTATE_LEFT(b, 30); b = a; a = tmp; + + // Rounds 16-19 (still using F1) + SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 16); + SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 17); + SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 18); + SHA1_EXPAND_STEP(SHA1_F1, SHA1_K1, 19); + + // Rounds 20-39 (F2) + 
SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 20); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 21); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 22); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 23); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 24); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 25); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 26); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 27); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 28); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 29); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 30); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 31); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 32); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 33); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 34); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 35); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 36); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 37); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 38); + SHA1_EXPAND_STEP(SHA1_F2, SHA1_K2, 39); + + // Rounds 40-59 (F3) + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 40); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 41); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 42); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 43); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 44); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 45); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 46); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 47); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 48); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 49); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 50); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 51); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 52); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 53); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 54); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 55); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 56); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 57); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 58); + SHA1_EXPAND_STEP(SHA1_F3, SHA1_K3, 59); + + // Rounds 60-79 (F4) + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 60); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 61); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 62); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 63); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 64); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 65); + 
SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 66); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 67); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 68); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 69); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 70); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 71); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 72); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 73); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 74); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 75); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 76); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 77); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 78); + SHA1_EXPAND_STEP(SHA1_F4, SHA1_K4, 79); + + #undef SHA1_EXPAND_STEP + + // Add to initial hash values and store in interleaved format + // HASH(idx) writes to hash[idx * local_size] + hash[local_size * 0] = a + 0x67452301u; + hash[local_size * 1] = b + 0xEFCDAB89u; + hash[local_size * 2] = c + 0x98BADCFEu; + hash[local_size * 3] = d + 0x10325476u; + hash[local_size * 4] = e + 0xC3D2E1F0u; +} diff --git a/index.html b/index.html index d6ed4d9..48f824f 100644 --- a/index.html +++ b/index.html @@ -13,6 +13,11 @@ padding: 10px 20px; margin: 5px; font-size: 16px; + cursor: pointer; + } + button:disabled { + opacity: 0.5; + cursor: not-allowed; } #stats { margin-top: 20px; @@ -56,22 +61,96 @@ .coin-data { color: #000080; } + .simd-toggle { + display: inline-flex; + align-items: center; + gap: 10px; + } + .toggle-switch { + position: relative; + display: inline-block; + width: 60px; + height: 34px; + } + .toggle-switch input { + opacity: 0; + width: 0; + height: 0; + } + .slider { + position: absolute; + cursor: pointer; + top: 0; + left: 0; + right: 0; + bottom: 0; + background-color: #ccc; + transition: .4s; + border-radius: 34px; + } + .slider:before { + position: absolute; + content: ""; + height: 26px; + width: 26px; + left: 4px; + bottom: 4px; + background-color: white; + transition: .4s; + border-radius: 50%; + } + input:checked + .slider { + background-color: #2196F3; + } + input:checked + .slider:before { + transform: 
translateX(26px); + } + input:disabled + .slider { + background-color: #ddd; + cursor: not-allowed; + } + .simd-status { + font-weight: bold; + } + .simd-available { + color: #28a745; + } + .simd-unavailable { + color: #dc3545; + } + .button-group { + margin: 15px 0; + }

DETI Coin Miner (WebAssembly)

+ +
+ + + +
+
- +
+
- + +
+ +
+ + + +
- - - -
Waiting to start... @@ -90,11 +169,56 @@ let updateInterval; let lastDisplayedCoinCount = 0; let pausedStats = false; + let simdAvailable = false; CoinMinerModule().then(mod => { Module = mod; console.log('WebAssembly module loaded'); + // Check if SIMD is available + simdAvailable = Module._is_simd_available(); + const simdToggle = document.getElementById('simdToggle'); + const simdStatus = document.getElementById('simdStatus'); + + if (simdAvailable) { + simdStatus.textContent = 'SIMD Available'; + simdStatus.className = 'simd-status simd-available'; + simdToggle.disabled = false; + + // SYNC LOGIC: Read C state first to match backend + const cState = Module._is_simd_enabled(); + simdToggle.checked = (cState === 1); + + // Force sync again just to be safe + Module._set_simd_enabled(simdToggle.checked ? 1 : 0); + console.log(`JS: Initialized - SIMD Available, Toggle set to ${simdToggle.checked}`); + } else { + simdStatus.textContent = 'SIMD Not Available'; + simdStatus.className = 'simd-status simd-unavailable'; + simdToggle.disabled = true; + simdToggle.checked = false; + Module._set_simd_enabled(0); + console.log('JS: Initialized - SIMD Not Available'); + } + + // SIMD toggle handler + simdToggle.onchange = () => { + if (simdAvailable) { + const newState = simdToggle.checked ? 1 : 0; + console.log(`JS: User toggled SIMD to ${newState}`); + + // Call the C function + Module._set_simd_enabled(newState); + + // Update status display + const currentMode = simdToggle.checked ? 'SIMD Mode' : 'Scalar Mode'; + if (!mining) { + document.getElementById('stats').innerHTML = + `${currentMode} selected. 
Click Start to begin mining.`; + } + } + }; + document.getElementById('start').onclick = () => { if (!mining) { mining = true; @@ -114,10 +238,10 @@ updateStats(); let currentHTML = document.getElementById('stats').innerHTML; - document.getElementById('stats').innerHTML = currentHTML.replace('Mining Statistics:', 'Mining Statistics (PAUSED):'); + document.getElementById('stats').innerHTML = + currentHTML.replace('Mining Statistics:', 'Mining Statistics (PAUSED):'); pausedStats = true; - console.log('Mining stopped'); }; @@ -128,7 +252,10 @@ clearInterval(miningInterval); clearInterval(updateInterval); lastDisplayedCoinCount = 0; - document.getElementById('stats').innerHTML = 'Reset complete. Click Start to begin.'; + + const mode = simdToggle.checked ? 'SIMD' : 'Scalar'; + document.getElementById('stats').innerHTML = + `Reset complete. Using ${mode} mode. Click Start to begin.`; document.getElementById('coins').innerHTML = ''; document.getElementById('coin-count').textContent = '0'; console.log('Mining reset'); @@ -175,7 +302,8 @@ const timestamp = new Date().toLocaleTimeString(); const entry = document.createElement('div'); - entry.innerHTML = `[${timestamp}] Coin #${i + 1}: ${coinStr}`; + entry.innerHTML = + `[${timestamp}] Coin #${i + 1}: ${coinStr}`; coinsDiv.appendChild(entry); coinsDiv.scrollTop = coinsDiv.scrollHeight; @@ -193,23 +321,18 @@ const hashRatePtr = Module._malloc(8); const elapsedPtr = Module._malloc(8); - // Zero-initialize the memory before calling Module.setValue(attemptsPtr, 0, 'i32'); Module.setValue(attemptsPtr + 4, 0, 'i32'); Module._get_statistics(attemptsPtr, coinsPtr, hashRatePtr, elapsedPtr); - // Read 64-bit unsigned value correctly - // On little-endian, low 32 bits come first const attemptsLowUnsigned = Module.getValue(attemptsPtr, 'i32') >>> 0; const attemptsHighUnsigned = Module.getValue(attemptsPtr + 4, 'i32') >>> 0; - // Combine - for display purposes, if high part is 0, just show low part let attempts; if (attemptsHighUnsigned 
=== 0) { attempts = attemptsLowUnsigned; } else { - // Use BigInt for values > 32 bits const low = BigInt(attemptsLowUnsigned); const high = BigInt(attemptsHighUnsigned); attempts = (high * BigInt(4294967296)) + low; @@ -224,8 +347,10 @@ Module._free(hashRatePtr); Module._free(elapsedPtr); + const mode = Module._is_simd_enabled() ? 'SIMD' : 'Scalar'; + document.getElementById('stats').innerHTML = ` - Mining Statistics:
+ Mining Statistics (${mode} Mode):
Attempts: ${attempts.toString()}
Coins Found: ${coins}
Hash Rate: ${(hashRate / 1e6).toFixed(2)} MH/s
diff --git a/makefile b/makefile index 1a5feef..1d1d1ef 100644 --- a/makefile +++ b/makefile @@ -42,7 +42,7 @@ CUDA_ARCH = sm_86 clean: rm -f sha1_tests rm -f sha1_cuda_test sha1_cuda_kernel.cubin - rm -f coin_miner_cpu coin_miner_simd coin_miner_cuda coin_miner_cuda_kernel.cubin + rm -f coin_miner_cpu coin_miner_simd coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl rm -f coin_miner_wasm.js coin_miner_wasm.wasm rm -f benchmark rm -f a.out @@ -85,8 +85,11 @@ coin_miner_cuda_kernel.cubin: aad_coin_miner_cuda_kernel.cu aad_sha1.h makefile coin_miner_cuda: aad_coin_miner_cuda.c coin_miner_cuda_kernel.cubin aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h aad_vault.h aad_cuda_utilities.h makefile cc -march=native -Wall -Wshadow -Werror -O3 -I$(CUDA_DIR)/include $< -o $@ -lcuda +coin_miner_ocl: aad_coin_miner_ocl.c aad_coin_miner_ocl_kernel.cl aad_sha1.h aad_sha1_cpu.h aad_sha1_ocl_kernel.cl aad_data_types.h aad_utilities.h aad_vault.h aad_ocl_utilities.h makefile + cc -march=native -Wall -Wshadow -O3 $< -o $@ -lOpenCL + coin_miner_wasm: aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h aad_data_types.h aad_utilities.h aad_vault.h makefile - emcc -O3 -flto -o coin_miner_wasm.js aad_coin_miner_wasm.c \ + emcc -O3 -flto -msimd128 -o coin_miner_wasm.js aad_coin_miner_wasm.c \ -s WASM=1 \ -s EXPORTED_FUNCTIONS='["_mine_coins_wasm","_get_statistics","_stop_mining","_reset_mining","_get_found_coin","_get_found_coins_count","_malloc","_free"]' \ -s EXPORTED_RUNTIME_METHODS='["cwrap","ccall","getValue","setValue"]' \ @@ -98,8 +101,8 @@ coin_miner_wasm: aad_coin_miner_wasm.c aad_sha1.h aad_sha1_cpu.h aad_sha1_wasm.h benchmark: aad_benchmark.c aad_sha1.h aad_sha1_cpu.h aad_data_types.h aad_utilities.h makefile cc -march=native -Wall -Wshadow -Werror -O3 $< -o $@ -miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda benchmark +miners: coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_ocl benchmark all: 
sha1_tests sha1_cuda_test sha1_cuda_kernel.cubin \ - coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin \ + coin_miner_cpu coin_miner_simd coin_miner_wasm coin_miner_cuda coin_miner_cuda_kernel.cubin coin_miner_ocl \ benchmark