// // Tomás Oliveira e Silva, September 2025 // // Arquiteturas de Alto Desempenho 2025/2026 // #include #include #include #include #include "aad_data_types.h" #include "aad_utilities.h" #include "aad_sha1_cpu.h" #include "aad_cuda_utilities.h" static void test_sha1_cuda(int n_tests) { u32_t n,*interleaved32_data,*interleaved32_hash,data[14],hash[5],good_hash[5]; double host_to_device_time,kernel_time,device_to_host_time,hashes_per_second; cuda_data_t cd; if(n_tests <= 0 || n_tests > (1 << 24) || n_tests % RECOMMENDED_CUDA_BLOCK_SIZE != 0) { fprintf(stderr,"test_sha1_cuda(): bad number of tests\n"); exit(1); } // initialize cd.device_number = 0; // first device cd.cubin_file_name = "sha1_cuda_kernel.cubin"; cd.kernel_name = "sha1_cuda_kernel"; cd.data_size[0] = (u32_t)n_tests * (u32_t)14 * (u32_t)sizeof(u32_t); // size of the data array cd.data_size[1] = (u32_t)n_tests * (u32_t) 5 * (u32_t)sizeof(u32_t); // size of the hash array fprintf(stderr,"test_sha1_cuda(): %.3f MiB bytes for the interleaved32_data[] array\n",(double)cd.data_size[0] / (double)(1 << 20)); fprintf(stderr,"test_sha1_cuda(): %.3f MiB bytes for the interleaved32_hash[] array\n",(double)cd.data_size[1] / (double)(1 << 20)); initialize_cuda(&cd); interleaved32_data = (u32_t *)cd.host_data[0]; interleaved32_hash = (u32_t *)cd.host_data[1]; // random interleaved32_data n = cd.data_size[0]; while(n != 0u) ((u08_t *)interleaved32_data)[--n] = random_byte(); // run SHA1 in the CUDA device time_measurement(); host_to_device_copy(&cd,0); // idx=0 means that the interleaved32_data is copied to the CUDA device time_measurement(); host_to_device_time = wall_time_delta(); cd.grid_dim_x = (u32_t)n_tests / (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE; cd.block_dim_x = (u32_t)RECOMMENDED_CUDA_BLOCK_SIZE; cd.n_kernel_arguments = 2; cd.arg[0] = &cd.device_data[0]; // interleaved32_data cd.arg[1] = &cd.device_data[1]; // interleaved32_hash time_measurement(); launch_kernel(&cd); time_measurement(); kernel_time = wall_time_delta(); time_measurement(); device_to_host_copy(&cd,1); // idx=1 means that the interleaved32_hash is copied to the host time_measurement(); device_to_host_time = wall_time_delta(); // test for(n = 0;n < n_tests;n++) { // deinterleave the data and the hash // on the CUDA side, the data for each warp is clustered together; what follows must match what is in the CUDA kernel // each warp has 32 threads int warp_number = n / 32; int lane = n % 32; for(int idx = 0;idx < 14;idx++) data[idx] = interleaved32_data[32 * 14 * warp_number + 32 * idx + lane]; for(int idx = 0;idx < 5;idx++) hash[idx] = interleaved32_hash[32 * 5 * warp_number + 32 * idx + lane]; // compute the SHA1 secure hahs on the cpu sha1(&data[0],&good_hash[0]); // compare them for(int idx = 0;idx < 5;idx++) if(hash[idx] != good_hash[idx]) { fprintf(stderr,"test_sha1_cuda() failed for n=%d\n",n); for(idx = 0;idx < 14;idx++) fprintf(stderr,"%2d 0x%08X\n",idx,data[idx]); fprintf(stderr,"---\n"); for(idx = 0;idx < 5;idx++) fprintf(stderr,"%2d 0x%08X 0x%08X\n",idx,good_hash[idx],hash[idx]); exit(1); } } // cleanup terminate_cuda(&cd); hashes_per_second = (double)n_tests / kernel_time; printf("sha1_cuda_kernel() passed (%d test%s, %.0f secure hashes per second)\n",n_tests,(n_tests == 1) ? "" : "s",hashes_per_second); printf(" host -> device --- %.6f seconds\n",host_to_device_time); printf(" kernel ----------- %.6f seconds\n",kernel_time); printf(" device -> host --- %.6f seconds\n",device_to_host_time); } // // main program // int main(void) { test_sha1_cuda(128 * 65536); return 0; }