417 lines
13 KiB
C
417 lines
13 KiB
C
//
// Tomás Oliveira e Silva, September 2025
//
// Arquiteturas de Alto Desempenho 2025/2026
//
|
|
|
|
#include <time.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "aad_data_types.h"
|
|
#include "aad_utilities.h"
|
|
#include "aad_sha1_cpu.h"
|
|
|
|
//
|
|
// test the reference implementation
|
|
//
|
|
|
|
static void test_sha1(int n_tests,int n_measurements)
|
|
{
|
|
static union { u08_t c[14 * 4]; u32_t i[14]; } data; // the data as bytes and as 32-bit integers
|
|
static union { u08_t c[ 5 * 4]; u32_t i[ 5]; } hash; // the hash as bytes and as 32-bit integers
|
|
char command[320]; // 320 is more than enough
|
|
char response[64]; // 64 is more than enough
|
|
char computed[64]; // 64 is more than enough
|
|
double hashes_per_second;
|
|
int n,i,idx;
|
|
u32_t sum;
|
|
FILE *fp;
|
|
|
|
// test
|
|
response[40] = '\0';
|
|
for(n = 0;n < n_tests;n++)
|
|
{
|
|
// create random data (55 bytes)
|
|
for(i = 0;i < 55;i++)
|
|
data.c[i ^ 3] = random_byte();
|
|
// append padding (a SHA1 thing...)
|
|
data.c[55 ^ 3] = 0x80;
|
|
// compute its SHA1 secure hash
|
|
sha1(&data.i[0],&hash.i[0]);
|
|
// convert the secure hash into a string
|
|
idx = 0;
|
|
for(i = 0;i < 20;i++)
|
|
idx += sprintf(&computed[idx],"%02x",(int)hash.c[i ^ 3] & 0xFF);
|
|
if(idx >= (int)sizeof(computed))
|
|
{
|
|
fprintf(stderr,"computed[] is too small\n");
|
|
exit(1);
|
|
}
|
|
// construct the command to test the SHA1 secure hash
|
|
idx = sprintf(&command[0],"/bin/echo -en '"); // do not rely on the bash echo builtin command
|
|
for(i = 0;i < 55;i++)
|
|
idx += sprintf(&command[idx],"\\x%02x",data.c[i ^ 3]);
|
|
idx += sprintf(&command[idx],"' | sha1sum");
|
|
if(idx >= (int)sizeof(command))
|
|
{
|
|
fprintf(stderr,"command[] is too small\n");
|
|
exit(1);
|
|
}
|
|
// run it and get its output
|
|
fp = popen(command,"r");
|
|
if(fp == NULL)
|
|
{
|
|
fprintf(stderr,"popen() failed\n");
|
|
exit(1);
|
|
}
|
|
if(fread((void *)&response[0],sizeof(char),(size_t)40,fp) != (size_t)40)
|
|
{
|
|
fprintf(stderr,"fread() failed\n");
|
|
exit(1);
|
|
}
|
|
pclose(fp);
|
|
// compare them
|
|
if(memcmp((void *)response,(void *)computed,(size_t)40) != 0)
|
|
{ // print everything
|
|
fprintf(stderr,"sha1() failure for n=%d:\n",n);
|
|
for(i = 0;i < 55;i++)
|
|
fprintf(stderr," message[%2d] = %02x\n",i,(int)data.c[i ^ 3] & 0xFF);
|
|
for(i = 0;i < 20;i++)
|
|
fprintf(stderr," hash[%2d] = %02x\n",i,(int)hash.c[i ^ 3] % 0xFF);
|
|
fprintf(stderr," sha1sum output: %s\n",response);
|
|
fprintf(stderr," sha1() output: %s\n",computed);
|
|
for(i = 0;i < 40 && response[i] == computed[i];i++)
|
|
;
|
|
fprintf(stderr," mismatch at %d\n",i);
|
|
exit(1);
|
|
}
|
|
}
|
|
// warmup (turbo boost...)
|
|
for(i = n = 0;i < 1000000;i++)
|
|
n += (int)random_byte();
|
|
if(n == 0)
|
|
fprintf(stderr,"sha1(): this should not be possible, n=0\n");
|
|
// measure
|
|
time_measurement();
|
|
sum = 0u;
|
|
for(n = 0;n < n_measurements;n++)
|
|
{
|
|
data.i[0]++;
|
|
sha1(&data.i[0],&hash.i[0]);
|
|
sum += hash.i[4];
|
|
}
|
|
time_measurement();
|
|
if(sum == 0u)
|
|
fprintf(stderr,"sha1(): what a coincidence, sum=0\n");
|
|
hashes_per_second = (double)n_measurements / cpu_time_delta();
|
|
// report
|
|
printf("sha1() passed (%d test%s, %.0f secure hashes per second)\n",n_tests,(n_tests == 1) ? "" : "s",hashes_per_second);
|
|
}
|
|
|
|
|
|
//
|
|
// test the avx implementation
|
|
//
|
|
|
|
#if defined(__AVX__)
|
|
|
|
static void test_sha1_avx(int n_tests,int n_measurements)
|
|
{
|
|
#define N_LANES 4
|
|
static union { u08_t c[14 * 4]; u32_t i[14]; } data[N_LANES]; // the data as bytes and as 32-bit integers
|
|
static union { u08_t c[ 5 * 4]; u32_t i[ 5]; } hash[N_LANES]; // the hash as bytes and as 32-bit integers
|
|
static u32_t interleaved_data[14][N_LANES] __attribute__((aligned(16)));
|
|
static u32_t interleaved_hash[5][N_LANES] __attribute__((aligned(16)));
|
|
double hashes_per_second;
|
|
int n,i,lane;
|
|
u32_t sum;
|
|
|
|
// test
|
|
for(n = 0;n < n_tests;n++)
|
|
{
|
|
// the data and the secure hash for the reference implementation
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
{
|
|
// create random data (55 bytes)
|
|
for(i = 0;i < 55;i++)
|
|
data[lane].c[i ^ 3] = random_byte();
|
|
// append padding (a SHA1 thing...)
|
|
data[lane].c[55 ^ 3] = 0x80;
|
|
// compute its SHA1 secure hash
|
|
sha1(&data[lane].i[0],&hash[lane].i[0]);
|
|
}
|
|
// interleave (transpose) the data for the avx implementation
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
for(i = 0;i < 14;i++)
|
|
interleaved_data[i][lane] = data[lane].i[i];
|
|
// compute the four secure hashes in one go
|
|
sha1_avx((v4si *)&interleaved_data[0],(v4si *)&interleaved_hash[0]);
|
|
// test
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
for(i = 0;i < 5;i++)
|
|
if(interleaved_hash[i][lane] != hash[lane].i[i])
|
|
{
|
|
fprintf(stderr,"sha1_avx() failure for n=%d (bad/good):\n",n);
|
|
for(i = 0;i < 5;i++)
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
fprintf(stderr,"%s%08X/%08X%s",(lane == 0) ? " " : " ",interleaved_hash[i][lane] ,hash[lane].i[i],(lane == N_LANES - 1) ? "\n" : "");
|
|
exit(1);
|
|
}
|
|
}
|
|
// measure
|
|
time_measurement();
|
|
sum = 0u;
|
|
for(n = 0;n < n_measurements;n++)
|
|
{
|
|
interleaved_data[0][0]++;
|
|
sha1_avx((v4si *)&interleaved_data[0],(v4si *)&interleaved_hash[0]);
|
|
sum += interleaved_hash[4][0];
|
|
}
|
|
time_measurement();
|
|
if(sum == 0u)
|
|
fprintf(stderr,"sha1_avx(): what a coincidence, sum=0\n");
|
|
hashes_per_second = (double)n_measurements * (double)N_LANES / cpu_time_delta();
|
|
// report
|
|
printf("sha1_avx() passed (%d test%s, %.0f secure hashes per second)\n",n_tests,(n_tests == 1) ? "" : "s",hashes_per_second);
|
|
# undef N_LANES
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
//
|
|
// test the avx2 implementation
|
|
//
|
|
|
|
#if defined(__AVX2__)
|
|
|
|
static void test_sha1_avx2(int n_tests,int n_measurements)
|
|
{
|
|
#define N_LANES 8
|
|
static union { u08_t c[14 * 4]; u32_t i[14]; } data[N_LANES]; // the data as bytes and as 32-bit integers
|
|
static union { u08_t c[ 5 * 4]; u32_t i[ 5]; } hash[N_LANES]; // the hash as bytes and as 32-bit integers
|
|
static u32_t interleaved_data[14][N_LANES] __attribute__((aligned(32)));
|
|
static u32_t interleaved_hash[5][N_LANES] __attribute__((aligned(32)));
|
|
double hashes_per_second;
|
|
int n,i,lane;
|
|
u32_t sum;
|
|
|
|
// test
|
|
for(n = 0;n < n_tests;n++)
|
|
{
|
|
// the data and the secure hash for the reference implementation
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
{
|
|
// create random data (55 bytes)
|
|
for(i = 0;i < 55;i++)
|
|
data[lane].c[i ^ 3] = random_byte();
|
|
// append padding (a SHA1 thing...)
|
|
data[lane].c[55 ^ 3] = 0x80;
|
|
// compute its SHA1 secure hash
|
|
sha1(&data[lane].i[0],&hash[lane].i[0]);
|
|
}
|
|
// interleave (transpose) the data for the avx2 implementation
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
for(i = 0;i < 14;i++)
|
|
interleaved_data[i][lane] = data[lane].i[i];
|
|
// compute the eight secure hashes in one go
|
|
sha1_avx2((v8si *)&interleaved_data[0],(v8si *)&interleaved_hash[0]);
|
|
// test
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
for(i = 0;i < 5;i++)
|
|
if(interleaved_hash[i][lane] != hash[lane].i[i])
|
|
{
|
|
fprintf(stderr,"sha1_avx2() failure for n=%d (bad/good):\n",n);
|
|
for(i = 0;i < 5;i++)
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
fprintf(stderr,"%s%08X/%08X%s",(lane == 0) ? " " : " ",interleaved_hash[i][lane] ,hash[lane].i[i],(lane == N_LANES - 1) ? "\n" : "");
|
|
exit(1);
|
|
}
|
|
}
|
|
// measure
|
|
time_measurement();
|
|
sum = 0u;
|
|
for(n = 0;n < n_measurements;n++)
|
|
{
|
|
interleaved_data[0][0]++;
|
|
sha1_avx2((v8si *)&interleaved_data[0],(v8si *)&interleaved_hash[0]);
|
|
sum += interleaved_hash[4][0];
|
|
}
|
|
time_measurement();
|
|
if(sum == 0u)
|
|
fprintf(stderr,"sha1_avx2(): what a coincidence, sum=0\n");
|
|
hashes_per_second = (double)n_measurements * (double)N_LANES / cpu_time_delta();
|
|
// report
|
|
printf("sha1_avx2() passed (%d test%s, %.0f secure hashes per second)\n",n_tests,(n_tests == 1) ? "" : "s",hashes_per_second);
|
|
# undef N_LANES
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
//
|
|
// test the avx512f implementation
|
|
//
|
|
|
|
#if defined(__AVX512F__)
|
|
|
|
static void test_sha1_avx512f(int n_tests,int n_measurements)
|
|
{
|
|
#define N_LANES 16
|
|
static union { u08_t c[14 * 4]; u32_t i[14]; } data[N_LANES]; // the data as bytes and as 32-bit integers
|
|
static union { u08_t c[ 5 * 4]; u32_t i[ 5]; } hash[N_LANES]; // the hash as bytes and as 32-bit integers
|
|
static u32_t interleaved_data[14][N_LANES] __attribute__((aligned(64)));
|
|
static u32_t interleaved_hash[5][N_LANES] __attribute__((aligned(64)));
|
|
double hashes_per_second;
|
|
int n,i,lane;
|
|
u32_t sum;
|
|
|
|
// test
|
|
for(n = 0;n < n_tests;n++)
|
|
{
|
|
// the data and the secure hash for the reference implementation
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
{
|
|
// create random data (55 bytes)
|
|
for(i = 0;i < 55;i++)
|
|
data[lane].c[i ^ 3] = random_byte();
|
|
// append padding (a SHA1 thing...)
|
|
data[lane].c[55 ^ 3] = 0x80;
|
|
// compute its SHA1 secure hash
|
|
sha1(&data[lane].i[0],&hash[lane].i[0]);
|
|
}
|
|
// interleave (transpose) the data for the avx512f implementation
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
for(i = 0;i < 14;i++)
|
|
interleaved_data[i][lane] = data[lane].i[i];
|
|
// compute the sixteen secure hashes in one go
|
|
sha1_avx512f((v16si *)&interleaved_data[0],(v16si *)&interleaved_hash[0]);
|
|
// test
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
for(i = 0;i < 5;i++)
|
|
if(interleaved_hash[i][lane] != hash[lane].i[i])
|
|
{
|
|
fprintf(stderr,"sha1_avx512f() failure for n=%d (bad/good):\n",n);
|
|
for(i = 0;i < 5;i++)
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
fprintf(stderr,"%s%08X/%08X%s",(lane == 0) ? " " : " ",interleaved_hash[i][lane] ,hash[lane].i[i],(lane == N_LANES - 1) ? "\n" : "");
|
|
exit(1);
|
|
}
|
|
}
|
|
// measure
|
|
time_measurement();
|
|
sum = 0u;
|
|
for(n = 0;n < n_measurements;n++)
|
|
{
|
|
interleaved_data[0][0]++;
|
|
sha1_avx512f((v16si *)&interleaved_data[0],(v16si *)&interleaved_hash[0]);
|
|
sum += interleaved_hash[4][0];
|
|
}
|
|
time_measurement();
|
|
if(sum == 0u)
|
|
fprintf(stderr,"sha1_avx512f(): what a coincidence, sum=0\n");
|
|
hashes_per_second = (double)n_measurements * (double)N_LANES / cpu_time_delta();
|
|
// report
|
|
printf("sha1_avx512f() passed (%d test%s, %.0f secure hashes per second)\n",n_tests,(n_tests == 1) ? "" : "s",hashes_per_second);
|
|
# undef N_LANES
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
//
|
|
// test the neon implementation
|
|
//
|
|
|
|
#if defined(__ARM_NEON)
|
|
|
|
static void test_sha1_neon(int n_tests,int n_measurements)
|
|
{
|
|
#define N_LANES 4
|
|
static union { u08_t c[14 * 4]; u32_t i[14]; } data[N_LANES]; // the data as bytes and as 32-bit integers
|
|
static union { u08_t c[ 5 * 4]; u32_t i[ 5]; } hash[N_LANES]; // the hash as bytes and as 32-bit integers
|
|
static u32_t interleaved_data[14][N_LANES] __attribute__((aligned(16)));
|
|
static u32_t interleaved_hash[5][N_LANES] __attribute__((aligned(16)));
|
|
double hashes_per_second;
|
|
int n,i,lane;
|
|
u32_t sum;
|
|
|
|
// test
|
|
for(n = 0;n < n_tests;n++)
|
|
{
|
|
// the data and the secure hash for the reference implementation
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
{
|
|
// create random data (55 bytes)
|
|
for(i = 0;i < 55;i++)
|
|
data[lane].c[i ^ 3] = random_byte();
|
|
// append padding (a SHA1 thing...)
|
|
data[lane].c[55 ^ 3] = 0x80;
|
|
// compute its SHA1 secure hash
|
|
sha1(&data[lane].i[0],&hash[lane].i[0]);
|
|
}
|
|
// interleave (transpose) the data for the neon implementation
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
for(i = 0;i < 14;i++)
|
|
interleaved_data[i][lane] = data[lane].i[i];
|
|
// compute the four secure hashes in one go
|
|
sha1_neon((uint32x4_t *)&interleaved_data[0],(uint32x4_t *)&interleaved_hash[0]);
|
|
// test
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
for(i = 0;i < 5;i++)
|
|
if(interleaved_hash[i][lane] != hash[lane].i[i])
|
|
{
|
|
fprintf(stderr,"sha1_neon() failure for n=%d (bad/good):\n",n);
|
|
for(i = 0;i < 5;i++)
|
|
for(lane = 0;lane < N_LANES;lane++)
|
|
fprintf(stderr,"%s%08X/%08X%s",(lane == 0) ? " " : " ",interleaved_hash[i][lane] ,hash[lane].i[i],(lane == N_LANES - 1) ? "\n" : "");
|
|
exit(1);
|
|
}
|
|
}
|
|
// measure
|
|
time_measurement();
|
|
sum = 0u;
|
|
for(n = 0;n < n_measurements;n++)
|
|
{
|
|
interleaved_data[0][0]++;
|
|
sha1_neon((uint32x4_t *)&interleaved_data[0],(uint32x4_t *)&interleaved_hash[0]);
|
|
sum += interleaved_hash[4][0];
|
|
}
|
|
time_measurement();
|
|
if(sum == 0u)
|
|
fprintf(stderr,"sha1_neon(): what a coincidence, sum=0\n");
|
|
hashes_per_second = (double)n_measurements * (double)N_LANES / cpu_time_delta();
|
|
// report
|
|
printf("sha1_neon() passed (%d test%s, %.0f secure hashes per second)\n",n_tests,(n_tests == 1) ? "" : "s",hashes_per_second);
|
|
# undef N_LANES
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
//
|
|
// main program
|
|
//
|
|
|
|
//
// main() --- run the reference test and every SIMD test enabled at compile time
//
// Each test function receives the same number of correctness tests and of timed calls.
// Always returns 0; the test functions call exit(1) themselves when they fail.
//

int main(void)
{
  const int correctness_rounds = 1000;  // messages (or rounds of messages) checked by each test
  const int timed_calls = 10000000;     // hash function calls in each benchmark loop

  test_sha1(correctness_rounds,timed_calls);
#if defined(__AVX__)
  test_sha1_avx(correctness_rounds,timed_calls);
#endif
#if defined(__AVX2__)
  test_sha1_avx2(correctness_rounds,timed_calls);
#endif
#if defined(__AVX512F__)
  test_sha1_avx512f(correctness_rounds,timed_calls);
#endif
#if defined(__ARM_NEON)
  test_sha1_neon(correctness_rounds,timed_calls);
#endif
  return 0;
}
|