//
// Tomás Oliveira e Silva, September 2025
//
// Arquiteturas de Alto Desempenho 2025/2026
//
// CUDA driver API stuff
//

#ifndef AAD_CUDA_UTILITIES
#define AAD_CUDA_UTILITIES

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

//
// data type used to store all CUDA related stuff
//

#define MAX_N_ARGUMENTS  4

typedef struct
{
  // input data
  int device_number;           // number of the device to initialize
  char *cubin_file_name;       // name of the cubin file to load (NULL if not needed)
  char *kernel_name;           // name of the CUDA kernel to load (NULL if not needed)
  u32_t data_size[2];          // the number of bytes of the two data arrays to allocate on the host and on the device (0 if not needed)
  // persistent data
  CUdevice cu_device;          // the device handle
  char device_name[256];       // the device name
  CUcontext cu_context;        // the device context
  CUmodule cu_module;          // the loaded cubin file contents
  CUfunction cu_kernel;        // the pointer to the CUDA kernel
  CUstream cu_stream;          // the command stream
  void *host_data[2];          // the pointers to the host data
  CUdeviceptr device_data[2];  // the pointers to the device data
  // launch kernel data
  unsigned int grid_dim_x;     // the number of grid blocks (in the X dimension, the only one we will use here)
  unsigned int block_dim_x;    // the number of threads in a block (in the X dimension, the only one we will use here, should be equal to RECOMENDED_CUDA_BLOCK_SIZE)
  int n_kernel_arguments;      // number of kernel arguments
  void *arg[MAX_N_ARGUMENTS];  // pointers to the kernel argument data
}
cuda_data_t;

//
// CU_CALL --- macro that should be used to call a CUDA driver API function and to test its return value
//
// it should be used to test the return value of calls such as
//   cuInit(device_number);
//   cuDeviceGet(&cu_device,device_number);
//
// in these cases, f_name is, respectively, cuInit and cuDeviceGet, and args is, respectively,
// (device_number) and (&cu_device,device_number)
//

#define CU_CALL(f_name,args)                                                                                  \
  do                                                                                                          \
  {                                                                                                           \
    CUresult e = f_name args;                                                                                 \
    if(e != CUDA_SUCCESS)                                                                                     \
    { /* the call failed, terminate the program */                                                            \
      fprintf(stderr,"" # f_name "() returned %s (file %s, line %d)\n",cu_error_string(e),__FILE__,__LINE__); \
      exit(1);                                                                                                \
    }                                                                                                         \
  }                                                                                                           \
  while(0)

//
// terse description of the CUDA error codes (replacement of the error code number by its name)
//

static const char *cu_error_string(CUresult e)
{
  static char error_string[64];

# define CASE(error_code)  case error_code: return "" # error_code;
  switch((int)e)
  { // list of error codes extracted from cuda.h (TODO: /usr/local/cuda-10.2/targets/x86_64-linux/include/CL)
    default: sprintf(error_string,"unknown error code (%d)",(int)e); return(error_string);
    CASE(CUDA_SUCCESS                              );
    CASE(CUDA_ERROR_INVALID_VALUE                  );
    CASE(CUDA_ERROR_OUT_OF_MEMORY                  );
    CASE(CUDA_ERROR_NOT_INITIALIZED                );
    CASE(CUDA_ERROR_DEINITIALIZED                  );
    CASE(CUDA_ERROR_PROFILER_DISABLED              );
    CASE(CUDA_ERROR_PROFILER_NOT_INITIALIZED       );
    CASE(CUDA_ERROR_PROFILER_ALREADY_STARTED       );
    CASE(CUDA_ERROR_PROFILER_ALREADY_STOPPED       );
    CASE(CUDA_ERROR_NO_DEVICE                      );
    CASE(CUDA_ERROR_INVALID_DEVICE                 );
    CASE(CUDA_ERROR_INVALID_IMAGE                  );
    CASE(CUDA_ERROR_INVALID_CONTEXT                );
    CASE(CUDA_ERROR_CONTEXT_ALREADY_CURRENT        );
    CASE(CUDA_ERROR_MAP_FAILED                     );
    CASE(CUDA_ERROR_UNMAP_FAILED                   );
    CASE(CUDA_ERROR_ARRAY_IS_MAPPED                );
    CASE(CUDA_ERROR_ALREADY_MAPPED                 );
    CASE(CUDA_ERROR_NO_BINARY_FOR_GPU              );
    CASE(CUDA_ERROR_ALREADY_ACQUIRED               );
    CASE(CUDA_ERROR_NOT_MAPPED                     );
    CASE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY            );
    CASE(CUDA_ERROR_NOT_MAPPED_AS_POINTER          );
    CASE(CUDA_ERROR_ECC_UNCORRECTABLE              );
    CASE(CUDA_ERROR_UNSUPPORTED_LIMIT              );
    CASE(CUDA_ERROR_CONTEXT_ALREADY_IN_USE         );
    CASE(CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        );
    CASE(CUDA_ERROR_INVALID_PTX                    );
    CASE(CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       );
    CASE(CUDA_ERROR_NVLINK_UNCORRECTABLE           );
    CASE(CUDA_ERROR_INVALID_SOURCE                 );
    CASE(CUDA_ERROR_FILE_NOT_FOUND                 );
    CASE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND );
    CASE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      );
    CASE(CUDA_ERROR_OPERATING_SYSTEM               );
    CASE(CUDA_ERROR_INVALID_HANDLE                 );
    CASE(CUDA_ERROR_NOT_FOUND                      );
    CASE(CUDA_ERROR_NOT_READY                      );
    CASE(CUDA_ERROR_ILLEGAL_ADDRESS                );
    CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        );
    CASE(CUDA_ERROR_LAUNCH_TIMEOUT                 );
    CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  );
    CASE(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    );
    CASE(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        );
    CASE(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         );
    CASE(CUDA_ERROR_CONTEXT_IS_DESTROYED           );
    CASE(CUDA_ERROR_ASSERT                         );
    CASE(CUDA_ERROR_TOO_MANY_PEERS                 );
    CASE(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED );
    CASE(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     );
    CASE(CUDA_ERROR_HARDWARE_STACK_ERROR           );
    CASE(CUDA_ERROR_ILLEGAL_INSTRUCTION            );
    CASE(CUDA_ERROR_MISALIGNED_ADDRESS             );
    CASE(CUDA_ERROR_INVALID_ADDRESS_SPACE          );
    CASE(CUDA_ERROR_INVALID_PC                     );
    CASE(CUDA_ERROR_LAUNCH_FAILED                  );
    CASE(CUDA_ERROR_NOT_PERMITTED                  );
    CASE(CUDA_ERROR_NOT_SUPPORTED                  );
    CASE(CUDA_ERROR_UNKNOWN                        );
  }
# undef CASE
}

//
// synchronize the stream command buffer
//

static void synchronize_cuda(cuda_data_t *cd)
{
  CU_CALL( cuStreamSynchronize , (cd->cu_stream) );
}

//
// initialize the CUDA driver API interface
//
// load a single cubin file, with a single CUDA kernel
// allocate up to two storage areas both on the host and on the device
//

static void initialize_cuda(cuda_data_t *cd)
{
  //
  // initialize the driver API interface
  //
  CU_CALL( cuInit , (0) );
  //
  // open the CUDA device
  //
  CU_CALL( cuDeviceGet , (&cd->cu_device,cd->device_number) );
  //
  // get information about the CUDA device
  //
  CU_CALL( cuDeviceGetName , (cd->device_name,(int)sizeof(cd->device_name) - 1,cd->cu_device) );
  printf("initialize_cuda(): CUDA code running on a %s (device %d, CUDA %u.%u.%u)\n",cd->device_name,cd->device_number,CUDA_VERSION / 1000,(CUDA_VERSION / 10) % 100,CUDA_VERSION % 10);
  //
  // create a context
  //
  CU_CALL( cuDevicePrimaryCtxRetain , (&cd->cu_context,cd->cu_device) );
  CU_CALL( cuCtxSetCurrent , (cd->cu_context) );
  CU_CALL( cuCtxSetCacheConfig , (CU_FUNC_CACHE_PREFER_L1) );
  //
  // load precompiled modules
  //
  CU_CALL( cuModuleLoad , (&cd->cu_module,cd->cubin_file_name) );
  //
  // get the kernel function pointers
  //
  CU_CALL( cuModuleGetFunction , (&cd->cu_kernel,cd->cu_module,cd->kernel_name) );
  //
  // create a command stream (we could have used the default stream)
  //
  CU_CALL( cuStreamCreate , (&cd->cu_stream,CU_STREAM_NON_BLOCKING) );
  //
  // allocate host and device memory
  //
  for(int i = 0;i < 2;i++)
    if(cd->data_size[i] > 0u)
    {
      CU_CALL( cuMemAllocHost , ((void **)&cd->host_data[i],(size_t)cd->data_size[i]) );
      CU_CALL( cuMemAlloc , (&cd->device_data[i],(size_t)cd->data_size[i]) );
    }
    else
      cd->host_data[i] = NULL;
  //
  // catch any lingering errors
  //
  synchronize_cuda(cd);
}

//
// terminate the CUDA driver API interface
//

static void terminate_cuda(cuda_data_t *cd)
{
  CU_CALL( cuStreamDestroy , (cd->cu_stream) );
  for(int i = 0;i < 2;i++)
    if(cd->data_size[i] > 0u)
    {
      CU_CALL( cuMemFreeHost , (cd->host_data[i]) );
      CU_CALL( cuMemFree , (cd->device_data[i]) );
    }
  CU_CALL( cuModuleUnload , (cd->cu_module) );
  CU_CALL( cuDevicePrimaryCtxRelease , (cd->cu_device) );
}
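
//
// example of a direct use of the CU_CALL macro for a driver API call not wrapped by the functions above
// (a minimal sketch; the helper name is arbitrary, and it should only be called after initialize_cuda)
//

static int cuda_multiprocessor_count(cuda_data_t *cd)
{
  int n_multiprocessors;

  CU_CALL( cuDeviceGetAttribute , (&n_multiprocessors,CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,cd->cu_device) );
  return n_multiprocessors;
}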

//
// copy data from the host to the device and from the device to the host
//

static void host_to_device_copy(cuda_data_t *cd,int idx)
{
  if(idx < 0 || idx > 1 || cd->data_size[idx] == 0u)
  {
    fprintf(stderr,"host_to_device_copy(): bad idx\n");
    exit(1);
  }
  CU_CALL( cuMemcpyHtoD , (cd->device_data[idx],(void *)cd->host_data[idx],(size_t)cd->data_size[idx]) );
  synchronize_cuda(cd);
}

static void device_to_host_copy(cuda_data_t *cd,int idx)
{
  if(idx < 0 || idx > 1 || cd->data_size[idx] == 0u)
  {
    fprintf(stderr,"device_to_host_copy(): bad idx\n");
    exit(1);
  }
  CU_CALL( cuMemcpyDtoH , ((void *)cd->host_data[idx],cd->device_data[idx],(size_t)cd->data_size[idx]) );
  synchronize_cuda(cd);
}

//
// launch a CUDA kernel (with 0 bytes of shared memory and no extra options)
//

static void lauch_kernel(cuda_data_t *cd)
{
  if(cd->block_dim_x != (unsigned int)RECOMENDED_CUDA_BLOCK_SIZE)
    fprintf(stderr,"lauch_kernel(): block_dim_x should be equal to %d\n",RECOMENDED_CUDA_BLOCK_SIZE);
  CU_CALL( cuLaunchKernel , (cd->cu_kernel,cd->grid_dim_x,1u,1u,cd->block_dim_x,1u,1u,0u,cd->cu_stream,&cd->arg[0],NULL) );
  synchronize_cuda(cd);
}

#endif
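
//
// usage sketch (compiled only when AAD_CUDA_UTILITIES_EXAMPLE is defined): how the functions above are
// meant to be chained together; the cubin file name, the kernel name, and the kernel signature (a device
// pointer followed by the number of elements) are hypothetical placeholders, and RECOMENDED_CUDA_BLOCK_SIZE
// is assumed to be defined elsewhere in the course code
//

#ifdef AAD_CUDA_UTILITIES_EXAMPLE
int main(void)
{
  cuda_data_t cd;
  u32_t n = 1024u;

  // describe what has to be initialized (device 0, one array of n floats, no second array)
  cd.device_number = 0;
  cd.cubin_file_name = "example_kernel.cubin";  // hypothetical cubin file name
  cd.kernel_name = "example_kernel";            // hypothetical kernel name
  cd.data_size[0] = n * (u32_t)sizeof(float);
  cd.data_size[1] = 0u;
  initialize_cuda(&cd);
  // fill the host data and send it to the device
  for(u32_t i = 0u;i < n;i++)
    ((float *)cd.host_data[0])[i] = (float)i;
  host_to_device_copy(&cd,0);
  // launch the kernel, one thread per array element
  cd.block_dim_x = (unsigned int)RECOMENDED_CUDA_BLOCK_SIZE;
  cd.grid_dim_x = ((unsigned int)n + cd.block_dim_x - 1u) / cd.block_dim_x;
  cd.n_kernel_arguments = 2;
  cd.arg[0] = &cd.device_data[0];
  cd.arg[1] = &n;
  lauch_kernel(&cd);
  // get the results back and clean up
  device_to_host_copy(&cd,0);
  terminate_cuda(&cd);
  return 0;
}
#endif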