#include #include #include #include #include #include #include #include void __check(cudnnStatus_t status, const char* const file, int const line) { if (status != CUDNN_STATUS_SUCCESS) { fprintf(stderr, "CUDNN error at %s:%d, code=%d(%s)\n", file, line, static_cast(status), cudnnGetErrorString(status)); exit(1); } } #define checkCudnnError(val) __check(val, __FILE__, __LINE__) void init_data(double *x, int size) { for (int i = 0; i < size; i++) { x[i] = drand48(); } } void print_array(double* output, int size1, int size2) { for(int i = 0; i < size1; i++) { for (int j = 0; j < size2; j++) fprintf(stdout," %lf", output[i*size2+j]); fprintf(stdout, "\n"); } } int constexpr IMAGE_BATCH = 4; int constexpr IMAGE_HEIGHT = 28; int constexpr IMAGE_WIDTH = 28; int constexpr FILTER_HEIGHT = 5; int constexpr FILTER_WIDTH = 5; int constexpr FILTER_BATCH = 2; int constexpr INPUT_CHANNEL = 3; int constexpr PAD_HEIGHT = 2; int constexpr PAD_WIDTH = 2; int constexpr NDIM = 4; int constexpr OUTPUT_HEIGHT = 1 + IMAGE_HEIGHT + 2*PAD_HEIGHT - FILTER_HEIGHT; int constexpr OUTPUT_WIDTH = 1 + IMAGE_WIDTH + 2*PAD_WIDTH - FILTER_WIDTH; int constexpr OUTPUT_BATCH = IMAGE_BATCH; int constexpr OUTPUT_CHANNEL = FILTER_BATCH; void reference(double *X_, double *H_, double *G_) { double(*X)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH] = (double(*)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH])(X_); double(*H)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH] = (double(*)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH])(H_); double(*G)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH] = (double(*)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH])(G_); int constexpr j = 0; for (int i = 0; i < OUTPUT_BATCH; ++i) { for (int j = 0; j < OUTPUT_CHANNEL; ++j) { for (int k = 0; k < OUTPUT_HEIGHT; ++k) { for (int l = 0; l < OUTPUT_WIDTH; ++l) { double Gi = 0.0; for (int m = 0; m < FILTER_HEIGHT; ++m) { for (int n = 0; n < FILTER_WIDTH; ++n) { for (int c = 0; c < INPUT_CHANNEL; ++c) { double Xi = 0.0; if (k + m >= PAD_HEIGHT && k + m < IMAGE_HEIGHT + PAD_HEIGHT && l + n >= PAD_WIDTH && l + n < IMAGE_WIDTH + PAD_WIDTH) { Xi = X[i][c][k - PAD_HEIGHT + m][l - PAD_WIDTH + n]; } Gi += Xi * H[j][c][m][n]; } } } G[i][j][k][l] = Gi; } } } } } int main(int argc, char *argv[]) { checkCudaErrors(cuInit(0)); // init device int numDevices; checkCudaErrors(cuDeviceGetCount(&numDevices)); if (numDevices == 0) { fprintf(stderr, "No CUDA devices found\n"); exit(1); } int deviceNumber = 0; if (const char* deviceNumEnv = std::getenv("NVJIT_GPU_DEVICE")) { deviceNumber = atoi(deviceNumEnv); } CUdevice device; checkCudaErrors(cuDeviceGet(&device, deviceNumber)); // create context CUcontext context; checkCudaErrors(cuCtxCreate(&context, 0, device)); // Init cudnnHandle // ----------------- cudnnHandle_t handle; checkCudnnError(cudnnCreate(&handle)); // Init cudnn tensor // ------------------ cudnnTensorDescriptor_t inputTensor; checkCudnnError(cudnnCreateTensorDescriptor(&inputTensor)); // Set tensor properties // ---------------------- int dimA[NDIM] = {IMAGE_BATCH, INPUT_CHANNEL, IMAGE_HEIGHT, IMAGE_WIDTH}; int strideA[NDIM] = {IMAGE_HEIGHT * IMAGE_WIDTH * INPUT_CHANNEL, IMAGE_HEIGHT * IMAGE_WIDTH, IMAGE_WIDTH, 1}; checkCudnnError(cudnnSetTensorNdDescriptor(inputTensor, CUDNN_DATA_DOUBLE, NDIM, dimA, strideA)); // Init filter // ------------ cudnnFilterDescriptor_t filter; checkCudnnError(cudnnCreateFilterDescriptor(&filter)); // Set filter properties // ---------------------- int filterDim[NDIM] = {FILTER_BATCH, INPUT_CHANNEL, FILTER_HEIGHT, FILTER_WIDTH}; checkCudnnError(cudnnSetFilterNdDescriptor( filter, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, NDIM, filterDim)); // Init convolution // ----------------- cudnnConvolutionDescriptor_t conv; checkCudnnError(cudnnCreateConvolutionDescriptor(&conv)); // Set convolution properties // --------------------------- int padA[2] = {PAD_HEIGHT, PAD_WIDTH}; int filterStride[2] = {1, 1}; int upscale[2] = {1, 1}; checkCudnnError(cudnnSetConvolutionNdDescriptor( conv, 2, padA, filterStride, upscale, CUDNN_CROSS_CORRELATION, CUDNN_DATA_DOUBLE)); // Get the output size // ------------------- int outputDim[NDIM]; checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(conv, inputTensor, filter, 4, outputDim)); fprintf(stdout, "Output Dimensions: [%d, %d, %d, %d]\n", outputDim[0], outputDim[1], outputDim[2], outputDim[3]); assert(outputDim[0] == OUTPUT_BATCH); assert(outputDim[1] == OUTPUT_CHANNEL); assert(outputDim[2] == OUTPUT_HEIGHT); assert(outputDim[3] == OUTPUT_WIDTH); // Init Output Tensor // ------------------ cudnnTensorDescriptor_t outputTensor; checkCudnnError(cudnnCreateTensorDescriptor(&outputTensor)); // Set output tensor properties // ----------------------------- int outputStride[NDIM]; std::memcpy(outputStride, outputDim+1, sizeof(int)*NDIM); outputStride[NDIM-1] = 1; for (int k = NDIM-2; k >= 0; --k) outputStride[k] = outputStride[k+1]*outputDim[k+1]; checkCudnnError(cudnnSetTensorNdDescriptor(outputTensor, CUDNN_DATA_DOUBLE, NDIM, outputDim, outputStride)); // Get prefered algorithm // ------------------------ cudnnConvolutionFwdAlgo_t algo; checkCudnnError(cudnnGetConvolutionForwardAlgorithm( handle, inputTensor, filter, conv, outputTensor, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo)); fprintf(stdout, "Algorithm : %d\n", algo); // Get the workspace size // ----------------------- size_t workspace_size = 0; cudnnGetConvolutionForwardWorkspaceSize(handle, inputTensor, filter, conv, outputTensor, algo, &workspace_size); fprintf(stdout, "Workspace Size : %zu\n", workspace_size); ////////////// allocate device data // convolution data // ----------------- double *data_X, *data_H, *data_G, *data_workspace{nullptr}; size_t image_size = IMAGE_WIDTH * IMAGE_HEIGHT * IMAGE_BATCH * INPUT_CHANNEL; size_t filter_size = FILTER_WIDTH * FILTER_HEIGHT * FILTER_BATCH * INPUT_CHANNEL; size_t output_size = OUTPUT_WIDTH * OUTPUT_HEIGHT * OUTPUT_BATCH * OUTPUT_CHANNEL; checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_X, image_size * sizeof(double), CU_MEM_ATTACH_GLOBAL)); checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_H, filter_size * sizeof(double), CU_MEM_ATTACH_GLOBAL)); checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_G, output_size * sizeof(double), CU_MEM_ATTACH_GLOBAL)); checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_workspace, (workspace_size + 1) * sizeof(double), CU_MEM_ATTACH_GLOBAL)); // init data // ---------- init_data(data_X, image_size); init_data(data_H, filter_size); // apply the convolution // ----------------------- double alpha = 1.0; double beta = 0.0; checkCudnnError(cudnnConvolutionForward( handle, &alpha, inputTensor, data_X, filter, data_H, conv, algo, data_workspace, workspace_size, &beta, outputTensor, data_G)); checkCudaErrors(cuCtxSynchronize()); std::vector ref_G(output_size); reference(data_X, data_H, ref_G.data()); for (int i = 0; i < outputDim[0]; i++) { for (int j = 0; j < outputDim[1]; j++) { for (int k = 0; k < outputDim[2]; k++) { for (int l = 0; l < outputDim[3]; l++) { size_t idx = ((i * outputDim[1] + j) * outputDim[2] + k) * outputDim[3] + l; double err2 = std::abs(data_G[idx] - ref_G[idx]); if (err2 > 1e-6) { fprintf(stderr, "Error at Location [%d, %d %d, %d], Expected : %lf" ",Got : %lf, Error : %e\n", i, j, k, l, ref_G[idx], data_G[idx], err2); exit(1); } } } } } // free memory // ------------- checkCudaErrors(cuMemFree((CUdeviceptr)data_G)); checkCudaErrors(cuMemFree((CUdeviceptr)data_X)); checkCudaErrors(cuMemFree((CUdeviceptr)data_H)); checkCudaErrors(cuMemFree((CUdeviceptr)data_workspace)); // destroy cudnnHandle // ------------------- checkCudnnError(cudnnDestroy(handle)); return 0; }