convolution example: double-precision NCHW forward cross-correlation with cuDNN, verified against a naive CPU reference (uses helper_cuda_drvapi.h from the CUDA samples for checkCudaErrors)
#include <cuda.h>
#include <cudnn.h>
#include <helper_cuda_drvapi.h>

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
void __check(cudnnStatus_t status, const char* const file, int const line)
{
  if (status != CUDNN_STATUS_SUCCESS) {
    fprintf(stderr, "CUDNN error at %s:%d, code=%d(%s)\n",
            file, line, static_cast<int>(status),
            cudnnGetErrorString(status));
    exit(1);
  }
}
#define checkCudnnError(val) __check(val, __FILE__, __LINE__)
void init_data(double *x, int size) {
  for (int i = 0; i < size; i++) {
    x[i] = drand48();
  }
}

void print_array(double* output, int size1, int size2)
{
  for (int i = 0; i < size1; i++) {
    for (int j = 0; j < size2; j++)
      fprintf(stdout, " %lf", output[i*size2+j]);
    fprintf(stdout, "\n");
  }
}
int constexpr IMAGE_BATCH = 4;
int constexpr IMAGE_HEIGHT = 28;
int constexpr IMAGE_WIDTH = 28;
int constexpr FILTER_HEIGHT = 5;
int constexpr FILTER_WIDTH = 5;
int constexpr FILTER_BATCH = 2;
int constexpr INPUT_CHANNEL = 3;
int constexpr PAD_HEIGHT = 2;
int constexpr PAD_WIDTH = 2;
int constexpr NDIM = 4;

// With unit stride the output size is 1 + in + 2*pad - filter,
// i.e. 1 + 28 + 4 - 5 = 28, so the padding keeps the spatial size unchanged.
int constexpr OUTPUT_HEIGHT = 1 + IMAGE_HEIGHT + 2*PAD_HEIGHT - FILTER_HEIGHT;
int constexpr OUTPUT_WIDTH = 1 + IMAGE_WIDTH + 2*PAD_WIDTH - FILTER_WIDTH;
int constexpr OUTPUT_BATCH = IMAGE_BATCH;
int constexpr OUTPUT_CHANNEL = FILTER_BATCH;
// Naive CPU cross-correlation with zero padding, used to check the cuDNN result.
void reference(double *X_, double *H_, double *G_) {
  double(*X)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH] =
      (double(*)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH])(X_);
  double(*H)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH] =
      (double(*)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH])(H_);
  double(*G)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH] =
      (double(*)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH])(G_);
  for (int i = 0; i < OUTPUT_BATCH; ++i) {
    for (int j = 0; j < OUTPUT_CHANNEL; ++j) {
      for (int k = 0; k < OUTPUT_HEIGHT; ++k) {
        for (int l = 0; l < OUTPUT_WIDTH; ++l) {
          double Gi = 0.0;
          for (int m = 0; m < FILTER_HEIGHT; ++m) {
            for (int n = 0; n < FILTER_WIDTH; ++n) {
              for (int c = 0; c < INPUT_CHANNEL; ++c) {
                // Input pixels that fall inside the zero padding contribute 0.
                double Xi = 0.0;
                if (k + m >= PAD_HEIGHT && k + m < IMAGE_HEIGHT + PAD_HEIGHT &&
                    l + n >= PAD_WIDTH && l + n < IMAGE_WIDTH + PAD_WIDTH) {
                  Xi = X[i][c][k - PAD_HEIGHT + m][l - PAD_WIDTH + n];
                }
                Gi += Xi * H[j][c][m][n];
              }
            }
          }
          G[i][j][k][l] = Gi;
        }
      }
    }
  }
}
int main(int argc, char *argv[]) {
  checkCudaErrors(cuInit(0));

  // init device
  int numDevices;
  checkCudaErrors(cuDeviceGetCount(&numDevices));
  if (numDevices == 0) {
    fprintf(stderr, "No CUDA devices found\n");
    exit(1);
  }
  int deviceNumber = 0;
  if (const char* deviceNumEnv = std::getenv("NVJIT_GPU_DEVICE")) {
    deviceNumber = atoi(deviceNumEnv);
  }
  CUdevice device;
  checkCudaErrors(cuDeviceGet(&device, deviceNumber));

  // create context
  CUcontext context;
  checkCudaErrors(cuCtxCreate(&context, 0, device));

  // Init cudnnHandle
  // -----------------
  cudnnHandle_t handle;
  checkCudnnError(cudnnCreate(&handle));
  // Init cudnn tensor
  // ------------------
  cudnnTensorDescriptor_t inputTensor;
  checkCudnnError(cudnnCreateTensorDescriptor(&inputTensor));

  // Set tensor properties (NCHW layout, fully packed strides)
  // ----------------------
  int dimA[NDIM] = {IMAGE_BATCH, INPUT_CHANNEL, IMAGE_HEIGHT, IMAGE_WIDTH};
  int strideA[NDIM] = {IMAGE_HEIGHT * IMAGE_WIDTH * INPUT_CHANNEL,
                       IMAGE_HEIGHT * IMAGE_WIDTH, IMAGE_WIDTH, 1};
  checkCudnnError(cudnnSetTensorNdDescriptor(inputTensor, CUDNN_DATA_DOUBLE,
                                             NDIM, dimA, strideA));

  // Init filter
  // ------------
  cudnnFilterDescriptor_t filter;
  checkCudnnError(cudnnCreateFilterDescriptor(&filter));

  // Set filter properties
  // ----------------------
  int filterDim[NDIM] = {FILTER_BATCH, INPUT_CHANNEL, FILTER_HEIGHT, FILTER_WIDTH};
  checkCudnnError(cudnnSetFilterNdDescriptor(
      filter, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, NDIM, filterDim));
  // Init convolution
  // -----------------
  cudnnConvolutionDescriptor_t conv;
  checkCudnnError(cudnnCreateConvolutionDescriptor(&conv));

  // Set convolution properties
  // ---------------------------
  int padA[2] = {PAD_HEIGHT, PAD_WIDTH};
  int filterStride[2] = {1, 1};
  int dilation[2] = {1, 1};
  checkCudnnError(cudnnSetConvolutionNdDescriptor(
      conv, 2, padA, filterStride, dilation, CUDNN_CROSS_CORRELATION,
      CUDNN_DATA_DOUBLE));

  // Get the output size
  // -------------------
  int outputDim[NDIM];
  checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(conv, inputTensor,
                                                        filter, NDIM, outputDim));
  fprintf(stdout, "Output Dimensions: [%d, %d, %d, %d]\n", outputDim[0],
          outputDim[1], outputDim[2], outputDim[3]);
  assert(outputDim[0] == OUTPUT_BATCH);
  assert(outputDim[1] == OUTPUT_CHANNEL);
  assert(outputDim[2] == OUTPUT_HEIGHT);
  assert(outputDim[3] == OUTPUT_WIDTH);
  // Init Output Tensor
  // ------------------
  cudnnTensorDescriptor_t outputTensor;
  checkCudnnError(cudnnCreateTensorDescriptor(&outputTensor));

  // Set output tensor properties: fully packed strides derived from the dims
  // -----------------------------
  int outputStride[NDIM];
  outputStride[NDIM-1] = 1;
  for (int k = NDIM-2; k >= 0; --k)
    outputStride[k] = outputStride[k+1]*outputDim[k+1];
  checkCudnnError(cudnnSetTensorNdDescriptor(outputTensor, CUDNN_DATA_DOUBLE,
                                             NDIM, outputDim, outputStride));
  // Get preferred algorithm
  // ------------------------
  cudnnConvolutionFwdAlgo_t algo;
  checkCudnnError(cudnnGetConvolutionForwardAlgorithm(
      handle, inputTensor, filter, conv, outputTensor,
      CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));
  fprintf(stdout, "Algorithm : %d\n", static_cast<int>(algo));

  // Get the workspace size (in bytes)
  // -----------------------
  size_t workspace_size = 0;
  checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
      handle, inputTensor, filter, conv, outputTensor, algo, &workspace_size));
  fprintf(stdout, "Workspace Size : %zu\n", workspace_size);
  ////////////// allocate device data
  // convolution data
  // -----------------
  double *data_X, *data_H, *data_G;
  void *data_workspace{nullptr};
  size_t image_size = IMAGE_WIDTH * IMAGE_HEIGHT * IMAGE_BATCH * INPUT_CHANNEL;
  size_t filter_size = FILTER_WIDTH * FILTER_HEIGHT * FILTER_BATCH * INPUT_CHANNEL;
  size_t output_size = OUTPUT_WIDTH * OUTPUT_HEIGHT * OUTPUT_BATCH * OUTPUT_CHANNEL;
  checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_X,
                                    image_size * sizeof(double),
                                    CU_MEM_ATTACH_GLOBAL));
  checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_H,
                                    filter_size * sizeof(double),
                                    CU_MEM_ATTACH_GLOBAL));
  checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_G,
                                    output_size * sizeof(double),
                                    CU_MEM_ATTACH_GLOBAL));
  // workspace_size is already in bytes; allocate at least one byte so the
  // call succeeds even when no workspace is required
  checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_workspace,
                                    workspace_size > 0 ? workspace_size : 1,
                                    CU_MEM_ATTACH_GLOBAL));
  // init data
  // ----------
  init_data(data_X, image_size);
  init_data(data_H, filter_size);

  // apply the convolution
  // -----------------------
  double alpha = 1.0;
  double beta = 0.0;
  checkCudnnError(cudnnConvolutionForward(
      handle, &alpha, inputTensor, data_X, filter, data_H, conv, algo,
      data_workspace, workspace_size, &beta, outputTensor, data_G));
  checkCudaErrors(cuCtxSynchronize());

  // compare against the CPU reference
  std::vector<double> ref_G(output_size);
  reference(data_X, data_H, ref_G.data());
  for (int i = 0; i < outputDim[0]; i++) {
    for (int j = 0; j < outputDim[1]; j++) {
      for (int k = 0; k < outputDim[2]; k++) {
        for (int l = 0; l < outputDim[3]; l++) {
          size_t idx =
              ((i * outputDim[1] + j) * outputDim[2] + k) * outputDim[3] + l;
          double err = std::abs(data_G[idx] - ref_G[idx]);
          if (err > 1e-6) {
            fprintf(stderr,
                    "Error at Location [%d, %d, %d, %d], Expected : %lf"
                    ", Got : %lf, Error : %e\n",
                    i, j, k, l, ref_G[idx], data_G[idx], err);
            exit(1);
          }
        }
      }
    }
  }
  fprintf(stdout, "Results match the CPU reference\n");

  // free memory
  // -------------
  checkCudaErrors(cuMemFree((CUdeviceptr)data_G));
  checkCudaErrors(cuMemFree((CUdeviceptr)data_X));
  checkCudaErrors(cuMemFree((CUdeviceptr)data_H));
  checkCudaErrors(cuMemFree((CUdeviceptr)data_workspace));

  // destroy descriptors, cudnnHandle and the context
  // ------------------------------------------------
  checkCudnnError(cudnnDestroyTensorDescriptor(inputTensor));
  checkCudnnError(cudnnDestroyTensorDescriptor(outputTensor));
  checkCudnnError(cudnnDestroyFilterDescriptor(filter));
  checkCudnnError(cudnnDestroyConvolutionDescriptor(conv));
  checkCudnnError(cudnnDestroy(handle));
  checkCudaErrors(cuCtxDestroy(context));
  return 0;
}
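
A note on the algorithm query: cudnnGetConvolutionForwardAlgorithm with CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, used in the listing above, was deprecated and removed in cuDNN 8. Below is a minimal sketch (not part of the original gist, assuming cuDNN 8+) of how the same step could look with the heuristic query cudnnGetConvolutionForwardAlgorithm_v7, reusing the handle and descriptors created above; the workspace query and the cudnnConvolutionForward call itself stay unchanged.

  // Sketch only: cuDNN 8+ algorithm selection via the heuristic query.
  // Assumes handle, inputTensor, filter, conv and outputTensor from the listing.
  int requested = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
  int returned = 0;
  std::vector<cudnnConvolutionFwdAlgoPerf_t> perf(requested);
  checkCudnnError(cudnnGetConvolutionForwardAlgorithm_v7(
      handle, inputTensor, filter, conv, outputTensor,
      requested, &returned, perf.data()));
  // Results are sorted by expected performance; pick the first usable entry.
  cudnnConvolutionFwdAlgo_t algo = perf[0].algo;
  for (int i = 0; i < returned; ++i) {
    if (perf[i].status == CUDNN_STATUS_SUCCESS) {
      algo = perf[i].algo;
      break;
    }
  }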