Skip to content

Instantly share code, notes, and snippets.

@3gx
Created May 16, 2017 01:05
Show Gist options
  • Select an option

  • Save 3gx/1b90f7bbcf702c60d3076fc7dae62348 to your computer and use it in GitHub Desktop.

Select an option

Save 3gx/1b90f7bbcf702c60d3076fc7dae62348 to your computer and use it in GitHub Desktop.

Revisions

  1. Evghenii Gaburov created this gist May 16, 2017.
    269 changes: 269 additions & 0 deletions conv.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,269 @@
    #include <cuda.h>
    #include <cudnn.h>
    #include <helper_cuda_drvapi.h>
    #include <cstdlib>
    #include <cstring>
    #include <cmath>
    #include <cassert>
    #include <vector>

    void __check(cudnnStatus_t status, const char* const file, int const line)
    {
    if (status != CUDNN_STATUS_SUCCESS) {
    fprintf(stderr, "CUDNN error at %s:%d, code=%d(%s)\n",
    file, line, static_cast<unsigned int>(status),
    cudnnGetErrorString(status));
    exit(1);
    }
    }

    #define checkCudnnError(val) __check(val, __FILE__, __LINE__)

    // Fill x[0 .. size) with pseudo-random doubles produced by drand48()
    // (uniform in [0, 1)).  A size of 0 is a no-op.
    void init_data(double *x, int size) {
        double *const end = x + size;
        for (double *p = x; p != end; ++p)
            *p = drand48();
    }

    // Print a size1 x size2 row-major matrix to stdout, one row per line,
    // each value preceded by a single space.
    void print_array(double* output, int size1, int size2)
    {
        for (int row = 0; row < size1; ++row) {
            double* const rowPtr = output + row * size2;
            for (int col = 0; col < size2; ++col)
                fprintf(stdout," %lf", rowPtr[col]);
            fprintf(stdout, "\n");
        }
    }

    // Input tensor (NCHW): 4 images, 3 channels, 28x28.
    int constexpr IMAGE_BATCH = 4;
    int constexpr IMAGE_HEIGHT = 28;
    int constexpr IMAGE_WIDTH = 28;

    // Filter bank (KCRS): 2 output feature maps, 5x5 kernels over 3 input channels.
    int constexpr FILTER_HEIGHT = 5;
    int constexpr FILTER_WIDTH = 5;
    int constexpr FILTER_BATCH = 2;
    int constexpr INPUT_CHANNEL = 3;

    // Zero-padding of 2 on each side; with a 5x5 filter this keeps the
    // spatial output size equal to the input ("same" convolution).
    int constexpr PAD_HEIGHT = 2;
    int constexpr PAD_WIDTH = 2;

    // All tensors in this program are 4-dimensional.
    int constexpr NDIM = 4;

    // Standard convolution output-size formula (stride 1, dilation 1):
    // out = in + 2*pad - filter + 1.  With the values above this is 28x28.
    int constexpr OUTPUT_HEIGHT = 1 + IMAGE_HEIGHT + 2*PAD_HEIGHT - FILTER_HEIGHT;
    int constexpr OUTPUT_WIDTH = 1 + IMAGE_WIDTH + 2*PAD_WIDTH - FILTER_WIDTH;
    int constexpr OUTPUT_BATCH = IMAGE_BATCH;
    int constexpr OUTPUT_CHANNEL = FILTER_BATCH;

    // CPU reference for the cuDNN forward pass configured in main():
    // a cross-correlation (CUDNN_CROSS_CORRELATION, i.e. no filter flip)
    // with zero padding, stride 1, over NCHW-layout buffers.
    //
    // X_: input  images,  IMAGE_BATCH  x INPUT_CHANNEL  x IMAGE_HEIGHT  x IMAGE_WIDTH
    // H_: filters,        FILTER_BATCH x INPUT_CHANNEL  x FILTER_HEIGHT x FILTER_WIDTH
    // G_: output (written), OUTPUT_BATCH x OUTPUT_CHANNEL x OUTPUT_HEIGHT x OUTPUT_WIDTH
    //
    // Fix vs. original: removed a dead `int constexpr j = 0;` that was
    // immediately shadowed by the `j` loop variable below.
    void reference(double *X_, double *H_, double *G_) {
        // View the flat buffers as 4-D arrays to keep the indexing readable.
        double(*X)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH] =
            (double(*)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH])(X_);

        double(*H)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH] =
            (double(*)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH])(H_);

        double(*G)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH] =
            (double(*)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH])(G_);

        for (int i = 0; i < OUTPUT_BATCH; ++i) {            // image in batch
            for (int j = 0; j < OUTPUT_CHANNEL; ++j) {      // output feature map
                for (int k = 0; k < OUTPUT_HEIGHT; ++k) {
                    for (int l = 0; l < OUTPUT_WIDTH; ++l) {
                        double Gi = 0.0;
                        for (int m = 0; m < FILTER_HEIGHT; ++m) {
                            for (int n = 0; n < FILTER_WIDTH; ++n) {
                                for (int c = 0; c < INPUT_CHANNEL; ++c) {
                                    // Taps that fall into the zero-padding region
                                    // contribute 0 to the sum.
                                    double Xi = 0.0;
                                    if (k + m >= PAD_HEIGHT && k + m < IMAGE_HEIGHT + PAD_HEIGHT &&
                                        l + n >= PAD_WIDTH && l + n < IMAGE_WIDTH + PAD_WIDTH) {
                                        Xi = X[i][c][k - PAD_HEIGHT + m][l - PAD_WIDTH + n];
                                    }
                                    Gi += Xi * H[j][c][m][n];
                                }
                            }
                        }
                        G[i][j][k][l] = Gi;
                    }
                }
            }
        }
    }

    // Runs one cuDNN forward cross-correlation on random data and verifies the
    // result element-wise against the CPU reference().  Exits non-zero on any
    // CUDA/cuDNN failure or numerical mismatch.
    //
    // Fixes vs. original:
    //  - removed an out-of-bounds std::memcpy into outputStride (it read one
    //    int past the end of outputDim and was fully overwritten anyway);
    //  - check the status of cudnnGetConvolutionForwardWorkspaceSize;
    //  - use NDIM instead of a literal 4 for the output-dim query;
    //  - allocate the workspace in bytes (workspace_size is already a byte
    //    count) instead of bytes * sizeof(double);
    //  - destroy the cuDNN descriptors and the CUDA context before exit.
    int main(int argc, char *argv[]) {
        checkCudaErrors(cuInit(0));

        // init device
        int numDevices;
        checkCudaErrors(cuDeviceGetCount(&numDevices));
        if (numDevices == 0) {
            fprintf(stderr, "No CUDA devices found\n");
            exit(1);
        }
        // Device index may be overridden via the NVJIT_GPU_DEVICE env var.
        int deviceNumber = 0;
        if (const char* deviceNumEnv = std::getenv("NVJIT_GPU_DEVICE")) {
            deviceNumber = atoi(deviceNumEnv);
        }
        CUdevice device;
        checkCudaErrors(cuDeviceGet(&device, deviceNumber));

        // create context
        CUcontext context;
        checkCudaErrors(cuCtxCreate(&context, 0, device));

        // Init cudnnHandle
        // -----------------
        cudnnHandle_t handle;
        checkCudnnError(cudnnCreate(&handle));

        // Init cudnn tensor
        // ------------------
        cudnnTensorDescriptor_t inputTensor;
        checkCudnnError(cudnnCreateTensorDescriptor(&inputTensor));

        // Set tensor properties (NCHW, fully-packed strides)
        // ---------------------------------------------------
        int dimA[NDIM] = {IMAGE_BATCH, INPUT_CHANNEL, IMAGE_HEIGHT, IMAGE_WIDTH};
        int strideA[NDIM] = {IMAGE_HEIGHT * IMAGE_WIDTH * INPUT_CHANNEL,
                             IMAGE_HEIGHT * IMAGE_WIDTH, IMAGE_WIDTH, 1};
        checkCudnnError(cudnnSetTensorNdDescriptor(inputTensor, CUDNN_DATA_DOUBLE,
                                                   NDIM, dimA, strideA));

        // Init filter
        // ------------
        cudnnFilterDescriptor_t filter;
        checkCudnnError(cudnnCreateFilterDescriptor(&filter));

        // Set filter properties
        // ----------------------
        int filterDim[NDIM] = {FILTER_BATCH, INPUT_CHANNEL, FILTER_HEIGHT, FILTER_WIDTH};
        checkCudnnError(cudnnSetFilterNdDescriptor(
            filter, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, NDIM, filterDim));

        // Init convolution
        // -----------------
        cudnnConvolutionDescriptor_t conv;
        checkCudnnError(cudnnCreateConvolutionDescriptor(&conv));

        // Set convolution properties: stride 1, dilation 1, cross-correlation
        // (matches the CPU reference, which does not flip the filter).
        // --------------------------------------------------------------------
        int padA[2] = {PAD_HEIGHT, PAD_WIDTH};
        int filterStride[2] = {1, 1};
        int upscale[2] = {1, 1};
        checkCudnnError(cudnnSetConvolutionNdDescriptor(
            conv, 2, padA, filterStride, upscale, CUDNN_CROSS_CORRELATION,
            CUDNN_DATA_DOUBLE));

        // Get the output size and cross-check it against the compile-time values
        // ----------------------------------------------------------------------
        int outputDim[NDIM];
        checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(conv, inputTensor,
                                                              filter, NDIM, outputDim));
        fprintf(stdout, "Output Dimensions: [%d, %d, %d, %d]\n", outputDim[0],
                outputDim[1], outputDim[2], outputDim[3]);
        assert(outputDim[0] == OUTPUT_BATCH);
        assert(outputDim[1] == OUTPUT_CHANNEL);
        assert(outputDim[2] == OUTPUT_HEIGHT);
        assert(outputDim[3] == OUTPUT_WIDTH);

        // Init Output Tensor
        // ------------------
        cudnnTensorDescriptor_t outputTensor;
        checkCudnnError(cudnnCreateTensorDescriptor(&outputTensor));

        // Set output tensor properties: fully-packed NCHW strides computed
        // right-to-left from the dimensions.
        // -----------------------------------------------------------------
        int outputStride[NDIM];
        outputStride[NDIM-1] = 1;
        for (int k = NDIM-2; k >= 0; --k)
            outputStride[k] = outputStride[k+1]*outputDim[k+1];
        checkCudnnError(cudnnSetTensorNdDescriptor(outputTensor, CUDNN_DATA_DOUBLE,
                                                   NDIM, outputDim, outputStride));

        // Get preferred algorithm
        // ------------------------
        cudnnConvolutionFwdAlgo_t algo;
        checkCudnnError(cudnnGetConvolutionForwardAlgorithm(
            handle, inputTensor, filter, conv, outputTensor,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));
        fprintf(stdout, "Algorithm : %d\n", algo);

        // Get the workspace size (reported in bytes)
        // -------------------------------------------
        size_t workspace_size = 0;
        checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
            handle, inputTensor, filter, conv, outputTensor, algo,
            &workspace_size));
        fprintf(stdout, "Workspace Size : %zu\n", workspace_size);

        ////////////// allocate device data

        // convolution data (managed memory, visible to both host and device)
        // -------------------------------------------------------------------
        double *data_X, *data_H, *data_G, *data_workspace{nullptr};
        size_t image_size = IMAGE_WIDTH * IMAGE_HEIGHT * IMAGE_BATCH * INPUT_CHANNEL;
        size_t filter_size = FILTER_WIDTH * FILTER_HEIGHT * FILTER_BATCH * INPUT_CHANNEL;
        size_t output_size = OUTPUT_WIDTH * OUTPUT_HEIGHT * OUTPUT_BATCH * OUTPUT_CHANNEL;

        checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_X,
                                          image_size * sizeof(double),
                                          CU_MEM_ATTACH_GLOBAL));
        checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_H,
                                          filter_size * sizeof(double),
                                          CU_MEM_ATTACH_GLOBAL));
        checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_G,
                                          output_size * sizeof(double),
                                          CU_MEM_ATTACH_GLOBAL));
        // workspace_size is already a byte count; +1 keeps the allocation
        // non-zero (cuMemAllocManaged rejects a size of 0).
        checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_workspace,
                                          workspace_size + 1,
                                          CU_MEM_ATTACH_GLOBAL));

        // init data
        // ----------
        init_data(data_X, image_size);
        init_data(data_H, filter_size);

        // apply the convolution: G = 1.0 * conv(X, H) + 0.0 * G
        // ------------------------------------------------------
        double alpha = 1.0;
        double beta = 0.0;
        checkCudnnError(cudnnConvolutionForward(
            handle, &alpha, inputTensor, data_X, filter, data_H, conv, algo,
            data_workspace, workspace_size, &beta, outputTensor, data_G));

        // Wait for the GPU before touching managed output on the host.
        checkCudaErrors(cuCtxSynchronize());

        // Compare against the CPU reference, element by element.
        std::vector<double> ref_G(output_size);
        reference(data_X, data_H, ref_G.data());

        for (int i = 0; i < outputDim[0]; i++) {
            for (int j = 0; j < outputDim[1]; j++) {
                for (int k = 0; k < outputDim[2]; k++) {
                    for (int l = 0; l < outputDim[3]; l++) {
                        size_t idx =
                            ((i * outputDim[1] + j) * outputDim[2] + k) * outputDim[3] + l;
                        double err2 = std::abs(data_G[idx] - ref_G[idx]);
                        if (err2 > 1e-6) {
                            fprintf(stderr,
                                    "Error at Location [%d, %d %d, %d], Expected : %lf"
                                    ",Got : %lf, Error : %e\n",
                                    i, j, k, l, ref_G[idx], data_G[idx], err2);
                            exit(1);
                        }
                    }
                }
            }
        }

        // free memory
        // -------------
        checkCudaErrors(cuMemFree((CUdeviceptr)data_G));
        checkCudaErrors(cuMemFree((CUdeviceptr)data_X));
        checkCudaErrors(cuMemFree((CUdeviceptr)data_H));
        checkCudaErrors(cuMemFree((CUdeviceptr)data_workspace));

        // destroy cuDNN descriptors and handle, then the CUDA context
        // ------------------------------------------------------------
        checkCudnnError(cudnnDestroyTensorDescriptor(inputTensor));
        checkCudnnError(cudnnDestroyTensorDescriptor(outputTensor));
        checkCudnnError(cudnnDestroyFilterDescriptor(filter));
        checkCudnnError(cudnnDestroyConvolutionDescriptor(conv));
        checkCudnnError(cudnnDestroy(handle));
        checkCudaErrors(cuCtxDestroy(context));

        return 0;
    }