Skip to content

Instantly share code, notes, and snippets.

@3gx
Created May 16, 2017 01:05
Show Gist options
  • Select an option

  • Save 3gx/1b90f7bbcf702c60d3076fc7dae62348 to your computer and use it in GitHub Desktop.

Select an option

Save 3gx/1b90f7bbcf702c60d3076fc7dae62348 to your computer and use it in GitHub Desktop.

Revisions

  1. Evghenii Gaburov created this gist May 16, 2017.
    269 changes: 269 additions & 0 deletions conv.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,269 @@
    #include <cuda.h>
    #include <cudnn.h>
    #include <helper_cuda_drvapi.h>
    #include <cstdlib>
    #include <cstring>
    #include <cmath>
    #include <cassert>
    #include <vector>

    void __check(cudnnStatus_t status, const char* const file, int const line)
    {
    if (status != CUDNN_STATUS_SUCCESS) {
    fprintf(stderr, "CUDNN error at %s:%d, code=%d(%s)\n",
    file, line, static_cast<unsigned int>(status),
    cudnnGetErrorString(status));
    exit(1);
    }
    }

    #define checkCudnnError(val) __check(val, __FILE__, __LINE__)

    // Fill x[0 .. size) with pseudo-random doubles produced by drand48()
    // (uniform in [0, 1)).  A size of 0 is a no-op.
    void init_data(double *x, int size) {
        double *const end = x + size;
        for (double *p = x; p != end; ++p)
            *p = drand48();
    }

    // Print a size1 x size2 row-major matrix to stdout, one row per line,
    // each value preceded by a single space.
    void print_array(double* output, int size1, int size2)
    {
        for (int row = 0; row < size1; ++row) {
            double* const rowPtr = output + row * size2;
            for (int col = 0; col < size2; ++col)
                fprintf(stdout," %lf", rowPtr[col]);
            fprintf(stdout, "\n");
        }
    }

    // Input tensor (NCHW): 4 images, 3 channels, 28x28.
    int constexpr IMAGE_BATCH = 4;
    int constexpr IMAGE_HEIGHT = 28;
    int constexpr IMAGE_WIDTH = 28;

    // Filter bank (KCRS): 2 output feature maps, 5x5 kernels over 3 input channels.
    int constexpr FILTER_HEIGHT = 5;
    int constexpr FILTER_WIDTH = 5;
    int constexpr FILTER_BATCH = 2;
    int constexpr INPUT_CHANNEL = 3;

    // Zero-padding of 2 on each side; with a 5x5 filter this keeps the
    // spatial output size equal to the input ("same" convolution).
    int constexpr PAD_HEIGHT = 2;
    int constexpr PAD_WIDTH = 2;

    // All tensors in this program are 4-dimensional.
    int constexpr NDIM = 4;

    // Standard convolution output-size formula (stride 1, dilation 1):
    // out = in + 2*pad - filter + 1.  With the values above this is 28x28.
    int constexpr OUTPUT_HEIGHT = 1 + IMAGE_HEIGHT + 2*PAD_HEIGHT - FILTER_HEIGHT;
    int constexpr OUTPUT_WIDTH = 1 + IMAGE_WIDTH + 2*PAD_WIDTH - FILTER_WIDTH;
    int constexpr OUTPUT_BATCH = IMAGE_BATCH;
    int constexpr OUTPUT_CHANNEL = FILTER_BATCH;

    // CPU reference for the cuDNN forward pass configured in main():
    // a cross-correlation (CUDNN_CROSS_CORRELATION, i.e. no filter flip)
    // with zero padding, stride 1, over NCHW-layout buffers.
    //
    // X_: input  images,  IMAGE_BATCH  x INPUT_CHANNEL  x IMAGE_HEIGHT  x IMAGE_WIDTH
    // H_: filters,        FILTER_BATCH x INPUT_CHANNEL  x FILTER_HEIGHT x FILTER_WIDTH
    // G_: output (written), OUTPUT_BATCH x OUTPUT_CHANNEL x OUTPUT_HEIGHT x OUTPUT_WIDTH
    //
    // Fix vs. original: removed a dead `int constexpr j = 0;` that was
    // immediately shadowed by the `j` loop variable below.
    void reference(double *X_, double *H_, double *G_) {
        // View the flat buffers as 4-D arrays to keep the indexing readable.
        double(*X)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH] =
            (double(*)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH])(X_);

        double(*H)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH] =
            (double(*)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH])(H_);

        double(*G)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH] =
            (double(*)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH])(G_);

        for (int i = 0; i < OUTPUT_BATCH; ++i) {            // image in batch
            for (int j = 0; j < OUTPUT_CHANNEL; ++j) {      // output feature map
                for (int k = 0; k < OUTPUT_HEIGHT; ++k) {
                    for (int l = 0; l < OUTPUT_WIDTH; ++l) {
                        double Gi = 0.0;
                        for (int m = 0; m < FILTER_HEIGHT; ++m) {
                            for (int n = 0; n < FILTER_WIDTH; ++n) {
                                for (int c = 0; c < INPUT_CHANNEL; ++c) {
                                    // Taps that fall into the zero-padding region
                                    // contribute 0 to the sum.
                                    double Xi = 0.0;
                                    if (k + m >= PAD_HEIGHT && k + m < IMAGE_HEIGHT + PAD_HEIGHT &&
                                        l + n >= PAD_WIDTH && l + n < IMAGE_WIDTH + PAD_WIDTH) {
                                        Xi = X[i][c][k - PAD_HEIGHT + m][l - PAD_WIDTH + n];
                                    }
                                    Gi += Xi * H[j][c][m][n];
                                }
                            }
                        }
                        G[i][j][k][l] = Gi;
                    }
                }
            }
        }
    }

    // Runs one cuDNN forward cross-correlation on random data and verifies the
    // result element-wise against the CPU reference().  Exits non-zero on any
    // CUDA/cuDNN failure or numerical mismatch.
    //
    // Fixes vs. original:
    //  - removed an out-of-bounds std::memcpy into outputStride (it read one
    //    int past the end of outputDim and was fully overwritten anyway);
    //  - check the status of cudnnGetConvolutionForwardWorkspaceSize;
    //  - use NDIM instead of a literal 4 for the output-dim query;
    //  - allocate the workspace in bytes (workspace_size is already a byte
    //    count) instead of bytes * sizeof(double);
    //  - destroy the cuDNN descriptors and the CUDA context before exit.
    int main(int argc, char *argv[]) {
        checkCudaErrors(cuInit(0));

        // init device
        int numDevices;
        checkCudaErrors(cuDeviceGetCount(&numDevices));
        if (numDevices == 0) {
            fprintf(stderr, "No CUDA devices found\n");
            exit(1);
        }
        // Device index may be overridden via the NVJIT_GPU_DEVICE env var.
        int deviceNumber = 0;
        if (const char* deviceNumEnv = std::getenv("NVJIT_GPU_DEVICE")) {
            deviceNumber = atoi(deviceNumEnv);
        }
        CUdevice device;
        checkCudaErrors(cuDeviceGet(&device, deviceNumber));

        // create context
        CUcontext context;
        checkCudaErrors(cuCtxCreate(&context, 0, device));

        // Init cudnnHandle
        // -----------------
        cudnnHandle_t handle;
        checkCudnnError(cudnnCreate(&handle));

        // Init cudnn tensor
        // ------------------
        cudnnTensorDescriptor_t inputTensor;
        checkCudnnError(cudnnCreateTensorDescriptor(&inputTensor));

        // Set tensor properties (NCHW, fully-packed strides)
        // ---------------------------------------------------
        int dimA[NDIM] = {IMAGE_BATCH, INPUT_CHANNEL, IMAGE_HEIGHT, IMAGE_WIDTH};
        int strideA[NDIM] = {IMAGE_HEIGHT * IMAGE_WIDTH * INPUT_CHANNEL,
                             IMAGE_HEIGHT * IMAGE_WIDTH, IMAGE_WIDTH, 1};
        checkCudnnError(cudnnSetTensorNdDescriptor(inputTensor, CUDNN_DATA_DOUBLE,
                                                   NDIM, dimA, strideA));

        // Init filter
        // ------------
        cudnnFilterDescriptor_t filter;
        checkCudnnError(cudnnCreateFilterDescriptor(&filter));

        // Set filter properties
        // ----------------------
        int filterDim[NDIM] = {FILTER_BATCH, INPUT_CHANNEL, FILTER_HEIGHT, FILTER_WIDTH};
        checkCudnnError(cudnnSetFilterNdDescriptor(
            filter, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, NDIM, filterDim));

        // Init convolution
        // -----------------
        cudnnConvolutionDescriptor_t conv;
        checkCudnnError(cudnnCreateConvolutionDescriptor(&conv));

        // Set convolution properties: stride 1, dilation 1, cross-correlation
        // (matches the CPU reference, which does not flip the filter).
        // --------------------------------------------------------------------
        int padA[2] = {PAD_HEIGHT, PAD_WIDTH};
        int filterStride[2] = {1, 1};
        int upscale[2] = {1, 1};
        checkCudnnError(cudnnSetConvolutionNdDescriptor(
            conv, 2, padA, filterStride, upscale, CUDNN_CROSS_CORRELATION,
            CUDNN_DATA_DOUBLE));

        // Get the output size and cross-check it against the compile-time values
        // ----------------------------------------------------------------------
        int outputDim[NDIM];
        checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(conv, inputTensor,
                                                              filter, NDIM, outputDim));
        fprintf(stdout, "Output Dimensions: [%d, %d, %d, %d]\n", outputDim[0],
                outputDim[1], outputDim[2], outputDim[3]);
        assert(outputDim[0] == OUTPUT_BATCH);
        assert(outputDim[1] == OUTPUT_CHANNEL);
        assert(outputDim[2] == OUTPUT_HEIGHT);
        assert(outputDim[3] == OUTPUT_WIDTH);

        // Init Output Tensor
        // ------------------
        cudnnTensorDescriptor_t outputTensor;
        checkCudnnError(cudnnCreateTensorDescriptor(&outputTensor));

        // Set output tensor properties: fully-packed NCHW strides computed
        // right-to-left from the dimensions.
        // -----------------------------------------------------------------
        int outputStride[NDIM];
        outputStride[NDIM-1] = 1;
        for (int k = NDIM-2; k >= 0; --k)
            outputStride[k] = outputStride[k+1]*outputDim[k+1];
        checkCudnnError(cudnnSetTensorNdDescriptor(outputTensor, CUDNN_DATA_DOUBLE,
                                                   NDIM, outputDim, outputStride));

        // Get preferred algorithm
        // ------------------------
        cudnnConvolutionFwdAlgo_t algo;
        checkCudnnError(cudnnGetConvolutionForwardAlgorithm(
            handle, inputTensor, filter, conv, outputTensor,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));
        fprintf(stdout, "Algorithm : %d\n", algo);

        // Get the workspace size (reported in bytes)
        // -------------------------------------------
        size_t workspace_size = 0;
        checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
            handle, inputTensor, filter, conv, outputTensor, algo,
            &workspace_size));
        fprintf(stdout, "Workspace Size : %zu\n", workspace_size);

        ////////////// allocate device data

        // convolution data (managed memory, visible to both host and device)
        // -------------------------------------------------------------------
        double *data_X, *data_H, *data_G, *data_workspace{nullptr};
        size_t image_size = IMAGE_WIDTH * IMAGE_HEIGHT * IMAGE_BATCH * INPUT_CHANNEL;
        size_t filter_size = FILTER_WIDTH * FILTER_HEIGHT * FILTER_BATCH * INPUT_CHANNEL;
        size_t output_size = OUTPUT_WIDTH * OUTPUT_HEIGHT * OUTPUT_BATCH * OUTPUT_CHANNEL;

        checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_X,
                                          image_size * sizeof(double),
                                          CU_MEM_ATTACH_GLOBAL));
        checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_H,
                                          filter_size * sizeof(double),
                                          CU_MEM_ATTACH_GLOBAL));
        checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_G,
                                          output_size * sizeof(double),
                                          CU_MEM_ATTACH_GLOBAL));
        // workspace_size is already a byte count; +1 keeps the allocation
        // non-zero (cuMemAllocManaged rejects a size of 0).
        checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_workspace,
                                          workspace_size + 1,
                                          CU_MEM_ATTACH_GLOBAL));

        // init data
        // ----------
        init_data(data_X, image_size);
        init_data(data_H, filter_size);

        // apply the convolution: G = 1.0 * conv(X, H) + 0.0 * G
        // ------------------------------------------------------
        double alpha = 1.0;
        double beta = 0.0;
        checkCudnnError(cudnnConvolutionForward(
            handle, &alpha, inputTensor, data_X, filter, data_H, conv, algo,
            data_workspace, workspace_size, &beta, outputTensor, data_G));

        // Wait for the GPU before touching managed output on the host.
        checkCudaErrors(cuCtxSynchronize());

        // Compare against the CPU reference, element by element.
        std::vector<double> ref_G(output_size);
        reference(data_X, data_H, ref_G.data());

        for (int i = 0; i < outputDim[0]; i++) {
            for (int j = 0; j < outputDim[1]; j++) {
                for (int k = 0; k < outputDim[2]; k++) {
                    for (int l = 0; l < outputDim[3]; l++) {
                        size_t idx =
                            ((i * outputDim[1] + j) * outputDim[2] + k) * outputDim[3] + l;
                        double err2 = std::abs(data_G[idx] - ref_G[idx]);
                        if (err2 > 1e-6) {
                            fprintf(stderr,
                                    "Error at Location [%d, %d %d, %d], Expected : %lf"
                                    ",Got : %lf, Error : %e\n",
                                    i, j, k, l, ref_G[idx], data_G[idx], err2);
                            exit(1);
                        }
                    }
                }
            }
        }

        // free memory
        // -------------
        checkCudaErrors(cuMemFree((CUdeviceptr)data_G));
        checkCudaErrors(cuMemFree((CUdeviceptr)data_X));
        checkCudaErrors(cuMemFree((CUdeviceptr)data_H));
        checkCudaErrors(cuMemFree((CUdeviceptr)data_workspace));

        // destroy cuDNN descriptors and handle, then the CUDA context
        // ------------------------------------------------------------
        checkCudnnError(cudnnDestroyTensorDescriptor(inputTensor));
        checkCudnnError(cudnnDestroyTensorDescriptor(outputTensor));
        checkCudnnError(cudnnDestroyFilterDescriptor(filter));
        checkCudnnError(cudnnDestroyConvolutionDescriptor(conv));
        checkCudnnError(cudnnDestroy(handle));
        checkCudaErrors(cuCtxDestroy(context));

        return 0;
    }