convolution example
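// Forward convolution with cuDNN (NCHW layout, double precision), verified
// against a straightforward CPU reference implementation.
// Build sketch (paths are assumptions; helper_cuda_drvapi.h ships with the
// CUDA samples under common/inc):
//   nvcc -I$CUDA_SAMPLES/common/inc conv.cpp -lcuda -lcudnn -o conv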
#include <cuda.h>
#include <cudnn.h>
#include <helper_cuda_drvapi.h>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cassert>
#include <vector>
void checkCudnnErrorImpl(cudnnStatus_t status, const char* const file, int const line)
{
  if (status != CUDNN_STATUS_SUCCESS) {
    fprintf(stderr, "CUDNN error at %s:%d, code=%d(%s)\n",
            file, line, static_cast<unsigned int>(status),
            cudnnGetErrorString(status));
    exit(1);
  }
}
#define checkCudnnError(val) checkCudnnErrorImpl(val, __FILE__, __LINE__)
void init_data(double *x, int size) {
  for (int i = 0; i < size; i++) {
    x[i] = drand48();
  }
}
// Debugging helper (not called below): prints a size1 x size2 array, one row
// per line.
void print_array(double* output, int size1, int size2)
{
  for (int i = 0; i < size1; i++) {
    for (int j = 0; j < size2; j++)
      fprintf(stdout, " %lf", output[i*size2+j]);
    fprintf(stdout, "\n");
  }
}
int constexpr IMAGE_BATCH = 4;
int constexpr IMAGE_HEIGHT = 28;
int constexpr IMAGE_WIDTH = 28;
int constexpr FILTER_HEIGHT = 5;
int constexpr FILTER_WIDTH = 5;
int constexpr FILTER_BATCH = 2;
int constexpr INPUT_CHANNEL = 3;
int constexpr PAD_HEIGHT = 2;
int constexpr PAD_WIDTH = 2;
int constexpr NDIM = 4;
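// For unit stride, a convolution's output extent is IN + 2*PAD - FILTER + 1.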
int constexpr OUTPUT_HEIGHT = 1 + IMAGE_HEIGHT + 2*PAD_HEIGHT - FILTER_HEIGHT;
int constexpr OUTPUT_WIDTH = 1 + IMAGE_WIDTH + 2*PAD_WIDTH - FILTER_WIDTH;
int constexpr OUTPUT_BATCH = IMAGE_BATCH;
int constexpr OUTPUT_CHANNEL = FILTER_BATCH;
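// CPU reference: zero-padded, unit-stride cross-correlation (no filter flip),
// matching the CUDNN_CROSS_CORRELATION mode used below:
//   G[i][j][k][l] = sum over c,m,n of
//                   X[i][c][k - PAD_HEIGHT + m][l - PAD_WIDTH + n] * H[j][c][m][n]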
void reference(double *X_, double *H_, double *G_) {
  double(*X)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH] =
      (double(*)[INPUT_CHANNEL][IMAGE_HEIGHT][IMAGE_WIDTH])(X_);
  double(*H)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH] =
      (double(*)[INPUT_CHANNEL][FILTER_HEIGHT][FILTER_WIDTH])(H_);
  double(*G)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH] =
      (double(*)[OUTPUT_CHANNEL][OUTPUT_HEIGHT][OUTPUT_WIDTH])(G_);
  for (int i = 0; i < OUTPUT_BATCH; ++i) {
    for (int j = 0; j < OUTPUT_CHANNEL; ++j) {
      for (int k = 0; k < OUTPUT_HEIGHT; ++k) {
        for (int l = 0; l < OUTPUT_WIDTH; ++l) {
          double Gi = 0.0;
          for (int m = 0; m < FILTER_HEIGHT; ++m) {
            for (int n = 0; n < FILTER_WIDTH; ++n) {
              for (int c = 0; c < INPUT_CHANNEL; ++c) {
                double Xi = 0.0;  // zero padding outside the image
                if (k + m >= PAD_HEIGHT && k + m < IMAGE_HEIGHT + PAD_HEIGHT &&
                    l + n >= PAD_WIDTH && l + n < IMAGE_WIDTH + PAD_WIDTH) {
                  Xi = X[i][c][k - PAD_HEIGHT + m][l - PAD_WIDTH + n];
                }
                Gi += Xi * H[j][c][m][n];
              }
            }
          }
          G[i][j][k][l] = Gi;
        }
      }
    }
  }
}
int main(int argc, char *argv[]) {
  checkCudaErrors(cuInit(0));
  // init device
  // -----------
  int numDevices;
  checkCudaErrors(cuDeviceGetCount(&numDevices));
  if (numDevices == 0) {
    fprintf(stderr, "No CUDA devices found\n");
    exit(1);
  }
  int deviceNumber = 0;
  if (const char* deviceNumEnv = std::getenv("NVJIT_GPU_DEVICE")) {
    deviceNumber = atoi(deviceNumEnv);
  }
  CUdevice device;
  checkCudaErrors(cuDeviceGet(&device, deviceNumber));
  // create context
  // --------------
  CUcontext context;
  checkCudaErrors(cuCtxCreate(&context, 0, device));
  // Init cudnnHandle
  // -----------------
  cudnnHandle_t handle;
  checkCudnnError(cudnnCreate(&handle));
  // Init cudnn tensor
  // ------------------
  cudnnTensorDescriptor_t inputTensor;
  checkCudnnError(cudnnCreateTensorDescriptor(&inputTensor));
  // Set tensor properties
  // ----------------------
  int dimA[NDIM] = {IMAGE_BATCH, INPUT_CHANNEL, IMAGE_HEIGHT, IMAGE_WIDTH};
  int strideA[NDIM] = {IMAGE_HEIGHT * IMAGE_WIDTH * INPUT_CHANNEL,
                       IMAGE_HEIGHT * IMAGE_WIDTH, IMAGE_WIDTH, 1};
  checkCudnnError(cudnnSetTensorNdDescriptor(inputTensor, CUDNN_DATA_DOUBLE,
                                             NDIM, dimA, strideA));
  // Init filter
  // ------------
  cudnnFilterDescriptor_t filter;
  checkCudnnError(cudnnCreateFilterDescriptor(&filter));
  // Set filter properties
  // ----------------------
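  // filterDim is {K, C, R, S}: K output feature maps, C input channels, and
  // an R x S kernel.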
  int filterDim[NDIM] = {FILTER_BATCH, INPUT_CHANNEL, FILTER_HEIGHT, FILTER_WIDTH};
  checkCudnnError(cudnnSetFilterNdDescriptor(
      filter, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, NDIM, filterDim));
  // Init convolution
  // -----------------
  cudnnConvolutionDescriptor_t conv;
  checkCudnnError(cudnnCreateConvolutionDescriptor(&conv));
  // Set convolution properties
  // ---------------------------
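  // CUDNN_CROSS_CORRELATION applies the filter without flipping it, which is
  // exactly what the CPU reference above computes; upscale is the dilation
  // factor, so {1, 1} means a dense (undilated) filter.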
  int padA[2] = {PAD_HEIGHT, PAD_WIDTH};
  int filterStride[2] = {1, 1};
  int upscale[2] = {1, 1};
  checkCudnnError(cudnnSetConvolutionNdDescriptor(
      conv, 2, padA, filterStride, upscale, CUDNN_CROSS_CORRELATION,
      CUDNN_DATA_DOUBLE));
  // Get the output size
  // -------------------
  int outputDim[NDIM];
  checkCudnnError(cudnnGetConvolutionNdForwardOutputDim(conv, inputTensor,
                                                        filter, NDIM, outputDim));
  fprintf(stdout, "Output Dimensions: [%d, %d, %d, %d]\n", outputDim[0],
          outputDim[1], outputDim[2], outputDim[3]);
  assert(outputDim[0] == OUTPUT_BATCH);
  assert(outputDim[1] == OUTPUT_CHANNEL);
  assert(outputDim[2] == OUTPUT_HEIGHT);
  assert(outputDim[3] == OUTPUT_WIDTH);
  // Init output tensor
  // ------------------
  cudnnTensorDescriptor_t outputTensor;
  checkCudnnError(cudnnCreateTensorDescriptor(&outputTensor));
  // Set output tensor properties
  // -----------------------------
  // Compute packed NCHW strides from the innermost dimension outward.
  int outputStride[NDIM];
  outputStride[NDIM-1] = 1;
  for (int k = NDIM-2; k >= 0; --k)
    outputStride[k] = outputStride[k+1]*outputDim[k+1];
  checkCudnnError(cudnnSetTensorNdDescriptor(outputTensor, CUDNN_DATA_DOUBLE,
                                             NDIM, outputDim, outputStride));
  // Get preferred algorithm
  // ------------------------
  cudnnConvolutionFwdAlgo_t algo;
  checkCudnnError(cudnnGetConvolutionForwardAlgorithm(
      handle, inputTensor, filter, conv, outputTensor,
      CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo));
  fprintf(stdout, "Algorithm : %d\n", algo);
  // Get the workspace size
  // -----------------------
  size_t workspace_size = 0;
  checkCudnnError(cudnnGetConvolutionForwardWorkspaceSize(
      handle, inputTensor, filter, conv, outputTensor, algo, &workspace_size));
  fprintf(stdout, "Workspace Size : %zu\n", workspace_size);
  ////////////// allocate device data
  // convolution data
  // -----------------
  double *data_X, *data_H, *data_G, *data_workspace{nullptr};
  size_t image_size = IMAGE_WIDTH * IMAGE_HEIGHT * IMAGE_BATCH * INPUT_CHANNEL;
  size_t filter_size = FILTER_WIDTH * FILTER_HEIGHT * FILTER_BATCH * INPUT_CHANNEL;
  size_t output_size = OUTPUT_WIDTH * OUTPUT_HEIGHT * OUTPUT_BATCH * OUTPUT_CHANNEL;
  checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_X,
                                    image_size * sizeof(double),
                                    CU_MEM_ATTACH_GLOBAL));
  checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_H,
                                    filter_size * sizeof(double),
                                    CU_MEM_ATTACH_GLOBAL));
  checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_G,
                                    output_size * sizeof(double),
                                    CU_MEM_ATTACH_GLOBAL));
  // workspace_size is already in bytes; the +1 guards against a zero-byte
  // allocation, which the driver API rejects.
  checkCudaErrors(cuMemAllocManaged((CUdeviceptr *)&data_workspace,
                                    workspace_size + 1,
                                    CU_MEM_ATTACH_GLOBAL));
  // init data
  // ----------
  init_data(data_X, image_size);
  init_data(data_H, filter_size);
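  // Managed allocations are directly addressable from the host, so the inputs
  // can be filled by plain CPU code with no explicit copy to the device.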
  // apply the convolution
  // -----------------------
  double alpha = 1.0;
  double beta = 0.0;
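  // cuDNN blends: G = alpha * conv(X, H) + beta * G, so beta = 0 overwrites
  // any previous contents of data_G.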
  checkCudnnError(cudnnConvolutionForward(
      handle, &alpha, inputTensor, data_X, filter, data_H, conv, algo,
      data_workspace, workspace_size, &beta, outputTensor, data_G));
  // Wait for the GPU to finish before reading managed memory on the host.
  checkCudaErrors(cuCtxSynchronize());
  std::vector<double> ref_G(output_size);
  reference(data_X, data_H, ref_G.data());
  for (int i = 0; i < outputDim[0]; i++) {
    for (int j = 0; j < outputDim[1]; j++) {
      for (int k = 0; k < outputDim[2]; k++) {
        for (int l = 0; l < outputDim[3]; l++) {
          size_t idx =
              ((i * outputDim[1] + j) * outputDim[2] + k) * outputDim[3] + l;
          double err = std::abs(data_G[idx] - ref_G[idx]);
          if (err > 1e-6) {
            fprintf(stderr,
                    "Error at Location [%d, %d, %d, %d], Expected : %lf"
                    ", Got : %lf, Error : %e\n",
                    i, j, k, l, ref_G[idx], data_G[idx], err);
            exit(1);
          }
        }
      }
    }
  }
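  // Every element matched the CPU reference within tolerance.
  fprintf(stdout, "PASSED\n");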
  // free memory
  // -------------
  checkCudaErrors(cuMemFree((CUdeviceptr)data_G));
  checkCudaErrors(cuMemFree((CUdeviceptr)data_X));
  checkCudaErrors(cuMemFree((CUdeviceptr)data_H));
  checkCudaErrors(cuMemFree((CUdeviceptr)data_workspace));
  // destroy descriptors, cudnnHandle, and context
  // ---------------------------------------------
  checkCudnnError(cudnnDestroyTensorDescriptor(inputTensor));
  checkCudnnError(cudnnDestroyTensorDescriptor(outputTensor));
  checkCudnnError(cudnnDestroyFilterDescriptor(filter));
  checkCudnnError(cudnnDestroyConvolutionDescriptor(conv));
  checkCudnnError(cudnnDestroy(handle));
  checkCudaErrors(cuCtxDestroy(context));
  return 0;
}