tea · April 25, 2016 11:14 · Apr 25, 2016
diff --git a/NvEncoderPerf.cpp b/NvEncoderPerf.cpp
@@ -0,0 +1,744 @@
+////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+//
+// Please refer to the NVIDIA end user license agreement (EULA) associated
+// with this source code for terms and conditions that govern your use of
+// this software. Any use, reproduction, disclosure, or distribution of
+// this software and related documentation outside the terms of the EULA
+// is strictly prohibited.
+//
+////////////////////////////////////////////////////////////////////////////
+
+#include "../common/inc/nvEncodeAPI.h"
+#include "../common/inc/nvUtils.h"
+#include "NvEncoderPerf.h"
+#include <process.h>
+
+//#define VERBOSE
+
+#define BITSTREAM_BUFFER_SIZE (2 * 1024 * 1024) // bytes per output bitstream buffer; parenthesized so it expands safely inside larger expressions
+#define MAX_FRAMES_TO_PRELOAD 60 // frames prepared before each timed encode batch; EncodeMain also uses it as the encode buffer count
+
+void CNvEncoderPerf::ConvertYUVpitchToNV12(unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height, int index)
+{
+    // Writes one frame into the NV12 input surface of encode buffer `index`. The source planes are read
+    // as tightly packed: luma rows are `width` bytes apart, Cb/Cr rows `width / 2` bytes apart.
+    uint32_t lockedPitch = 0;
+    unsigned char *pInputSurface = NULL;
+
+    m_pNvHWEncoder->NvEncLockInputBuffer(m_stEncodeBuffer[index].stInputBfr.hInputSurface, (void**)&pInputSurface, &lockedPitch);
+    if (pInputSurface == NULL)
+        return; // the lock produced no mapping: nothing to write and nothing to unlock
+    if (lockedPitch == 0)
+        lockedPitch = width; // resolve the fallback before the chroma offset below is derived from the pitch
+
+    unsigned char *pInputSurfaceCh = pInputSurface + (m_stEncodeBuffer[index].stInputBfr.dwHeight*lockedPitch);
+    int y;
+    int x;
+    for (y = 0; y < height; y++)
+    {
+        memcpy(pInputSurface + (lockedPitch*y), yuv_luma + (width*y), width);
+    }
+    for (y = 0; y < height / 2; y++)
+    {
+        for (x = 0; x < width; x = x + 2)
+        {
+            pInputSurfaceCh[(y*lockedPitch) + x] = yuv_cb[((width / 2)*y) + (x >> 1)];
+            pInputSurfaceCh[(y*lockedPitch) + (x + 1)] = yuv_cr[((width / 2)*y) + (x >> 1)];
+        }
+    }
+    m_pNvHWEncoder->NvEncUnlockInputBuffer(m_stEncodeBuffer[index].stInputBfr.hInputSurface);
+}
+
+void CNvEncoderPerf::ConvertYUVpitchToYUV444(unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height, int index)
+{
+    // Copies three tightly packed full-size planes (stride == width) into the surface of encode buffer `index`; Cb and Cr each start dwHeight rows after the previous plane.
+    uint32_t lockedPitch = 0;
+    unsigned char *pInputSurface = NULL;
+    m_pNvHWEncoder->NvEncLockInputBuffer(m_stEncodeBuffer[index].stInputBfr.hInputSurface, (void**)&pInputSurface, &lockedPitch);
+    if (pInputSurface == NULL)
+        return; // the lock produced no mapping: nothing to write and nothing to unlock
+    if (lockedPitch == 0)
+        lockedPitch = width;
+    unsigned char *pInputSurfaceCb = pInputSurface + (m_stEncodeBuffer[index].stInputBfr.dwHeight*lockedPitch);
+    unsigned char *pInputSurfaceCr = pInputSurfaceCb + (m_stEncodeBuffer[index].stInputBfr.dwHeight*lockedPitch);
+    for (int h = 0; h < height; h++)
+    {
+        memcpy(pInputSurface + lockedPitch * h, yuv_luma + width * h, width);
+        memcpy(pInputSurfaceCb + lockedPitch * h, yuv_cb + width * h, width);
+        memcpy(pInputSurfaceCr + lockedPitch * h, yuv_cr + width * h, width);
+    }
+    m_pNvHWEncoder->NvEncUnlockInputBuffer(m_stEncodeBuffer[index].stInputBfr.hInputSurface);
+}
+
+CNvEncoderPerf::CNvEncoderPerf()
+{
+    // Start with no device, no buffers and zeroed bookkeeping; only the encoder wrapper is allocated here.
+    m_uEncodeBufferCount = 0;
+    m_cuContext = NULL;
+    m_pDevice = NULL;
+#if defined (NV_WINDOWS)
+    m_pD3D = NULL;
+#endif
+    m_pNvHWEncoder = new CNvHWEncoder;
+
+    memset(&m_stEncodeBuffer, 0, sizeof(m_stEncodeBuffer));
+    memset(&m_stEOSOutputBfr, 0, sizeof(m_stEOSOutputBfr));
+    memset(&m_stEncoderInput, 0, sizeof(m_stEncoderInput));
+}
+
+CNvEncoderPerf::~CNvEncoderPerf()
+{
+    // Frees the encoder wrapper allocated by the constructor.
+    // Deleting a null pointer is a no-op, so the delete needs no guard;
+    // the member is cleared afterwards either way.
+    delete m_pNvHWEncoder;
+    m_pNvHWEncoder = NULL;
+}
+
+NVENCSTATUS CNvEncoderPerf::InitCuda(uint32_t deviceID)
+{
+    // Creates a CUDA context on device `deviceID` and keeps it in m_pDevice. Returns NV_ENC_SUCCESS or an NV_ENC_ERR_* code.
+    CUresult cuResult;
+    CUdevice device;
+    CUcontext cuContextCurr;
+    int  deviceCount = 0, SMminor = 0, SMmajor = 0;
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    typedef HMODULE CUDADRIVER;
+#else
+    typedef void *CUDADRIVER;
+#endif
+    CUDADRIVER hHandleDriver = 0;
+
+    cuResult = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
+    if (cuResult != CUDA_SUCCESS)
+    {
+        PRINTERR("cuInit error:0x%x\n", cuResult);
+        assert(0);
+        return NV_ENC_ERR_NO_ENCODE_DEVICE;
+    }
+
+    cuResult = cuDeviceGetCount(&deviceCount);
+    if (cuResult != CUDA_SUCCESS)
+    {
+        PRINTERR("cuDeviceGetCount error:0x%x\n", cuResult);
+        assert(0);
+        return NV_ENC_ERR_NO_ENCODE_DEVICE;
+    }
+
+    // If dev is negative value, we clamp to 0
+    if ((int)deviceID < 0)
+        deviceID = 0;
+
+    if (deviceCount <= 0 || deviceID >= (unsigned int)deviceCount) // also rejects "no devices": `count - 1` would wrap to UINT_MAX for a count of 0
+    {
+        PRINTERR("Invalid Device Id = %d\n", deviceID);
+        return NV_ENC_ERR_INVALID_ENCODERDEVICE;
+    }
+
+    cuResult = cuDeviceGet(&device, deviceID);
+    if (cuResult != CUDA_SUCCESS)
+    {
+        PRINTERR("cuDeviceGet error:0x%x\n", cuResult);
+        return NV_ENC_ERR_NO_ENCODE_DEVICE;
+    }
+
+    cuResult = cuDeviceComputeCapability(&SMmajor, &SMminor, device); // query the handle from cuDeviceGet, not the raw ordinal
+    if (cuResult != CUDA_SUCCESS)
+    {
+        PRINTERR("cuDeviceComputeCapability error:0x%x\n", cuResult);
+        return NV_ENC_ERR_NO_ENCODE_DEVICE;
+    }
+
+    if (((SMmajor << 4) + SMminor) < 0x30) // i.e. compute capability below 3.0
+    {
+        PRINTERR("GPU %d does not have NVENC capabilities exiting\n", deviceID);
+        return NV_ENC_ERR_NO_ENCODE_DEVICE;
+    }
+
+    cuResult = cuCtxCreate((CUcontext*)(&m_pDevice), 0, device); // the context handle is stored in the untyped m_pDevice
+    if (cuResult != CUDA_SUCCESS)
+    {
+        PRINTERR("cuCtxCreate error:0x%x\n", cuResult);
+        assert(0);
+        return NV_ENC_ERR_NO_ENCODE_DEVICE;
+    }
+
+    cuResult = cuCtxPopCurrent(&cuContextCurr);
+    if (cuResult != CUDA_SUCCESS)
+    {
+        PRINTERR("cuCtxPopCurrent error:0x%x\n", cuResult);
+        assert(0);
+        return NV_ENC_ERR_NO_ENCODE_DEVICE;
+    }
+    return NV_ENC_SUCCESS;
+}
+
+#if defined(NV_WINDOWS)
+NVENCSTATUS CNvEncoderPerf::InitD3D9(uint32_t deviceID)
+{
+    // Creates the IDirect3D9 object (m_pD3D) and a windowed HAL device on adapter `deviceID` (m_pDevice).
+    D3DPRESENT_PARAMETERS d3dpp;
+    D3DADAPTER_IDENTIFIER9 adapterId;
+    HRESULT hr = S_OK;
+
+    m_pD3D = Direct3DCreate9(D3D_SDK_VERSION);
+    if (m_pD3D == NULL)
+    {
+        assert(m_pD3D);
+        return NV_ENC_ERR_OUT_OF_MEMORY;
+    }
+
+    if (deviceID >= m_pD3D->GetAdapterCount())
+    {
+        PRINTERR("Invalid Device Id = %d. Please use DX10/DX11 to detect headless video devices.\n", deviceID);
+        return NV_ENC_ERR_INVALID_ENCODERDEVICE;
+    }
+
+    hr = m_pD3D->GetAdapterIdentifier(deviceID, 0, &adapterId); // only the result code is used, as a validity check
+    if (hr != S_OK)
+    {
+        PRINTERR("Invalid Device Id = %d\n", deviceID);
+        return NV_ENC_ERR_INVALID_ENCODERDEVICE;
+    }
+
+    ZeroMemory(&d3dpp, sizeof(d3dpp));
+    d3dpp.Windowed = TRUE;
+    d3dpp.BackBufferFormat = D3DFMT_X8R8G8B8;
+    d3dpp.BackBufferWidth = 640;
+    d3dpp.BackBufferHeight = 480;
+    d3dpp.BackBufferCount = 1;
+    d3dpp.SwapEffect = D3DSWAPEFFECT_COPY;
+    d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
+    d3dpp.Flags = D3DPRESENTFLAG_VIDEO;//D3DPRESENTFLAG_LOCKABLE_BACKBUFFER;
+    DWORD dwBehaviorFlags = D3DCREATE_FPU_PRESERVE | D3DCREATE_MULTITHREADED | D3DCREATE_HARDWARE_VERTEXPROCESSING;
+
+    hr = m_pD3D->CreateDevice(deviceID,
+        D3DDEVTYPE_HAL,
+        GetDesktopWindow(),
+        dwBehaviorFlags,
+        &d3dpp,
+        (IDirect3DDevice9**)(&m_pDevice));
+
+    if (FAILED(hr))
+        return NV_ENC_ERR_OUT_OF_MEMORY;
+
+    return  NV_ENC_SUCCESS;
+}
+
+NVENCSTATUS CNvEncoderPerf::InitD3D10(uint32_t deviceID)
+{
+    // Creates a D3D10 device on DXGI adapter `deviceID` and stores it in m_pDevice.
+    IDXGIFactory * pFactory = NULL;
+    IDXGIAdapter * pAdapter = NULL;
+
+    if (CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory) != S_OK)
+    {
+        return NV_ENC_ERR_GENERIC;
+    }
+
+    if (pFactory->EnumAdapters(deviceID, &pAdapter) == DXGI_ERROR_NOT_FOUND)
+    {
+        PRINTERR("Invalid Device Id = %d\n", deviceID);
+        pFactory->Release();
+        return NV_ENC_ERR_INVALID_ENCODERDEVICE;
+    }
+    HRESULT hr = D3D10CreateDevice(pAdapter, D3D10_DRIVER_TYPE_HARDWARE, NULL, 0,
+        D3D10_SDK_VERSION, (ID3D10Device**)(&m_pDevice));
+    // The factory and adapter references obtained above are ours to release on every path.
+    if (pAdapter) pAdapter->Release();
+    pFactory->Release();
+    if (FAILED(hr))
+    {
+        PRINTERR("Invalid Device Id = %d\n", deviceID);
+        return NV_ENC_ERR_OUT_OF_MEMORY;
+    }
+    return  NV_ENC_SUCCESS;
+}
+
+NVENCSTATUS CNvEncoderPerf::InitD3D11(uint32_t deviceID)
+{
+    // Creates a D3D11 device on DXGI adapter `deviceID` and stores it in m_pDevice.
+    IDXGIFactory * pFactory = NULL;
+    IDXGIAdapter * pAdapter = NULL;
+
+    if (CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory) != S_OK)
+    {
+        return NV_ENC_ERR_GENERIC;
+    }
+
+    if (pFactory->EnumAdapters(deviceID, &pAdapter) == DXGI_ERROR_NOT_FOUND)
+    {
+        PRINTERR("Invalid Device Id = %d\n", deviceID);
+        pFactory->Release();
+        return NV_ENC_ERR_NO_ENCODE_DEVICE;
+    }
+    HRESULT hr = D3D11CreateDevice(pAdapter, D3D_DRIVER_TYPE_UNKNOWN, NULL, 0,
+        NULL, 0, D3D11_SDK_VERSION, (ID3D11Device**)(&m_pDevice), NULL, NULL);
+    // The factory and adapter references obtained above are ours to release on every path.
+    if (pAdapter) pAdapter->Release();
+    pFactory->Release();
+    if (FAILED(hr))
+    {
+        PRINTERR("Invalid Device Id = %d\n", deviceID);
+        return NV_ENC_ERR_OUT_OF_MEMORY;
+    }
+    return  NV_ENC_SUCCESS;
+}
+#endif
+
+NVENCSTATUS CNvEncoderPerf::AllocateIOBuffers(uint32_t uInputWidth, uint32_t uInputHeight, int isYuv444)
+{
+    // Registers the buffer array with the queue, then gives every slot an input surface
+    // and a bitstream buffer; the first failing allocation aborts and returns its status.
+    m_EncodeBufferQueue.Initialize(m_stEncodeBuffer, m_uEncodeBufferCount);
+
+    for (uint32_t slot = 0; slot < m_uEncodeBufferCount; ++slot)
+    {
+        EncodeBuffer &buffer = m_stEncodeBuffer[slot];
+
+        NVENCSTATUS status = m_pNvHWEncoder->NvEncCreateInputBuffer(uInputWidth, uInputHeight, &buffer.stInputBfr.hInputSurface, isYuv444);
+        if (status != NV_ENC_SUCCESS)
+        {
+            PRINTERR("Failed to allocate Input Buffer, Please reduce MAX_FRAMES_TO_PRELOAD\n");
+            return status;
+        }
+        // The recorded surface format follows the chroma-format flag.
+        buffer.stInputBfr.bufferFmt = (isYuv444 == 0) ? NV_ENC_BUFFER_FORMAT_NV12_PL : NV_ENC_BUFFER_FORMAT_YUV444_PL;
+        buffer.stInputBfr.dwWidth = uInputWidth;
+        buffer.stInputBfr.dwHeight = uInputHeight;
+
+        status = m_pNvHWEncoder->NvEncCreateBitstreamBuffer(BITSTREAM_BUFFER_SIZE, &buffer.stOutputBfr.hBitstreamBuffer);
+        if (status != NV_ENC_SUCCESS)
+        {
+            PRINTERR("Failed to allocate Output Buffer, Please reduce MAX_FRAMES_TO_PRELOAD\n");
+            return status;
+        }
+        buffer.stOutputBfr.dwBitstreamBufferSize = BITSTREAM_BUFFER_SIZE;
+    }
+    m_stEOSOutputBfr.bEOSFlag = TRUE;
+    return NV_ENC_SUCCESS;
+}
+
+NVENCSTATUS CNvEncoderPerf::ReleaseIOBuffers()
+{
+    // Destroys the input surface and bitstream buffer of every slot in use and clears both handles.
+    for (uint32_t slot = 0; slot < m_uEncodeBufferCount; ++slot)
+    {
+        EncodeBuffer &buffer = m_stEncodeBuffer[slot];
+        char handleText[128];
+        m_pNvHWEncoder->NvEncDestroyInputBuffer(buffer.stInputBfr.hInputSurface);
+        sprintf(handleText, "%p\n", (void*)buffer.stInputBfr.hInputSurface); // debug trace of the handle value just destroyed
+        OutputDebugStringA(handleText);
+        buffer.stInputBfr.hInputSurface = NULL;
+        m_pNvHWEncoder->NvEncDestroyBitstreamBuffer(buffer.stOutputBfr.hBitstreamBuffer);
+        buffer.stOutputBfr.hBitstreamBuffer = NULL;
+    }
+    return NV_ENC_SUCCESS;
+}
+
+NVENCSTATUS CNvEncoderPerf::FlushEncoder()
+{
+    // Flushes the encoder queue, then processes the output of every buffer still pending.
+    const NVENCSTATUS flushStatus = m_pNvHWEncoder->NvEncFlushEncoderQueue(nullptr);
+    if (flushStatus != NV_ENC_SUCCESS)
+    {
+        assert(0);
+        return flushStatus;
+    }
+    // GetPending() yields NULL once no pending entries are left.
+    for (EncodeBuffer *pending = m_EncodeBufferQueue.GetPending();
+         pending != NULL;
+         pending = m_EncodeBufferQueue.GetPending())
+    {
+        m_pNvHWEncoder->ProcessOutput(pending);
+    }
+    return flushStatus;
+}
+
+NVENCSTATUS CNvEncoderPerf::Deinitialize(uint32_t devicetype) // tears down buffers, encoder and device; returns the NvEncDestroyEncoder status
+{
+    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;
+
+    ReleaseIOBuffers(); // buffers go first, while the encoder that created them still exists
+
+    nvStatus = m_pNvHWEncoder->NvEncDestroyEncoder();
+
+    if (m_pDevice)
+    {
+        switch (devicetype) // m_pDevice is untyped; devicetype selects how it is released
+        {
+#if defined(NV_WINDOWS)
+        case NV_ENC_DX9:
+            ((IDirect3DDevice9*)(m_pDevice))->Release();
+            break;
+
+        case NV_ENC_DX10:
+            ((ID3D10Device*)(m_pDevice))->Release();
+            break;
+
+        case NV_ENC_DX11:
+            ((ID3D11Device*)(m_pDevice))->Release();
+            break;
+#endif
+
+        case NV_ENC_CUDA: // last label in the switch, so no other case jumps over the declaration below
+            CUresult cuResult = CUDA_SUCCESS;
+            cuResult = cuCtxDestroy((CUcontext)m_pDevice);
+            if (cuResult != CUDA_SUCCESS)
+                PRINTERR("cuCtxDestroy error:0x%x\n", cuResult); // logged only; the return value is not affected
+        }
+
+        m_pDevice = NULL; // cleared for every devicetype, including values no case matched
+    }
+
+#if defined (NV_WINDOWS)
+    if (m_pD3D) // only InitD3D9 sets this
+    {
+        m_pD3D->Release();
+        m_pD3D = NULL;
+    }
+#endif
+
+    return nvStatus;
+}
+
+void PrintHelp() // prints the option summary to stdout; EncodeMain calls it when argument parsing or the size check fails
+{
+    printf("Usage : NvEncoderPerf \n"
+        "-i <string>                  Specify input yuv420 file\n"
+        "-o <string>                  Specify output bitstream file\n"
+        "-size <int int>              Specify input resolution <width height>\n"
+        "\n### Optional parameters ###\n"
+        "-codec <integer>             Specify the codec \n"
+        "                                 0: H264\n"
+        "                                 1: HEVC\n"
+        "-preset <string>             Specify the preset for encoder settings\n"
+        "                                 hq : nvenc HQ \n"
+        "                                 hp : nvenc HP \n"
+        "                                 lowLatencyHP : nvenc low latency HP \n"
+        "                                 lowLatencyHQ : nvenc low latency HQ \n"
+        "-startf <integer>            Specify start index for encoding. Default is 0\n"
+        "-endf <integer>              Specify end index for encoding. Default is end of file\n"
+        "-fps <integer>               Specify encoding frame rate\n"
+        "-goplength <integer>         Specify gop length\n"
+        "-numB <integer>              Specify number of B frames\n"
+        "-bitrate <integer>           Specify the encoding average bitrate\n"
+        "-vbvMaxBitrate <integer>     Specify the vbv max bitrate\n"
+        "-vbvSize <integer>           Specify the encoding vbv/hrd buffer size\n"
+        "-rcmode <integer>            Specify the rate control mode\n"
+        "                                 0:  Constant QP\n"
+        "                                 1:  Single pass VBR\n"
+        "                                 2:  Single pass CBR\n"
+        "                                 4:  Single pass VBR minQP\n"
+        "                                 8:  Two pass frame quality\n"
+        "                                 16: Two pass frame size cap\n"
+        "                                 32: Two pass VBR\n"
+        "-qp <integer>                Specify qp for Constant QP mode\n"
+        "-i_qfactor <float>           Specify qscale difference between I-frames and P-frames\n"
+        "-b_qfactor <float>           Specify qscale difference between P-frames and B-frames\n" 
+        "-i_qoffset <float>           Specify qscale offset between I-frames and P-frames\n"
+        "-b_qoffset <float>           Specify qscale offset between P-frames and B-frames\n" 
+        "-devicetype <integer>        Specify devicetype used for encoding\n"
+        "                                 0:  DX9\n"
+        "                                 1:  DX11\n"
+        "                                 2:  Cuda\n"
+        "                                 3:  DX10\n"
+        "-deviceID <integer>          Specify the GPU device on which encoding will take place\n"
+        "-yuv444 <integer>            Specify the input YUV format\n"
+        "                                 0: YUV 420\n"
+        "                                 1: YUV 444\n"
+        "-help                        Prints Help Information\n\n"
+        );
+}
+
+int CNvEncoderPerf::EncodeMain(std::atomic<int>& generation)
+{
+    // Runs one full session: configure, create device and encoder, encode synthetic frames in batches until `generation` changes or endFrameIdx is passed, then tear down. Returns 0 on success, 1 on failure.
+    uint8_t *yuv[3] = { 0 };
+    unsigned long long lStart, lEnd, lFreq;
+    int numFramesEncoded = 0;
+    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;
+    bool bError = false, eof = false;
+    double elapsedTime = 0.0f;
+    EncodeConfig encodeConfig;
+    uint32_t chromaFormatIDC = 0;
+    int32_t lumaPlaneSize = 0, chromaPlaneSize = 0;
+    int gen = 0; // declared up front so the `goto exit` paths below do not jump over an initialization
+    memset(&encodeConfig, 0, sizeof(EncodeConfig));
+
+	encodeConfig.width = 736;
+	encodeConfig.height = 576;
+    encodeConfig.endFrameIdx = INT_MAX;
+    encodeConfig.bitrate = 5000000;
+    encodeConfig.rcMode = NV_ENC_PARAMS_RC_CONSTQP;
+    encodeConfig.gopLength = NVENC_INFINITE_GOPLENGTH;
+    encodeConfig.deviceType = NV_ENC_CUDA;
+    encodeConfig.codec = NV_ENC_H264;
+    encodeConfig.fps = 30;
+    encodeConfig.qp = 28;
+    encodeConfig.i_quant_factor = DEFAULT_I_QFACTOR;
+    encodeConfig.b_quant_factor = DEFAULT_B_QFACTOR;
+    encodeConfig.i_quant_offset = DEFAULT_I_QOFFSET;
+    encodeConfig.b_quant_offset = DEFAULT_B_QOFFSET;
+    encodeConfig.presetGUID = NV_ENC_PRESET_DEFAULT_GUID;
+    encodeConfig.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
+
+    nvStatus = m_pNvHWEncoder->ParseArguments(&encodeConfig, 0, nullptr); // no command line is passed: the defaults above stay in effect
+    if (nvStatus != NV_ENC_SUCCESS)
+    {
+        PrintHelp();
+        return 1;
+    }
+
+    if (encodeConfig.width == 0 || encodeConfig.height == 0)
+    {
+        PrintHelp();
+        return 1;
+    }
+
+    switch (encodeConfig.deviceType)
+    {
+#if defined(NV_WINDOWS)
+    case NV_ENC_DX9:
+        nvStatus = InitD3D9(encodeConfig.deviceID);
+        break;
+
+    case NV_ENC_DX10:
+        nvStatus = InitD3D10(encodeConfig.deviceID);
+        break;
+
+    case NV_ENC_DX11:
+        nvStatus = InitD3D11(encodeConfig.deviceID);
+        break;
+#endif
+    case NV_ENC_CUDA:
+        nvStatus = InitCuda(encodeConfig.deviceID);
+        break;
+    }
+    if (nvStatus != NV_ENC_SUCCESS)
+        return 1; // device creation failed: do not hand an unusable device to the encoder
+    if (encodeConfig.deviceType != NV_ENC_CUDA)
+        nvStatus = m_pNvHWEncoder->Initialize(m_pDevice, NV_ENC_DEVICE_TYPE_DIRECTX);
+    else
+        nvStatus = m_pNvHWEncoder->Initialize(m_pDevice, NV_ENC_DEVICE_TYPE_CUDA);
+
+    if (nvStatus != NV_ENC_SUCCESS)
+        return 1;
+
+    encodeConfig.presetGUID = m_pNvHWEncoder->GetPresetGUID(encodeConfig.encoderPreset, encodeConfig.codec);
+#ifdef VERBOSE
+    printf("Encoding input           : \"%s\"\n", encodeConfig.inputFileName);
+    printf("         output          : \"%s\"\n", encodeConfig.outputFileName);
+    printf("         codec           : \"%s\"\n", encodeConfig.codec == NV_ENC_HEVC ? "HEVC" : "H264");
+    printf("         size            : %dx%d\n", encodeConfig.width, encodeConfig.height);
+    printf("         bitrate         : %d bits/sec\n", encodeConfig.bitrate);
+    printf("         vbvMaxBitrate   : %d bits/sec\n", encodeConfig.vbvMaxBitrate);
+    printf("         vbvSize         : %d bits\n", encodeConfig.vbvSize);
+    printf("         fps             : %d frames/sec\n", encodeConfig.fps);
+    printf("         rcMode          : %s\n", encodeConfig.rcMode == NV_ENC_PARAMS_RC_CONSTQP ? "CONSTQP" :
+                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR ? "VBR" :
+                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_CBR ? "CBR" :
+                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR_MINQP ? "VBR MINQP" :
+                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_2_PASS_QUALITY ? "TWO_PASS_QUALITY" :
+                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP ? "TWO_PASS_FRAMESIZE_CAP" :
+                                              encodeConfig.rcMode == NV_ENC_PARAMS_RC_2_PASS_VBR ? "TWO_PASS_VBR" : "UNKNOWN");
+    if (encodeConfig.gopLength == NVENC_INFINITE_GOPLENGTH)
+        printf("         goplength       : INFINITE GOP \n");
+    else
+        printf("         goplength       : %d \n", encodeConfig.gopLength);
+    printf("         B frames        : %d \n", encodeConfig.numB);
+    printf("         QP              : %d \n", encodeConfig.qp);
+    printf("         preset          : %s\n", (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_HQ_GUID) ? "LOW_LATENCY_HQ" :
+                                        (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_HP_GUID) ? "LOW_LATENCY_HP" :
+                                        (encodeConfig.presetGUID == NV_ENC_PRESET_HQ_GUID) ? "HQ_PRESET" :
+                                        (encodeConfig.presetGUID == NV_ENC_PRESET_HP_GUID) ? "HP_PRESET" :
+                                        (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_DEFAULT_GUID) ? "LOW_LATENCY_DEFAULT" : "DEFAULT");
+    printf("         devicetype      : %s\n", encodeConfig.deviceType == NV_ENC_DX9 ? "DX9" :
+                                        encodeConfig.deviceType == NV_ENC_DX10 ? "DX10" :
+                                        encodeConfig.deviceType == NV_ENC_DX11 ? "DX11" :
+                                        encodeConfig.deviceType == NV_ENC_CUDA ? "CUDA" : "INVALID");
+
+    printf("\n");
+#endif
+    nvStatus = m_pNvHWEncoder->CreateEncoder(&encodeConfig);
+    if (nvStatus != NV_ENC_SUCCESS)
+        return 1;
+
+    m_uEncodeBufferCount = MAX_FRAMES_TO_PRELOAD;
+
+    nvStatus = AllocateIOBuffers(encodeConfig.width, encodeConfig.height, encodeConfig.isYuv444);
+    if (nvStatus != NV_ENC_SUCCESS)
+    {
+        bError = true;
+        goto exit;
+    }
+    chromaFormatIDC = (encodeConfig.isYuv444 ? 3 : 1);
+    lumaPlaneSize = encodeConfig.width * encodeConfig.height;
+    chromaPlaneSize = (chromaFormatIDC == 3) ? lumaPlaneSize : (lumaPlaneSize >> 2);
+
+    yuv[0] = new uint8_t[lumaPlaneSize];
+    yuv[1] = new uint8_t[chromaPlaneSize];
+    yuv[2] = new uint8_t[chromaPlaneSize];
+
+    NvQueryPerformanceCounter(&lStart);
+
+	gen = generation; // snapshot; a later change of `generation` is the stop request
+    for (int frm = encodeConfig.startFrameIdx; frm <= encodeConfig.endFrameIdx; frm += MAX_FRAMES_TO_PRELOAD)
+    {
+        int numFramesLoaded = 0;
+        for (int frmCnt = frm; frmCnt <= MIN(frm + MAX_FRAMES_TO_PRELOAD - 1, encodeConfig.endFrameIdx); frmCnt++)
+        {
+			if (gen != generation)
+            {
+                eof = true;
+                break;
+            }
+
+			for(int y = 0; y < encodeConfig.height; ++y) // synthetic gradient test pattern; no input file is read
+				for (int x = 0; x < encodeConfig.width; ++x)
+				{
+					yuv[0][encodeConfig.width*y + x] = (x + y) % 256;
+					yuv[1][encodeConfig.width / 2 * (y / 2) + x / 2] = (3 * x + y) % 256;
+					yuv[2][encodeConfig.width / 2 * (y / 2) + x / 2] = (x + 3 * y) % 256;
+				}
+			ConvertYUVpitchToNV12(yuv[0], yuv[1], yuv[2], encodeConfig.width, encodeConfig.height, (frmCnt - frm)); // NOTE(review): NV12 path is used even when isYuv444 is set - confirm intent
+			numFramesLoaded++;
+        }
+
+        if (numFramesLoaded)
+        {
+            NvQueryPerformanceCounter(&lStart); // only the encode + flush of the batch is timed, not the frame preparation
+            for (int frmCnt = 0; frmCnt < numFramesLoaded; frmCnt++)
+            {
+                EncodeFrame(false, encodeConfig.width, encodeConfig.height);
+                numFramesEncoded++;
+            }
+            nvStatus = EncodeFrame(true, encodeConfig.width, encodeConfig.height);
+            if (nvStatus != NV_ENC_SUCCESS)
+            {
+                bError = true;
+                goto exit;
+            }
+            NvQueryPerformanceCounter(&lEnd);
+            elapsedTime += (double)(lEnd - lStart);
+        }
+        if (eof == true)
+        {
+            break;
+        }
+    }
+#ifdef VERBOSE
+    if (numFramesEncoded > 0)
+    {
+        NvQueryPerformanceFrequency(&lFreq);
+        printf("Encoded %d frames in %6.2fms\n", numFramesEncoded, (elapsedTime*1000.0) / lFreq);
+        printf("Average Encode Time : %6.2fms\n", ((elapsedTime*1000.0) / numFramesEncoded) / lFreq);
+        printf("Frames per second: %dfps\n", (int)((float)numFramesEncoded * 1000.0 /(float)((elapsedTime*1000.0) / lFreq)));
+    }
+#endif
+exit:
+    Deinitialize(encodeConfig.deviceType);
+
+    for (int i = 0; i < 3; i ++)
+    {
+        if (yuv[i])
+        {
+            delete [] yuv[i];
+        }
+    }
+
+    return bError ? 1 : 0;
+}
+
+NVENCSTATUS CNvEncoderPerf::EncodeFrame(bool bFlush, uint32_t width, uint32_t height)
+{
+    // Submits the next queued input surface for encoding, or drains the encoder when bFlush is set.
+    //   bFlush        : true -> flush and process all pending output; width/height are not used.
+    //   width, height : frame dimensions forwarded to NvEncEncodeFrame.
+    // Returns the status of the flush or of the frame submission.
+    if (bFlush)
+    {
+        // Propagate the flush result so callers can detect a failed drain.
+        return FlushEncoder();
+    }
+
+    EncodeBuffer *pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
+    if(!pEncodeBuffer)
+    {
+        // Every slot is in flight: process the oldest pending buffer to free one.
+        m_pNvHWEncoder->ProcessOutput(m_EncodeBufferQueue.GetPending());
+        pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
+    }
+
+    if (!pEncodeBuffer)
+        return NV_ENC_ERR_GENERIC; // still no free slot (e.g. a zero-sized queue): nothing to submit
+
+    return m_pNvHWEncoder->NvEncEncodeFrame(pEncodeBuffer, NULL, width, height);
+}
+
+class Encoder // runs CNvEncoderPerf::EncodeMain in an endless loop on its own thread
+{
+public:
+	HANDLE hThread = INVALID_HANDLE_VALUE; // set by runInThread(); never closed or joined in this class
+	HANDLE hEvent = INVALID_HANDLE_VALUE; // signalled by the worker each time EncodeMain returns
+	std::atomic<int> generation{ 0 }; // bumped by finalize(); EncodeMain stops loading frames once it changes
+	CNvEncoderPerf encoder;
+
+	Encoder()
+	{
+		hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); // manual-reset, initially non-signalled, unnamed
+	}
+
+	~Encoder()
+	{
+		CloseHandle(hEvent); // NOTE(review): the worker thread is not stopped here and may still use this object - confirm lifetime
+	}
+
+	static unsigned  __stdcall threadFunc(void* pArguments) // _beginthreadex entry point; pArguments is the Encoder instance
+	{
+		((Encoder*)pArguments)->inThread();
+		return 0; // not reached while inThread() loops forever
+	}
+
+	void inThread()
+	{
+		for (;;) // no exit condition: one EncodeMain session after another
+		{
+			printf("<");
+			encoder.EncodeMain(generation); // return value is ignored
+			printf(">");
+			SetEvent(hEvent); // tells finalize() that this session has ended
+		}
+	}
+
+	void runInThread()
+	{
+		hThread = (HANDLE)_beginthreadex(NULL, 0, &Encoder::threadFunc, this, 0, nullptr);
+	}
+
+	void finalize() // asks the current session to stop and blocks until the worker signals that it has ended
+	{
+		++generation;
+		WaitForSingleObject(hEvent, INFINITE); // NOTE(review): blocks forever if no worker thread is running
+		ResetEvent(hEvent); // manual-reset event: re-arm it for the next finalize()
+	}
+};
+
+int main(int argc, char **argv)
+{
+	// Two encoders run on their own threads; only `encoder` is asked to end its session
+	// once per second, `encoder_a` keeps encoding undisturbed.
+	Encoder encoder;
+	Encoder encoder_a;
+	encoder_a.runInThread();
+	encoder.runInThread();
+	while (true)
+	{
+		Sleep(1000);
+		encoder.finalize();
+	}
+	return 0;
+}
diff --git a/NvEncoderPerf.h b/NvEncoderPerf.h
@@ -0,0 +1,141 @@
+////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+//
+// Please refer to the NVIDIA end user license agreement (EULA) associated
+// with this source code for terms and conditions that govern your use of
+// this software. Any use, reproduction, disclosure, or distribution of
+// this software and related documentation outside the terms of the EULA
+// is strictly prohibited.
+//
+////////////////////////////////////////////////////////////////////////////
+
+#if defined(NV_WINDOWS)
+    #include <d3d9.h>
+    #include <d3d10_1.h>
+    #include <d3d11.h>
+#pragma warning(disable : 4996)
+#endif
+#include <atomic>
+#include "../common/inc/NvHWEncoder.h"
+
+#define MAX_ENCODE_QUEUE 100 // capacity of CNvEncoderPerf::m_stEncodeBuffer
+
+#define SET_VER(configStruct, type) {configStruct.version = type##_VER;} // stamps configStruct.version with the <type>_VER constant
+
+template<class T>
+class CNvQueue { // FIFO ring over a caller-owned item array: GetAvailable() hands slots out, GetPending() returns them in the same order
+    T** m_pBuffer;                  // heap table of pointers into the caller's items
+    unsigned int m_uSize;           // number of slots
+    unsigned int m_uPendingCount;   // slots handed out and not yet returned by GetPending()
+    unsigned int m_uAvailableIdx;   // next slot GetAvailable() hands out
+    unsigned int m_uPendingndex;    // next slot GetPending() returns
+public:
+    CNvQueue(): m_pBuffer(NULL), m_uSize(0), m_uPendingCount(0), m_uAvailableIdx(0), m_uPendingndex(0)
+    {
+    }
+
+    ~CNvQueue()
+    {
+        delete[] m_pBuffer;
+    }
+
+    bool Initialize(T *pItems, unsigned int uSize) // (re)binds the queue to pItems[0..uSize) and empties it; always returns true
+    {
+        delete[] m_pBuffer; // a repeated Initialize must not leak the previous pointer table
+        m_pBuffer = NULL;   // keeps the destructor safe should the allocation below throw
+        m_uSize = uSize;
+        m_uPendingCount = 0;
+        m_uAvailableIdx = 0;
+        m_uPendingndex = 0;
+        m_pBuffer = new T *[m_uSize];
+        for (unsigned int i = 0; i < m_uSize; i++)
+        {
+            m_pBuffer[i] = &pItems[i];
+        }
+        return true;
+    }
+
+    T * GetAvailable() // next free slot, or NULL when every slot is pending (also when the size is 0)
+    {
+        T *pItem = NULL;
+        if (m_uPendingCount == m_uSize)
+        {
+            return NULL;
+        }
+        pItem = m_pBuffer[m_uAvailableIdx];
+        m_uAvailableIdx = (m_uAvailableIdx+1)%m_uSize;
+        m_uPendingCount += 1;
+        return pItem;
+    }
+
+    T* GetPending() // oldest handed-out slot, or NULL when nothing is pending
+    {
+        if (m_uPendingCount == 0)
+        {
+            return NULL;
+        }
+
+        T *pItem = m_pBuffer[m_uPendingndex];
+        m_uPendingndex = (m_uPendingndex+1)%m_uSize;
+        m_uPendingCount -= 1;
+        return pItem;
+    }
+};
+
+typedef struct _EncodeFrameConfig // per-frame input description; not referenced by the code visible in this patch excerpt
+{
+    uint8_t  *yuv[3];     // plane pointers - presumably Y, Cb, Cr; TODO confirm
+    uint32_t stride[3];   // per-plane stride - presumably in bytes; TODO confirm
+    uint32_t width;
+    uint32_t height;
+}EncodeFrameConfig;
+
+typedef enum // device back-ends; the values match the -devicetype numbers printed by PrintHelp()
+{
+    NV_ENC_DX9 = 0,
+    NV_ENC_DX11 = 1,
+    NV_ENC_CUDA = 2, // EncodeMain's default device type
+    NV_ENC_DX10 = 3,
+} NvEncodeDeviceType;
+
+class CNvEncoderPerf // encode-throughput harness: owns the device, the encoder wrapper and the I/O buffer pool
+{
+public:
+    CNvEncoderPerf();
+    virtual ~CNvEncoderPerf();
+
+	int EncodeMain(std::atomic<int>& generation); // runs one encode session; returns 0 on success, 1 on failure
+
+protected:
+    CNvHWEncoder                                        *m_pNvHWEncoder; // allocated in the constructor, deleted in the destructor
+    uint32_t                                             m_uEncodeBufferCount; // slots of m_stEncodeBuffer in use
+    void*                                                m_pDevice; // D3D device or CUDA context, depending on the device type
+#if defined(NV_WINDOWS)
+    IDirect3D9                                          *m_pD3D; // created by InitD3D9, released by Deinitialize
+#endif
+
+    CUcontext                                            m_cuContext; // only ever set to NULL in the visible code
+    EncodeConfig                                         m_stEncoderInput;
+    EncodeBuffer                                         m_stEncodeBuffer[MAX_ENCODE_QUEUE];
+    CNvQueue<EncodeBuffer>                               m_EncodeBufferQueue; // hands out / returns pointers into m_stEncodeBuffer
+    EncodeOutputBuffer                                   m_stEOSOutputBfr; 
+
+protected:
+    NVENCSTATUS                                          Deinitialize(uint32_t devicetype);
+    NVENCSTATUS                                          EncodeFrame(bool bFlush=false, uint32_t width=0, uint32_t height=0);
+    NVENCSTATUS                                          InitD3D9(uint32_t deviceID = 0);
+    NVENCSTATUS                                          InitD3D11(uint32_t deviceID = 0);
+    NVENCSTATUS                                          InitD3D10(uint32_t deviceID = 0);
+    NVENCSTATUS                                          InitCuda(uint32_t deviceID = 0);
+    NVENCSTATUS                                          AllocateIOBuffers(uint32_t uInputWidth, uint32_t uInputHeight,int isYuv444);
+    NVENCSTATUS                                          ReleaseIOBuffers();
+    unsigned char*                                       LockInputBuffer(void * hInputSurface, uint32_t *pLockedPitch); // NOTE(review): no definition is visible in this patch excerpt
+    NVENCSTATUS                                          FlushEncoder();
+    void                                                 ConvertYUVpitchToNV12(unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height, int index);
+    void                                                 ConvertYUVpitchToYUV444(unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height, int index);
+
+};
+
+// Function-pointer type for the NVEncodeAPI entry point, which receives the API function list (presumably NvEncodeAPICreateInstance - confirm)
+typedef NVENCSTATUS (NVENCAPI *MYPROC)(NV_ENCODE_API_FUNCTION_LIST*); 
diff --git a/NvHWEncoder.cpp b/NvHWEncoder.cpp
@@ -0,0 +1,1284 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#include "../inc/NvHWEncoder.h"
+
+// Calls nvEncOpenEncodeSession for the given device and stores the resulting
+// session handle in m_hEncoder. A non-success status trips assert(0); the
+// status is returned to the caller in every case.
+NVENCSTATUS CNvHWEncoder::NvEncOpenEncodeSession(void* device, uint32_t deviceType)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncOpenEncodeSession(device, deviceType, &m_hEncoder);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetEncodeGUIDCount on the current session; the count is
+// written through encodeGUIDCount. Asserts on failure and returns the status.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodeGUIDCount(uint32_t* encodeGUIDCount)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodeGUIDCount(m_hEncoder, encodeGUIDCount);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetEncodeProfileGUIDCount for the given codec GUID.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodeProfileGUIDCount(GUID encodeGUID, uint32_t* encodeProfileGUIDCount)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodeProfileGUIDCount(m_hEncoder, encodeGUID, encodeProfileGUIDCount);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetEncodeProfileGUIDs, passing the caller's array
+// (profileGUIDs, guidArraySize) and output count straight through.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodeProfileGUIDs(GUID encodeGUID, GUID* profileGUIDs, uint32_t guidArraySize, uint32_t* GUIDCount)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodeProfileGUIDs(m_hEncoder, encodeGUID, profileGUIDs, guidArraySize, GUIDCount);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetEncodeGUIDs, passing the caller's array
+// (GUIDs, guidArraySize) and output count straight through.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodeGUIDs(GUID* GUIDs, uint32_t guidArraySize, uint32_t* GUIDCount)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodeGUIDs(m_hEncoder, GUIDs, guidArraySize, GUIDCount);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetInputFormatCount for the given codec GUID.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetInputFormatCount(GUID encodeGUID, uint32_t* inputFmtCount)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetInputFormatCount(m_hEncoder, encodeGUID, inputFmtCount);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetInputFormats, passing the caller's array
+// (inputFmts, inputFmtArraySize) and output count straight through.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetInputFormats(GUID encodeGUID, NV_ENC_BUFFER_FORMAT* inputFmts, uint32_t inputFmtArraySize, uint32_t* inputFmtCount)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetInputFormats(m_hEncoder, encodeGUID, inputFmts, inputFmtArraySize, inputFmtCount);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards a capability query (capsParam) for the given codec GUID to
+// nvEncGetEncodeCaps; the answer is written through capsVal.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodeCaps(GUID encodeGUID, NV_ENC_CAPS_PARAM* capsParam, int* capsVal)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodeCaps(m_hEncoder, encodeGUID, capsParam, capsVal);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetEncodePresetCount for the given codec GUID.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodePresetCount(GUID encodeGUID, uint32_t* encodePresetGUIDCount)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodePresetCount(m_hEncoder, encodeGUID, encodePresetGUIDCount);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetEncodePresetGUIDs, passing the caller's array
+// (presetGUIDs, guidArraySize) and output count straight through.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodePresetGUIDs(GUID encodeGUID, GUID* presetGUIDs, uint32_t guidArraySize, uint32_t* encodePresetGUIDCount)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodePresetGUIDs(m_hEncoder, encodeGUID, presetGUIDs, guidArraySize, encodePresetGUIDCount);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetEncodePresetConfig for the given codec/preset GUID pair;
+// the configuration is written into presetConfig.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodePresetConfig(GUID encodeGUID, GUID  presetGUID, NV_ENC_PRESET_CONFIG* presetConfig)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodePresetConfig(m_hEncoder, encodeGUID, presetGUID, presetConfig);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Requests an input buffer of width x height from nvEncCreateInputBuffer.
+// The buffer format is YUV444 planar when isYuv444 is non-zero, NV12 otherwise.
+// The handle filled in by the API is written to *inputBuffer whether or not
+// the call succeeded; the API status is returned (assert(0) on failure).
+NVENCSTATUS CNvHWEncoder::NvEncCreateInputBuffer(uint32_t width, uint32_t height, void** inputBuffer, uint32_t isYuv444)
+{
+    NV_ENC_CREATE_INPUT_BUFFER params;
+    memset(&params, 0, sizeof(params));
+    SET_VER(params, NV_ENC_CREATE_INPUT_BUFFER);
+
+    params.width = width;
+    params.height = height;
+    params.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
+    if (isYuv444)
+    {
+        params.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL;
+    }
+    else
+    {
+        params.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
+    }
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncCreateInputBuffer(m_hEncoder, &params);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+
+    *inputBuffer = params.inputBuffer;
+    return status;
+}
+
+// Releases an input buffer through nvEncDestroyInputBuffer.
+// A null handle is accepted and reported as NV_ENC_SUCCESS without calling
+// the API. Asserts on API failure and returns the status.
+NVENCSTATUS CNvHWEncoder::NvEncDestroyInputBuffer(NV_ENC_INPUT_PTR inputBuffer)
+{
+    if (!inputBuffer)
+    {
+        return NV_ENC_SUCCESS;
+    }
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncDestroyInputBuffer(m_hEncoder, inputBuffer);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Requests a motion-vector buffer from nvEncCreateMVBuffer and hands the
+// resulting handle back through *bitstreamBuffer (also on failure).
+// The size argument is not consulted by this implementation.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncCreateMVBuffer(uint32_t size, void** bitstreamBuffer)
+{
+    NV_ENC_CREATE_MV_BUFFER mvParams;
+    memset(&mvParams, 0, sizeof(mvParams));
+    SET_VER(mvParams, NV_ENC_CREATE_MV_BUFFER);
+
+    const NVENCSTATUS result = m_pEncodeAPI->nvEncCreateMVBuffer(m_hEncoder, &mvParams);
+    if (result != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+
+    *bitstreamBuffer = mvParams.MVBuffer;
+    return result;
+}
+
+// Releases a motion-vector buffer through nvEncDestroyMVBuffer.
+// Asserts on failure and returns the status reported by the API.
+//
+// The handle is received by value, so this function cannot clear the caller's
+// copy; callers must reset their own handle after a successful destroy.
+// (An unused NV_ENC_CREATE_MV_BUFFER local and a no-op assignment to the
+// by-value parameter were removed.)
+NVENCSTATUS CNvHWEncoder::NvEncDestroyMVBuffer(NV_ENC_OUTPUT_PTR bitstreamBuffer)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncDestroyMVBuffer(m_hEncoder, bitstreamBuffer);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Runs motion estimation between two frames via nvEncRunMotionEstimationOnly.
+//   pEncodeBuffer[0] - supplies the reference frame surface and the output
+//                      buffer that receives the motion vectors
+//   pEncodeBuffer[1] - supplies the input frame surface, format and dimensions
+//   pMEOnly          - not consulted by this implementation
+// Returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvRunMotionEstimationOnly(EncodeBuffer *pEncodeBuffer[2], MEOnlyConfig *pMEOnly)
+{
+    NVENCSTATUS nvStatus;
+    NV_ENC_MEONLY_PARAMS stMEOnlyParams;
+
+    // Zero the whole structure first, as every other parameter block in this
+    // file does; otherwise fields not assigned below reach the driver as
+    // uninitialized stack contents.
+    memset(&stMEOnlyParams, 0, sizeof(stMEOnlyParams));
+    SET_VER(stMEOnlyParams,NV_ENC_MEONLY_PARAMS);
+
+    stMEOnlyParams.referenceFrame = pEncodeBuffer[0]->stInputBfr.hInputSurface;
+    stMEOnlyParams.inputBuffer = pEncodeBuffer[1]->stInputBfr.hInputSurface;
+    stMEOnlyParams.bufferFmt = pEncodeBuffer[1]->stInputBfr.bufferFmt;
+    stMEOnlyParams.inputWidth = pEncodeBuffer[1]->stInputBfr.dwWidth;
+    stMEOnlyParams.inputHeight = pEncodeBuffer[1]->stInputBfr.dwHeight;
+    stMEOnlyParams.outputMV = pEncodeBuffer[0]->stOutputBfr.hBitstreamBuffer;
+    nvStatus = m_pEncodeAPI->nvEncRunMotionEstimationOnly(m_hEncoder, &stMEOnlyParams);
+
+    return nvStatus;
+}
+
+// Requests an output bitstream buffer of the given size from
+// nvEncCreateBitstreamBuffer. The handle filled in by the API is written to
+// *bitstreamBuffer whether or not the call succeeded.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncCreateBitstreamBuffer(uint32_t size, void** bitstreamBuffer)
+{
+    NV_ENC_CREATE_BITSTREAM_BUFFER params;
+    memset(&params, 0, sizeof(params));
+    SET_VER(params, NV_ENC_CREATE_BITSTREAM_BUFFER);
+
+    params.size = size;
+    params.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncCreateBitstreamBuffer(m_hEncoder, &params);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+
+    *bitstreamBuffer = params.bitstreamBuffer;
+    return status;
+}
+
+// Releases an output bitstream buffer through nvEncDestroyBitstreamBuffer.
+// A null handle is accepted and reported as NV_ENC_SUCCESS without calling
+// the API. Asserts on API failure and returns the status.
+NVENCSTATUS CNvHWEncoder::NvEncDestroyBitstreamBuffer(NV_ENC_OUTPUT_PTR bitstreamBuffer)
+{
+    if (!bitstreamBuffer)
+    {
+        return NV_ENC_SUCCESS;
+    }
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncDestroyBitstreamBuffer(m_hEncoder, bitstreamBuffer);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards the caller-prepared lock parameters to nvEncLockBitstream.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncLockBitstream(NV_ENC_LOCK_BITSTREAM* lockBitstreamBufferParams)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncLockBitstream(m_hEncoder, lockBitstreamBufferParams);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards the given bitstream buffer handle to nvEncUnlockBitstream.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncUnlockBitstream(NV_ENC_OUTPUT_PTR bitstreamBuffer)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncUnlockBitstream(m_hEncoder, bitstreamBuffer);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Locks an input buffer through nvEncLockInputBuffer and reports the data
+// pointer and pitch filled in by the API via *bufferDataPtr and *pitch.
+// Both outputs are written even when the call failed.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncLockInputBuffer(void* inputBuffer, void** bufferDataPtr, uint32_t* pitch)
+{
+    NV_ENC_LOCK_INPUT_BUFFER lockParams;
+    memset(&lockParams, 0, sizeof(lockParams));
+    SET_VER(lockParams, NV_ENC_LOCK_INPUT_BUFFER);
+    lockParams.inputBuffer = inputBuffer;
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncLockInputBuffer(m_hEncoder, &lockParams);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+
+    *bufferDataPtr = lockParams.bufferDataPtr;
+    *pitch = lockParams.pitch;
+    return status;
+}
+
+// Forwards the given input buffer handle to nvEncUnlockInputBuffer.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncUnlockInputBuffer(NV_ENC_INPUT_PTR inputBuffer)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncUnlockInputBuffer(m_hEncoder, inputBuffer);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetEncodeStats; results are written into encodeStats.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetEncodeStats(NV_ENC_STAT* encodeStats)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodeStats(m_hEncoder, encodeStats);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Forwards to nvEncGetSequenceParams with the caller-prepared payload.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncGetSequenceParams(NV_ENC_SEQUENCE_PARAM_PAYLOAD* sequenceParamPayload)
+{
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncGetSequenceParams(m_hEncoder, sequenceParamPayload);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Destroys the encoder through nvEncDestroyEncoder if it was initialized.
+// When m_bEncoderInitialized is false nothing is called and NV_ENC_SUCCESS is
+// returned. The flag is cleared regardless of the status the API reports.
+NVENCSTATUS CNvHWEncoder::NvEncDestroyEncoder()
+{
+    if (!m_bEncoderInitialized)
+    {
+        return NV_ENC_SUCCESS;
+    }
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncDestroyEncoder(m_hEncoder);
+    m_bEncoderInitialized = false;
+    return status;
+}
+
+// Invalidates every reference frame listed in pEncPicCommand
+// (refFrameNumbers[0 .. numRefFramesToInvalidate-1]) via
+// nvEncInvalidateRefFrames.
+//
+// All listed frames are attempted even if one call fails. Returns
+// NV_ENC_SUCCESS only when every call succeeded; otherwise returns the status
+// of the first failing call. (Previously only the status of the last call was
+// returned, so an earlier failure was lost when a later call succeeded.)
+NVENCSTATUS CNvHWEncoder::NvEncInvalidateRefFrames(const NvEncPictureCommand *pEncPicCommand)
+{
+    NVENCSTATUS firstFailure = NV_ENC_SUCCESS;
+
+    for (uint32_t i = 0; i < pEncPicCommand->numRefFramesToInvalidate; i++)
+    {
+        const NVENCSTATUS nvStatus = m_pEncodeAPI->nvEncInvalidateRefFrames(m_hEncoder, pEncPicCommand->refFrameNumbers[i]);
+        if (nvStatus != NV_ENC_SUCCESS && firstFailure == NV_ENC_SUCCESS)
+        {
+            firstFailure = nvStatus;
+        }
+    }
+
+    return firstFailure;
+}
+
+// Opens an encode session via nvEncOpenEncodeSessionEx for the given device
+// and device type, requesting NVENCAPI_VERSION. The session handle is stored
+// in m_hEncoder. Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncOpenEncodeSessionEx(void* device, NV_ENC_DEVICE_TYPE deviceType)
+{
+    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS sessionParams;
+    memset(&sessionParams, 0, sizeof(sessionParams));
+    SET_VER(sessionParams, NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS);
+
+    sessionParams.apiVersion = NVENCAPI_VERSION;
+    sessionParams.deviceType = deviceType;
+    sessionParams.device = device;
+    sessionParams.reserved = NULL;
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncOpenEncodeSessionEx(&sessionParams, &m_hEncoder);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+// Applies a pending bitrate and/or resolution change from pEncPicCommand by
+// updating the stored create/config parameters and calling
+// nvEncReconfigureEncoder. A resolution change also forces an IDR frame.
+//
+// Returns NV_ENC_SUCCESS when nothing is pending or the reconfigure succeeded,
+// NV_ENC_ERR_INVALID_PARAM when the request is rejected locally, otherwise the
+// status reported by the API (assert(0) on API failure).
+//
+// The whole request is validated BEFORE any member is modified, so a rejected
+// request leaves m_uCurWidth/m_uCurHeight and the stored parameters untouched.
+// (Previously the current size was overwritten before the range check.)
+NVENCSTATUS CNvHWEncoder::NvEncReconfigureEncoder(const NvEncPictureCommand *pEncPicCommand)
+{
+    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;
+
+    if (pEncPicCommand->bBitrateChangePending || pEncPicCommand->bResolutionChangePending)
+    {
+        const uint32_t requestedWidth = pEncPicCommand->newWidth;
+        const uint32_t requestedHeight = pEncPicCommand->newHeight;
+
+        // The new size must fit within the maximum the encoder was created with.
+        if (pEncPicCommand->bResolutionChangePending &&
+            ((requestedWidth > m_uMaxWidth) || (requestedHeight > m_uMaxHeight)))
+        {
+            return NV_ENC_ERR_INVALID_PARAM;
+        }
+
+        // Without an explicit VBV size one frame's worth of bits is derived by
+        // dividing by frameRateNum; refuse rather than divide by zero.
+        if (pEncPicCommand->bBitrateChangePending &&
+            pEncPicCommand->newVBVSize == 0 &&
+            m_stCreateEncodeParams.frameRateNum == 0)
+        {
+            return NV_ENC_ERR_INVALID_PARAM;
+        }
+
+        if (pEncPicCommand->bResolutionChangePending)
+        {
+            m_uCurWidth = requestedWidth;
+            m_uCurHeight = requestedHeight;
+            m_stCreateEncodeParams.encodeWidth = m_uCurWidth;
+            m_stCreateEncodeParams.encodeHeight = m_uCurHeight;
+            m_stCreateEncodeParams.darWidth = m_uCurWidth;
+            m_stCreateEncodeParams.darHeight = m_uCurHeight;
+        }
+
+        if (pEncPicCommand->bBitrateChangePending)
+        {
+            m_stEncodeConfig.rcParams.averageBitRate = pEncPicCommand->newBitrate;
+            m_stEncodeConfig.rcParams.maxBitRate = pEncPicCommand->newBitrate;
+            m_stEncodeConfig.rcParams.vbvBufferSize = pEncPicCommand->newVBVSize != 0 ? pEncPicCommand->newVBVSize : (pEncPicCommand->newBitrate * m_stCreateEncodeParams.frameRateDen) / m_stCreateEncodeParams.frameRateNum;
+            m_stEncodeConfig.rcParams.vbvInitialDelay = m_stEncodeConfig.rcParams.vbvBufferSize;
+        }
+
+        NV_ENC_RECONFIGURE_PARAMS stReconfigParams;
+        memset(&stReconfigParams, 0, sizeof(stReconfigParams));
+        memcpy(&stReconfigParams.reInitEncodeParams, &m_stCreateEncodeParams, sizeof(m_stCreateEncodeParams));
+        stReconfigParams.version = NV_ENC_RECONFIGURE_PARAMS_VER;
+        stReconfigParams.forceIDR = pEncPicCommand->bResolutionChangePending ? 1 : 0;
+
+        nvStatus = m_pEncodeAPI->nvEncReconfigureEncoder(m_hEncoder, &stReconfigParams);
+        if (nvStatus != NV_ENC_SUCCESS)
+        {
+            assert(0);
+        }
+    }
+
+    return nvStatus;
+}
+
+// Puts the object into its "nothing loaded, nothing initialized" state:
+// null handles, zeroed counters and dimensions, and zeroed parameter
+// structures carrying only their version tags.
+CNvHWEncoder::CNvHWEncoder()
+{
+    // library / API / session handles
+    m_hinstLib = NULL;
+    m_pEncodeAPI = NULL;
+    m_hEncoder = NULL;
+    m_bEncoderInitialized = false;
+
+    // frame counter and current / maximum encode dimensions
+    m_EncodeIdx = 0;
+    m_uCurWidth = m_uCurHeight = 0;
+    m_uMaxWidth = m_uMaxHeight = 0;
+
+    // persistent parameter blocks
+    memset(&m_stEncodeConfig, 0, sizeof(m_stEncodeConfig));
+    SET_VER(m_stEncodeConfig, NV_ENC_CONFIG);
+
+    memset(&m_stCreateEncodeParams, 0, sizeof(m_stCreateEncodeParams));
+    SET_VER(m_stCreateEncodeParams, NV_ENC_INITIALIZE_PARAMS);
+}
+
+// Frees the API function table and unloads the encode library if it was
+// loaded (FreeLibrary under NV_WINDOWS, dlclose otherwise).
+CNvHWEncoder::~CNvHWEncoder()
+{
+    // deleting a null pointer is a no-op, so no guard is needed
+    delete m_pEncodeAPI;
+    m_pEncodeAPI = NULL;
+
+    if (!m_hinstLib)
+    {
+        return;
+    }
+
+#if defined (NV_WINDOWS)
+    FreeLibrary(m_hinstLib);
+#else
+    dlclose(m_hinstLib);
+#endif
+    m_hinstLib = NULL;
+}
+
+// Checks whether inputCodecGuid is among the codec GUIDs reported by
+// nvEncGetEncodeGUIDs for the current session.
+// Returns NV_ENC_SUCCESS when found, NV_ENC_ERR_INVALID_PARAM when not, or the
+// failing API status (after assert(0)) if either query fails.
+NVENCSTATUS CNvHWEncoder::ValidateEncodeGUID (GUID inputCodecGuid)
+{
+    // first ask how many GUIDs there are, then fetch them into a scratch array
+    unsigned int capacity;
+    NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodeGUIDCount(m_hEncoder, &capacity);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+        return status;
+    }
+
+    GUID *supported = new GUID[capacity];
+    memset(supported, 0, sizeof(GUID) * capacity);
+
+    unsigned int returned = 0;
+    status = m_pEncodeAPI->nvEncGetEncodeGUIDs(m_hEncoder, supported, capacity, &returned);
+    if (status != NV_ENC_SUCCESS)
+    {
+        delete[] supported;
+        assert(0);
+        return status;
+    }
+
+    assert(returned <= capacity);
+
+    bool matched = false;
+    for (unsigned int idx = 0; idx < returned && !matched; idx++)
+    {
+        if (inputCodecGuid == supported[idx])
+        {
+            matched = true;
+        }
+    }
+
+    delete[] supported;
+
+    return matched ? NV_ENC_SUCCESS : NV_ENC_ERR_INVALID_PARAM;
+}
+
+// Checks whether inputPresetGuid is among the preset GUIDs reported by
+// nvEncGetEncodePresetGUIDs for codec inputCodecGuid.
+// Returns NV_ENC_SUCCESS when found, NV_ENC_ERR_INVALID_PARAM when not, or the
+// failing API status (after assert(0)) if either query fails.
+NVENCSTATUS CNvHWEncoder::ValidatePresetGUID(GUID inputPresetGuid, GUID inputCodecGuid)
+{
+    // first ask how many presets there are, then fetch them into a scratch array
+    uint32_t capacity;
+    NVENCSTATUS status = m_pEncodeAPI->nvEncGetEncodePresetCount(m_hEncoder, inputCodecGuid, &capacity);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+        return status;
+    }
+
+    GUID *presets = new GUID[capacity];
+    memset(presets, 0, sizeof(GUID) * capacity);
+
+    uint32_t returned = 0;
+    status = m_pEncodeAPI->nvEncGetEncodePresetGUIDs(m_hEncoder, inputCodecGuid, presets, capacity, &returned);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+        delete[] presets;
+        return status;
+    }
+
+    assert(returned <= capacity);
+
+    bool matched = false;
+    for (uint32_t idx = 0; idx < returned && !matched; idx++)
+    {
+        if (inputPresetGuid == presets[idx])
+        {
+            matched = true;
+        }
+    }
+
+    delete[] presets;
+
+    return matched ? NV_ENC_SUCCESS : NV_ENC_ERR_INVALID_PARAM;
+}
+
+// Configures and initializes the hardware encoder from pEncCfg.
+//
+// Sequence: validate dimensions and codec, fill m_stCreateEncodeParams, load
+// the preset configuration into m_stEncodeConfig, overlay the caller's
+// settings (GOP, field mode, rate control, QP, chroma format, intra refresh,
+// reference-frame invalidation, QP delta map, IDR period), optionally verify
+// motion-estimation-only support, then call nvEncInitializeEncoder.
+//
+// Returns NV_ENC_SUCCESS and sets m_bEncoderInitialized on success.
+// Returns NV_ENC_ERR_INVALID_PARAM for a null/invalid configuration,
+// NV_ENC_ERR_UNSUPPORTED_DEVICE when ME-only mode is requested but not
+// reported as supported, or the failing status of an API call.
+NVENCSTATUS CNvHWEncoder::CreateEncoder(const EncodeConfig *pEncCfg)
+{
+    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;
+
+    if (pEncCfg == NULL)
+    {
+        return NV_ENC_ERR_INVALID_PARAM;
+    }
+
+    m_uCurWidth = pEncCfg->width;
+    m_uCurHeight = pEncCfg->height;
+
+    // When no maximum size is given, the initial size doubles as the maximum.
+    m_uMaxWidth = (pEncCfg->maxWidth > 0 ? pEncCfg->maxWidth : pEncCfg->width);
+    m_uMaxHeight = (pEncCfg->maxHeight > 0 ? pEncCfg->maxHeight : pEncCfg->height);
+
+    if ((m_uCurWidth > m_uMaxWidth) || (m_uCurHeight > m_uMaxHeight)) {
+        return NV_ENC_ERR_INVALID_PARAM;
+    }
+
+    if (!pEncCfg->width || !pEncCfg->height)
+    {
+        return NV_ENC_ERR_INVALID_PARAM;
+    }
+
+    if (pEncCfg->isYuv444 && (pEncCfg->codec == NV_ENC_HEVC))
+    {
+        PRINTERR("444 is not supported with HEVC \n");
+        return NV_ENC_ERR_INVALID_PARAM;
+    }
+
+    // Any codec value other than NV_ENC_H264 is treated as HEVC here.
+    GUID inputCodecGUID = pEncCfg->codec == NV_ENC_H264 ? NV_ENC_CODEC_H264_GUID : NV_ENC_CODEC_HEVC_GUID;
+    nvStatus = ValidateEncodeGUID(inputCodecGUID);
+    if (nvStatus != NV_ENC_SUCCESS)
+    {
+        PRINTERR("codec not supported \n");
+        return nvStatus;
+    }
+
+    // Remembered so NvEncEncodeFrame can pick the per-codec picture parameters.
+    codecGUID = inputCodecGUID;
+
+    m_stCreateEncodeParams.encodeGUID = inputCodecGUID;
+    m_stCreateEncodeParams.presetGUID = pEncCfg->presetGUID;
+    m_stCreateEncodeParams.encodeWidth = pEncCfg->width;
+    m_stCreateEncodeParams.encodeHeight = pEncCfg->height;
+
+    m_stCreateEncodeParams.darWidth = pEncCfg->width;
+    m_stCreateEncodeParams.darHeight = pEncCfg->height;
+    m_stCreateEncodeParams.frameRateNum = pEncCfg->fps;
+    m_stCreateEncodeParams.frameRateDen = 1;
+    m_stCreateEncodeParams.enableEncodeAsync = 0;
+    m_stCreateEncodeParams.enablePTD = 1;
+    m_stCreateEncodeParams.reportSliceOffsets = 0;
+    m_stCreateEncodeParams.enableSubFrameWrite = 0;
+    // The init params reference the member config, which is filled in below.
+    m_stCreateEncodeParams.encodeConfig = &m_stEncodeConfig;
+    m_stCreateEncodeParams.maxEncodeWidth = m_uMaxWidth;
+    m_stCreateEncodeParams.maxEncodeHeight = m_uMaxHeight;
+
+    // apply preset
+    NV_ENC_PRESET_CONFIG stPresetCfg;
+    memset(&stPresetCfg, 0, sizeof(NV_ENC_PRESET_CONFIG));
+    SET_VER(stPresetCfg, NV_ENC_PRESET_CONFIG);
+    SET_VER(stPresetCfg.presetCfg, NV_ENC_CONFIG);
+
+    nvStatus = m_pEncodeAPI->nvEncGetEncodePresetConfig(m_hEncoder, m_stCreateEncodeParams.encodeGUID, m_stCreateEncodeParams.presetGUID, &stPresetCfg);
+    if (nvStatus != NV_ENC_SUCCESS)
+    {
+        PRINTERR("nvEncGetEncodePresetConfig returned failure");
+        return nvStatus;
+    }
+    // Start from the preset's configuration; everything below overrides it.
+    memcpy(&m_stEncodeConfig, &stPresetCfg.presetCfg, sizeof(NV_ENC_CONFIG));
+
+    m_stEncodeConfig.gopLength = pEncCfg->gopLength;
+    m_stEncodeConfig.frameIntervalP = pEncCfg->numB + 1;
+    if (pEncCfg->pictureStruct == NV_ENC_PIC_STRUCT_FRAME)
+    {
+        m_stEncodeConfig.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
+    }
+    else
+    {
+        m_stEncodeConfig.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
+    }
+
+    m_stEncodeConfig.mvPrecision = NV_ENC_MV_PRECISION_QUARTER_PEL;
+
+    // A non-zero bitrate or max bitrate selects the caller's rate-control mode;
+    // with neither set the encoder is put into constant-QP mode.
+    if (pEncCfg->bitrate || pEncCfg->vbvMaxBitrate)
+    {
+        m_stEncodeConfig.rcParams.rateControlMode = (NV_ENC_PARAMS_RC_MODE)pEncCfg->rcMode;
+        m_stEncodeConfig.rcParams.averageBitRate = pEncCfg->bitrate;
+        m_stEncodeConfig.rcParams.maxBitRate = pEncCfg->vbvMaxBitrate;
+        m_stEncodeConfig.rcParams.vbvBufferSize = pEncCfg->vbvSize;
+        m_stEncodeConfig.rcParams.vbvInitialDelay = pEncCfg->vbvSize * 9 / 10;
+    }
+    else
+    {
+        m_stEncodeConfig.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
+    }
+
+    // NOTE(review): rcMode == 0 presumably means NV_ENC_PARAMS_RC_CONSTQP - confirm.
+    // The lossless HP preset forces all three QPs to 0; otherwise the caller's qp is used.
+    if (pEncCfg->rcMode == 0)
+    {
+        m_stEncodeConfig.rcParams.constQP.qpInterP = pEncCfg->presetGUID == NV_ENC_PRESET_LOSSLESS_HP_GUID? 0 : pEncCfg->qp;
+        m_stEncodeConfig.rcParams.constQP.qpInterB = pEncCfg->presetGUID == NV_ENC_PRESET_LOSSLESS_HP_GUID? 0 : pEncCfg->qp;
+        m_stEncodeConfig.rcParams.constQP.qpIntra = pEncCfg->presetGUID == NV_ENC_PRESET_LOSSLESS_HP_GUID? 0 : pEncCfg->qp;
+    }
+
+    // set up initial QP value
+    if (pEncCfg->rcMode == NV_ENC_PARAMS_RC_VBR || pEncCfg->rcMode == NV_ENC_PARAMS_RC_VBR_MINQP ||
+        pEncCfg->rcMode == NV_ENC_PARAMS_RC_2_PASS_VBR) {
+        m_stEncodeConfig.rcParams.enableInitialRCQP = 1;
+        m_stEncodeConfig.rcParams.initialRCQP.qpInterP  = pEncCfg->qp;
+        // Intra/B QPs are derived from qp via factor and offset only when both
+        // factors are non-zero; otherwise all three use qp unchanged.
+        if(pEncCfg->i_quant_factor != 0.0 && pEncCfg->b_quant_factor != 0.0) {               
+            m_stEncodeConfig.rcParams.initialRCQP.qpIntra = (int)(pEncCfg->qp * FABS(pEncCfg->i_quant_factor) + pEncCfg->i_quant_offset);
+            m_stEncodeConfig.rcParams.initialRCQP.qpInterB = (int)(pEncCfg->qp * FABS(pEncCfg->b_quant_factor) + pEncCfg->b_quant_offset);
+        } else {
+            m_stEncodeConfig.rcParams.initialRCQP.qpIntra = pEncCfg->qp;
+            m_stEncodeConfig.rcParams.initialRCQP.qpInterB = pEncCfg->qp;
+        }
+
+    }
+
+    // NOTE(review): chromaFormatIDC is written through h264Config regardless of
+    // the selected codec. If encodeCodecConfig is a union this also writes into
+    // hevcConfig's storage when codec is HEVC - confirm this is intended.
+    if (pEncCfg->isYuv444)
+    {
+        m_stEncodeConfig.encodeCodecConfig.h264Config.chromaFormatIDC = 3;
+    }
+    else
+    {
+        m_stEncodeConfig.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
+    }
+
+    if (pEncCfg->intraRefreshEnableFlag)
+    {
+        if (pEncCfg->codec == NV_ENC_HEVC)
+        {
+            m_stEncodeConfig.encodeCodecConfig.hevcConfig.enableIntraRefresh = 1;
+            m_stEncodeConfig.encodeCodecConfig.hevcConfig.intraRefreshPeriod = pEncCfg->intraRefreshPeriod;
+            m_stEncodeConfig.encodeCodecConfig.hevcConfig.intraRefreshCnt = pEncCfg->intraRefreshDuration;
+        }
+        else
+        {
+            m_stEncodeConfig.encodeCodecConfig.h264Config.enableIntraRefresh = 1;
+            m_stEncodeConfig.encodeCodecConfig.h264Config.intraRefreshPeriod = pEncCfg->intraRefreshPeriod;
+            m_stEncodeConfig.encodeCodecConfig.h264Config.intraRefreshCnt = pEncCfg->intraRefreshDuration;
+        }
+    }
+
+    // Reference-frame invalidation sets the reference frame limit to 16.
+    if (pEncCfg->invalidateRefFramesEnableFlag)
+    {
+        if (pEncCfg->codec == NV_ENC_HEVC)
+        {
+            m_stEncodeConfig.encodeCodecConfig.hevcConfig.maxNumRefFramesInDPB = 16;
+        }
+        else
+        {
+            m_stEncodeConfig.encodeCodecConfig.h264Config.maxNumRefFrames = 16;
+        }
+    }
+
+    if (pEncCfg->qpDeltaMapFile)
+    {
+        m_stEncodeConfig.rcParams.enableExtQPDeltaMap = 1;
+    }
+    // The IDR period follows the GOP length for both codecs.
+    if (pEncCfg->codec == NV_ENC_H264)
+    {
+        m_stEncodeConfig.encodeCodecConfig.h264Config.idrPeriod = pEncCfg->gopLength;
+    }
+    else if (pEncCfg->codec == NV_ENC_HEVC)
+    {
+        m_stEncodeConfig.encodeCodecConfig.hevcConfig.idrPeriod = pEncCfg->gopLength;
+    }
+
+    // Motion-estimation-only mode: enable it in the init params and make sure
+    // the capability query reports support before initializing.
+    if (pEncCfg->enableMEOnly == 1 || pEncCfg->enableMEOnly == 2)
+    {
+        NV_ENC_CAPS_PARAM stCapsParam;
+        memset(&stCapsParam, 0, sizeof(NV_ENC_CAPS_PARAM));
+        SET_VER(stCapsParam, NV_ENC_CAPS_PARAM);
+        stCapsParam.capsToQuery = NV_ENC_CAPS_SUPPORT_MEONLY_MODE;
+        m_stCreateEncodeParams.enableMEOnlyMode =  true;
+        int meonlyMode = 0;
+        nvStatus = m_pEncodeAPI->nvEncGetEncodeCaps(m_hEncoder, m_stCreateEncodeParams.encodeGUID, &stCapsParam, &meonlyMode);
+        if (nvStatus != NV_ENC_SUCCESS)
+        {
+            // NOTE(review): this is a capability-query failure, yet the message
+            // is the same text used for nvEncInitializeEncoder failure below.
+            PRINTERR("Encode Session Initialization failed");
+            return nvStatus;
+        }
+        else
+        {
+            if (meonlyMode == 1)
+            {
+                printf("NV_ENC_CAPS_SUPPORT_MEONLY_MODE  supported\n");
+            }
+            else
+            {
+                PRINTERR("NV_ENC_CAPS_SUPPORT_MEONLY_MODE not supported\n");
+                return NV_ENC_ERR_UNSUPPORTED_DEVICE;
+            }
+        } 
+    }
+
+    nvStatus = m_pEncodeAPI->nvEncInitializeEncoder(m_hEncoder, &m_stCreateEncodeParams);
+    if (nvStatus != NV_ENC_SUCCESS)
+    {
+        PRINTERR("Encode Session Initialization failed");
+        return nvStatus;
+    }
+    // Only now may NvEncDestroyEncoder actually destroy the encoder.
+    m_bEncoderInitialized = true;
+
+    return nvStatus;
+}
+
+// Maps a preset name to its preset GUID and checks that the encoder reports
+// that preset for the selected codec.
+//
+//   encoderPreset - case-insensitive name: "hq", "hp", "lowLatencyHP",
+//                   "lowLatencyHQ" or "lossless"; may be NULL
+//   codec         - NV_ENC_H264 selects H.264, any other value selects HEVC
+//
+// Returns the matching GUID. Falls back to NV_ENC_PRESET_DEFAULT_GUID when the
+// name is NULL, is not recognised (an error is printed), or the preset fails
+// ValidatePresetGUID for the codec (an error is printed).
+GUID CNvHWEncoder::GetPresetGUID(char* encoderPreset, int codec)
+{
+    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;
+    GUID presetGUID = NV_ENC_PRESET_DEFAULT_GUID;
+
+    if (encoderPreset)
+    {
+        if (stricmp(encoderPreset, "hq") == 0)
+        {
+            presetGUID = NV_ENC_PRESET_HQ_GUID;
+        }
+        else if (stricmp(encoderPreset, "lowLatencyHP") == 0)
+        {
+            presetGUID = NV_ENC_PRESET_LOW_LATENCY_HP_GUID;
+        }
+        else if (stricmp(encoderPreset, "hp") == 0)
+        {
+            presetGUID = NV_ENC_PRESET_HP_GUID;
+        }
+        else if (stricmp(encoderPreset, "lowLatencyHQ") == 0)
+        {
+            presetGUID = NV_ENC_PRESET_LOW_LATENCY_HQ_GUID;
+        }
+        else if (stricmp(encoderPreset, "lossless") == 0)
+        {
+            presetGUID = NV_ENC_PRESET_LOSSLESS_HP_GUID;
+        }
+        else
+        {
+            PRINTERR("Unsupported preset guid %s\n", encoderPreset);
+        }
+    }
+
+    GUID inputCodecGUID = codec == NV_ENC_H264 ? NV_ENC_CODEC_H264_GUID : NV_ENC_CODEC_HEVC_GUID;
+    nvStatus = ValidatePresetGUID(presetGUID, inputCodecGUID);
+    if (nvStatus != NV_ENC_SUCCESS)
+    {
+        presetGUID = NV_ENC_PRESET_DEFAULT_GUID;
+        // encoderPreset may be NULL here (no name given and the default preset
+        // failed validation); a null pointer must not be passed to %s.
+        PRINTERR("Unsupported preset guid %s\n", encoderPreset ? encoderPreset : "default");
+    }
+
+    return presetGUID;
+}
+
+// Waits for the encoded output of pEncodeBuffer by locking its bitstream
+// buffer (doNotWait = false) and immediately unlocking it again; the locked
+// data is not read here.
+//
+// An end-of-stream buffer returns NV_ENC_SUCCESS without touching the API.
+// A non-EOS buffer without a bitstream handle yields NV_ENC_ERR_INVALID_PARAM.
+// Otherwise returns the lock status on lock failure, else the unlock status.
+NVENCSTATUS CNvHWEncoder::ProcessOutput(const EncodeBuffer *pEncodeBuffer)
+{
+    if (pEncodeBuffer->stOutputBfr.bEOSFlag)
+    {
+        return NV_ENC_SUCCESS;
+    }
+
+    if (pEncodeBuffer->stOutputBfr.hBitstreamBuffer == NULL)
+    {
+        return NV_ENC_ERR_INVALID_PARAM;
+    }
+
+    NV_ENC_LOCK_BITSTREAM lockParams;
+    memset(&lockParams, 0, sizeof(lockParams));
+    SET_VER(lockParams, NV_ENC_LOCK_BITSTREAM);
+    lockParams.outputBitstream = pEncodeBuffer->stOutputBfr.hBitstreamBuffer;
+    lockParams.doNotWait = false;
+
+    const NVENCSTATUS lockStatus = m_pEncodeAPI->nvEncLockBitstream(m_hEncoder, &lockParams);
+    if (lockStatus != NV_ENC_SUCCESS)
+    {
+        PRINTERR("lock bitstream function failed \n");
+        return lockStatus;
+    }
+
+    return m_pEncodeAPI->nvEncUnlockBitstream(m_hEncoder, pEncodeBuffer->stOutputBfr.hBitstreamBuffer);
+}
+
+// Loads the NVENC library, resolves "NvEncodeAPICreateInstance", fills a newly
+// allocated NV_ENCODE_API_FUNCTION_LIST through it and opens an encode session
+// on the given device.
+//
+// Returns NV_ENC_ERR_OUT_OF_MEMORY when the library cannot be loaded, the entry
+// point cannot be resolved, or the function list pointer is null; otherwise
+// the status of instance creation or, finally, of opening the session.
+NVENCSTATUS CNvHWEncoder::Initialize(void* device, NV_ENC_DEVICE_TYPE deviceType)
+{
+    // Library name depends on platform and, on Windows, on pointer width.
+#if defined(NV_WINDOWS)
+#if defined (_WIN64)
+    m_hinstLib = LoadLibrary(TEXT("nvEncodeAPI64.dll"));
+#else
+    m_hinstLib = LoadLibrary(TEXT("nvEncodeAPI.dll"));
+#endif
+#else
+    m_hinstLib = dlopen("libnvidia-encode.so.1", RTLD_LAZY);
+#endif
+    if (!m_hinstLib)
+    {
+        return NV_ENC_ERR_OUT_OF_MEMORY;
+    }
+
+    // function pointer to create instance in nvEncodeAPI
+#if defined(NV_WINDOWS)
+    MYPROC createInstance = (MYPROC)GetProcAddress(m_hinstLib, "NvEncodeAPICreateInstance");
+#else
+    MYPROC createInstance = (MYPROC)dlsym(m_hinstLib, "NvEncodeAPICreateInstance");
+#endif
+    if (!createInstance)
+    {
+        return NV_ENC_ERR_OUT_OF_MEMORY;
+    }
+
+    m_pEncodeAPI = new NV_ENCODE_API_FUNCTION_LIST;
+    if (!m_pEncodeAPI)
+    {
+        return NV_ENC_ERR_OUT_OF_MEMORY;
+    }
+
+    memset(m_pEncodeAPI, 0, sizeof(NV_ENCODE_API_FUNCTION_LIST));
+    m_pEncodeAPI->version = NV_ENCODE_API_FUNCTION_LIST_VER;
+
+    const NVENCSTATUS createStatus = createInstance(m_pEncodeAPI);
+    if (createStatus != NV_ENC_SUCCESS)
+    {
+        return createStatus;
+    }
+
+    return NvEncOpenEncodeSessionEx(device, deviceType);
+}
+
+// Submits one picture to nvEncEncodePicture.
+//
+// The input surface, buffer format and output bitstream come from
+// pEncodeBuffer; width/height, picture structure and the optional QP delta
+// map come from the arguments. m_EncodeIdx is used as the input timestamp and
+// is incremented after an accepted submission. encPicCommand (may be null) can
+// request a forced IDR and/or a forced intra refresh.
+//
+// NV_ENC_SUCCESS and NV_ENC_ERR_NEED_MORE_INPUT from the API are both reported
+// as NV_ENC_SUCCESS; any other status trips assert(0) and is returned as is.
+NVENCSTATUS CNvHWEncoder::NvEncEncodeFrame(EncodeBuffer *pEncodeBuffer, NvEncPictureCommand *encPicCommand,
+                                           uint32_t width, uint32_t height, NV_ENC_PIC_STRUCT ePicStruct,
+                                           int8_t *qpDeltaMapArray, uint32_t qpDeltaMapArraySize)
+{
+    NV_ENC_PIC_PARAMS picParams;
+    memset(&picParams, 0, sizeof(picParams));
+    SET_VER(picParams, NV_ENC_PIC_PARAMS);
+
+    // input side
+    picParams.inputBuffer = pEncodeBuffer->stInputBfr.hInputSurface;
+    picParams.bufferFmt = pEncodeBuffer->stInputBfr.bufferFmt;
+    picParams.inputWidth = width;
+    picParams.inputHeight = height;
+    picParams.pictureStruct = ePicStruct;
+    picParams.inputTimeStamp = m_EncodeIdx;
+
+    // output side
+    picParams.outputBitstream = pEncodeBuffer->stOutputBfr.hBitstreamBuffer;
+    picParams.completionEvent = nullptr;
+
+    // optional per-picture QP delta map
+    picParams.qpDeltaMap = qpDeltaMapArray;
+    picParams.qpDeltaMapSize = qpDeltaMapArraySize;
+
+    if (encPicCommand && encPicCommand->bForceIDR)
+    {
+        picParams.encodePicFlags |= NV_ENC_PIC_FLAG_FORCEIDR;
+    }
+
+    if (encPicCommand && encPicCommand->bForceIntraRefresh)
+    {
+        // the refresh duration goes into the picture params of the active codec
+        if (codecGUID == NV_ENC_CODEC_HEVC_GUID)
+        {
+            picParams.codecPicParams.hevcPicParams.forceIntraRefreshWithFrameCnt = encPicCommand->intraRefreshDuration;
+        }
+        else
+        {
+            picParams.codecPicParams.h264PicParams.forceIntraRefreshWithFrameCnt = encPicCommand->intraRefreshDuration;
+        }
+    }
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncEncodePicture(m_hEncoder, &picParams);
+    const bool accepted = (status == NV_ENC_SUCCESS) || (status == NV_ENC_ERR_NEED_MORE_INPUT);
+    if (!accepted)
+    {
+        assert(0);
+        return status;
+    }
+
+    m_EncodeIdx++;
+
+    return NV_ENC_SUCCESS;
+}
+
+// Sends an end-of-stream picture (NV_ENC_PIC_FLAG_EOS) to nvEncEncodePicture,
+// with hEOSEvent as its completion event.
+// Asserts on failure and returns the status reported by the API.
+NVENCSTATUS CNvHWEncoder::NvEncFlushEncoderQueue(void *hEOSEvent)
+{
+    NV_ENC_PIC_PARAMS eosParams;
+    memset(&eosParams, 0, sizeof(eosParams));
+    SET_VER(eosParams, NV_ENC_PIC_PARAMS);
+
+    eosParams.completionEvent = hEOSEvent;
+    eosParams.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
+
+    const NVENCSTATUS status = m_pEncodeAPI->nvEncEncodePicture(m_hEncoder, &eosParams);
+    if (status != NV_ENC_SUCCESS)
+    {
+        assert(0);
+    }
+    return status;
+}
+
+namespace
+{
+    // Table rows pairing a command-line switch with the EncodeConfig member it fills.
+    struct StringSwitch  { const char *name; char *EncodeConfig::*field; };
+    struct IntSwitch     { const char *name; int EncodeConfig::*field; };
+    struct FloatSwitch   { const char *name; float EncodeConfig::*field; };
+    struct IntPairSwitch { const char *name; int EncodeConfig::*first; int EncodeConfig::*second; };
+
+    // Switches whose value is stored as the argv pointer itself (no copy is made).
+    const StringSwitch kStringSwitches[] =
+    {
+        { "-bmpfilePath",    &EncodeConfig::inputFilePath  },
+        { "-i",              &EncodeConfig::inputFileName  },
+        { "-o",              &EncodeConfig::outputFileName },
+        { "-preset",         &EncodeConfig::encoderPreset  },
+        { "-encCmdFile",     &EncodeConfig::encCmdFileName },
+        { "-qpDeltaMapFile", &EncodeConfig::qpDeltaMapFile },
+    };
+
+    // Switches followed by one value parsed with "%d".
+    const IntSwitch kIntSwitches[] =
+    {
+        { "-bitrate",              &EncodeConfig::bitrate                },
+        { "-vbvMaxBitrate",        &EncodeConfig::vbvMaxBitrate          },
+        { "-vbvSize",              &EncodeConfig::vbvSize                },
+        { "-fps",                  &EncodeConfig::fps                    },
+        { "-startf",               &EncodeConfig::startFrameIdx          },
+        { "-endf",                 &EncodeConfig::endFrameIdx            },
+        { "-rcmode",               &EncodeConfig::rcMode                 },
+        { "-goplength",            &EncodeConfig::gopLength              },
+        { "-numB",                 &EncodeConfig::numB                   },
+        { "-qp",                   &EncodeConfig::qp                     },
+        { "-devicetype",           &EncodeConfig::deviceType             },
+        { "-codec",                &EncodeConfig::codec                  },
+        { "-intraRefresh",         &EncodeConfig::intraRefreshEnableFlag },
+        { "-intraRefreshPeriod",   &EncodeConfig::intraRefreshPeriod     },
+        { "-intraRefreshDuration", &EncodeConfig::intraRefreshDuration   },
+        { "-picStruct",            &EncodeConfig::pictureStruct          },
+        { "-deviceID",             &EncodeConfig::deviceID               },
+        { "-yuv444",               &EncodeConfig::isYuv444               },
+        { "-meonly",               &EncodeConfig::enableMEOnly           },
+        { "-preloadedFrameCount",  &EncodeConfig::preloadedFrameCount    },
+    };
+
+    // Switches followed by one value parsed with "%f".
+    const FloatSwitch kFloatSwitches[] =
+    {
+        { "-i_qfactor", &EncodeConfig::i_quant_factor },
+        { "-b_qfactor", &EncodeConfig::b_quant_factor },
+        { "-i_qoffset", &EncodeConfig::i_quant_offset },
+        { "-b_qoffset", &EncodeConfig::b_quant_offset },
+    };
+
+    // Switches followed by two values, each parsed with "%d".
+    const IntPairSwitch kIntPairSwitches[] =
+    {
+        { "-size",    &EncodeConfig::width,    &EncodeConfig::height    },
+        { "-maxSize", &EncodeConfig::maxWidth, &EncodeConfig::maxHeight },
+    };
+
+    // Returns the table row whose name matches `name` ignoring case, or nullptr.
+    template <typename Switch, size_t Count>
+    const Switch *FindSwitch(const Switch (&table)[Count], const char *name)
+    {
+        for (size_t row = 0; row < Count; row++)
+        {
+            if (stricmp(name, table[row].name) == 0)
+            {
+                return &table[row];
+            }
+        }
+        return nullptr;
+    }
+}
+
+// Fills *encodeConfig from the command line.
+//
+// argv[0] is skipped. Every remaining argument must be a known switch
+// (matched case-insensitively) followed by the value(s) it requires.
+//
+// Returns NV_ENC_SUCCESS when the whole command line was consumed, and
+// NV_ENC_ERR_INVALID_PARAM when:
+//   - a switch is unknown,
+//   - a value is missing or does not parse,
+//   - "-meonly" is given a value other than 1 or 2,
+//   - "-preloadedFrameCount" is given a value below 2,
+//   - "-help" is present (returned without printing anything here).
+//
+// Fields assigned before the failing switch keep their new values. String
+// fields point into argv, so argv must outlive the configuration.
+NVENCSTATUS CNvHWEncoder::ParseArguments(EncodeConfig *encodeConfig, int argc, char *argv[])
+{
+    for (int i = 1; i < argc; i++)
+    {
+        // Remember the switch itself: i moves on to its value(s) below, and
+        // every diagnostic names the switch rather than the value.
+        const char *option = argv[i];
+
+        if (stricmp(option, "-help") == 0)
+        {
+            return NV_ENC_ERR_INVALID_PARAM;
+        }
+
+        bool valueOk = false;
+
+        if (const StringSwitch *strSwitch = FindSwitch(kStringSwitches, option))
+        {
+            valueOk = (++i < argc);
+            if (valueOk)
+            {
+                encodeConfig->*(strSwitch->field) = argv[i];
+            }
+        }
+        else if (const IntSwitch *intSwitch = FindSwitch(kIntSwitches, option))
+        {
+            valueOk = (++i < argc) &&
+                      sscanf(argv[i], "%d", &(encodeConfig->*(intSwitch->field))) == 1;
+        }
+        else if (const FloatSwitch *floatSwitch = FindSwitch(kFloatSwitches, option))
+        {
+            valueOk = (++i < argc) &&
+                      sscanf(argv[i], "%f", &(encodeConfig->*(floatSwitch->field))) == 1;
+        }
+        else if (const IntPairSwitch *pairSwitch = FindSwitch(kIntPairSwitches, option))
+        {
+            // Short-circuiting keeps the second value untouched when the first fails.
+            valueOk = (++i < argc) &&
+                      sscanf(argv[i], "%d", &(encodeConfig->*(pairSwitch->first))) == 1 &&
+                      (++i < argc) &&
+                      sscanf(argv[i], "%d", &(encodeConfig->*(pairSwitch->second))) == 1;
+        }
+        else
+        {
+            PRINTERR("invalid parameter  %s\n", option);
+            return NV_ENC_ERR_INVALID_PARAM;
+        }
+
+        if (!valueOk)
+        {
+            PRINTERR("invalid parameter for %s\n", option);
+            return NV_ENC_ERR_INVALID_PARAM;
+        }
+
+        // Range checks for the two switches that restrict their value.
+        if (stricmp(option, "-meonly") == 0 &&
+            encodeConfig->enableMEOnly != 1 && encodeConfig->enableMEOnly != 2)
+        {
+            PRINTERR("invalid enableMEOnly value = %d (permissive value 1 and 2)\n", encodeConfig->enableMEOnly);
+            return NV_ENC_ERR_INVALID_PARAM;
+        }
+
+        if (stricmp(option, "-preloadedFrameCount") == 0 &&
+            encodeConfig->preloadedFrameCount <= 1)
+        {
+            PRINTERR("invalid preloadedFrameQueueSize value = %d (permissive value 2 and above)\n", encodeConfig->preloadedFrameCount);
+            return NV_ENC_ERR_INVALID_PARAM;
+        }
+    }
+
+    return NV_ENC_SUCCESS;
+}
diff --git a/NvHWEncoder.h b/NvHWEncoder.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "dynlink_cuda_cuda.h" // <cuda.h>
+
+#include "nvEncodeAPI.h"
+#include "nvUtils.h"
+
+#define SET_VER(configStruct, type) {configStruct.version = type##_VER;} // Stamps configStruct.version with the constant <type>_VER. NOTE(review): no include guard or #pragma once is visible in this header - confirm it is only included once per translation unit.
+
+#if defined (NV_WINDOWS)
+    #include "d3d9.h"
+    #define NVENCAPI __stdcall // calling convention applied to the MYPROC function-pointer type on Windows
+    #pragma warning(disable : 4996) // presumably silences MSVC C4996 (deprecated/unsafe CRT calls) - confirm
+#elif defined (NV_UNIX)
+    #include <dlfcn.h>
+    #include <string.h>
+    #define NVENCAPI // expands to nothing on Unix
+#endif
+
+#define DEFAULT_I_QFACTOR -0.8f // presumably the default for EncodeConfig::i_quant_factor - the use site is not visible here
+#define DEFAULT_B_QFACTOR 1.25f // presumably the default for EncodeConfig::b_quant_factor - confirm
+#define DEFAULT_I_QOFFSET 0.f // presumably the default for EncodeConfig::i_quant_offset - confirm
+#define DEFAULT_B_QOFFSET 1.25f // presumably the default for EncodeConfig::b_quant_offset - confirm
+
+typedef struct _EncodeConfig // Encoder settings; CNvHWEncoder::ParseArguments fills these from command-line switches.
+{
+    int              width; // "-size", first value
+    int              height; // "-size", second value
+    int              maxWidth; // "-maxSize", first value
+    int              maxHeight; // "-maxSize", second value
+    int              fps; // "-fps"
+    int              bitrate; // "-bitrate"
+    int              vbvMaxBitrate; // "-vbvMaxBitrate"
+    int              vbvSize; // "-vbvSize"
+    int              rcMode; // "-rcmode"
+    int              qp; // "-qp"
+    float            i_quant_factor; // "-i_qfactor"
+    float            b_quant_factor; // "-b_qfactor"
+    float            i_quant_offset; // "-i_qoffset"
+    float            b_quant_offset; // "-b_qoffset"
+    GUID             presetGUID; // not set by ParseArguments
+    int              codec; // "-codec"; presumably NV_ENC_H264 or NV_ENC_HEVC - confirm
+    int              invalidateRefFramesEnableFlag; // not set by ParseArguments
+    int              intraRefreshEnableFlag; // "-intraRefresh"
+    int              intraRefreshPeriod; // "-intraRefreshPeriod"
+    int              intraRefreshDuration; // "-intraRefreshDuration"
+    int              deviceType; // "-devicetype"
+    int              startFrameIdx; // "-startf"
+    int              endFrameIdx; // "-endf"
+    int              gopLength; // "-goplength"
+    int              numB; // "-numB"
+    int              pictureStruct; // "-picStruct"
+    int              deviceID; // "-deviceID"
+    int              isYuv444; // "-yuv444"
+    char            *qpDeltaMapFile; // "-qpDeltaMapFile"; points into argv, not owned
+    char* inputFileName; // "-i"; points into argv, not owned
+    char* outputFileName; // "-o"; points into argv, not owned
+    char* encoderPreset; // "-preset"; points into argv, not owned
+    char* inputFilePath; // "-bmpfilePath"; points into argv, not owned
+    char *encCmdFileName; // "-encCmdFile"; points into argv, not owned
+    int  enableMEOnly; // "-meonly"; ParseArguments accepts only 1 or 2
+    int  preloadedFrameCount; // "-preloadedFrameCount"; ParseArguments rejects values below 2
+}EncodeConfig;
+
+typedef struct _EncodeInputBuffer // Input side of one EncodeBuffer slot.
+{
+    unsigned int      dwWidth;
+    unsigned int      dwHeight; // ConvertYUVpitchToNV12 uses dwHeight * pitch as the offset of the chroma plane in the locked surface
+    CUdeviceptr       pNV12devPtr; // presumably a CUDA device copy of the NV12 frame - not used in the visible code
+    uint32_t          uNV12Stride;
+    CUdeviceptr       pNV12TempdevPtr;
+    uint32_t          uNV12TempStride;
+    NV_ENC_INPUT_PTR  hInputSurface; // handle locked/unlocked for CPU writes and passed as NV_ENC_PIC_PARAMS::inputBuffer
+    NV_ENC_BUFFER_FORMAT bufferFmt; // copied to NV_ENC_PIC_PARAMS::bufferFmt when a frame is encoded
+}EncodeInputBuffer;
+
+typedef struct _EncodeOutputBuffer // Output side of one EncodeBuffer slot.
+{
+    unsigned int          dwBitstreamBufferSize;
+    NV_ENC_OUTPUT_PTR     hBitstreamBuffer; // passed as NV_ENC_PIC_PARAMS::outputBitstream when a frame is encoded
+    bool                  bEOSFlag; // presumably marks the end-of-stream slot - not read in the visible code
+}EncodeOutputBuffer;
+
+typedef struct _EncodeBuffer // One input surface paired with the bitstream buffer that receives its encoded output.
+{
+    EncodeOutputBuffer      stOutputBfr;
+    EncodeInputBuffer       stInputBfr;
+}EncodeBuffer;
+
+typedef struct _NvEncPictureCommand // Optional per-frame command passed to NvEncEncodeFrame, NvEncReconfigureEncoder and NvEncInvalidateRefFrames.
+{
+    bool bResolutionChangePending; // presumably paired with newWidth/newHeight - consumer not visible here
+    bool bBitrateChangePending; // presumably paired with newBitrate/newVBVSize - consumer not visible here
+    bool bForceIDR; // NvEncEncodeFrame ORs NV_ENC_PIC_FLAG_FORCEIDR into encodePicFlags when set
+    bool bForceIntraRefresh; // NvEncEncodeFrame copies intraRefreshDuration into forceIntraRefreshWithFrameCnt when set
+    bool bInvalidateRefFrames;
+
+    uint32_t newWidth;
+    uint32_t newHeight;
+
+    uint32_t newBitrate;
+    uint32_t newVBVSize;
+
+    uint32_t  intraRefreshDuration; // written to the H.264 or HEVC forceIntraRefreshWithFrameCnt field, chosen by codec GUID
+
+    uint32_t  numRefFramesToInvalidate; // presumably the number of valid entries in refFrameNumbers - confirm
+    uint32_t  refFrameNumbers[16];
+}NvEncPictureCommand;
+
+enum // Codec identifiers; presumably the values accepted for EncodeConfig::codec ("-codec") - confirm against CreateEncoder
+{
+    NV_ENC_H264 = 0,
+    NV_ENC_HEVC = 1,
+};
+
+struct MEOnlyConfig // Argument bundle for CNvHWEncoder::NvRunMotionEstimationOnly.
+{
+    unsigned char *yuv[2][3]; // presumably [frame][plane] pointers for the two frames being compared - confirm
+    unsigned int stride[3]; // presumably one stride per plane - confirm
+    unsigned int width;
+    unsigned int height;
+    unsigned int inputFrameIndex;
+    unsigned int referenceFrameIndex;
+};
+
+class CNvHWEncoder // Wrapper around one NVENC session; calls go through the m_pEncodeAPI function list with the m_hEncoder handle.
+{
+public:
+    uint32_t                                             m_EncodeIdx; // used as inputTimeStamp and incremented after each accepted NvEncEncodeFrame submission
+    uint32_t                                             m_uMaxWidth;
+    uint32_t                                             m_uMaxHeight;
+    uint32_t                                             m_uCurWidth;
+    uint32_t                                             m_uCurHeight;
+
+protected:
+    bool                                                 m_bEncoderInitialized;
+    GUID                                                 codecGUID; // compared with NV_ENC_CODEC_HEVC_GUID to pick HEVC or H.264 picture parameters
+
+    NV_ENCODE_API_FUNCTION_LIST*                         m_pEncodeAPI; // NVENC entry points, e.g. nvEncEncodePicture
+    HINSTANCE                                            m_hinstLib; // presumably the loaded NVENC library handle - confirm
+    void                                                *m_hEncoder; // session handle passed as the first argument of NVENC calls
+    NV_ENC_INITIALIZE_PARAMS                             m_stCreateEncodeParams;
+    NV_ENC_CONFIG                                        m_stEncodeConfig;
+
+public:
+    NVENCSTATUS NvEncOpenEncodeSession(void* device, uint32_t deviceType);
+    NVENCSTATUS NvEncGetEncodeGUIDCount(uint32_t* encodeGUIDCount);
+    NVENCSTATUS NvEncGetEncodeProfileGUIDCount(GUID encodeGUID, uint32_t* encodeProfileGUIDCount);
+    NVENCSTATUS NvEncGetEncodeProfileGUIDs(GUID encodeGUID, GUID* profileGUIDs, uint32_t guidArraySize, uint32_t* GUIDCount);
+    NVENCSTATUS NvEncGetEncodeGUIDs(GUID* GUIDs, uint32_t guidArraySize, uint32_t* GUIDCount);
+    NVENCSTATUS NvEncGetInputFormatCount(GUID encodeGUID, uint32_t* inputFmtCount);
+    NVENCSTATUS NvEncGetInputFormats(GUID encodeGUID, NV_ENC_BUFFER_FORMAT* inputFmts, uint32_t inputFmtArraySize, uint32_t* inputFmtCount);
+    NVENCSTATUS NvEncGetEncodeCaps(GUID encodeGUID, NV_ENC_CAPS_PARAM* capsParam, int* capsVal);
+    NVENCSTATUS NvEncGetEncodePresetCount(GUID encodeGUID, uint32_t* encodePresetGUIDCount);
+    NVENCSTATUS NvEncGetEncodePresetGUIDs(GUID encodeGUID, GUID* presetGUIDs, uint32_t guidArraySize, uint32_t* encodePresetGUIDCount);
+    NVENCSTATUS NvEncGetEncodePresetConfig(GUID encodeGUID, GUID  presetGUID, NV_ENC_PRESET_CONFIG* presetConfig);
+    NVENCSTATUS NvEncCreateInputBuffer(uint32_t width, uint32_t height, void** inputBuffer, uint32_t isYuv444);
+    NVENCSTATUS NvEncDestroyInputBuffer(NV_ENC_INPUT_PTR inputBuffer);
+    NVENCSTATUS NvEncCreateBitstreamBuffer(uint32_t size, void** bitstreamBuffer);
+    NVENCSTATUS NvEncDestroyBitstreamBuffer(NV_ENC_OUTPUT_PTR bitstreamBuffer);
+    NVENCSTATUS NvEncCreateMVBuffer(uint32_t size, void** bitstreamBuffer);
+    NVENCSTATUS NvEncDestroyMVBuffer(NV_ENC_OUTPUT_PTR bitstreamBuffer);
+    NVENCSTATUS NvRunMotionEstimationOnly(EncodeBuffer *pEncodeBuffer[2], MEOnlyConfig *pMEOnly);
+    NVENCSTATUS NvEncLockBitstream(NV_ENC_LOCK_BITSTREAM* lockBitstreamBufferParams);
+    NVENCSTATUS NvEncUnlockBitstream(NV_ENC_OUTPUT_PTR bitstreamBuffer);
+    NVENCSTATUS NvEncLockInputBuffer(void* inputBuffer, void** bufferDataPtr, uint32_t* pitch); // callers receive a CPU pointer and the row pitch of the surface
+    NVENCSTATUS NvEncUnlockInputBuffer(NV_ENC_INPUT_PTR inputBuffer);
+    NVENCSTATUS NvEncGetEncodeStats(NV_ENC_STAT* encodeStats);
+    NVENCSTATUS NvEncGetSequenceParams(NV_ENC_SEQUENCE_PARAM_PAYLOAD* sequenceParamPayload);
+    NVENCSTATUS NvEncDestroyEncoder();
+    NVENCSTATUS NvEncInvalidateRefFrames(const NvEncPictureCommand *pEncPicCommand);
+    NVENCSTATUS NvEncOpenEncodeSessionEx(void* device, NV_ENC_DEVICE_TYPE deviceType);
+    NVENCSTATUS NvEncReconfigureEncoder(const NvEncPictureCommand *pEncPicCommand);
+    NVENCSTATUS NvEncFlushEncoderQueue(void *hEOSEvent); // submits a picture carrying only NV_ENC_PIC_FLAG_EOS, with hEOSEvent as its completion event
+
+    CNvHWEncoder();
+    virtual ~CNvHWEncoder();
+    NVENCSTATUS                                          Initialize(void* device, NV_ENC_DEVICE_TYPE deviceType);
+    NVENCSTATUS                                          Deinitialize();
+    NVENCSTATUS                                          NvEncEncodeFrame(EncodeBuffer *pEncodeBuffer, NvEncPictureCommand *encPicCommand, // encPicCommand may be null; returns NV_ENC_SUCCESS even when the API reports NV_ENC_ERR_NEED_MORE_INPUT
+                                                                          uint32_t width, uint32_t height,
+                                                                          NV_ENC_PIC_STRUCT ePicStruct = NV_ENC_PIC_STRUCT_FRAME,
+                                                                          int8_t *qpDeltaMapArray = NULL, uint32_t qpDeltaMapArraySize = 0);
+    NVENCSTATUS                                          CreateEncoder(const EncodeConfig *pEncCfg);
+    GUID                                                 GetPresetGUID(char* encoderPreset, int codec);
+    NVENCSTATUS                                          ProcessOutput(const EncodeBuffer *pEncodeBuffer);
+    NVENCSTATUS                                          FlushEncoder();
+    NVENCSTATUS                                          ValidateEncodeGUID(GUID inputCodecGuid);
+    NVENCSTATUS                                          ValidatePresetGUID(GUID presetCodecGuid, GUID inputCodecGuid);
+    static NVENCSTATUS                                   ParseArguments(EncodeConfig *encodeConfig, int argc, char *argv[]); // fills encodeConfig from argv; NV_ENC_ERR_INVALID_PARAM on a bad or unknown switch and on "-help"
+};
+
+typedef NVENCSTATUS (NVENCAPI *MYPROC)(NV_ENCODE_API_FUNCTION_LIST*); // Pointer to a function that takes the API function list; presumably the NVENC entry point resolved from the loaded library - confirm
diff --git a/dynlink_cuda.cpp b/dynlink_cuda.cpp
@@ -0,0 +1,654 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+// With these flags defined, this source file will dynamically
+// load the corresponding functions.  Disabled by default.
+#define __CUDA_API_VERSION 4000 // defined before the dynlink header is included; being below 6050 it compiles out the `#if __CUDA_API_VERSION >= 6050` section in this file
+
+#include <stdio.h>
+#include <string.h>
+#include "../inc/dynlink_cuda_cuda.h"
+#if INIT_CUDA_GL
+#include "../inc/dynlink_cudaGL.h"
+#endif
+#if INIT_CUDA_D3D9
+#include "../inc/dynlink_cudaD3D9.h"
+#endif
+
+tcuInit                               *_cuInit; // First of the global CUDA driver entry-point pointers, each typed t<name>; the GET_PROC_* macros assign them from the loaded driver library. Only this one carries a leading underscore - presumably a cuInit wrapper is defined further down.
+tcuDriverGetVersion                   *cuDriverGetVersion;
+tcuDeviceGet                          *cuDeviceGet;
+tcuDeviceGetCount                     *cuDeviceGetCount;
+tcuDeviceGetName                      *cuDeviceGetName;
+tcuDeviceComputeCapability            *cuDeviceComputeCapability;
+tcuDeviceTotalMem                     *cuDeviceTotalMem;
+tcuDeviceGetProperties                *cuDeviceGetProperties;
+tcuDeviceGetAttribute                 *cuDeviceGetAttribute;
+tcuCtxCreate                          *cuCtxCreate;
+tcuCtxDestroy                         *cuCtxDestroy;
+tcuCtxAttach                          *cuCtxAttach;
+tcuCtxDetach                          *cuCtxDetach;
+tcuCtxPushCurrent                     *cuCtxPushCurrent;
+tcuCtxPopCurrent                      *cuCtxPopCurrent;
+tcuCtxGetCurrent                      *cuCtxGetCurrent;
+tcuCtxSetCurrent                      *cuCtxSetCurrent;
+tcuCtxGetDevice                       *cuCtxGetDevice;
+tcuCtxSynchronize                     *cuCtxSynchronize;
+tcuModuleLoad                         *cuModuleLoad;
+tcuModuleLoadData                     *cuModuleLoadData;
+tcuModuleLoadDataEx                   *cuModuleLoadDataEx;
+tcuModuleLoadFatBinary                *cuModuleLoadFatBinary;
+tcuModuleUnload                       *cuModuleUnload;
+tcuModuleGetFunction                  *cuModuleGetFunction;
+tcuModuleGetGlobal                    *cuModuleGetGlobal;
+tcuModuleGetTexRef                    *cuModuleGetTexRef;
+tcuModuleGetSurfRef                   *cuModuleGetSurfRef;
+tcuMemGetInfo                         *cuMemGetInfo;
+tcuMemAlloc                           *cuMemAlloc;
+tcuMemAllocPitch                      *cuMemAllocPitch;
+tcuMemFree                            *cuMemFree;
+tcuMemGetAddressRange                 *cuMemGetAddressRange;
+tcuMemAllocHost                       *cuMemAllocHost;
+tcuMemFreeHost                        *cuMemFreeHost;
+tcuMemHostAlloc                       *cuMemHostAlloc;
+tcuMemHostGetDevicePointer            *cuMemHostGetDevicePointer;
+tcuMemHostRegister                    *cuMemHostRegister;
+tcuMemHostUnregister                  *cuMemHostUnregister;
+tcuMemcpyHtoD                         *cuMemcpyHtoD;
+tcuMemcpyDtoH                         *cuMemcpyDtoH;
+tcuMemcpyDtoD                         *cuMemcpyDtoD;
+tcuMemcpyDtoA                         *cuMemcpyDtoA;
+tcuMemcpyAtoD                         *cuMemcpyAtoD;
+tcuMemcpyHtoA                         *cuMemcpyHtoA;
+tcuMemcpyAtoH                         *cuMemcpyAtoH;
+tcuMemcpyAtoA                         *cuMemcpyAtoA;
+tcuMemcpy2D                           *cuMemcpy2D;
+tcuMemcpy2DUnaligned                  *cuMemcpy2DUnaligned;
+tcuMemcpy3D                           *cuMemcpy3D;
+tcuMemcpyHtoDAsync                    *cuMemcpyHtoDAsync;
+tcuMemcpyDtoHAsync                    *cuMemcpyDtoHAsync;
+tcuMemcpyDtoDAsync                    *cuMemcpyDtoDAsync;
+tcuMemcpyHtoAAsync                    *cuMemcpyHtoAAsync;
+tcuMemcpyAtoHAsync                    *cuMemcpyAtoHAsync;
+tcuMemcpy2DAsync                      *cuMemcpy2DAsync;
+tcuMemcpy3DAsync                      *cuMemcpy3DAsync;
+tcuMemcpy                             *cuMemcpy;
+tcuMemcpyPeer                         *cuMemcpyPeer;
+tcuMemsetD8                           *cuMemsetD8;
+tcuMemsetD16                          *cuMemsetD16;
+tcuMemsetD32                          *cuMemsetD32;
+tcuMemsetD2D8                         *cuMemsetD2D8;
+tcuMemsetD2D16                        *cuMemsetD2D16;
+tcuMemsetD2D32                        *cuMemsetD2D32;
+tcuFuncSetBlockShape                  *cuFuncSetBlockShape;
+tcuFuncSetSharedSize                  *cuFuncSetSharedSize;
+tcuFuncGetAttribute                   *cuFuncGetAttribute;
+tcuFuncSetCacheConfig                 *cuFuncSetCacheConfig;
+tcuLaunchKernel                       *cuLaunchKernel;
+tcuArrayCreate                        *cuArrayCreate;
+tcuArrayGetDescriptor                 *cuArrayGetDescriptor;
+tcuArrayDestroy                       *cuArrayDestroy;
+tcuArray3DCreate                      *cuArray3DCreate;
+tcuArray3DGetDescriptor               *cuArray3DGetDescriptor;
+tcuTexRefCreate                       *cuTexRefCreate;
+tcuTexRefDestroy                      *cuTexRefDestroy;
+tcuTexRefSetArray                     *cuTexRefSetArray;
+tcuTexRefSetAddress                   *cuTexRefSetAddress;
+tcuTexRefSetAddress2D                 *cuTexRefSetAddress2D;
+tcuTexRefSetFormat                    *cuTexRefSetFormat;
+tcuTexRefSetAddressMode               *cuTexRefSetAddressMode;
+tcuTexRefSetFilterMode                *cuTexRefSetFilterMode;
+tcuTexRefSetFlags                     *cuTexRefSetFlags;
+tcuTexRefGetAddress                   *cuTexRefGetAddress;
+tcuTexRefGetArray                     *cuTexRefGetArray;
+tcuTexRefGetAddressMode               *cuTexRefGetAddressMode;
+tcuTexRefGetFilterMode                *cuTexRefGetFilterMode;
+tcuTexRefGetFormat                    *cuTexRefGetFormat;
+tcuTexRefGetFlags                     *cuTexRefGetFlags;
+tcuSurfRefSetArray                    *cuSurfRefSetArray;
+tcuSurfRefGetArray                    *cuSurfRefGetArray;
+tcuParamSetSize                       *cuParamSetSize;
+tcuParamSeti                          *cuParamSeti;
+tcuParamSetf                          *cuParamSetf;
+tcuParamSetv                          *cuParamSetv;
+tcuParamSetTexRef                     *cuParamSetTexRef;
+tcuLaunch                             *cuLaunch;
+tcuLaunchGrid                         *cuLaunchGrid;
+tcuLaunchGridAsync                    *cuLaunchGridAsync;
+tcuEventCreate                        *cuEventCreate;
+tcuEventRecord                        *cuEventRecord;
+tcuEventQuery                         *cuEventQuery;
+tcuEventSynchronize                   *cuEventSynchronize;
+tcuEventDestroy                       *cuEventDestroy;
+tcuEventElapsedTime                   *cuEventElapsedTime;
+tcuStreamCreate                       *cuStreamCreate;
+tcuStreamQuery                        *cuStreamQuery;
+tcuStreamSynchronize                  *cuStreamSynchronize;
+tcuStreamDestroy                      *cuStreamDestroy;
+tcuGraphicsUnregisterResource         *cuGraphicsUnregisterResource;
+tcuGraphicsSubResourceGetMappedArray  *cuGraphicsSubResourceGetMappedArray;
+tcuGraphicsResourceGetMappedPointer   *cuGraphicsResourceGetMappedPointer;
+tcuGraphicsResourceSetMapFlags        *cuGraphicsResourceSetMapFlags;
+tcuGraphicsMapResources               *cuGraphicsMapResources;
+tcuGraphicsUnmapResources             *cuGraphicsUnmapResources;
+tcuGetExportTable                     *cuGetExportTable;
+tcuCtxSetLimit                        *cuCtxSetLimit;
+tcuCtxGetLimit                        *cuCtxGetLimit;
+tcuMemHostGetFlags                    *cuMemHostGetFlags;
+
+#if INIT_CUDA_GL // OpenGL interop entry-point pointers; compiled only when INIT_CUDA_GL is non-zero
+    // GL/CUDA interop
+    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    tcuWGLGetDevice                       *cuWGLGetDevice;
+    #endif
+
+    //#if __CUDA_API_VERSION >= 3020
+    tcuGLCtxCreate                        *cuGLCtxCreate;
+    tcuGLCtxCreate                        *cuGLCtxCreate_v2;
+    tcuGLMapBufferObject                  *cuGLMapBufferObject;
+    tcuGLMapBufferObject                  *cuGLMapBufferObject_v2;
+    tcuGLMapBufferObjectAsync             *cuGLMapBufferObjectAsync;
+    //#endif
+
+#if __CUDA_API_VERSION >= 6050
+    tcuGLGetDevices                       *cuGLGetDevices; // NOTE(review): cuGLGetDevices is defined again, unconditionally, at the end of this section; the two would collide if __CUDA_API_VERSION were 6050 or higher
+#endif
+
+    tcuGLInit                             *cuGLInit; // deprecated in CUDA 3.0
+    tcuGraphicsGLRegisterBuffer           *cuGraphicsGLRegisterBuffer;
+    tcuGraphicsGLRegisterImage            *cuGraphicsGLRegisterImage;
+    tcuGLSetBufferObjectMapFlags          *cuGLSetBufferObjectMapFlags;
+    tcuGLRegisterBufferObject             *cuGLRegisterBufferObject;
+
+    tcuGLUnmapBufferObject                *cuGLUnmapBufferObject;
+    tcuGLUnmapBufferObjectAsync           *cuGLUnmapBufferObjectAsync;
+
+    tcuGLUnregisterBufferObject           *cuGLUnregisterBufferObject;
+    tcuGLGetDevices                       *cuGLGetDevices; // CUDA 6.5 only
+#endif
+
+#if INIT_CUDA_D3D9 // Direct3D 9 interop entry-point pointers; compiled only when INIT_CUDA_D3D9 is non-zero
+    // D3D9/CUDA interop (CUDA 1.x compatible API). These functions
+    // are deprecated; please use the ones below
+    tcuD3D9Begin                          *cuD3D9Begin;
+    tcuD3D9End                            *cuD3D9End;
+
+    // D3D9/CUDA interop (CUDA 2.x compatible)
+    tcuD3D9GetDirect3DDevice              *cuD3D9GetDirect3DDevice;
+    tcuD3D9RegisterResource               *cuD3D9RegisterResource;
+    tcuD3D9UnregisterResource             *cuD3D9UnregisterResource;
+    tcuD3D9MapResources                   *cuD3D9MapResources;
+    tcuD3D9UnmapResources                 *cuD3D9UnmapResources;
+    tcuD3D9ResourceSetMapFlags            *cuD3D9ResourceSetMapFlags;
+    tcuD3D9ResourceGetSurfaceDimensions   *cuD3D9ResourceGetSurfaceDimensions;
+    tcuD3D9ResourceGetMappedArray         *cuD3D9ResourceGetMappedArray;
+    tcuD3D9ResourceGetMappedPointer       *cuD3D9ResourceGetMappedPointer;
+    tcuD3D9ResourceGetMappedSize          *cuD3D9ResourceGetMappedSize;
+    tcuD3D9ResourceGetMappedPitch         *cuD3D9ResourceGetMappedPitch;
+
+    // D3D9/CUDA interop (CUDA 2.0+)
+    tcuD3D9GetDevice                      *cuD3D9GetDevice;
+    tcuD3D9GetDevice                      *cuD3D9GetDevices; // NOTE(review): declared with the tcuD3D9GetDevice type although the name is cuD3D9GetDevices - confirm the signature matches
+    tcuD3D9GetDevice                      *cuD3D9GetDevice_v2;
+    tcuD3D9CtxCreate                      *cuD3D9CtxCreate;
+    tcuD3D9CtxCreate                      *cuD3D9CtxCreate_v2;
+    tcuGraphicsD3D9RegisterResource       *cuGraphicsD3D9RegisterResource;
+    tcuGraphicsD3D9RegisterResource       *cuGraphicsD3D9RegisterResource_v2;
+#endif
+
+#define STRINGIFY(X) #X // Turns its argument into a string literal; GET_PROC_EX_V2 uses it to build the "<name>_v2" symbol name
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#include <Windows.h>
+
+#ifdef UNICODE
+static LPCWSTR __CudaLibName = L"nvcuda.dll"; // wide-character name so LoadLibrary resolves to its wide variant in UNICODE builds
+#else
+static LPCSTR __CudaLibName = "nvcuda.dll"; // driver DLL name handed to LoadLibrary
+#endif
+
+typedef HMODULE CUDADRIVER; // handle type of the loaded CUDA driver library on Windows
+
+static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
+{
+    *pInstance = LoadLibrary(__CudaLibName);
+
+    if (*pInstance == NULL)
+    {
+        printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
+        return CUDA_ERROR_UNKNOWN;
+    }
+
+    return CUDA_SUCCESS;
+}
+
+#define GET_PROC_EX(name, alias, required)                     \
+    alias = (t##name *)GetProcAddress(CudaDrvLib, #name);               \
+    if (alias == NULL && required) {                                    \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               #name, __CudaLibName);                                  \
+        } // GET_PROC_EX: resolves symbol `name` from CudaDrvLib into `alias`; a missing required symbol is only reported, execution continues with alias == NULL. NOTE(review): __CudaLibName is wide when UNICODE is defined but is printed with %s.
+
+#define GET_PROC_EX_V2(name, alias, required)                           \
+    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
+    if (alias == NULL && required) {                                    \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               STRINGIFY(name##_v2), __CudaLibName);                       \
+        } // GET_PROC_EX_V2: same as GET_PROC_EX but looks up the "<name>_v2" symbol; also only reports a missing required symbol
+
+#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX)
+
+#include <dlfcn.h>
+
+#if defined(__APPLE__) || defined(__MACOSX)
+static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
+#else
+static char __CudaLibName[] = "libcuda.so";
+#endif
+
+typedef void *CUDADRIVER;
+
+// Opens the CUDA driver shared library named by __CudaLibName using
+// dlopen(RTLD_NOW).
+//
+// pInstance  receives the handle returned by dlopen (NULL on failure).
+// Returns CUDA_SUCCESS when the library was opened; otherwise prints a
+// message and returns CUDA_ERROR_UNKNOWN.
+static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
+{
+    CUDADRIVER hDriver = dlopen(__CudaLibName, RTLD_NOW);
+
+    // The out-parameter is written even when the load failed.
+    *pInstance = hDriver;
+
+    if (hDriver != NULL)
+    {
+        return CUDA_SUCCESS;
+    }
+
+    printf("dlopen \"%s\" failed!\n", __CudaLibName);
+    return CUDA_ERROR_UNKNOWN;
+}
+
+#define GET_PROC_EX(name, alias, required)                              \
+    alias = (t##name *)dlsym(CudaDrvLib, #name);                        \
+    if (alias == NULL && required) {                                    \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               #name, __CudaLibName);                                  \
+        }
+
+#define GET_PROC_EX_V2(name, alias, required)                           \
+    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));         \
+    if (alias == NULL && required) {                                    \
+        printf("Failed to find required function \"%s\" in %s\n",       \
+               STRINGIFY(name##_v2), __CudaLibName);                    \
+        }
+
+#else
+#error unsupported platform
+#endif
+
+// Evaluates `call` exactly once. If the value is anything other than
+// CUDA_SUCCESS it is returned from the enclosing function, which must
+// therefore return CUresult.
+#define CHECKED_CALL(call)                          \
+    do {                                            \
+        const CUresult checkedStatus = (call);      \
+        if (checkedStatus != CUDA_SUCCESS) {        \
+            return checkedStatus;                   \
+        }                                           \
+    } while(0)
+/*
+ * Shorthand forms of GET_PROC_EX / GET_PROC_EX_V2 that store the result in
+ * the variable whose name equals the looked-up function:
+ *   GET_PROC_REQUIRED(name)  lookup of "name" with required = 1
+ *   GET_PROC_OPTIONAL(name)  lookup of "name" with required = 0 (a miss is
+ *                            not reported)
+ *   GET_PROC(name)           alias of GET_PROC_REQUIRED
+ *   GET_PROC_V2(name)        lookup of "name_v2" with required = 1
+ */
+#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
+#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
+#define GET_PROC(name)          GET_PROC_REQUIRED(name)
+#define GET_PROC_V2(name)       GET_PROC_EX_V2(name,name,1)
+
+#if INIT_CUDA_GL
+// Resolves the OpenGL interop entry points from the driver library.
+//
+// Flags        not used by this function.
+// cudaVersion  API version (e.g. 3000 for 3.0) that selects which entry
+//              points are looked up.
+// CudaDrvLib   handle of the already loaded driver library; the GET_PROC
+//              macros reference it by this exact name.
+// Returns CUDA_SUCCESS unconditionally (the lookups themselves only report
+// missing symbols).
+//
+// Each symbol is looked up once: every version that satisfies the former
+// ">= 2030" and ">= 3000" repeats also satisfies ">= 2010", so the repeated
+// lookups could only duplicate work and failure messages.
+inline CUresult CUDAAPI cuInitGL(unsigned int Flags, int cudaVersion, CUDADRIVER &CudaDrvLib)
+{
+    if (cudaVersion >= 2010)
+    {
+        GET_PROC(cuGLCtxCreate);
+        GET_PROC(cuGraphicsGLRegisterBuffer);
+        GET_PROC(cuGraphicsGLRegisterImage);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        GET_PROC(cuWGLGetDevice);
+#endif
+    }
+    if (cudaVersion >= 3000)
+    {
+        GET_PROC(cuGLGetDevices);
+
+        // Rebinds cuGLCtxCreate (fetched above) to its _v2 export.
+        GET_PROC_V2(cuGLCtxCreate);
+
+        GET_PROC_V2(cuGLMapBufferObject);
+        GET_PROC(cuGLUnmapBufferObject);
+        GET_PROC(cuGLMapBufferObjectAsync);
+        GET_PROC(cuGLUnmapBufferObjectAsync);
+        GET_PROC(cuGLRegisterBufferObject);
+        GET_PROC(cuGLUnregisterBufferObject);
+        GET_PROC(cuGLSetBufferObjectMapFlags);
+    }
+
+    return CUDA_SUCCESS;
+}
+#endif
+/*
+ * cuInitD3D9: resolves the Direct3D 9 interop entry points from the driver
+ * library referenced by CudaDrvLib (the GET_PROC macros use that exact name).
+ * Flags and cudaVersion are not used here. The function always returns
+ * CUDA_SUCCESS: with GET_PROC as defined in this file a missing symbol is
+ * only reported with printf and its pointer is left NULL.
+ * NOTE(review): this definition is guarded with #ifdef, while the call in
+ * cuInit is guarded with "#if INIT_CUDA_D3D9" - confirm both are meant to
+ * agree.
+ */
+#ifdef INIT_CUDA_D3D9
+inline CUresult CUDAAPI cuInitD3D9(unsigned int Flags, int cudaVersion, CUDADRIVER &CudaDrvLib)
+{
+    // D3D9/CUDA (CUDA 1.x compatible API)
+    GET_PROC(cuD3D9Begin);
+    GET_PROC(cuD3D9End);
+
+    // D3D9/CUDA (CUDA 2.x compatible API)
+    GET_PROC(cuD3D9GetDirect3DDevice);
+    GET_PROC(cuD3D9RegisterResource);
+    GET_PROC(cuD3D9UnregisterResource);
+    GET_PROC(cuD3D9MapResources);
+    GET_PROC(cuD3D9UnmapResources);
+    GET_PROC(cuD3D9ResourceSetMapFlags);
+
+    // D3D9/CUDA (CUDA 2.0+ compatible API)
+    GET_PROC(cuD3D9GetDevice);
+    GET_PROC(cuGraphicsD3D9RegisterResource);
+
+    GET_PROC_V2(cuD3D9CtxCreate);
+    GET_PROC_V2(cuD3D9ResourceGetSurfaceDimensions);
+    GET_PROC_V2(cuD3D9ResourceGetMappedPointer);
+    GET_PROC_V2(cuD3D9ResourceGetMappedSize);
+    GET_PROC_V2(cuD3D9ResourceGetMappedPitch);
+//    GET_PROC_V2(cuD3D9ResourceGetMappedArray);
+
+    return CUDA_SUCCESS;
+}
+#endif
+
+#ifdef INIT_CUDA_D3D10
+// Resolves the Direct3D 10 interop entry points from the driver library
+// referenced by CudaDrvLib. Nothing is looked up when cudaVersion is below
+// 2030; Flags is not used. Always returns CUDA_SUCCESS.
+inline CUresult CUDAAPI cuInitD3D10(unsigned int Flags, int cudaVersion, CUDADRIVER &CudaDrvLib)
+{
+    if (cudaVersion < 2030)
+    {
+        return CUDA_SUCCESS;
+    }
+
+    GET_PROC(cuD3D10GetDevice);
+    GET_PROC(cuD3D10CtxCreate);
+    GET_PROC(cuGraphicsD3D10RegisterResource);
+
+    return CUDA_SUCCESS;
+}
+#endif
+
+#ifdef INIT_CUDA_D3D11
+// Resolves the Direct3D 11 interop entry points from the driver library
+// referenced by CudaDrvLib. Nothing is looked up when cudaVersion is below
+// 3000; Flags is not used. Always returns CUDA_SUCCESS.
+inline CUresult CUDAAPI cuInitD3D11(unsigned int Flags, int cudaVersion, CUDADRIVER &CudaDrvLib)
+{
+    if (cudaVersion < 3000)
+    {
+        return CUDA_SUCCESS;
+    }
+
+    GET_PROC(cuD3D11GetDevice);
+    GET_PROC(cuD3D11CtxCreate);
+    GET_PROC(cuGraphicsD3D11RegisterResource);
+
+    return CUDA_SUCCESS;
+}
+#endif
+
+
+// Loads the CUDA driver library, initialises the driver and binds the
+// driver entry points used through this file's function pointers.
+//
+// Flags          forwarded unchanged to the driver's own cuInit.
+// cudaVersion    API version requested by the caller; it decides whether the
+//                "_v2" exports are bound (>= 3020, and >= 4000 for the
+//                context/stream/event destroy family).
+// pHandleDriver  optional; when non-NULL it receives a copy of the library
+//                handle (sizeof(CUDADRIVER) bytes are written).
+//
+// Returns CUDA_SUCCESS on success. Returns the failing CUresult when loading
+// the library, the driver's cuInit or cuDriverGetVersion fails,
+// CUDA_ERROR_UNKNOWN when the driver does not export cuInit at all, and
+// CUDA_ERROR_INVALID_DEVICE when one of the optional interop initialisers
+// reports failure.
+CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion, void *pHandleDriver)
+{
+    CUDADRIVER CudaDrvLib;
+    int driverVer = 1000;
+
+    CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
+    if (pHandleDriver != NULL)
+    {
+        memcpy(pHandleDriver, &CudaDrvLib, sizeof(CUDADRIVER));
+    }
+
+    // cuInit is required; alias it to _cuInit
+    GET_PROC_EX(cuInit, _cuInit, 1);
+    if (_cuInit == NULL)
+    {
+        // GET_PROC_EX only reports a missing symbol. Calling through the
+        // NULL pointer below would crash, so fail explicitly instead.
+        return CUDA_ERROR_UNKNOWN;
+    }
+    CHECKED_CALL(_cuInit(Flags));
+
+    // available since 2.2. if not present, version 1.0 is assumed
+    GET_PROC_OPTIONAL(cuDriverGetVersion);
+
+    if (cuDriverGetVersion)
+    {
+        CHECKED_CALL(cuDriverGetVersion(&driverVer));
+    }
+
+    // fetch all function pointers
+    GET_PROC(cuDeviceGet);
+    GET_PROC(cuDeviceGetCount);
+    GET_PROC(cuDeviceGetName);
+    GET_PROC(cuDeviceComputeCapability);
+    GET_PROC(cuDeviceGetProperties);
+    GET_PROC(cuDeviceGetAttribute);
+    GET_PROC(cuCtxDestroy);
+    GET_PROC(cuCtxAttach);
+    GET_PROC(cuCtxDetach);
+    GET_PROC(cuCtxPushCurrent);
+    GET_PROC(cuCtxPopCurrent);
+    GET_PROC(cuCtxGetDevice);
+    GET_PROC(cuCtxSynchronize);
+    GET_PROC(cuModuleLoad);
+    GET_PROC(cuModuleLoadData);
+    GET_PROC(cuModuleUnload);
+    GET_PROC(cuModuleGetFunction);
+    GET_PROC(cuModuleGetTexRef);
+    GET_PROC(cuMemFreeHost);
+    GET_PROC(cuMemHostAlloc);
+    GET_PROC(cuFuncSetBlockShape);
+    GET_PROC(cuFuncSetSharedSize);
+    GET_PROC(cuFuncGetAttribute);
+    GET_PROC(cuArrayDestroy);
+    GET_PROC(cuTexRefCreate);
+    GET_PROC(cuTexRefDestroy);
+    GET_PROC(cuTexRefSetArray);
+    GET_PROC(cuTexRefSetFormat);
+    GET_PROC(cuTexRefSetAddressMode);
+    GET_PROC(cuTexRefSetFilterMode);
+    GET_PROC(cuTexRefSetFlags);
+    GET_PROC(cuTexRefGetArray);
+    GET_PROC(cuTexRefGetAddressMode);
+    GET_PROC(cuTexRefGetFilterMode);
+    GET_PROC(cuTexRefGetFormat);
+    GET_PROC(cuTexRefGetFlags);
+    GET_PROC(cuParamSetSize);
+    GET_PROC(cuParamSeti);
+    GET_PROC(cuParamSetf);
+    GET_PROC(cuParamSetv);
+    GET_PROC(cuParamSetTexRef);
+    GET_PROC(cuLaunch);
+    GET_PROC(cuLaunchGrid);
+    GET_PROC(cuLaunchGridAsync);
+    GET_PROC(cuEventCreate);
+    GET_PROC(cuEventRecord);
+    GET_PROC(cuEventQuery);
+    GET_PROC(cuEventSynchronize);
+    GET_PROC(cuEventDestroy);
+    GET_PROC(cuEventElapsedTime);
+    GET_PROC(cuStreamCreate);
+    GET_PROC(cuStreamQuery);
+    GET_PROC(cuStreamSynchronize);
+    GET_PROC(cuStreamDestroy);
+
+    // These could be _v2 interfaces; when requested they replace the
+    // pointers fetched above.
+    if (cudaVersion >= 4000)
+    {
+        GET_PROC_V2(cuCtxDestroy);
+        GET_PROC_V2(cuCtxPopCurrent);
+        GET_PROC_V2(cuCtxPushCurrent);
+        GET_PROC_V2(cuStreamDestroy);
+        GET_PROC_V2(cuEventDestroy);
+    }
+
+    // The same set of functions is bound either to the "_v2" exports
+    // (cudaVersion >= 3020) or to the unsuffixed ones.
+    if (cudaVersion >= 3020)
+    {
+        GET_PROC_V2(cuDeviceTotalMem);
+        GET_PROC_V2(cuCtxCreate);
+        GET_PROC_V2(cuModuleGetGlobal);
+        GET_PROC_V2(cuMemGetInfo);
+        GET_PROC_V2(cuMemAlloc);
+        GET_PROC_V2(cuMemAllocPitch);
+        GET_PROC_V2(cuMemFree);
+        GET_PROC_V2(cuMemGetAddressRange);
+        GET_PROC_V2(cuMemAllocHost);
+        GET_PROC_V2(cuMemHostGetDevicePointer);
+        GET_PROC_V2(cuMemcpyHtoD);
+        GET_PROC_V2(cuMemcpyDtoH);
+        GET_PROC_V2(cuMemcpyDtoD);
+        GET_PROC_V2(cuMemcpyDtoA);
+        GET_PROC_V2(cuMemcpyAtoD);
+        GET_PROC_V2(cuMemcpyHtoA);
+        GET_PROC_V2(cuMemcpyAtoH);
+        GET_PROC_V2(cuMemcpyAtoA);
+        GET_PROC_V2(cuMemcpy2D);
+        GET_PROC_V2(cuMemcpy2DUnaligned);
+        GET_PROC_V2(cuMemcpy3D);
+        GET_PROC_V2(cuMemcpyHtoDAsync);
+        GET_PROC_V2(cuMemcpyDtoHAsync);
+        GET_PROC_V2(cuMemcpyHtoAAsync);
+        GET_PROC_V2(cuMemcpyAtoHAsync);
+        GET_PROC_V2(cuMemcpy2DAsync);
+        GET_PROC_V2(cuMemcpy3DAsync);
+        GET_PROC_V2(cuMemsetD8);
+        GET_PROC_V2(cuMemsetD16);
+        GET_PROC_V2(cuMemsetD32);
+        GET_PROC_V2(cuMemsetD2D8);
+        GET_PROC_V2(cuMemsetD2D16);
+        GET_PROC_V2(cuMemsetD2D32);
+        GET_PROC_V2(cuArrayCreate);
+        GET_PROC_V2(cuArrayGetDescriptor);
+        GET_PROC_V2(cuArray3DCreate);
+        GET_PROC_V2(cuArray3DGetDescriptor);
+        GET_PROC_V2(cuTexRefSetAddress);
+        GET_PROC_V2(cuTexRefSetAddress2D);
+        GET_PROC_V2(cuTexRefGetAddress);
+    }
+    else
+    {
+        GET_PROC(cuDeviceTotalMem);
+        GET_PROC(cuCtxCreate);
+        GET_PROC(cuModuleGetGlobal);
+        GET_PROC(cuMemGetInfo);
+        GET_PROC(cuMemAlloc);
+        GET_PROC(cuMemAllocPitch);
+        GET_PROC(cuMemFree);
+        GET_PROC(cuMemGetAddressRange);
+        GET_PROC(cuMemAllocHost);
+        GET_PROC(cuMemHostGetDevicePointer);
+        GET_PROC(cuMemcpyHtoD);
+        GET_PROC(cuMemcpyDtoH);
+        GET_PROC(cuMemcpyDtoD);
+        GET_PROC(cuMemcpyDtoA);
+        GET_PROC(cuMemcpyAtoD);
+        GET_PROC(cuMemcpyHtoA);
+        GET_PROC(cuMemcpyAtoH);
+        GET_PROC(cuMemcpyAtoA);
+        GET_PROC(cuMemcpy2D);
+        GET_PROC(cuMemcpy2DUnaligned);
+        GET_PROC(cuMemcpy3D);
+        GET_PROC(cuMemcpyHtoDAsync);
+        GET_PROC(cuMemcpyDtoHAsync);
+        GET_PROC(cuMemcpyHtoAAsync);
+        GET_PROC(cuMemcpyAtoHAsync);
+        GET_PROC(cuMemcpy2DAsync);
+        GET_PROC(cuMemcpy3DAsync);
+        GET_PROC(cuMemsetD8);
+        GET_PROC(cuMemsetD16);
+        GET_PROC(cuMemsetD32);
+        GET_PROC(cuMemsetD2D8);
+        GET_PROC(cuMemsetD2D16);
+        GET_PROC(cuMemsetD2D32);
+        GET_PROC(cuArrayCreate);
+        GET_PROC(cuArrayGetDescriptor);
+        GET_PROC(cuArray3DCreate);
+        GET_PROC(cuArray3DGetDescriptor);
+        GET_PROC(cuTexRefSetAddress);
+        GET_PROC(cuTexRefSetAddress2D);
+        GET_PROC(cuTexRefGetAddress);
+    }
+
+    // The following functions are specific to CUDA versions; they are gated
+    // on the version reported by the driver, not on cudaVersion.
+    if (driverVer >= 2010)
+    {
+        GET_PROC(cuModuleLoadDataEx);
+        GET_PROC(cuModuleLoadFatBinary);
+    }
+
+    if (driverVer >= 2030)
+    {
+        GET_PROC(cuMemHostGetFlags);
+    }
+
+    if (driverVer >= 3000)
+    {
+        GET_PROC(cuMemcpyDtoDAsync);
+        GET_PROC(cuFuncSetCacheConfig);
+
+        GET_PROC(cuGraphicsUnregisterResource);
+        GET_PROC(cuGraphicsSubResourceGetMappedArray);
+
+#if (__CUDA_API_VERSION >= 3020)
+        if (cudaVersion >= 3020)
+        {
+            GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
+        }
+        else
+        {
+            GET_PROC(cuGraphicsResourceGetMappedPointer);
+        }
+#endif
+        GET_PROC(cuGraphicsResourceSetMapFlags);
+        GET_PROC(cuGraphicsMapResources);
+        GET_PROC(cuGraphicsUnmapResources);
+        GET_PROC(cuGetExportTable);
+    }
+
+    if (driverVer >= 3010)
+    {
+        GET_PROC(cuModuleGetSurfRef);
+        GET_PROC(cuSurfRefSetArray);
+        GET_PROC(cuSurfRefGetArray);
+        GET_PROC(cuCtxSetLimit);
+        GET_PROC(cuCtxGetLimit);
+    }
+
+    if (driverVer >= 4000)
+    {
+        GET_PROC(cuCtxSetCurrent);
+        GET_PROC(cuCtxGetCurrent);
+        GET_PROC(cuMemHostRegister);
+        GET_PROC(cuMemHostUnregister);
+        GET_PROC(cuMemcpy);
+        GET_PROC(cuMemcpyPeer);
+        GET_PROC(cuLaunchKernel);
+    }
+
+    // Optional graphics interop tables; each initialiser receives the
+    // compile-time __CUDA_API_VERSION rather than cudaVersion.
+#if INIT_CUDA_GL
+    if (cuInitGL(0, __CUDA_API_VERSION, CudaDrvLib) != CUDA_SUCCESS)
+        return CUDA_ERROR_INVALID_DEVICE;
+#endif
+
+#if INIT_CUDA_D3D9
+    if (cuInitD3D9(0, __CUDA_API_VERSION, CudaDrvLib) != CUDA_SUCCESS)
+        return CUDA_ERROR_INVALID_DEVICE;
+#endif
+
+#if INIT_CUDA_D3D10
+    if (cuInitD3D10(0, __CUDA_API_VERSION, CudaDrvLib) != CUDA_SUCCESS)
+        return CUDA_ERROR_INVALID_DEVICE;
+#endif
+
+#if INIT_CUDA_D3D11
+    if (cuInitD3D11(0, __CUDA_API_VERSION, CudaDrvLib) != CUDA_SUCCESS)
+        return CUDA_ERROR_INVALID_DEVICE;
+#endif
+
+    return CUDA_SUCCESS;
+}
+
diff --git a/dynlink_cuda_cuda.h b/dynlink_cuda_cuda.h
@@ -0,0 +1,1685 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#ifndef __cuda_cuda_h__
+#define __cuda_cuda_h__
+
+#include <stdlib.h>
+
+#ifndef __CUDA_API_VERSION
+#define __CUDA_API_VERSION 4000
+#endif
+
+/**
+ * \defgroup CUDA_DRIVER CUDA Driver API
+ *
+ * This section describes the low-level CUDA driver application programming
+ * interface.
+ *
+ * @{
+ */
+
+/**
+ * \defgroup CUDA_TYPES Data types used by CUDA driver
+ * @{
+ */
+
+/**
+ * CUDA API version number
+ */
+#define CUDA_VERSION 4000 /* 4.0 */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * CUDA device pointer
+ *
+ * Declared here only when __CUDA_API_VERSION >= 3020. The type is
+ * unsigned long long on targets identified by the x86-64 / AArch64 macros
+ * tested below and unsigned int on every other target.
+ */
+#if __CUDA_API_VERSION >= 3020
+
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__aarch64__)
+    typedef unsigned long long CUdeviceptr;
+#else
+    typedef unsigned int CUdeviceptr;
+#endif
+
+#endif /* __CUDA_API_VERSION >= 3020 */
+/*
+ * Handle types. CUdevice is a plain int; every other handle is a pointer to
+ * a struct type that is only forward-declared by its typedef.
+ */
+typedef int CUdevice;                                     /**< CUDA device */
+typedef struct CUctx_st *CUcontext;                       /**< CUDA context */
+typedef struct CUmod_st *CUmodule;                        /**< CUDA module */
+typedef struct CUfunc_st *CUfunction;                     /**< CUDA function */
+typedef struct CUarray_st *CUarray;                       /**< CUDA array */
+typedef struct CUtexref_st *CUtexref;                     /**< CUDA texture reference */
+typedef struct CUsurfref_st *CUsurfref;                   /**< CUDA surface reference */
+typedef struct CUevent_st *CUevent;                       /**< CUDA event */
+typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
+typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
+
+typedef struct CUuuid_st                                  /**< CUDA definition of UUID: a 16-byte value */
+{
+    char bytes[16];                                       /**< The 16 bytes of the UUID */
+} CUuuid;
+
+/**
+ * Context creation flags
+ *
+ * CU_CTX_BLOCKING_SYNC is a deprecated alias with the same value as
+ * CU_CTX_SCHED_BLOCKING_SYNC. The two mask values and the presence of
+ * CU_CTX_PRIMARY depend on __CUDA_API_VERSION.
+ */
+typedef enum CUctx_flags_enum
+{
+    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
+    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
+    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
+    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */
+    CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
+    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
+#if __CUDA_API_VERSION < 4000
+    CU_CTX_SCHED_MASK          = 0x03, /**< Bits 0x01 | 0x02 (spin and yield) */
+    CU_CTX_FLAGS_MASK          = 0x1f  /**< All bits from 0x01 through 0x10 */
+#else
+    CU_CTX_SCHED_MASK          = 0x07, /**< Bits 0x01 | 0x02 | 0x04 (spin, yield and blocking sync) */
+    CU_CTX_PRIMARY             = 0x20, /**< Initialize and return the primary context */
+    CU_CTX_FLAGS_MASK          = 0x3f  /**< All bits from 0x01 through 0x20 */
+#endif
+} CUctx_flags;
+
+/**
+ * Event creation flags
+ *
+ * CU_EVENT_DEFAULT is 0; the other two flags occupy distinct bits (1 and 2).
+ */
+typedef enum CUevent_flags_enum
+{
+    CU_EVENT_DEFAULT        = 0, /**< Default event flag */
+    CU_EVENT_BLOCKING_SYNC  = 1, /**< Event uses blocking synchronization */
+    CU_EVENT_DISABLE_TIMING = 2  /**< Event will not record timing data */
+} CUevent_flags;
+
+/**
+ * Array formats
+ *
+ * The values are not contiguous: unsigned integer formats use 0x01-0x03,
+ * signed integer formats 0x08-0x0a, and the floating point formats 0x10
+ * and 0x20.
+ */
+typedef enum CUarray_format_enum
+{
+    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */
+} CUarray_format;
+
+/**
+ * Texture reference addressing modes
+ *
+ * Four mutually exclusive modes numbered consecutively from 0.
+ */
+typedef enum CUaddress_mode_enum
+{
+    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
+    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
+    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
+} CUaddress_mode;
+
+/**
+ * Texture reference filtering modes
+ *
+ * Two modes: point (0) and linear (1).
+ */
+typedef enum CUfilter_mode_enum
+{
+    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
+} CUfilter_mode;
+
+/**
+ * Device attributes
+ *
+ * Values 8 and 12 each appear twice because a deprecated name is kept as an
+ * alias of the current one. Attributes 36 and above are declared only when
+ * __CUDA_API_VERSION >= 4000.
+ */
+typedef enum CUdevice_attribute_enum
+{
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                    /**< Maximum block dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                     /**< Maximum grid dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                     /**< Maximum grid dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                     /**< Maximum grid dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,        /**< Maximum shared memory available per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,            /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,              /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                         /**< Warp size in threads */
+    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                         /**< Maximum pitch in bytes allowed by memory copies */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,           /**< Maximum number of 32-bit registers available per block */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,               /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                        /**< Peak clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                 /**< Alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                       /**< Device can possibly copy memory and execute a kernel concurrently */
+    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,              /**< Number of multiprocessors on device */
+    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,               /**< Specifies whether there is a run time limit on kernels */
+    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                        /**< Device is integrated with host memory */
+    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,               /**< Device can map host memory into CUDA address space */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                      /**< Compute mode (See ::CUcomputemode for details) */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,           /**< Maximum 1D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,           /**< Maximum 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,          /**< Maximum 2D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,           /**< Maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,          /**< Maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,           /**< Maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,     /**< Maximum texture array width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,    /**< Maximum texture array height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture array */
+    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                 /**< Alignment requirement for surfaces */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                /**< Device can possibly execute multiple kernels concurrently */
+    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                       /**< Device has ECC support enabled */
+    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                        /**< PCI bus ID of the device */
+    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                     /**< PCI device ID of the device */
+    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35                         /**< Device is using TCC driver model */
+#if __CUDA_API_VERSION >= 4000
+  , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                 /**< Peak memory clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,           /**< Global memory bus width in bits */
+    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                     /**< Size of L2 cache in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,    /**< Maximum resident threads per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                /**< Number of asynchronous engines */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                /**< Device shares a unified address space with the host */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,   /**< Maximum 1D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43   /**< Maximum layers in a 1D layered texture */
+#endif
+} CUdevice_attribute;
+
+/**
+ * Legacy device properties
+ *
+ * All members are plain ints.
+ * NOTE(review): presumably the structure filled in by
+ * cuDeviceGetProperties - confirm against the function declarations.
+ */
+typedef struct CUdevprop_st
+{
+    int maxThreadsPerBlock;     /**< Maximum number of threads per block */
+    int maxThreadsDim[3];       /**< Maximum size of each dimension of a block */
+    int maxGridSize[3];         /**< Maximum size of each dimension of a grid */
+    int sharedMemPerBlock;      /**< Shared memory available per block in bytes */
+    int totalConstantMemory;    /**< Constant memory available on device in bytes */
+    int SIMDWidth;              /**< Warp size in threads */
+    int memPitch;               /**< Maximum pitch in bytes allowed by memory copies */
+    int regsPerBlock;           /**< 32-bit registers available per block */
+    int clockRate;              /**< Clock frequency in kilohertz */
+    int textureAlign;           /**< Alignment requirement for textures */
+} CUdevprop;
+
+/**
+ * Function properties
+ *
+ * Attributes are numbered consecutively from 0; CU_FUNC_ATTRIBUTE_MAX
+ * follows the last one.
+ */
+typedef enum CUfunction_attribute_enum
+{
+    /**
+     * The maximum number of threads per block, beyond which a launch of the
+     * function would fail. This number depends on both the function and the
+     * device on which the function is currently loaded.
+     */
+    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+
+    /**
+     * The size in bytes of statically-allocated shared memory required by
+     * this function. This does not include dynamically-allocated shared
+     * memory requested by the user at runtime.
+     */
+    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+
+    /**
+     * The size in bytes of user-allocated constant memory required by this
+     * function.
+     */
+    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+
+    /**
+     * The size in bytes of local memory used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+
+    /**
+     * The number of registers used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+
+    /**
+     * The PTX virtual architecture version for which the function was
+     * compiled. This value is the major PTX version * 10 + the minor PTX
+     * version, so a PTX version 1.3 function would return the value 13.
+     * Note that this may return the undefined value of 0 for cubins
+     * compiled prior to CUDA 3.0.
+     */
+    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+
+    /**
+     * The binary architecture version for which the function was compiled.
+     * This value is the major binary version * 10 + the minor binary version,
+     * so a binary version 1.3 function would return the value 13. Note that
+     * this will return a value of 10 for legacy cubins that do not have a
+     * properly-encoded binary architecture version.
+     */
+    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+
+    CU_FUNC_ATTRIBUTE_MAX /**< One past the last attribute (7, the number of attributes above) */
+} CUfunction_attribute;
+
+/**
+ * Function cache configurations
+ *
+ * Expresses a preference between shared memory and L1 cache size;
+ * CU_FUNC_CACHE_PREFER_NONE (0) is the default.
+ */
+typedef enum CUfunc_cache_enum
+{
+    CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< no preference for shared memory or L1 (default) */
+    CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< prefer larger shared memory and smaller L1 cache */
+    CU_FUNC_CACHE_PREFER_L1      = 0x02  /**< prefer larger L1 cache and smaller shared memory */
+} CUfunc_cache;
+
+/**
+ * Memory types
+ *
+ * Values start at 0x01. CU_MEMORYTYPE_UNIFIED is declared only when
+ * __CUDA_API_VERSION >= 4000.
+ */
+typedef enum CUmemorytype_enum
+{
+    CU_MEMORYTYPE_HOST    = 0x01,    /**< Host memory */
+    CU_MEMORYTYPE_DEVICE  = 0x02,    /**< Device memory */
+    CU_MEMORYTYPE_ARRAY   = 0x03     /**< Array memory */
+#if __CUDA_API_VERSION >= 4000
+  , CU_MEMORYTYPE_UNIFIED = 0x04     /**< Unified device or host memory */
+#endif
+} CUmemorytype;
+
+/**
+ * Compute Modes
+ *
+ * CU_COMPUTEMODE_EXCLUSIVE_PROCESS is declared only when
+ * __CUDA_API_VERSION >= 4000.
+ */
+typedef enum CUcomputemode_enum
+{
+    CU_COMPUTEMODE_DEFAULT           = 0,  /**< Default compute mode (Multiple contexts allowed per device) */
+    CU_COMPUTEMODE_EXCLUSIVE         = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */
+    CU_COMPUTEMODE_PROHIBITED        = 2  /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
+#if __CUDA_API_VERSION >= 4000
+  , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
+#endif
+} CUcomputemode;
+
+/**
+ * Online compiler options
+ *
+ * Only the first enumerator has an explicit value (0); the others follow
+ * consecutively. Each entry states the C type of the option value it
+ * expects.
+ */
+typedef enum CUjit_option_enum
+{
+    /**
+     * Max number of registers that a thread may use.\n
+     * Option type: unsigned int
+     */
+    CU_JIT_MAX_REGISTERS = 0,
+
+    /**
+     * IN: Specifies minimum number of threads per block to target compilation
+     * for\n
+     * OUT: Returns the number of threads the compiler actually targeted.
+     * This restricts the resource utilization of the compiler (e.g. max
+     * registers) such that a block with the given number of threads should be
+     * able to launch based on register limitations. Note, this option does not
+     * currently take into account any other resource limitations, such as
+     * shared memory utilization.\n
+     * Option type: unsigned int
+     */
+    CU_JIT_THREADS_PER_BLOCK,
+
+    /**
+     * Returns a float value in the option of the wall clock time, in
+     * milliseconds, spent creating the cubin\n
+     * Option type: float
+     */
+    CU_JIT_WALL_TIME,
+
+    /**
+     * Pointer to a buffer in which to print any log messages from PTXAS
+     * that are informational in nature (the buffer size is specified via
+     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
+     * Option type: char*
+     */
+    CU_JIT_INFO_LOG_BUFFER,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int
+     */
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+
+    /**
+     * Pointer to a buffer in which to print any log messages from PTXAS that
+     * reflect errors (the buffer size is specified via option
+     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char*
+     */
+    CU_JIT_ERROR_LOG_BUFFER,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int
+     */
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+
+    /**
+     * Level of optimizations to apply to generated code (0 - 4), with 4
+     * being the default and highest level of optimizations.\n
+     * Option type: unsigned int
+     */
+    CU_JIT_OPTIMIZATION_LEVEL,
+
+    /**
+     * No option value required. Determines the target based on the current
+     * attached context (default)\n
+     * Option type: No option value needed
+     */
+    CU_JIT_TARGET_FROM_CUCONTEXT,
+
+    /**
+     * Target is chosen based on supplied ::CUjit_target_enum.\n
+     * Option type: unsigned int for enumerated type ::CUjit_target_enum
+     */
+    CU_JIT_TARGET,
+
+    /**
+     * Specifies choice of fallback strategy if matching cubin is not found.
+     * Choice is based on supplied ::CUjit_fallback_enum.\n
+     * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
+     */
+    CU_JIT_FALLBACK_STRATEGY
+
+} CUjit_option;
+
+/**
+ * Online compilation targets
+ *
+ * Values run consecutively from 0 (compute 1.0) to 5 (compute 2.1).
+ */
+typedef enum CUjit_target_enum
+{
+    CU_TARGET_COMPUTE_10 = 0,   /**< Compute device class 1.0 */
+    CU_TARGET_COMPUTE_11,       /**< Compute device class 1.1 */
+    CU_TARGET_COMPUTE_12,       /**< Compute device class 1.2 */
+    CU_TARGET_COMPUTE_13,       /**< Compute device class 1.3 */
+    CU_TARGET_COMPUTE_20,       /**< Compute device class 2.0 */
+    CU_TARGET_COMPUTE_21        /**< Compute device class 2.1 */
+} CUjit_target;
+
+/**
+ * Cubin matching fallback strategies
+ *
+ * CU_PREFER_PTX is 0 and CU_PREFER_BINARY is 1.
+ */
+typedef enum CUjit_fallback_enum
+{
+    CU_PREFER_PTX = 0,  /**< Prefer to compile ptx */
+
+    CU_PREFER_BINARY    /**< Prefer to fall back to compatible binary code */
+
+} CUjit_fallback;
+
+/**
+ * Flags to register a graphics resource
+ *
+ * NONE is 0; the other values are single, distinct bits.
+ * NOTE(review): the per-flag descriptions below are inferred from the
+ * names - confirm against the CUDA driver API documentation.
+ */
+typedef enum CUgraphicsRegisterFlags_enum
+{
+    CU_GRAPHICS_REGISTER_FLAGS_NONE          = 0x00, /**< No flag set */
+    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY     = 0x01, /**< Presumably: CUDA only reads the resource */
+    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, /**< Presumably: CUDA overwrites the resource without reading it */
+    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST  = 0x04  /**< Presumably: resource is used for surface load/store */
+} CUgraphicsRegisterFlags;
+
+/**
+ * Flags for mapping and unmapping interop resources
+ *
+ * The numeric values match the first three CUgraphicsRegisterFlags values.
+ * NOTE(review): the per-flag descriptions below are inferred from the
+ * names - confirm against the CUDA driver API documentation.
+ */
+typedef enum CUgraphicsMapResourceFlags_enum
+{
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00, /**< No flag set */
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01, /**< Presumably: CUDA only reads the mapped resource */
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02  /**< Presumably: CUDA overwrites the mapped resource without reading it */
+} CUgraphicsMapResourceFlags;
+
+/**
+ * Array indices for cube faces
+ *
+ * Faces are numbered 0 to 5 in the order +X, -X, +Y, -Y, +Z, -Z.
+ */
+typedef enum CUarray_cubemap_face_enum
+{
+    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, /**< Positive X face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, /**< Negative X face of cubemap */
+    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, /**< Positive Y face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03, /**< Negative Y face of cubemap */
+    CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04, /**< Positive Z face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05  /**< Negative Z face of cubemap */
+} CUarray_cubemap_face;
+
+/**
+ * Limits
+ *
+ * Identifiers of the three size limits listed below, numbered from 0.
+ * NOTE(review): presumably the selector passed to cuCtxSetLimit /
+ * cuCtxGetLimit - confirm against the function declarations.
+ */
+typedef enum CUlimit_enum
+{
+    CU_LIMIT_STACK_SIZE        = 0x00, /**< GPU thread stack size */
+    CU_LIMIT_PRINTF_FIFO_SIZE  = 0x01, /**< GPU printf FIFO size */
+    CU_LIMIT_MALLOC_HEAP_SIZE  = 0x02  /**< GPU malloc heap size */
+} CUlimit;
+
+/**
+ * Error codes (CUresult). Codes are grouped in numeric ranges (0-8, 1xx, 2xx, 3xx, 400, 500, 600, 7xx, 999); 203 and 204 are not defined here.
+ */
+typedef enum cudaError_enum
+{
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * can also mean that the operation being queried is complete (see
+     * ::cuEventQuery() and ::cuStreamQuery()).
+     */
+    CUDA_SUCCESS                              = 0,
+
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    CUDA_ERROR_INVALID_VALUE                  = 1,
+
+    /**
+     * The API call failed because it was unable to allocate enough memory to
+     * perform the requested operation.
+     */
+    CUDA_ERROR_OUT_OF_MEMORY                  = 2,
+
+    /**
+     * This indicates that the CUDA driver has not been initialized with
+     * ::cuInit() or that initialization has failed.
+     */
+    CUDA_ERROR_NOT_INITIALIZED                = 3,
+
+    /**
+     * This indicates that the CUDA driver is in the process of shutting down.
+     */
+    CUDA_ERROR_DEINITIALIZED                  = 4,
+
+    /**
+     * This indicates profiling APIs are called while application is running
+     * in visual profiler mode.
+     */
+    CUDA_ERROR_PROFILER_DISABLED           = 5,
+    /**
+     * This indicates profiling has not been initialized for this context.
+     * Call cuProfilerInitialize() to resolve this.
+     */
+    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,
+    /**
+     * This indicates profiler has already been started and probably
+     * cuProfilerStart() is incorrectly called.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,
+    /**
+     * This indicates profiler has already been stopped and probably
+     * cuProfilerStop() is incorrectly called.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    CUDA_ERROR_NO_DEVICE                      = 100,
+
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device.
+     */
+    CUDA_ERROR_INVALID_DEVICE                 = 101,
+
+
+    /**
+     * This indicates that the device kernel image is invalid. This can also
+     * indicate an invalid CUDA module.
+     */
+    CUDA_ERROR_INVALID_IMAGE                  = 200,
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     */
+    CUDA_ERROR_INVALID_CONTEXT                = 201,
+
+    /**
+     * This indicates that the context being supplied as a parameter to the
+     * API call was already the active context.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.2. It is no longer an
+     * error to attempt to push the active context via ::cuCtxPushCurrent().
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
+
+    /**
+     * This indicates that a map or register operation has failed.
+     */
+    CUDA_ERROR_MAP_FAILED                     = 205,
+
+    /**
+     * This indicates that an unmap or unregister operation has failed.
+     */
+    CUDA_ERROR_UNMAP_FAILED                   = 206,
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    CUDA_ERROR_ALREADY_MAPPED                 = 208,
+
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    CUDA_ERROR_ALREADY_ACQUIRED               = 210,
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    CUDA_ERROR_NOT_MAPPED                     = 211,
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
+
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
+
+    /**
+     * This indicates that the ::CUlimit passed to the API call is not
+     * supported by the active device.
+     */
+    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
+
+    /**
+     * This indicates that the ::CUcontext passed to the API call can
+     * only be bound to a single CPU thread at a time but is already
+     * bound to a CPU thread.
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,
+
+    /**
+     * This indicates that the device kernel source is invalid.
+     */
+    CUDA_ERROR_INVALID_SOURCE                 = 300,
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    CUDA_ERROR_FILE_NOT_FOUND                 = 301,
+
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
+
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
+
+    /**
+     * This indicates that an OS call failed.
+     */
+    CUDA_ERROR_OPERATING_SYSTEM               = 304,
+
+
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
+     */
+    CUDA_ERROR_INVALID_HANDLE                 = 400,
+
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, texture names, and surface names.
+     */
+    CUDA_ERROR_NOT_FOUND                      = 500,
+
+
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
+     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
+     */
+    CUDA_ERROR_NOT_READY                      = 600,
+
+
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. The context cannot be used, so it must
+     * be destroyed (and a new one should be created). All existing device
+     * memory allocations from this context are invalid and must be
+     * reconstructed if the program is to continue using CUDA.
+     */
+    CUDA_ERROR_LAUNCH_FAILED                  = 700,
+
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. This error usually indicates that the user has
+     * attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register
+     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
+     * when a 32-bit int is expected) is equivalent to passing too many
+     * arguments and can also result in this error.
+     */
+    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
+
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device attribute
+     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
+     * context cannot be used (and must be destroyed similar to
+     * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
+     * this context are invalid and must be reconstructed if the program is to
+     * continue using CUDA.
+     */
+    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
+
+    /**
+     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
+     * trying to re-enable peer access to a context which has already
+     * had peer access to it enabled.
+     */
+    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,
+
+    /**
+     * This error indicates that a call to ::cuMemPeerRegister is trying to
+     * register memory from a context which has not had peer access
+     * enabled yet via ::cuCtxEnablePeerAccess(), or that
+     * ::cuCtxDisablePeerAccess() is trying to disable peer access
+     * which has not been enabled yet.
+     */
+    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED    = 705,
+
+    /**
+     * This error indicates that a call to ::cuMemPeerRegister is trying to
+     * register already-registered memory.
+     */
+    CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706,
+
+    /**
+     * This error indicates that a call to ::cuMemPeerUnregister is trying to
+     * unregister memory that has not been registered.
+     */
+    CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED     = 707,
+
+    /**
+     * This error indicates that ::cuCtxCreate was called with the flag
+     * ::CU_CTX_PRIMARY on a device which already has initialized its
+     * primary context.
+     */
+    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    CUDA_ERROR_UNKNOWN                        = 999
+} CUresult;
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_PORTABLE        0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_DEVICEMAP       0x02
+
+/**
+ * If set, host memory is allocated as write-combined - fast to write,
+ * faster to DMA, slow to read except via SSE4 streaming load instruction
+ * (MOVNTDQA).
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostRegister() (same numeric value as CU_MEMHOSTALLOC_PORTABLE)
+ */
+#define CU_MEMHOSTREGISTER_PORTABLE     0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostRegister() (same numeric value as CU_MEMHOSTALLOC_DEVICEMAP)
+ */
+#define CU_MEMHOSTREGISTER_DEVICEMAP    0x02
+
+/**
+ * If set, peer memory is mapped into CUDA address space and
+ * ::cuMemPeerGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemPeerRegister() (same numeric value as CU_MEMHOSTREGISTER_DEVICEMAP)
+ */
+#define CU_MEMPEERREGISTER_DEVICEMAP    0x02
+#endif /* __CUDA_API_VERSION >= 4000 */
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * 2D memory copy parameters (size_t-based layout, declared only when __CUDA_API_VERSION >= 3020)
+ */
+typedef struct CUDA_MEMCPY2D_st
+{
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+
+    size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
+    size_t Height;              /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D;
+
+/**
+ * 3D memory copy parameters (size_t-based layout, declared only when __CUDA_API_VERSION >= 3020)
+ */
+typedef struct CUDA_MEMCPY3D_st
+{
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    void *reserved0;            /**< Must be NULL (CUDA_MEMCPY3D_PEER has srcContext in this position) */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    void *reserved1;            /**< Must be NULL (CUDA_MEMCPY3D_PEER has dstContext in this position) */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D;
+
+/**
+ * 3D memory cross-context copy parameters (same member order as CUDA_MEMCPY3D, with contexts in place of the reserved pointers)
+ */
+typedef struct CUDA_MEMCPY3D_PEER_st
+{
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    CUcontext srcContext;       /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    CUcontext dstContext;       /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_PEER;
+
+/**
+ * Array descriptor (size_t-based layout, declared only when __CUDA_API_VERSION >= 3020)
+ */
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of array */
+    size_t Height;            /**< Height of array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+
+/**
+ * 3D array descriptor (size_t-based layout, declared only when __CUDA_API_VERSION >= 3020)
+ */
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of 3D array */
+    size_t Height;            /**< Height of 3D array */
+    size_t Depth;             /**< Depth of 3D array (number of layers when CUDA_ARRAY3D_LAYERED is set, per that macro's documentation) */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+    unsigned int Flags;       /**< Flags (presumably CUDA_ARRAY3D_* bits such as CUDA_ARRAY3D_LAYERED - confirm) */
+} CUDA_ARRAY3D_DESCRIPTOR;
+
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+/**
+ * If set, the CUDA array is a collection of layers, where each layer is either a 1D
+ * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
+ * of layers, not the depth of a 3D array.
+ */
+#define CUDA_ARRAY3D_LAYERED        0x01
+
+/**
+ * Deprecated, use CUDA_ARRAY3D_LAYERED (same value, 0x01)
+ */
+#define CUDA_ARRAY3D_2DARRAY        0x01
+
+/**
+ * This flag must be set in order to bind a surface reference
+ * to the CUDA array (defined again with the same value later in this header)
+ */
+#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray() (value 0x01)
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags() (bit 0x01)
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags() (bit 0x02)
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags() (bit 0x10; 0x04 and 0x08 are not defined in this group)
+ */
+#define CU_TRSF_SRGB  0x10
+
+/**
+ * End of array terminator for the \p extra parameter to
+ * ::cuLaunchKernel (defined as the constant 0x00 cast to void*)
+ */
+#define CU_LAUNCH_PARAM_END            ((void*)0x00)
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
+ * parameters used for launching kernel \p f.  This buffer needs to
+ * honor all alignment/padding requirements of the individual parameters.
+ * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
+ * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
+ * effect. (Defined as the constant 0x01 cast to void*.)
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a size_t which contains the
+ * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
+ * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
+ * in the \p extra array if the value associated with
+ * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. (Defined as the constant 0x02 cast to void*.)
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE    ((void*)0x02)
+
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference (value -1; defined again with the same value later in this header).
+ */
+#define CU_PARAM_TR_DEFAULT -1
+
+/**
+ * CUDA API made obsolete at API version 3020. With __CUDA_API_VERSION_INTERNAL defined, these macros give the legacy declarations that follow *_v1 names.
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #define CUdeviceptr                  CUdeviceptr_v1
+    #define CUDA_MEMCPY2D_st             CUDA_MEMCPY2D_v1_st
+    #define CUDA_MEMCPY2D                CUDA_MEMCPY2D_v1
+    #define CUDA_MEMCPY3D_st             CUDA_MEMCPY3D_v1_st
+    #define CUDA_MEMCPY3D                CUDA_MEMCPY3D_v1
+    #define CUDA_ARRAY_DESCRIPTOR_st     CUDA_ARRAY_DESCRIPTOR_v1_st
+    #define CUDA_ARRAY_DESCRIPTOR        CUDA_ARRAY_DESCRIPTOR_v1
+    #define CUDA_ARRAY3D_DESCRIPTOR_st   CUDA_ARRAY3D_DESCRIPTOR_v1_st
+    #define CUDA_ARRAY3D_DESCRIPTOR      CUDA_ARRAY3D_DESCRIPTOR_v1
+#endif /* defined(__CUDA_API_VERSION_INTERNAL) */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
+typedef unsigned int CUdeviceptr; /**< Legacy device pointer type: unsigned int */
+
+typedef struct CUDA_MEMCPY2D_st /* Legacy 2D memory copy parameters: offsets and sizes are unsigned int instead of size_t */
+{
+    unsigned int srcXInBytes;   /**< Source X in bytes */
+    unsigned int srcY;          /**< Source Y */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+
+    unsigned int dstXInBytes;   /**< Destination X in bytes */
+    unsigned int dstY;          /**< Destination Y */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+
+    unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
+    unsigned int Height;        /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D;
+
+typedef struct CUDA_MEMCPY3D_st /* Legacy 3D memory copy parameters: offsets and sizes are unsigned int instead of size_t */
+{
+    unsigned int srcXInBytes;   /**< Source X in bytes */
+    unsigned int srcY;          /**< Source Y */
+    unsigned int srcZ;          /**< Source Z */
+    unsigned int srcLOD;        /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    void *reserved0;            /**< Must be NULL */
+    unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+    unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    unsigned int dstXInBytes;   /**< Destination X in bytes */
+    unsigned int dstY;          /**< Destination Y */
+    unsigned int dstZ;          /**< Destination Z */
+    unsigned int dstLOD;        /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    void *reserved1;            /**< Must be NULL */
+    unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+    unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
+    unsigned int Height;        /**< Height of 3D memory copy */
+    unsigned int Depth;         /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D;
+
+typedef struct CUDA_ARRAY_DESCRIPTOR_st /* Legacy array descriptor: dimensions are unsigned int instead of size_t */
+{
+    unsigned int Width;         /**< Width of array */
+    unsigned int Height;        /**< Height of array */
+
+    CUarray_format Format;      /**< Array format */
+    unsigned int NumChannels;   /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st /* Legacy 3D array descriptor: dimensions are unsigned int instead of size_t */
+{
+    unsigned int Width;         /**< Width of 3D array */
+    unsigned int Height;        /**< Height of 3D array */
+    unsigned int Depth;         /**< Depth of 3D array */
+
+    CUarray_format Format;      /**< Array format */
+    unsigned int NumChannels;   /**< Channels per array element */
+    unsigned int Flags;         /**< Flags */
+} CUDA_ARRAY3D_DESCRIPTOR;
+
+#endif /* (__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 */
+
+/*
+ * If set, the CUDA array contains an array of 2D slices
+ * and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
+ * the number of slices, not the depth of a 3D array. (Repeats the earlier definition with the same value.)
+ */
+#define CUDA_ARRAY3D_2DARRAY        0x01
+
+/**
+ * This flag must be set in order to bind a surface reference
+ * to the CUDA array (repeats the earlier definition with the same value)
+ */
+#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray() (repeats the earlier definition with the same value)
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags() (repeats the earlier definition with the same value)
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags() (repeats the earlier definition with the same value)
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags() (repeats the earlier definition with the same value)
+ */
+#define CU_TRSF_SRGB  0x10
+
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference (repeats the earlier definition with the same value).
+ */
+#define CU_PARAM_TR_DEFAULT -1
+
+/** @} */ /* END CUDA_TYPES */
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    #define CUDAAPI __stdcall /* Windows builds: the function types declared with CUDAAPI are __stdcall */
+#else
+    #define CUDAAPI /* other platforms: expands to nothing */
+#endif /* WIN32 || _WIN32 || WIN64 || _WIN64 */
+
+/**
+ * \defgroup CUDA_INITIALIZE Initialization
+ *
+ * This section describes the initialization functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/*********************************
+ ** Initialization: a function type (not a pointer); presumably the signature of the driver's cuInit - confirm
+ *********************************/
+typedef CUresult  CUDAAPI tcuInit(unsigned int Flags);
+
+/*********************************
+ ** Driver Version Query: function type that writes its result through the int pointer argument
+ *********************************/
+typedef CUresult  CUDAAPI tcuDriverGetVersion(int *driverVersion);
+
+/************************************
+ **
+ **    Device management
+ **    (function types, not pointers; each returns CUresult and uses the CUDAAPI calling convention)
+ ***********************************/
+
+typedef CUresult  CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
+typedef CUresult  CUDAAPI tcuDeviceGetCount(int *count);
+typedef CUresult  CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
+typedef CUresult  CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult  CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev); /* byte count is size_t from API version 3020 */
+#else
+    typedef CUresult  CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev); /* byte count is unsigned int before 3020 */
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+typedef CUresult  CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
+typedef CUresult  CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+
+/************************************
+ **
+ **    Context management
+ **    (function types, not pointers; each returns CUresult and uses the CUDAAPI calling convention)
+ ***********************************/
+typedef CUresult  CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+typedef CUresult  CUDAAPI tcuCtxDestroy(CUcontext ctx);
+typedef CUresult  CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
+typedef CUresult  CUDAAPI tcuCtxDetach(CUcontext ctx);
+typedef CUresult  CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
+typedef CUresult  CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);
+
+typedef CUresult  CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
+typedef CUresult  CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);
+
+typedef CUresult  CUDAAPI tcuCtxGetDevice(CUdevice *device);
+typedef CUresult  CUDAAPI tcuCtxSynchronize(void);
+
+
+/************************************
+ **
+ **    Module management
+ **    (function types, not pointers; tcuModuleGetGlobal's size argument changes type at API version 3020)
+ ***********************************/
+typedef CUresult  CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
+typedef CUresult  CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
+typedef CUresult  CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult  CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
+typedef CUresult  CUDAAPI tcuModuleUnload(CUmodule hmod);
+typedef CUresult  CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult  CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
+#else
+    typedef CUresult  CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+typedef CUresult  CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult  CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+
+/************************************
+ **
+ **    Memory management
+ **    (function types, not pointers; sizes and pitches are size_t from API version 3020, unsigned int before)
+ ***********************************/
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
+    typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
+    typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
+    typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
+                                              size_t *pPitch,
+                                              size_t WidthInBytes,
+                                              size_t Height,
+                                              // size of biggest r/w to be performed by kernels on this memory
+                                              // 4, 8 or 16 bytes
+                                              unsigned int ElementSizeBytes
+                                             );
+#else
+    typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
+    typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
+    typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
+    typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
+                                              unsigned int *pPitch,
+                                              unsigned int WidthInBytes,
+                                              unsigned int Height,
+                                              // size of biggest r/w to be performed by kernels on this memory
+                                              // 4, 8 or 16 bytes
+                                              unsigned int ElementSizeBytes
+                                             );
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr); /* same signature for every API version */
+
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize); /* size is size_t from API version 3020 */
+#else
+    typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize); /* size is unsigned int before 3020 */
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
+typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); /* Flags: CU_MEMHOSTALLOC_* */
+/* Function types taking a host pointer p and reporting its device pointer / flags through the first argument */
+typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
+/* Host memory registration (Flags: CU_MEMHOSTREGISTER_*) followed by the generic and cross-context copy function types */
+typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);
+typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+
+/************************************
+ **
+ **    Synchronous Memcpy
+ **
+ ** Intra-device memcpy's done with these functions may execute in parallel with the CPU,
+ ** but if host memory is involved, they wait until the copy is done before returning.
+ **
+ ***********************************/
+// 1D functions
+// The two branches declare the same set of function types; they differ only
+// in the integer type used for byte counts and array offsets
+// (size_t from API version 3020 on, unsigned int otherwise).
+#if __CUDA_API_VERSION >= 3020
+    // system <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+
+    // device <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+
+    // device <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+    // system <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+    // array <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+#else
+    // system <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
+
+    // device <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
+
+    // device <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+
+    // system <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+
+    // array <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+#endif
+
+// 2D memcpy
+// The 2D/3D variants take a single descriptor struct, so their signatures do
+// not depend on the API version here.
+typedef CUresult  CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
+typedef CUresult  CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
+
+// 3D memcpy
+typedef CUresult  CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
+
+/************************************
+ **
+ **    Asynchronous Memcpy
+ **
+ ** Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
+ ** memcpy's done with these functions execute in parallel with the CPU and, if
+ ** the hardware is available, may execute in parallel with the GPU.
+ ** Asynchronous memcpy must be accompanied by appropriate stream synchronization.
+ **
+ ***********************************/
+
+// 1D functions
+// Same shapes as the synchronous 1D copy types plus a trailing CUstream
+// argument. As above, the branches differ only in size_t vs unsigned int for
+// byte counts and offsets.
+#if __CUDA_API_VERSION >= 3020
+    // system <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
+                                                 const void *srcHost, size_t ByteCount, CUstream hStream);
+    typedef CUresult  CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
+                                                 CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+
+    // device <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
+                                                 CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+
+    // system <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
+                                                 const void *srcHost, size_t ByteCount, CUstream hStream);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
+                                                 size_t ByteCount, CUstream hStream);
+#else
+    // system <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
+                                                 const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult  CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
+                                                 CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+
+    // device <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
+                                                 CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+
+    // system <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
+                                                 const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
+                                                 unsigned int ByteCount, CUstream hStream);
+#endif
+
+// 2D memcpy
+typedef CUresult  CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+
+// 3D memcpy
+typedef CUresult  CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+
+/************************************
+ **
+ **    Memset
+ **
+ ***********************************/
+// 1D fills: the D8/D16/D32 suffix matches the width of the fill value
+// (unsigned char / unsigned short / unsigned int); N is the element count.
+typedef CUresult  CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
+typedef CUresult  CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
+typedef CUresult  CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
+
+// 2D fills: only Width/Height change type with the API version.
+// NOTE(review): dstPitch (and N above) stay unsigned int in both branches
+// while Width/Height become size_t - confirm this matches the driver symbols
+// that are actually loaded into these pointers.
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult  CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
+    typedef CUresult  CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
+    typedef CUresult  CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
+#else
+    typedef CUresult  CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    typedef CUresult  CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    typedef CUresult  CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+#endif
+
+/************************************
+ **
+ **    Function management
+ **
+ ***********************************/
+
+
+// Function types operating on a CUfunction handle: block shape, shared
+// memory size, attribute query, cache configuration, and a launch call that
+// takes grid/block dimensions, a stream and the kernel argument arrays.
+typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
+typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
+typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
+                                         unsigned int gridDimX,  unsigned int gridDimY,  unsigned int gridDimZ,
+                                         unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+                                         unsigned int sharedMemBytes,
+                                         CUstream hStream, void **kernelParams, void **extra);
+
+/************************************
+ **
+ **    Array management
+ **
+ ***********************************/
+
+// Create / describe / destroy function types for CUarray handles. The 3D
+// variants take CUDA_ARRAY3D_DESCRIPTOR; no separate 3D destroy type is
+// declared here.
+typedef CUresult  CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
+typedef CUresult  CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+typedef CUresult  CUDAAPI tcuArrayDestroy(CUarray hArray);
+
+typedef CUresult  CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+typedef CUresult  CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+
+
+/************************************
+ **
+ **    Texture reference management
+ **
+ ***********************************/
+// Function types for CUtexref handles: lifetime, setters, then getters.
+typedef CUresult  CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
+typedef CUresult  CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
+
+typedef CUresult  CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+
+// Only the address setters depend on the API version: offsets, sizes and
+// pitch are size_t from 3020 on, unsigned int otherwise.
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult  CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+    typedef CUresult  CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+#else
+    typedef CUresult  CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
+    typedef CUresult  CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
+#endif
+
+typedef CUresult  CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+typedef CUresult  CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
+typedef CUresult  CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult  CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
+
+// Getters mirror the setters above and return their result through the
+// leading out-pointer parameter(s).
+typedef CUresult  CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+typedef CUresult  CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
+
+/************************************
+ **
+ **    Surface reference management
+ **
+ ***********************************/
+// Set/get function types associating a CUarray with a CUsurfref handle.
+typedef CUresult  CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+typedef CUresult  CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+
+/************************************
+ **
+ **    Parameter management
+ **
+ ***********************************/
+
+// Function types that set parameter data on a CUfunction: total size, then
+// an int / float / raw-bytes value at a given offset, or a texture reference
+// for a given texunit.
+typedef CUresult  CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
+typedef CUresult  CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
+typedef CUresult  CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
+typedef CUresult  CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+typedef CUresult  CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
+
+
+/************************************
+ **
+ **    Launch functions
+ **
+ ***********************************/
+
+// Launch function types taking only a CUfunction, optionally with a 2D grid
+// size and a stream (contrast tcuLaunchKernel, which takes all dimensions
+// and arguments explicitly).
+typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
+typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
+typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+
+/************************************
+ **
+ **    Events
+ **
+ ***********************************/
+// Function types for CUevent handles: create/destroy, record on a stream,
+// query/synchronize, and elapsed time between two events (written to
+// *pMilliseconds).
+typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
+typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
+typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+
+/************************************
+ **
+ **    Streams
+ **
+ ***********************************/
+// Function types for CUstream handles: create, query, synchronize, destroy.
+typedef CUresult CUDAAPI  tcuStreamCreate(CUstream *phStream, unsigned int Flags);
+typedef CUresult CUDAAPI  tcuStreamQuery(CUstream hStream);
+typedef CUresult CUDAAPI  tcuStreamSynchronize(CUstream hStream);
+typedef CUresult CUDAAPI  tcuStreamDestroy(CUstream hStream);
+
+/************************************
+ **
+ **    Graphics interop
+ **
+ ***********************************/
+// Function types operating on CUgraphicsResource handles. No register
+// function type is declared in this section.
+typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+// Only the type of the returned size depends on the API version.
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+#else
+    typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+#endif
+
+typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+
+/************************************
+ **
+ **    Export tables
+ **
+ ***********************************/
+// Function type that returns an opaque table pointer (via ppExportTable) for
+// the table identified by a CUuuid.
+typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
+
+/************************************
+ **
+ **    Limits
+ **
+ ***********************************/
+
+// Set/get function types for a size_t value selected by a CUlimit key.
+typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
+typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
+
+
+// One function pointer per driver entry point, typed with the tcuXxx function
+// types declared above. They are only declared (extern) here; the definitions
+// live in some other translation unit.
+// NOTE(review): presumably these are filled in by cuInit() below when it
+// loads the driver library - confirm against the loader implementation, and
+// check for null before calling if loading can fail.
+extern tcuDriverGetVersion             *cuDriverGetVersion;
+extern tcuDeviceGet                    *cuDeviceGet;
+extern tcuDeviceGetCount               *cuDeviceGetCount;
+extern tcuDeviceGetName                *cuDeviceGetName;
+extern tcuDeviceComputeCapability      *cuDeviceComputeCapability;
+extern tcuDeviceGetProperties          *cuDeviceGetProperties;
+extern tcuDeviceGetAttribute           *cuDeviceGetAttribute;
+extern tcuCtxDestroy                   *cuCtxDestroy;
+extern tcuCtxAttach                    *cuCtxAttach;
+extern tcuCtxDetach                    *cuCtxDetach;
+extern tcuCtxPushCurrent               *cuCtxPushCurrent;
+extern tcuCtxPopCurrent                *cuCtxPopCurrent;
+
+extern tcuCtxSetCurrent                *cuCtxSetCurrent;
+extern tcuCtxGetCurrent                *cuCtxGetCurrent;
+
+extern tcuCtxGetDevice                 *cuCtxGetDevice;
+extern tcuCtxSynchronize               *cuCtxSynchronize;
+extern tcuModuleLoad                   *cuModuleLoad;
+extern tcuModuleLoadData               *cuModuleLoadData;
+extern tcuModuleLoadDataEx             *cuModuleLoadDataEx;
+extern tcuModuleLoadFatBinary          *cuModuleLoadFatBinary;
+extern tcuModuleUnload                 *cuModuleUnload;
+extern tcuModuleGetFunction            *cuModuleGetFunction;
+extern tcuModuleGetTexRef              *cuModuleGetTexRef;
+extern tcuModuleGetSurfRef             *cuModuleGetSurfRef;
+extern tcuMemFreeHost                  *cuMemFreeHost;
+extern tcuMemHostAlloc                 *cuMemHostAlloc;
+extern tcuMemHostGetFlags              *cuMemHostGetFlags;
+
+extern tcuMemHostRegister              *cuMemHostRegister;
+extern tcuMemHostUnregister            *cuMemHostUnregister;
+extern tcuMemcpy                       *cuMemcpy;
+extern tcuMemcpyPeer                   *cuMemcpyPeer;
+
+extern tcuDeviceTotalMem               *cuDeviceTotalMem;
+extern tcuCtxCreate                    *cuCtxCreate;
+extern tcuModuleGetGlobal              *cuModuleGetGlobal;
+extern tcuMemGetInfo                   *cuMemGetInfo;
+extern tcuMemAlloc                     *cuMemAlloc;
+extern tcuMemAllocPitch                *cuMemAllocPitch;
+extern tcuMemFree                      *cuMemFree;
+extern tcuMemGetAddressRange           *cuMemGetAddressRange;
+extern tcuMemAllocHost                 *cuMemAllocHost;
+extern tcuMemHostGetDevicePointer      *cuMemHostGetDevicePointer;
+extern tcuFuncSetBlockShape            *cuFuncSetBlockShape;
+extern tcuFuncSetSharedSize            *cuFuncSetSharedSize;
+extern tcuFuncGetAttribute             *cuFuncGetAttribute;
+extern tcuFuncSetCacheConfig           *cuFuncSetCacheConfig;
+extern tcuLaunchKernel                 *cuLaunchKernel;
+extern tcuArrayDestroy                 *cuArrayDestroy;
+extern tcuTexRefCreate                 *cuTexRefCreate;
+extern tcuTexRefDestroy                *cuTexRefDestroy;
+extern tcuTexRefSetArray               *cuTexRefSetArray;
+extern tcuTexRefSetFormat              *cuTexRefSetFormat;
+extern tcuTexRefSetAddressMode         *cuTexRefSetAddressMode;
+extern tcuTexRefSetFilterMode          *cuTexRefSetFilterMode;
+extern tcuTexRefSetFlags               *cuTexRefSetFlags;
+extern tcuTexRefGetArray               *cuTexRefGetArray;
+extern tcuTexRefGetAddressMode         *cuTexRefGetAddressMode;
+extern tcuTexRefGetFilterMode          *cuTexRefGetFilterMode;
+extern tcuTexRefGetFormat              *cuTexRefGetFormat;
+extern tcuTexRefGetFlags               *cuTexRefGetFlags;
+extern tcuSurfRefSetArray              *cuSurfRefSetArray;
+extern tcuSurfRefGetArray              *cuSurfRefGetArray;
+extern tcuParamSetSize                 *cuParamSetSize;
+extern tcuParamSeti                    *cuParamSeti;
+extern tcuParamSetf                    *cuParamSetf;
+extern tcuParamSetv                    *cuParamSetv;
+extern tcuParamSetTexRef               *cuParamSetTexRef;
+extern tcuLaunch                       *cuLaunch;
+extern tcuLaunchGrid                   *cuLaunchGrid;
+extern tcuLaunchGridAsync              *cuLaunchGridAsync;
+extern tcuEventCreate                  *cuEventCreate;
+extern tcuEventRecord                  *cuEventRecord;
+extern tcuEventQuery                   *cuEventQuery;
+extern tcuEventSynchronize             *cuEventSynchronize;
+extern tcuEventDestroy                 *cuEventDestroy;
+extern tcuEventElapsedTime             *cuEventElapsedTime;
+extern tcuStreamCreate                 *cuStreamCreate;
+extern tcuStreamQuery                  *cuStreamQuery;
+extern tcuStreamSynchronize            *cuStreamSynchronize;
+extern tcuStreamDestroy                *cuStreamDestroy;
+extern tcuGraphicsUnregisterResource   *cuGraphicsUnregisterResource;
+extern tcuGraphicsSubResourceGetMappedArray  *cuGraphicsSubResourceGetMappedArray;
+extern tcuGraphicsResourceSetMapFlags  *cuGraphicsResourceSetMapFlags;
+extern tcuGraphicsMapResources         *cuGraphicsMapResources;
+extern tcuGraphicsUnmapResources       *cuGraphicsUnmapResources;
+extern tcuGetExportTable               *cuGetExportTable;
+extern tcuCtxSetLimit                  *cuCtxSetLimit;
+extern tcuCtxGetLimit                  *cuCtxGetLimit;
+
+// These functions could be using the CUDA 3.2 interface (_v2)
+// (these are the entry points whose function types above switch between
+// size_t and unsigned int on __CUDA_API_VERSION, plus the 2D/3D copy,
+// memset and array-descriptor entry points)
+extern tcuMemcpyHtoD                   *cuMemcpyHtoD;
+extern tcuMemcpyDtoH                   *cuMemcpyDtoH;
+extern tcuMemcpyDtoD                   *cuMemcpyDtoD;
+extern tcuMemcpyDtoA                   *cuMemcpyDtoA;
+extern tcuMemcpyAtoD                   *cuMemcpyAtoD;
+extern tcuMemcpyHtoA                   *cuMemcpyHtoA;
+extern tcuMemcpyAtoH                   *cuMemcpyAtoH;
+extern tcuMemcpyAtoA                   *cuMemcpyAtoA;
+extern tcuMemcpy2D                     *cuMemcpy2D;
+extern tcuMemcpy2DUnaligned            *cuMemcpy2DUnaligned;
+extern tcuMemcpy3D                     *cuMemcpy3D;
+extern tcuMemcpyHtoDAsync              *cuMemcpyHtoDAsync;
+extern tcuMemcpyDtoHAsync              *cuMemcpyDtoHAsync;
+extern tcuMemcpyDtoDAsync              *cuMemcpyDtoDAsync;
+extern tcuMemcpyHtoAAsync              *cuMemcpyHtoAAsync;
+extern tcuMemcpyAtoHAsync              *cuMemcpyAtoHAsync;
+extern tcuMemcpy2DAsync                *cuMemcpy2DAsync;
+extern tcuMemcpy3DAsync                *cuMemcpy3DAsync;
+extern tcuMemsetD8                     *cuMemsetD8;
+extern tcuMemsetD16                    *cuMemsetD16;
+extern tcuMemsetD32                    *cuMemsetD32;
+extern tcuMemsetD2D8                   *cuMemsetD2D8;
+extern tcuMemsetD2D16                  *cuMemsetD2D16;
+extern tcuMemsetD2D32                  *cuMemsetD2D32;
+extern tcuArrayCreate                  *cuArrayCreate;
+extern tcuArrayGetDescriptor           *cuArrayGetDescriptor;
+extern tcuArray3DCreate                *cuArray3DCreate;
+extern tcuArray3DGetDescriptor         *cuArray3DGetDescriptor;
+extern tcuTexRefSetAddress             *cuTexRefSetAddress;
+extern tcuTexRefSetAddress2D           *cuTexRefSetAddress2D;
+extern tcuTexRefGetAddress             *cuTexRefGetAddress;
+extern tcuGraphicsResourceGetMappedPointer   *cuGraphicsResourceGetMappedPointer;
+
+/************************************/
+// Unlike every other cu* name in this header, cuInit is declared as an
+// ordinary function rather than a function pointer. Its first parameter is
+// unnamed; the other two carry an integer version and an opaque handle.
+// NOTE(review): presumably cudaVersion selects which driver symbol variants
+// are bound and hHandleDriver is the loaded driver library handle - confirm
+// against the implementation.
+CUresult CUDAAPI cuInit   (unsigned int, int cudaVersion, void *hHandleDriver);
+/************************************/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //__cuda_cuda_h__
diff --git a/nvCPUOPSys.h b/nvCPUOPSys.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#ifndef NVCPUOPSYS_H
+#define NVCPUOPSYS_H
+
+// Platform detection: maps compiler-predefined macros onto the NV_WINDOWS,
+// NV_UNIX and NV_LINUX switches tested by the other headers.
+
+// Windows: either _WIN32 or _WIN16 is predefined.
+#if defined(_WIN32) || defined(_WIN16)
+#   define NV_WINDOWS
+#endif
+
+// UNIX-like: __unix__ or __unix is predefined, none of the excluded targets
+// (nvmacosx, vxworks, DJGPP, QNX) is, and NV_UNIX was not already defined.
+#if (defined(__unix__) || defined(__unix) ) && !defined(nvmacosx) && !defined(vxworks) && !defined(__DJGPP__) && !defined(NV_UNIX) && !defined(__QNX__) && !defined(__QNXNTO__)/* XXX until removed from Makefiles */
+#   define NV_UNIX
+#endif /* defined(__unix__) */
+
+// Linux: __linux__ is predefined, NV_LINUX is not already defined, and
+// NV_VMWARE is not set. This check is independent of the NV_UNIX one above.
+#if defined(__linux__) && !defined(NV_LINUX) && !defined(NV_VMWARE)
+#   define NV_LINUX
+#endif  /* defined(__linux__) */
+
+#endif
diff --git a/nvEncodeAPI.h b/nvEncodeAPI.h
diff --git a/nvUtils.h b/nvUtils.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#ifndef NVUTILS_H
+#define NVUTILS_H
+
+#include "nvCPUOPSys.h"
+
+#if defined (NV_WINDOWS)
+#include <windows.h>
+
+#elif defined NV_UNIX
+#include <sys/time.h>
+#include <limits.h>
+#include <time.h>
+
+// Stand-ins for Windows names so code written against the Windows API
+// spellings also compiles on the NV_UNIX branch.
+#define FALSE 0
+#define TRUE  1
+#define S_OK  0
+#define INFINITE UINT_MAX
+#define stricmp strcasecmp
+#define FILE_BEGIN               SEEK_SET
+#define INVALID_SET_FILE_POINTER (-1)
+#define INVALID_HANDLE_VALUE     ((void *)(-1))
+
+typedef void* HANDLE;
+typedef void* HINSTANCE;
+// NOTE(review): this declares LPWORD as `unsigned long *`; the Windows name
+// for a pointer to DWORD is LPDWORD - confirm which spelling callers expect.
+// NOTE(review): unsigned long can be wider than 32 bits on UNIX targets, so
+// DWORD/HRESULT may not have the same width as on Windows - confirm no code
+// depends on a fixed size.
+typedef unsigned long DWORD, *LPWORD;
+typedef DWORD FILE_SIZE;
+typedef DWORD HRESULT;
+
+#endif
+
+// Type-generic max / min / absolute-value helpers.
+// Each macro expands its argument(s) more than once, so do not pass
+// expressions with side effects (e.g. MAX(i++, j)).
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define FABS(a) ((a) >= 0 ? (a) : -(a))
+
+// Suspends the calling thread for about mSec milliseconds.
+//   mSec: requested delay in milliseconds.
+// Always returns true; the result of the underlying sleep call is not
+// checked, so a sleep cut short (e.g. by a signal on UNIX) is not reported.
+inline bool NvSleep(unsigned int mSec)
+{
+#if defined (NV_WINDOWS)
+    Sleep(mSec);
+#elif defined NV_UNIX
+    // nanosleep() takes seconds and nanoseconds separately, so a large delay
+    // neither wraps the unsigned `mSec * 1000` product nor exceeds the
+    // sub-one-second argument range POSIX guarantees for usleep().
+    struct timespec delay;
+    delay.tv_sec  = static_cast<time_t>(mSec / 1000);
+    delay.tv_nsec = static_cast<long>(mSec % 1000) * 1000000L;
+    nanosleep(&delay, NULL);
+#else
+#error NvSleep function unknown for this platform.
+#endif
+    return true;
+}
+
+// Reports how many counter ticks make up one second.
+//   freq [out]: ticks per second; left at 0 when the query fails.
+// Returns true on success, false if the Windows frequency query fails.
+inline bool NvQueryPerformanceFrequency(unsigned long long *freq)
+{
+#if defined (NV_WINDOWS)
+    LARGE_INTEGER ticksPerSecond;
+    if (QueryPerformanceFrequency(&ticksPerSecond)) {
+        *freq = ticksPerSecond.QuadPart;
+        return true;
+    }
+    *freq = 0;
+    return false;
+#elif defined NV_UNIX
+    // The UNIX counter is built from gettimeofday() and scaled to
+    // nanoseconds, hence 10^9 ticks per second.
+    *freq = 1000000000;
+    return true;
+#else
+#error NvQueryPerformanceFrequency function not defined for this platform.
+#endif
+}
+
+// Convert seconds / microseconds to nanoseconds as unsigned long long.
+// The argument is parenthesized so the cast applies to the whole expression
+// (e.g. SEC_TO_NANO_ULL(a + b)), not just its first operand.
+#define SEC_TO_NANO_ULL(sec)    ((unsigned long long)(sec) * 1000000000ULL)
+#define MICRO_TO_NANO_ULL(usec) ((unsigned long long)(usec) * 1000ULL)
+
+// Reads the current value of the platform's high-resolution counter.
+//   counter [out]: current tick count; left at 0 when the read fails.
+// On Windows this is the raw QueryPerformanceCounter value; on UNIX it is
+// the gettimeofday() time expressed in nanoseconds.
+// Returns true on success, false if the underlying call fails.
+inline bool NvQueryPerformanceCounter(unsigned long long *counter)
+{
+    *counter = 0;
+#if defined (NV_WINDOWS)
+    LARGE_INTEGER now;
+    const bool ok = QueryPerformanceCounter(&now) != 0;
+    if (ok) {
+        *counter = now.QuadPart;
+    }
+    return ok;
+#elif defined NV_UNIX
+    struct timeval now;
+    if (gettimeofday(&now, NULL) != 0) {
+        return false;
+    }
+
+    // Whole seconds and the microsecond remainder, both scaled to nanoseconds.
+    const unsigned long long secAsNano  = SEC_TO_NANO_ULL(now.tv_sec);
+    const unsigned long long usecAsNano = MICRO_TO_NANO_ULL(now.tv_usec);
+    *counter = secAsNano + usecAsNano;
+    return true;
+#else
+#error NvQueryPerformanceCounter function not defined for this platform.
+#endif
+}
+
+#if defined NV_UNIX
+// Field-by-field GUID comparison, provided on the NV_UNIX branch only.
+// Two GUIDs are equal when Data1, Data2, Data3 and all eight Data4 bytes match.
+__inline bool operator==(const GUID &guid1, const GUID &guid2)
+{
+    if (guid1.Data1 != guid2.Data1 ||
+        guid1.Data2 != guid2.Data2 ||
+        guid1.Data3 != guid2.Data3)
+    {
+        return false;
+    }
+
+    for (int i = 0; i < 8; ++i)
+    {
+        if (guid1.Data4[i] != guid2.Data4[i])
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Negation of operator== above.
+__inline bool operator!=(const GUID &guid1, const GUID &guid2)
+{
+    return (guid1 == guid2) ? false : true;
+}
+#endif
+
+// PRINTERR(message, ...): printf-style message written to stderr, prefixed
+// with the source file name and line of the call site. `message` must be a
+// string literal because it is concatenated with the prefix format.
+// Defined inside the include guard so it is processed once per translation
+// unit like the rest of this header.
+#define PRINTERR(message, ...) \
+    fprintf(stderr, "%s line %d: " message, __FILE__, __LINE__, ##__VA_ARGS__)
+
+#endif