Skip to content

Instantly share code, notes, and snippets.

@tea
Created April 25, 2016 11:14
Show Gist options
  • Save tea/02d9df6e58a72f78096e7404e44967cc to your computer and use it in GitHub Desktop.
Save tea/02d9df6e58a72f78096e7404e44967cc to your computer and use it in GitHub Desktop.

Revisions

  1. tea created this gist Apr 25, 2016.
    744 changes: 744 additions & 0 deletions NvEncoderPerf.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,744 @@
    ////////////////////////////////////////////////////////////////////////////
    //
    // Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
    //
    // Please refer to the NVIDIA end user license agreement (EULA) associated
    // with this source code for terms and conditions that govern your use of
    // this software. Any use, reproduction, disclosure, or distribution of
    // this software and related documentation outside the terms of the EULA
    // is strictly prohibited.
    //
    ////////////////////////////////////////////////////////////////////////////

    #include "../common/inc/nvEncodeAPI.h"
    #include "../common/inc/nvUtils.h"
    #include "NvEncoderPerf.h"
    #include <process.h>

    //#define VERBOSE

    // Size in bytes requested for every output bitstream buffer (2 MiB).
    // Parenthesized so the macro behaves as a single value inside larger expressions.
    #define BITSTREAM_BUFFER_SIZE (2 * 1024 * 1024)
    // Number of input/output buffer pairs that are filled before each timed encode batch.
    #define MAX_FRAMES_TO_PRELOAD 60

    // Copies one planar YUV 4:2:0 frame into the NVENC input surface that belongs to
    // m_stEncodeBuffer[index]. Luma rows are copied as-is; the chroma samples are written
    // interleaved (Cb, Cr, Cb, Cr, ...) directly after the surface's luma rows.
    //
    //   yuv_luma        source luma plane, read as `height` rows of `width` bytes
    //   yuv_cb, yuv_cr  source chroma planes, read as `height / 2` rows of `width / 2` bytes
    //   width, height   dimensions of the source frame
    //   index           slot of m_stEncodeBuffer whose input surface is filled
    //
    // The surface is locked for the duration of the copy and unlocked before returning.
    // If locking does not yield a surface pointer, nothing is copied.
    void CNvEncoderPerf::ConvertYUVpitchToNV12(unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height, int index)
    {
        uint32_t lockedPitch = 0;
        unsigned char *pInputSurface = NULL;

        m_pNvHWEncoder->NvEncLockInputBuffer(m_stEncodeBuffer[index].stInputBfr.hInputSurface, (void**)&pInputSurface, &lockedPitch);
        if (pInputSurface == NULL)
        {
            // No mapped memory to write to; writing through the pointer would be undefined.
            return;
        }

        // Resolve the pitch fallback BEFORE deriving the chroma offset from it, otherwise a
        // zero pitch would place the chroma plane on top of the luma plane.
        if (lockedPitch == 0)
            lockedPitch = width;

        unsigned char *pInputSurfaceCh = pInputSurface + (m_stEncodeBuffer[index].stInputBfr.dwHeight*lockedPitch);

        // Luma: one row at a time because source stride (width) and surface pitch may differ.
        for (int y = 0; y < height; y++)
        {
            memcpy(pInputSurface + (lockedPitch*y), yuv_luma + (width*y), width);
        }

        // Chroma: each source Cb/Cr sample pair becomes two adjacent bytes in the surface.
        for (int y = 0; y < height / 2; y++)
        {
            for (int x = 0; x < width; x = x + 2)
            {
                pInputSurfaceCh[(y*lockedPitch) + x] = yuv_cb[((width / 2)*y) + (x >> 1)];
                pInputSurfaceCh[(y*lockedPitch) + (x + 1)] = yuv_cr[((width / 2)*y) + (x >> 1)];
            }
        }

        m_pNvHWEncoder->NvEncUnlockInputBuffer(m_stEncodeBuffer[index].stInputBfr.hInputSurface);
    }

    // Copies one planar YUV 4:4:4 frame into the NVENC input surface of
    // m_stEncodeBuffer[index]. The three planes are stored back to back in the surface,
    // each occupying dwHeight rows of the locked pitch; every source row is `width` bytes.
    // The surface is locked while copying and unlocked before returning.
    void CNvEncoderPerf::ConvertYUVpitchToYUV444(unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height, int index)
    {
        unsigned char *pSurfaceY;
        uint32_t surfacePitch;

        m_pNvHWEncoder->NvEncLockInputBuffer(m_stEncodeBuffer[index].stInputBfr.hInputSurface, (void**)&pSurfaceY, &surfacePitch);

        // A reported pitch of zero falls back to the source row width.
        if (surfacePitch == 0)
        {
            surfacePitch = width;
        }

        // Cb follows the luma plane, Cr follows Cb.
        unsigned char *pSurfaceCb = pSurfaceY + (m_stEncodeBuffer[index].stInputBfr.dwHeight*surfacePitch);
        unsigned char *pSurfaceCr = pSurfaceCb + (m_stEncodeBuffer[index].stInputBfr.dwHeight*surfacePitch);

        int row = 0;
        while (row < height)
        {
            memcpy(pSurfaceY + surfacePitch * row, yuv_luma + width * row, width);
            memcpy(pSurfaceCb + surfacePitch * row, yuv_cb + width * row, width);
            memcpy(pSurfaceCr + surfacePitch * row, yuv_cr + width * row, width);
            row++;
        }

        m_pNvHWEncoder->NvEncUnlockInputBuffer(m_stEncodeBuffer[index].stInputBfr.hInputSurface);
    }

    // Creates the hardware-encoder wrapper and puts every other member into a
    // zero/NULL state; no device or encode session exists yet.
    CNvEncoderPerf::CNvEncoderPerf()
    {
        // Plain-data members are cleared wholesale.
        memset(&m_stEncodeBuffer, 0, sizeof(m_stEncodeBuffer));
        memset(&m_stEOSOutputBfr, 0, sizeof(m_stEOSOutputBfr));
        memset(&m_stEncoderInput, 0, sizeof(m_stEncoderInput));
        m_uEncodeBufferCount = 0;

        // Device handles are filled in later by the Init* functions.
        m_cuContext = NULL;
    #if defined (NV_WINDOWS)
        m_pD3D = NULL;
    #endif
        m_pDevice = NULL;

        m_pNvHWEncoder = new CNvHWEncoder;
    }

    // Destroys the hardware-encoder wrapper created by the constructor.
    CNvEncoderPerf::~CNvEncoderPerf()
    {
        // delete on a null pointer is a no-op, so no explicit guard is required.
        delete m_pNvHWEncoder;
        m_pNvHWEncoder = NULL;
    }

    // Initializes the CUDA driver API, validates `deviceID`, checks that the device
    // reports compute capability 3.0 or higher, and creates a CUDA context that is
    // stored in m_pDevice. The new context is popped from the calling thread before
    // returning.
    //
    // Returns NV_ENC_SUCCESS on success, NV_ENC_ERR_INVALID_ENCODERDEVICE when deviceID
    // is out of range, and NV_ENC_ERR_NO_ENCODE_DEVICE for every other failure.
    NVENCSTATUS CNvEncoderPerf::InitCuda(uint32_t deviceID)
    {
        CUresult cuResult;
        CUdevice device;
        CUcontext cuContextCurr;
        int deviceCount = 0;
        int SMminor = 0, SMmajor = 0;

    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        typedef HMODULE CUDADRIVER;
    #else
        typedef void *CUDADRIVER;
    #endif
        CUDADRIVER hHandleDriver = 0;

        cuResult = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
        if (cuResult != CUDA_SUCCESS)
        {
            PRINTERR("cuInit error:0x%x\n", cuResult);
            assert(0);
            return NV_ENC_ERR_NO_ENCODE_DEVICE;
        }

        cuResult = cuDeviceGetCount(&deviceCount);
        if (cuResult != CUDA_SUCCESS)
        {
            PRINTERR("cuDeviceGetCount error:0x%x\n", cuResult);
            assert(0);
            return NV_ENC_ERR_NO_ENCODE_DEVICE;
        }

        // If dev is negative value, we clamp to 0
        if ((int)deviceID < 0)
            deviceID = 0;

        // Compare against the count itself: `deviceCount - 1` wraps around to a huge
        // unsigned value when no device is present and would let any ID through.
        if (deviceCount <= 0 || deviceID >= (unsigned int)deviceCount)
        {
            PRINTERR("Invalid Device Id = %d\n", deviceID);
            return NV_ENC_ERR_INVALID_ENCODERDEVICE;
        }

        cuResult = cuDeviceGet(&device, deviceID);
        if (cuResult != CUDA_SUCCESS)
        {
            PRINTERR("cuDeviceGet error:0x%x\n", cuResult);
            return NV_ENC_ERR_NO_ENCODE_DEVICE;
        }

        // Query through the device handle obtained above rather than the raw ordinal.
        cuResult = cuDeviceComputeCapability(&SMmajor, &SMminor, device);
        if (cuResult != CUDA_SUCCESS)
        {
            PRINTERR("cuDeviceComputeCapability error:0x%x\n", cuResult);
            return NV_ENC_ERR_NO_ENCODE_DEVICE;
        }

        // Pack major/minor as 0xMm; anything below 3.0 is rejected.
        if (((SMmajor << 4) + SMminor) < 0x30)
        {
            PRINTERR("GPU %d does not have NVENC capabilities exiting\n", deviceID);
            return NV_ENC_ERR_NO_ENCODE_DEVICE;
        }

        cuResult = cuCtxCreate((CUcontext*)(&m_pDevice), 0, device);
        if (cuResult != CUDA_SUCCESS)
        {
            PRINTERR("cuCtxCreate error:0x%x\n", cuResult);
            assert(0);
            return NV_ENC_ERR_NO_ENCODE_DEVICE;
        }

        cuResult = cuCtxPopCurrent(&cuContextCurr);
        if (cuResult != CUDA_SUCCESS)
        {
            PRINTERR("cuCtxPopCurrent error:0x%x\n", cuResult);
            assert(0);
            // Do not hand a half-initialized context to the caller: release it so a
            // failed initialization leaves m_pDevice empty.
            cuCtxDestroy((CUcontext)m_pDevice);
            m_pDevice = NULL;
            return NV_ENC_ERR_NO_ENCODE_DEVICE;
        }
        return NV_ENC_SUCCESS;
    }

    #if defined(NV_WINDOWS)
    // Creates a Direct3D 9 object (kept in m_pD3D) and a windowed HAL device on adapter
    // `deviceID` (stored in m_pDevice).
    //
    // Returns NV_ENC_SUCCESS on success, NV_ENC_ERR_INVALID_ENCODERDEVICE when the adapter
    // index is out of range or cannot be identified, and NV_ENC_ERR_OUT_OF_MEMORY when the
    // D3D object or the device cannot be created. On every failure path the D3D object is
    // released again so that a later call does not overwrite (and leak) it.
    NVENCSTATUS CNvEncoderPerf::InitD3D9(uint32_t deviceID)
    {
        D3DPRESENT_PARAMETERS d3dpp;
        D3DADAPTER_IDENTIFIER9 adapterId;
        HRESULT hr = S_OK;

        m_pD3D = Direct3DCreate9(D3D_SDK_VERSION);
        if (m_pD3D == NULL)
        {
            assert(m_pD3D);
            return NV_ENC_ERR_OUT_OF_MEMORY;
        }

        if (deviceID >= m_pD3D->GetAdapterCount())
        {
            PRINTERR("Invalid Device Id = %d. Please use DX10/DX11 to detect headless video devices.\n", deviceID);
            m_pD3D->Release();
            m_pD3D = NULL;
            return NV_ENC_ERR_INVALID_ENCODERDEVICE;
        }

        // Only the result code is used: the call serves as a validity check of the adapter.
        hr = m_pD3D->GetAdapterIdentifier(deviceID, 0, &adapterId);
        if (hr != S_OK)
        {
            PRINTERR("Invalid Device Id = %d\n", deviceID);
            m_pD3D->Release();
            m_pD3D = NULL;
            return NV_ENC_ERR_INVALID_ENCODERDEVICE;
        }

        ZeroMemory(&d3dpp, sizeof(d3dpp));
        d3dpp.Windowed = TRUE;
        d3dpp.BackBufferFormat = D3DFMT_X8R8G8B8;
        d3dpp.BackBufferWidth = 640;
        d3dpp.BackBufferHeight = 480;
        d3dpp.BackBufferCount = 1;
        d3dpp.SwapEffect = D3DSWAPEFFECT_COPY;
        d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
        d3dpp.Flags = D3DPRESENTFLAG_VIDEO;//D3DPRESENTFLAG_LOCKABLE_BACKBUFFER;
        DWORD dwBehaviorFlags = D3DCREATE_FPU_PRESERVE | D3DCREATE_MULTITHREADED | D3DCREATE_HARDWARE_VERTEXPROCESSING;

        hr = m_pD3D->CreateDevice(deviceID,
            D3DDEVTYPE_HAL,
            GetDesktopWindow(),
            dwBehaviorFlags,
            &d3dpp,
            (IDirect3DDevice9**)(&m_pDevice));

        if (FAILED(hr))
        {
            m_pD3D->Release();
            m_pD3D = NULL;
            return NV_ENC_ERR_OUT_OF_MEMORY;
        }

        return NV_ENC_SUCCESS;
    }

    // Creates a Direct3D 10 hardware device on DXGI adapter `deviceID` and stores it in
    // m_pDevice.
    //
    // Returns NV_ENC_SUCCESS on success, NV_ENC_ERR_GENERIC when the DXGI factory cannot be
    // created, NV_ENC_ERR_INVALID_ENCODERDEVICE when the adapter cannot be enumerated, and
    // NV_ENC_ERR_OUT_OF_MEMORY when device creation fails. The factory and adapter
    // references taken here are released before returning.
    NVENCSTATUS CNvEncoderPerf::InitD3D10(uint32_t deviceID)
    {
        HRESULT hr;
        IDXGIFactory * pFactory = NULL;
        IDXGIAdapter * pAdapter = NULL;

        if (CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory) != S_OK)
        {
            return NV_ENC_ERR_GENERIC;
        }

        // Treat every enumeration failure (not only DXGI_ERROR_NOT_FOUND) as an invalid
        // adapter, so pAdapter is never used without having been filled in.
        hr = pFactory->EnumAdapters(deviceID, &pAdapter);
        if (FAILED(hr) || pAdapter == NULL)
        {
            pFactory->Release();
            PRINTERR("Invalid Device Id = %d\n", deviceID);
            return NV_ENC_ERR_INVALID_ENCODERDEVICE;
        }

        hr = D3D10CreateDevice(pAdapter, D3D10_DRIVER_TYPE_HARDWARE, NULL, 0,
            D3D10_SDK_VERSION, (ID3D10Device**)(&m_pDevice));

        // The local references are no longer needed once device creation has been attempted.
        pAdapter->Release();
        pFactory->Release();

        if (FAILED(hr))
        {
            PRINTERR("Invalid Device Id = %d\n", deviceID);
            return NV_ENC_ERR_OUT_OF_MEMORY;
        }

        return NV_ENC_SUCCESS;
    }

    // Creates a Direct3D 11 device on DXGI adapter `deviceID` and stores it in m_pDevice.
    //
    // Returns NV_ENC_SUCCESS on success, NV_ENC_ERR_GENERIC when the DXGI factory cannot be
    // created, NV_ENC_ERR_NO_ENCODE_DEVICE when the adapter cannot be enumerated, and
    // NV_ENC_ERR_OUT_OF_MEMORY when device creation fails. The factory and adapter
    // references taken here are released before returning.
    NVENCSTATUS CNvEncoderPerf::InitD3D11(uint32_t deviceID)
    {
        HRESULT hr;
        IDXGIFactory * pFactory = NULL;
        IDXGIAdapter * pAdapter = NULL;

        if (CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory) != S_OK)
        {
            return NV_ENC_ERR_GENERIC;
        }

        // Treat every enumeration failure (not only DXGI_ERROR_NOT_FOUND) as a missing
        // adapter, so pAdapter is never used without having been filled in.
        hr = pFactory->EnumAdapters(deviceID, &pAdapter);
        if (FAILED(hr) || pAdapter == NULL)
        {
            pFactory->Release();
            PRINTERR("Invalid Device Id = %d\n", deviceID);
            return NV_ENC_ERR_NO_ENCODE_DEVICE;
        }

        // An explicit adapter is supplied, hence D3D_DRIVER_TYPE_UNKNOWN.
        hr = D3D11CreateDevice(pAdapter, D3D_DRIVER_TYPE_UNKNOWN, NULL, 0,
            NULL, 0, D3D11_SDK_VERSION, (ID3D11Device**)(&m_pDevice), NULL, NULL);

        // The local references are no longer needed once device creation has been attempted.
        pAdapter->Release();
        pFactory->Release();

        if (FAILED(hr))
        {
            PRINTERR("Invalid Device Id = %d\n", deviceID);
            return NV_ENC_ERR_OUT_OF_MEMORY;
        }

        return NV_ENC_SUCCESS;
    }
    #endif

    // Registers the first m_uEncodeBufferCount entries of m_stEncodeBuffer with the buffer
    // queue and creates, for each of them, one input surface of the given size and one
    // output bitstream buffer of BITSTREAM_BUFFER_SIZE bytes.
    //
    // isYuv444 selects the surface format (0: NV12, otherwise YUV444).
    // Returns the first failing status, or NV_ENC_SUCCESS when every buffer was created.
    NVENCSTATUS CNvEncoderPerf::AllocateIOBuffers(uint32_t uInputWidth, uint32_t uInputHeight, int isYuv444)
    {
        m_EncodeBufferQueue.Initialize(m_stEncodeBuffer, m_uEncodeBufferCount);

        for (uint32_t slot = 0; slot < m_uEncodeBufferCount; ++slot)
        {
            EncodeBuffer &buffer = m_stEncodeBuffer[slot];

            // Input side.
            NVENCSTATUS status = m_pNvHWEncoder->NvEncCreateInputBuffer(uInputWidth, uInputHeight, &buffer.stInputBfr.hInputSurface, isYuv444);
            if (status != NV_ENC_SUCCESS)
            {
                PRINTERR("Failed to allocate Input Buffer, Please reduce MAX_FRAMES_TO_PRELOAD\n");
                return status;
            }
            buffer.stInputBfr.bufferFmt = (isYuv444 == 0) ? NV_ENC_BUFFER_FORMAT_NV12_PL : NV_ENC_BUFFER_FORMAT_YUV444_PL;
            buffer.stInputBfr.dwWidth = uInputWidth;
            buffer.stInputBfr.dwHeight = uInputHeight;

            // Output side.
            status = m_pNvHWEncoder->NvEncCreateBitstreamBuffer(BITSTREAM_BUFFER_SIZE, &buffer.stOutputBfr.hBitstreamBuffer);
            if (status != NV_ENC_SUCCESS)
            {
                PRINTERR("Failed to allocate Output Buffer, Please reduce MAX_FRAMES_TO_PRELOAD\n");
                return status;
            }
            buffer.stOutputBfr.dwBitstreamBufferSize = BITSTREAM_BUFFER_SIZE;
        }

        m_stEOSOutputBfr.bEOSFlag = TRUE;

        return NV_ENC_SUCCESS;
    }

    // Destroys the input surface and the bitstream buffer of every allocated slot and
    // clears the stored handles. The value of each input-surface handle is written to the
    // debugger output after it has been destroyed. Always returns NV_ENC_SUCCESS.
    NVENCSTATUS CNvEncoderPerf::ReleaseIOBuffers()
    {
        uint32_t slot = 0;
        while (slot < m_uEncodeBufferCount)
        {
            EncodeBuffer &buffer = m_stEncodeBuffer[slot];

            m_pNvHWEncoder->NvEncDestroyInputBuffer(buffer.stInputBfr.hInputSurface);

            // Debug trace of the handle that was just released.
            char handleText[128];
            sprintf(handleText, "%p\n", (void*)buffer.stInputBfr.hInputSurface);
            OutputDebugStringA(handleText);
            buffer.stInputBfr.hInputSurface = NULL;

            m_pNvHWEncoder->NvEncDestroyBitstreamBuffer(buffer.stOutputBfr.hBitstreamBuffer);
            buffer.stOutputBfr.hBitstreamBuffer = NULL;

            ++slot;
        }

        return NV_ENC_SUCCESS;
    }

    // Flushes the hardware encoder queue and then processes the output of every buffer
    // that is still pending in the buffer queue.
    // Returns the status of the flush call; pending buffers are only drained on success.
    NVENCSTATUS CNvEncoderPerf::FlushEncoder()
    {
        const NVENCSTATUS flushStatus = m_pNvHWEncoder->NvEncFlushEncoderQueue(nullptr);
        if (flushStatus != NV_ENC_SUCCESS)
        {
            assert(0);
            return flushStatus;
        }

        // GetPending() returns NULL once nothing is pending any more.
        for (EncodeBuffer *pPending = m_EncodeBufferQueue.GetPending();
             pPending;
             pPending = m_EncodeBufferQueue.GetPending())
        {
            m_pNvHWEncoder->ProcessOutput(pPending);
        }

        return flushStatus;
    }

    // Tears down one encode session: releases the I/O buffers, destroys the encoder, then
    // releases the device stored in m_pDevice according to `devicetype` and, on Windows,
    // the Direct3D 9 object. Returns the status of the encoder destruction.
    NVENCSTATUS CNvEncoderPerf::Deinitialize(uint32_t devicetype)
    {
        ReleaseIOBuffers();

        const NVENCSTATUS destroyStatus = m_pNvHWEncoder->NvEncDestroyEncoder();

        if (m_pDevice != NULL)
        {
            // m_pDevice is an untyped pointer; devicetype tells how it must be released.
            switch (devicetype)
            {
    #if defined(NV_WINDOWS)
            case NV_ENC_DX9:
                ((IDirect3DDevice9*)(m_pDevice))->Release();
                break;

            case NV_ENC_DX10:
                ((ID3D10Device*)(m_pDevice))->Release();
                break;

            case NV_ENC_DX11:
                ((ID3D11Device*)(m_pDevice))->Release();
                break;
    #endif

            case NV_ENC_CUDA:
            {
                const CUresult ctxStatus = cuCtxDestroy((CUcontext)m_pDevice);
                if (ctxStatus != CUDA_SUCCESS)
                {
                    PRINTERR("cuCtxDestroy error:0x%x\n", ctxStatus);
                }
                break;
            }
            }

            m_pDevice = NULL;
        }

    #if defined (NV_WINDOWS)
        if (m_pD3D != NULL)
        {
            m_pD3D->Release();
            m_pD3D = NULL;
        }
    #endif

        return destroyStatus;
    }

    // Writes the command-line usage text to standard output.
    void PrintHelp()
    {
        // The text is kept in one table so it can be emitted with a single call.
        static const char usageText[] =
            "Usage : NvEncoderPerf \n"
            "-i <string> Specify input yuv420 file\n"
            "-o <string> Specify output bitstream file\n"
            "-size <int int> Specify input resolution <width height>\n"
            "\n### Optional parameters ###\n"
            "-codec <integer> Specify the codec \n"
            " 0: H264\n"
            " 1: HEVC\n"
            "-preset <string> Specify the preset for encoder settings\n"
            " hq : nvenc HQ \n"
            " hp : nvenc HP \n"
            " lowLatencyHP : nvenc low latency HP \n"
            " lowLatencyHQ : nvenc low latency HQ \n"
            "-startf <integer> Specify start index for encoding. Default is 0\n"
            "-endf <integer> Specify end index for encoding. Default is end of file\n"
            "-fps <integer> Specify encoding frame rate\n"
            "-goplength <integer> Specify gop length\n"
            "-numB <integer> Specify number of B frames\n"
            "-bitrate <integer> Specify the encoding average bitrate\n"
            "-vbvMaxBitrate <integer> Specify the vbv max bitrate\n"
            "-vbvSize <integer> Specify the encoding vbv/hrd buffer size\n"
            "-rcmode <integer> Specify the rate control mode\n"
            " 0: Constant QP\n"
            " 1: Single pass VBR\n"
            " 2: Single pass CBR\n"
            " 4: Single pass VBR minQP\n"
            " 8: Two pass frame quality\n"
            " 16: Two pass frame size cap\n"
            " 32: Two pass VBR\n"
            "-qp <integer> Specify qp for Constant QP mode\n"
            "-i_qfactor <float> Specify qscale difference between I-frames and P-frames\n"
            "-b_qfactor <float> Specify qscale difference between P-frames and B-frames\n"
            "-i_qoffset <float> Specify qscale offset between I-frames and P-frames\n"
            "-b_qoffset <float> Specify qscale offset between P-frames and B-frames\n"
            "-devicetype <integer> Specify devicetype used for encoding\n"
            " 0: DX9\n"
            " 1: DX11\n"
            " 2: Cuda\n"
            " 3: DX10\n"
            "-deviceID <integer> Specify the GPU device on which encoding will take place\n"
            "-yuv444 <integer> Specify the input YUV format\n"
            " 0: YUV 420\n"
            " 1: YUV 444\n"
            "-help Prints Help Information\n\n";

        printf("%s", usageText);
    }

    // Runs one complete encode session with synthetic input frames.
    //
    // The function fills in a default configuration (736x576, H264, constant QP 28, CUDA
    // device), creates the device and the encoder, allocates MAX_FRAMES_TO_PRELOAD buffer
    // pairs and then repeatedly
    //   1. generates and uploads up to MAX_FRAMES_TO_PRELOAD frames, and
    //   2. encodes and flushes that batch while measuring the elapsed counter ticks,
    // until endFrameIdx is reached or the value of `generation` differs from the value
    // sampled just before the loop. Buffers, encoder and device are released before
    // returning.
    //
    //   generation  counter that another thread changes to request the end of the session
    //
    // Returns 0 on success and 1 when setup or encoding failed.
    int CNvEncoderPerf::EncodeMain(std::atomic<int>& generation)
    {
        uint8_t *yuv[3] = { 0 };
        unsigned long long lStart, lEnd, lFreq;
        int numFramesEncoded = 0;
        NVENCSTATUS nvStatus = NV_ENC_SUCCESS;
        bool bError = false;
        double elapsedTime = 0.0f;
        bool eof = false;
        EncodeConfig encodeConfig;
        uint32_t chromaFormatIDC = 0;
        int32_t lumaPlaneSize = 0, chromaPlaneSize = 0;
        // Declared up front (and assigned later) so that no `goto exit` jumps over the
        // initialization of a variable that is still in scope at the label.
        int gen = 0;

        memset(&encodeConfig, 0, sizeof(EncodeConfig));

        encodeConfig.width = 736;
        encodeConfig.height = 576;
        encodeConfig.endFrameIdx = INT_MAX;
        encodeConfig.bitrate = 5000000;
        encodeConfig.rcMode = NV_ENC_PARAMS_RC_CONSTQP;
        encodeConfig.gopLength = NVENC_INFINITE_GOPLENGTH;
        encodeConfig.deviceType = NV_ENC_CUDA;
        encodeConfig.codec = NV_ENC_H264;
        encodeConfig.fps = 30;
        encodeConfig.qp = 28;
        encodeConfig.i_quant_factor = DEFAULT_I_QFACTOR;
        encodeConfig.b_quant_factor = DEFAULT_B_QFACTOR;
        encodeConfig.i_quant_offset = DEFAULT_I_QOFFSET;
        encodeConfig.b_quant_offset = DEFAULT_B_QOFFSET;
        encodeConfig.presetGUID = NV_ENC_PRESET_DEFAULT_GUID;
        encodeConfig.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;

        // No command line is passed on; the defaults above are what gets used.
        nvStatus = m_pNvHWEncoder->ParseArguments(&encodeConfig, 0, nullptr);
        if (nvStatus != NV_ENC_SUCCESS)
        {
            PrintHelp();
            return 1;
        }

        if (encodeConfig.width == 0 || encodeConfig.height == 0)
        {
            PrintHelp();
            return 1;
        }

        switch (encodeConfig.deviceType)
        {
    #if defined(NV_WINDOWS)
        case NV_ENC_DX9:
            nvStatus = InitD3D9(encodeConfig.deviceID);
            break;

        case NV_ENC_DX10:
            nvStatus = InitD3D10(encodeConfig.deviceID);
            break;

        case NV_ENC_DX11:
            nvStatus = InitD3D11(encodeConfig.deviceID);
            break;
    #endif

        case NV_ENC_CUDA:
            nvStatus = InitCuda(encodeConfig.deviceID);
            break;
        }

        // Stop here instead of handing a missing device to the encoder.
        if (nvStatus != NV_ENC_SUCCESS)
            return 1;

        if (encodeConfig.deviceType != NV_ENC_CUDA)
            nvStatus = m_pNvHWEncoder->Initialize(m_pDevice, NV_ENC_DEVICE_TYPE_DIRECTX);
        else
            nvStatus = m_pNvHWEncoder->Initialize(m_pDevice, NV_ENC_DEVICE_TYPE_CUDA);

        // NOTE(review): this return and the one after CreateEncoder() skip Deinitialize(),
        // so the device created above stays alive - confirm whether it is safe to call
        // Deinitialize() before the encoder exists and release it here if so.
        if (nvStatus != NV_ENC_SUCCESS)
            return 1;

        encodeConfig.presetGUID = m_pNvHWEncoder->GetPresetGUID(encodeConfig.encoderPreset, encodeConfig.codec);
    #ifdef VERBOSE
        printf("Encoding input : \"%s\"\n", encodeConfig.inputFileName);
        printf(" output : \"%s\"\n", encodeConfig.outputFileName);
        printf(" codec : \"%s\"\n", encodeConfig.codec == NV_ENC_HEVC ? "HEVC" : "H264");
        printf(" size : %dx%d\n", encodeConfig.width, encodeConfig.height);
        printf(" bitrate : %d bits/sec\n", encodeConfig.bitrate);
        printf(" vbvMaxBitrate : %d bits/sec\n", encodeConfig.vbvMaxBitrate);
        printf(" vbvSize : %d bits\n", encodeConfig.vbvSize);
        printf(" fps : %d frames/sec\n", encodeConfig.fps);
        printf(" rcMode : %s\n", encodeConfig.rcMode == NV_ENC_PARAMS_RC_CONSTQP ? "CONSTQP" :
            encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR ? "VBR" :
            encodeConfig.rcMode == NV_ENC_PARAMS_RC_CBR ? "CBR" :
            encodeConfig.rcMode == NV_ENC_PARAMS_RC_VBR_MINQP ? "VBR MINQP" :
            encodeConfig.rcMode == NV_ENC_PARAMS_RC_2_PASS_QUALITY ? "TWO_PASS_QUALITY" :
            encodeConfig.rcMode == NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP ? "TWO_PASS_FRAMESIZE_CAP" :
            encodeConfig.rcMode == NV_ENC_PARAMS_RC_2_PASS_VBR ? "TWO_PASS_VBR" : "UNKNOWN");
        if (encodeConfig.gopLength == NVENC_INFINITE_GOPLENGTH)
            printf(" goplength : INFINITE GOP \n");
        else
            printf(" goplength : %d \n", encodeConfig.gopLength);
        printf(" B frames : %d \n", encodeConfig.numB);
        printf(" QP : %d \n", encodeConfig.qp);
        printf(" preset : %s\n", (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_HQ_GUID) ? "LOW_LATENCY_HQ" :
            (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_HP_GUID) ? "LOW_LATENCY_HP" :
            (encodeConfig.presetGUID == NV_ENC_PRESET_HQ_GUID) ? "HQ_PRESET" :
            (encodeConfig.presetGUID == NV_ENC_PRESET_HP_GUID) ? "HP_PRESET" :
            (encodeConfig.presetGUID == NV_ENC_PRESET_LOW_LATENCY_DEFAULT_GUID) ? "LOW_LATENCY_DEFAULT" : "DEFAULT");
        printf(" devicetype : %s\n", encodeConfig.deviceType == NV_ENC_DX9 ? "DX9" :
            encodeConfig.deviceType == NV_ENC_DX10 ? "DX10" :
            encodeConfig.deviceType == NV_ENC_DX11 ? "DX11" :
            encodeConfig.deviceType == NV_ENC_CUDA ? "CUDA" : "INVALID");

        printf("\n");
    #endif
        nvStatus = m_pNvHWEncoder->CreateEncoder(&encodeConfig);
        if (nvStatus != NV_ENC_SUCCESS)
            return 1;

        m_uEncodeBufferCount = MAX_FRAMES_TO_PRELOAD;

        nvStatus = AllocateIOBuffers(encodeConfig.width, encodeConfig.height, encodeConfig.isYuv444);
        if (nvStatus != NV_ENC_SUCCESS)
        {
            bError = true;
            goto exit;
        }
        chromaFormatIDC = (encodeConfig.isYuv444 ? 3 : 1);
        lumaPlaneSize = encodeConfig.width * encodeConfig.height;
        chromaPlaneSize = (chromaFormatIDC == 3) ? lumaPlaneSize : (lumaPlaneSize >> 2);

        yuv[0] = new uint8_t[lumaPlaneSize];
        yuv[1] = new uint8_t[chromaPlaneSize];
        yuv[2] = new uint8_t[chromaPlaneSize];

        NvQueryPerformanceCounter(&lStart);

        // Remember the generation this session belongs to; a different value seen later
        // means the session has been asked to stop.
        gen = generation;
        for (int frm = encodeConfig.startFrameIdx; frm <= encodeConfig.endFrameIdx; frm += MAX_FRAMES_TO_PRELOAD)
        {
            // Phase 1 (not timed): generate a batch of frames and upload them.
            int numFramesLoaded = 0;
            for (int frmCnt = frm; frmCnt <= MIN(frm + MAX_FRAMES_TO_PRELOAD - 1, encodeConfig.endFrameIdx); frmCnt++)
            {
                if (gen != generation)
                {
                    eof = true;
                    break;
                }

                // Synthetic gradient; chroma is written at half resolution in both directions.
                for(int y = 0; y < encodeConfig.height; ++y)
                    for (int x = 0; x < encodeConfig.width; ++x)
                    {
                        yuv[0][encodeConfig.width*y + x] = (x + y) % 256;
                        yuv[1][encodeConfig.width / 2 * (y / 2) + x / 2] = (3 * x + y) % 256;
                        yuv[2][encodeConfig.width / 2 * (y / 2) + x / 2] = (x + 3 * y) % 256;
                    }
                // NOTE(review): the NV12 conversion is used even when isYuv444 is set - confirm
                // whether the 4:4:4 path is meant to be exercised by this tool.
                ConvertYUVpitchToNV12(yuv[0], yuv[1], yuv[2], encodeConfig.width, encodeConfig.height, (frmCnt - frm));
                numFramesLoaded++;
            }

            // Phase 2 (timed): encode the uploaded frames and flush the encoder.
            if (numFramesLoaded)
            {
                NvQueryPerformanceCounter(&lStart);
                for (int frmCnt = 0; frmCnt < numFramesLoaded; frmCnt++)
                {
                    EncodeFrame(false, encodeConfig.width, encodeConfig.height);
                    numFramesEncoded++;
                }
                nvStatus = EncodeFrame(true, encodeConfig.width, encodeConfig.height);
                if (nvStatus != NV_ENC_SUCCESS)
                {
                    bError = true;
                    goto exit;
                }
                NvQueryPerformanceCounter(&lEnd);
                elapsedTime += (double)(lEnd - lStart);
            }
            if (eof == true)
            {
                break;
            }
        }
    #ifdef VERBOSE
        if (numFramesEncoded > 0)
        {
            NvQueryPerformanceFrequency(&lFreq);
            printf("Encoded %d frames in %6.2fms\n", numFramesEncoded, (elapsedTime*1000.0) / lFreq);
            printf("Average Encode Time : %6.2fms\n", ((elapsedTime*1000.0) / numFramesEncoded) / lFreq);
            printf("Frames per second: %dfps\n", (int)((float)numFramesEncoded * 1000.0 /(float)((elapsedTime*1000.0) / lFreq)));
        }
    #endif
    exit:
        Deinitialize(encodeConfig.deviceType);

        for (int i = 0; i < 3; i ++)
        {
            if (yuv[i])
            {
                delete [] yuv[i];
            }
        }

        return bError ? 1 : 0;
    }

    // Submits the next queued buffer to the encoder, or flushes the encoder.
    //
    //   bFlush         when true, nothing is submitted; the encoder is flushed and all
    //                  pending output is processed
    //   width, height  frame dimensions passed on to the encoder
    //
    // Returns the status of the flush when bFlush is set (so that the caller's error check
    // can actually fire), otherwise the status of the frame submission.
    NVENCSTATUS CNvEncoderPerf::EncodeFrame(bool bFlush, uint32_t width, uint32_t height)
    {
        if (bFlush)
        {
            return FlushEncoder();
        }

        EncodeBuffer *pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
        if (!pEncodeBuffer)
        {
            // Every buffer is in flight: retire the oldest pending one to free a slot.
            m_pNvHWEncoder->ProcessOutput(m_EncodeBufferQueue.GetPending());
            pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
        }

        return m_pNvHWEncoder->NvEncEncodeFrame(pEncodeBuffer, NULL, width, height);
    }

    class Encoder
    {
    public:
    HANDLE hThread = INVALID_HANDLE_VALUE;
    HANDLE hEvent = INVALID_HANDLE_VALUE;
    std::atomic<int> generation{ 0 };
    CNvEncoderPerf encoder;

    Encoder()
    {
    hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
    }

    ~Encoder()
    {
    CloseHandle(hEvent);
    }

    static unsigned __stdcall threadFunc(void* pArguments)
    {
    ((Encoder*)pArguments)->inThread();
    return 0;
    }

    void inThread()
    {
    for (;;)
    {
    printf("<");
    encoder.EncodeMain(generation);
    printf(">");
    SetEvent(hEvent);
    }
    }

    void runInThread()
    {
    hThread = (HANDLE)_beginthreadex(NULL, 0, &Encoder::threadFunc, this, 0, nullptr);
    }

    void finalize()
    {
    ++generation;
    WaitForSingleObject(hEvent, INFINITE);
    ResetEvent(hEvent);
    }
    };

    int main(int argc, char **argv)
    {
    Encoder encoder;
    Encoder encoder_a;

    encoder_a.runInThread();
    encoder.runInThread();
    for (;;)
    {
    Sleep(1000);
    encoder.finalize();
    }

    return 0;
    }
    141 changes: 141 additions & 0 deletions NvEncoderPerf.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,141 @@
    ////////////////////////////////////////////////////////////////////////////
    //
    // Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
    //
    // Please refer to the NVIDIA end user license agreement (EULA) associated
    // with this source code for terms and conditions that govern your use of
    // this software. Any use, reproduction, disclosure, or distribution of
    // this software and related documentation outside the terms of the EULA
    // is strictly prohibited.
    //
    ////////////////////////////////////////////////////////////////////////////

    #if defined(NV_WINDOWS)
    #include <d3d9.h>
    #include <d3d10_1.h>
    #include <d3d11.h>
    #pragma warning(disable : 4996)
    #endif
    #include <atomic>
    #include "../common/inc/NvHWEncoder.h"

    // Number of elements in the m_stEncodeBuffer array of CNvEncoderPerf.
    #define MAX_ENCODE_QUEUE 100

    // Assigns the constant named <type>_VER to the `version` member of configStruct.
    // The expansion is a brace-enclosed statement block.
    #define SET_VER(configStruct, type) {configStruct.version = type##_VER;}

    // Fixed-size circular queue of pointers to caller-owned items.
    //
    // Initialize() records the address of each element of an external array. Items are
    // handed out in array order by GetAvailable() ("now in use") and returned in the same
    // order by GetPending() ("done, free again"). The queue owns only its pointer table,
    // never the items themselves.
    template<class T>
    class CNvQueue {
        T** m_pBuffer;                  // table of item addresses, m_uSize entries
        unsigned int m_uSize;           // number of items managed
        unsigned int m_uPendingCount;   // items handed out and not yet returned
        unsigned int m_uAvailableIdx;   // next table index GetAvailable() hands out
        unsigned int m_uPendingndex;    // next table index GetPending() returns
    public:
        CNvQueue(): m_pBuffer(NULL), m_uSize(0), m_uPendingCount(0), m_uAvailableIdx(0),
            m_uPendingndex(0)
        {
        }

        ~CNvQueue()
        {
            delete[] m_pBuffer;
        }

        // (Re)binds the queue to the first uSize elements of pItems and marks all of them
        // available. May be called repeatedly; the previous pointer table is freed.
        // Always returns true.
        bool Initialize(T *pItems, unsigned int uSize)
        {
            // Build the new table before dropping the old one so the queue is never left
            // pointing at freed memory if the allocation throws.
            T **pNewBuffer = new T *[uSize];
            for (unsigned int i = 0; i < uSize; i++)
            {
                pNewBuffer[i] = &pItems[i];
            }

            delete[] m_pBuffer;
            m_pBuffer = pNewBuffer;
            m_uSize = uSize;
            m_uPendingCount = 0;
            m_uAvailableIdx = 0;
            m_uPendingndex = 0;
            return true;
        }


        // Hands out the next free item, or NULL when every item is already in use
        // (which includes the uninitialized, zero-sized queue).
        T * GetAvailable()
        {
            if (m_uPendingCount == m_uSize)
            {
                return NULL;
            }
            T *pItem = m_pBuffer[m_uAvailableIdx];
            m_uAvailableIdx = (m_uAvailableIdx+1)%m_uSize;
            m_uPendingCount += 1;
            return pItem;
        }

        // Returns the oldest item that is in use and marks it free, or NULL when no item
        // is in use.
        T* GetPending()
        {
            if (m_uPendingCount == 0)
            {
                return NULL;
            }

            T *pItem = m_pBuffer[m_uPendingndex];
            m_uPendingndex = (m_uPendingndex+1)%m_uSize;
            m_uPendingCount -= 1;
            return pItem;
        }
    };

    // Describes one frame held in caller memory as three separate planes.
    struct _EncodeFrameConfig
    {
        uint8_t *yuv[3];        // one pointer per plane (presumably Y, Cb, Cr - confirm with users)
        uint32_t stride[3];     // one stride per plane
        uint32_t width;         // frame width
        uint32_t height;        // frame height
    };
    typedef struct _EncodeFrameConfig EncodeFrameConfig;

    // Kind of device the encoder is created on. The numeric values are the ones listed
    // for the -devicetype option in the usage text.
    enum NvEncodeDeviceType
    {
        NV_ENC_DX9  = 0,    // Direct3D 9
        NV_ENC_DX11 = 1,    // Direct3D 11
        NV_ENC_CUDA = 2,    // CUDA
        NV_ENC_DX10 = 3     // Direct3D 10
    };

    // Performance-test driver around CNvHWEncoder: creates a device and an encoder, feeds
    // it generated frames and measures the time spent encoding (see EncodeMain).
    //
    // The object owns the CNvHWEncoder it allocates in its constructor, so it must not be
    // copied; copying is disabled to rule out a double delete.
    class CNvEncoderPerf
    {
    public:
        CNvEncoderPerf();
        virtual ~CNvEncoderPerf();

        // Runs one encode session; returns 0 on success and 1 on failure. The session ends
        // early when `generation` changes while it is running.
        int EncodeMain(std::atomic<int>& generation);

    protected:
        CNvHWEncoder *m_pNvHWEncoder;           // owned; created in the ctor, deleted in the dtor
        uint32_t m_uEncodeBufferCount;          // number of m_stEncodeBuffer entries in use
        void* m_pDevice;                        // D3D device or CUDA context, depending on device type
    #if defined(NV_WINDOWS)
        IDirect3D9 *m_pD3D;                     // Direct3D 9 object, only set on the DX9 path
    #endif

        CUcontext m_cuContext;
        EncodeConfig m_stEncoderInput;
        EncodeBuffer m_stEncodeBuffer[MAX_ENCODE_QUEUE];    // input/output buffer pairs
        CNvQueue<EncodeBuffer> m_EncodeBufferQueue;         // hands out entries of m_stEncodeBuffer
        EncodeOutputBuffer m_stEOSOutputBfr;

    protected:
        NVENCSTATUS Deinitialize(uint32_t devicetype);
        NVENCSTATUS EncodeFrame(bool bFlush=false, uint32_t width=0, uint32_t height=0);
        NVENCSTATUS InitD3D9(uint32_t deviceID = 0);
        NVENCSTATUS InitD3D11(uint32_t deviceID = 0);
        NVENCSTATUS InitD3D10(uint32_t deviceID = 0);
        NVENCSTATUS InitCuda(uint32_t deviceID = 0);
        NVENCSTATUS AllocateIOBuffers(uint32_t uInputWidth, uint32_t uInputHeight,int isYuv444);
        NVENCSTATUS ReleaseIOBuffers();
        // NOTE(review): no definition of LockInputBuffer is visible next to the other
        // member definitions - confirm that it is implemented or still needed.
        unsigned char* LockInputBuffer(void * hInputSurface, uint32_t *pLockedPitch);
        NVENCSTATUS FlushEncoder();
        void ConvertYUVpitchToNV12(unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height, int index);
        void ConvertYUVpitchToYUV444(unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height, int index);

    private:
        // Non-copyable: a copy would share m_pNvHWEncoder and delete it twice.
        CNvEncoderPerf(const CNvEncoderPerf&) = delete;
        CNvEncoderPerf& operator=(const CNvEncoderPerf&) = delete;

    };

    // NVEncodeAPI entry point: pointer to a function that receives an
    // NV_ENCODE_API_FUNCTION_LIST* and returns an NVENCSTATUS.
    typedef NVENCSTATUS (NVENCAPI *MYPROC)(NV_ENCODE_API_FUNCTION_LIST*);
    1,284 changes: 1,284 additions & 0 deletions NvHWEncoder.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,1284 @@
    /*
    * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
    *
    * Please refer to the NVIDIA end user license agreement (EULA) associated
    * with this source code for terms and conditions that govern your use of
    * this software. Any use, reproduction, disclosure, or distribution of
    * this software and related documentation outside the terms of the EULA
    * is strictly prohibited.
    *
    */

    #include "../inc/NvHWEncoder.h"

    // Forwards to nvEncOpenEncodeSession of the loaded API table; the resulting session
    // handle is written to m_hEncoder. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncOpenEncodeSession(void* device, uint32_t deviceType)
    {
        const NVENCSTATUS openStatus = m_pEncodeAPI->nvEncOpenEncodeSession(device, deviceType, &m_hEncoder);
        if (openStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return openStatus;
    }

    // Forwards to nvEncGetEncodeGUIDCount for the current session; the count is written
    // to *encodeGUIDCount. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodeGUIDCount(uint32_t* encodeGUIDCount)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetEncodeGUIDCount(m_hEncoder, encodeGUIDCount);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetEncodeProfileGUIDCount for codec `encodeGUID`; the count is
    // written to *encodeProfileGUIDCount. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodeProfileGUIDCount(GUID encodeGUID, uint32_t* encodeProfileGUIDCount)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetEncodeProfileGUIDCount(m_hEncoder, encodeGUID, encodeProfileGUIDCount);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetEncodeProfileGUIDs for codec `encodeGUID`, passing the caller's
    // array, its capacity and the output count through. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodeProfileGUIDs(GUID encodeGUID, GUID* profileGUIDs, uint32_t guidArraySize, uint32_t* GUIDCount)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetEncodeProfileGUIDs(m_hEncoder, encodeGUID, profileGUIDs, guidArraySize, GUIDCount);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetEncodeGUIDs for the current session, passing the caller's array,
    // its capacity and the output count through. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodeGUIDs(GUID* GUIDs, uint32_t guidArraySize, uint32_t* GUIDCount)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetEncodeGUIDs(m_hEncoder, GUIDs, guidArraySize, GUIDCount);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetInputFormatCount for codec `encodeGUID`; the count is written to
    // *inputFmtCount. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetInputFormatCount(GUID encodeGUID, uint32_t* inputFmtCount)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetInputFormatCount(m_hEncoder, encodeGUID, inputFmtCount);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetInputFormats for codec `encodeGUID`, passing the caller's array,
    // its capacity and the output count through. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetInputFormats(GUID encodeGUID, NV_ENC_BUFFER_FORMAT* inputFmts, uint32_t inputFmtArraySize, uint32_t* inputFmtCount)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetInputFormats(m_hEncoder, encodeGUID, inputFmts, inputFmtArraySize, inputFmtCount);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetEncodeCaps for codec `encodeGUID`; the queried value is written
    // to *capsVal. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodeCaps(GUID encodeGUID, NV_ENC_CAPS_PARAM* capsParam, int* capsVal)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetEncodeCaps(m_hEncoder, encodeGUID, capsParam, capsVal);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetEncodePresetCount for codec `encodeGUID`; the count is written
    // to *encodePresetGUIDCount. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodePresetCount(GUID encodeGUID, uint32_t* encodePresetGUIDCount)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetEncodePresetCount(m_hEncoder, encodeGUID, encodePresetGUIDCount);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetEncodePresetGUIDs for codec `encodeGUID`, passing the caller's
    // array, its capacity and the output count through. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodePresetGUIDs(GUID encodeGUID, GUID* presetGUIDs, uint32_t guidArraySize, uint32_t* encodePresetGUIDCount)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetEncodePresetGUIDs(m_hEncoder, encodeGUID, presetGUIDs, guidArraySize, encodePresetGUIDCount);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Forwards to nvEncGetEncodePresetConfig for the given codec and preset; the result is
    // written to *presetConfig. Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodePresetConfig(GUID encodeGUID, GUID presetGUID, NV_ENC_PRESET_CONFIG* presetConfig)
    {
        const NVENCSTATUS queryStatus = m_pEncodeAPI->nvEncGetEncodePresetConfig(m_hEncoder, encodeGUID, presetGUID, presetConfig);
        if (queryStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        return queryStatus;
    }

    // Creates one input buffer of the given dimensions through nvEncCreateInputBuffer.
    // The buffer is requested from the cached system-memory heap; isYuv444 selects the
    // YUV444 format, otherwise NV12 is used. The handle reported by the API is stored in
    // *inputBuffer regardless of the returned status.
    NVENCSTATUS CNvHWEncoder::NvEncCreateInputBuffer(uint32_t width, uint32_t height, void** inputBuffer, uint32_t isYuv444)
    {
        NV_ENC_CREATE_INPUT_BUFFER request;
        memset(&request, 0, sizeof(request));
        SET_VER(request, NV_ENC_CREATE_INPUT_BUFFER);

        request.width = width;
        request.height = height;
        request.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
        request.bufferFmt = isYuv444 ? NV_ENC_BUFFER_FORMAT_YUV444_PL : NV_ENC_BUFFER_FORMAT_NV12_PL;

        const NVENCSTATUS createStatus = m_pEncodeAPI->nvEncCreateInputBuffer(m_hEncoder, &request);
        if (createStatus != NV_ENC_SUCCESS)
        {
            // Debug builds stop here; release builds only report the status.
            assert(0);
        }

        *inputBuffer = request.inputBuffer;

        return createStatus;
    }

    // Releases `inputBuffer` through nvEncDestroyInputBuffer.
    // A null handle is treated as "nothing to do" and reports NV_ENC_SUCCESS
    // without calling the API. Otherwise the API status is returned unchanged,
    // and a non-success status also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncDestroyInputBuffer(NV_ENC_INPUT_PTR inputBuffer)
    {
        if (!inputBuffer)
        {
            return NV_ENC_SUCCESS;
        }

        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncDestroyInputBuffer(m_hEncoder, inputBuffer);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
        }
        return apiResult;
    }

    // Allocates a motion-vector buffer through nvEncCreateMVBuffer and stores
    // the resulting handle in `*bitstreamBuffer`.
    // `size` is accepted for interface symmetry but is not used by this wrapper.
    // Returns the API status; a non-success status also hits assert(0).
    // `*bitstreamBuffer` is written in every case (the request struct is zeroed
    // before the call).
    NVENCSTATUS CNvHWEncoder::NvEncCreateMVBuffer(uint32_t size, void** bitstreamBuffer)
    {
        NV_ENC_CREATE_MV_BUFFER mvAllocRequest;
        memset(&mvAllocRequest, 0, sizeof(mvAllocRequest));
        SET_VER(mvAllocRequest, NV_ENC_CREATE_MV_BUFFER);

        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncCreateMVBuffer(m_hEncoder, &mvAllocRequest);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
        }

        *bitstreamBuffer = mvAllocRequest.MVBuffer;
        return apiResult;
    }

    // Releases a motion-vector buffer through nvEncDestroyMVBuffer.
    //
    // A null handle is treated as "nothing to do" and reports NV_ENC_SUCCESS
    // without calling the API, matching NvEncDestroyInputBuffer and
    // NvEncDestroyBitstreamBuffer. Otherwise the API status is returned
    // unchanged, and a non-success status also hits assert(0).
    //
    // The handle is passed by value, so this function cannot clear the caller's
    // copy; callers must drop their own handle after a successful destroy.
    NVENCSTATUS CNvHWEncoder::NvEncDestroyMVBuffer(NV_ENC_OUTPUT_PTR bitstreamBuffer)
    {
        NVENCSTATUS status = NV_ENC_SUCCESS;

        if (bitstreamBuffer)
        {
            status = m_pEncodeAPI->nvEncDestroyMVBuffer(m_hEncoder, bitstreamBuffer);
            if (status != NV_ENC_SUCCESS)
            {
                assert(0);
            }
        }

        return status;
    }

    // Runs motion estimation between two frames via nvEncRunMotionEstimationOnly.
    //   pEncodeBuffer[0] - supplies the reference frame surface and the output
    //                      buffer that receives the motion vectors
    //   pEncodeBuffer[1] - supplies the input frame surface plus its format,
    //                      width and height
    //   pMEOnly          - currently unused by this wrapper
    // Returns the API status unchanged.
    NVENCSTATUS CNvHWEncoder::NvRunMotionEstimationOnly(EncodeBuffer *pEncodeBuffer[2], MEOnlyConfig *pMEOnly)
    {
        NVENCSTATUS nvStatus;
        NV_ENC_MEONLY_PARAMS stMEOnlyParams;

        // Zero the whole struct first: only a subset of its fields is assigned
        // below, and anything left untouched would otherwise reach the API as
        // indeterminate stack contents.
        memset(&stMEOnlyParams, 0, sizeof(stMEOnlyParams));
        SET_VER(stMEOnlyParams,NV_ENC_MEONLY_PARAMS);

        stMEOnlyParams.referenceFrame = pEncodeBuffer[0]->stInputBfr.hInputSurface;
        stMEOnlyParams.inputBuffer = pEncodeBuffer[1]->stInputBfr.hInputSurface;
        stMEOnlyParams.bufferFmt = pEncodeBuffer[1]->stInputBfr.bufferFmt;
        stMEOnlyParams.inputWidth = pEncodeBuffer[1]->stInputBfr.dwWidth;
        stMEOnlyParams.inputHeight = pEncodeBuffer[1]->stInputBfr.dwHeight;
        stMEOnlyParams.outputMV = pEncodeBuffer[0]->stOutputBfr.hBitstreamBuffer;

        nvStatus = m_pEncodeAPI->nvEncRunMotionEstimationOnly(m_hEncoder, &stMEOnlyParams);

        return nvStatus;
    }

    // Allocates an output bitstream buffer of `size` through
    // nvEncCreateBitstreamBuffer (heap: NV_ENC_MEMORY_HEAP_SYSMEM_CACHED) and
    // stores the resulting handle in `*bitstreamBuffer`.
    // Returns the API status; a non-success status also hits assert(0).
    // `*bitstreamBuffer` is written in every case (the request struct is zeroed
    // before the call).
    NVENCSTATUS CNvHWEncoder::NvEncCreateBitstreamBuffer(uint32_t size, void** bitstreamBuffer)
    {
        NV_ENC_CREATE_BITSTREAM_BUFFER allocRequest;
        memset(&allocRequest, 0, sizeof(allocRequest));
        SET_VER(allocRequest, NV_ENC_CREATE_BITSTREAM_BUFFER);

        allocRequest.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
        allocRequest.size = size;

        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncCreateBitstreamBuffer(m_hEncoder, &allocRequest);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
        }

        *bitstreamBuffer = allocRequest.bitstreamBuffer;
        return apiResult;
    }

    // Releases `bitstreamBuffer` through nvEncDestroyBitstreamBuffer.
    // A null handle is treated as "nothing to do" and reports NV_ENC_SUCCESS
    // without calling the API. Otherwise the API status is returned unchanged,
    // and a non-success status also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncDestroyBitstreamBuffer(NV_ENC_OUTPUT_PTR bitstreamBuffer)
    {
        if (!bitstreamBuffer)
        {
            return NV_ENC_SUCCESS;
        }

        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncDestroyBitstreamBuffer(m_hEncoder, bitstreamBuffer);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
        }
        return apiResult;
    }

    // Forwards `lockBitstreamBufferParams` to nvEncLockBitstream on the current
    // session; the caller prepares the parameter struct.
    // Returns the API status unchanged; a non-success status also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncLockBitstream(NV_ENC_LOCK_BITSTREAM* lockBitstreamBufferParams)
    {
        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncLockBitstream(m_hEncoder, lockBitstreamBufferParams);
        if (apiResult == NV_ENC_SUCCESS)
        {
            return apiResult;
        }

        assert(0);
        return apiResult;
    }

    // Forwards `bitstreamBuffer` to nvEncUnlockBitstream on the current session.
    // Returns the API status unchanged; a non-success status also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncUnlockBitstream(NV_ENC_OUTPUT_PTR bitstreamBuffer)
    {
        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncUnlockBitstream(m_hEncoder, bitstreamBuffer);
        if (apiResult == NV_ENC_SUCCESS)
        {
            return apiResult;
        }

        assert(0);
        return apiResult;
    }

    // Locks `inputBuffer` through nvEncLockInputBuffer and reports the mapping
    // the API filled in:
    //   *bufferDataPtr - data pointer from the lock parameters
    //   *pitch         - pitch from the lock parameters
    // Returns the API status; a non-success status also hits assert(0).
    // Both outputs are written in every case (the request struct is zeroed
    // before the call).
    NVENCSTATUS CNvHWEncoder::NvEncLockInputBuffer(void* inputBuffer, void** bufferDataPtr, uint32_t* pitch)
    {
        NV_ENC_LOCK_INPUT_BUFFER lockRequest;
        memset(&lockRequest, 0, sizeof(lockRequest));
        SET_VER(lockRequest, NV_ENC_LOCK_INPUT_BUFFER);
        lockRequest.inputBuffer = inputBuffer;

        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncLockInputBuffer(m_hEncoder, &lockRequest);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
        }

        *pitch = lockRequest.pitch;
        *bufferDataPtr = lockRequest.bufferDataPtr;
        return apiResult;
    }

    // Forwards `inputBuffer` to nvEncUnlockInputBuffer on the current session.
    // Returns the API status unchanged; a non-success status also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncUnlockInputBuffer(NV_ENC_INPUT_PTR inputBuffer)
    {
        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncUnlockInputBuffer(m_hEncoder, inputBuffer);
        if (apiResult == NV_ENC_SUCCESS)
        {
            return apiResult;
        }

        assert(0);
        return apiResult;
    }

    // Forwards `encodeStats` to nvEncGetEncodeStats on the current session; the
    // caller prepares the struct.
    // Returns the API status unchanged; a non-success status also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncGetEncodeStats(NV_ENC_STAT* encodeStats)
    {
        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncGetEncodeStats(m_hEncoder, encodeStats);
        if (apiResult == NV_ENC_SUCCESS)
        {
            return apiResult;
        }

        assert(0);
        return apiResult;
    }

    // Forwards `sequenceParamPayload` to nvEncGetSequenceParams on the current
    // session; the caller prepares the payload struct.
    // Returns the API status unchanged; a non-success status also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncGetSequenceParams(NV_ENC_SEQUENCE_PARAM_PAYLOAD* sequenceParamPayload)
    {
        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncGetSequenceParams(m_hEncoder, sequenceParamPayload);
        if (apiResult == NV_ENC_SUCCESS)
        {
            return apiResult;
        }

        assert(0);
        return apiResult;
    }

    // Destroys the encode session if CreateEncoder previously marked it as
    // initialized; otherwise does nothing and reports NV_ENC_SUCCESS.
    // The initialized flag is cleared regardless of the status returned by
    // nvEncDestroyEncoder, and that status is passed back to the caller.
    NVENCSTATUS CNvHWEncoder::NvEncDestroyEncoder()
    {
        if (!m_bEncoderInitialized)
        {
            return NV_ENC_SUCCESS;
        }

        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncDestroyEncoder(m_hEncoder);
        m_bEncoderInitialized = false;
        return apiResult;
    }

    // Invalidates every reference frame listed in `pEncPicCommand`
    // (refFrameNumbers[0 .. numRefFramesToInvalidate-1]) by calling
    // nvEncInvalidateRefFrames once per entry.
    //
    // Every entry is attempted even if an earlier one fails. The return value is
    // the status of the FIRST failing call, or NV_ENC_SUCCESS when all calls
    // succeed (including when the list is empty), so an early failure is not
    // hidden by a later success.
    NVENCSTATUS CNvHWEncoder::NvEncInvalidateRefFrames(const NvEncPictureCommand *pEncPicCommand)
    {
        NVENCSTATUS firstFailure = NV_ENC_SUCCESS;

        for (uint32_t i = 0; i < pEncPicCommand->numRefFramesToInvalidate; i++)
        {
            const NVENCSTATUS nvStatus = m_pEncodeAPI->nvEncInvalidateRefFrames(m_hEncoder, pEncPicCommand->refFrameNumbers[i]);
            if (nvStatus != NV_ENC_SUCCESS && firstFailure == NV_ENC_SUCCESS)
            {
                firstFailure = nvStatus;
            }
        }

        return firstFailure;
    }

    // Opens an encode session on `device` (interpreted according to
    // `deviceType`) through nvEncOpenEncodeSessionEx, requesting API version
    // NVENCAPI_VERSION. The session handle is written to m_hEncoder by the API.
    // Returns the API status unchanged; a non-success status also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncOpenEncodeSessionEx(void* device, NV_ENC_DEVICE_TYPE deviceType)
    {
        NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS sessionRequest;
        memset(&sessionRequest, 0, sizeof(sessionRequest));
        SET_VER(sessionRequest, NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS);

        sessionRequest.apiVersion = NVENCAPI_VERSION;
        sessionRequest.deviceType = deviceType;
        sessionRequest.device = device;
        sessionRequest.reserved = NULL;

        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncOpenEncodeSessionEx(&sessionRequest, &m_hEncoder);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
        }

        return apiResult;
    }

    // Applies a pending resolution and/or bitrate change from `pEncPicCommand`
    // to the running encoder via nvEncReconfigureEncoder.
    //
    // - Nothing pending: returns NV_ENC_SUCCESS without touching the encoder.
    // - Resolution change: the new size must not exceed m_uMaxWidth /
    //   m_uMaxHeight, otherwise NV_ENC_ERR_INVALID_PARAM is returned and no
    //   member state is modified. On acceptance the encode and display-aspect
    //   dimensions are updated and an IDR frame is forced.
    // - Bitrate change: average and max bitrate are set to newBitrate; the VBV
    //   buffer size is newVBVSize, or bitrate * frameRateDen / frameRateNum when
    //   newVBVSize is 0; the initial VBV delay equals the VBV buffer size.
    //
    // Returns the status of nvEncReconfigureEncoder; a non-success status from
    // that call also hits assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncReconfigureEncoder(const NvEncPictureCommand *pEncPicCommand)
    {
        NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

        if (pEncPicCommand->bBitrateChangePending || pEncPicCommand->bResolutionChangePending)
        {
            if (pEncPicCommand->bResolutionChangePending)
            {
                // Validate the request BEFORE committing it, so that a rejected
                // resolution leaves m_uCurWidth / m_uCurHeight untouched.
                const uint32_t requestedWidth = pEncPicCommand->newWidth;
                const uint32_t requestedHeight = pEncPicCommand->newHeight;
                if ((requestedWidth > m_uMaxWidth) || (requestedHeight > m_uMaxHeight))
                {
                    return NV_ENC_ERR_INVALID_PARAM;
                }

                m_uCurWidth = requestedWidth;
                m_uCurHeight = requestedHeight;
                m_stCreateEncodeParams.encodeWidth = m_uCurWidth;
                m_stCreateEncodeParams.encodeHeight = m_uCurHeight;
                m_stCreateEncodeParams.darWidth = m_uCurWidth;
                m_stCreateEncodeParams.darHeight = m_uCurHeight;
            }

            if (pEncPicCommand->bBitrateChangePending)
            {
                m_stEncodeConfig.rcParams.averageBitRate = pEncPicCommand->newBitrate;
                m_stEncodeConfig.rcParams.maxBitRate = pEncPicCommand->newBitrate;
                m_stEncodeConfig.rcParams.vbvBufferSize = pEncPicCommand->newVBVSize != 0 ? pEncPicCommand->newVBVSize : (pEncPicCommand->newBitrate * m_stCreateEncodeParams.frameRateDen) / m_stCreateEncodeParams.frameRateNum;
                m_stEncodeConfig.rcParams.vbvInitialDelay = m_stEncodeConfig.rcParams.vbvBufferSize;
            }

            // The reconfigure request carries a copy of the creation parameters,
            // whose encodeConfig pointer was set to &m_stEncodeConfig in
            // CreateEncoder, so the rate-control edits above travel with it.
            NV_ENC_RECONFIGURE_PARAMS stReconfigParams;
            memset(&stReconfigParams, 0, sizeof(stReconfigParams));
            memcpy(&stReconfigParams.reInitEncodeParams, &m_stCreateEncodeParams, sizeof(m_stCreateEncodeParams));
            stReconfigParams.version = NV_ENC_RECONFIGURE_PARAMS_VER;
            stReconfigParams.forceIDR = pEncPicCommand->bResolutionChangePending ? 1 : 0;

            nvStatus = m_pEncodeAPI->nvEncReconfigureEncoder(m_hEncoder, &stReconfigParams);
            if (nvStatus != NV_ENC_SUCCESS)
            {
                assert(0);
            }
        }

        return nvStatus;
    }

    // Puts the wrapper into its "nothing loaded, nothing opened" state: all
    // handles and pointers are null, all counters and dimensions are zero, and
    // the two persistent parameter structs are zeroed and version-stamped.
    // Initialize() and CreateEncoder() must be called before encoding.
    CNvHWEncoder::CNvHWEncoder()
    {
        m_hEncoder = NULL;
        m_bEncoderInitialized = false;
        m_pEncodeAPI = NULL;
        m_hinstLib = NULL;
        m_EncodeIdx = 0;
        m_uCurWidth = 0;
        m_uCurHeight = 0;
        m_uMaxWidth = 0;
        m_uMaxHeight = 0;

        // codecGUID is only assigned in CreateEncoder but is read by
        // NvEncEncodeFrame; zero it so it never holds an indeterminate value.
        memset(&codecGUID, 0, sizeof(codecGUID));

        memset(&m_stCreateEncodeParams, 0, sizeof(m_stCreateEncodeParams));
        SET_VER(m_stCreateEncodeParams, NV_ENC_INITIALIZE_PARAMS);

        memset(&m_stEncodeConfig, 0, sizeof(m_stEncodeConfig));
        SET_VER(m_stEncodeConfig, NV_ENC_CONFIG);
    }

    // Frees the API function table allocated in Initialize() and unloads the
    // NVENC library if it was loaded. The encode session itself is not destroyed
    // here.
    CNvHWEncoder::~CNvHWEncoder()
    {
        // delete on a null pointer is a no-op, so no explicit guard is needed
        delete m_pEncodeAPI;
        m_pEncodeAPI = NULL;

        if (!m_hinstLib)
        {
            return;
        }

    #if defined (NV_WINDOWS)
        FreeLibrary(m_hinstLib);
    #else
        dlclose(m_hinstLib);
    #endif

        m_hinstLib = NULL;
    }

    // Checks whether `inputCodecGuid` is among the codec GUIDs the current
    // session reports.
    // Returns NV_ENC_SUCCESS when it is listed, NV_ENC_ERR_INVALID_PARAM when it
    // is not, or the failing status of nvEncGetEncodeGUIDCount /
    // nvEncGetEncodeGUIDs (those failures also hit assert(0)).
    NVENCSTATUS CNvHWEncoder::ValidateEncodeGUID (GUID inputCodecGuid)
    {
        // First ask how many codec GUIDs exist so the list can be sized.
        unsigned int advertisedCount = 0;
        NVENCSTATUS apiResult = m_pEncodeAPI->nvEncGetEncodeGUIDCount(m_hEncoder, &advertisedCount);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
            return apiResult;
        }

        GUID *guidList = new GUID[advertisedCount];
        memset(guidList, 0, sizeof(GUID)* advertisedCount);

        // Then fetch the list; returnedCount is how many entries were filled in.
        unsigned int returnedCount = 0;
        apiResult = m_pEncodeAPI->nvEncGetEncodeGUIDs(m_hEncoder, guidList, advertisedCount, &returnedCount);
        if (apiResult != NV_ENC_SUCCESS)
        {
            delete[] guidList;
            assert(0);
            return apiResult;
        }

        assert(returnedCount <= advertisedCount);

        bool isListed = false;
        for (unsigned int idx = 0; idx < returnedCount && !isListed; ++idx)
        {
            if (inputCodecGuid == guidList[idx])
            {
                isListed = true;
            }
        }

        delete[] guidList;

        return isListed ? NV_ENC_SUCCESS : NV_ENC_ERR_INVALID_PARAM;
    }

    // Checks whether `inputPresetGuid` is among the preset GUIDs the current
    // session reports for the codec `inputCodecGuid`.
    // Returns NV_ENC_SUCCESS when it is listed, NV_ENC_ERR_INVALID_PARAM when it
    // is not, or the failing status of nvEncGetEncodePresetCount /
    // nvEncGetEncodePresetGUIDs (those failures also hit assert(0)).
    NVENCSTATUS CNvHWEncoder::ValidatePresetGUID(GUID inputPresetGuid, GUID inputCodecGuid)
    {
        // First ask how many presets exist so the list can be sized.
        uint32_t advertisedCount = 0;
        NVENCSTATUS apiResult = m_pEncodeAPI->nvEncGetEncodePresetCount(m_hEncoder, inputCodecGuid, &advertisedCount);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
            return apiResult;
        }

        GUID *guidList = new GUID[advertisedCount];
        memset(guidList, 0, sizeof(GUID)* advertisedCount);

        // Then fetch the list; returnedCount is how many entries were filled in.
        uint32_t returnedCount = 0;
        apiResult = m_pEncodeAPI->nvEncGetEncodePresetGUIDs(m_hEncoder, inputCodecGuid, guidList, advertisedCount, &returnedCount);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
            delete[] guidList;
            return apiResult;
        }

        assert(returnedCount <= advertisedCount);

        bool isListed = false;
        for (uint32_t idx = 0; idx < returnedCount && !isListed; ++idx)
        {
            if (inputPresetGuid == guidList[idx])
            {
                isListed = true;
            }
        }

        delete[] guidList;

        return isListed ? NV_ENC_SUCCESS : NV_ENC_ERR_INVALID_PARAM;
    }

    // Configures and initializes the encode session from `pEncCfg`.
    //
    // Steps, in order:
    //   1. Validate the configuration: non-null, non-zero size, size within the
    //      maximum, no 4:4:4 with HEVC, codec GUID reported by the session.
    //   2. Fill m_stCreateEncodeParams (size, frame rate, flags, max size).
    //   3. Load the preset's NV_ENC_CONFIG into m_stEncodeConfig, then override
    //      GOP, field mode, rate control, QP, chroma format, intra refresh,
    //      reference-frame count and IDR period from `pEncCfg`.
    //   4. If ME-only mode is requested (enableMEOnly 1 or 2), require the
    //      NV_ENC_CAPS_SUPPORT_MEONLY_MODE capability.
    //   5. Call nvEncInitializeEncoder and mark the encoder as initialized.
    //
    // Returns NV_ENC_ERR_INVALID_PARAM for a rejected configuration,
    // NV_ENC_ERR_UNSUPPORTED_DEVICE when ME-only is requested but not reported,
    // the status of whichever API call failed, or NV_ENC_SUCCESS.
    NVENCSTATUS CNvHWEncoder::CreateEncoder(const EncodeConfig *pEncCfg)
    {
        NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

        if (pEncCfg == NULL)
        {
            return NV_ENC_ERR_INVALID_PARAM;
        }

        m_uCurWidth = pEncCfg->width;
        m_uCurHeight = pEncCfg->height;

        // A maximum of 0 means "not specified": fall back to the encode size.
        m_uMaxWidth = (pEncCfg->maxWidth > 0 ? pEncCfg->maxWidth : pEncCfg->width);
        m_uMaxHeight = (pEncCfg->maxHeight > 0 ? pEncCfg->maxHeight : pEncCfg->height);

        if ((m_uCurWidth > m_uMaxWidth) || (m_uCurHeight > m_uMaxHeight)) {
            return NV_ENC_ERR_INVALID_PARAM;
        }

        if (!pEncCfg->width || !pEncCfg->height)
        {
            return NV_ENC_ERR_INVALID_PARAM;
        }

        if (pEncCfg->isYuv444 && (pEncCfg->codec == NV_ENC_HEVC))
        {
            PRINTERR("444 is not supported with HEVC \n");
            return NV_ENC_ERR_INVALID_PARAM;
        }

        // Anything that is not explicitly H.264 is treated as HEVC.
        GUID inputCodecGUID = pEncCfg->codec == NV_ENC_H264 ? NV_ENC_CODEC_H264_GUID : NV_ENC_CODEC_HEVC_GUID;
        nvStatus = ValidateEncodeGUID(inputCodecGUID);
        if (nvStatus != NV_ENC_SUCCESS)
        {
            PRINTERR("codec not supported \n");
            return nvStatus;
        }

        codecGUID = inputCodecGUID;

        m_stCreateEncodeParams.encodeGUID = inputCodecGUID;
        m_stCreateEncodeParams.presetGUID = pEncCfg->presetGUID;
        m_stCreateEncodeParams.encodeWidth = pEncCfg->width;
        m_stCreateEncodeParams.encodeHeight = pEncCfg->height;

        m_stCreateEncodeParams.darWidth = pEncCfg->width;
        m_stCreateEncodeParams.darHeight = pEncCfg->height;
        m_stCreateEncodeParams.frameRateNum = pEncCfg->fps;
        m_stCreateEncodeParams.frameRateDen = 1;
        m_stCreateEncodeParams.enableEncodeAsync = 0;
        m_stCreateEncodeParams.enablePTD = 1;
        m_stCreateEncodeParams.reportSliceOffsets = 0;
        m_stCreateEncodeParams.enableSubFrameWrite = 0;
        m_stCreateEncodeParams.encodeConfig = &m_stEncodeConfig;
        m_stCreateEncodeParams.maxEncodeWidth = m_uMaxWidth;
        m_stCreateEncodeParams.maxEncodeHeight = m_uMaxHeight;

        // apply preset
        NV_ENC_PRESET_CONFIG stPresetCfg;
        memset(&stPresetCfg, 0, sizeof(NV_ENC_PRESET_CONFIG));
        SET_VER(stPresetCfg, NV_ENC_PRESET_CONFIG);
        SET_VER(stPresetCfg.presetCfg, NV_ENC_CONFIG);

        nvStatus = m_pEncodeAPI->nvEncGetEncodePresetConfig(m_hEncoder, m_stCreateEncodeParams.encodeGUID, m_stCreateEncodeParams.presetGUID, &stPresetCfg);
        if (nvStatus != NV_ENC_SUCCESS)
        {
            PRINTERR("nvEncGetEncodePresetConfig returned failure");
            return nvStatus;
        }
        // The preset becomes the baseline; everything below overrides it.
        memcpy(&m_stEncodeConfig, &stPresetCfg.presetCfg, sizeof(NV_ENC_CONFIG));

        m_stEncodeConfig.gopLength = pEncCfg->gopLength;
        m_stEncodeConfig.frameIntervalP = pEncCfg->numB + 1;
        if (pEncCfg->pictureStruct == NV_ENC_PIC_STRUCT_FRAME)
        {
            m_stEncodeConfig.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
        }
        else
        {
            m_stEncodeConfig.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
        }

        m_stEncodeConfig.mvPrecision = NV_ENC_MV_PRECISION_QUARTER_PEL;

        // Without any bitrate target the encoder falls back to constant QP.
        if (pEncCfg->bitrate || pEncCfg->vbvMaxBitrate)
        {
            m_stEncodeConfig.rcParams.rateControlMode = (NV_ENC_PARAMS_RC_MODE)pEncCfg->rcMode;
            m_stEncodeConfig.rcParams.averageBitRate = pEncCfg->bitrate;
            m_stEncodeConfig.rcParams.maxBitRate = pEncCfg->vbvMaxBitrate;
            m_stEncodeConfig.rcParams.vbvBufferSize = pEncCfg->vbvSize;
            m_stEncodeConfig.rcParams.vbvInitialDelay = pEncCfg->vbvSize * 9 / 10;
        }
        else
        {
            m_stEncodeConfig.rcParams.rateControlMode = NV_ENC_PARAMS_RC_CONSTQP;
        }

        if (pEncCfg->rcMode == 0)
        {
            // The lossless preset forces QP 0 for all picture types.
            m_stEncodeConfig.rcParams.constQP.qpInterP = pEncCfg->presetGUID == NV_ENC_PRESET_LOSSLESS_HP_GUID? 0 : pEncCfg->qp;
            m_stEncodeConfig.rcParams.constQP.qpInterB = pEncCfg->presetGUID == NV_ENC_PRESET_LOSSLESS_HP_GUID? 0 : pEncCfg->qp;
            m_stEncodeConfig.rcParams.constQP.qpIntra = pEncCfg->presetGUID == NV_ENC_PRESET_LOSSLESS_HP_GUID? 0 : pEncCfg->qp;
        }

        // set up initial QP value
        if (pEncCfg->rcMode == NV_ENC_PARAMS_RC_VBR || pEncCfg->rcMode == NV_ENC_PARAMS_RC_VBR_MINQP ||
            pEncCfg->rcMode == NV_ENC_PARAMS_RC_2_PASS_VBR) {
            m_stEncodeConfig.rcParams.enableInitialRCQP = 1;
            m_stEncodeConfig.rcParams.initialRCQP.qpInterP = pEncCfg->qp;
            if(pEncCfg->i_quant_factor != 0.0 && pEncCfg->b_quant_factor != 0.0) {
                // Derive I and B QPs from the P QP using the configured factor and offset.
                m_stEncodeConfig.rcParams.initialRCQP.qpIntra = (int)(pEncCfg->qp * FABS(pEncCfg->i_quant_factor) + pEncCfg->i_quant_offset);
                m_stEncodeConfig.rcParams.initialRCQP.qpInterB = (int)(pEncCfg->qp * FABS(pEncCfg->b_quant_factor) + pEncCfg->b_quant_offset);
            } else {
                m_stEncodeConfig.rcParams.initialRCQP.qpIntra = pEncCfg->qp;
                m_stEncodeConfig.rcParams.initialRCQP.qpInterB = pEncCfg->qp;
            }

        }

        // chromaFormatIDC is written through the H.264 view of encodeCodecConfig,
        // so only touch it when H.264 is the selected codec. NOTE(review):
        // h264Config and hevcConfig are presumed to share storage (union in
        // nvEncodeAPI.h) - writing the H.264 field while encoding HEVC would then
        // overwrite HEVC settings loaded from the preset. 4:4:4 with HEVC is
        // already rejected above.
        if (pEncCfg->codec == NV_ENC_H264)
        {
            if (pEncCfg->isYuv444)
            {
                m_stEncodeConfig.encodeCodecConfig.h264Config.chromaFormatIDC = 3;
            }
            else
            {
                m_stEncodeConfig.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
            }
        }

        if (pEncCfg->intraRefreshEnableFlag)
        {
            if (pEncCfg->codec == NV_ENC_HEVC)
            {
                m_stEncodeConfig.encodeCodecConfig.hevcConfig.enableIntraRefresh = 1;
                m_stEncodeConfig.encodeCodecConfig.hevcConfig.intraRefreshPeriod = pEncCfg->intraRefreshPeriod;
                m_stEncodeConfig.encodeCodecConfig.hevcConfig.intraRefreshCnt = pEncCfg->intraRefreshDuration;
            }
            else
            {
                m_stEncodeConfig.encodeCodecConfig.h264Config.enableIntraRefresh = 1;
                m_stEncodeConfig.encodeCodecConfig.h264Config.intraRefreshPeriod = pEncCfg->intraRefreshPeriod;
                m_stEncodeConfig.encodeCodecConfig.h264Config.intraRefreshCnt = pEncCfg->intraRefreshDuration;
            }
        }

        if (pEncCfg->invalidateRefFramesEnableFlag)
        {
            if (pEncCfg->codec == NV_ENC_HEVC)
            {
                m_stEncodeConfig.encodeCodecConfig.hevcConfig.maxNumRefFramesInDPB = 16;
            }
            else
            {
                m_stEncodeConfig.encodeCodecConfig.h264Config.maxNumRefFrames = 16;
            }
        }

        if (pEncCfg->qpDeltaMapFile)
        {
            m_stEncodeConfig.rcParams.enableExtQPDeltaMap = 1;
        }
        if (pEncCfg->codec == NV_ENC_H264)
        {
            m_stEncodeConfig.encodeCodecConfig.h264Config.idrPeriod = pEncCfg->gopLength;
        }
        else if (pEncCfg->codec == NV_ENC_HEVC)
        {
            m_stEncodeConfig.encodeCodecConfig.hevcConfig.idrPeriod = pEncCfg->gopLength;
        }

        if (pEncCfg->enableMEOnly == 1 || pEncCfg->enableMEOnly == 2)
        {
            // ME-only mode is optional hardware functionality: query the
            // capability and refuse to continue when it is not reported.
            NV_ENC_CAPS_PARAM stCapsParam;
            memset(&stCapsParam, 0, sizeof(NV_ENC_CAPS_PARAM));
            SET_VER(stCapsParam, NV_ENC_CAPS_PARAM);
            stCapsParam.capsToQuery = NV_ENC_CAPS_SUPPORT_MEONLY_MODE;
            m_stCreateEncodeParams.enableMEOnlyMode = true;
            int meonlyMode = 0;
            nvStatus = m_pEncodeAPI->nvEncGetEncodeCaps(m_hEncoder, m_stCreateEncodeParams.encodeGUID, &stCapsParam, &meonlyMode);
            if (nvStatus != NV_ENC_SUCCESS)
            {
                PRINTERR("Encode Session Initialization failed");
                return nvStatus;
            }
            else
            {
                if (meonlyMode == 1)
                {
                    printf("NV_ENC_CAPS_SUPPORT_MEONLY_MODE supported\n");
                }
                else
                {
                    PRINTERR("NV_ENC_CAPS_SUPPORT_MEONLY_MODE not supported\n");
                    return NV_ENC_ERR_UNSUPPORTED_DEVICE;
                }
            }
        }

        nvStatus = m_pEncodeAPI->nvEncInitializeEncoder(m_hEncoder, &m_stCreateEncodeParams);
        if (nvStatus != NV_ENC_SUCCESS)
        {
            PRINTERR("Encode Session Initialization failed");
            return nvStatus;
        }
        m_bEncoderInitialized = true;

        return nvStatus;
    }

    // Maps a preset name to its preset GUID and verifies that the session
    // reports that preset for the selected codec.
    //   encoderPreset - case-insensitive name: "hq", "hp", "lowLatencyHP",
    //                   "lowLatencyHQ" or "lossless"; may be NULL, which selects
    //                   the default preset silently
    //   codec         - NV_ENC_H264 selects H.264; any other value selects HEVC
    // Returns the matching preset GUID, or NV_ENC_PRESET_DEFAULT_GUID (after
    // printing an error) when the name is unknown or ValidatePresetGUID rejects
    // the preset.
    GUID CNvHWEncoder::GetPresetGUID(char* encoderPreset, int codec)
    {
        NVENCSTATUS nvStatus = NV_ENC_SUCCESS;
        GUID presetGUID = NV_ENC_PRESET_DEFAULT_GUID;

        if (encoderPreset)
        {
            if (stricmp(encoderPreset, "hq") == 0)
            {
                presetGUID = NV_ENC_PRESET_HQ_GUID;
            }
            else if (stricmp(encoderPreset, "lowLatencyHP") == 0)
            {
                presetGUID = NV_ENC_PRESET_LOW_LATENCY_HP_GUID;
            }
            else if (stricmp(encoderPreset, "hp") == 0)
            {
                presetGUID = NV_ENC_PRESET_HP_GUID;
            }
            else if (stricmp(encoderPreset, "lowLatencyHQ") == 0)
            {
                presetGUID = NV_ENC_PRESET_LOW_LATENCY_HQ_GUID;
            }
            else if (stricmp(encoderPreset, "lossless") == 0)
            {
                presetGUID = NV_ENC_PRESET_LOSSLESS_HP_GUID;
            }
            else
            {
                PRINTERR("Unsupported preset guid %s\n", encoderPreset);
                presetGUID = NV_ENC_PRESET_DEFAULT_GUID;
            }
        }

        GUID inputCodecGUID = codec == NV_ENC_H264 ? NV_ENC_CODEC_H264_GUID : NV_ENC_CODEC_HEVC_GUID;
        nvStatus = ValidatePresetGUID(presetGUID, inputCodecGUID);
        if (nvStatus != NV_ENC_SUCCESS)
        {
            presetGUID = NV_ENC_PRESET_DEFAULT_GUID;
            // encoderPreset may be NULL here (default preset rejected); passing
            // a null pointer to %s is undefined behavior, so substitute a label.
            PRINTERR("Unsupported preset guid %s\n", encoderPreset ? encoderPreset : "(default)");
        }

        return presetGUID;
    }

    // Waits for the encoded output of `pEncodeBuffer` by locking its bitstream
    // buffer (blocking: doNotWait is false) and unlocking it again right away;
    // the locked data is not read or written anywhere by this function.
    //
    // Returns NV_ENC_SUCCESS immediately for an end-of-stream buffer,
    // NV_ENC_ERR_INVALID_PARAM for a non-EOS buffer without a bitstream handle,
    // the lock status if locking fails (an error is printed), otherwise the
    // status of the unlock call.
    NVENCSTATUS CNvHWEncoder::ProcessOutput(const EncodeBuffer *pEncodeBuffer)
    {
        // An end-of-stream marker carries no bitstream to drain.
        if (pEncodeBuffer->stOutputBfr.bEOSFlag)
        {
            return NV_ENC_SUCCESS;
        }

        if (pEncodeBuffer->stOutputBfr.hBitstreamBuffer == NULL)
        {
            return NV_ENC_ERR_INVALID_PARAM;
        }

        NV_ENC_LOCK_BITSTREAM lockRequest;
        memset(&lockRequest, 0, sizeof(lockRequest));
        SET_VER(lockRequest, NV_ENC_LOCK_BITSTREAM);
        lockRequest.doNotWait = false;
        lockRequest.outputBitstream = pEncodeBuffer->stOutputBfr.hBitstreamBuffer;

        const NVENCSTATUS lockResult = m_pEncodeAPI->nvEncLockBitstream(m_hEncoder, &lockRequest);
        if (lockResult != NV_ENC_SUCCESS)
        {
            PRINTERR("lock bitstream function failed \n");
            return lockResult;
        }

        return m_pEncodeAPI->nvEncUnlockBitstream(m_hEncoder, pEncodeBuffer->stOutputBfr.hBitstreamBuffer);
    }

    // Loads the NVENC library, obtains the API function table and opens an
    // encode session on `device`.
    //
    // Sequence:
    //   1. Load the platform library (nvEncodeAPI64.dll / nvEncodeAPI.dll on
    //      Windows, libnvidia-encode.so.1 otherwise).
    //   2. Resolve the NvEncodeAPICreateInstance entry point.
    //   3. Allocate and zero the function table, stamp its version and let the
    //      entry point fill it in.
    //   4. Open the session through NvEncOpenEncodeSessionEx.
    //
    // Returns NV_ENC_ERR_OUT_OF_MEMORY when the library, the entry point or the
    // table allocation is unavailable; otherwise the first failing status from
    // steps 3-4, or NV_ENC_SUCCESS.
    NVENCSTATUS CNvHWEncoder::Initialize(void* device, NV_ENC_DEVICE_TYPE deviceType)
    {
        MYPROC createInstanceFn; // entry point that populates the API function table

    #if defined(NV_WINDOWS)
    #if defined (_WIN64)
        m_hinstLib = LoadLibrary(TEXT("nvEncodeAPI64.dll"));
    #else
        m_hinstLib = LoadLibrary(TEXT("nvEncodeAPI.dll"));
    #endif
    #else
        m_hinstLib = dlopen("libnvidia-encode.so.1", RTLD_LAZY);
    #endif
        if (m_hinstLib == NULL)
        {
            return NV_ENC_ERR_OUT_OF_MEMORY;
        }

    #if defined(NV_WINDOWS)
        createInstanceFn = (MYPROC)GetProcAddress(m_hinstLib, "NvEncodeAPICreateInstance");
    #else
        createInstanceFn = (MYPROC)dlsym(m_hinstLib, "NvEncodeAPICreateInstance");
    #endif
        if (createInstanceFn == NULL)
        {
            return NV_ENC_ERR_OUT_OF_MEMORY;
        }

        m_pEncodeAPI = new NV_ENCODE_API_FUNCTION_LIST;
        if (m_pEncodeAPI == NULL)
        {
            return NV_ENC_ERR_OUT_OF_MEMORY;
        }

        memset(m_pEncodeAPI, 0, sizeof(NV_ENCODE_API_FUNCTION_LIST));
        m_pEncodeAPI->version = NV_ENCODE_API_FUNCTION_LIST_VER;

        const NVENCSTATUS tableResult = createInstanceFn(m_pEncodeAPI);
        if (tableResult != NV_ENC_SUCCESS)
        {
            return tableResult;
        }

        const NVENCSTATUS sessionResult = NvEncOpenEncodeSessionEx(device, deviceType);
        if (sessionResult != NV_ENC_SUCCESS)
        {
            return sessionResult;
        }

        return NV_ENC_SUCCESS;
    }

    // Submits one frame to the encoder through nvEncEncodePicture.
    //   pEncodeBuffer       - supplies the input surface, its format and the
    //                         output bitstream buffer
    //   encPicCommand       - optional per-picture overrides (force IDR, force
    //                         intra refresh); may be null
    //   width, height       - input dimensions handed to the API
    //   ePicStruct          - picture structure handed to the API
    //   qpDeltaMapArray,
    //   qpDeltaMapArraySize - QP delta map handed to the API
    //
    // The running frame counter m_EncodeIdx is used as the input timestamp and
    // is advanced after every accepted submission.
    // NV_ENC_ERR_NEED_MORE_INPUT is treated as acceptance, so the function
    // returns NV_ENC_SUCCESS for it; any other failure hits assert(0) and is
    // returned unchanged.
    NVENCSTATUS CNvHWEncoder::NvEncEncodeFrame(EncodeBuffer *pEncodeBuffer, NvEncPictureCommand *encPicCommand,
        uint32_t width, uint32_t height, NV_ENC_PIC_STRUCT ePicStruct,
        int8_t *qpDeltaMapArray, uint32_t qpDeltaMapArraySize)
    {
        NV_ENC_PIC_PARAMS picRequest;
        memset(&picRequest, 0, sizeof(picRequest));
        SET_VER(picRequest, NV_ENC_PIC_PARAMS);

        // Input surface description
        picRequest.inputBuffer = pEncodeBuffer->stInputBfr.hInputSurface;
        picRequest.bufferFmt = pEncodeBuffer->stInputBfr.bufferFmt;
        picRequest.inputWidth = width;
        picRequest.inputHeight = height;
        picRequest.pictureStruct = ePicStruct;
        picRequest.inputTimeStamp = m_EncodeIdx;

        // Output target; no completion event is attached
        picRequest.outputBitstream = pEncodeBuffer->stOutputBfr.hBitstreamBuffer;
        picRequest.completionEvent = nullptr;

        // Optional per-block QP adjustments
        picRequest.qpDeltaMap = qpDeltaMapArray;
        picRequest.qpDeltaMapSize = qpDeltaMapArraySize;

        if (encPicCommand && encPicCommand->bForceIDR)
        {
            picRequest.encodePicFlags |= NV_ENC_PIC_FLAG_FORCEIDR;
        }

        if (encPicCommand && encPicCommand->bForceIntraRefresh)
        {
            // The refresh count lives in the codec-specific picture parameters.
            if (codecGUID == NV_ENC_CODEC_HEVC_GUID)
            {
                picRequest.codecPicParams.hevcPicParams.forceIntraRefreshWithFrameCnt = encPicCommand->intraRefreshDuration;
            }
            else
            {
                picRequest.codecPicParams.h264PicParams.forceIntraRefreshWithFrameCnt = encPicCommand->intraRefreshDuration;
            }
        }

        const NVENCSTATUS submitResult = m_pEncodeAPI->nvEncEncodePicture(m_hEncoder, &picRequest);
        const bool accepted = (submitResult == NV_ENC_SUCCESS) || (submitResult == NV_ENC_ERR_NEED_MORE_INPUT);
        if (!accepted)
        {
            assert(0);
            return submitResult;
        }

        ++m_EncodeIdx;
        return NV_ENC_SUCCESS;
    }

    // Signals end-of-stream to the encoder by submitting an otherwise empty
    // picture with NV_ENC_PIC_FLAG_EOS set; `hEOSEvent` is attached as the
    // completion event for that submission.
    // Returns the status of nvEncEncodePicture; a non-success status also hits
    // assert(0).
    NVENCSTATUS CNvHWEncoder::NvEncFlushEncoderQueue(void *hEOSEvent)
    {
        NV_ENC_PIC_PARAMS eosRequest;
        memset(&eosRequest, 0, sizeof(eosRequest));
        SET_VER(eosRequest, NV_ENC_PIC_PARAMS);

        eosRequest.completionEvent = hEOSEvent;
        eosRequest.encodePicFlags = NV_ENC_PIC_FLAG_EOS;

        const NVENCSTATUS apiResult = m_pEncodeAPI->nvEncEncodePicture(m_hEncoder, &eosRequest);
        if (apiResult != NV_ENC_SUCCESS)
        {
            assert(0);
        }
        return apiResult;
    }

    NVENCSTATUS CNvHWEncoder::ParseArguments(EncodeConfig *encodeConfig, int argc, char *argv[])
    {
    for (int i = 1; i < argc; i++)
    {
    if (stricmp(argv[i], "-bmpfilePath") == 0)
    {
    if (++i >= argc)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    encodeConfig->inputFilePath = argv[i];
    }
    else if (stricmp(argv[i], "-i") == 0)
    {
    if (++i >= argc)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    encodeConfig->inputFileName = argv[i];
    }
    else if (stricmp(argv[i], "-o") == 0)
    {
    if (++i >= argc)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    encodeConfig->outputFileName = argv[i];
    }
    else if (stricmp(argv[i], "-size") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->width) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }

    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->height) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 2]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-maxSize") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->maxWidth) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }

    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->maxHeight) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 2]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-bitrate") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->bitrate) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-vbvMaxBitrate") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->vbvMaxBitrate) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-vbvSize") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->vbvSize) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-fps") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->fps) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-startf") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->startFrameIdx) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-endf") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->endFrameIdx) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-rcmode") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->rcMode) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-goplength") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->gopLength) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-numB") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->numB) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-qp") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->qp) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-i_qfactor") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%f", &encodeConfig->i_quant_factor) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-b_qfactor") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%f", &encodeConfig->b_quant_factor) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-i_qoffset") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%f", &encodeConfig->i_quant_offset) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-b_qoffset") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%f", &encodeConfig->b_quant_offset) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-preset") == 0)
    {
    if (++i >= argc)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    encodeConfig->encoderPreset = argv[i];
    }
    else if (stricmp(argv[i], "-devicetype") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->deviceType) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-codec") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->codec) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-encCmdFile") == 0)
    {
    if (++i >= argc)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    encodeConfig->encCmdFileName = argv[i];
    }
    else if (stricmp(argv[i], "-intraRefresh") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->intraRefreshEnableFlag) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-intraRefreshPeriod") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->intraRefreshPeriod) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-intraRefreshDuration") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->intraRefreshDuration) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-picStruct") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->pictureStruct) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-deviceID") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->deviceID) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-yuv444") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->isYuv444) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-qpDeltaMapFile") == 0)
    {
    if (++i >= argc)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    encodeConfig->qpDeltaMapFile = argv[i];
    }
    else if (stricmp(argv[i], "-meonly") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->enableMEOnly) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    if (encodeConfig->enableMEOnly != 1 && encodeConfig->enableMEOnly != 2)
    {
    PRINTERR("invalid enableMEOnly value = %d (permissive value 1 and 2)\n", encodeConfig->enableMEOnly);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-preloadedFrameCount") == 0)
    {
    if (++i >= argc || sscanf(argv[i], "%d", &encodeConfig->preloadedFrameCount) != 1)
    {
    PRINTERR("invalid parameter for %s\n", argv[i - 1]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    if (encodeConfig->preloadedFrameCount <= 1)
    {
    PRINTERR("invalid preloadedFrameQueueSize value = %d (permissive value 2 and above)\n", encodeConfig->preloadedFrameCount);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }
    else if (stricmp(argv[i], "-help") == 0)
    {
    return NV_ENC_ERR_INVALID_PARAM;
    }
    else
    {
    PRINTERR("invalid parameter %s\n", argv[i++]);
    return NV_ENC_ERR_INVALID_PARAM;
    }
    }

    return NV_ENC_SUCCESS;
    }
    206 changes: 206 additions & 0 deletions NvHWEncoder.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,206 @@
    /*
    * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
    *
    * Please refer to the NVIDIA end user license agreement (EULA) associated
    * with this source code for terms and conditions that govern your use of
    * this software. Any use, reproduction, disclosure, or distribution of
    * this software and related documentation outside the terms of the EULA
    * is strictly prohibited.
    *
    */

    #include <stdlib.h>
    #include <stdio.h>
    #include <assert.h>

    #include "dynlink_cuda_cuda.h" // <cuda.h>

    #include "nvEncodeAPI.h"
    #include "nvUtils.h"

    // Stamps the `version` member of `configStruct` with the constant `<type>_VER`
    // (the `type` token is pasted in front of `_VER`). Expands to a braced
    // statement, so it can only be used where a statement is allowed.
    #define SET_VER(configStruct, type) {configStruct.version = type##_VER;}

    // Platform glue: pulls in the platform headers and defines NVENCAPI, the
    // calling convention used by the MYPROC entry-point typedef at the end of
    // this header. NVENCAPI stays undefined if neither NV_WINDOWS nor NV_UNIX
    // is set.
    #if defined (NV_WINDOWS)
    #include "d3d9.h"
    #define NVENCAPI __stdcall
    // Disables MSVC warning C4996.
    #pragma warning(disable : 4996)
    #elif defined (NV_UNIX)
    #include <dlfcn.h>
    #include <string.h>
    #define NVENCAPI
    #endif

    // Default quantizer factor/offset values for I and B frames. Each value is
    // wrapped in parentheses so that it expands as a single operand.
    #define DEFAULT_I_QFACTOR (-0.8f)
    #define DEFAULT_B_QFACTOR (1.25f)
    #define DEFAULT_I_QOFFSET (0.0f)
    #define DEFAULT_B_QOFFSET (1.25f)

    // Encoder settings filled in by CNvHWEncoder::ParseArguments() and consumed
    // by CNvHWEncoder::CreateEncoder(). The command-line switch named next to a
    // field is the one ParseArguments() uses to set it. String members set from
    // the command line point directly at argv entries (no copy is made).
    typedef struct _EncodeConfig
    {
    int width;
    int height;
    int maxWidth;
    int maxHeight;
    int fps;
    int bitrate;
    int vbvMaxBitrate;
    int vbvSize;
    int rcMode;
    int qp;
    float i_quant_factor;
    float b_quant_factor; // -b_qfactor
    float i_quant_offset; // -i_qoffset
    float b_quant_offset; // -b_qoffset
    GUID presetGUID;
    int codec; // -codec; presumably NV_ENC_H264 / NV_ENC_HEVC - TODO confirm
    int invalidateRefFramesEnableFlag;
    int intraRefreshEnableFlag; // -intraRefresh
    int intraRefreshPeriod; // -intraRefreshPeriod
    int intraRefreshDuration; // -intraRefreshDuration
    int deviceType; // -devicetype
    int startFrameIdx;
    int endFrameIdx;
    int gopLength;
    int numB;
    int pictureStruct; // -picStruct
    int deviceID; // -deviceID
    int isYuv444; // -yuv444
    char *qpDeltaMapFile; // -qpDeltaMapFile (points into argv)
    char* inputFileName;
    char* outputFileName;
    char* encoderPreset; // -preset (points into argv)
    char* inputFilePath;
    char *encCmdFileName; // -encCmdFile (points into argv)
    int enableMEOnly; // -meonly; ParseArguments() accepts only 1 or 2
    int preloadedFrameCount; // -preloadedFrameCount; ParseArguments() rejects values <= 1
    }EncodeConfig;

    // Describes one encoder input surface.
    typedef struct _EncodeInputBuffer
    {
    unsigned int dwWidth;
    unsigned int dwHeight; // used with the locked pitch to locate the chroma plane after the luma plane
    CUdeviceptr pNV12devPtr; // presumably CUDA device memory holding an NV12 frame - TODO confirm
    uint32_t uNV12Stride;
    CUdeviceptr pNV12TempdevPtr;
    uint32_t uNV12TempStride;
    NV_ENC_INPUT_PTR hInputSurface; // handle passed to NvEncLockInputBuffer()
    NV_ENC_BUFFER_FORMAT bufferFmt;
    }EncodeInputBuffer;

    // Describes one bitstream (output) buffer.
    typedef struct _EncodeOutputBuffer
    {
    unsigned int dwBitstreamBufferSize;
    NV_ENC_OUTPUT_PTR hBitstreamBuffer;
    bool bEOSFlag; // presumably marks the end-of-stream buffer - TODO confirm against the encode loop
    }EncodeOutputBuffer;

    // Pairs one input surface with one output bitstream buffer; this is the unit
    // handed to NvEncEncodeFrame() and ProcessOutput().
    typedef struct _EncodeBuffer
    {
    EncodeOutputBuffer stOutputBfr;
    EncodeInputBuffer stInputBfr;
    }EncodeBuffer;

    // Per-picture command block. CNvHWEncoder takes it in NvEncEncodeFrame(),
    // NvEncReconfigureEncoder() and NvEncInvalidateRefFrames(). The bool members
    // are request flags; the remaining members carry the values for them.
    typedef struct _NvEncPictureCommand
    {
    bool bResolutionChangePending;
    bool bBitrateChangePending;
    bool bForceIDR;
    bool bForceIntraRefresh;
    bool bInvalidateRefFrames;

    // presumably used when bResolutionChangePending is set - TODO confirm
    uint32_t newWidth;
    uint32_t newHeight;

    // presumably used when bBitrateChangePending is set - TODO confirm
    uint32_t newBitrate;
    uint32_t newVBVSize;

    uint32_t intraRefreshDuration;

    // refFrameNumbers has room for 16 entries; numRefFramesToInvalidate is
    // presumably the count of valid entries - TODO confirm
    uint32_t numRefFramesToInvalidate;
    uint32_t refFrameNumbers[16];
    }NvEncPictureCommand;

    // Codec selectors (unnamed enum, so the names are plain int constants).
    // Presumably the values accepted for EncodeConfig::codec - TODO confirm.
    enum
    {
    NV_ENC_H264 = 0,
    NV_ENC_HEVC = 1,
    };

    // Parameters for CNvHWEncoder::NvRunMotionEstimationOnly().
    struct MEOnlyConfig
    {
    unsigned char *yuv[2][3]; // presumably [frame][plane] for the two frames being compared - TODO confirm
    unsigned int stride[3]; // one entry per plane, by the array size; presumably in bytes - TODO confirm
    unsigned int width;
    unsigned int height;
    unsigned int inputFrameIndex;
    unsigned int referenceFrameIndex;
    };

    // Wrapper around one NVENC encode session. The NvEnc* members mirror entries
    // of the NV_ENCODE_API_FUNCTION_LIST held in m_pEncodeAPI (presumably thin
    // forwarders - the definitions are not in this header); the remaining public
    // members are the higher-level helpers used by the sample application.
    class CNvHWEncoder
    {
    public:
    // Publicly visible encoder state.
    uint32_t m_EncodeIdx;
    uint32_t m_uMaxWidth;
    uint32_t m_uMaxHeight;
    uint32_t m_uCurWidth;
    uint32_t m_uCurHeight;

    protected:
    bool m_bEncoderInitialized;
    GUID codecGUID;

    NV_ENCODE_API_FUNCTION_LIST* m_pEncodeAPI; // NVENC function table (see MYPROC)
    HINSTANCE m_hinstLib; // handle of the loaded NVENC library
    void *m_hEncoder; // opaque encoder session handle
    NV_ENC_INITIALIZE_PARAMS m_stCreateEncodeParams;
    NV_ENC_CONFIG m_stEncodeConfig;

    public:
    // Session management and capability queries.
    NVENCSTATUS NvEncOpenEncodeSession(void* device, uint32_t deviceType);
    NVENCSTATUS NvEncGetEncodeGUIDCount(uint32_t* encodeGUIDCount);
    NVENCSTATUS NvEncGetEncodeProfileGUIDCount(GUID encodeGUID, uint32_t* encodeProfileGUIDCount);
    NVENCSTATUS NvEncGetEncodeProfileGUIDs(GUID encodeGUID, GUID* profileGUIDs, uint32_t guidArraySize, uint32_t* GUIDCount);
    NVENCSTATUS NvEncGetEncodeGUIDs(GUID* GUIDs, uint32_t guidArraySize, uint32_t* GUIDCount);
    NVENCSTATUS NvEncGetInputFormatCount(GUID encodeGUID, uint32_t* inputFmtCount);
    NVENCSTATUS NvEncGetInputFormats(GUID encodeGUID, NV_ENC_BUFFER_FORMAT* inputFmts, uint32_t inputFmtArraySize, uint32_t* inputFmtCount);
    NVENCSTATUS NvEncGetEncodeCaps(GUID encodeGUID, NV_ENC_CAPS_PARAM* capsParam, int* capsVal);
    NVENCSTATUS NvEncGetEncodePresetCount(GUID encodeGUID, uint32_t* encodePresetGUIDCount);
    NVENCSTATUS NvEncGetEncodePresetGUIDs(GUID encodeGUID, GUID* presetGUIDs, uint32_t guidArraySize, uint32_t* encodePresetGUIDCount);
    NVENCSTATUS NvEncGetEncodePresetConfig(GUID encodeGUID, GUID presetGUID, NV_ENC_PRESET_CONFIG* presetConfig);
    // Buffer creation/destruction and locking.
    NVENCSTATUS NvEncCreateInputBuffer(uint32_t width, uint32_t height, void** inputBuffer, uint32_t isYuv444);
    NVENCSTATUS NvEncDestroyInputBuffer(NV_ENC_INPUT_PTR inputBuffer);
    NVENCSTATUS NvEncCreateBitstreamBuffer(uint32_t size, void** bitstreamBuffer);
    NVENCSTATUS NvEncDestroyBitstreamBuffer(NV_ENC_OUTPUT_PTR bitstreamBuffer);
    NVENCSTATUS NvEncCreateMVBuffer(uint32_t size, void** bitstreamBuffer);
    NVENCSTATUS NvEncDestroyMVBuffer(NV_ENC_OUTPUT_PTR bitstreamBuffer);
    NVENCSTATUS NvRunMotionEstimationOnly(EncodeBuffer *pEncodeBuffer[2], MEOnlyConfig *pMEOnly);
    NVENCSTATUS NvEncLockBitstream(NV_ENC_LOCK_BITSTREAM* lockBitstreamBufferParams);
    NVENCSTATUS NvEncUnlockBitstream(NV_ENC_OUTPUT_PTR bitstreamBuffer);
    // Outputs the mapped data pointer and the row pitch of the locked surface.
    NVENCSTATUS NvEncLockInputBuffer(void* inputBuffer, void** bufferDataPtr, uint32_t* pitch);
    NVENCSTATUS NvEncUnlockInputBuffer(NV_ENC_INPUT_PTR inputBuffer);
    NVENCSTATUS NvEncGetEncodeStats(NV_ENC_STAT* encodeStats);
    NVENCSTATUS NvEncGetSequenceParams(NV_ENC_SEQUENCE_PARAM_PAYLOAD* sequenceParamPayload);
    NVENCSTATUS NvEncDestroyEncoder();
    NVENCSTATUS NvEncInvalidateRefFrames(const NvEncPictureCommand *pEncPicCommand);
    NVENCSTATUS NvEncOpenEncodeSessionEx(void* device, NV_ENC_DEVICE_TYPE deviceType);
    NVENCSTATUS NvEncReconfigureEncoder(const NvEncPictureCommand *pEncPicCommand);
    NVENCSTATUS NvEncFlushEncoderQueue(void *hEOSEvent);

    // Higher-level helpers.
    CNvHWEncoder();
    virtual ~CNvHWEncoder();
    NVENCSTATUS Initialize(void* device, NV_ENC_DEVICE_TYPE deviceType);
    NVENCSTATUS Deinitialize();
    // ePicStruct defaults to frame (progressive) encoding; the QP delta map is
    // optional (NULL / size 0 by default).
    NVENCSTATUS NvEncEncodeFrame(EncodeBuffer *pEncodeBuffer, NvEncPictureCommand *encPicCommand,
    uint32_t width, uint32_t height,
    NV_ENC_PIC_STRUCT ePicStruct = NV_ENC_PIC_STRUCT_FRAME,
    int8_t *qpDeltaMapArray = NULL, uint32_t qpDeltaMapArraySize = 0);
    NVENCSTATUS CreateEncoder(const EncodeConfig *pEncCfg);
    GUID GetPresetGUID(char* encoderPreset, int codec);
    NVENCSTATUS ProcessOutput(const EncodeBuffer *pEncodeBuffer);
    NVENCSTATUS FlushEncoder();
    NVENCSTATUS ValidateEncodeGUID(GUID inputCodecGuid);
    NVENCSTATUS ValidatePresetGUID(GUID presetCodecGuid, GUID inputCodecGuid);
    // Parses command-line switches into *encodeConfig. Returns
    // NV_ENC_ERR_INVALID_PARAM for an unknown switch, a missing or malformed
    // value, and for "-help"; NV_ENC_SUCCESS otherwise.
    static NVENCSTATUS ParseArguments(EncodeConfig *encodeConfig, int argc, char *argv[]);
    };

    // Signature of the NVENC library entry point that fills in an
    // NV_ENCODE_API_FUNCTION_LIST; uses the platform calling convention NVENCAPI.
    typedef NVENCSTATUS (NVENCAPI *MYPROC)(NV_ENCODE_API_FUNCTION_LIST*);
    654 changes: 654 additions & 0 deletions dynlink_cuda.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,654 @@
    /*
    * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
    *
    * Please refer to the NVIDIA end user license agreement (EULA) associated
    * with this source code for terms and conditions that govern your use of
    * this software. Any use, reproduction, disclosure, or distribution of
    * this software and related documentation outside the terms of the EULA
    * is strictly prohibited.
    *
    */


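    // With INIT_CUDA_GL / INIT_CUDA_D3D9 / INIT_CUDA_D3D10 / INIT_CUDA_D3D11
    // defined to a non-zero value, this source file will also dynamically load
    // the corresponding interop functions. All of them are disabled by default.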
    #define __CUDA_API_VERSION 4000

    #include <stdio.h>
    #include <string.h>
    #include "../inc/dynlink_cuda_cuda.h"
    #if INIT_CUDA_GL
    #include "../inc/dynlink_cudaGL.h"
    #endif
    #if INIT_CUDA_D3D9
    #include "../inc/dynlink_cudaD3D9.h"
    #endif

    // Driver API entry points, resolved at run time by cuInit() in this file.
    // Every pointer is NULL until cuInit() has looked its symbol up in the driver
    // library, and stays NULL when the lookup fails or the symbol is not
    // requested for the detected version. The driver's own cuInit is stored as
    // _cuInit because this file defines a cuInit() wrapper with a different
    // signature.
    tcuInit *_cuInit;
    tcuDriverGetVersion *cuDriverGetVersion;
    tcuDeviceGet *cuDeviceGet;
    tcuDeviceGetCount *cuDeviceGetCount;
    tcuDeviceGetName *cuDeviceGetName;
    tcuDeviceComputeCapability *cuDeviceComputeCapability;
    tcuDeviceTotalMem *cuDeviceTotalMem;
    tcuDeviceGetProperties *cuDeviceGetProperties;
    tcuDeviceGetAttribute *cuDeviceGetAttribute;
    tcuCtxCreate *cuCtxCreate;
    tcuCtxDestroy *cuCtxDestroy;
    tcuCtxAttach *cuCtxAttach;
    tcuCtxDetach *cuCtxDetach;
    tcuCtxPushCurrent *cuCtxPushCurrent;
    tcuCtxPopCurrent *cuCtxPopCurrent;
    tcuCtxGetCurrent *cuCtxGetCurrent;
    tcuCtxSetCurrent *cuCtxSetCurrent;
    tcuCtxGetDevice *cuCtxGetDevice;
    tcuCtxSynchronize *cuCtxSynchronize;
    tcuModuleLoad *cuModuleLoad;
    tcuModuleLoadData *cuModuleLoadData;
    tcuModuleLoadDataEx *cuModuleLoadDataEx;
    tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
    tcuModuleUnload *cuModuleUnload;
    tcuModuleGetFunction *cuModuleGetFunction;
    tcuModuleGetGlobal *cuModuleGetGlobal;
    tcuModuleGetTexRef *cuModuleGetTexRef;
    tcuModuleGetSurfRef *cuModuleGetSurfRef;
    tcuMemGetInfo *cuMemGetInfo;
    tcuMemAlloc *cuMemAlloc;
    tcuMemAllocPitch *cuMemAllocPitch;
    tcuMemFree *cuMemFree;
    tcuMemGetAddressRange *cuMemGetAddressRange;
    tcuMemAllocHost *cuMemAllocHost;
    tcuMemFreeHost *cuMemFreeHost;
    tcuMemHostAlloc *cuMemHostAlloc;
    tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
    tcuMemHostRegister *cuMemHostRegister;
    tcuMemHostUnregister *cuMemHostUnregister;
    tcuMemcpyHtoD *cuMemcpyHtoD;
    tcuMemcpyDtoH *cuMemcpyDtoH;
    tcuMemcpyDtoD *cuMemcpyDtoD;
    tcuMemcpyDtoA *cuMemcpyDtoA;
    tcuMemcpyAtoD *cuMemcpyAtoD;
    tcuMemcpyHtoA *cuMemcpyHtoA;
    tcuMemcpyAtoH *cuMemcpyAtoH;
    tcuMemcpyAtoA *cuMemcpyAtoA;
    tcuMemcpy2D *cuMemcpy2D;
    tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
    tcuMemcpy3D *cuMemcpy3D;
    tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
    tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
    tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
    tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
    tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
    tcuMemcpy2DAsync *cuMemcpy2DAsync;
    tcuMemcpy3DAsync *cuMemcpy3DAsync;
    tcuMemcpy *cuMemcpy;
    tcuMemcpyPeer *cuMemcpyPeer;
    tcuMemsetD8 *cuMemsetD8;
    tcuMemsetD16 *cuMemsetD16;
    tcuMemsetD32 *cuMemsetD32;
    tcuMemsetD2D8 *cuMemsetD2D8;
    tcuMemsetD2D16 *cuMemsetD2D16;
    tcuMemsetD2D32 *cuMemsetD2D32;
    tcuFuncSetBlockShape *cuFuncSetBlockShape;
    tcuFuncSetSharedSize *cuFuncSetSharedSize;
    tcuFuncGetAttribute *cuFuncGetAttribute;
    tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
    tcuLaunchKernel *cuLaunchKernel;
    tcuArrayCreate *cuArrayCreate;
    tcuArrayGetDescriptor *cuArrayGetDescriptor;
    tcuArrayDestroy *cuArrayDestroy;
    tcuArray3DCreate *cuArray3DCreate;
    tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
    tcuTexRefCreate *cuTexRefCreate;
    tcuTexRefDestroy *cuTexRefDestroy;
    tcuTexRefSetArray *cuTexRefSetArray;
    tcuTexRefSetAddress *cuTexRefSetAddress;
    tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
    tcuTexRefSetFormat *cuTexRefSetFormat;
    tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
    tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
    tcuTexRefSetFlags *cuTexRefSetFlags;
    tcuTexRefGetAddress *cuTexRefGetAddress;
    tcuTexRefGetArray *cuTexRefGetArray;
    tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
    tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
    tcuTexRefGetFormat *cuTexRefGetFormat;
    tcuTexRefGetFlags *cuTexRefGetFlags;
    tcuSurfRefSetArray *cuSurfRefSetArray;
    tcuSurfRefGetArray *cuSurfRefGetArray;
    tcuParamSetSize *cuParamSetSize;
    tcuParamSeti *cuParamSeti;
    tcuParamSetf *cuParamSetf;
    tcuParamSetv *cuParamSetv;
    tcuParamSetTexRef *cuParamSetTexRef;
    tcuLaunch *cuLaunch;
    tcuLaunchGrid *cuLaunchGrid;
    tcuLaunchGridAsync *cuLaunchGridAsync;
    tcuEventCreate *cuEventCreate;
    tcuEventRecord *cuEventRecord;
    tcuEventQuery *cuEventQuery;
    tcuEventSynchronize *cuEventSynchronize;
    tcuEventDestroy *cuEventDestroy;
    tcuEventElapsedTime *cuEventElapsedTime;
    tcuStreamCreate *cuStreamCreate;
    tcuStreamQuery *cuStreamQuery;
    tcuStreamSynchronize *cuStreamSynchronize;
    tcuStreamDestroy *cuStreamDestroy;
    tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
    tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
    tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
    tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
    tcuGraphicsMapResources *cuGraphicsMapResources;
    tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
    tcuGetExportTable *cuGetExportTable;
    tcuCtxSetLimit *cuCtxSetLimit;
    tcuCtxGetLimit *cuCtxGetLimit;
    tcuMemHostGetFlags *cuMemHostGetFlags;

    #if INIT_CUDA_GL
    // GL/CUDA interop entry points, resolved at run time by cuInitGL().
    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    tcuWGLGetDevice *cuWGLGetDevice;
    #endif

    //#if __CUDA_API_VERSION >= 3020
    tcuGLCtxCreate *cuGLCtxCreate;
    tcuGLCtxCreate *cuGLCtxCreate_v2;
    tcuGLMapBufferObject *cuGLMapBufferObject;
    tcuGLMapBufferObject *cuGLMapBufferObject_v2;
    tcuGLMapBufferObjectAsync *cuGLMapBufferObjectAsync;
    //#endif

    tcuGLInit *cuGLInit; // deprecated in CUDA 3.0
    tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
    tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
    tcuGLSetBufferObjectMapFlags *cuGLSetBufferObjectMapFlags;
    tcuGLRegisterBufferObject *cuGLRegisterBufferObject;

    tcuGLUnmapBufferObject *cuGLUnmapBufferObject;
    tcuGLUnmapBufferObjectAsync *cuGLUnmapBufferObjectAsync;

    tcuGLUnregisterBufferObject *cuGLUnregisterBufferObject;
    // Defined exactly once and unconditionally: cuInitGL() assigns this pointer
    // regardless of __CUDA_API_VERSION, and a second definition guarded by
    // "__CUDA_API_VERSION >= 6050" would be a redefinition error in C++ as soon
    // as that guard became true.
    tcuGLGetDevices *cuGLGetDevices; // CUDA 6.5 only
    #endif

    #if INIT_CUDA_D3D9
    // D3D9/CUDA interop entry points; only compiled when INIT_CUDA_D3D9 is
    // non-zero. The pointers that cuInitD3D9() requests are filled in there; the
    // others declared here are never assigned in this file.

    // D3D9/CUDA interop (CUDA 1.x compatible API). These functions
    // are deprecated; please use the ones below
    tcuD3D9Begin *cuD3D9Begin;
    tcuD3D9End *cuD3D9End;

    // D3D9/CUDA interop (CUDA 2.x compatible)
    tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
    tcuD3D9RegisterResource *cuD3D9RegisterResource;
    tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
    tcuD3D9MapResources *cuD3D9MapResources;
    tcuD3D9UnmapResources *cuD3D9UnmapResources;
    tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
    tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
    tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
    tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
    tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
    tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;

    // D3D9/CUDA interop (CUDA 2.0+)
    tcuD3D9GetDevice *cuD3D9GetDevice;
    // NOTE(review): declared with the tcuD3D9GetDevice type although the name is
    // plural - confirm whether a dedicated tcuD3D9GetDevices type was intended.
    tcuD3D9GetDevice *cuD3D9GetDevices;
    tcuD3D9GetDevice *cuD3D9GetDevice_v2;
    tcuD3D9CtxCreate *cuD3D9CtxCreate;
    tcuD3D9CtxCreate *cuD3D9CtxCreate_v2;
    tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
    tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource_v2;
    #endif

    // Turns its argument into a string literal without macro-expanding it first.
    // Used by GET_PROC_EX_V2 to build the "<name>_v2" symbol name.
    #define STRINGIFY(X) #X

    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    #include <Windows.h>

    // The library name is always a narrow string and is loaded with LoadLibraryA,
    // independent of the UNICODE setting: the name is also printed through
    // printf("%s") below and in the GET_PROC_EX* macros, which requires a char
    // string (a wide string would be printed incorrectly).
    static LPCSTR __CudaLibName = "nvcuda.dll";

    typedef HMODULE CUDADRIVER;

    // Loads the CUDA driver DLL and stores its module handle in *pInstance.
    // Returns CUDA_SUCCESS on success; on failure prints a diagnostic, leaves
    // *pInstance NULL and returns CUDA_ERROR_UNKNOWN.
    static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    {
        *pInstance = LoadLibraryA(__CudaLibName);

        if (*pInstance == NULL)
        {
            printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
            return CUDA_ERROR_UNKNOWN;
        }

        return CUDA_SUCCESS;
    }

    // Looks up the driver symbol `name` in CudaDrvLib (which must be in scope at
    // the point of use) and stores it in the function pointer `alias`.
    // If the symbol is missing and `required` is non-zero, a message is printed;
    // execution then continues with `alias` left NULL - no error is returned.
    #define GET_PROC_EX(name, alias, required) \
    alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
    if (alias == NULL && required) { \
    printf("Failed to find required function \"%s\" in %s\n", \
    #name, __CudaLibName); \
    }

    // Same as GET_PROC_EX, but looks up the symbol "<name>_v2" while still
    // casting to t<name> and storing into `alias`.
    #define GET_PROC_EX_V2(name, alias, required) \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
    if (alias == NULL && required) { \
    printf("Failed to find required function \"%s\" in %s\n", \
    STRINGIFY(name##_v2), __CudaLibName); \
    }

    #elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX)

    #include <dlfcn.h>

    // Name of the CUDA driver library handed to dlopen(): a fixed absolute path
    // on macOS, a bare file name (resolved by the dynamic loader's search path)
    // elsewhere.
    #if defined(__APPLE__) || defined(__MACOSX)
    static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
    #else
    static char __CudaLibName[] = "libcuda.so";
    #endif

    // Handle type of the loaded driver library (the value returned by dlopen()).
    typedef void *CUDADRIVER;

    // Loads the CUDA driver shared library and stores its handle in *pInstance.
    // Returns CUDA_SUCCESS on success; on failure prints a diagnostic (including
    // the loader's own error text when available), leaves *pInstance NULL and
    // returns CUDA_ERROR_UNKNOWN.
    static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    {
        *pInstance = dlopen(__CudaLibName, RTLD_NOW);

    #if !defined(__APPLE__) && !defined(__MACOSX)
        if (*pInstance == NULL)
        {
            // Some installations expose only the versioned driver library name
            // and not the unversioned "libcuda.so" link, so fall back to it.
            *pInstance = dlopen("libcuda.so.1", RTLD_NOW);
        }
    #endif

        if (*pInstance == NULL)
        {
            // dlerror() describes the most recent failure; it may return NULL.
            const char *reason = dlerror();

            printf("dlopen \"%s\" failed!\n", __CudaLibName);
            if (reason != NULL)
            {
                printf("%s\n", reason);
            }
            return CUDA_ERROR_UNKNOWN;
        }

        return CUDA_SUCCESS;
    }

    // Looks up the driver symbol `name` in CudaDrvLib (which must be in scope at
    // the point of use) with dlsym() and stores it in the function pointer `alias`.
    // If the symbol is missing and `required` is non-zero, a message is printed;
    // execution then continues with `alias` left NULL - no error is returned.
    #define GET_PROC_EX(name, alias, required) \
    alias = (t##name *)dlsym(CudaDrvLib, #name); \
    if (alias == NULL && required) { \
    printf("Failed to find required function \"%s\" in %s\n", \
    #name, __CudaLibName); \
    }

    // Same as GET_PROC_EX, but looks up the symbol "<name>_v2" while still
    // casting to t<name> and storing into `alias`.
    #define GET_PROC_EX_V2(name, alias, required) \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
    if (alias == NULL && required) { \
    printf("Failed to find required function \"%s\" in %s\n", \
    STRINGIFY(name##_v2), __CudaLibName); \
    }

    #else
    #error unsupported platform
    #endif

    // Evaluates `call` once and, if it did not yield CUDA_SUCCESS, returns that
    // CUresult from the enclosing function. Only usable inside functions that
    // return CUresult.
    #define CHECKED_CALL(call) \
    do { \
    CUresult result = (call); \
    if (CUDA_SUCCESS != result) { \
    return result; \
    } \
    } while(0)

    // Convenience wrappers: the function pointer has the same name as the symbol.
    // REQUIRED prints a message when the symbol is missing, OPTIONAL stays silent;
    // GET_PROC_V2 fetches the "<name>_v2" symbol into the pointer called <name>.
    #define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
    #define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
    #define GET_PROC(name) GET_PROC_REQUIRED(name)
    #define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)

    #if INIT_CUDA_GL
    // Resolves the GL/CUDA interop entry points from the already loaded driver
    // library CudaDrvLib, choosing the symbol set by cudaVersion.
    // Flags is not used. Always returns CUDA_SUCCESS: a missing symbol is only
    // reported by the GET_PROC macros and leaves the matching pointer NULL.
    inline CUresult CUDAAPI cuInitGL(unsigned int Flags, int cudaVersion, CUDADRIVER &CudaDrvLib)
    {
    if (cudaVersion >= 2010)
    {
    GET_PROC(cuGLCtxCreate);
    GET_PROC(cuGraphicsGLRegisterBuffer);
    GET_PROC(cuGraphicsGLRegisterImage);
    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    GET_PROC(cuWGLGetDevice);
    #endif
    }
    // NOTE(review): these two symbols were already requested by the >= 2010
    // branch above, so this branch repeats the same lookups.
    if (cudaVersion >= 2030)
    {
    GET_PROC(cuGraphicsGLRegisterBuffer);
    GET_PROC(cuGraphicsGLRegisterImage);
    }
    if (cudaVersion >= 3000)
    {
    GET_PROC(cuGLGetDevices);
    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    GET_PROC(cuWGLGetDevice);
    #endif
    // GET_PROC_V2 stores the "_v2" symbol in the un-suffixed pointer, replacing
    // the value fetched by the >= 2010 branch.
    GET_PROC_V2(cuGLCtxCreate);

    GET_PROC_V2(cuGLMapBufferObject);
    GET_PROC(cuGLUnmapBufferObject);
    GET_PROC(cuGLMapBufferObjectAsync);
    GET_PROC(cuGLUnmapBufferObjectAsync);
    GET_PROC(cuGLRegisterBufferObject);
    GET_PROC(cuGLUnregisterBufferObject);
    GET_PROC(cuGLSetBufferObjectMapFlags);
    }

    return CUDA_SUCCESS;
    }
    #endif

    // Guarded with "#if" (not "#ifdef") so that it agrees with the guards around
    // the D3D9 function pointers, the D3D9 header include and the call site in
    // cuInit(): defining INIT_CUDA_D3D9 as 0 must disable all of them together.
    #if INIT_CUDA_D3D9
    // Resolves the D3D9/CUDA interop entry points from the already loaded driver
    // library CudaDrvLib. Flags and cudaVersion are not used. Always returns
    // CUDA_SUCCESS: a missing symbol is only reported by the GET_PROC macros and
    // leaves the matching pointer NULL.
    inline CUresult CUDAAPI cuInitD3D9(unsigned int Flags, int cudaVersion, CUDADRIVER &CudaDrvLib)
    {
        // D3D9/CUDA (CUDA 1.x compatible API)
        GET_PROC(cuD3D9Begin);
        GET_PROC(cuD3D9End);

        // D3D9/CUDA (CUDA 2.x compatible API)
        GET_PROC(cuD3D9GetDirect3DDevice);
        GET_PROC(cuD3D9RegisterResource);
        GET_PROC(cuD3D9UnregisterResource);
        GET_PROC(cuD3D9MapResources);
        GET_PROC(cuD3D9UnmapResources);
        GET_PROC(cuD3D9ResourceSetMapFlags);

        // D3D9/CUDA (CUDA 2.0+ compatible API)
        GET_PROC(cuD3D9GetDevice);
        GET_PROC(cuGraphicsD3D9RegisterResource);

        // The "_v2" symbols are stored in the un-suffixed pointers.
        GET_PROC_V2(cuD3D9CtxCreate);
        GET_PROC_V2(cuD3D9ResourceGetSurfaceDimensions);
        GET_PROC_V2(cuD3D9ResourceGetMappedPointer);
        GET_PROC_V2(cuD3D9ResourceGetMappedSize);
        GET_PROC_V2(cuD3D9ResourceGetMappedPitch);
        // GET_PROC_V2(cuD3D9ResourceGetMappedArray);

        return CUDA_SUCCESS;
    }
    #endif

    // Guarded with "#if" (not "#ifdef") so that it agrees with the guard around
    // the call site in cuInit(): defining INIT_CUDA_D3D10 as 0 must disable both.
    #if INIT_CUDA_D3D10
    // Resolves the D3D10/CUDA interop entry points from the already loaded driver
    // library CudaDrvLib when cudaVersion is at least 2030. Flags is not used.
    // Always returns CUDA_SUCCESS: a missing symbol is only reported by the
    // GET_PROC macro and leaves the matching pointer NULL.
    inline CUresult CUDAAPI cuInitD3D10(unsigned int Flags, int cudaVersion, CUDADRIVER &CudaDrvLib)
    {
        if (cudaVersion >= 2030)
        {
            GET_PROC(cuD3D10GetDevice);
            GET_PROC(cuD3D10CtxCreate);
            GET_PROC(cuGraphicsD3D10RegisterResource);
        }
        return CUDA_SUCCESS;
    }
    #endif

    // Guarded with "#if" (not "#ifdef") so that it agrees with the guard around
    // the call site in cuInit(): defining INIT_CUDA_D3D11 as 0 must disable both.
    #if INIT_CUDA_D3D11
    // Resolves the D3D11/CUDA interop entry points from the already loaded driver
    // library CudaDrvLib when cudaVersion is at least 3000. Flags is not used.
    // Always returns CUDA_SUCCESS: a missing symbol is only reported by the
    // GET_PROC macro and leaves the matching pointer NULL.
    inline CUresult CUDAAPI cuInitD3D11(unsigned int Flags, int cudaVersion, CUDADRIVER &CudaDrvLib)
    {
        if (cudaVersion >= 3000)
        {
            GET_PROC(cuD3D11GetDevice);
            GET_PROC(cuD3D11CtxCreate);
            GET_PROC(cuGraphicsD3D11RegisterResource);
        }

        return CUDA_SUCCESS;
    }
    #endif


    // Loads the CUDA driver library, initializes the driver and resolves the
    // driver API function pointers declared at the top of this file.
    //
    // Flags          passed through to the driver's own cuInit.
    // cudaVersion    API version the caller targets; selects between the plain
    //                and the "_v2" symbol of the entry points that have both.
    // pHandleDriver  optional; when not NULL it receives a copy of the library
    //                handle (sizeof(CUDADRIVER) bytes are written).
    //
    // Returns CUDA_SUCCESS on success, the error of the failing step when the
    // library cannot be loaded or the driver's cuInit / cuDriverGetVersion fails,
    // CUDA_ERROR_UNKNOWN when the library does not export cuInit, and
    // CUDA_ERROR_INVALID_DEVICE when an enabled interop initializer fails.
    // Any other missing symbol is only reported on stdout and leaves its
    // function pointer NULL.
    CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion, void *pHandleDriver)
    {
        CUDADRIVER CudaDrvLib;
        int driverVer = 1000;

        CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
        if (pHandleDriver != NULL)
        {
            memcpy(pHandleDriver, &CudaDrvLib, sizeof(CUDADRIVER));
        }

        // cuInit is required; alias it to _cuInit
        GET_PROC_EX(cuInit, _cuInit, 1);
        if (_cuInit == NULL)
        {
            // GET_PROC_EX only prints a message; without this check the call
            // below would go through a NULL function pointer.
            return CUDA_ERROR_UNKNOWN;
        }
        CHECKED_CALL(_cuInit(Flags));

        // available since 2.2. if not present, version 1.0 is assumed
        GET_PROC_OPTIONAL(cuDriverGetVersion);

        if (cuDriverGetVersion)
        {
            CHECKED_CALL(cuDriverGetVersion(&driverVer));
        }

        // fetch all function pointers
        GET_PROC(cuDeviceGet);
        GET_PROC(cuDeviceGetCount);
        GET_PROC(cuDeviceGetName);
        GET_PROC(cuDeviceComputeCapability);
        GET_PROC(cuDeviceGetProperties);
        GET_PROC(cuDeviceGetAttribute);
        GET_PROC(cuCtxDestroy);
        GET_PROC(cuCtxAttach);
        GET_PROC(cuCtxDetach);
        GET_PROC(cuCtxPushCurrent);
        GET_PROC(cuCtxPopCurrent);
        GET_PROC(cuCtxGetDevice);
        GET_PROC(cuCtxSynchronize);
        GET_PROC(cuModuleLoad);
        GET_PROC(cuModuleLoadData);
        GET_PROC(cuModuleUnload);
        GET_PROC(cuModuleGetFunction);
        GET_PROC(cuModuleGetTexRef);
        GET_PROC(cuMemFreeHost);
        GET_PROC(cuMemHostAlloc);
        GET_PROC(cuFuncSetBlockShape);
        GET_PROC(cuFuncSetSharedSize);
        GET_PROC(cuFuncGetAttribute);
        GET_PROC(cuArrayDestroy);
        GET_PROC(cuTexRefCreate);
        GET_PROC(cuTexRefDestroy);
        GET_PROC(cuTexRefSetArray);
        GET_PROC(cuTexRefSetFormat);
        GET_PROC(cuTexRefSetAddressMode);
        GET_PROC(cuTexRefSetFilterMode);
        GET_PROC(cuTexRefSetFlags);
        GET_PROC(cuTexRefGetArray);
        GET_PROC(cuTexRefGetAddressMode);
        GET_PROC(cuTexRefGetFilterMode);
        GET_PROC(cuTexRefGetFormat);
        GET_PROC(cuTexRefGetFlags);
        GET_PROC(cuParamSetSize);
        GET_PROC(cuParamSeti);
        GET_PROC(cuParamSetf);
        GET_PROC(cuParamSetv);
        GET_PROC(cuParamSetTexRef);
        GET_PROC(cuLaunch);
        GET_PROC(cuLaunchGrid);
        GET_PROC(cuLaunchGridAsync);
        GET_PROC(cuEventCreate);
        GET_PROC(cuEventRecord);
        GET_PROC(cuEventQuery);
        GET_PROC(cuEventSynchronize);
        GET_PROC(cuEventDestroy);
        GET_PROC(cuEventElapsedTime);
        GET_PROC(cuStreamCreate);
        GET_PROC(cuStreamQuery);
        GET_PROC(cuStreamSynchronize);
        GET_PROC(cuStreamDestroy);

        // These could be _v2 interfaces; when requested, the "_v2" symbol
        // replaces the pointer fetched above.
        if (cudaVersion >= 4000)
        {
            GET_PROC_V2(cuCtxDestroy);
            GET_PROC_V2(cuCtxPopCurrent);
            GET_PROC_V2(cuCtxPushCurrent);
            GET_PROC_V2(cuStreamDestroy);
            GET_PROC_V2(cuEventDestroy);
        }

        if (cudaVersion >= 3020)
        {
            GET_PROC_V2(cuDeviceTotalMem);
            GET_PROC_V2(cuCtxCreate);
            GET_PROC_V2(cuModuleGetGlobal);
            GET_PROC_V2(cuMemGetInfo);
            GET_PROC_V2(cuMemAlloc);
            GET_PROC_V2(cuMemAllocPitch);
            GET_PROC_V2(cuMemFree);
            GET_PROC_V2(cuMemGetAddressRange);
            GET_PROC_V2(cuMemAllocHost);
            GET_PROC_V2(cuMemHostGetDevicePointer);
            GET_PROC_V2(cuMemcpyHtoD);
            GET_PROC_V2(cuMemcpyDtoH);
            GET_PROC_V2(cuMemcpyDtoD);
            GET_PROC_V2(cuMemcpyDtoA);
            GET_PROC_V2(cuMemcpyAtoD);
            GET_PROC_V2(cuMemcpyHtoA);
            GET_PROC_V2(cuMemcpyAtoH);
            GET_PROC_V2(cuMemcpyAtoA);
            GET_PROC_V2(cuMemcpy2D);
            GET_PROC_V2(cuMemcpy2DUnaligned);
            GET_PROC_V2(cuMemcpy3D);
            GET_PROC_V2(cuMemcpyHtoDAsync);
            GET_PROC_V2(cuMemcpyDtoHAsync);
            GET_PROC_V2(cuMemcpyHtoAAsync);
            GET_PROC_V2(cuMemcpyAtoHAsync);
            GET_PROC_V2(cuMemcpy2DAsync);
            GET_PROC_V2(cuMemcpy3DAsync);
            GET_PROC_V2(cuMemsetD8);
            GET_PROC_V2(cuMemsetD16);
            GET_PROC_V2(cuMemsetD32);
            GET_PROC_V2(cuMemsetD2D8);
            GET_PROC_V2(cuMemsetD2D16);
            GET_PROC_V2(cuMemsetD2D32);
            GET_PROC_V2(cuArrayCreate);
            GET_PROC_V2(cuArrayGetDescriptor);
            GET_PROC_V2(cuArray3DCreate);
            GET_PROC_V2(cuArray3DGetDescriptor);
            GET_PROC_V2(cuTexRefSetAddress);
            GET_PROC_V2(cuTexRefSetAddress2D);
            GET_PROC_V2(cuTexRefGetAddress);
        }
        else
        {
            GET_PROC(cuDeviceTotalMem);
            GET_PROC(cuCtxCreate);
            GET_PROC(cuModuleGetGlobal);
            GET_PROC(cuMemGetInfo);
            GET_PROC(cuMemAlloc);
            GET_PROC(cuMemAllocPitch);
            GET_PROC(cuMemFree);
            GET_PROC(cuMemGetAddressRange);
            GET_PROC(cuMemAllocHost);
            GET_PROC(cuMemHostGetDevicePointer);
            GET_PROC(cuMemcpyHtoD);
            GET_PROC(cuMemcpyDtoH);
            GET_PROC(cuMemcpyDtoD);
            GET_PROC(cuMemcpyDtoA);
            GET_PROC(cuMemcpyAtoD);
            GET_PROC(cuMemcpyHtoA);
            GET_PROC(cuMemcpyAtoH);
            GET_PROC(cuMemcpyAtoA);
            GET_PROC(cuMemcpy2D);
            GET_PROC(cuMemcpy2DUnaligned);
            GET_PROC(cuMemcpy3D);
            GET_PROC(cuMemcpyHtoDAsync);
            GET_PROC(cuMemcpyDtoHAsync);
            GET_PROC(cuMemcpyHtoAAsync);
            GET_PROC(cuMemcpyAtoHAsync);
            GET_PROC(cuMemcpy2DAsync);
            GET_PROC(cuMemcpy3DAsync);
            GET_PROC(cuMemsetD8);
            GET_PROC(cuMemsetD16);
            GET_PROC(cuMemsetD32);
            GET_PROC(cuMemsetD2D8);
            GET_PROC(cuMemsetD2D16);
            GET_PROC(cuMemsetD2D32);
            GET_PROC(cuArrayCreate);
            GET_PROC(cuArrayGetDescriptor);
            GET_PROC(cuArray3DCreate);
            GET_PROC(cuArray3DGetDescriptor);
            GET_PROC(cuTexRefSetAddress);
            GET_PROC(cuTexRefSetAddress2D);
            GET_PROC(cuTexRefGetAddress);
        }

        // The following functions are specific to CUDA versions; they are
        // selected by the version the installed driver reports.
        if (driverVer >= 2010)
        {
            GET_PROC(cuModuleLoadDataEx);
            GET_PROC(cuModuleLoadFatBinary);
        }

        if (driverVer >= 2030)
        {
            GET_PROC(cuMemHostGetFlags);
        }

        if (driverVer >= 3000)
        {
            GET_PROC(cuMemcpyDtoDAsync);
            GET_PROC(cuFuncSetCacheConfig);

            GET_PROC(cuGraphicsUnregisterResource);
            GET_PROC(cuGraphicsSubResourceGetMappedArray);

    #if (__CUDA_API_VERSION >= 3020)
            if (cudaVersion >= 3020)
            {
                GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
            }
            else
            {
                GET_PROC(cuGraphicsResourceGetMappedPointer);
            }
    #endif
            GET_PROC(cuGraphicsResourceSetMapFlags);
            GET_PROC(cuGraphicsMapResources);
            GET_PROC(cuGraphicsUnmapResources);
            GET_PROC(cuGetExportTable);
        }

        if (driverVer >= 3010)
        {
            GET_PROC(cuModuleGetSurfRef);
            GET_PROC(cuSurfRefSetArray);
            GET_PROC(cuSurfRefGetArray);
            GET_PROC(cuCtxSetLimit);
            GET_PROC(cuCtxGetLimit);
        }

        if (driverVer >= 4000)
        {
            GET_PROC(cuCtxSetCurrent);
            GET_PROC(cuCtxGetCurrent);
            GET_PROC(cuMemHostRegister);
            GET_PROC(cuMemHostUnregister);
            GET_PROC(cuMemcpy);
            GET_PROC(cuMemcpyPeer);
            GET_PROC(cuLaunchKernel);
        }

        // Optional interop initializers; each is compiled in only when its
        // INIT_CUDA_* flag is non-zero.
    #if INIT_CUDA_GL
        if (cuInitGL(0, __CUDA_API_VERSION, CudaDrvLib) != CUDA_SUCCESS)
            return CUDA_ERROR_INVALID_DEVICE;
    #endif

    #if INIT_CUDA_D3D9
        if (cuInitD3D9(0, __CUDA_API_VERSION, CudaDrvLib) != CUDA_SUCCESS)
            return CUDA_ERROR_INVALID_DEVICE;
    #endif

    #if INIT_CUDA_D3D10
        if (cuInitD3D10(0, __CUDA_API_VERSION, CudaDrvLib) != CUDA_SUCCESS)
            return CUDA_ERROR_INVALID_DEVICE;
    #endif

    #if INIT_CUDA_D3D11
        if (cuInitD3D11(0, __CUDA_API_VERSION, CudaDrvLib) != CUDA_SUCCESS)
            return CUDA_ERROR_INVALID_DEVICE;
    #endif

        return CUDA_SUCCESS;
    }

    1,685 changes: 1,685 additions & 0 deletions dynlink_cuda_cuda.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,1685 @@
    /*
    * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
    *
    * Please refer to the NVIDIA end user license agreement (EULA) associated
    * with this source code for terms and conditions that govern your use of
    * this software. Any use, reproduction, disclosure, or distribution of
    * this software and related documentation outside the terms of the EULA
    * is strictly prohibited.
    *
    */

    #ifndef __cuda_cuda_h__
    #define __cuda_cuda_h__

    #include <stdlib.h>

    #ifndef __CUDA_API_VERSION
    #define __CUDA_API_VERSION 4000
    #endif

    /**
    * \defgroup CUDA_DRIVER CUDA Driver API
    *
    * This section describes the low-level CUDA driver application programming
    * interface.
    *
    * @{
    */

    /**
    * \defgroup CUDA_TYPES Data types used by CUDA driver
    * @{
    */

    /**
    * CUDA API version number
    */
    #define CUDA_VERSION 4000 /* 4.0 */

    #ifdef __cplusplus
    extern "C" {
    #endif

    /**
    * CUDA device pointer
    *
    * An unsigned integer type: 64 bits wide when one of the 64-bit target
    * macros tested below is defined, otherwise \c unsigned \c int. This block
    * only provides the typedef when __CUDA_API_VERSION is at least 3020.
    */
    #if __CUDA_API_VERSION >= 3020

    #if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__aarch64__)
    typedef unsigned long long CUdeviceptr;
    #else
    typedef unsigned int CUdeviceptr;
    #endif

    #endif /* __CUDA_API_VERSION >= 3020 */

    /* Device handle: a plain integer rather than a pointer. */
    typedef int                           CUdevice;           /**< CUDA device */

    /* Object handles: each one is a pointer to a struct type that is only
    * named by the typedef itself, not defined alongside it. */
    typedef struct CUctx_st              *CUcontext;          /**< CUDA context */
    typedef struct CUmod_st              *CUmodule;           /**< CUDA module */
    typedef struct CUfunc_st             *CUfunction;         /**< CUDA function */
    typedef struct CUarray_st            *CUarray;            /**< CUDA array */
    typedef struct CUtexref_st           *CUtexref;           /**< CUDA texture reference */
    typedef struct CUsurfref_st          *CUsurfref;          /**< CUDA surface reference */
    typedef struct CUevent_st            *CUevent;            /**< CUDA event */
    typedef struct CUstream_st           *CUstream;           /**< CUDA stream */
    typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */

    /**
    * CUDA definition of UUID: sixteen raw bytes.
    */
    typedef struct CUuuid_st
    {
        char bytes[16]; /**< The 16 bytes of the identifier */
    } CUuuid;

    /**
    * Context creation flags
    *
    * The two mask values depend on __CUDA_API_VERSION, and CU_CTX_PRIMARY is
    * only declared from version 4000 on.
    */
    typedef enum CUctx_flags_enum
    {
        CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
        CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
        CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
        CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
        CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */
        CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
        CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
    #if __CUDA_API_VERSION >= 4000
        CU_CTX_SCHED_MASK          = 0x07,
        CU_CTX_PRIMARY             = 0x20, /**< Initialize and return the primary context */
        CU_CTX_FLAGS_MASK          = 0x3f
    #else
        CU_CTX_SCHED_MASK          = 0x03,
        CU_CTX_FLAGS_MASK          = 0x1f
    #endif
    } CUctx_flags;

    /**
    * Event creation flags
    */
    typedef enum CUevent_flags_enum
    {
        CU_EVENT_DEFAULT        = 0x0, /**< Default event flag */
        CU_EVENT_BLOCKING_SYNC  = 0x1, /**< Event uses blocking synchronization */
        CU_EVENT_DISABLE_TIMING = 0x2  /**< Event will not record timing data */
    } CUevent_flags;

    /**
    * Array formats
    */
    typedef enum CUarray_format_enum
    {
        /* unsigned integer element types */
        CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */

        /* signed integer element types */
        CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
        CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
        CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */

        /* floating-point element types */
        CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
        CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */
    } CUarray_format;

    /**
    * Texture reference addressing modes
    */
    typedef enum CUaddress_mode_enum
    {
        CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
        CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
        CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
        CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
    } CUaddress_mode;

    /**
    * Texture reference filtering modes
    */
    typedef enum CUfilter_mode_enum
    {
        CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
        CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
    } CUfilter_mode;

    /**
    * Device properties
    *
    * Attributes 36 and above are only declared when __CUDA_API_VERSION is
    * 4000 or later.
    */
    typedef enum CUdevice_attribute_enum
    {
        CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */
        CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */
        CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
        CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
        CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
        CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
        CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
        CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
        CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
        CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */
        CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
        CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */
        CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */
        CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently */
        CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */
        CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */
        CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */
        CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */
        CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Maximum texture array width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Maximum texture array height */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture array */
        CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */
        CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */
        CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */
        CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */
        CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */
        CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35 /**< Device is using TCC driver model */
    #if __CUDA_API_VERSION >= 4000
        /* The leading comma continues the list after the last unconditional entry. */
        , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
        CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */
        CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */
        CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */
        CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */
        CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */
        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43 /**< Maximum layers in a 1D layered texture */
    #endif
    } CUdevice_attribute;

    /**
    * Legacy device properties
    *
    * Every member is a plain int; the two dimension members are
    * three-element arrays.
    */
    typedef struct CUdevprop_st
    {
        int maxThreadsPerBlock;  /**< Maximum number of threads per block */
        int maxThreadsDim[3];    /**< Maximum size of each dimension of a block */
        int maxGridSize[3];      /**< Maximum size of each dimension of a grid */
        int sharedMemPerBlock;   /**< Shared memory available per block in bytes */
        int totalConstantMemory; /**< Constant memory available on device in bytes */
        int SIMDWidth;           /**< Warp size in threads */
        int memPitch;            /**< Maximum pitch in bytes allowed by memory copies */
        int regsPerBlock;        /**< 32-bit registers available per block */
        int clockRate;           /**< Clock frequency in kilohertz */
        int textureAlign;        /**< Alignment requirement for textures */
    } CUdevprop;

    /**
    * Function properties
    */
    typedef enum CUfunction_attribute_enum
    {
        /**
        * Largest number of threads per block the function can be launched
        * with; a launch above this count would fail. The value depends on
        * both the function and the device on which the function is
        * currently loaded.
        */
        CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,

        /**
        * Size in bytes of the statically-allocated shared memory this
        * function requires. Dynamically-allocated shared memory requested
        * by the user at runtime is not included.
        */
        CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,

        /**
        * Size in bytes of the user-allocated constant memory this function
        * requires.
        */
        CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,

        /**
        * Size in bytes of the local memory used by each thread of this
        * function.
        */
        CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,

        /**
        * Number of registers used by each thread of this function.
        */
        CU_FUNC_ATTRIBUTE_NUM_REGS = 4,

        /**
        * PTX virtual architecture version the function was compiled for,
        * encoded as major PTX version * 10 + minor PTX version (a PTX
        * version 1.3 function returns 13). May return the undefined value
        * of 0 for cubins compiled prior to CUDA 3.0.
        */
        CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,

        /**
        * Binary architecture version the function was compiled for, encoded
        * as major binary version * 10 + minor binary version (a binary
        * version 1.3 function returns 13). Legacy cubins that do not have a
        * properly-encoded binary architecture version return 10.
        */
        CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,

        /* Left without an initializer so it always follows the last attribute. */
        CU_FUNC_ATTRIBUTE_MAX
    } CUfunction_attribute;

    /**
    * Function cache configurations
    */
    typedef enum CUfunc_cache_enum
    {
        CU_FUNC_CACHE_PREFER_NONE   = 0x00, /**< no preference for shared memory or L1 (default) */
        CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */
        CU_FUNC_CACHE_PREFER_L1     = 0x02  /**< prefer larger L1 cache and smaller shared memory */
    } CUfunc_cache;

    /**
    * Memory types
    *
    * CU_MEMORYTYPE_UNIFIED is only declared when __CUDA_API_VERSION is 4000
    * or later.
    */
    typedef enum CUmemorytype_enum
    {
        CU_MEMORYTYPE_HOST   = 0x01, /**< Host memory */
        CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
        CU_MEMORYTYPE_ARRAY  = 0x03  /**< Array memory */
    #if __CUDA_API_VERSION >= 4000
        , CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
    #endif
    } CUmemorytype;

    /**
    * Compute Modes
    *
    * CU_COMPUTEMODE_EXCLUSIVE_PROCESS is only declared when
    * __CUDA_API_VERSION is 4000 or later.
    */
    typedef enum CUcomputemode_enum
    {
        CU_COMPUTEMODE_DEFAULT    = 0, /**< Default compute mode (Multiple contexts allowed per device) */
        CU_COMPUTEMODE_EXCLUSIVE  = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */
        CU_COMPUTEMODE_PROHIBITED = 2  /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
    #if __CUDA_API_VERSION >= 4000
        , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
    #endif
    } CUcomputemode;

    /**
    * Online compiler options
    */
    typedef enum CUjit_option_enum
    {
        /**
        * Max number of registers that a thread may use.\n
        * Option type: unsigned int
        */
        CU_JIT_MAX_REGISTERS = 0,

        /**
        * IN: Minimum number of threads per block to target compilation
        * for\n
        * OUT: Number of threads the compiler actually targeted.
        * This restricts the resource utilization of the compiler (e.g. max
        * registers) such that a block with the given number of threads
        * should be able to launch based on register limitations. Note, this
        * option does not currently take into account any other resource
        * limitations, such as shared memory utilization.\n
        * Option type: unsigned int
        */
        CU_JIT_THREADS_PER_BLOCK = 1,

        /**
        * Returns a float value in the option of the wall clock time, in
        * milliseconds, spent creating the cubin\n
        * Option type: float
        */
        CU_JIT_WALL_TIME = 2,

        /**
        * Pointer to a buffer in which to print any log messages from PTXAS
        * that are informational in nature (the buffer size is specified via
        * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
        * Option type: char*
        */
        CU_JIT_INFO_LOG_BUFFER = 3,

        /**
        * IN: Log buffer size in bytes. Log messages will be capped at this
        * size (including null terminator)\n
        * OUT: Amount of log buffer filled with messages\n
        * Option type: unsigned int
        */
        CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,

        /**
        * Pointer to a buffer in which to print any log messages from PTXAS
        * that reflect errors (the buffer size is specified via option
        * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
        * Option type: char*
        */
        CU_JIT_ERROR_LOG_BUFFER = 5,

        /**
        * IN: Log buffer size in bytes. Log messages will be capped at this
        * size (including null terminator)\n
        * OUT: Amount of log buffer filled with messages\n
        * Option type: unsigned int
        */
        CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,

        /**
        * Level of optimizations to apply to generated code (0 - 4), with 4
        * being the default and highest level of optimizations.\n
        * Option type: unsigned int
        */
        CU_JIT_OPTIMIZATION_LEVEL = 7,

        /**
        * No option value required. Determines the target based on the
        * current attached context (default)\n
        * Option type: No option value needed
        */
        CU_JIT_TARGET_FROM_CUCONTEXT = 8,

        /**
        * Target is chosen based on supplied ::CUjit_target_enum.\n
        * Option type: unsigned int for enumerated type ::CUjit_target_enum
        */
        CU_JIT_TARGET = 9,

        /**
        * Specifies choice of fallback strategy if matching cubin is not
        * found. Choice is based on supplied ::CUjit_fallback_enum.\n
        * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
        */
        CU_JIT_FALLBACK_STRATEGY = 10

    } CUjit_option;

    /**
    * Online compilation targets
    */
    typedef enum CUjit_target_enum
    {
        CU_TARGET_COMPUTE_10 = 0, /**< Compute device class 1.0 */
        CU_TARGET_COMPUTE_11 = 1, /**< Compute device class 1.1 */
        CU_TARGET_COMPUTE_12 = 2, /**< Compute device class 1.2 */
        CU_TARGET_COMPUTE_13 = 3, /**< Compute device class 1.3 */
        CU_TARGET_COMPUTE_20 = 4, /**< Compute device class 2.0 */
        CU_TARGET_COMPUTE_21 = 5  /**< Compute device class 2.1 */
    } CUjit_target;

    /**
    * Cubin matching fallback strategies
    */
    typedef enum CUjit_fallback_enum
    {
        CU_PREFER_PTX    = 0, /**< Prefer to compile ptx */
        CU_PREFER_BINARY = 1  /**< Prefer to fall back to compatible binary code */
    } CUjit_fallback;

    /**
    * Flags to register a graphics resource
    */
    typedef enum CUgraphicsRegisterFlags_enum
    {
        CU_GRAPHICS_REGISTER_FLAGS_NONE          = 0x00,
        CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY     = 0x01,
        CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
        CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST  = 0x04
    } CUgraphicsRegisterFlags;

    /**
    * Flags for mapping and unmapping interop resources
    */
    typedef enum CUgraphicsMapResourceFlags_enum
    {
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
    } CUgraphicsMapResourceFlags;

    /**
    * Array indices for cube faces
    */
    typedef enum CUarray_cubemap_face_enum
    {
        /* X axis */
        CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */
        CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */
        /* Y axis */
        CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */
        CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */
        /* Z axis */
        CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */
        CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05  /**< Negative Z face of cubemap */
    } CUarray_cubemap_face;

    /**
    * Limits
    */
    typedef enum CUlimit_enum
    {
        CU_LIMIT_STACK_SIZE       = 0x00, /**< GPU thread stack size */
        CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */
        CU_LIMIT_MALLOC_HEAP_SIZE = 0x02  /**< GPU malloc heap size */
    } CUlimit;

    /**
    * Error codes
    *
    * Result values reported by API calls. ::CUDA_SUCCESS is zero; every other
    * enumerator carries an explicit non-zero value.
    */
    typedef enum cudaError_enum
    {
    /**
    * The API call returned with no errors. In the case of query calls, this
    * can also mean that the operation being queried is complete (see
    * ::cuEventQuery() and ::cuStreamQuery()).
    */
    CUDA_SUCCESS = 0,

    /**
    * This indicates that one or more of the parameters passed to the API call
    * is not within an acceptable range of values.
    */
    CUDA_ERROR_INVALID_VALUE = 1,

    /**
    * The API call failed because it was unable to allocate enough memory to
    * perform the requested operation.
    */
    CUDA_ERROR_OUT_OF_MEMORY = 2,

    /**
    * This indicates that the CUDA driver has not been initialized with
    * ::cuInit() or that initialization has failed.
    */
    CUDA_ERROR_NOT_INITIALIZED = 3,

    /**
    * This indicates that the CUDA driver is in the process of shutting down.
    */
    CUDA_ERROR_DEINITIALIZED = 4,

    /**
    * This indicates profiling APIs are called while application is running
    * in visual profiler mode.
    */
    CUDA_ERROR_PROFILER_DISABLED = 5,
    /**
    * This indicates profiling has not been initialized for this context.
    * Call cuProfilerInitialize() to resolve this.
    */
    CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6,
    /**
    * This indicates profiler has already been started and probably
    * cuProfilerStart() is incorrectly called.
    */
    CUDA_ERROR_PROFILER_ALREADY_STARTED = 7,
    /**
    * This indicates profiler has already been stopped and probably
    * cuProfilerStop() is incorrectly called.
    */
    CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8,
    /**
    * This indicates that no CUDA-capable devices were detected by the installed
    * CUDA driver.
    */
    CUDA_ERROR_NO_DEVICE = 100,

    /**
    * This indicates that the device ordinal supplied by the user does not
    * correspond to a valid CUDA device.
    */
    CUDA_ERROR_INVALID_DEVICE = 101,


    /**
    * This indicates that the device kernel image is invalid. This can also
    * indicate an invalid CUDA module.
    */
    CUDA_ERROR_INVALID_IMAGE = 200,

    /**
    * This most frequently indicates that there is no context bound to the
    * current thread. This can also be returned if the context passed to an
    * API call is not a valid handle (such as a context that has had
    * ::cuCtxDestroy() invoked on it). This can also be returned if a user
    * mixes different API versions (i.e. 3010 context with 3020 API calls).
    * See ::cuCtxGetApiVersion() for more details.
    */
    CUDA_ERROR_INVALID_CONTEXT = 201,

    /**
    * This indicates that the context being supplied as a parameter to the
    * API call was already the active context.
    * \deprecated
    * This error return is deprecated as of CUDA 3.2. It is no longer an
    * error to attempt to push the active context via ::cuCtxPushCurrent().
    */
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,

    /**
    * This indicates that a map or register operation has failed.
    */
    CUDA_ERROR_MAP_FAILED = 205,

    /**
    * This indicates that an unmap or unregister operation has failed.
    */
    CUDA_ERROR_UNMAP_FAILED = 206,

    /**
    * This indicates that the specified array is currently mapped and thus
    * cannot be destroyed.
    */
    CUDA_ERROR_ARRAY_IS_MAPPED = 207,

    /**
    * This indicates that the resource is already mapped.
    */
    CUDA_ERROR_ALREADY_MAPPED = 208,

    /**
    * This indicates that there is no kernel image available that is suitable
    * for the device. This can occur when a user specifies code generation
    * options for a particular CUDA source file that do not include the
    * corresponding device configuration.
    */
    CUDA_ERROR_NO_BINARY_FOR_GPU = 209,

    /**
    * This indicates that a resource has already been acquired.
    */
    CUDA_ERROR_ALREADY_ACQUIRED = 210,

    /**
    * This indicates that a resource is not mapped.
    */
    CUDA_ERROR_NOT_MAPPED = 211,

    /**
    * This indicates that a mapped resource is not available for access as an
    * array.
    */
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212,

    /**
    * This indicates that a mapped resource is not available for access as a
    * pointer.
    */
    CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,

    /**
    * This indicates that an uncorrectable ECC error was detected during
    * execution.
    */
    CUDA_ERROR_ECC_UNCORRECTABLE = 214,

    /**
    * This indicates that the ::CUlimit passed to the API call is not
    * supported by the active device.
    */
    CUDA_ERROR_UNSUPPORTED_LIMIT = 215,

    /**
    * This indicates that the ::CUcontext passed to the API call can
    * only be bound to a single CPU thread at a time but is already
    * bound to a CPU thread.
    */
    CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216,

    /**
    * This indicates that the device kernel source is invalid.
    */
    CUDA_ERROR_INVALID_SOURCE = 300,

    /**
    * This indicates that the file specified was not found.
    */
    CUDA_ERROR_FILE_NOT_FOUND = 301,

    /**
    * This indicates that a link to a shared object failed to resolve.
    */
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,

    /**
    * This indicates that initialization of a shared object failed.
    */
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303,

    /**
    * This indicates that an OS call failed.
    */
    CUDA_ERROR_OPERATING_SYSTEM = 304,


    /**
    * This indicates that a resource handle passed to the API call was not
    * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
    */
    CUDA_ERROR_INVALID_HANDLE = 400,


    /**
    * This indicates that a named symbol was not found. Examples of symbols
    * are global/constant variable names, texture names, and surface names.
    */
    CUDA_ERROR_NOT_FOUND = 500,


    /**
    * This indicates that asynchronous operations issued previously have not
    * completed yet. This result is not actually an error, but must be indicated
    * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
    * may return this value include ::cuEventQuery() and ::cuStreamQuery().
    */
    CUDA_ERROR_NOT_READY = 600,


    /**
    * An exception occurred on the device while executing a kernel. Common
    * causes include dereferencing an invalid device pointer and accessing
    * out of bounds shared memory. The context cannot be used, so it must
    * be destroyed (and a new one should be created). All existing device
    * memory allocations from this context are invalid and must be
    * reconstructed if the program is to continue using CUDA.
    */
    CUDA_ERROR_LAUNCH_FAILED = 700,

    /**
    * This indicates that a launch did not occur because it did not have
    * appropriate resources. This error usually indicates that the user has
    * attempted to pass too many arguments to the device kernel, or the
    * kernel launch specifies too many threads for the kernel's register
    * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
    * when a 32-bit int is expected) is equivalent to passing too many
    * arguments and can also result in this error.
    */
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,

    /**
    * This indicates that the device kernel took too long to execute. This can
    * only occur if timeouts are enabled - see the device attribute
    * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
    * context cannot be used (and must be destroyed similar to
    * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
    * this context are invalid and must be reconstructed if the program is to
    * continue using CUDA.
    */
    CUDA_ERROR_LAUNCH_TIMEOUT = 702,

    /**
    * This error indicates a kernel launch that uses an incompatible texturing
    * mode.
    */
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,

    /**
    * This error indicates that a call to ::cuCtxEnablePeerAccess() is
    * trying to re-enable peer access to a context which has already
    * had peer access to it enabled.
    */
    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,

    /**
    * This error indicates that a call to ::cuMemPeerRegister is trying to
    * register memory from a context which has not had peer access
    * enabled yet via ::cuCtxEnablePeerAccess(), or that
    * ::cuCtxDisablePeerAccess() is trying to disable peer access
    * which has not been enabled yet.
    */
    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705,

    /**
    * This error indicates that a call to ::cuMemPeerRegister is trying to
    * register already-registered memory.
    */
    CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706,

    /**
    * This error indicates that a call to ::cuMemPeerUnregister is trying to
    * unregister memory that has not been registered.
    */
    CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED = 707,

    /**
    * This error indicates that ::cuCtxCreate was called with the flag
    * ::CU_CTX_PRIMARY on a device which already has initialized its
    * primary context.
    */
    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708,

    /**
    * This error indicates that the context current to the calling thread
    * has been destroyed using ::cuCtxDestroy, or is a primary context which
    * has not yet been initialized.
    */
    CUDA_ERROR_CONTEXT_IS_DESTROYED = 709,

    /**
    * This indicates that an unknown internal error has occurred.
    */
    CUDA_ERROR_UNKNOWN = 999
    } CUresult;

    #if __CUDA_API_VERSION >= 4000
    /* The memory flags below are only defined for API version 4000 and later.
    * Flags that belong to different calls reuse the same numeric values. */

    /**
    * If set, host memory is portable between CUDA contexts.
    * Flag for ::cuMemHostAlloc()
    */
    #define CU_MEMHOSTALLOC_PORTABLE 0x01

    /**
    * If set, host memory is mapped into CUDA address space and
    * ::cuMemHostGetDevicePointer() may be called on the host pointer.
    * Flag for ::cuMemHostAlloc()
    */
    #define CU_MEMHOSTALLOC_DEVICEMAP 0x02

    /**
    * If set, host memory is allocated as write-combined - fast to write,
    * faster to DMA, slow to read except via SSE4 streaming load instruction
    * (MOVNTDQA).
    * Flag for ::cuMemHostAlloc()
    */
    #define CU_MEMHOSTALLOC_WRITECOMBINED 0x04

    /**
    * If set, host memory is portable between CUDA contexts.
    * Flag for ::cuMemHostRegister()
    */
    #define CU_MEMHOSTREGISTER_PORTABLE 0x01

    /**
    * If set, host memory is mapped into CUDA address space and
    * ::cuMemHostGetDevicePointer() may be called on the host pointer.
    * Flag for ::cuMemHostRegister()
    */
    #define CU_MEMHOSTREGISTER_DEVICEMAP 0x02

    /**
    * If set, peer memory is mapped into CUDA address space and
    * ::cuMemPeerGetDevicePointer() may be called on the host pointer.
    * Flag for ::cuMemPeerRegister()
    */
    #define CU_MEMPEERREGISTER_DEVICEMAP 0x02
    #endif /* __CUDA_API_VERSION >= 4000 */

    #if __CUDA_API_VERSION >= 3020
    /**
    * 2D memory copy parameters
    *
    * Describes a source, a destination and the extent of the region to copy.
    */
    typedef struct CUDA_MEMCPY2D_st
    {
        /* Source description */
        size_t       srcXInBytes;   /**< Source X in bytes */
        size_t       srcY;          /**< Source Y */

        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
        const void  *srcHost;       /**< Source host pointer */
        CUdeviceptr  srcDevice;     /**< Source device pointer */
        CUarray      srcArray;      /**< Source array reference */
        size_t       srcPitch;      /**< Source pitch (ignored when src is array) */

        /* Destination description */
        size_t       dstXInBytes;   /**< Destination X in bytes */
        size_t       dstY;          /**< Destination Y */

        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
        void        *dstHost;       /**< Destination host pointer */
        CUdeviceptr  dstDevice;     /**< Destination device pointer */
        CUarray      dstArray;      /**< Destination array reference */
        size_t       dstPitch;      /**< Destination pitch (ignored when dst is array) */

        /* Extent of the copy */
        size_t       WidthInBytes;  /**< Width of 2D memory copy in bytes */
        size_t       Height;        /**< Height of 2D memory copy */
    } CUDA_MEMCPY2D;

    /**
    * 3D memory copy parameters
    *
    * Describes a source, a destination and the extent of the volume to copy.
    * The two reserved pointer members must be NULL.
    */
    typedef struct CUDA_MEMCPY3D_st
    {
        /* Source description */
        size_t       srcXInBytes;   /**< Source X in bytes */
        size_t       srcY;          /**< Source Y */
        size_t       srcZ;          /**< Source Z */
        size_t       srcLOD;        /**< Source LOD */
        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
        const void  *srcHost;       /**< Source host pointer */
        CUdeviceptr  srcDevice;     /**< Source device pointer */
        CUarray      srcArray;      /**< Source array reference */
        void        *reserved0;     /**< Must be NULL */
        size_t       srcPitch;      /**< Source pitch (ignored when src is array) */
        size_t       srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */

        /* Destination description */
        size_t       dstXInBytes;   /**< Destination X in bytes */
        size_t       dstY;          /**< Destination Y */
        size_t       dstZ;          /**< Destination Z */
        size_t       dstLOD;        /**< Destination LOD */
        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
        void        *dstHost;       /**< Destination host pointer */
        CUdeviceptr  dstDevice;     /**< Destination device pointer */
        CUarray      dstArray;      /**< Destination array reference */
        void        *reserved1;     /**< Must be NULL */
        size_t       dstPitch;      /**< Destination pitch (ignored when dst is array) */
        size_t       dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */

        /* Extent of the copy */
        size_t       WidthInBytes;  /**< Width of 3D memory copy in bytes */
        size_t       Height;        /**< Height of 3D memory copy */
        size_t       Depth;         /**< Depth of 3D memory copy */
    } CUDA_MEMCPY3D;

    /**
    * 3D memory cross-context copy parameters
    *
    * Same member sequence as ::CUDA_MEMCPY3D, except that a source and a
    * destination context take the place of the two reserved pointers.
    */
    typedef struct CUDA_MEMCPY3D_PEER_st
    {
        /* Source description */
        size_t       srcXInBytes;   /**< Source X in bytes */
        size_t       srcY;          /**< Source Y */
        size_t       srcZ;          /**< Source Z */
        size_t       srcLOD;        /**< Source LOD */
        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
        const void  *srcHost;       /**< Source host pointer */
        CUdeviceptr  srcDevice;     /**< Source device pointer */
        CUarray      srcArray;      /**< Source array reference */
        CUcontext    srcContext;    /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
        size_t       srcPitch;      /**< Source pitch (ignored when src is array) */
        size_t       srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */

        /* Destination description */
        size_t       dstXInBytes;   /**< Destination X in bytes */
        size_t       dstY;          /**< Destination Y */
        size_t       dstZ;          /**< Destination Z */
        size_t       dstLOD;        /**< Destination LOD */
        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
        void        *dstHost;       /**< Destination host pointer */
        CUdeviceptr  dstDevice;     /**< Destination device pointer */
        CUarray      dstArray;      /**< Destination array reference */
        CUcontext    dstContext;    /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
        size_t       dstPitch;      /**< Destination pitch (ignored when dst is array) */
        size_t       dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */

        /* Extent of the copy */
        size_t       WidthInBytes;  /**< Width of 3D memory copy in bytes */
        size_t       Height;        /**< Height of 3D memory copy */
        size_t       Depth;         /**< Depth of 3D memory copy */
    } CUDA_MEMCPY3D_PEER;

    /**
    * Array descriptor.
    *
    * Width and height of an array together with its element format and the
    * number of channels per element. This is the size_t-based layout used
    * when __CUDA_API_VERSION >= 3020.
    */
    typedef struct CUDA_ARRAY_DESCRIPTOR_st
    {
    size_t Width; /**< Width of array */
    size_t Height; /**< Height of array */

    CUarray_format Format; /**< Array format */
    unsigned int NumChannels; /**< Channels per array element */
    } CUDA_ARRAY_DESCRIPTOR;

    /**
    * 3D array descriptor.
    *
    * Extends the 2D array descriptor with a Depth and a Flags member. This
    * is the size_t-based layout used when __CUDA_API_VERSION >= 3020.
    */
    typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
    {
    size_t Width; /**< Width of 3D array */
    size_t Height; /**< Height of 3D array */
    size_t Depth; /**< Depth of 3D array (number of layers when CUDA_ARRAY3D_LAYERED is set) */

    CUarray_format Format; /**< Array format */
    unsigned int NumChannels; /**< Channels per array element */
    unsigned int Flags; /**< Flags */
    } CUDA_ARRAY3D_DESCRIPTOR;

    #endif /* __CUDA_API_VERSION >= 3020 */

    /**
    * If set, the CUDA array is a collection of layers, where each layer is either a 1D
    * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
    * of layers, not the depth of a 3D array.
    */
    #define CUDA_ARRAY3D_LAYERED 0x01

    /**
    * Deprecated, use CUDA_ARRAY3D_LAYERED (both names expand to 0x01)
    */
    #define CUDA_ARRAY3D_2DARRAY 0x01

    /**
    * This flag must be set in order to bind a surface reference
    * to the CUDA array
    */
    #define CUDA_ARRAY3D_SURFACE_LDST 0x02

    /**
    * Override the texref format with a format inferred from the array.
    * Flag for ::cuTexRefSetArray()
    */
    #define CU_TRSA_OVERRIDE_FORMAT 0x01

    /**
    * Read the texture as integers rather than promoting the values to floats
    * in the range [0,1].
    * Flag for ::cuTexRefSetFlags()
    */
    #define CU_TRSF_READ_AS_INTEGER 0x01

    /**
    * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
    * Flag for ::cuTexRefSetFlags()
    */
    #define CU_TRSF_NORMALIZED_COORDINATES 0x02

    /**
    * Perform sRGB->linear conversion during texture read.
    * Flag for ::cuTexRefSetFlags()
    */
    #define CU_TRSF_SRGB 0x10

    /**
    * End of array terminator for the \p extra parameter to
    * ::cuLaunchKernel
    */
    #define CU_LAUNCH_PARAM_END ((void*)0x00)

    /**
    * Indicator that the next value in the \p extra parameter to
    * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
    * parameters used for launching kernel \p f. This buffer needs to
    * honor all alignment/padding requirements of the individual parameters.
    * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
    * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
    * effect.
    */
    #define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)

    /**
    * Indicator that the next value in the \p extra parameter to
    * ::cuLaunchKernel will be a pointer to a size_t which contains the
    * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
    * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
    * in the \p extra array if the value associated with
    * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
    */
    #define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)

    /**
    * For texture references loaded into the module, use default texunit from
    * texture reference.
    *
    * NOTE(review): expands to an unparenthesized -1, so it should not be
    * embedded directly in a larger arithmetic expression. The same macro is
    * defined again later in this header with the identical token sequence;
    * both definitions must be changed together if this is ever parenthesized.
    */
    #define CU_PARAM_TR_DEFAULT -1

    /**
    * CUDA API made obsolete at API version 3020
    *
    * When __CUDA_API_VERSION_INTERNAL is defined, the unsuffixed type and
    * struct-tag names are remapped to their _v1 spellings, so the legacy
    * declarations that follow are emitted under the _v1 names.
    */
    #if defined(__CUDA_API_VERSION_INTERNAL)
    #define CUdeviceptr CUdeviceptr_v1
    #define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st
    #define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1
    #define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st
    #define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1
    #define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st
    #define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1
    #define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st
    #define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1
    #endif /* defined(__CUDA_API_VERSION_INTERNAL) */

    #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
    /* Legacy device pointer: a plain unsigned int in the internal / pre-3020 API configuration. */
    typedef unsigned int CUdeviceptr;

    /**
    * Legacy 2D memory copy parameters (internal / pre-3020 API configuration).
    *
    * All offsets, pitches and extents are unsigned int in this layout.
    */
    typedef struct CUDA_MEMCPY2D_st
    {
    unsigned int srcXInBytes; /**< Source X in bytes */
    unsigned int srcY; /**< Source Y */
    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
    const void *srcHost; /**< Source host pointer */
    CUdeviceptr srcDevice; /**< Source device pointer */
    CUarray srcArray; /**< Source array reference */
    unsigned int srcPitch; /**< Source pitch (ignored when src is array) */

    unsigned int dstXInBytes; /**< Destination X in bytes */
    unsigned int dstY; /**< Destination Y */
    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
    void *dstHost; /**< Destination host pointer */
    CUdeviceptr dstDevice; /**< Destination device pointer */
    CUarray dstArray; /**< Destination array reference */
    unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */

    unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */
    unsigned int Height; /**< Height of 2D memory copy */
    } CUDA_MEMCPY2D;

    /**
    * Legacy 3D memory copy parameters (internal / pre-3020 API configuration).
    *
    * All offsets, pitches and extents are unsigned int in this layout; the
    * two reserved pointers must be NULL.
    */
    typedef struct CUDA_MEMCPY3D_st
    {
    unsigned int srcXInBytes; /**< Source X in bytes */
    unsigned int srcY; /**< Source Y */
    unsigned int srcZ; /**< Source Z */
    unsigned int srcLOD; /**< Source LOD */
    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
    const void *srcHost; /**< Source host pointer */
    CUdeviceptr srcDevice; /**< Source device pointer */
    CUarray srcArray; /**< Source array reference */
    void *reserved0; /**< Must be NULL */
    unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
    unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */

    unsigned int dstXInBytes; /**< Destination X in bytes */
    unsigned int dstY; /**< Destination Y */
    unsigned int dstZ; /**< Destination Z */
    unsigned int dstLOD; /**< Destination LOD */
    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
    void *dstHost; /**< Destination host pointer */
    CUdeviceptr dstDevice; /**< Destination device pointer */
    CUarray dstArray; /**< Destination array reference */
    void *reserved1; /**< Must be NULL */
    unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
    unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */

    unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */
    unsigned int Height; /**< Height of 3D memory copy */
    unsigned int Depth; /**< Depth of 3D memory copy */
    } CUDA_MEMCPY3D;

    /**
    * Legacy array descriptor (internal / pre-3020 API configuration);
    * Width and Height are unsigned int in this layout.
    */
    typedef struct CUDA_ARRAY_DESCRIPTOR_st
    {
    unsigned int Width; /**< Width of array */
    unsigned int Height; /**< Height of array */

    CUarray_format Format; /**< Array format */
    unsigned int NumChannels; /**< Channels per array element */
    } CUDA_ARRAY_DESCRIPTOR;

    /**
    * Legacy 3D array descriptor (internal / pre-3020 API configuration);
    * Width, Height and Depth are unsigned int in this layout.
    */
    typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
    {
    unsigned int Width; /**< Width of 3D array */
    unsigned int Height; /**< Height of 3D array */
    unsigned int Depth; /**< Depth of 3D array */

    CUarray_format Format; /**< Array format */
    unsigned int NumChannels; /**< Channels per array element */
    unsigned int Flags; /**< Flags */
    } CUDA_ARRAY3D_DESCRIPTOR;

    #endif /* (__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 */

    /*
    * NOTE: every macro in this run is also defined earlier in this header
    * with the same replacement text, so these are benign redefinitions.
    * Any change to a value here must be mirrored in the earlier definition,
    * otherwise the preprocessor will report a conflicting redefinition.
    */

    /**
    * If set, the CUDA array contains an array of 2D slices
    * and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
    * the number of slices, not the depth of a 3D array.
    */
    #define CUDA_ARRAY3D_2DARRAY 0x01

    /**
    * This flag must be set in order to bind a surface reference
    * to the CUDA array
    */
    #define CUDA_ARRAY3D_SURFACE_LDST 0x02

    /**
    * Override the texref format with a format inferred from the array.
    * Flag for ::cuTexRefSetArray()
    */
    #define CU_TRSA_OVERRIDE_FORMAT 0x01

    /**
    * Read the texture as integers rather than promoting the values to floats
    * in the range [0,1].
    * Flag for ::cuTexRefSetFlags()
    */
    #define CU_TRSF_READ_AS_INTEGER 0x01

    /**
    * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
    * Flag for ::cuTexRefSetFlags()
    */
    #define CU_TRSF_NORMALIZED_COORDINATES 0x02

    /**
    * Perform sRGB->linear conversion during texture read.
    * Flag for ::cuTexRefSetFlags()
    */
    #define CU_TRSF_SRGB 0x10

    /**
    * For texture references loaded into the module, use default texunit from
    * texture reference.
    */
    #define CU_PARAM_TR_DEFAULT -1

    /** @} */ /* END CUDA_TYPES */

    /*
    * Calling-convention marker applied to every tcu* function type below:
    * __stdcall when any of the Windows macros is defined, empty otherwise.
    */
    #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    #define CUDAAPI __stdcall
    #else
    #define CUDAAPI
    #endif

    /**
    * \defgroup CUDA_INITIALIZE Initialization
    *
    * This section describes the initialization functions of the low-level CUDA
    * driver application programming interface.
    *
    * @{
    */

    /*********************************
    ** Initialization
    **
    ** Each tcu* typedef names a function type (not a pointer type) that
    ** uses the CUDAAPI calling convention and returns a CUresult.
    *********************************/
    typedef CUresult CUDAAPI tcuInit(unsigned int Flags);

    /*********************************
    ** Driver Version Query
    *********************************/
    typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);

    /************************************
    **
    ** Device management
    **
    ** Function types for the device query entry points. tcuDeviceTotalMem
    ** reports the byte count through a size_t from API version 3020 onward
    ** and through an unsigned int before that.
    **
    ***********************************/

    typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
    typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
    typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
    typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
    #if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
    #else
    typedef CUresult CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
    #endif

    typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
    typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);

    /************************************
    **
    ** Context management
    **
    ** Function types for creating, destroying, attaching/detaching,
    ** pushing/popping, selecting and synchronizing contexts.
    **
    ***********************************/
    typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
    typedef CUresult CUDAAPI tcuCtxDestroy(CUcontext ctx);
    typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
    typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
    typedef CUresult CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
    typedef CUresult CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);

    typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
    typedef CUresult CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);

    typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device);
    typedef CUresult CUDAAPI tcuCtxSynchronize(void);


    /************************************
    **
    ** Module management
    **
    ** Function types for loading/unloading modules and looking up the
    ** functions, globals, texture references and surface references they
    ** contain. tcuModuleGetGlobal reports the size through a size_t from
    ** API version 3020 onward and through an unsigned int before that.
    **
    ***********************************/
    typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
    typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
    typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
    typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
    typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
    typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);

    #if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
    #else
    typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
    #endif

    typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
    typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);

    /************************************
    **
    ** Memory management
    **
    ** Function types for device and page-locked host allocation. Sizes and
    ** pitches are size_t from API version 3020 onward and unsigned int
    ** before that; tcuMemFree is the same in both configurations apart
    ** from the CUdeviceptr type itself.
    **
    ***********************************/
    #if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
    typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
    typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
    typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
    size_t *pPitch,
    size_t WidthInBytes,
    size_t Height,
    // size of biggest r/w to be performed by kernels on this memory
    // 4, 8 or 16 bytes
    unsigned int ElementSizeBytes
    );
    #else
    typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
    typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
    typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
    typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
    unsigned int *pPitch,
    unsigned int WidthInBytes,
    unsigned int Height,
    // size of biggest r/w to be performed by kernels on this memory
    // 4, 8 or 16 bytes
    unsigned int ElementSizeBytes
    );
    #endif

    typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);

    #if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
    #else
    typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
    #endif

    /*
    * Host-memory function types (free, flagged allocation, device-pointer
    * and flag queries, registration of existing host memory) followed by
    * the generic and peer-to-peer memcpy function types. None of these
    * signatures depends on __CUDA_API_VERSION.
    */
    typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
    typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);

    typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
    typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);

    typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
    /* Exactly one terminating ';' — a second one is an empty declaration that pedantic C builds diagnose. */
    typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);
    typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);

    /************************************
    **
    ** Synchronous Memcpy
    **
    ** Intra-device memcpy's done with these functions may execute in parallel with the CPU,
    ** but if host memory is involved, they wait until the copy is done before returning.
    **
    ** Byte counts and array offsets are size_t from API version 3020 onward
    ** and unsigned int before that.
    **
    ***********************************/
    // 1D functions
    #if __CUDA_API_VERSION >= 3020
    // system <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);

    // device <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);

    // device <-> array memory
    typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);

    // system <-> array memory
    typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);

    // array <-> array memory
    typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
    #else
    // system <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);

    // device <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);

    // device <-> array memory
    typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

    // system <-> array memory
    typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
    typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

    // array <-> array memory
    typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
    #endif

    // 2D memcpy (parameters are passed in a CUDA_MEMCPY2D descriptor)
    typedef CUresult CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
    typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);

    // 3D memcpy (parameters are passed in a CUDA_MEMCPY3D descriptor)
    typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);

    /************************************
    **
    ** Asynchronous Memcpy
    **
    ** Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
    ** memcpy's done with these functions execute in parallel with the CPU and, if
    ** the hardware is available, may execute in parallel with the GPU.
    ** Asynchronous memcpy must be accompanied by appropriate stream synchronization.
    **
    ** Each function type takes the CUstream to use as its last parameter.
    ** Byte counts and array offsets are size_t from API version 3020 onward
    ** and unsigned int before that.
    **
    ***********************************/

    // 1D functions
    #if __CUDA_API_VERSION >= 3020
    // system <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
    const void *srcHost, size_t ByteCount, CUstream hStream);
    typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
    CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);

    // device <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
    CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);

    // system <-> array memory
    typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
    const void *srcHost, size_t ByteCount, CUstream hStream);
    typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
    size_t ByteCount, CUstream hStream);
    #else
    // system <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
    const void *srcHost, unsigned int ByteCount, CUstream hStream);
    typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
    CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);

    // device <-> device memory
    typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
    CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);

    // system <-> array memory
    typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
    const void *srcHost, unsigned int ByteCount, CUstream hStream);
    typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
    unsigned int ByteCount, CUstream hStream);
    #endif

    // 2D memcpy
    typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);

    // 3D memcpy
    typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);

    /************************************
    **
    ** Memset
    **
    ** From API version 3020 onward the element count N, the 2D pitch and
    ** the 2D extents are all size_t, matching the parameter types of the
    ** _v2 driver entry points. Declaring N or dstPitch as unsigned int in
    ** that configuration would disagree with the callee on 64-bit targets
    ** and would cap the values at 32 bits. Older API versions keep the
    ** unsigned int forms.
    **
    ** Widening is source-compatible for callers: unsigned int arguments
    ** convert implicitly to size_t.
    **
    ***********************************/
    #if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
    typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
    typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);

    typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
    typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
    typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
    #else
    typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
    typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
    typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);

    typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
    typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
    typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
    #endif

    /************************************
    **
    ** Function management
    **
    ** Function types for configuring a CUfunction (block shape, shared
    ** memory size, cache configuration), querying its attributes, and
    ** launching it with explicit grid/block dimensions on a stream.
    **
    ***********************************/


    typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
    typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
    typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
    typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
    /* kernelParams / extra: see the CU_LAUNCH_PARAM_* sentinels for the layout of the extra array. */
    typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
    unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
    unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
    unsigned int sharedMemBytes,
    CUstream hStream, void **kernelParams, void **extra);

    /************************************
    **
    ** Array management
    **
    ** Function types for creating, describing and destroying CUarray
    ** handles; the 2D variants use CUDA_ARRAY_DESCRIPTOR and the 3D
    ** variants use CUDA_ARRAY3D_DESCRIPTOR.
    **
    ***********************************/

    typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
    typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
    typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);

    typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
    typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);


    /************************************
    **
    ** Texture reference management
    **
    ** Function types for creating/destroying texture references and for
    ** setting and reading back their binding (array or address), format,
    ** address mode, filter mode and flags. The address-binding setters
    ** take size_t sizes from API version 3020 onward and unsigned int
    ** before that.
    **
    ***********************************/
    typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
    typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);

    typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);

    #if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
    typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
    #else
    typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
    typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
    #endif

    typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
    typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
    typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
    typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);

    typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
    typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
    typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
    typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
    typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
    typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);

    /************************************
    **
    ** Surface reference management
    **
    ** Function types for binding a CUarray to a surface reference and
    ** reading the bound array back.
    **
    ***********************************/
    typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
    typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);

    /************************************
    **
    ** Parameter management
    **
    ** Function types for setting the total parameter size of a CUfunction
    ** and for writing integer, float, raw-byte and texture-reference
    ** parameters at a given offset / texture unit.
    **
    ***********************************/

    typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
    typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
    typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
    typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
    typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);


    /************************************
    **
    ** Launch functions
    **
    ** Function types for launching a CUfunction: with no grid arguments,
    ** with a grid width/height, and with a grid width/height on a stream.
    **
    ***********************************/

    typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
    typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
    typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);

    /************************************
    **
    ** Events
    **
    ** Function types for creating, recording, querying, waiting on and
    ** destroying CUevent handles, and for reading the elapsed time in
    ** milliseconds between two events.
    **
    ***********************************/
    typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
    typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
    typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
    typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
    typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
    typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);

    /************************************
    **
    ** Streams
    **
    ** Function types for creating, querying, synchronizing and destroying
    ** CUstream handles.
    **
    ***********************************/
    typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
    typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
    typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
    typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);

    /************************************
    **
    ** Graphics interop
    **
    ** Function types for unregistering, mapping and unmapping
    ** CUgraphicsResource handles and for obtaining the array or device
    ** pointer behind a resource. The mapped-pointer query reports the size
    ** through a size_t from API version 3020 onward and through an
    ** unsigned int before that.
    **
    ***********************************/
    typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
    typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);

    #if __CUDA_API_VERSION >= 3020
    typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
    #else
    typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
    #endif

    typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
    typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
    typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);

    /************************************
    **
    ** Export tables
    **
    ** Function type for looking up an export table by its CUuuid; the table
    ** pointer is returned through ppExportTable.
    **
    ***********************************/
    typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);

    /************************************
    **
    ** Limits
    **
    ** Function types for setting and reading a CUlimit value (a size_t).
    **
    ***********************************/

    typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
    typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);


    /*
    * Function-pointer declarations for the driver entry points. Each
    * pointer has the tcu* function type declared above and is defined in
    * another translation unit; calls such as cuDeviceGet(...) go through
    * these pointers.
    *
    * NOTE(review): tcuInit has no matching pointer in this list —
    * presumably cuInit is provided separately; confirm against the loader.
    */
    extern tcuDriverGetVersion *cuDriverGetVersion;
    extern tcuDeviceGet *cuDeviceGet;
    extern tcuDeviceGetCount *cuDeviceGetCount;
    extern tcuDeviceGetName *cuDeviceGetName;
    extern tcuDeviceComputeCapability *cuDeviceComputeCapability;
    extern tcuDeviceGetProperties *cuDeviceGetProperties;
    extern tcuDeviceGetAttribute *cuDeviceGetAttribute;
    extern tcuCtxDestroy *cuCtxDestroy;
    extern tcuCtxAttach *cuCtxAttach;
    extern tcuCtxDetach *cuCtxDetach;
    extern tcuCtxPushCurrent *cuCtxPushCurrent;
    extern tcuCtxPopCurrent *cuCtxPopCurrent;

    extern tcuCtxSetCurrent *cuCtxSetCurrent;
    extern tcuCtxGetCurrent *cuCtxGetCurrent;

    extern tcuCtxGetDevice *cuCtxGetDevice;
    extern tcuCtxSynchronize *cuCtxSynchronize;
    extern tcuModuleLoad *cuModuleLoad;
    extern tcuModuleLoadData *cuModuleLoadData;
    extern tcuModuleLoadDataEx *cuModuleLoadDataEx;
    extern tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
    extern tcuModuleUnload *cuModuleUnload;
    extern tcuModuleGetFunction *cuModuleGetFunction;
    extern tcuModuleGetTexRef *cuModuleGetTexRef;
    extern tcuModuleGetSurfRef *cuModuleGetSurfRef;
    extern tcuMemFreeHost *cuMemFreeHost;
    extern tcuMemHostAlloc *cuMemHostAlloc;
    extern tcuMemHostGetFlags *cuMemHostGetFlags;

    extern tcuMemHostRegister *cuMemHostRegister;
    extern tcuMemHostUnregister *cuMemHostUnregister;
    extern tcuMemcpy *cuMemcpy;
    extern tcuMemcpyPeer *cuMemcpyPeer;

    extern tcuDeviceTotalMem *cuDeviceTotalMem;
    extern tcuCtxCreate *cuCtxCreate;
    extern tcuModuleGetGlobal *cuModuleGetGlobal;
    extern tcuMemGetInfo *cuMemGetInfo;
    extern tcuMemAlloc *cuMemAlloc;
    extern tcuMemAllocPitch *cuMemAllocPitch;
    extern tcuMemFree *cuMemFree;
    extern tcuMemGetAddressRange *cuMemGetAddressRange;
    extern tcuMemAllocHost *cuMemAllocHost;
    extern tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
    extern tcuFuncSetBlockShape *cuFuncSetBlockShape;
    extern tcuFuncSetSharedSize *cuFuncSetSharedSize;
    extern tcuFuncGetAttribute *cuFuncGetAttribute;
    extern tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
    extern tcuLaunchKernel *cuLaunchKernel;
    extern tcuArrayDestroy *cuArrayDestroy;
    extern tcuTexRefCreate *cuTexRefCreate;
    extern tcuTexRefDestroy *cuTexRefDestroy;
    extern tcuTexRefSetArray *cuTexRefSetArray;
    extern tcuTexRefSetFormat *cuTexRefSetFormat;
    extern tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
    extern tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
    extern tcuTexRefSetFlags *cuTexRefSetFlags;
    extern tcuTexRefGetArray *cuTexRefGetArray;
    extern tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
    extern tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
    extern tcuTexRefGetFormat *cuTexRefGetFormat;
    extern tcuTexRefGetFlags *cuTexRefGetFlags;
    extern tcuSurfRefSetArray *cuSurfRefSetArray;
    extern tcuSurfRefGetArray *cuSurfRefGetArray;
    extern tcuParamSetSize *cuParamSetSize;
    extern tcuParamSeti *cuParamSeti;
    extern tcuParamSetf *cuParamSetf;
    extern tcuParamSetv *cuParamSetv;
    extern tcuParamSetTexRef *cuParamSetTexRef;
    extern tcuLaunch *cuLaunch;
    extern tcuLaunchGrid *cuLaunchGrid;
    extern tcuLaunchGridAsync *cuLaunchGridAsync;
    extern tcuEventCreate *cuEventCreate;
    extern tcuEventRecord *cuEventRecord;
    extern tcuEventQuery *cuEventQuery;
    extern tcuEventSynchronize *cuEventSynchronize;
    extern tcuEventDestroy *cuEventDestroy;
    extern tcuEventElapsedTime *cuEventElapsedTime;
    extern tcuStreamCreate *cuStreamCreate;
    extern tcuStreamQuery *cuStreamQuery;
    extern tcuStreamSynchronize *cuStreamSynchronize;
    extern tcuStreamDestroy *cuStreamDestroy;
    extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
    extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
    extern tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
    extern tcuGraphicsMapResources *cuGraphicsMapResources;
    extern tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
    extern tcuGetExportTable *cuGetExportTable;
    extern tcuCtxSetLimit *cuCtxSetLimit;
    extern tcuCtxGetLimit *cuCtxGetLimit;

    // These functions could be using the CUDA 3.2 interface (_v2).
    // Each name below is declared as an `extern` pointer to the matching t<name>
    // function type, so the pointer variables themselves are defined in another
    // translation unit and callers invoke the driver through them.
    // NOTE(review): presumably the loader binds each pointer to the "_v2"-suffixed
    // driver symbol when it is available -- confirm in the code that resolves them.
    extern tcuMemcpyHtoD *cuMemcpyHtoD;
    extern tcuMemcpyDtoH *cuMemcpyDtoH;
    extern tcuMemcpyDtoD *cuMemcpyDtoD;
    extern tcuMemcpyDtoA *cuMemcpyDtoA;
    extern tcuMemcpyAtoD *cuMemcpyAtoD;
    extern tcuMemcpyHtoA *cuMemcpyHtoA;
    extern tcuMemcpyAtoH *cuMemcpyAtoH;
    extern tcuMemcpyAtoA *cuMemcpyAtoA;
    extern tcuMemcpy2D *cuMemcpy2D;
    extern tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
    extern tcuMemcpy3D *cuMemcpy3D;
    extern tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
    extern tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
    extern tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
    extern tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
    extern tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
    extern tcuMemcpy2DAsync *cuMemcpy2DAsync;
    extern tcuMemcpy3DAsync *cuMemcpy3DAsync;
    extern tcuMemsetD8 *cuMemsetD8;
    extern tcuMemsetD16 *cuMemsetD16;
    extern tcuMemsetD32 *cuMemsetD32;
    extern tcuMemsetD2D8 *cuMemsetD2D8;
    extern tcuMemsetD2D16 *cuMemsetD2D16;
    extern tcuMemsetD2D32 *cuMemsetD2D32;
    extern tcuArrayCreate *cuArrayCreate;
    extern tcuArrayGetDescriptor *cuArrayGetDescriptor;
    extern tcuArray3DCreate *cuArray3DCreate;
    extern tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
    extern tcuTexRefSetAddress *cuTexRefSetAddress;
    extern tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
    extern tcuTexRefGetAddress *cuTexRefGetAddress;
    extern tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;

    /************************************/
    // Unlike the entries above, cuInit is declared as an ordinary function with
    // the CUDAAPI calling convention rather than as an extern function pointer.
    // It takes an unnamed unsigned int, an int named cudaVersion and an opaque
    // hHandleDriver pointer, and returns a CUresult.
    // NOTE(review): presumably this wrapper loads the driver library and fills in
    // the function pointers declared above -- confirm in the implementing source.
    CUresult CUDAAPI cuInit (unsigned int, int cudaVersion, void *hHandleDriver);
    /************************************/

    #ifdef __cplusplus
    }
    #endif

    #endif //__cuda_cuda_h__
    28 changes: 28 additions & 0 deletions nvCPUOPSys.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,28 @@
    /*
    * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
    *
    * Please refer to the NVIDIA end user license agreement (EULA) associated
    * with this source code for terms and conditions that govern your use of
    * this software. Any use, reproduction, disclosure, or distribution of
    * this software and related documentation outside the terms of the EULA
    * is strictly prohibited.
    *
    */

    #ifndef NVCPUOPSYS_H
    #define NVCPUOPSYS_H

    /*
     * Maps compiler- or build-provided platform macros onto the NV_* platform
     * switches tested by the rest of the code (NV_WINDOWS, NV_UNIX, NV_LINUX).
     *
     * Every switch is defined with an empty value and only when it is not already
     * defined, so a build that also passes -DNV_xxx on the command line does not
     * hit a macro-redefinition diagnostic.
     */

    /* Windows targets: _WIN32 or _WIN16. */
    #if (defined(_WIN32) || defined(_WIN16)) && !defined(NV_WINDOWS)
    # define NV_WINDOWS
    #endif

    /* Generic Unix targets, minus the platforms explicitly excluded below. */
    #if (defined(__unix__) || defined(__unix) ) && !defined(nvmacosx) && !defined(vxworks) && !defined(__DJGPP__) && !defined(NV_UNIX) && !defined(__QNX__) && !defined(__QNXNTO__)/* XXX until removed from Makefiles */
    # define NV_UNIX
    #endif /* defined(__unix__) */

    /* Linux targets, unless the build marks itself as NV_VMWARE. */
    #if defined(__linux__) && !defined(NV_LINUX) && !defined(NV_VMWARE)
    # define NV_LINUX
    #endif /* defined(__linux__) */

    #endif
    3,118 changes: 3,118 additions & 0 deletions nvEncodeAPI.h
    3,118 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
    132 changes: 132 additions & 0 deletions nvUtils.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,132 @@
    /*
    * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
    *
    * Please refer to the NVIDIA end user license agreement (EULA) associated
    * with this source code for terms and conditions that govern your use of
    * this software. Any use, reproduction, disclosure, or distribution of
    * this software and related documentation outside the terms of the EULA
    * is strictly prohibited.
    *
    */

    #ifndef NVUTILS_H
    #define NVUTILS_H

    #include "nvCPUOPSys.h"

    #if defined (NV_WINDOWS)
    #include <windows.h>

    #elif defined NV_UNIX
    #include <limits.h>
    #include <sys/time.h>
    #include <time.h>
    #include <unistd.h>

    // Stand-ins for Win32-style names, compiled only in the NV_UNIX branch of the
    // surrounding #if so that code written against those names also builds there.
    #define FALSE 0
    #define TRUE 1
    #define S_OK 0
    // UINT_MAX comes from <limits.h>.
    #define INFINITE UINT_MAX
    // strcasecmp is declared in <strings.h>; code that uses stricmp must include it.
    #define stricmp strcasecmp
    // SEEK_SET comes from <stdio.h>.
    #define FILE_BEGIN SEEK_SET
    #define INVALID_SET_FILE_POINTER (-1)
    #define INVALID_HANDLE_VALUE ((void *)(-1))

    // Opaque handle types are plain void pointers here.
    typedef void* HANDLE;
    typedef void* HINSTANCE;
    // NOTE(review): unsigned long is 64 bits wide on LP64 Unix targets, whereas the
    // Win32 DWORD is 32 bits -- confirm nothing depends on the width.
    // LPWORD is declared here as a pointer to DWORD (unsigned long *), despite its name.
    typedef unsigned long DWORD, *LPWORD;
    typedef DWORD FILE_SIZE;
    // NOTE(review): HRESULT is unsigned here, so "< 0" style failure checks can never
    // be true on this platform -- verify against callers.
    typedef DWORD HRESULT;

    #endif

    // Generic helpers: larger of two values, smaller of two values, absolute value.
    // They are macros, so an argument may be evaluated twice -- do not pass
    // expressions with side effects (e.g. MAX(i++, j)).
    #define MAX(x, y) (((x) > (y)) ? (x) : (y))
    #define MIN(x, y) (((x) < (y)) ? (x) : (y))
    #define FABS(x) (((x) >= 0) ? (x) : (-(x)))

    // Suspends the calling thread for mSec milliseconds.
    //
    // mSec: delay in milliseconds.
    // Returns: always true; the outcome of the underlying sleep call is not reported.
    //
    // On NV_UNIX the delay is handed to nanosleep() as separate seconds and
    // nanoseconds. The earlier usleep(mSec * 1000) form wrapped around in
    // unsigned arithmetic for large delays (INFINITE is UINT_MAX on this platform)
    // and POSIX allows usleep() to reject intervals of one second or more.
    // As with usleep(), the sleep may end early if a signal is delivered.
    inline bool NvSleep(unsigned int mSec)
    {
    #if defined (NV_WINDOWS)
        Sleep(mSec);
    #elif defined NV_UNIX
        struct timespec req;
        req.tv_sec = static_cast<time_t>(mSec / 1000);
        // Always below 1e9, as nanosleep() requires.
        req.tv_nsec = static_cast<long>(mSec % 1000) * 1000000L;
        nanosleep(&req, NULL);
    #else
    #error NvSleep function unknown for this platform.
    #endif
        return true;
    }

    // Reports how many NvQueryPerformanceCounter() ticks make up one second.
    //
    // freq: receives the tick rate; it is set to 0 before the platform query runs.
    // Returns: true on success; false if freq is null or (Windows) the
    //          QueryPerformanceFrequency() call fails.
    inline bool NvQueryPerformanceFrequency(unsigned long long *freq)
    {
        // Guard against a null output pointer instead of dereferencing it.
        if (!freq) {
            return false;
        }

        *freq = 0;
    #if defined (NV_WINDOWS)
        LARGE_INTEGER lfreq;
        if (!QueryPerformanceFrequency(&lfreq)) {
            return false;
        }
        *freq = lfreq.QuadPart;
    #elif defined NV_UNIX
        // NvQueryPerformanceCounter() reports nanoseconds on this platform,
        // so the tick rate is 1e9 ticks per second.
        *freq = 1000000000ULL;
    #else
    #error NvQueryPerformanceFrequency function not defined for this platform.
    #endif

        return true;
    }

    // Convert a count of seconds / microseconds to nanoseconds, computed as
    // unsigned long long. The argument is parenthesized so that the cast applies
    // to the whole expression: SEC_TO_NANO_ULL(a + b) scales (a + b), not just b.
    #define SEC_TO_NANO_ULL(sec) ((unsigned long long)(sec) * 1000000000ULL)
    #define MICRO_TO_NANO_ULL(usec) ((unsigned long long)(usec) * 1000ULL)

    // Reads the current value of the performance counter.
    //
    // counter: receives the tick count; it is set to 0 before the platform query
    //          runs. Divide a difference of two readings by the rate from
    //          NvQueryPerformanceFrequency() to get seconds.
    // Returns: true on success; false if counter is null or the platform clock
    //          query fails.
    //
    // On NV_UNIX the value is in nanoseconds and is read from CLOCK_MONOTONIC
    // rather than gettimeofday(): wall-clock time can be stepped (NTP, manual
    // changes), which would corrupt elapsed-time measurements. The absolute value
    // is therefore only meaningful as a difference between two readings, as with
    // the Windows counter.
    // Note: glibc older than 2.17 needs -lrt for clock_gettime().
    inline bool NvQueryPerformanceCounter(unsigned long long *counter)
    {
        // Guard against a null output pointer instead of dereferencing it.
        if (!counter) {
            return false;
        }

        *counter = 0;
    #if defined (NV_WINDOWS)
        LARGE_INTEGER lcounter;
        if (!QueryPerformanceCounter(&lcounter)) {
            return false;
        }
        *counter = lcounter.QuadPart;
    #elif defined NV_UNIX
        struct timespec ts;
        if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
            return false;
        }

        *counter = static_cast<unsigned long long>(ts.tv_sec) * 1000000000ULL
                 + static_cast<unsigned long long>(ts.tv_nsec);
    #else
    #error NvQueryPerformanceCounter function not defined for this platform.
    #endif
        return true;
    }

    #if defined NV_UNIX
    // Equality for GUID values, provided only for NV_UNIX builds.
    // Two GUIDs are equal when Data1, Data2, Data3 and the eight Data4 entries
    // [0..7] all match; fields are compared in that order and the comparison
    // stops at the first mismatch.
    __inline bool operator==(const GUID &guid1, const GUID &guid2)
    {
        if (guid1.Data1 != guid2.Data1 ||
            guid1.Data2 != guid2.Data2 ||
            guid1.Data3 != guid2.Data3)
        {
            return false;
        }

        for (int idx = 0; idx < 8; ++idx)
        {
            if (guid1.Data4[idx] != guid2.Data4[idx])
            {
                return false;
            }
        }

        return true;
    }

    // Inequality for GUID values: the exact negation of operator== above.
    __inline bool operator!=(const GUID &guid1, const GUID &guid2)
    {
        const bool same = (guid1 == guid2);
        return !same;
    }
    #endif
    #endif

    // Writes a diagnostic to stderr, prefixed with "<file> line <n>: " taken from
    // __FILE__ and __LINE__ at the call site.
    // `message` must be a string literal: it is joined to the prefix by
    // adjacent-literal concatenation and acts as the printf-style format for any
    // further arguments. `##__VA_ARGS__` (a GNU-style extension) removes the
    // trailing comma when no extra arguments are passed.
    // The macro expands to a bare fprintf() call, so <stdio.h> must be included
    // where it is used. It sits after the NVUTILS_H guard's closing #endif, so it
    // is (identically) redefined each time this header is included.
    #define PRINTERR(message, ...) \
    fprintf(stderr, "%s line %d: " message, __FILE__, __LINE__, ##__VA_ARGS__)