yuikns · November 5, 2016 16:19
diff --git a/interop.c b/interop.c

 //
 //
 //

 #include <stdlib.h>

 //
 //
 //

 #include "interop.h"

 //
 //
 //

 struct pxl_interop
 {
  // w x h
  int                    width;
  int                    height;

  // GL buffers
  GLuint                 fb0;
  GLuint                 rb0;

  // CUDA resources
  cudaGraphicsResource_t cgr0;
 };

 //
 //
 //

 struct pxl_interop*
 pxl_interop_create(GLFWwindow* window)
 {
  struct pxl_interop* const interop = (struct pxl_interop*)malloc(sizeof(struct pxl_interop));
  
  // init cuda graphics resource
  interop->cgr0 = NULL;

  // render buffer object w/a color buffer
  glGenRenderbuffers(1,&interop->rb0);

  // frame buffer object
  glGenFramebuffers(1,&interop->fb0);

  // return it
  return interop;
 }


 void
 pxl_interop_destroy(struct pxl_interop* const interop)
 {
  free(interop);
 }

 //
 //
 //

 cudaError_t
 pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height)
 {
  // save new size
  interop->width  = width;
  interop->height = height;

  //
  // RESIZE FBO'S COLOR BUFFER
  //
  
  // bind rbo
  glBindRenderbuffer   (GL_RENDERBUFFER,interop->rb0);
  // resize color buffer
  glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);

  // bind fbo to read
  glBindFramebuffer        (GL_FRAMEBUFFER,interop->fb0);
  // attach rb0 to the fb0
  glFramebufferRenderbuffer(GL_FRAMEBUFFER,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);

  // unbind rbo
  glBindRenderbuffer(GL_RENDERBUFFER,0);

  // bind to default fb
  glBindFramebuffer(GL_FRAMEBUFFER,0);
  
  //
  // REGISTER RBO WITH CUDA
  //
  
  cudaError_t cuda_err;

  // unregister
  if (interop->cgr0 != NULL)
    cuda_err = cudaGraphicsUnregisterResource(interop->cgr0);

  // register image
  cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr0,
                                         interop->rb0,
                                         GL_RENDERBUFFER,
                                         cudaGraphicsRegisterFlagsSurfaceLoadStore | cudaGraphicsRegisterFlagsWriteDiscard);

  // diddle some flags
  cuda_err = cudaGraphicsResourceSetMapFlags(interop->cgr0,cudaGraphicsMapFlagsWriteDiscard);

  return cuda_err;
 }

 void
 pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height)
 {
  *width  = interop->width;
  *height = interop->height;
 }

 //
 //
 //

 cudaError_t
 pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream)
 {
  cudaError_t cuda_err;
  
  // map graphics resources
  cuda_err = cudaGraphicsMapResources(1,&interop->cgr0,stream);

  // get a CUDA Array
  cuda_err = cudaGraphicsSubResourceGetMappedArray(cuda_array,interop->cgr0,0,0);

  return cuda_err;
 }

 cudaError_t
 pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
 {
  cudaError_t cuda_err;
  
  cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr0,stream);

  return cuda_err;
 }
  
 //
 //
 //

 void
 pxl_interop_clear(struct pxl_interop* const interop)
 {
  /*
  const GLenum draw_buffer[] = { GL_COLOR_ATTACHMENT0 };
  const GLuint clear_color[] = { 255, 0, 0, 255 };
                       
  glNamedFramebufferDrawBuffers(interop->fb0,1,draw_buffer);
  glClearNamedFramebufferuiv(interop->fb0,GL_COLOR,0,clear_color);
  */

  const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
  glInvalidateNamedFramebufferData(interop->fb0,1,attachments);
 }

 //
 //
 //

 void
 pxl_interop_blit(struct pxl_interop* const interop)
 {
  /*
  glBlitFramebuffer(0,0,              interop->width,interop->height,
                    0,interop->height,interop->width,0,
                    GL_COLOR_BUFFER_BIT,
                    GL_NEAREST);
  */

  glBlitNamedFramebuffer(interop->fb0,0,
                         0,0,              interop->width,interop->height,
                         0,interop->height,interop->width,0,
                         GL_COLOR_BUFFER_BIT,
                         GL_NEAREST);
 }

 //
 //
 //
diff --git a/interop.h b/interop.h

 /*
 * Copyright 2015 Allan MacKinnon.  All rights reserved.
 *
 */

 #pragma once

 //
 //
 //

 #include <glad/glad.h>
 #include <GLFW/glfw3.h>

 //
 //
 //

 #include <cuda_runtime.h>
 #include <cuda_gl_interop.h>

 //
 //
 //

 struct pxl_interop*
 pxl_interop_create();

 void
 pxl_interop_destroy(struct pxl_interop* const interop);

 //
 //
 //

 cudaError_t
 pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height);

 void
 pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height);

 //
 //
 //

 cudaError_t
 pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream);

 cudaError_t
 pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);

 //
 //
 //

 void
 pxl_interop_clear(struct pxl_interop* const interop);

 void
 pxl_interop_blit(struct pxl_interop* const interop);

 //
 //
 //
diff --git a/kernel.cu b/kernel.cu
 // -*- compile-command: "nvcc -m 32 -arch sm_30 -Xptxas=-v -cubin kernel.cu"; -*-

 //
 //
 //

 surface<void,cudaSurfaceType2D> surf;

 //
 //
 //

 union pxl_rgbx_24
 {
  uint1       b32;

  struct {
    unsigned  r  : 8;
    unsigned  g  : 8;
    unsigned  b  : 8;
    unsigned  na : 8;
  };
 };

 //
 //
 //

 extern "C"
 __global__
 void
 pxl_kernel(const int width)
 {
  // pixel coordinates
  const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
  const int x   = idx % width;
  const int y   = idx / width;

  // pixel color
  const int          t    = (unsigned int)clock() / 1250000; // 1.25 GHz
  const int          xt   = (idx + t) % width;
  const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
  const unsigned int bar  = ((y + t) / 32) & 3;

  union pxl_rgbx_24  rgbx;

  rgbx.r  = (bar == 0) || (bar == 1) ? ramp : 0;
  rgbx.g  = (bar == 0) || (bar == 2) ? ramp : 0;
  rgbx.b  = (bar == 0) || (bar == 3) ? ramp : 0;
  rgbx.na = 255;

  // cudaBoundaryModeZero squelches out-of-bound writes
  surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
              surf,
              x*sizeof(rgbx),
              y,
              cudaBoundaryModeZero);
 }

 //
 //
 //

 #define PXL_KERNEL_THREADS_PER_BLOCK  256 // enough for 4Kx2 monitor

 extern "C"
 cudaError_t
 pxl_kernel_launcher(cudaArray_const_t array,
                    const int         width,
                    const int         height,
                    cudaStream_t      stream)
 {
  cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array);

  if (cuda_err)
    return cuda_err;

  const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

  if (blocks > 0)
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);

  return cudaSuccess;
 }

 //
 //
 //
diff --git a/main.c b/main.c
 //
 //
 //

 #include <glad/glad.h>
 #include <GLFW/glfw3.h>

 //
 //
 //

 #include <stdlib.h>
 #include <stdio.h>

 //
 //
 //

 #include <cuda_runtime.h>
 #include <cuda_gl_interop.h>

 //
 //
 //

 #include "interop.h"

 //
 // FPS COUNTER FROM HERE:
 //
 // http://antongerdelan.net/opengl/glcontext2.html
 //

 static
 void
 pxl_glfw_fps(GLFWwindow* window)
 {
  // static fps counters
  static double stamp_prev  = 0.0;
  static int    frame_count = 0;

  // locals
  const double stamp_curr = glfwGetTime();
  const double elapsed    = stamp_curr - stamp_prev;
  
  if (elapsed > 0.5)
    {
      stamp_prev = stamp_curr;
      
      const double fps = (double)frame_count / elapsed;

      int  width, height;
      char tmp[64];

      glfwGetFramebufferSize(window,&width,&height);
  
      sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps);

      glfwSetWindowTitle(window,tmp);

      frame_count = 0;
    }

  frame_count++;
 }

 //
 //
 //

 static
 void
 pxl_glfw_error_callback(int error, const char* description)
 {
  fputs(description,stderr);
 }

 static
 void
 pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods)
 {
  if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
    glfwSetWindowShouldClose(window, GL_TRUE);
 }

 static
 void
 pxl_glfw_init(GLFWwindow** window, const int width, const int height)
 {
  //
  // INITIALIZE GLFW/GLAD
  //
  
  glfwSetErrorCallback(pxl_glfw_error_callback);

  if (!glfwInit())
    exit(EXIT_FAILURE);
 
  const GLFWvidmode* mode = glfwGetVideoMode(glfwGetPrimaryMonitor());

  glfwWindowHint(GLFW_DEPTH_BITS,            0);
  glfwWindowHint(GLFW_STENCIL_BITS,          0);

  glfwWindowHint(GLFW_SRGB_CAPABLE,          GL_TRUE);

  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);

  glfwWindowHint(GLFW_OPENGL_PROFILE,        GLFW_OPENGL_CORE_PROFILE);

  *window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);

  if (*window == NULL)
    {
      glfwTerminate();
      exit(EXIT_FAILURE);
    }

  glfwMakeContextCurrent(*window);

  // set up GLAD
  gladLoadGLLoader((GLADloadproc)glfwGetProcAddress);

  // ignore vsync for now
  glfwSwapInterval(0);

  // enable SRGB 
  // glEnable(GL_FRAMEBUFFER_SRGB);

  // only copy r/g/b
  // glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
 }

 //
 //
 //

 static
 void
 pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
 {
  // get context
  struct pxl_interop* const interop = glfwGetWindowUserPointer(window);

  pxl_interop_resize(interop,width,height);
 }

 //
 //
 //

 cudaError_t
 pxl_kernel_launcher(cudaArray_const_t array, const int width, const int height, cudaStream_t stream);

 //
 //
 //

 int
 main(int argc, char* argv[])
 {
  //
  // INIT GLFW
  //

  GLFWwindow* window;

  pxl_glfw_init(&window,1024,1024);

  //
  // INIT CUDA
  //
  cudaError_t cuda_err;
  
  int gl_device_id,gl_device_count;
      
  cuda_err = cudaGLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll);

  int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
  
  cuda_err = cudaSetDevice(cuda_device_id);

  //
  // INFO
  //
  struct cudaDeviceProp props;

  cuda_err = cudaGetDeviceProperties(&props,gl_device_id);
  printf("GL   : %-24s (%2d)\n",props.name,props.multiProcessorCount);

  cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
  printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);

  //
  // CREATE A CUDA STREAM
  //

  cudaStream_t stream;
  
  cuda_err = cudaStreamCreate(&stream);

  //
  // CREATE AND SAVE INTEROP INSTANCE
  //
  
  struct pxl_interop* const interop = pxl_interop_create(window);

  glfwSetWindowUserPointer(window,interop);

  //
  // SET CALLBACKS
  //

  glfwSetKeyCallback            (window,pxl_glfw_key_callback);
  glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
  
  //
  // GET ACTUAL WINDOW SIZE
  //
  
  int width, height;

  // get initial width/height
  glfwGetFramebufferSize(window,&width,&height);

  // resize with initial window dimensions
  cuda_err = pxl_interop_resize(interop,width,height);

  //
  // LOOP UNTIL DONE
  //
  
  while (!glfwWindowShouldClose(window))
    {
      //
      // MONITOR FPS
      //

      pxl_glfw_fps(window);
      
      //
      // EXECUTE CUDA KERNEL ON RENDER BUFFER
      //

      int         width,height;
      cudaArray_t cuda_array;

      pxl_interop_get_size(interop,&width,&height);

      cuda_err = pxl_interop_map(interop,&cuda_array,stream);

      cuda_err = pxl_kernel_launcher(cuda_array,width,height,stream);

      cuda_err = pxl_interop_unmap(interop,stream);

      cuda_err = cudaStreamSynchronize(stream);

      //
      // BLIT
      // 

      pxl_interop_blit(interop);

      //
      // SWAP
      //
      
      glfwSwapBuffers(window);

      //
      // PUMP/POLL/WAIT
      //
      
      glfwPollEvents(); // glfwWaitEvents();
    }

  //
  // CLEANUP
  //
  
  pxl_interop_destroy(interop);
  
  glfwDestroyWindow(window);

  glfwTerminate();

  cudaDeviceReset();

  // missing some clean up here
  
  exit(EXIT_SUCCESS);
 }

 //
 //
 //

	//
	//
	//

	#include <stdlib.h>

	//
	//
	//

	#include "interop.h"

	//
	//
	//

	struct pxl_interop
	{
	// w x h
	int width;
	int height;

	// GL buffers
	GLuint fb0;
	GLuint rb0;

	// CUDA resources
	cudaGraphicsResource_t cgr0;
	};

	//
	//
	//

	struct pxl_interop*
	pxl_interop_create(GLFWwindow* window)
	{
	struct pxl_interop* const interop = (struct pxl_interop*)malloc(sizeof(struct pxl_interop));

	// init cuda graphics resource
	interop->cgr0 = NULL;

	// render buffer object w/a color buffer
	glGenRenderbuffers(1,&interop->rb0);

	// frame buffer object
	glGenFramebuffers(1,&interop->fb0);

	// return it
	return interop;
	}


	void
	pxl_interop_destroy(struct pxl_interop* const interop)
	{
	free(interop);
	}

	//
	//
	//

	cudaError_t
	pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height)
	{
	// save new size
	interop->width = width;
	interop->height = height;

	//
	// RESIZE FBO'S COLOR BUFFER
	//

	// bind rbo
	glBindRenderbuffer (GL_RENDERBUFFER,interop->rb0);
	// resize color buffer
	glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);

	// bind fbo to read
	glBindFramebuffer (GL_FRAMEBUFFER,interop->fb0);
	// attach rb0 to the fb0
	glFramebufferRenderbuffer(GL_FRAMEBUFFER,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);

	// unbind rbo
	glBindRenderbuffer(GL_RENDERBUFFER,0);

	// bind to default fb
	glBindFramebuffer(GL_FRAMEBUFFER,0);

	//
	// REGISTER RBO WITH CUDA
	//

	cudaError_t cuda_err;

	// unregister
	if (interop->cgr0 != NULL)
	cuda_err = cudaGraphicsUnregisterResource(interop->cgr0);

	// register image
	cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr0,
	interop->rb0,
	GL_RENDERBUFFER,
	cudaGraphicsRegisterFlagsSurfaceLoadStore \| cudaGraphicsRegisterFlagsWriteDiscard);

	// diddle some flags
	cuda_err = cudaGraphicsResourceSetMapFlags(interop->cgr0,cudaGraphicsMapFlagsWriteDiscard);

	return cuda_err;
	}

	void
	pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height)
	{
	*width = interop->width;
	*height = interop->height;
	}

	//
	//
	//

	cudaError_t
	pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream)
	{
	cudaError_t cuda_err;

	// map graphics resources
	cuda_err = cudaGraphicsMapResources(1,&interop->cgr0,stream);

	// get a CUDA Array
	cuda_err = cudaGraphicsSubResourceGetMappedArray(cuda_array,interop->cgr0,0,0);

	return cuda_err;
	}

	cudaError_t
	pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
	{
	cudaError_t cuda_err;

	cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr0,stream);

	return cuda_err;
	}

	//
	//
	//

	void
	pxl_interop_clear(struct pxl_interop* const interop)
	{
	/*
	const GLenum draw_buffer[] = { GL_COLOR_ATTACHMENT0 };
	const GLuint clear_color[] = { 255, 0, 0, 255 };

	glNamedFramebufferDrawBuffers(interop->fb0,1,draw_buffer);
	glClearNamedFramebufferuiv(interop->fb0,GL_COLOR,0,clear_color);
	*/

	const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
	glInvalidateNamedFramebufferData(interop->fb0,1,attachments);
	}

	//
	//
	//

	void
	pxl_interop_blit(struct pxl_interop* const interop)
	{
	/*
	glBlitFramebuffer(0,0, interop->width,interop->height,
	0,interop->height,interop->width,0,
	GL_COLOR_BUFFER_BIT,
	GL_NEAREST);
	*/

	glBlitNamedFramebuffer(interop->fb0,0,
	0,0, interop->width,interop->height,
	0,interop->height,interop->width,0,
	GL_COLOR_BUFFER_BIT,
	GL_NEAREST);
	}

	//
	//
	//

	/*
	* Copyright 2015 Allan MacKinnon. All rights reserved.
	*
	*/

	#pragma once

	//
	//
	//

	#include <glad/glad.h>
	#include <GLFW/glfw3.h>

	//
	//
	//

	#include <cuda_runtime.h>
	#include <cuda_gl_interop.h>

	//
	//
	//

	struct pxl_interop*
	pxl_interop_create();

	void
	pxl_interop_destroy(struct pxl_interop* const interop);

	//
	//
	//

	cudaError_t
	pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height);

	void
	pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height);

	//
	//
	//

	cudaError_t
	pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream);

	cudaError_t
	pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);

	//
	//
	//

	void
	pxl_interop_clear(struct pxl_interop* const interop);

	void
	pxl_interop_blit(struct pxl_interop* const interop);

	//
	//
	//
	// -- compile-command: "nvcc -m 32 -arch sm_30 -Xptxas=-v -cubin kernel.cu"; --

	//
	//
	//

	surface<void,cudaSurfaceType2D> surf;

	//
	//
	//

	union pxl_rgbx_24
	{
	uint1 b32;

	struct {
	unsigned r : 8;
	unsigned g : 8;
	unsigned b : 8;
	unsigned na : 8;
	};
	};

	//
	//
	//

	extern "C"
	__global__
	void
	pxl_kernel(const int width)
	{
	// pixel coordinates
	const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
	const int x = idx % width;
	const int y = idx / width;

	// pixel color
	const int t = (unsigned int)clock() / 1250000; // 1.25 GHz
	const int xt = (idx + t) % width;
	const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
	const unsigned int bar = ((y + t) / 32) & 3;

	union pxl_rgbx_24 rgbx;

	rgbx.r = (bar == 0) \|\| (bar == 1) ? ramp : 0;
	rgbx.g = (bar == 0) \|\| (bar == 2) ? ramp : 0;
	rgbx.b = (bar == 0) \|\| (bar == 3) ? ramp : 0;
	rgbx.na = 255;

	// cudaBoundaryModeZero squelches out-of-bound writes
	surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
	surf,
	x*sizeof(rgbx),
	y,
	cudaBoundaryModeZero);
	}

	//
	//
	//

	#define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor

	extern "C"
	cudaError_t
	pxl_kernel_launcher(cudaArray_const_t array,
	const int width,
	const int height,
	cudaStream_t stream)
	{
	cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array);

	if (cuda_err)
	return cuda_err;

	const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

	if (blocks > 0)
	pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);

	return cudaSuccess;
	}

	//
	//
	//