// -*- compile-command: "nvcc -m 32 -arch sm_30 -Xptxas=-v -cubin kernel.cu"; -*- // // // surface surf; // // // union pxl_rgbx_24 { uint1 b32; struct { unsigned r : 8; unsigned g : 8; unsigned b : 8; unsigned na : 8; }; }; // // // extern "C" __global__ void pxl_kernel(const int width) { // pixel coordinates const int idx = (blockDim.x * blockIdx.x) + threadIdx.x; const int x = idx % width; const int y = idx / width; // pixel color const int t = (unsigned int)clock() / 1250000; // 1.25 GHz const int xt = (idx + t) % width; const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f); const unsigned int bar = ((y + t) / 32) & 3; union pxl_rgbx_24 rgbx; rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0; rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0; rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0; rgbx.na = 255; // cudaBoundaryModeZero squelches out-of-bound writes surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock() surf, x*sizeof(rgbx), y, cudaBoundaryModeZero); } // // // #define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor extern "C" cudaError_t pxl_kernel_launcher(cudaArray_const_t array, const int width, const int height, cudaStream_t stream) { cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array); if (cuda_err) return cuda_err; const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK; if (blocks > 0) pxl_kernel<<>>(width); return cudaSuccess; } // // //