# Send NumPy array data to CUDA code.
# =============================================================================
# =============================================================================
# MAIN LESSONS:
# 1. python natively provides "buffer protocol" which is interface to C array
# 2. numpy supports this, and offers you a ptr to the C array data
# 3. use ctypes as native interface to C libraries, send them the ptr
# 4. no point using other tools as they all break with updates
# 5. for cuda, compile it as an extern C shared object
#    need to restart python each time .so changes to reimport w ctypes
# https://stackoverflow.com/questions/145270/calling-c-c-from-python
# https://stackoverflow.com/questions/64084033/modern-2020-way-to-call-c-code-from-python
# gcc -fPIC -shared -o mult.so mult.c
# fPIC means position independent code, which is good for shared libraries
# because the code doesn't depend on memory location where it is loaded

import os          # to change directories
import subprocess  # to compile from python side
import ctypes      # to access shared obj library
import numpy as np # to use numpy arrays

# change directory to where the files are located
work_dir = '/home/chad/Desktop/_backups/notes/projects/numpy_2_cuda'
os.chdir(work_dir)

# PART 1: compile CUDA code
# =============================================================================
# =============================================================================
c_source_filename = 'mult.cu'
shared_obj_filename = c_source_filename.replace('.cu', '.so')

# CUDA source: a kernel that doubles each element in place, plus an
# extern "C" host wrapper (extern "C" prevents C++ name mangling so the
# symbol is findable by ctypes).
# FIX: include <cstdint> — int32_t is not guaranteed to be declared
# without it, so compilation could fail depending on the toolchain.
c_source_contents = '''
#include <cstdint>

__global__ void kernel(float *x, int32_t n)
{
    int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= 2.0f;
}

extern "C" void mult(float* x, int32_t n)
{
    float *d_x;
    cudaMalloc(&d_x, n*sizeof(float));
    cudaMemcpy(d_x, x, n*sizeof(float), cudaMemcpyHostToDevice);
    kernel<<<(n+255)/256, 256>>>(d_x, n);
    cudaMemcpy(x, d_x, n*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
}
'''

# save CUDA source contents to disk
with open(os.path.join(work_dir, c_source_filename), 'w') as f:
    f.write(c_source_contents)

# compile
# https://forums.developer.nvidia.com/t/shared-library-creation/4776/10
# FIX: guard the removal — the original unconditional os.remove() raised
# FileNotFoundError on a first run, before any .so existed.
if os.path.exists(shared_obj_filename):
    os.remove(shared_obj_filename)

# FIX: build the argv list directly instead of str.split(' '), and drop the
# redundant duplicate '-shared' flag the original command carried.
compile_command = [
    '/usr/local/cuda/bin/nvcc',
    '--shared',
    '--compiler-options', '-fPIC',
    c_source_filename,
    '-o', shared_obj_filename,
]
result = subprocess.run(compile_command, capture_output=True, text=True)
# FIX: success is signalled by the exit code, not an empty stderr — nvcc
# may print harmless warnings to stderr even on a successful compile.
# Also: raise instead of assert, which is stripped under `python -O`.
if result.returncode != 0:
    raise RuntimeError(f'nvcc failed (exit {result.returncode}):\n{result.stderr}')

# PART 2: use numpy array with CUDA code
# =============================================================================
# =============================================================================
np_array = np.array([1, 2, 3, 4, 5], dtype=np.float32)

# https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
# the C side indexes a flat contiguous float buffer, so the array must be
# aligned and C-contiguous for the raw data pointer to be valid to pass
assert np_array.flags['ALIGNED']
assert np_array.flags['C_CONTIGUOUS']

# load shared library, specify argument / return types
# FIX: c_int32 matches the C int32_t parameter exactly (c_int is only
# equivalent by platform convention).
my_lib = ctypes.CDLL('./mult.so')
my_lib.mult.argtypes = (ctypes.c_void_p, ctypes.c_int32)
my_lib.mult.restype = None

# call function with pointer to NumPy array data (modifies it in place)
print(np_array)                                  # before: [1. 2. 3. 4. 5.]
my_lib.mult(np_array.ctypes.data, np_array.size)
print(np_array)                                  # after:  [2. 4. 6. 8. 10.]