# Send NumPy array data to CUDA code.
# =============================================================================
# =============================================================================
# MAIN LESSONS:
# 1. python natively provides "buffer protocol" which is interface to C array
# 2. numpy supports this, and offers you a ptr to the C array data
# 3. use ctypes as native interface to C libraries, send them the ptr
# 4. no point using other tools as they all break with updates
# 5. for cuda, compile it as an extern C shared object
#    need to restart python each time .so changes to reimport w ctypes
# https://stackoverflow.com/questions/145270/calling-c-c-from-python
# https://stackoverflow.com/questions/64084033/modern-2020-way-to-call-c-code-from-python
# gcc -fPIC -shared -o mult.so mult.c
# fPIC means position independent code, which is good for shared libraries
# because the code doesn't depend on memory location where it is loaded

import os          # to change directories
import subprocess  # to compile from python side
import ctypes      # to access shared obj library
import numpy as np # to use numpy arrays

# change directory to where the files are located
work_dir = '/home/chad/Desktop/_backups/notes/projects/numpy_2_cuda'
os.chdir(work_dir)

# PART 1: compile CUDA code
# =============================================================================
# =============================================================================
c_source_filename = 'mult.cu'
shared_obj_filename = c_source_filename.replace('.cu', '.so')

# CUDA source: a kernel that doubles each element in place, plus an
# extern "C" host wrapper (extern "C" prevents C++ name mangling so the
# symbol is findable by ctypes).
# FIX: include <cstdint> — int32_t is not guaranteed to be declared
# without it, so compilation could fail depending on the toolchain.
c_source_contents = '''
#include <cstdint>

__global__ void kernel(float *x, int32_t n)
{
    int32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= 2.0f;
}

extern "C" void mult(float* x, int32_t n)
{
    float *d_x;
    cudaMalloc(&d_x, n*sizeof(float));
    cudaMemcpy(d_x, x, n*sizeof(float), cudaMemcpyHostToDevice);
    kernel<<<(n+255)/256, 256>>>(d_x, n);
    cudaMemcpy(x, d_x, n*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_x);
}
'''

# save CUDA source contents to disk
with open(os.path.join(work_dir, c_source_filename), 'w') as f:
    f.write(c_source_contents)

# compile
# https://forums.developer.nvidia.com/t/shared-library-creation/4776/10
# FIX: guard the removal — the original unconditional os.remove() raised
# FileNotFoundError on a first run, before any .so existed.
if os.path.exists(shared_obj_filename):
    os.remove(shared_obj_filename)

# FIX: build the argv list directly instead of str.split(' '), and drop the
# redundant duplicate '-shared' flag the original command carried.
compile_command = [
    '/usr/local/cuda/bin/nvcc',
    '--shared',
    '--compiler-options', '-fPIC',
    c_source_filename,
    '-o', shared_obj_filename,
]
result = subprocess.run(compile_command, capture_output=True, text=True)
# FIX: success is signalled by the exit code, not an empty stderr — nvcc
# may print harmless warnings to stderr even on a successful compile.
# Also: raise instead of assert, which is stripped under `python -O`.
if result.returncode != 0:
    raise RuntimeError(f'nvcc failed (exit {result.returncode}):\n{result.stderr}')

# PART 2: use numpy array with CUDA code
# =============================================================================
# =============================================================================
np_array = np.array([1, 2, 3, 4, 5], dtype=np.float32)

# https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
# the C side indexes a flat contiguous float buffer, so the array must be
# aligned and C-contiguous for the raw data pointer to be valid to pass
assert np_array.flags['ALIGNED']
assert np_array.flags['C_CONTIGUOUS']

# load shared library, specify argument / return types
# FIX: c_int32 matches the C int32_t parameter exactly (c_int is only
# equivalent by platform convention).
my_lib = ctypes.CDLL('./mult.so')
my_lib.mult.argtypes = (ctypes.c_void_p, ctypes.c_int32)
my_lib.mult.restype = None

# call function with pointer to NumPy array data (modifies it in place)
print(np_array)                                  # before: [1. 2. 3. 4. 5.]
my_lib.mult(np_array.ctypes.data, np_array.size)
print(np_array)                                  # after:  [2. 4. 6. 8. 10.]