import ctypes import torch import time def nvrtc_compile(source: str) -> str: from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer libnvrtc = CDLL('libnvrtc.so') def get_error_string() -> str: err_p = c_char_p() libnvrtc.nvrtcGetErrorString(result, byref(err_str)) return err_p.value.decode() prog = ctypes.c_void_p() res = libnvrtc.nvrtcCreateProgram(byref(prog), source, "nvrtc.cu", 0, None, None) if res != 0: raise RuntimeError(f"Can't create program: {get_error_string()}") if libnvrtc.nvrtcCompileProgram(prog, 0, None) != 0: raise RuntimeError(f"Can't compile: {get_error_string()}") ptx_size = c_size_t() if libnvrtc.nvrtcGetPTXSize(prog, byref(ptx_size)) != 0: raise RuntimeError("Can't get PTX size: {get_error_string()}") ptx = create_string_buffer(ptx_size.value) if libnvrtc.nvrtcGetPTX(prog, ptx) != 0: raise RuntimeError(f"Can't get PTX: {get_error_string()}") libnvrtc.nvrtcDestroyProgram(ctypes.byref(prog)) return ptx.value.decode() # Load CUDA driver and NVRTC libcuda = ctypes.CDLL('libcuda.so') CUDA_SUCCESS = 0 # Helper: check CUDA errors def check_cuda(result): if result != CUDA_SUCCESS: err_str = ctypes.c_char_p() libcuda.cuGetErrorString(result, ctypes.byref(err_str)) raise RuntimeError(f'CUDA error: {err_str.value.decode()}') # CUDA kernel as string kernel = b""" extern "C" __global__ void vector_add(const float* a, const float* b, float* c, int n) { int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) c[i] = a[i] + b[i]; } """ # 1. Compile with NVRTC start_time = time.time() ptx = nvrtc_compile(kernel) # 3. Load PTX module module = ctypes.c_void_p() with torch.cuda.default_stream(): check_cuda(libcuda.cuModuleLoadData(ctypes.byref(module), ptx.encode("latin-1"))) func = ctypes.c_void_p() check_cuda(libcuda.cuModuleGetFunction(ctypes.byref(func), module, b"vector_add")) compile_time = time.time() - start_time print(f"Kernel compiled in {compile_time:.2f} seconds") # 4. Prepare data N = 512 a = torch.rand(N, device="cuda") b = torch.rand(N, device="cuda") c = torch.empty_like(a) d_a = ctypes.c_void_p(a.data_ptr()) d_b = ctypes.c_void_p(b.data_ptr()) d_c = ctypes.c_void_p(c.data_ptr()) # 5. Launch kernel threads = 256 blocks = (N + threads - 1) // threads int_arg = ctypes.c_int(N) arg1 = ctypes.byref(d_a) arg2 = ctypes.byref(d_b) arg3 = ctypes.byref(d_c) arg4 = ctypes.byref(int_arg) args = (ctypes.c_void_p * 4)( ctypes.cast(arg1, ctypes.c_void_p), ctypes.cast(arg2, ctypes.c_void_p), ctypes.cast(arg3, ctypes.c_void_p), ctypes.cast(arg4, ctypes.c_void_p) ) check_cuda(libcuda.cuLaunchKernel(func, blocks, 1, 1, threads, 1, 1, 0, None, args, None)) # Print a few results print("Result (first 5):", c[:5])