"""Example: hand a small vector to a CuTe DSL kernel as a by-value argument.

The JIT function builds a 10-element Float32 fragment, fills it with 1.0 and
passes its loaded value to a single-thread kernel, which stores it into the
result tensor supplied by the caller.
"""

import cutlass
import cutlass.cute as cute
from cutlass.cute.runtime import from_dlpack

import torch


@cute.kernel
def kernel_use_vec_as_arg(vec, res: cute.Tensor):
    """Store the vector value ``vec`` into the tensor ``res``.

    ``vec`` is whatever the launcher passes in; in this file that is the
    result of ``load()`` on a 10-element Float32 fragment.
    """
    # Debug aid kept from the original example (disabled):
    # cute.print_tensor(vec)
    res.store(vec)


@cute.jit
def vector_as_kernel_arg(res: cute.Tensor):
    """Build a vector of ten 1.0 values and launch the kernel that writes it to ``res``."""
    # The original author describes this fragment as an array/vector created
    # on the CPU side.
    vec = cute.make_fragment(10, dtype=cutlass.Float32)
    vec.fill(1.0)

    # Per the original example, the vector is handed to the kernel as an
    # argument with no explicit host-to-device copy written by the user.
    vec_value = vec.load()
    kernel_use_vec_as_arg(vec_value, res).launch(
        grid=[1, 1, 1],
        block=[1, 1, 1],
    )


# Driver: allocate a zeroed 10-element float32 CUDA tensor, run the example,
# wait for the GPU to finish, then show what the kernel wrote.
res = torch.zeros(10, dtype=torch.float32, device="cuda")
vector_as_kernel_arg(from_dlpack(res))
torch.cuda.synchronize()
print(res)