jeromeku’s gists

jeromeku / _chi2.c

Created October 10, 2025 13:51 — forked from dfm/_chi2.c

How to wrap C code in Python

	#include <Python.h>
	#include <numpy/arrayobject.h>
	#include "chi2.h"

	/* Docstrings: help text kept in static character arrays, one describing
	 * the module as a whole and one describing the chi2 calculation.
	 * NOTE(review): presumably referenced by the module/method definitions
	 * further down the file, which are not visible in this excerpt. */
	static char module_docstring[] =
	"This module provides an interface for calculating chi-squared using C.";
	static char chi2_docstring[] =
	"Calculate the chi-squared of some data given a model.";

jeromeku / mirage-example-fx-graph.py

Created September 22, 2025 16:08 — forked from ProExpertProg/mirage-example-fx-graph.py

Mirage starting example

	from typing import Optional, Callable, Sequence, Any

	import torch
	from torch import nn, fx
	from torch.library import Library
	import torch.nn.functional as F
	import torch._inductor
	import torch._inductor.compile_fx

	mirage_lib = Library("mirage", "FRAGMENT") # noqa

jeromeku / presentation_urls.txt

Created September 20, 2025 22:53 — forked from matthias-springer/presentation_urls.txt

MLIR Bufferization: From Tensors to MemRefs

	Debugging Spurious Copies: Mini Example
	https://gist.github.com/matthias-springer/81748fe1e530974dd5ff6b3ad57e3eeb

	Debugging Spurious Copies: Matmul, Tiled
	https://gist.github.com/matthias-springer/372162baa30e79c49180bb3ace216995
	https://gist.github.com/matthias-springer/b664feb23be0159f72726025923bb9ca

	Empty Tensor Elimination
	https://gist.github.com/matthias-springer/b3f40d1667c977c29a76cc7a469cc1a0
	https://gist.github.com/matthias-springer/e531580242d27f14e0a239e0b6fe80ae

jeromeku / tv_layout_viz.py

Created September 19, 2025 09:51 — forked from Chillee/tv_layout_viz.py

Cutlass Thread-Value Layout Visualizer

jeromeku / pass_list_as_vector.py

Created September 14, 2025 17:12 — forked from fengxie/pass_list_as_vector.py

CuTe DSL: pass a list from Python and convert it to a vector for the kernel


	from typing import List

	import cutlass
	import cutlass.cute as cute
	from cutlass.cute.runtime import from_dlpack


	@cute.kernel
	def kernel_use_vec_as_arg(vec, res: cute.Tensor):

jeromeku / vector_as_kernel_arg.py

Created September 14, 2025 17:12 — forked from fengxie/vector_as_kernel_arg.py

CuTe DSL: passing a vector as a kernel argument


	import cutlass
	import cutlass.cute as cute
	from cutlass.cute.runtime import from_dlpack


	@cute.kernel
	def kernel_use_vec_as_arg(vec, res: cute.Tensor):
	# cute.print_tensor(vec)
	res.store(vec)

jeromeku / ctypes-nvrtc.py

Created September 8, 2025 13:31 — forked from malfet/ctypes-nvrtc.py

	import ctypes
	import torch
	import time

	def nvrtc_compile(source: str) -> str:
	from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer
	libnvrtc = CDLL('libnvrtc.so')
	def get_error_string() -> str:
	err_p = c_char_p()
	libnvrtc.nvrtcGetErrorString(result, byref(err_str))

jeromeku / test.py

Created August 27, 2025 01:25 — forked from Observer007/test.py

CuTe DSL: inline_asm returning more than one value

	import cutlass
	import cutlass.cute as cute
	from cutlass._mlir.dialects import llvm
	from cutlass._mlir.extras import types as T

	def compare_and_swap_i32(a: cutlass.Int32, b: cutlass.Int32) -> tuple[cutlass.Int32, cutlass.Int32]:
	out_i32x2 = llvm.inline_asm(
	llvm.StructType.get_literal([T.i32(), T.i32()]),
	[cutlass.Int32(a).ir_value(), cutlass.Int32(b).ir_value()],
	"{\n\t"

jeromeku / vllm_forloop.py

Created August 11, 2025 13:19 — forked from vwxyzjn/vllm_forloop.py

	import time
	from vllm import LLM, SamplingParams
	from vllm.inputs import PromptType
	from vllm.outputs import PoolingRequestOutput, RequestOutput
	from typing import Union, cast, Sequence
	from multiprocessing import Queue, Event
	import threading

	class MyLLM(LLM):
	def keep_running(

jeromeku / softmax_quack.py

Created July 11, 2025 12:10 — forked from Chillee/softmax_quack.py

Random Kernel Microbenchmarks

	import argparse
	import time
	from typing import Type

	import torch
	import torch.nn.functional as F
	import torch._inductor.config

	torch._inductor.config.triton.multi_kernel = True