Skip to content

Instantly share code, notes, and snippets.

@jeromeku
jeromeku / _chi2.c
Created October 10, 2025 13:51 — forked from dfm/_chi2.c
How to wrap C code in Python
#include <Python.h>
#include <numpy/arrayobject.h>
#include "chi2.h"
/* Docstrings */
static char module_docstring[] =
"This module provides an interface for calculating chi-squared using C.";
static char chi2_docstring[] =
"Calculate the chi-squared of some data given a model.";
@jeromeku
jeromeku / mirage-example-fx-graph.py
Created September 22, 2025 16:08 — forked from ProExpertProg/mirage-example-fx-graph.py
Mirage starting example
from typing import Optional, Callable, Sequence, Any
import torch
from torch import nn, fx
from torch.library import Library
import torch.nn.functional as F
import torch._inductor
import torch._inductor.compile_fx
mirage_lib = Library("mirage", "FRAGMENT") # noqa
@jeromeku
jeromeku / presentation_urls.txt
Created September 20, 2025 22:53 — forked from matthias-springer/presentation_urls.txt
MLIR Bufferization: From Tensors to MemRefs
Debugging Spurious Copies: Mini Example
https://gist.github.com/matthias-springer/81748fe1e530974dd5ff6b3ad57e3eeb
Debugging Spurious Copies: Matmul, Tiled
https://gist.github.com/matthias-springer/372162baa30e79c49180bb3ace216995
https://gist.github.com/matthias-springer/b664feb23be0159f72726025923bb9ca
Empty Tensor Elimination
https://gist.github.com/matthias-springer/b3f40d1667c977c29a76cc7a469cc1a0
https://gist.github.com/matthias-springer/e531580242d27f14e0a239e0b6fe80ae
@jeromeku
jeromeku / tv_layout_viz.py
Created September 19, 2025 09:51 — forked from Chillee/tv_layout_viz.py
Cutlass Thread-Value Layout Visualizer
import math
import cutlass.cute as cute
import cutlass
def visualize_tv_layout(
tiler_mn: tuple[int, int],
tv_layout, # (((thr_shape),(val_shape)),
# ((thr_stride),(val_stride)))
*,
font_size: int = 10,
@jeromeku
jeromeku / pass_list_as_vector.py
Created September 14, 2025 17:12 — forked from fengxie/pass_list_as_vector.py
CuTe DSL pass list from python and convert to vector for kernel
from typing import List
import cutlass
import cutlass.cute as cute
from cutlass.cute.runtime import from_dlpack
@cute.kernel
def kernel_use_vec_as_arg(vec, res: cute.Tensor):
@jeromeku
jeromeku / vector_as_kernel_arg.py
Created September 14, 2025 17:12 — forked from fengxie/vector_as_kernel_arg.py
CuTe DSL passing vector as kernel argument
import cutlass
import cutlass.cute as cute
from cutlass.cute.runtime import from_dlpack
@cute.kernel
def kernel_use_vec_as_arg(vec, res: cute.Tensor):
# cute.print_tensor(vec)
res.store(vec)
import ctypes
import torch
import time
def nvrtc_compile(source: str) -> str:
from ctypes import CDLL, c_void_p, c_char_p, c_size_t, byref, create_string_buffer
libnvrtc = CDLL('libnvrtc.so')
def get_error_string() -> str:
err_p = c_char_p()
libnvrtc.nvrtcGetErrorString(result, byref(err_str))
@jeromeku
jeromeku / test.py
Created August 27, 2025 01:25 — forked from Observer007/test.py
cute dsl inline_asm returns more than one values
import cutlass
import cutlass.cute as cute
from cutlass._mlir.dialects import llvm
from cutlass._mlir.extras import types as T
def compare_and_swap_i32(a: cutlass.Int32, b: cutlass.Int32) -> tuple[cutlass.Int32, cutlass.Int32]:
out_i32x2 = llvm.inline_asm(
llvm.StructType.get_literal([T.i32(), T.i32()]),
[cutlass.Int32(a).ir_value(), cutlass.Int32(b).ir_value()],
"{\n\t"
import time
from vllm import LLM, SamplingParams
from vllm.inputs import PromptType
from vllm.outputs import PoolingRequestOutput, RequestOutput
from typing import Union, cast, Sequence
from multiprocessing import Queue, Event
import threading
class MyLLM(LLM):
def keep_running(
@jeromeku
jeromeku / softmax_quack.py
Created July 11, 2025 12:10 — forked from Chillee/softmax_quack.py
Random Kernel Microbenchmarks
import argparse
import time
from typing import Type
import torch
import torch.nn.functional as F
import torch._inductor.config
torch._inductor.config.triton.multi_kernel = True