Skip to content

Instantly share code, notes, and snippets.

View cli99's full-sized avatar
🐼

Cheng Li cli99

🐼
View GitHub Profile
@cli99
cli99 / cursor.md
Last active August 18, 2025 16:33
Cursor Setup
@cli99
cli99 / one_pass_variance.py
Last active October 17, 2024 03:41
welford's variance
# https://jonisalonen.com/2013/deriving-welfords-method-for-computing-variance/
import torch
def two_pass_variance(data):
    """Return the sample variance of ``data`` using the two-pass method.

    The first pass computes the arithmetic mean; the second pass sums the
    squared deviations from that mean. The sum is divided by
    ``len(data) - 1``, i.e. this is the unbiased (sample) variance.

    Args:
        data: A sized iterable of numbers (it must support ``len()`` and be
            iterable more than once).

    Returns:
        The sample variance of the values in ``data``.

    Raises:
        ZeroDivisionError: For plain Python numbers, when ``data`` holds
            fewer than two values (the divisor ``n`` or ``n - 1`` is zero).
    """
    n = len(data)
    # Pass 1: arithmetic mean.
    mean = sum(data) / n
    # Pass 2: squared deviations from the mean; a generator avoids building
    # an intermediate list.
    squared_deviations = sum((x - mean) ** 2 for x in data)
    return squared_deviations / (n - 1)
@cli99
cli99 / test_autoquant.py
Created October 16, 2024 21:51
torchao autoquant
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torchao
from torchao.quantization.autoquant import (
DEFAULT_AUTOQUANT_CLASS_LIST,
DEFAULT_INT4_AUTOQUANT_CLASS_LIST,
OTHER_AUTOQUANT_CLASS_LIST,
)
from torchao.quantization.quant_api import (
@cli99
cli99 / get_compiled_triton_code.py
Last active October 9, 2024 04:44
get torch compiled triton code
import torch
from torch._inductor.utils import get_code, get_triton_code
def my_model(x):
    """Return the element-wise square of ``x`` computed with ``torch.square``."""
    return torch.square(x)
compiled_model = torch.compile(my_model)
@cli99
cli99 / get_device_info.py
Created October 3, 2024 07:36
Get CUDA information
import pycuda.autoinit
import pycuda.driver as cuda
# Get the first CUDA device (index 0)
device = cuda.Device(0)
# List of attributes you want to get
attributes = [
cuda.device_attribute.MAX_THREADS_PER_BLOCK,
cuda.device_attribute.MAX_BLOCK_DIM_X,
@cli99
cli99 / test_Float8_e4m3fn.py
Last active September 23, 2024 22:08
Float8_e4m3fn
import torch
from torch.utils.cpp_extension import load_inline
finfo = torch.finfo(torch.float8_e4m3fn)
print(f"finfo: {finfo}")
# finfo(resolution=1, min=-448, max=448, eps=0.125, smallest_normal=0.015625, tiny=0.015625, dtype=float8_e4m3fn)
cuda_source = """
C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<c10::Float8_e4m3fn>::max();
void test() {
# speechmatics.com/company/articles-and-news/timing-operations-in-pytorch
import time
import torch
# 400000000B/1000000 = 400 MB
a = torch.randn(1000, 1000, device="cuda")
torch.softmax(a, dim=1)
@cli99
cli99 / test_torch_compile.py
Created September 18, 2024 00:14
torch.compile
import timeit
import torch
@torch.compile() # 0.103 seconds
# @torch.compile(fullgraph=True) # 0.105 seconds
# @torch.compile(fullgraph=False) # 0.102 seconds
# @torch.compile(options={"triton.cudagraphs": False}, fullgraph=True) # 0.104 seconds
# @torch.compile(
@cli99
cli99 / test_fp8.py
Last active August 29, 2024 02:34
vLLM FP8
import os
import time
import torch
import transformers
from torch.profiler import ProfilerActivity, profile, record_function
from vllm import LLM, SamplingParams
os.environ["HOST_IP"] = "10.42.10.16"
# https://github.com/huggingface/transformers/pull/32047
# CUDA Nightly
# pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121/
# pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu121/
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config