Wei w32zhong
@w32zhong
w32zhong / config.json
Last active September 28, 2025 17:03
sglang
{
  "architectures": [
    "Qwen3ForCausalLMEagle"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
@w32zhong
w32zhong / modular-max.py
Last active July 14, 2025 17:35
vllm compared to nano-vllm
## Setup
# conda create -n modular python=3.11
# uv pip install modular --extra-index-url https://download.pytorch.org/whl/cpu --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
# conda install -c conda-forge gcc=12.1.0
model_path = 'Qwen/Qwen2.5-0.5B'
import time
from max.entrypoints.llm import LLM
from max.pipelines import PipelineConfig
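
Continuing from the imports above, here is a hedged sketch of how these MAX objects are usually wired together and timed; the PipelineConfig and generate() argument names are assumptions based on Modular's published examples, not taken from this gist:

# Sketch only: the model_path= and max_new_tokens= keyword names are assumptions.
pipeline_config = PipelineConfig(model_path=model_path)
llm = LLM(pipeline_config)

prompts = ["The capital of France is"]
start = time.time()
responses = llm.generate(prompts, max_new_tokens=64)
print(responses)
print(f"elapsed: {time.time() - start:.2f}s")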
@w32zhong
w32zhong / code.js
Created June 4, 2025 13:07 — forked from iiLaurens/code.js
Get all clickable elements on a page
window.scrollTo(0, 0)
var bodyRect = document.body.getBoundingClientRect();
var items = Array.prototype.slice.call(
  document.querySelectorAll('*')
).map(function(element) {
  var rect = element.getBoundingClientRect();
  return {
    element: element,
    include: (element.tagName === "BUTTON" || element.tagName === "A" || (element.onclick != null) || window.getComputedStyle(element).cursor == "pointer"),
@w32zhong
w32zhong / train.py
Created May 17, 2025 19:42 — forked from ddh0/train.py
Janky pretraining script for small llama models using HF fineweb - modify according to your needs
import os
import torch
import psutil
import datasets
import glob
from transformers import (
    AutoTokenizer, LlamaConfig, LlamaForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
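
As a rough illustration of how the imports above are typically combined for a from-scratch run, here is a minimal sketch with made-up tiny hyperparameters and a placeholder in-memory corpus (the fork itself streams HF fineweb); none of these values are the fork's actual configuration:

# Continues from the imports above; all sizes below are illustrative assumptions.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

config = LlamaConfig(
    vocab_size=len(tokenizer),
    hidden_size=256,
    intermediate_size=1024,
    num_hidden_layers=4,
    num_attention_heads=4,
)
model = LlamaForCausalLM(config)

# Placeholder corpus instead of fineweb, just to keep the sketch self-contained.
corpus = datasets.Dataset.from_dict({"text": ["hello world"] * 64})
tokenized = corpus.map(
    lambda x: tokenizer(x["text"], truncation=True, max_length=128),
    remove_columns=["text"],
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="tiny-llama-pretrain",
                           per_device_train_batch_size=8, num_train_epochs=1,
                           report_to="none"),
    train_dataset=tokenized,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
trainer.train()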
@w32zhong
w32zhong / Dockerfile
Last active February 28, 2025 16:34
Example Dockerfile
FROM nvcr.io/nvidia/pytorch:23.11-py3
WORKDIR /workspace
RUN pip install -r r1.txt
ADD requirements.txt r2.txt
# FlashAttention-2 compatibility copied from https://github.com/Dao-AILab/flash-attention/issues/836#issuecomment-1951433985
RUN pip install flash-attn==2.5.1.post1
RUN apt update && apt install -y tmux git-lfs
RUN pip install nvitop
ADD . myproject
WORKDIR /workspace/myproject
@w32zhong
w32zhong / grpo_demo.py
Created February 18, 2025 01:17 — forked from willccbb/grpo_demo.py
GRPO Llama-1B
# train_grpo.py
#
# See https://github.com/willccbb/verifiers for ongoing developments
#
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer
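
Continuing from the imports above, a minimal hedged sketch of how these pieces are wired into a trainer; the dataset, reward function, and hyperparameters are placeholders rather than the demo's actual values, and the calling convention is assumed from recent trl releases:

# Placeholder reward: longer completions score higher (illustration only, not the demo's rewards).
def length_reward(completions, **kwargs):
    return [len(c) / 100.0 for c in completions]

# Toy prompt-only dataset standing in for the demo's actual data preparation.
dataset = Dataset.from_dict({"prompt": ["What is 2 + 2?"] * 32})

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Assumed GRPOConfig/GRPOTrainer keyword names; check them against your trl version.
training_args = GRPOConfig(output_dir="grpo-sketch", per_device_train_batch_size=4,
                           num_generations=4, max_completion_length=64)
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[length_reward],
    args=training_args,
    train_dataset=dataset,
)
trainer.train()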
@w32zhong
w32zhong / steps.md
Last active February 10, 2025 22:09

EAGLE v1 Replication

Set up environment and run an inference test:

git clone --branch v1 --depth 1 https://github.com/SafeAILab/EAGLE.git EAGLE-v1
cd EAGLE-v1
wget https://raw.githubusercontent.com/w32zhong/EAGLE/refs/heads/eagle-v1-save/application/test_v1.py -O eagle/application/test_v1.py
pip install -e .
pip install transformers==4.36.2
pip install accelerate==0.21.0
pip install datasets==3.2.0
@w32zhong
w32zhong / gpu_vram_estimate.py
Created October 5, 2024 17:14
GPU VRAM estimate for pre-training LLMs.
import math
def act_mem(layers, seqlen, h_dim, heads, precision=2, bs=1):
""" Returns amount of GPU VRAM (in GB) required to store
intermediate activations for traditional Transformer blocks
"""
mem_bytes = layers * precision * seqlen * bs * h_dim * (
16 + 2/precision + 2*heads*seqlen/h_dim
+ heads*seqlen/(precision*h_dim)
)
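
The preview cuts off before the function returns; assuming it ends by returning mem_bytes / 1024**3, a self-contained worked example might look like this (the hyperparameters and the GB conversion are assumptions for illustration):

# Self-contained sketch; assumes the gist's act_mem finishes with `return mem_bytes / 1024**3`.
def act_mem_gb(layers, seqlen, h_dim, heads, precision=2, bs=1):
    mem_bytes = layers * precision * seqlen * bs * h_dim * (
        16 + 2/precision + 2*heads*seqlen/h_dim
        + heads*seqlen/(precision*h_dim)
    )
    return mem_bytes / 1024**3

# Example: a 7B-class model (32 layers, hidden dim 4096, 32 heads) at 4k context, fp16.
print(f"{act_mem_gb(layers=32, seqlen=4096, h_dim=4096, heads=32):.1f} GB of activations per sample")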
def test(method, bits, random_top_layer, quantize_top_layer, results={}):
    print(prompt)
    start_time = time.time()
    if method == 'vanilla':
        cnt_tokens = test_vanilla(bits)
    elif method == 'eagle':
        cnt_tokens = test_eagle(bits,
            random_top_layer=random_top_layer,
            quantize_top_layer=quantize_top_layer
        )
@I.ir_module
class Module:
    @T.prim_func
    def main(var_A: T.handle, B: T.Buffer((768, 384), "int8"), Scale: T.Buffer((768, 3), "float16"), Zeros: T.Buffer((768, 3), "float16"), var_D: T.handle):
        T.func_attr({"dequantize_info": {"B_decode": {"decode_block": "B_decode", "fast_decoding": T.bool(False), "group_size": 256, "source_format": {"bits": 4, "format": "uint"}, "storage_dtype": "int8", "target_format": "float16", "with_scaling": T.bool(True), "with_zeros": T.bool(True), "zeros_mode": "rescale"}}, "dlight.tensorcore_prenormlized": T.bool(True), "opt_shapes": {"m": [2, 12]}, "tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
        m = T.int32()
        A = T.match_buffer(var_A, (m, 768), "float16")
        D = T.match_buffer(var_D, (m, 768), "float16")
        # with T.block("root"):
        A_reindex_pad_shared_dyn = T.alloc_buffer((1, (m + 127) // 128 * 128, 768), "float16", scope="shared.dyn")