Install the Cursor CLI from https://api2.cursor.sh/updates/download-latest?os=cli-alpine-x64 (the arm64 variant ships as vscode-cli-alpine-arm64.tar.gz), then run `./cursor tunnel`.
```python
# https://jonisalonen.com/2013/deriving-welfords-method-for-computing-variance/
def two_pass_variance(data):
    # First pass computes the mean; second pass sums squared deviations.
    n = len(data)
    mean = sum(data) / n
    var = sum((x - mean) ** 2 for x in data) / (n - 1)  # sample variance
    return var
```
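The article linked above derives Welford's one-pass method, which avoids the second sweep over the data and stays numerically stable for streaming input. A minimal sketch:

```python
def welford_variance(data):
    # Single pass: maintain a running mean and M2, the sum of squared deviations
    n, mean, m2 = 0, 0.0, 0.0
    for x in data:
        n += 1
        delta = x - mean
        mean += delta / n
        m2 += delta * (x - mean)  # note: uses the *updated* mean
    return m2 / (n - 1)  # sample variance, matching two_pass_variance
```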
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torchao
from torchao.quantization.autoquant import (
    DEFAULT_AUTOQUANT_CLASS_LIST,
    DEFAULT_INT4_AUTOQUANT_CLASS_LIST,
    OTHER_AUTOQUANT_CLASS_LIST,
)
from torchao.quantization.quant_api import (
    quantize_,  # the original import list was truncated here
)
```
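These class lists plug into `torchao.autoquant`, which benchmarks each candidate quantization per layer and keeps the fastest. A sketch of the usual flow, assuming a generic HF causal LM (the model name is a placeholder):

```python
model_name = "meta-llama/Llama-2-7b-hf"  # placeholder; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="cuda"
)

# autoquant wraps the compiled model; qtensor_class_list sets the search space
model = torchao.autoquant(
    torch.compile(model, mode="max-autotune"),
    qtensor_class_list=DEFAULT_AUTOQUANT_CLASS_LIST,
)

# Running one batch triggers the per-layer shape recording and benchmarking
inputs = tokenizer("Hello, world", return_tensors="pt").to("cuda")
model.generate(**inputs, max_new_tokens=8)
```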
```python
import torch
from torch._inductor.utils import get_code, get_triton_code

def my_model(x):
    return torch.square(x)

compiled_model = torch.compile(my_model)
```
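To actually see what Inductor generates, pass a sample input through the helpers imported above (assuming they take the compiled callable plus its args, as in PyTorch's own test utilities):

```python
x = torch.randn(1024, device="cuda")
# get_code returns the full generated output code; get_triton_code
# returns just the Triton kernel source for this input
print(get_triton_code(compiled_model, x))
```

Setting the environment variable `TORCH_LOGS=output_code` before running is a no-code alternative that dumps the same generated source.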
```python
import pycuda.autoinit
import pycuda.driver as cuda

# Get the first CUDA device (index 0)
device = cuda.Device(0)

# List of attributes you want to get
attributes = [
    cuda.device_attribute.MAX_THREADS_PER_BLOCK,
    cuda.device_attribute.MAX_BLOCK_DIM_X,
]

# Query each attribute on the device (the original list was truncated here)
for attr in attributes:
    print(attr, device.get_attribute(attr))
```
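If PyTorch is already around, most of the same device limits are exposed without PyCUDA via `torch.cuda.get_device_properties`:

```python
import torch

props = torch.cuda.get_device_properties(0)
# e.g. name, total_memory, multi_processor_count
print(props.name, props.total_memory, props.multi_processor_count)
```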
```python
import torch
from torch.utils.cpp_extension import load_inline

finfo = torch.finfo(torch.float8_e4m3fn)
print(f"finfo: {finfo}")
# finfo(resolution=1, min=-448, max=448, eps=0.125, smallest_normal=0.015625, tiny=0.015625, dtype=float8_e4m3fn)

cuda_source = """
C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<c10::Float8_e4m3fn>::max();
void test() {
    // body truncated in the original notes
}
"""
```
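The same ±448 limit matters on the Python side: e4m3fn has no inf, so the usual recipe is to scale values into range and clamp to `finfo.max` before casting. A minimal sketch:

```python
x = torch.randn(4, 4, device="cuda") * 1000  # values well outside the FP8 range
fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0

# Scale into range, clamp, then cast; keep the scale for dequantization
scale = x.abs().max() / fp8_max
x_fp8 = (x / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
print(x_fp8.dtype, scale.item())
```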
```python
# speechmatics.com/company/articles-and-news/timing-operations-in-pytorch
import time
import torch

# 1000 * 1000 float32 values * 4 B = 4,000,000 B = 4 MB
a = torch.randn(1000, 1000, device="cuda")

torch.cuda.synchronize()  # flush pending work before starting the clock
start = time.time()
torch.softmax(a, dim=1)
torch.cuda.synchronize()  # kernel launches are async; sync before stopping
print(f"softmax: {time.time() - start:.6f} s")
```
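CUDA events time on the GPU itself and avoid host-side clock noise; a sketch reusing the tensor `a` from above:

```python
start_evt = torch.cuda.Event(enable_timing=True)
end_evt = torch.cuda.Event(enable_timing=True)

start_evt.record()
torch.softmax(a, dim=1)
end_evt.record()
torch.cuda.synchronize()  # wait until end_evt has actually been reached
print(f"softmax: {start_evt.elapsed_time(end_evt):.3f} ms")  # elapsed_time is in ms
```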
```python
import timeit
import torch

@torch.compile()  # 0.103 seconds
# @torch.compile(fullgraph=True)  # 0.105 seconds
# @torch.compile(fullgraph=False)  # 0.102 seconds
# @torch.compile(options={"triton.cudagraphs": False}, fullgraph=True)  # 0.104 seconds
# @torch.compile(
def fn(x):
    # hypothetical body; the original function was truncated after the decorators
    return torch.softmax(x, dim=1)
```
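A timeit harness matching the timings noted in the comments above (input shape is an assumption; the original didn't show one):

```python
x = torch.randn(1000, 1000, device="cuda")
fn(x)  # warm-up: the first call pays the compilation cost
torch.cuda.synchronize()
print(timeit.timeit(lambda: (fn(x), torch.cuda.synchronize()), number=100))
```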
```python
import os
import time
import torch
import transformers
from torch.profiler import ProfilerActivity, profile, record_function
from vllm import LLM, SamplingParams

os.environ["HOST_IP"] = "10.42.10.16"
```
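A sketch of how these pieces combine to profile a vLLM generation; the model name and prompt are placeholders:

```python
llm = LLM(model="facebook/opt-125m")  # placeholder model
sampling = SamplingParams(temperature=0.8, max_tokens=32)

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    with record_function("vllm_generate"):
        llm.generate(["The capital of France is"], sampling)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```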
```python
# https://github.com/huggingface/transformers/pull/32047
# CUDA Nightly
# pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121/
# pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu121/
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config
```
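The PR above adds `FbgemmFp8Config`; passing it to `from_pretrained` quantizes the weights to FP8 via fbgemm-gpu on load. A sketch, with the model name as a placeholder:

```python
model_name = "meta-llama/Meta-Llama-3-8B"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=FbgemmFp8Config(),
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=8)[0]))
```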