@wconstab
wconstab / dtensor_overhead.py
Created April 22, 2025 18:39
simple dtensor overhead benchmark
import os
import time
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Replicate, Shard
from torch._subclasses.fake_tensor import FakeTensorMode
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
os.environ["WORLD_SIZE"] = "1"
@wconstab
wconstab / logs
Created April 21, 2025 22:25
dtensor cartesian_prod debug
"""
@with_comms
def test_cartesian_product(self):
    # repro an error from test_ops
    # PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=2 python test/distributed/tensor/test_dtensor_ops.py TestDTensorOpsCPU.test_dtensor_op_db_cartesian_prod_cpu_float32
    x = torch.tensor([0.0])
    y = torch.tensor([0.0, 1.0])
    z = torch.tensor(
        [
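
The preview is truncated mid-statement. Below is a rough, self-contained sketch of how such a repro could look outside the DTensor test harness; the single-rank mesh, Replicate placements, and the values of z are assumptions, and on affected versions the cartesian_prod call is expected to hit the sharding-propagation error being debugged.

# Sketch only, not the gist's actual test body.
import os
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Replicate

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "12356")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("RANK", "0")
dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (1,))

x = torch.tensor([0.0])
y = torch.tensor([0.0, 1.0])
z = torch.tensor([0.0, 1.0, 2.0])  # assumed values; the preview is truncated

dx, dy, dz = (distribute_tensor(t, mesh, [Replicate()]) for t in (x, y, z))

# cartesian_prod decomposes into meshgrid/stack ops, each of which needs a
# sharding rule; compare against the plain-tensor result.
out = torch.cartesian_prod(dx, dy, dz)
torch.testing.assert_close(out.full_tensor(), torch.cartesian_prod(x, y, z))
dist.destroy_process_group()
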
@wconstab
wconstab / repro.py
Created January 8, 2025 23:44
demonstrate comm/compute overlapping
import functools
import os
import torch
@functools.cache
def world_group() -> torch.distributed.ProcessGroup:
"""Get NCCL process group, initializing if needed"""
world_size = int(os.environ["WORLD_SIZE"])
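
The preview ends inside the process-group helper. A minimal sketch of the overlap pattern the gist demonstrates follows, assuming one GPU per rank launched via torchrun; the function and buffer names here are assumptions, not the gist's code.

# Sketch: issue an async collective, do independent compute, then wait.
import functools
import os
import torch
import torch.distributed as dist

@functools.cache
def world_group() -> dist.ProcessGroup:
    """Get NCCL process group, initializing if needed."""
    if not dist.is_initialized():
        dist.init_process_group("nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    return dist.group.WORLD

def overlapped_step():
    pg = world_group()
    comm_buf = torch.randn(64 * 1024 * 1024, device="cuda")
    a = torch.randn(4096, 4096, device="cuda")
    b = torch.randn(4096, 4096, device="cuda")

    # Launch the collective asynchronously; NCCL runs it on its own stream.
    work = dist.all_reduce(comm_buf, group=pg, async_op=True)

    # Independent compute on the current stream can overlap with the
    # in-flight all_reduce.
    c = a @ b

    # Block until the collective has completed before using comm_buf.
    work.wait()
    return c, comm_buf

if __name__ == "__main__":
    overlapped_step()
    dist.destroy_process_group()
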
@wconstab
wconstab / repro_142356.py
Created December 11, 2024 01:02
torchrun --nproc-per-node repro_142356.py
import os
import torch
import torch.distributed as dist
from torch.testing._internal.common_utils import find_free_port
init_method = "tcp://"
master_ip = os.getenv("MASTER_ADDR", "localhost")
world_size = 4
(Pdb) first_schedules = [[x for x in rank_data if x["state"] == "scheduled"][:1] for rank_data in all_entries.values()]
(Pdb) first_schedules = [x[0] for x in first_schedules if len(x) > 0]
(Pdb) data = [(x.get("profiling_name", ""), x.get("input_sizes", ""), x.get("collective_seq_id", None)) for x in first_schedules]
(Pdb) import tabulate
(Pdb) print(tabulate.tabulate(data))
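
The same analysis as a small standalone helper, assuming all_entries maps each rank to its list of flight-recorder entry dicts as in the pdb session above:

# Sketch: print the first still-"scheduled" collective on each rank, to spot
# where ranks have diverged in the flight-recorder trace.
import tabulate

def first_scheduled_per_rank(all_entries):
    rows = []
    for rank, rank_data in all_entries.items():
        scheduled = [e for e in rank_data if e["state"] == "scheduled"]
        if scheduled:
            e = scheduled[0]
            rows.append((
                rank,
                e.get("profiling_name", ""),
                e.get("input_sizes", ""),
                e.get("collective_seq_id", None),
            ))
    print(tabulate.tabulate(rows, headers=["rank", "op", "input_sizes", "seq_id"]))
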
@wconstab
wconstab / output.md
Created October 30, 2024 18:14
Saving and restoring RNG state for CPU and CUDA

Repeatedly running the 'init' path shows the same initial tensor allocated on both CUDA and CPU, as long as the initial seed is the same.

Then, repeatedly running the 'restore' path shows that both the CPU and CUDA RNG states are restored consistently from the first run.

python rando.py init --seed 123

x=tensor([ 1.3391,  0.2052, -1.6879,  0.5103, -0.3458,  0.6455,  1.5735,  0.3519,
         1.1298,  0.0098], device='cuda:0')
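
rando.py itself is not shown in this listing; below is a minimal sketch of the save/restore pattern the output describes, where the state file name and CLI shape are assumptions rather than the gist's code.

# Sketch: seed, draw, and save both CPU and CUDA RNG state on 'init';
# reload both states on 'restore' so subsequent draws match.
import argparse
import torch

STATE_FILE = "rng_state.pt"  # hypothetical path

def init(seed: int):
    torch.manual_seed(seed)  # seeds both the CPU and CUDA generators
    x = torch.randn(10, device="cuda")
    print(f"x={x}")
    torch.save(
        {"cpu": torch.get_rng_state(), "cuda": torch.cuda.get_rng_state()},
        STATE_FILE,
    )

def restore():
    state = torch.load(STATE_FILE)
    torch.set_rng_state(state["cpu"])
    torch.cuda.set_rng_state(state["cuda"])
    print(f"cpu draw:  {torch.randn(3)}")
    print(f"cuda draw: {torch.randn(3, device='cuda')}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("mode", choices=["init", "restore"])
    parser.add_argument("--seed", type=int, default=123)
    args = parser.parse_args()
    init(args.seed) if args.mode == "init" else restore()
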
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=--training.pipeline_parallel_degree
+ NGPU=4
+ LOG_RANK=0
+ CONFIG_FILE=./train_configs/debug_model.toml
+ overrides=
+ '[' 7 -ne 0 ']'
+ overrides='--training.pipeline_parallel_degree 2 --model.norm_type fused_rmsnorm --checkpoint.enable_checkpoint --checkpoint.interval 11'
+ torchrun --nproc_per_node=4 --rdzv_backend c10d --rdzv_endpoint=localhost:0 --local-ranks-filter 0 --role rank --tee 3 train.py --job.config_file ./train_configs/debug_model.toml --training.pipeline_parallel_degree 2 --model.norm_type fused_rmsnorm --checkpoint.enable_checkpoint --checkpoint.interval 11
@wconstab
wconstab / test.py
Created April 16, 2024 18:09
test shows hang when using single microbatch
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
import unittest
import copy
import torch
import torch.distributed as dist
import torch.nn as nn
from pippy.ManualPipelineStage import ManualPipelineStage
from pippy.PipelineSchedule import ScheduleGPipe
from torch.distributed._composable.fsdp.fully_shard import (
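
The preview stops at the imports. Below is a rough sketch of the configuration under test, a GPipe schedule run with a single microbatch, rewritten against the current torch.distributed.pipelining API (the gist predates it and uses pippy's ManualPipelineStage); the toy model, shapes, and a recent PyTorch with runtime shape inference are assumptions.

# Sketch: one pipeline stage per rank, n_microbatches=1 is the case the gist
# reports as hanging. Launch with torchrun --nproc-per-node=<N>.
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.pipelining import PipelineStage, ScheduleGPipe

def main():
    dist.init_process_group("nccl")
    rank, world = dist.get_rank(), dist.get_world_size()
    device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
    torch.cuda.set_device(device)

    # Each rank owns one layer of a toy model.
    layer = nn.Linear(256, 256).to(device)
    stage = PipelineStage(layer, stage_index=rank, num_stages=world, device=device)

    schedule = ScheduleGPipe(stage, n_microbatches=1)

    x = torch.randn(8, 256, device=device)
    if rank == 0:
        schedule.step(x)      # first stage feeds the input
    else:
        out = schedule.step() # later stages receive activations; last returns output

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
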
@wconstab
wconstab / test.py
Created April 12, 2024 00:30
pp stage wrong shape repro (TORCH_LOGS_RANK=0,1 TORCH_LOGS=+pippy python test/test.py)
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
import unittest
import torch
import torch.distributed as dist
import torch.nn as nn
from pippy.ManualPipelineStage import ManualPipelineStage
from pippy.PipelineSchedule import ScheduleGPipe
from torch.distributed._composable.fsdp.fully_shard import fully_shard, MixedPrecisionPolicy
from torch.distributed.device_mesh import init_device_mesh, DeviceMesh
@wconstab
wconstab / dump.txt
Created February 23, 2024 21:06
step_6 torchtrain pp flight dump
{'pg_id': 0, 'seq_id': 3, 'profiling_name': 'nccl:recv 0<-1', 'input_sizes': [[4, 2048, 256]], 'output_sizes': [[4, 2048, 256]], 'state': 'scheduled', 'time_discovered_started_ns': None, 'time_discovered_completed_ns': None, 'frames': 'stack_1', 'first_time': '2024-02-23 12:56:31.060915', 'last_time': '2024-02-23 12:56:40.779100', 'Retired': '[0-1]', 'Active': '[0-1]', 'NotIssued': '[2-511]'}
{'pg_id': 0, 'seq_id': 4, 'profiling_name': 'nccl:recv 0<-1', 'input_sizes': [[4, 2048, 256]], 'output_sizes': [[4, 2048, 256]], 'state': 'scheduled', 'time_discovered_started_ns': None, 'time_discovered_completed_ns': None, 'frames': 'stack_1', 'first_time': '2024-02-23 12:56:41.050293', 'last_time': '2024-02-23 12:56:43.105840', 'Retired': '[0-1]', 'Active': '[0-1]', 'NotIssued': '[2-511]'}
{'pg_id': 0, 'seq_id': 5, 'profiling_name': 'nccl:send 0->1', 'input_sizes': [[4, 2048, 256]], 'output_sizes': [[4, 2048, 256]], 'state': 'scheduled', 'time_discovered_started_ns': None, 'time_discovered_completed_ns': None, 'frames