@wconstab
wconstab / dtensor_overhead.py
Created April 22, 2025 18:39
simple dtensor overhead benchmark
import os
import time
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Replicate, Shard
from torch._subclasses.fake_tensor import FakeTensorMode
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
os.environ["WORLD_SIZE"] = "1"
@wconstab
wconstab / logs
Created April 21, 2025 22:25
dtensor cartesian_prod debug
"""
@with_comms
def test_cartesian_product(self):
    # repro an error from test_ops
    # PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=2 python test/distributed/tensor/test_dtensor_ops.py TestDTensorOpsCPU.test_dtensor_op_db_cartesian_prod_cpu_float32
    x = torch.tensor([0.0])
    y = torch.tensor([0.0, 1.0])
    z = torch.tensor(
        [
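
The preview is truncated mid-statement. Below is a rough, self-contained sketch of how such a repro could look outside the DTensor test harness; the single-rank mesh, Replicate placements, and the values of z are assumptions, and on affected versions the cartesian_prod call is expected to hit the sharding-propagation error being debugged.

# Sketch only, not the gist's actual test body.
import os
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Replicate

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "12356")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("RANK", "0")
dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (1,))

x = torch.tensor([0.0])
y = torch.tensor([0.0, 1.0])
z = torch.tensor([0.0, 1.0, 2.0])  # assumed values; the preview is truncated

dx, dy, dz = (distribute_tensor(t, mesh, [Replicate()]) for t in (x, y, z))

# cartesian_prod decomposes into meshgrid/stack ops, each of which needs a
# sharding rule; compare against the plain-tensor result.
out = torch.cartesian_prod(dx, dy, dz)
torch.testing.assert_close(out.full_tensor(), torch.cartesian_prod(x, y, z))
dist.destroy_process_group()
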
@wconstab
wconstab / repro.py
Created January 8, 2025 23:44
demonstrate comm/compute overlapping
import functools
import os
import torch
@functools.cache
def world_group() -> torch.distributed.ProcessGroup:
"""Get NCCL process group, initializing if needed"""
world_size = int(os.environ["WORLD_SIZE"])
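
The preview ends inside the process-group helper. A minimal sketch of the overlap pattern the gist demonstrates follows, assuming one GPU per rank launched via torchrun; the function and buffer names here are assumptions, not the gist's code.

# Sketch: issue an async collective, do independent compute, then wait.
import functools
import os
import torch
import torch.distributed as dist

@functools.cache
def world_group() -> dist.ProcessGroup:
    """Get NCCL process group, initializing if needed."""
    if not dist.is_initialized():
        dist.init_process_group("nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    return dist.group.WORLD

def overlapped_step():
    pg = world_group()
    comm_buf = torch.randn(64 * 1024 * 1024, device="cuda")
    a = torch.randn(4096, 4096, device="cuda")
    b = torch.randn(4096, 4096, device="cuda")

    # Launch the collective asynchronously; NCCL runs it on its own stream.
    work = dist.all_reduce(comm_buf, group=pg, async_op=True)

    # Independent compute on the current stream can overlap with the
    # in-flight all_reduce.
    c = a @ b

    # Block until the collective has completed before using comm_buf.
    work.wait()
    return c, comm_buf

if __name__ == "__main__":
    overlapped_step()
    dist.destroy_process_group()
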
@wconstab
wconstab / repro_142356.py
Created December 11, 2024 01:02
torchrun --nproc-per-node repro_142356.py
import os
import torch
import torch.distributed as dist
from torch.testing._internal.common_utils import find_free_port
init_method = "tcp://"
master_ip = os.getenv("MASTER_ADDR", "localhost")
world_size = 4
(Pdb) first_schedules = [[x for x in rank_data if x["state"] == "scheduled"][:1] for rank_data in all_entries.values()]
(Pdb) first_schedules = [x[0] for x in first_schedules if len(x) > 0]
(Pdb) data = [(x.get("profiling_name", ""), x.get("input_sizes", ""), x.get("collective_seq_id", None)) for x in first_schedules]
(Pdb) import tabulate
(Pdb) print(tabulate.tabulate(data))
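
The same analysis as a small standalone helper, assuming all_entries maps each rank to its list of flight-recorder entry dicts as in the pdb session above:

# Sketch: print the first still-"scheduled" collective on each rank, to spot
# where ranks have diverged in the flight-recorder trace.
import tabulate

def first_scheduled_per_rank(all_entries):
    rows = []
    for rank, rank_data in all_entries.items():
        scheduled = [e for e in rank_data if e["state"] == "scheduled"]
        if scheduled:
            e = scheduled[0]
            rows.append((
                rank,
                e.get("profiling_name", ""),
                e.get("input_sizes", ""),
                e.get("collective_seq_id", None),
            ))
    print(tabulate.tabulate(rows, headers=["rank", "op", "input_sizes", "seq_id"]))
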
@wconstab
wconstab / output.md
Created October 30, 2024 18:14
Saving and restoring RNG state for CPU and CUDA

Repeatedly running the 'init' path shows the same initial tensor allocated on both CUDA and CPU, as long as the initial seed is the same.

Then, repeatedly running the 'restore' path shows that both the CPU and CUDA RNG states are restored consistently from the first run.

python rando.py init --seed 123

x=tensor([ 1.3391,  0.2052, -1.6879,  0.5103, -0.3458,  0.6455,  1.5735,  0.3519,
         1.1298,  0.0098], device='cuda:0')
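
rando.py itself is not shown in this listing; below is a minimal sketch of the save/restore pattern the output describes, where the state file name and CLI shape are assumptions rather than the gist's code.

# Sketch: seed, draw, and save both CPU and CUDA RNG state on 'init';
# reload both states on 'restore' so subsequent draws match.
import argparse
import torch

STATE_FILE = "rng_state.pt"  # hypothetical path

def init(seed: int):
    torch.manual_seed(seed)  # seeds both the CPU and CUDA generators
    x = torch.randn(10, device="cuda")
    print(f"x={x}")
    torch.save(
        {"cpu": torch.get_rng_state(), "cuda": torch.cuda.get_rng_state()},
        STATE_FILE,
    )

def restore():
    state = torch.load(STATE_FILE)
    torch.set_rng_state(state["cpu"])
    torch.cuda.set_rng_state(state["cuda"])
    print(f"cpu draw:  {torch.randn(3)}")
    print(f"cuda draw: {torch.randn(3, device='cuda')}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("mode", choices=["init", "restore"])
    parser.add_argument("--seed", type=int, default=123)
    args = parser.parse_args()
    init(args.seed) if args.mode == "init" else restore()
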
+ export USE_LIBUV=1
+ USE_LIBUV=1
+ TRAINER_DIR=--training.pipeline_parallel_degree
+ NGPU=4
+ LOG_RANK=0
+ CONFIG_FILE=./train_configs/debug_model.toml
+ overrides=
+ '[' 7 -ne 0 ']'
+ overrides='--training.pipeline_parallel_degree 2 --model.norm_type fused_rmsnorm --checkpoint.enable_checkpoint --checkpoint.interval 11'
+ torchrun --nproc_per_node=4 --rdzv_backend c10d --rdzv_endpoint=localhost:0 --local-ranks-filter 0 --role rank --tee 3 train.py --job.config_file ./train_configs/debug_model.toml --training.pipeline_parallel_degree 2 --model.norm_type fused_rmsnorm --checkpoint.enable_checkpoint --checkpoint.interval 11
@wconstab
wconstab / test.py
Created April 16, 2024 18:09
test shows hang when using single microbatch
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
import unittest
import copy
import torch
import torch.distributed as dist
import torch.nn as nn
from pippy.ManualPipelineStage import ManualPipelineStage
from pippy.PipelineSchedule import ScheduleGPipe
from torch.distributed._composable.fsdp.fully_shard import (
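
The preview stops at the imports. Below is a rough sketch of the configuration under test, a GPipe schedule run with a single microbatch, rewritten against the current torch.distributed.pipelining API (the gist predates it and uses pippy's ManualPipelineStage); the toy model, shapes, and a recent PyTorch with runtime shape inference are assumptions.

# Sketch: one pipeline stage per rank, n_microbatches=1 is the case the gist
# reports as hanging. Launch with torchrun --nproc-per-node=<N>.
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.pipelining import PipelineStage, ScheduleGPipe

def main():
    dist.init_process_group("nccl")
    rank, world = dist.get_rank(), dist.get_world_size()
    device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
    torch.cuda.set_device(device)

    # Each rank owns one layer of a toy model.
    layer = nn.Linear(256, 256).to(device)
    stage = PipelineStage(layer, stage_index=rank, num_stages=world, device=device)

    schedule = ScheduleGPipe(stage, n_microbatches=1)

    x = torch.randn(8, 256, device=device)
    if rank == 0:
        schedule.step(x)      # first stage feeds the input
    else:
        out = schedule.step() # later stages receive activations; last returns output

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
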
@wconstab
wconstab / test.py
Created April 12, 2024 00:30
pp stage wrong shape repro (TORCH_LOGS_RANK=0,1 TORCH_LOGS=+pippy python test/test.py)
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
import unittest
import torch
import torch.distributed as dist
import torch.nn as nn
from pippy.ManualPipelineStage import ManualPipelineStage
from pippy.PipelineSchedule import ScheduleGPipe
from torch.distributed._composable.fsdp.fully_shard import fully_shard, MixedPrecisionPolicy
from torch.distributed.device_mesh import init_device_mesh, DeviceMesh
@wconstab
wconstab / dump.txt
Created February 23, 2024 21:06
step_6 torchtrain pp flight dump
{'pg_id': 0, 'seq_id': 3, 'profiling_name': 'nccl:recv 0<-1', 'input_sizes': [[4, 2048, 256]], 'output_sizes': [[4, 2048, 256]], 'state': 'scheduled', 'time_discovered_started_ns': None, 'time_discovered_completed_ns': None, 'frames': 'stack_1', 'first_time': '2024-02-23 12:56:31.060915', 'last_time': '2024-02-23 12:56:40.779100', 'Retired': '[0-1]', 'Active': '[0-1]', 'NotIssued': '[2-511]'}
{'pg_id': 0, 'seq_id': 4, 'profiling_name': 'nccl:recv 0<-1', 'input_sizes': [[4, 2048, 256]], 'output_sizes': [[4, 2048, 256]], 'state': 'scheduled', 'time_discovered_started_ns': None, 'time_discovered_completed_ns': None, 'frames': 'stack_1', 'first_time': '2024-02-23 12:56:41.050293', 'last_time': '2024-02-23 12:56:43.105840', 'Retired': '[0-1]', 'Active': '[0-1]', 'NotIssued': '[2-511]'}
{'pg_id': 0, 'seq_id': 5, 'profiling_name': 'nccl:send 0->1', 'input_sizes': [[4, 2048, 256]], 'output_sizes': [[4, 2048, 256]], 'state': 'scheduled', 'time_discovered_started_ns': None, 'time_discovered_completed_ns': None, 'frames