tokenbender · October 12, 2025 06:57 · Jun 26, 2025 · Jun 26, 2025
diff --git a/gistfile1.txt → train_modal_standalone.py b/gistfile1.txt → train_modal_standalone.py
diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,786 @@
+import os
+import sys
+import time
+import math
+import pickle
+from contextlib import nullcontext
+from pathlib import Path
+import subprocess
+from dataclasses import dataclass
+import inspect
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed import init_process_group, destroy_process_group
+
+# Modal imports
+import modal
+
+# ============================================================================
+# CONFIGURATION - All settings embedded here, no CLI args needed
+# ============================================================================
+
+# Modal hardware request
+N_GPUS = 4  # GPUs requested from Modal and handed to torchrun
+GPU_TYPE = "A100"  # Modal GPU type string, e.g. "A100", "H200", "A10G"
+
+# Hyperparameters for the character-level Shakespeare run.
+# Entries left as None are filled in by train() once the dataset size is known.
+CONFIG = dict(
+    # I/O
+    out_dir="/data/checkpoints/shakespeare",
+    eval_interval=250,  # train() shrinks this when max_iters < 20
+    log_interval=10,  # train() sets this to 1 when max_iters < 20
+    eval_iters=200,  # train() caps this at 50 when max_iters < 20
+    eval_only=False,
+    always_save_checkpoint=True,
+    init_from="scratch",  # "scratch" or "resume" (resume reads out_dir/ckpt.pt)
+
+    # wandb logging
+    wandb_log=False,
+    wandb_project="nanogpt-shakespeare",
+    wandb_run_name="shakespeare-char",
+
+    # data
+    dataset="shakespeare_char",
+    gradient_accumulation_steps=4,  # train() asserts this is divisible by the DDP world size
+    batch_size=64,
+    block_size=256,
+
+    # model
+    n_layer=6,
+    n_head=6,
+    n_embd=384,
+    dropout=0.2,
+    bias=False,
+
+    # training length; max_iters is derived from this and the dataset size
+    num_epochs=21.0,
+
+    # adamw optimizer
+    learning_rate=1e-3,
+    max_iters=None,  # computed by train() from num_epochs
+    weight_decay=1e-1,
+    beta1=0.9,
+    beta2=0.95,
+    grad_clip=1.0,  # 0.0 disables gradient clipping
+
+    # learning rate schedule
+    decay_lr=True,  # train() switches this off when max_iters < 10
+    warmup_iters=None,  # None -> 2% of max_iters, at least 1
+    lr_decay_iters=None,  # None -> max_iters
+    min_lr=1e-4,
+
+    # DDP settings
+    backend="nccl",
+
+    # system
+    device="cuda",
+    dtype="bfloat16",  # one of "float32", "bfloat16", "float16"
+    compile=True,
+)
+
+# ============================================================================
+# MODEL DEFINITION - Embedded from model.py
+# ============================================================================
+
+class LayerNorm(nn.Module):
+    """Layer normalization whose additive bias term can be switched off."""
+
+    def __init__(self, ndim, bias):
+        super().__init__()
+        # The scale always exists; the shift is only allocated when requested.
+        self.weight = nn.Parameter(torch.ones(ndim))
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(ndim))
+        else:
+            self.bias = None
+
+    def forward(self, input):
+        # Normalize over the trailing dimension(s) described by the weight's shape.
+        normalized_shape = self.weight.shape
+        return F.layer_norm(input, normalized_shape, self.weight, self.bias, 1e-5)
+
+class CausalSelfAttention(nn.Module):
+    """Multi-head self-attention in which a position only attends to itself and earlier positions."""
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        n_embd, use_bias = config.n_embd, config.bias
+        # one fused projection producing queries, keys and values for every head
+        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=use_bias)
+        # projection applied to the re-assembled head outputs
+        self.c_proj = nn.Linear(n_embd, n_embd, bias=use_bias)
+        # dropout on the attention weights (manual path) and on the layer output
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.resid_dropout = nn.Dropout(config.dropout)
+        self.n_head = config.n_head
+        self.n_embd = n_embd
+        self.dropout = config.dropout
+        # use the fused kernel when this PyTorch build exposes it (PyTorch >= 2.0)
+        self.flash = hasattr(F, 'scaled_dot_product_attention')
+        if not self.flash:
+            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
+            # lower-triangular mask for the manual path, registered as the buffer "bias"
+            size = config.block_size
+            causal_mask = torch.tril(torch.ones(size, size)).view(1, 1, size, size)
+            self.register_buffer("bias", causal_mask)
+
+    def forward(self, x):
+        B, T, C = x.size()  # batch size, sequence length, embedding width (n_embd)
+        heads_shape = (B, T, self.n_head, C // self.n_head)
+
+        # project once, split into q/k/v, then move the head axis forward: (B, nh, T, hs)
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+        k = k.view(*heads_shape).transpose(1, 2)
+        q = q.view(*heads_shape).transpose(1, 2)
+        v = v.view(*heads_shape).transpose(1, 2)
+
+        if self.flash:
+            # fused kernel; attention dropout is only active in training mode
+            drop_p = self.dropout if self.training else 0
+            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
+        else:
+            # manual path: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+            scale = 1.0 / math.sqrt(k.size(-1))
+            att = (q @ k.transpose(-2, -1)) * scale
+            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
+            att = self.attn_dropout(F.softmax(att, dim=-1))
+            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+        # put the heads back side by side, then apply the output projection
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        return self.resid_dropout(self.c_proj(y))
+
+class MLP(nn.Module):
+    """Feed-forward sub-layer: widen to 4 * n_embd, GELU, project back, dropout."""
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.n_embd
+        hidden = 4 * width
+        self.c_fc    = nn.Linear(width, hidden, bias=config.bias)
+        self.gelu    = nn.GELU()
+        self.c_proj  = nn.Linear(hidden, width, bias=config.bias)
+        self.dropout = nn.Dropout(config.dropout)
+
+    def forward(self, x):
+        # expand -> non-linearity -> contract -> dropout
+        hidden = self.gelu(self.c_fc(x))
+        return self.dropout(self.c_proj(hidden))
+
+class Block(nn.Module):
+    """One transformer layer: normalized attention, then a normalized MLP, each added back to its input."""
+
+    def __init__(self, config):
+        super().__init__()
+        width, use_bias = config.n_embd, config.bias
+        self.ln_1 = LayerNorm(width, bias=use_bias)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = LayerNorm(width, bias=use_bias)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        # each sub-layer sees a normalized copy of x; its output is added to the un-normalized x
+        attn_out = self.attn(self.ln_1(x))
+        x = x + attn_out
+        mlp_out = self.mlp(self.ln_2(x))
+        return x + mlp_out
+
+@dataclass
+class GPTConfig:
+    """Architecture hyperparameters read by GPT and its sub-modules."""
+
+    # maximum sequence length; also the number of rows in the position-embedding table
+    block_size: int = 1024
+    # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+    vocab_size: int = 50304
+    # number of transformer blocks
+    n_layer: int = 12
+    # attention heads per block; n_embd must be divisible by this
+    n_head: int = 12
+    # embedding width
+    n_embd: int = 768
+    # dropout probability used throughout the model
+    dropout: float = 0.0
+    # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+    bias: bool = True
+
+class GPT(nn.Module):
+    """Decoder-only transformer language model whose token embedding and output head share one weight."""
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.vocab_size is not None
+        assert config.block_size is not None
+        self.config = config
+
+        # token + position embeddings, the stack of blocks, and the final norm
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            drop = nn.Dropout(config.dropout),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = LayerNorm(config.n_embd, bias=config.bias),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # Weight tying. Under torch.compile() this can emit
+        # "UserWarning: functional_call was passed multiple values for tied weights.
+        # This behavior is deprecated and will be an error in future versions";
+        # so far it seems to be harmless. TODO investigate
+        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
+
+        # default init for every Linear / Embedding
+        self.apply(self._init_weights)
+        # residual projections get a smaller std that shrinks with depth, per GPT-2 paper
+        resid_std = 0.02/math.sqrt(2 * config.n_layer)
+        for name, param in self.named_parameters():
+            if name.endswith('c_proj.weight'):
+                torch.nn.init.normal_(param, mean=0.0, std=resid_std)
+
+        # report number of parameters
+        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
+
+    def get_num_params(self, non_embedding=True):
+        """
+        Count the model's parameters.
+        With non_embedding=True (default) the position-embedding table is left out.
+        The token-embedding table stays in the count because, through weight tying,
+        the same tensor is the weight of the output layer.
+        """
+        total = sum(p.numel() for p in self.parameters())
+        if not non_embedding:
+            return total
+        return total - self.transformer.wpe.weight.numel()
+
+    def _init_weights(self, module):
+        # N(0, 0.02) for Linear and Embedding weights; Linear biases start at zero
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            torch.nn.init.zeros_(module.bias)
+
+    def forward(self, idx, targets=None):
+        """Return (logits, loss); loss is None and only the last position is scored when targets is None."""
+        _, t = idx.size()
+        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+        positions = torch.arange(0, t, dtype=torch.long, device=idx.device) # shape (t)
+
+        # embeddings: (b, t, n_embd) for tokens, (t, n_embd) for positions
+        tok_emb = self.transformer.wte(idx)
+        pos_emb = self.transformer.wpe(positions)
+        x = self.transformer.drop(tok_emb + pos_emb)
+        for layer in self.transformer.h:
+            x = layer(x)
+        x = self.transformer.ln_f(x)
+
+        if targets is None:
+            # inference-time shortcut: only run lm_head on the final position;
+            # indexing with the list [-1] keeps the time dimension
+            return self.lm_head(x[:, [-1], :]), None
+
+        # training / evaluation: score every position against its target
+        logits = self.lm_head(x)
+        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+        return logits, loss
+
+    def crop_block_size(self, block_size):
+        """Shrink the model's context length in place (e.g. after loading a larger checkpoint)."""
+        assert block_size <= self.config.block_size
+        self.config.block_size = block_size
+        cropped_wpe = self.transformer.wpe.weight[:block_size]
+        self.transformer.wpe.weight = nn.Parameter(cropped_wpe)
+        for layer in self.transformer.h:
+            # the mask buffer only exists on the manual (non-flash) attention path
+            if hasattr(layer.attn, 'bias'):
+                layer.attn.bias = layer.attn.bias[:,:,:block_size,:block_size]
+
+    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
+        """Build AdamW with weight decay applied to tensors of rank >= 2 only."""
+        # only parameters that take gradients are optimized
+        trainable = [p for _, p in self.named_parameters() if p.requires_grad]
+        # rank >= 2 (matmul weights, embeddings) decays; rank < 2 (biases, norm scales) does not
+        decay_params = [p for p in trainable if p.dim() >= 2]
+        nodecay_params = [p for p in trainable if p.dim() < 2]
+        optim_groups = [
+            {'params': decay_params, 'weight_decay': weight_decay},
+            {'params': nodecay_params, 'weight_decay': 0.0}
+        ]
+        num_decay_params = sum(p.numel() for p in decay_params)
+        num_nodecay_params = sum(p.numel() for p in nodecay_params)
+        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
+        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+        # request the fused kernel when this PyTorch's AdamW accepts `fused` and we run on CUDA
+        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+        use_fused = fused_available and device_type == 'cuda'
+        extra_args = {'fused': True} if use_fused else {}
+        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
+        print(f"using fused AdamW: {use_fused}")
+
+        return optimizer
+
+    def estimate_mfu(self, fwdbwd_per_iter, dt):
+        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
+        # flops per iteration, following PaLM paper Appendix B: https://arxiv.org/abs/2204.02311
+        cfg = self.config
+        N = self.get_num_params()
+        L = cfg.n_layer
+        H = cfg.n_head
+        Q = cfg.n_embd//cfg.n_head
+        T = cfg.block_size
+        flops_per_token = 6*N + 12*L*H*Q*T
+        flops_per_fwdbwd = flops_per_token * T
+        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
+        # achieved throughput (per second) relative to the A100 bfloat16 peak of 312 TFLOPS
+        flops_achieved = flops_per_iter * (1.0/dt)
+        flops_promised = 312e12
+        return flops_achieved / flops_promised
+
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+        """
+        Extend the conditioning sequence idx (LongTensor of shape (b,t)) by max_new_tokens
+        sampled tokens, feeding each prediction back in as context for the next one.
+        Most likely you'll want the model in model.eval() mode when calling this.
+        """
+        block_size = self.config.block_size
+        for _ in range(max_new_tokens):
+            # keep at most the last block_size tokens as context
+            context = idx if idx.size(1) <= block_size else idx[:, -block_size:]
+            logits, _ = self(context)
+            # take the final step's logits and apply the temperature
+            logits = logits[:, -1, :] / temperature
+            if top_k is not None:
+                # mask everything below the k-th largest logit
+                kth_best, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < kth_best[:, [-1]]] = -float('Inf')
+            # normalize to probabilities and draw one token per sequence
+            probs = F.softmax(logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)
+            # append the sample and continue
+            idx = torch.cat((idx, next_token), dim=1)
+
+        return idx
+
+# ============================================================================
+# DATA PREPARATION
+# ============================================================================
+
+def ensure_shakespeare_data(data_root="/data"):
+    """Download and tokenize the tiny Shakespeare corpus unless it is already prepared.
+
+    Writes ``train.bin`` and ``val.bin`` (uint16 character ids, 90/10 split)
+    and ``meta.pkl`` (vocab size plus both lookup tables) into
+    ``<data_root>/shakespeare_char``. Returns immediately when all three
+    files already exist. ``input.txt`` is downloaded only when it is missing.
+
+    Args:
+        data_root: Directory that holds (or will hold) ``shakespeare_char``.
+
+    Raises:
+        requests.HTTPError: The download answered with an error status.
+        ValueError: ``input.txt`` exists but is empty.
+    """
+    data_dir = os.path.join(data_root, "shakespeare_char")
+
+    # Check if prepared data already exists
+    train_path = os.path.join(data_dir, "train.bin")
+    val_path = os.path.join(data_dir, "val.bin")
+    meta_path = os.path.join(data_dir, "meta.pkl")
+
+    if os.path.exists(train_path) and os.path.exists(val_path) and os.path.exists(meta_path):
+        print(f"Shakespeare data already prepared in {data_dir}")
+        return
+
+    # Create directory
+    os.makedirs(data_dir, exist_ok=True)
+
+    # Download the tiny shakespeare dataset
+    input_file_path = os.path.join(data_dir, 'input.txt')
+    if not os.path.exists(input_file_path):
+        # Imported here so the function works without `requests` when no download is needed.
+        import requests
+
+        print("Downloading Shakespeare dataset...")
+        data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+        # Fetch and validate BEFORE creating the file: a failed request must not
+        # leave an empty or error-page input.txt that later runs would accept.
+        response = requests.get(data_url, timeout=60)
+        response.raise_for_status()
+        with open(input_file_path, 'w', encoding='utf-8') as f:
+            f.write(response.text)
+
+    # Explicit encoding so the result does not depend on the container's locale.
+    with open(input_file_path, 'r', encoding='utf-8') as f:
+        data = f.read()
+    if not data:
+        raise ValueError(f"{input_file_path} is empty; delete it and rerun to download it again")
+    print(f"length of dataset in characters: {len(data):,}")
+
+    # get all the unique characters that occur in this text
+    chars = sorted(set(data))
+    vocab_size = len(chars)
+    print("all the unique characters:", ''.join(chars))
+    print(f"vocab size: {vocab_size:,}")
+
+    # character <-> integer lookup tables (ids follow sorted character order)
+    stoi = {ch: i for i, ch in enumerate(chars)}
+    itos = {i: ch for i, ch in enumerate(chars)}
+
+    # first 90% of the text is the training split, the rest is validation
+    split_at = int(len(data) * 0.9)
+    train_ids = np.array([stoi[c] for c in data[:split_at]], dtype=np.uint16)
+    val_ids = np.array([stoi[c] for c in data[split_at:]], dtype=np.uint16)
+    print(f"train has {len(train_ids):,} tokens")
+    print(f"val has {len(val_ids):,} tokens")
+
+    # export to bin files
+    train_ids.tofile(train_path)
+    val_ids.tofile(val_path)
+
+    # save the meta information as well, to help us encode/decode later
+    meta = {
+        'vocab_size': vocab_size,
+        'itos': itos,
+        'stoi': stoi,
+    }
+    with open(meta_path, 'wb') as f:
+        pickle.dump(meta, f)
+
+    print("Data preparation complete!")
+
+# ============================================================================
+# TRAINING SCRIPT
+# ============================================================================
+
+def train():
+    """Run the training loop for one process (a single GPU, or one torchrun worker).
+
+    Reads and mutates the module-level CONFIG: ``max_iters`` is derived from
+    ``num_epochs`` and the size of ``train.bin``; ``warmup_iters`` and
+    ``lr_decay_iters`` are filled in when they are None; eval/log settings
+    are shrunk for very short epoch-derived runs. On the master process the
+    loss is evaluated every ``eval_interval`` steps and a checkpoint is
+    written to ``out_dir/ckpt.pt``.
+
+    Raises:
+        ValueError: ``max_iters`` is None and cannot be derived (no training
+            data found, or ``num_epochs`` is None as well).
+    """
+
+    # Load configuration
+    cfg = CONFIG
+
+    # Setup DDP: RANK in the environment means we were launched as a distributed worker
+    ddp = int(os.environ.get('RANK', -1)) != -1
+    if ddp:
+        init_process_group(backend=cfg['backend'])
+        ddp_rank = int(os.environ['RANK'])
+        ddp_local_rank = int(os.environ['LOCAL_RANK'])
+        ddp_world_size = int(os.environ['WORLD_SIZE'])
+        device = f'cuda:{ddp_local_rank}'
+        torch.cuda.set_device(device)
+        master_process = ddp_rank == 0  # only this rank evaluates, logs and checkpoints
+        seed_offset = ddp_rank  # each rank gets its own random seed
+        # the configured accumulation steps are split evenly across the workers
+        assert cfg['gradient_accumulation_steps'] % ddp_world_size == 0
+        gradient_accumulation_steps = cfg['gradient_accumulation_steps'] // ddp_world_size
+    else:
+        # single gpu
+        master_process = True
+        seed_offset = 0
+        ddp_world_size = 1
+        device = cfg['device']
+        gradient_accumulation_steps = cfg['gradient_accumulation_steps']
+
+    tokens_per_iter = gradient_accumulation_steps * ddp_world_size * cfg['batch_size'] * cfg['block_size']
+    print(f"tokens per iteration will be: {tokens_per_iter:,}")
+
+    if master_process:
+        os.makedirs(cfg['out_dir'], exist_ok=True)
+
+    torch.manual_seed(1337 + seed_offset)
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    device_type = 'cuda' if 'cuda' in device else 'cpu'
+
+    # Data setup: prefer the mounted /data volume, fall back to a local ./data directory
+    data_dir = os.path.join("/data" if os.path.exists("/data") else "data", cfg['dataset'])
+
+    # Calculate dataset size and iterations needed for requested epochs
+    train_data_path = os.path.join(data_dir, 'train.bin')
+    if os.path.exists(train_data_path):
+        train_data = np.memmap(train_data_path, dtype=np.uint16, mode='r')
+        dataset_tokens = len(train_data)
+        print(f"Training dataset has {dataset_tokens:,} tokens")
+
+        # Calculate iterations needed for the requested number of epochs
+        if cfg['num_epochs'] is not None:
+            iterations_per_epoch = dataset_tokens / tokens_per_iter
+            cfg['max_iters'] = int(math.ceil(cfg['num_epochs'] * iterations_per_epoch))
+            print(f"For {cfg['num_epochs']} epochs, need {cfg['max_iters']} iterations")
+            print(f"Each epoch is ~{iterations_per_epoch:.1f} iterations")
+
+            # Auto-adjust other parameters based on total iterations
+            if cfg['warmup_iters'] is None:
+                # Default 2% warmup
+                cfg['warmup_iters'] = max(1, int(0.02 * cfg['max_iters']))
+
+            if cfg['lr_decay_iters'] is None:
+                cfg['lr_decay_iters'] = cfg['max_iters']
+
+            # Adjust eval/log intervals for short runs
+            if cfg['max_iters'] < 20:
+                cfg['eval_interval'] = max(1, cfg['max_iters'] // 4)
+                cfg['log_interval'] = 1
+                cfg['eval_iters'] = min(50, cfg['eval_iters'])
+                print(f"Adjusted for short run: eval_interval={cfg['eval_interval']}, log_interval={cfg['log_interval']}")
+
+            # Disable learning rate decay for very short runs
+            if cfg['max_iters'] < 10:
+                cfg['decay_lr'] = False
+                cfg['warmup_iters'] = 0
+                print("Disabled learning rate decay for very short run")
+        del train_data  # Free memory
+    else:
+        if cfg['max_iters'] is None:
+            raise ValueError("Cannot calculate max_iters: training data not found and max_iters not specified")
+
+    # With num_epochs=None nothing above ran. Fail now rather than after a full
+    # first iteration, and give the schedule the same defaults it gets in the
+    # epoch-derived case so get_lr() never compares against None.
+    if cfg['max_iters'] is None:
+        raise ValueError("Cannot determine max_iters: set num_epochs or max_iters in CONFIG")
+    if cfg['warmup_iters'] is None:
+        cfg['warmup_iters'] = max(1, int(0.02 * cfg['max_iters']))
+    if cfg['lr_decay_iters'] is None:
+        cfg['lr_decay_iters'] = cfg['max_iters']
+
+    def get_batch(split):
+        """Sample batch_size random windows of block_size tokens and their next-token targets."""
+        # We recreate np.memmap every batch to avoid a memory leak
+        if split == 'train':
+            data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+        else:
+            data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
+        ix = torch.randint(len(data) - cfg['block_size'], (cfg['batch_size'],))
+        x = torch.stack([torch.from_numpy((data[i:i+cfg['block_size']]).astype(np.int64)) for i in ix])
+        # targets are the inputs shifted one position to the right
+        y = torch.stack([torch.from_numpy((data[i+1:i+1+cfg['block_size']]).astype(np.int64)) for i in ix])
+        if device_type == 'cuda':
+            # pinned host memory allows the non-blocking copy to the GPU
+            x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+        else:
+            x, y = x.to(device), y.to(device)
+        return x, y
+
+    # Init these up here; both are overwritten when resuming from a checkpoint
+    iter_num = 0
+    best_val_loss = 1e9
+
+    # Model init: take the vocabulary size from the dataset's meta.pkl when present
+    meta_path = os.path.join(data_dir, 'meta.pkl')
+    meta_vocab_size = None
+    if os.path.exists(meta_path):
+        with open(meta_path, 'rb') as f:
+            meta = pickle.load(f)
+        meta_vocab_size = meta['vocab_size']
+        print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
+
+    # Model configuration
+    model_args = dict(
+        n_layer=cfg['n_layer'],
+        n_head=cfg['n_head'],
+        n_embd=cfg['n_embd'],
+        block_size=cfg['block_size'],
+        bias=cfg['bias'],
+        vocab_size=meta_vocab_size if meta_vocab_size is not None else 50304,
+        dropout=cfg['dropout']
+    )
+
+    if cfg['init_from'] == 'scratch':
+        print("Initializing a new model from scratch")
+        gptconf = GPTConfig(**model_args)
+        model = GPT(gptconf)
+    elif cfg['init_from'] == 'resume':
+        print(f"Resuming training from {cfg['out_dir']}")
+        ckpt_path = os.path.join(cfg['out_dir'], 'ckpt.pt')
+        checkpoint = torch.load(ckpt_path, map_location=device)
+        checkpoint_model_args = checkpoint['model_args']
+        # the architecture must match the checkpoint; dropout still comes from CONFIG
+        for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
+            model_args[k] = checkpoint_model_args[k]
+        gptconf = GPTConfig(**model_args)
+        model = GPT(gptconf)
+        state_dict = checkpoint['model']
+        # strip the '_orig_mod.' key prefix if the saved state dict carries it
+        unwanted_prefix = '_orig_mod.'
+        for k,v in list(state_dict.items()):
+            if k.startswith(unwanted_prefix):
+                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+        model.load_state_dict(state_dict)
+        iter_num = checkpoint['iter_num']
+        best_val_loss = checkpoint['best_val_loss']
+
+    # Move model to device
+    model.to(device)
+
+    # Initialize a GradScaler; it is only enabled for float16
+    scaler = torch.cuda.amp.GradScaler(enabled=(cfg['dtype'] == 'float16'))
+
+    # Optimizer
+    optimizer = model.configure_optimizers(
+        cfg['weight_decay'], cfg['learning_rate'], (cfg['beta1'], cfg['beta2']), device_type
+    )
+    if cfg['init_from'] == 'resume' and 'optimizer' in checkpoint:
+        optimizer.load_state_dict(checkpoint['optimizer'])
+    checkpoint = None # free up memory
+
+    # Compile the model
+    if cfg['compile']:
+        print("compiling the model... (takes a ~minute)")
+        model = torch.compile(model)
+
+    # Wrap model into DDP container
+    if ddp:
+        model = DDP(model, device_ids=[ddp_local_rank])
+
+    # Training helpers: autocast to the configured dtype everywhere except on CPU
+    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[cfg['dtype']]
+    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+
+    @torch.no_grad()
+    def estimate_loss():
+        """Return the mean loss over eval_iters random batches for both the train and val splits."""
+        out = {}
+        model.eval()
+        for split in ['train', 'val']:
+            losses = torch.zeros(cfg['eval_iters'])
+            for k in range(cfg['eval_iters']):
+                X, Y = get_batch(split)
+                with ctx:
+                    logits, loss = model(X, Y)
+                losses[k] = loss.item()
+            out[split] = losses.mean()
+        model.train()
+        return out
+
+    # Learning rate decay scheduler (cosine with warmup)
+    def get_lr(it):
+        """Learning rate for iteration `it`: linear warmup, cosine decay, then min_lr."""
+        # Linear warmup
+        if it < cfg['warmup_iters']:
+            return cfg['learning_rate'] * (it + 1) / (cfg['warmup_iters'] + 1)
+        # At or past lr_decay_iters, return min learning rate. The cosine
+        # reaches exactly min_lr at it == lr_decay_iters, so `>=` yields the
+        # same value and also avoids 0/0 when warmup_iters == lr_decay_iters.
+        if it >= cfg['lr_decay_iters']:
+            return cfg['min_lr']
+        # In between, use cosine decay
+        decay_ratio = (it - cfg['warmup_iters']) / (cfg['lr_decay_iters'] - cfg['warmup_iters'])
+        assert 0 <= decay_ratio <= 1
+        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
+        return cfg['min_lr'] + coeff * (cfg['learning_rate'] - cfg['min_lr'])
+
+    # Logging
+    if cfg['wandb_log'] and master_process:
+        import wandb
+        wandb.init(project=cfg['wandb_project'], name=cfg['wandb_run_name'], config=cfg)
+
+    # Training loop
+    X, Y = get_batch('train')  # first batch; later ones are fetched inside the loop
+    t0 = time.time()
+    local_iter_num = 0  # iterations done by this process (iter_num may start > 0 on resume)
+    raw_model = model.module if ddp else model  # unwrap the DDP container
+    running_mfu = -1.0  # -1.0 means "no estimate yet"
+
+    while True:
+        # Determine and set the learning rate for this iteration
+        lr = get_lr(iter_num) if cfg['decay_lr'] else cfg['learning_rate']
+        for param_group in optimizer.param_groups:
+            param_group['lr'] = lr
+
+        # Evaluate the loss on train/val sets and write checkpoints
+        if iter_num % cfg['eval_interval'] == 0 and master_process:
+            losses = estimate_loss()
+            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+            if cfg['wandb_log']:
+                wandb.log({
+                    "iter": iter_num,
+                    "train/loss": losses['train'],
+                    "val/loss": losses['val'],
+                    "lr": lr,
+                    "mfu": running_mfu*100,
+                })
+            if losses['val'] < best_val_loss or cfg['always_save_checkpoint']:
+                best_val_loss = losses['val']
+                # nothing is saved at iteration 0
+                if iter_num > 0:
+                    checkpoint = {
+                        'model': raw_model.state_dict(),
+                        'optimizer': optimizer.state_dict(),
+                        'model_args': model_args,
+                        'iter_num': iter_num,
+                        'best_val_loss': best_val_loss,
+                        'config': cfg,
+                    }
+                    print(f"saving checkpoint to {cfg['out_dir']}")
+                    torch.save(checkpoint, os.path.join(cfg['out_dir'], 'ckpt.pt'))
+
+        if iter_num == 0 and cfg['eval_only']:
+            break
+
+        # Forward backward update, accumulating gradients over the micro steps
+        for micro_step in range(gradient_accumulation_steps):
+            if ddp:
+                # enable DDP's gradient sync flag only for the last micro step
+                model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
+            with ctx:
+                logits, loss = model(X, Y)
+                # scale so the accumulated gradient is the mean over micro steps
+                loss = loss / gradient_accumulation_steps
+            # fetch the next batch before running backward()
+            X, Y = get_batch('train')
+            scaler.scale(loss).backward()
+
+        # Clip gradients (unscale first so the threshold applies to the true gradient values)
+        if cfg['grad_clip'] != 0.0:
+            scaler.unscale_(optimizer)
+            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg['grad_clip'])
+
+        # Step the optimizer
+        scaler.step(optimizer)
+        scaler.update()
+        optimizer.zero_grad(set_to_none=True)
+
+        # Timing and logging
+        t1 = time.time()
+        dt = t1 - t0
+        t0 = t1
+        if iter_num % cfg['log_interval'] == 0 and master_process:
+            # undo the accumulation scaling for reporting; this is the last micro step's loss only
+            lossf = loss.item() * gradient_accumulation_steps
+            # skip the first few iterations before estimating MFU
+            if local_iter_num >= 5:
+                mfu = raw_model.estimate_mfu(cfg['batch_size'] * gradient_accumulation_steps, dt)
+                running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
+            print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
+
+        iter_num += 1
+        local_iter_num += 1
+
+        # Termination conditions
+        if iter_num > cfg['max_iters']:
+            break
+
+    if ddp:
+        destroy_process_group()
+
+# ============================================================================
+# MODAL SETUP
+# ============================================================================
+
+# Modal application that owns the remote training function
+app = modal.App("nanogpt-training")
+
+# Container image: slim Debian with Python 3.11 and the pip packages listed here
+image = modal.Image.debian_slim(python_version="3.11").pip_install(
+    "numpy", "torch", "transformers", "wandb", "requests",
+)
+
+# Named volume, created on first use and mounted by train_modal at /data
+volume = modal.Volume.from_name("nanogpt-data", create_if_missing=True)
+
+# Remote entry point: one container with N_GPUS GPUs and the volume mounted at /data
+@app.function(
+    gpu=f"{GPU_TYPE}:{N_GPUS}",
+    volumes={"/data": volume},
+    timeout=60 * 60 * 6,  # 6 hours
+    image=image,
+    # the wandb secret is only attached when wandb logging is switched on
+    secrets=[modal.Secret.from_name("wandb-secret")] if CONFIG.get("wandb_log", False) else [],
+)
+def train_modal():
+    """Prepare the dataset, then run this same file under torchrun with N_GPUS processes.
+
+    Returns the string "Training completed"; a non-zero torchrun exit status
+    raises subprocess.CalledProcessError (check=True).
+    """
+    print(f"Starting Modal training with {N_GPUS} {GPU_TYPE} GPUs")
+    print(f"Dataset: {CONFIG['dataset']}")
+
+    # make sure train.bin / val.bin / meta.pkl exist under /data
+    ensure_shakespeare_data("/data")
+
+    # torchrun is given a script path, so place a copy of this file under /tmp
+    launch_script = "/tmp/train_modal.py"
+    Path(launch_script).write_text(Path(__file__).read_text())
+
+    # one worker process per GPU
+    torchrun_cmd = ["torchrun", f"--nproc-per-node={N_GPUS}", launch_script]
+    print(f"Running command: {' '.join(torchrun_cmd)}")
+
+    # run from the directory that holds the copied script
+    os.chdir("/tmp")
+    subprocess.run(torchrun_cmd, check=True)
+
+    print("Training completed successfully!")
+    return "Training completed"
+
+# Script entry point: RANK in the environment is taken as the sign that torchrun launched us
+if __name__ == "__main__":
+    if "RANK" not in os.environ:
+        # started directly with `python`: explain how to launch and exit with an error
+        print("This script should be run with torchrun or through Modal")
+        print("Examples:")
+        print("  Local: torchrun --nproc-per-node=4 train_modal_standalone.py")
+        print("  Modal: modal run train_modal_standalone.py::train_modal")
+        sys.exit(1)
+
+    # distributed worker: run the training loop
+    train()