Skip to content

Instantly share code, notes, and snippets.

@dcasati
Last active May 24, 2024 10:42
Show Gist options
  • Select an option

  • Save dcasati/c3176ed3d8c14f017bf6585805fc878f to your computer and use it in GitHub Desktop.

Select an option

Save dcasati/c3176ed3d8c14f017bf6585805fc878f to your computer and use it in GitHub Desktop.

Revisions

  1. dcasati revised this gist Sep 25, 2023. 1 changed file with 15 additions and 15 deletions.
    30 changes: 15 additions & 15 deletions run.sh
    Original file line number Diff line number Diff line change
    @@ -1,26 +1,26 @@
    #!/usr/bin/env bash
    # NOTE(review): this listing appears to be a rendered diff hunk with the
    # removed and added lines shown together and no +/- markers: the hunk header
    # reports 26 lines per side, yet 41 lines are listed. Read the plain
    # assignments and the first --node_rank as the old side, and the export lines
    # and the second --node_rank as the new side — confirm against the raw gist.
    set -x

    # Plain assignments: these set shell variables only. The same names are
    # assigned again with `export` further down.
    TORCH_CPP_LOG_LEVEL=INFO
    TORCH_DISTRIBUTED_DEBUG=DETAIL
    LOGLEVEL=DEBUG
    NCCL_DEBUG=warn
    LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH
    LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so
    NCCL_IB_PCI_RELAXED_ORDERING=1
    UCX_IB_PCI_RELAXED_ORDERING=on
    UCX_MEM_EVENTS=n
    NCCL_IB_DISABLE=0
    UCX_TLS=tcp
    UCX_NET_DEVICES=eth0
    CUDA_DEVICE_ORDER=PCI_BUS_ID
    NCCL_SOCKET_IFNAME=eth0
    # Exported form of the same settings, so they are placed in the environment
    # of commands started afterwards (torchrun below).
    # NOTE(review): if both groups really executed in sequence, the two library
    # directories would be prepended to LD_LIBRARY_PATH twice.
    export TORCH_CPP_LOG_LEVEL=INFO
    export TORCH_DISTRIBUTED_DEBUG=DETAIL
    export LOGLEVEL=DEBUG
    export NCCL_DEBUG=warn
    export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH
    export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so
    export NCCL_IB_PCI_RELAXED_ORDERING=1
    export UCX_IB_PCI_RELAXED_ORDERING=on
    export UCX_MEM_EVENTS=n
    export NCCL_IB_DISABLE=0
    export UCX_TLS=tcp
    export UCX_NET_DEVICES=eth0
    export CUDA_DEVICE_ORDER=PCI_BUS_ID
    export NCCL_SOCKET_IFNAME=eth0

    # Run oci_launch_scripts/k8s_nccl_job.py through torchrun with 8 processes
    # per node across 2 nodes, at most 1 restart, and the NCCL backend.
    # --node_rank is listed twice: first from the script's first positional
    # argument ($1), then as the literal 1.
    # NOTE(review): the literal 1 ignores the caller's argument, so every node
    # started with this script would claim rank 1 — confirm this is intended.
    torchrun \
    --nproc_per_node=8 \
    --nnodes=2 \
    --max-restarts=1 \
    --node_rank=$1 \
    --node_rank=1 \
    --rdzv-id=test \
    --rdzv_endpoint=task-rahuls-healthy-league-service:8081 \
    oci_launch_scripts/k8s_nccl_job.py --backend=nccl
  2. dcasati created this gist Sep 25, 2023.
    26 changes: 26 additions & 0 deletions run.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,26 @@
    #!/usr/bin/env bash
    #
    # Launch one node of a 2-node torchrun job (8 processes per node) that runs
    # oci_launch_scripts/k8s_nccl_job.py with the NCCL backend.
    #
    # Usage: run.sh NODE_RANK
    #   NODE_RANK  value handed to torchrun as --node_rank.
    set -x

    # Stop with a usage message when the rank is missing, rather than passing an
    # empty --node_rank= to torchrun.
    node_rank=${1:?usage: run.sh NODE_RANK}

    # These settings must be exported: a plain NAME=value assignment only sets a
    # shell variable and is not placed in the environment of the torchrun child
    # process (unless the name happened to be exported already by the caller).
    export TORCH_CPP_LOG_LEVEL=INFO
    export TORCH_DISTRIBUTED_DEBUG=DETAIL
    export LOGLEVEL=DEBUG
    export NCCL_DEBUG=warn
    # Append the inherited search path only when it is non-empty; an
    # unconditional trailing ':' would add an empty entry, which the dynamic
    # loader treats as the current working directory.
    export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/:/usr/local/nccl-rdma-sharp-plugins/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
    export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so
    export NCCL_IB_PCI_RELAXED_ORDERING=1
    export UCX_IB_PCI_RELAXED_ORDERING=on
    export UCX_MEM_EVENTS=n
    export NCCL_IB_DISABLE=0
    export UCX_TLS=tcp
    export UCX_NET_DEVICES=eth0
    export CUDA_DEVICE_ORDER=PCI_BUS_ID
    export NCCL_SOCKET_IFNAME=eth0

    torchrun \
    --nproc_per_node=8 \
    --nnodes=2 \
    --max-restarts=1 \
    --node_rank="${node_rank}" \
    --rdzv-id=test \
    --rdzv_endpoint=task-rahuls-healthy-league-service:8081 \
    oci_launch_scripts/k8s_nccl_job.py --backend=nccl