################################################################################
# Runbook: move a DLAMI-style host to NVIDIA driver 550 + CUDA 12.4 + cuDNN 9.1,
# then build PyTorch v2.3.1, torchvision v0.18.1 and torchaudio v2.3.1 from
# source inside a conda environment.
#
# This is meant to be executed section by section, not in one unattended run:
# it contains interactive `sudo vim` edits, a `sudo reboot`, and `conda activate`
# calls (which need an initialised interactive conda shell).
################################################################################

# ------------------------------------------------------------------------------
# Modify apt sources lists
# ------------------------------------------------------------------------------
# Absolute paths are used instead of `cd` so the working directory stays
# writable for the unprivileged `wget` further down.
# (The original notes listed cuda-ubuntu2004-11-7-local.list twice.)
sudo rm -- \
  /etc/apt/sources.list.d/gds-11-7.conf \
  /etc/apt/sources.list.d/cuda-12-3.conf \
  /etc/apt/sources.list.d/cuda-12-2.conf \
  /etc/apt/sources.list.d/cuda-12-1.conf \
  /etc/apt/sources.list.d/989_cuda-11.conf \
  /etc/apt/sources.list.d/cuda-ubuntu2004-11-7-local.list

# ------------------------------------------------------------------------------
# Modify apt preferences
# ------------------------------------------------------------------------------
sudo rm -- \
  /etc/apt/preferences.d/cuda-repository-pin-600 \
  /etc/apt/preferences.d/nvidia-fabricmanager

# ------------------------------------------------------------------------------
# Startup shell environment variables
# ------------------------------------------------------------------------------
# Comment out both lines (presumably the LD_LIBRARY_PATH and PATH exports;
# they are re-added in their final form in the dlami.sh step further down).
sudo vim /etc/profile.d/dlami.sh

# In /etc/environment:
#   1. Add /usr/local/cuda/bin: to the front of PATH=
#   2. Add a new line:
#      LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# NOTE(review): /etc/environment is normally read by pam_env as plain KEY=VALUE
# pairs rather than by a shell, so the ${VAR:+...} part is probably NOT expanded
# there -- confirm, and consider writing the literal value instead.
sudo vim /etc/environment

# ------------------------------------------------------------------------------
# Add nvidia repo sources.
# ------------------------------------------------------------------------------
# NOTE(review): this is the ubuntu2204 repo, while the local list removed above
# was named ubuntu2004 -- confirm it matches the host release.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo apt install ./cuda-keyring_1.1-1_all.deb

# ------------------------------------------------------------------------------
# nvidia-driver-550
# ------------------------------------------------------------------------------
sudo apt update
sudo apt install nvidia-driver-550
# Nothing after this line runs in the same session; resume from the next
# section once the machine is back up.
sudo reboot

# ------------------------------------------------------------------------------
# CUDA 12.4
# ------------------------------------------------------------------------------
sudo apt install cuda-toolkit-12-4
# Register the CUDA library directory with the dynamic linker. The grep guard
# keeps the entry from being appended again when this section is re-run.
grep -qxF '/usr/local/cuda/lib64' /etc/ld.so.conf ||
  sudo bash -c "echo '/usr/local/cuda/lib64' >> /etc/ld.so.conf"
sudo ldconfig
# On a new line, type (same line as in the /etc/environment step above):
#   LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
sudo vim /etc/environment
# Single quotes are deliberate: $PATH must be expanded when ~/.bashrc is
# sourced, not now.
printf '%s\n' 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
# NOTE(review): this makes every shell that sources ~/.bashrc run `sudo mount`
# against the relative directory `efs`; an /etc/fstab entry may be a better fit
# -- confirm intent.
printf '%s\n' 'sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport fs-67607860.efs.us-west-2.amazonaws.com:/ efs' >> ~/.bashrc

# ------------------------------------------------------------------------------
# cuDNN 9.1 for CUDA 12.4
# ------------------------------------------------------------------------------
# The version patterns are quoted so the shell hands the `*` to apt untouched
# instead of trying to match it against files in the current directory.
sudo apt install 'cudnn9-cuda-12=9.1*'
sudo apt install 'libcudnn9-samples=9.1*'

# ------------------------------------------------------------------------------
# NCCL, etc
# ------------------------------------------------------------------------------
sudo apt remove nvidia-fabricmanager-535
# CUSparse and CUSparse-LT
sudo apt install libcusparse-12-4 libcusparse-dev-12-4 libcusparselt0 libcusparselt-dev
# NVIDIA Fabric Manager for NVLink/NVSwitch
sudo apt install nvidia-fabricmanager-550 nvidia-fabricmanager-dev-550 cuda-drivers-fabricmanager-550
# NCCL
sudo apt install libnccl2 libnccl-dev
# Miscellaneous NVIDIA management tools
sudo apt install 'libxnvctrl0=550.*' 'nvidia-settings=550.*'
# 2.5 (Optional) NVIDIA Docker
sudo apt install nvidia-container-toolkit

# ------------------------------------------------------------------------------
# Start NVIDIA Fabric Manager service to avoid Error 802 - System Not Initialized
# ------------------------------------------------------------------------------
sudo systemctl enable nvidia-fabricmanager.service
sudo systemctl start nvidia-fabricmanager.service

# ------------------------------------------------------------------------------
# Edit dlami profile script to include the following
# ------------------------------------------------------------------------------
sudo vim /etc/profile.d/dlami.sh
# The two exports between the ######### markers are the lines to place in
# dlami.sh; executing them here also applies them to the current shell.
#########
export LD_LIBRARY_PATH=/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/cuda:/usr/local/cuda/targets/x86_64-linux/lib/:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export PATH=/usr/local/cuda/bin:/usr/local/cuda/include${PATH:+:$PATH}
#########

# ------------------------------------------------------------------------------
# build pytorch
# ------------------------------------------------------------------------------
# 1. Dependencies
conda create -n clean_pytorch_ffmpeg_build \
  cmake ninja intel::mkl-static intel::mkl-include astunparse \
  "expecttest!=0.2.0" hypothesis numpy psutil pyyaml requests setuptools \
  "typing-extensions>=4.8.0" sympy filelock networkx jinja2 fsspec
conda activate clean_pytorch_ffmpeg_build
conda install -c pytorch magma-cuda124
pip install types-dataclasses "optree>=0.9.1" lark

# 2. PyTorch sources
cd && git clone --recursive --single-branch --branch v2.3.1 https://github.com/pytorch/pytorch.git && cd pytorch
git submodule sync
git submodule update --init --recursive

################################################################################
# TODO: Monkey-patch ${HOME}/pytorch/aten/src/ATen/core/boxing/impl/boxing.h (line 36-48) according to
#       (NOTE(review): the reference this TODO points to is missing from the notes.)
################################################################################

# Build
export TORCH_CUDA_ARCH_LIST="8.0" # NOTE: For V100, it's 7.0. See https://developer.nvidia.com/cuda-gpus
export _GLIBCXX_USE_CXX11_ABI=1
# Falls back to the conda installation root when no environment is active.
# `which` (a PATH lookup) is kept on purpose: in an initialised shell `conda`
# is a shell function, so `command -v conda` would not return a file path.
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-$(dirname "$(which conda)")/../}"
export USE_SYSTEM_NCCL=1
export NCCL_ROOT=/usr
export NCCL_INCLUDE_DIR=/usr/include # Also need this for suppressing "COULD NOT FIND NCCL"
# Fixes ImportError: ${CONDA_PREFIX}/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.32' not found
# (required by ${CONDA_PREFIX}/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
# The :? guard aborts the command when no conda env is active, instead of
# pointing the symlink at /lib/libstdc++.so.6.
ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 "${CONDA_PREFIX:?activate the conda environment first}/lib/libstdc++.so.6"
python setup.py clean && echo "Done Cleaning"
# pipefail (scoped to the subshell) makes the pipeline report setup.py's exit
# status rather than tee's, so the success message only appears on a real success.
(set -o pipefail; python setup.py install |& tee install_pytorch.log) && echo "DONE building pytorch" # Wait 10 mins for it to finish.

# Test
cd # Need to get out of the build directory. Otherwise bad things happen.
python -c "import torch; print(torch.cuda.is_available()); exit()"
# The notes ran the following in an interactive Python shell; a heredoc with
# explicit print() calls gives the same checks without leaving the shell.
python - <<'PY'
import torch
print(torch.rand(2, 3, device='cuda') @ torch.rand(3, 2, device='cuda'))  # Check CUDA is working
print(torch.svd(torch.rand(3, 3, device='cuda')))  # Check MAGMA-CUDA is working
PY

# ------------------------------------------------------------------------------
# torchvision
# ------------------------------------------------------------------------------
cd && git clone --recursive --single-branch --branch v0.18.1 https://github.com/pytorch/vision.git && cd vision
conda activate clean_pytorch_ffmpeg_build
export TORCH_CUDA_ARCH_LIST="8.0" # NOTE: For V100, it's 7.0. See https://developer.nvidia.com/cuda-gpus
export TORCHVISION_INCLUDE=/usr/local/include:/usr/local/include/ffnvcodec:/usr/local/cuda/include # for cuviddec.h and nvcuvid.h
export TORCHVISION_LIBRARY=/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/local/lib:/usr/local/cuda/lib64 # for libnvcuvid.so
export _GLIBCXX_USE_CXX11_ABI=1
python setup.py install

# ------------------------------------------------------------------------------
# torchaudio
# ------------------------------------------------------------------------------
cd && git clone --recursive --single-branch --branch v2.3.1 https://github.com/pytorch/audio.git && cd audio
git submodule sync
git submodule update --init --recursive
export USE_CUDA=1
export USE_OPENMP=1
# The checkout (which also holds install_torchaudio.log) is removed only after
# a successful build, so a failed build keeps its sources and log for debugging.
(set -o pipefail; python setup.py install | tee install_torchaudio.log) && cd && rm -rf audio

# ------------------------------------------------------------------------------
# transformers and other huggingface libraries
# ------------------------------------------------------------------------------
# (The original notes listed peft twice.)
pip install transformers accelerate safetensors peft huggingface_hub timm

# ------------------------------------------------------------------------------
# go from starter env
# ------------------------------------------------------------------------------
conda create -n blip2 --clone clean_pytorch_ffmpeg_build
conda activate blip2
# Same libstdc++ fix as in the PyTorch build section, applied to the cloned env
# (fixes the `GLIBCXX_3.4.32' not found ImportError from libtorch_python.so).
ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 "${CONDA_PREFIX:?activate the conda environment first}/lib/libstdc++.so.6"

# pip install project-specific dependencies for blip2
# remove open3d from requirements.txt
# NOTE(review): `-e .` installs whatever project is in the current directory;
# the notes do not show a cd into the blip2 checkout -- do that first.
pip install -e . --verbose