Running pytorch.distributed on Multiple Nodes
The key thing to know is that srun is like a super-ssh: running srun cmd effectively executes something like ssh node cmd on each node in the allocation.
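For example, a quick way to see this behaviour on a hypothetical two-node interactive allocation (node names are illustrative):
salloc --nodes=2 --ntasks-per-node=1
srun hostname
# prints one hostname per allocated node, e.g. node001 and node002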
Initial Solution
task.slurm
#!/bin/bash
#SBATCH --partition=TrixieMain
#SBATCH --account=dt-mtp
#SBATCH --time=00:20:00
#SBATCH --job-name=pytorch.distributed
#SBATCH --comment="Helping Harry with pytorch distributed on multiple nodes."
#SBATCH --gres=gpu:4
##SBATCH --ntasks=2
#SBATCH --wait-all-nodes=1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=6
#SBATCH --output=%x-%j.out
# USEFUL Bookmarks
# [Run PyTorch Data Parallel training on ParallelCluster](https://www.hpcworkshops.com/08-ml-on-parallelcluster/03-distributed-data-parallel.html)
# [slurm SBATCH - Multiple Nodes, Same SLURMD_NODENAME](https://stackoverflow.com/a/51356947)
# Throttle tqdm progress updates so they don't flood the log.
export TQDM_MININTERVAL=90
# Rendezvous endpoint: the first node in the allocation, plus a port in
# [30000, 50000] derived deterministically from the job id.
head_node_ip=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
readonly head_node_ip
readonly head_node_port=$(( SLURM_JOBID % (50000 - 30000 + 1) + 30000 ))
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Give each task its own log: %x = job name, %j = job id, %t = task (node) rank.
readonly srun='srun --output=%x-%j.%t.out'
# Dump the environment for debugging.
env
# Pass the rendezvous endpoint computed above to the per-node script.
$srun bash \
    task.sh \
    "$head_node_ip" \
    "$head_node_port" &
wait
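To submit the job (assuming both scripts sit in the current directory; the job id below is illustrative):
sbatch task.slurm
# The batch script logs to pytorch.distributed-<jobid>.out; because of the
# srun --output=%x-%j.%t.out override above, each node's task additionally
# logs to pytorch.distributed-<jobid>.0.out and pytorch.distributed-<jobid>.1.out.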
task.sh
This script is executed once on each node. Note that we activate the conda environment inside this script so that each node/worker gets the proper environment.
#!/bin/bash
# USEFUL Bookmarks
# [Run PyTorch Data Parallel training on ParallelCluster](https://www.hpcworkshops.com/08-ml-on-parallelcluster/03-distributed-data-parallel.html)
# [slurm SBATCH - Multiple Nodes, Same SLURMD_NODENAME](https://stackoverflow.com/a/51356947)
#module load conda/3-24.9.0
#source activate molecule
source /gpfs/projects/DT/mtp/WMT20/opt/miniforge3/bin/activate
conda activate pytorch-1.7.1
# The rendezvous endpoint is handed to this script by task.slurm.
readonly MASTER_ADDR_JOB=$1
readonly MASTER_PORT_JOB=$2
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
env
# With --ntasks-per-node=1, SLURM_NTASKS equals the number of nodes.
python \
    -m torch.distributed.launch \
    --nproc_per_node=4 \
    --nnodes="$SLURM_NTASKS" \
    --node_rank="$SLURM_NODEID" \
    --master_addr="$MASTER_ADDR_JOB" \
    --master_port="$MASTER_PORT_JOB" \
    main.py \
    --batch_size 128 \
    --learning_rate 5e-5 &
wait
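If the rendezvous misbehaves, a minimal single-node sanity check is to launch a throwaway script that prints the variables torch.distributed.launch exports to each worker (probe.py and the port are our own illustrative choices):
cat > probe.py <<'EOF'
import os
print(os.environ.get("RANK"), os.environ.get("WORLD_SIZE"),
      os.environ.get("MASTER_ADDR"), os.environ.get("MASTER_PORT"))
EOF
python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 \
    --node_rank=0 --master_addr=127.0.0.1 --master_port=34567 probe.py
# Expect four lines, ranks 0-3, all with WORLD_SIZE 4 and the same addr/port.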
Accelerate/HuggingFace Specific
#!/bin/bash
# [SLURM/ACCELERATE](https://github.com/huggingface/accelerate/blob/main/examples/slurm/submit_multinode.sh)
# [MNIST](https://huggingface.co/blog/pytorch-ddp-accelerate-transformers)
#SBATCH --partition=TrixieMain
#SBATCH --account=dt-mtp
#SBATCH --time=12:00:00
#SBATCH --job-name=MNIST.distributed
#SBATCH --gres=gpu:4
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --output=%x-%j.out
#SBATCH --signal=B:USR1@30
#SBATCH --requeue
# Requeueing on Trixie
# [source](https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/)
# [source](https://hpc-uit.readthedocs.io/en/latest/jobs/examples.html#how-to-recover-files-before-a-job-times-out)
function _requeue {
    echo "BASH - trapping signal 10 - requeueing $SLURM_JOBID"
    date
    # This generically requeues any job. Since we are using XLM, which is
    # Slurm-aware, XLM can save its model before the job is requeued.
    scontrol requeue "$SLURM_JOBID"
}
if [[ -n "$SLURM_JOBID" ]]; then
    trap _requeue USR1
fi
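# To test the trap without waiting for the time limit, signal the batch step
# by hand from a login node (the job id is illustrative):
#   scancel --signal=USR1 --batch 123456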
# Keep a copy of the code in the job's log in case it changes between runs.
# (head with a line count larger than either file prints both files, each
# preceded by a "==> filename <==" header.)
head -n 112312 "$0" my_huggingface_trainer.py
# Setup your working environment.
source setup_env.sh ""
# Throttle tqdm progress updates so they don't flood the log.
export TQDM_MININTERVAL=90
# These are required to set up the distributed framework: the first node in
# the allocation, plus a port in [30000, 50000] derived from the job id.
head_node_ip=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
readonly head_node_ip
readonly head_node_port=$(( SLURM_JOBID % (50000 - 30000 + 1) + 30000 ))
# Dump the environment.
( set -o posix ; set )
# NOTE: SLURM_NODEID must be expanded on each node, not at submission time, so
# the command is wrapped in bash -c and that one expansion is escaped.
# Backgrounding srun and calling wait lets the USR1 trap above fire promptly.
srun bash -c "accelerate launch \
    --num_processes=$((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \
    --num_machines=$SLURM_NNODES \
    --rdzv_backend=c10d \
    --main_process_ip=$head_node_ip \
    --main_process_port=$head_node_port \
    --machine_rank=\$SLURM_NODEID \
    my_huggingface_trainer.py" &
wait
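With --nodes=2 and --gres=gpu:4, --num_processes works out to 2 * 4 = 8, i.e. one process per GPU across the allocation. Submission is the usual sbatch (the script name here is our own):
sbatch mnist_accelerate.slurm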