ultralytics/working_dir/multi_node.sh

34 lines
1 KiB
Bash
Executable file

#!/bin/bash
#
# Launch the training script through torch.distributed.run, optionally
# across several nodes.
#
# Rendezvous settings may be overridden from the environment:
#   MASTER_ADDR    address of the rank-0 node
#   MASTER_PORT    rendezvous port              (default: 29500)
#   NNODES         number of participating nodes
#   GPUS_PER_NODE  worker processes per node
#   NODE_RANK      rank of this node
#
# When SLURM_NODELIST is set, values not supplied by the caller are taken
# from the Slurm environment (SLURM_NODEID, SLURM_NNODES,
# SLURM_GPUS_ON_NODE, and the first host reported by scontrol). Otherwise
# single-node localhost defaults are used.
#
# The training script, model and config paths are relative, so they are
# resolved against the current working directory.
set -euo pipefail

TRAIN_SCRIPT="working_dir/yolo_train_mn.py"

if [[ -n "${SLURM_NODELIST:-}" ]]; then
  if [[ -z "${MASTER_ADDR:-}" ]]; then
    # Capture the whole host list and keep its first line via parameter
    # expansion. Piping scontrol into `head -n 1` can terminate scontrol
    # with SIGPIPE, which pipefail + errexit would turn into an abort.
    slurm_hosts=$(scontrol show hostnames "$SLURM_NODELIST")
    MASTER_ADDR=${slurm_hosts%%$'\n'*}
  fi
  NODE_RANK=${NODE_RANK:-${SLURM_NODEID:-0}}
  NNODES=${NNODES:-${SLURM_NNODES:-1}}
  GPUS_PER_NODE=${GPUS_PER_NODE:-${SLURM_GPUS_ON_NODE:-1}}
else
  MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
  NODE_RANK=${NODE_RANK:-0}
  NNODES=${NNODES:-1}
  GPUS_PER_NODE=${GPUS_PER_NODE:-1}
fi
MASTER_PORT=${MASTER_PORT:-29500}

# Fail early with a clear message instead of passing an empty address on.
: "${MASTER_ADDR:?could not determine MASTER_ADDR}"

export MASTER_ADDR MASTER_PORT NNODES GPUS_PER_NODE NODE_RANK TRAIN_SCRIPT

# NOTE(review): python_ultra_2506 is presumably a Python interpreter or a
# container wrapper around one, since it is invoked with `-m` -- confirm.
TORCHRUN=(~/containers/python_ultra_2506 -m torch.distributed.run)

# The final argument carries no trailing backslash, so nothing that follows
# this command can be absorbed into its argument list.
"${TORCHRUN[@]}" \
  --nnodes="$NNODES" \
  --nproc_per_node="$GPUS_PER_NODE" \
  --node_rank="$NODE_RANK" \
  --master_addr="$MASTER_ADDR" \
  --master_port="$MASTER_PORT" \
  "$TRAIN_SCRIPT" \
  --model ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml \
  --config working_dir/rtdetr_train_pr.yaml \
  --name rtdetr-resnet50_mn_trial \
  --train "data=coco128.yaml"