#!/usr/bin/zsh
# submit_job_venv_single-node.sh
#
# Slurm batch script: requests a single node with GPUs and runs
# train_model.py with the 'mirrored' strategy.
# The interpreter line must be the very first line of the file, otherwise
# sbatch refuses the script and a direct invocation would try to execute
# the file name as a command.
############################################################
### Slurm flags
############################################################

# Resource request parsed by sbatch. Keep all #SBATCH lines above the first
# executable command of the script.

# Wall-clock time limit for the job (HH:MM:SS).
#SBATCH --time=00:15:00
# Partition (queue) the job is submitted to.
#SBATCH --partition=c23g
# Number of nodes to allocate.
#SBATCH --nodes=1
# Number of tasks per node.
# NOTE(review): this script starts python once, directly (no srun) - confirm
# that two tasks are really intended for this single-process launch.
#SBATCH --ntasks-per-node=2
# CPU cores reserved for each task.
#SBATCH --cpus-per-task=24
# Generic resources: two GPUs.
#SBATCH --gres=gpu:2

############################################################
### Load modules or software
############################################################

# TODO: load/activate your desired modules and virtual environment

############################################################
### Parameters and Settings
############################################################

# Log where the job is running: the node list assigned by Slurm, the host
# executing this script, and the GPU status as reported by nvidia-smi.
printf 'Job nodes: %s\n' "${SLURM_JOB_NODELIST}"
printf 'Current machine: %s\n' "$(hostname)"
nvidia-smi

# NCCL settings: INFO-level debug output, and multi-threading for
# NCCL socket communication (8 threads).
export NCCL_DEBUG=INFO NCCL_SOCKET_NTHREADS=8

# TensorFlow settings: log level 1 disables info messages; GPU thread mode
# is set to 'gpu_private'.
export TF_CPP_MIN_LOG_LEVEL=1 TF_GPU_THREAD_MODE='gpu_private'

############################################################
### Execution (Model Training)
############################################################

# Set the required environment variables in the current shell, then run
# the python training script.
#
# set_vars.sh is looked up in the current working directory. Abort early if
# it is not readable, so that training never starts without the variables
# it requires.
if [ ! -r ./set_vars.sh ]; then
    echo "ERROR: set_vars.sh not found or not readable in $(pwd)" >&2
    exit 1
fi
source ./set_vars.sh

# The training script is the last command, so its exit status becomes the
# exit status of the job script.
python -W ignore train_model.py --strategy 'mirrored'