Verified Commit 0a7e2b4b authored by Jannis Klinkenberg

added MirroredStrategy example

parent f17d287f
limit_gpu_visibility.sh (new file)
#!/usr/bin/zsh
# limit visible devices to ensure correct device selection and number of replicas in TensorFlow MultiWorkerMirroredStrategy and Horovod
export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
\ No newline at end of file
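For reference, a minimal sketch (assuming TensorFlow is available; this file is not part of the commit) of how the effect of this script can be verified from inside each Slurm task:

# hypothetical sanity check, run once per task after sourcing limit_gpu_visibility.sh
import os
import tensorflow as tf

local_rank = int(os.environ.get("SLURM_LOCALID", "0"))
gpus = tf.config.list_physical_devices("GPU")
# with CUDA_VISIBLE_DEVICES pinned to the local rank, each task sees exactly one GPU,
# so MultiWorkerMirroredStrategy creates exactly one replica per task
print(f"local rank {local_rank} sees {len(gpus)} GPU(s)")
assert len(gpus) == 1, "expected exactly one visible GPU per Slurm task"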
set_vars.sh
@@ -4,9 +4,6 @@ export RANK=${SLURM_PROCID}
export LOCAL_RANK=${SLURM_LOCALID}
export WORLD_SIZE=${SLURM_NTASKS}
-# limit visible devices to ensure correct device selection and number of replicas in TensorFlow MultiWorkerMirroredStrategy
-export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
# make variables also available inside container
export APPTAINERENV_RANK=${RANK}
export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
...
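The APPTAINERENV_ prefix is Apptainer's mechanism for forwarding variables into the container environment. A minimal sketch (an assumption, not code from this repository) of how a script inside the container would consume them:

# hypothetical snippet run inside the container
import os

rank = int(os.environ["RANK"])              # forwarded via APPTAINERENV_RANK
local_rank = int(os.environ["LOCAL_RANK"])  # forwarded via APPTAINERENV_LOCAL_RANK
# assumes WORLD_SIZE is forwarded the same way (the diff above is truncated)
world_size = int(os.environ["WORLD_SIZE"])
print(f"global rank {rank} of {world_size} (local rank {local_rank})")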
@@ -43,6 +43,7 @@ mkdir -p ${NEWTMP}
# each process sets required environment variables and
# runs the python script inside the container
srun zsh -c '\
+source limit_gpu_visibility.sh && \
source set_vars.sh && \
apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
bash -c "bash ./execution_wrapper.sh"'
@@ -43,6 +43,7 @@ mkdir -p ${NEWTMP}
# each process sets required environment variables and
# runs the python script inside the container
srun zsh -c '\
+source limit_gpu_visibility.sh && \
source set_vars.sh && \
-apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
+apptainer exec --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
bash -c "python -W ignore train_model_horovod.py"'
(new file: Slurm submit script for the MirroredStrategy example)
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:2
#SBATCH --account=supp0001
############################################################
### Load modules or software
############################################################
# load module for TensorFlow container
module load TensorFlow/nvcr-24.01-tf2-py3
module list
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
export NCCL_DEBUG=INFO
export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
export TF_GPU_THREAD_MODE='gpu_private'
export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
############################################################
### Execution (Model Training)
############################################################
# TensorFlow in container often needs a tmp directory
NEWTMP=$(pwd)/tmp
mkdir -p ${NEWTMP}
# each process sets required environment variables and
# runs the python script inside the container
source set_vars.sh
apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
bash -c "python -W ignore train_model.py --strategy 'mirrored'"
@@ -37,5 +37,6 @@ export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
# each process sets required environment variables and
# runs the python script
srun zsh -c "\
+source limit_gpu_visibility.sh && \
source set_vars.sh && \
zsh ./execution_wrapper.sh"
\ No newline at end of file
@@ -37,5 +37,6 @@ export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
# each process sets required environment variables and
# runs the python script
srun zsh -c "\
+source limit_gpu_visibility.sh && \
source set_vars.sh && \
python -W ignore train_model_horovod.py"
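For comparison, a minimal sketch (an assumption, not the repository's actual train_model_horovod.py) of the Horovod initialization this launch mode relies on; because limit_gpu_visibility.sh already restricts each task to a single GPU, no explicit device pinning is needed in Python:

import horovod.tensorflow.keras as hvd

hvd.init()
# each Slurm task becomes one Horovod rank; CUDA_VISIBLE_DEVICES already
# limits every rank to its own GPU
print(f"Horovod rank {hvd.rank()} of {hvd.size()}, local rank {hvd.local_rank()}")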
train_model.py
@@ -12,6 +12,7 @@ import tensorflow.keras.applications as applications
def parse_command_line():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", required=False, type=str, choices=["cpu", "cuda"], default="cuda")
+    parser.add_argument("--strategy", required=False, type=str, choices=["mirrored", "multi-worker"], default="multi-worker")
    parser.add_argument("--num_epochs", required=False, type=int, default=5)
    parser.add_argument("--batch_size", required=False, type=int, default=128)
    parser.add_argument("--tensorboard", required=False, help="Whether to use tensorboard callback", action="store_true", default=False)
@@ -79,12 +80,14 @@ def setup(args):
    tf.config.optimizer.set_jit(True)

    # define data parallel strategy for distributed training
+    if args.strategy == "mirrored":
+        strategy = tf.distribute.MirroredStrategy()
+    else:
+        strategy = tf.distribute.MultiWorkerMirroredStrategy(
+            communication_options=tf.distribute.experimental.CommunicationOptions(
+                implementation=tf.distribute.experimental.CollectiveCommunication.NCCL
+            )
+        )

    print("Strategy.num_replicas_in_sync:", strategy.num_replicas_in_sync)
    if args.strategy != "mirrored":
        # MirroredStrategy has no cluster resolver; only multi-worker runs can report a worker index
        print("MultiWorkerMirroredStrategy.worker_index:", strategy.cluster_resolver.task_id)
...
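Whichever strategy is selected, a common follow-up step (a sketch under the assumption that the batch size is treated as per-replica; this helper is hypothetical and not shown in the diff) is to scale the batch size by the replica count:

def global_batch_size(per_replica_batch_size, strategy):
    # Keras model.fit() splits a global batch evenly across all replicas,
    # so the per-replica size is multiplied by num_replicas_in_sync
    return per_replica_batch_size * strategy.num_replicas_in_sync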