From 0a7e2b4b06886ebc966db2af4c707685d4c9527f Mon Sep 17 00:00:00 2001
From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de>
Date: Fri, 6 Dec 2024 09:02:59 +0100
Subject: [PATCH] added MirroredStrategy example

---
 .../limit_gpu_visibility.sh                   |  4 ++
 tensorflow/cifar10_distributed/set_vars.sh    |  3 --
 .../submit_job_container.sh                   |  1 +
 .../submit_job_container_horovod.sh           |  3 +-
 .../submit_job_container_single-node.sh       | 48 +++++++++++++++++++
 .../cifar10_distributed/submit_job_venv.sh    |  1 +
 .../submit_job_venv_horovod.sh                |  1 +
 tensorflow/cifar10_distributed/train_model.py | 17 ++++---
 8 files changed, 67 insertions(+), 11 deletions(-)
 create mode 100644 tensorflow/cifar10_distributed/limit_gpu_visibility.sh
 create mode 100644 tensorflow/cifar10_distributed/submit_job_container_single-node.sh

diff --git a/tensorflow/cifar10_distributed/limit_gpu_visibility.sh b/tensorflow/cifar10_distributed/limit_gpu_visibility.sh
new file mode 100644
index 0000000..7a7704f
--- /dev/null
+++ b/tensorflow/cifar10_distributed/limit_gpu_visibility.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/zsh
+
+# limit visible devices to ensure correct device selection and number of replicas in TensorFlow MultiWorkerMirroredStrategy and Horovod
+export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
\ No newline at end of file
diff --git a/tensorflow/cifar10_distributed/set_vars.sh b/tensorflow/cifar10_distributed/set_vars.sh
index 1decdb2..e3c81d1 100644
--- a/tensorflow/cifar10_distributed/set_vars.sh
+++ b/tensorflow/cifar10_distributed/set_vars.sh
@@ -4,9 +4,6 @@ export RANK=${SLURM_PROCID}
 export LOCAL_RANK=${SLURM_LOCALID}
 export WORLD_SIZE=${SLURM_NTASKS}
 
-# limit visible devices to ensure correct device selection and number of replicas in TensorFlow MultiWorkerMirroredStrategy
-export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
-
 # make variables also available inside container
 export APPTAINERENV_RANK=${RANK}
 export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
diff --git a/tensorflow/cifar10_distributed/submit_job_container.sh b/tensorflow/cifar10_distributed/submit_job_container.sh
index 3e1a1e7..9eea239 100644
--- a/tensorflow/cifar10_distributed/submit_job_container.sh
+++ b/tensorflow/cifar10_distributed/submit_job_container.sh
@@ -43,6 +43,7 @@ mkdir -p ${NEWTMP}
 # each process sets required environment variables and
 # runs the python script inside the container
 srun zsh -c '\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
     bash -c "bash ./execution_wrapper.sh"'
diff --git a/tensorflow/cifar10_distributed/submit_job_container_horovod.sh b/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
index 0b62a97..99f24f1 100644
--- a/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
+++ b/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
@@ -43,6 +43,7 @@ mkdir -p ${NEWTMP}
 # each process sets required environment variables and
 # runs the python script inside the container
 srun zsh -c '\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
-    apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
+    apptainer exec --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
     bash -c "python -W ignore train_model_horovod.py"'
diff --git a/tensorflow/cifar10_distributed/submit_job_container_single-node.sh b/tensorflow/cifar10_distributed/submit_job_container_single-node.sh
new file mode 100644
index 0000000..61184cc
--- /dev/null
+++ b/tensorflow/cifar10_distributed/submit_job_container_single-node.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/zsh
+
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+#SBATCH --account=supp0001
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for TensorFlow container
+module load TensorFlow/nvcr-24.01-tf2-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+export NCCL_DEBUG=INFO
+export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
+export TF_GPU_THREAD_MODE='gpu_private'
+export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# TensorFlow in container often needs a tmp directory
+NEWTMP=$(pwd)/tmp
+mkdir -p ${NEWTMP}
+
+# each process sets required environment variables and
+# runs the python script inside the container
+source set_vars.sh
+apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
+    bash -c "python -W ignore train_model.py --strategy 'mirrored'"
diff --git a/tensorflow/cifar10_distributed/submit_job_venv.sh b/tensorflow/cifar10_distributed/submit_job_venv.sh
index c42ac3f..4e41338 100644
--- a/tensorflow/cifar10_distributed/submit_job_venv.sh
+++ b/tensorflow/cifar10_distributed/submit_job_venv.sh
@@ -37,5 +37,6 @@ export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
 # each process sets required environment variables and
 # runs the python script
 srun zsh -c "\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     zsh ./execution_wrapper.sh"
\ No newline at end of file
diff --git a/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh b/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
index 90b41fc..4284dcd 100644
--- a/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
+++ b/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
@@ -37,5 +37,6 @@ export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
 # each process sets required environment variables and
 # runs the python script
 srun zsh -c "\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     python -W ignore train_model_horovod.py"
diff --git a/tensorflow/cifar10_distributed/train_model.py b/tensorflow/cifar10_distributed/train_model.py
index 812eea8..4fc24e1 100644
--- a/tensorflow/cifar10_distributed/train_model.py
+++ b/tensorflow/cifar10_distributed/train_model.py
@@ -12,6 +12,7 @@ import tensorflow.keras.applications as applications
 def parse_command_line():
     parser = argparse.ArgumentParser()
     parser.add_argument("--device", required=False, type=str, choices=["cpu", "cuda"], default="cuda")
+    parser.add_argument("--strategy", required=False, type=str, choices=["mirrored", "multi-worker"], default="multi-worker")
     parser.add_argument("--num_epochs", required=False, type=int, default=5)
     parser.add_argument("--batch_size", required=False, type=int, default=128)
     parser.add_argument("--tensorboard", required=False, help="Whether to use tensorboard callback", action="store_true", default=False)
@@ -79,14 +80,16 @@ def setup(args):
     tf.config.optimizer.set_jit(True)
 
     # define data parallel strategy for distrbuted training
-    strategy = tf.distribute.MultiWorkerMirroredStrategy(
-        communication_options=tf.distribute.experimental.CommunicationOptions(
-            implementation=tf.distribute.experimental.CollectiveCommunication.NCCL
+    if args.strategy == "mirrored":
+        strategy = tf.distribute.MirroredStrategy()
+    else:
+        strategy = tf.distribute.MultiWorkerMirroredStrategy(
+            communication_options=tf.distribute.experimental.CommunicationOptions(
+                implementation=tf.distribute.experimental.CollectiveCommunication.NCCL
+            )
         )
-    )
-
-    print("MultiWorkerMirroredStrategy.num_replicas_in_sync:", strategy.num_replicas_in_sync)
-    print("MultiWorkerMirroredStrategy.worker_index:", strategy.cluster_resolver.task_id)
+        print("MultiWorkerMirroredStrategy.num_replicas_in_sync:", strategy.num_replicas_in_sync)
+        print("MultiWorkerMirroredStrategy.worker_index:", strategy.cluster_resolver.task_id)
 
     return strategy
-- 
GitLab
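A note for readers comparing the two code paths in this patch: MirroredStrategy replicates the model inside a single process across every GPU that process can see, while MultiWorkerMirroredStrategy runs one process per GPU. That is why the new single-node script sources only set_vars.sh, whereas the srun-based multi-worker and Horovod launchers additionally source limit_gpu_visibility.sh to pin each Slurm task to its local GPU before TensorFlow initializes. The sketch below shows the strategy selection end to end; it is a minimal approximation, assuming TensorFlow 2.x, and the helper name create_strategy plus the model, optimizer, and loss are illustrative placeholders, not the actual contents of train_model.py.

import argparse
import tensorflow as tf

def parse_command_line():
    parser = argparse.ArgumentParser()
    parser.add_argument("--strategy", required=False, type=str,
                        choices=["mirrored", "multi-worker"], default="multi-worker")
    return parser.parse_args()

def create_strategy(args):
    # hypothetical helper mirroring the selection logic in setup(args):
    # "mirrored" drives all visible GPUs from one process, while
    # "multi-worker" expects one process per GPU and synchronizes
    # gradients via NCCL collectives
    if args.strategy == "mirrored":
        return tf.distribute.MirroredStrategy()
    return tf.distribute.MultiWorkerMirroredStrategy(
        communication_options=tf.distribute.experimental.CommunicationOptions(
            implementation=tf.distribute.experimental.CollectiveCommunication.NCCL
        )
    )

if __name__ == "__main__":
    strategy = create_strategy(parse_command_line())
    print("num_replicas_in_sync:", strategy.num_replicas_in_sync)
    # variables created under the scope are replicated on every device
    with strategy.scope():
        model = tf.keras.applications.ResNet50(weights=None, classes=10,
                                               input_shape=(32, 32, 3))
        model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy")

With two GPUs visible, running the sketch with --strategy mirrored should report num_replicas_in_sync == 2 from a single process. The multi-worker path counts replicas across all participating processes, but only once the tasks are joined into one collective group (e.g. via TF_CONFIG or a Slurm cluster resolver); run standalone it sees just its own device, which is why the launchers set CUDA_VISIBLE_DEVICES=${SLURM_LOCALID} per task.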