From 0a7e2b4b06886ebc966db2af4c707685d4c9527f Mon Sep 17 00:00:00 2001
From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de>
Date: Fri, 6 Dec 2024 09:02:59 +0100
Subject: [PATCH] Add MirroredStrategy example

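train_model.py now accepts a --strategy flag that selects between
tf.distribute.MirroredStrategy (one process drives all GPUs of a node)
and the default tf.distribute.MultiWorkerMirroredStrategy (one process
per GPU, also across nodes). A new single-node submission script runs
the container once, without srun, and passes --strategy 'mirrored'.

The CUDA_VISIBLE_DEVICES masking is moved from set_vars.sh into the new
limit_gpu_visibility.sh, which only the multi-worker and Horovod job
scripts source; the single-node MirroredStrategy job therefore keeps
all GPUs of the node visible to its single process.

Minimal sketch of the mirrored code path (the model below is only an
illustrative placeholder, not part of this repository):

    import tensorflow as tf

    # what setup(args) returns for --strategy mirrored
    strategy = tf.distribute.MirroredStrategy()
    print("MirroredStrategy.num_replicas_in_sync:", strategy.num_replicas_in_sync)

    with strategy.scope():
        # variables created inside the scope are replicated across all
        # visible GPUs of the node (placeholder model for illustration)
        model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
        model.compile(optimizer="sgd", loss="mse")

The new script is submitted like the existing ones, e.g.
sbatch submit_job_container_single-node.sh.
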
---
 .../limit_gpu_visibility.sh                   |  4 ++
 tensorflow/cifar10_distributed/set_vars.sh    |  3 --
 .../submit_job_container.sh                   |  1 +
 .../submit_job_container_horovod.sh           |  3 +-
 .../submit_job_container_single-node.sh       | 48 +++++++++++++++++++
 .../cifar10_distributed/submit_job_venv.sh    |  1 +
 .../submit_job_venv_horovod.sh                |  1 +
 tensorflow/cifar10_distributed/train_model.py | 17 ++++---
 8 files changed, 67 insertions(+), 11 deletions(-)
 create mode 100644 tensorflow/cifar10_distributed/limit_gpu_visibility.sh
 create mode 100644 tensorflow/cifar10_distributed/submit_job_container_single-node.sh

diff --git a/tensorflow/cifar10_distributed/limit_gpu_visibility.sh b/tensorflow/cifar10_distributed/limit_gpu_visibility.sh
new file mode 100644
index 0000000..7a7704f
--- /dev/null
+++ b/tensorflow/cifar10_distributed/limit_gpu_visibility.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/zsh
+
+# limit visible devices to ensure correct device selection and number of replicas in TensorFlow MultiWorkerMirroredStrategy and Horovod
+export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
\ No newline at end of file
diff --git a/tensorflow/cifar10_distributed/set_vars.sh b/tensorflow/cifar10_distributed/set_vars.sh
index 1decdb2..e3c81d1 100644
--- a/tensorflow/cifar10_distributed/set_vars.sh
+++ b/tensorflow/cifar10_distributed/set_vars.sh
@@ -4,9 +4,6 @@ export RANK=${SLURM_PROCID}
 export LOCAL_RANK=${SLURM_LOCALID}
 export WORLD_SIZE=${SLURM_NTASKS}
 
-# limit visible devices to ensure correct device selection and number of replicas in TensorFlow MultiWorkerMirroredStrategy
-export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
-
 # make variables also available inside container
 export APPTAINERENV_RANK=${RANK}
 export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
diff --git a/tensorflow/cifar10_distributed/submit_job_container.sh b/tensorflow/cifar10_distributed/submit_job_container.sh
index 3e1a1e7..9eea239 100644
--- a/tensorflow/cifar10_distributed/submit_job_container.sh
+++ b/tensorflow/cifar10_distributed/submit_job_container.sh
@@ -43,6 +43,7 @@ mkdir -p ${NEWTMP}
 # each process sets required environment variables and
 # runs the python script inside the container
 srun zsh -c '\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
         bash -c "bash ./execution_wrapper.sh"'
diff --git a/tensorflow/cifar10_distributed/submit_job_container_horovod.sh b/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
index 0b62a97..99f24f1 100644
--- a/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
+++ b/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
@@ -43,6 +43,7 @@ mkdir -p ${NEWTMP}
 # each process sets required environment variables and
 # runs the python script inside the container
 srun zsh -c '\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
-    apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
+    apptainer exec --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
         bash -c "python -W ignore train_model_horovod.py"'
diff --git a/tensorflow/cifar10_distributed/submit_job_container_single-node.sh b/tensorflow/cifar10_distributed/submit_job_container_single-node.sh
new file mode 100644
index 0000000..61184cc
--- /dev/null
+++ b/tensorflow/cifar10_distributed/submit_job_container_single-node.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+#SBATCH --account=supp0001
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for TensorFlow container
+module load TensorFlow/nvcr-24.01-tf2-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about the current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+export NCCL_DEBUG=INFO
+export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
+export TF_GPU_THREAD_MODE='gpu_private'
+export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# TensorFlow inside a container often needs a writable tmp directory
+NEWTMP=$(pwd)/tmp
+mkdir -p ${NEWTMP}
+
+# set required environment variables and
+# run the python script inside the container
+source set_vars.sh
+apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
+    bash -c "python -W ignore train_model.py --strategy 'mirrored'"
diff --git a/tensorflow/cifar10_distributed/submit_job_venv.sh b/tensorflow/cifar10_distributed/submit_job_venv.sh
index c42ac3f..4e41338 100644
--- a/tensorflow/cifar10_distributed/submit_job_venv.sh
+++ b/tensorflow/cifar10_distributed/submit_job_venv.sh
@@ -37,5 +37,6 @@ export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
 # each process sets required environment variables and
 # runs the python script
 srun zsh -c "\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     zsh ./execution_wrapper.sh"
\ No newline at end of file
diff --git a/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh b/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
index 90b41fc..4284dcd 100644
--- a/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
+++ b/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
@@ -37,5 +37,6 @@ export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
 # each process sets required environment variables and
 # runs the python script
 srun zsh -c "\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     python -W ignore train_model_horovod.py"
diff --git a/tensorflow/cifar10_distributed/train_model.py b/tensorflow/cifar10_distributed/train_model.py
index 812eea8..4fc24e1 100644
--- a/tensorflow/cifar10_distributed/train_model.py
+++ b/tensorflow/cifar10_distributed/train_model.py
@@ -12,6 +12,7 @@ import tensorflow.keras.applications as applications
 def parse_command_line():
     parser = argparse.ArgumentParser()
     parser.add_argument("--device", required=False, type=str, choices=["cpu", "cuda"], default="cuda")
+    parser.add_argument("--strategy", required=False, type=str, choices=["mirrored", "multi-worker"], default="multi-worker")
     parser.add_argument("--num_epochs", required=False, type=int, default=5)
     parser.add_argument("--batch_size", required=False, type=int, default=128)
     parser.add_argument("--tensorboard", required=False, help="Whether to use tensorboard callback", action="store_true", default=False)
@@ -79,14 +80,16 @@ def setup(args):
     tf.config.optimizer.set_jit(True)
 
     # define data parallel strategy for distributed training
-    strategy = tf.distribute.MultiWorkerMirroredStrategy(
-        communication_options=tf.distribute.experimental.CommunicationOptions(
-            implementation=tf.distribute.experimental.CollectiveCommunication.NCCL
+    if args.strategy == "mirrored":
+        strategy = tf.distribute.MirroredStrategy()
+    else:
+        strategy = tf.distribute.MultiWorkerMirroredStrategy(
+            communication_options=tf.distribute.experimental.CommunicationOptions(
+                implementation=tf.distribute.experimental.CollectiveCommunication.NCCL
+            )
         )
-    )
-
-    print("MultiWorkerMirroredStrategy.num_replicas_in_sync:", strategy.num_replicas_in_sync)
-    print("MultiWorkerMirroredStrategy.worker_index:", strategy.cluster_resolver.task_id)
+        print("MultiWorkerMirroredStrategy.num_replicas_in_sync:", strategy.num_replicas_in_sync)
+        print("MultiWorkerMirroredStrategy.worker_index:", strategy.cluster_resolver.task_id)
 
     return strategy
 
-- 
GitLab