From b410268711b949fecebcb8b45181d0b173b03dc2 Mon Sep 17 00:00:00 2001
From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de>
Date: Fri, 8 Nov 2024 16:31:28 +0100
Subject: [PATCH] added simple cifar examples

---
 tensorflow/cifar10/set_vars.sh                |  23 ++++
 tensorflow/cifar10/submit_job_container.sh    |  46 +++++++
 tensorflow/cifar10/submit_job_venv.sh         |  49 +++++++
 tensorflow/cifar10/train_model.py             | 122 ++++++++++++++++++
 .../cifar10_distributed/execution_wrapper.sh  |   8 --
 tensorflow/cifar10_distributed/set_vars.sh    |  11 +-
 .../submit_job_container.sh                   |   3 +
 .../cifar10_distributed/submit_job_venv.sh    |  51 ++++++++
 tensorflow/cifar10_distributed/train_model.py |  45 +++----
 9 files changed, 325 insertions(+), 33 deletions(-)
 create mode 100644 tensorflow/cifar10/set_vars.sh
 create mode 100644 tensorflow/cifar10/submit_job_container.sh
 create mode 100644 tensorflow/cifar10/submit_job_venv.sh
 create mode 100644 tensorflow/cifar10/train_model.py
 create mode 100644 tensorflow/cifar10_distributed/submit_job_venv.sh

diff --git a/tensorflow/cifar10/set_vars.sh b/tensorflow/cifar10/set_vars.sh
new file mode 100644
index 0000000..6333adb
--- /dev/null
+++ b/tensorflow/cifar10/set_vars.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/zsh
+
+export RANK=${SLURM_PROCID}
+export LOCAL_RANK=${SLURM_LOCALID}
+export WORLD_SIZE=${SLURM_NTASKS}
+
+# make variables also available inside container
+export APPTAINERENV_RANK=${RANK}
+export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
+export APPTAINERENV_WORLD_SIZE=${WORLD_SIZE}
+export APPTAINERENV_TMP="/tmp"
+
+export APPTAINERENV_TF_CPP_MIN_LOG_LEVEL=${TF_CPP_MIN_LOG_LEVEL}
+export APPTAINERENV_TF_GPU_THREAD_MODE=${TF_GPU_THREAD_MODE}
+export APPTAINERENV_NCCL_SOCKET_NTHREADS=${NCCL_SOCKET_NTHREADS}
+export APPTAINERENV_NCCL_DEBUG=${NCCL_DEBUG}
+
+# make additional SLURM variables available inside container
+export APPTAINERENV_SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+export APPTAINERENV_SLURM_NTASKS_PER_NODE=${SLURM_NTASKS_PER_NODE}
+export APPTAINERENV_SLURM_NNODES=${SLURM_NNODES}
+export APPTAINERENV_SLURM_JOB_NODELIST=${SLURM_JOB_NODELIST}
+export APPTAINERENV_R_WLM_ABAQUSHOSTLIST="${R_WLM_ABAQUSHOSTLIST}"
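The RANK, LOCAL_RANK, and WORLD_SIZE variables exported above are the usual hooks for per-process device selection. A minimal Python sketch of how a training script could consume them to pin one GPU per SLURM task (illustrative only; the scripts in this patch leave device placement to TensorFlow):

    import os
    import tensorflow as tf

    # process coordinates exported by set_vars.sh
    rank = int(os.environ.get("RANK", "0"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))

    # pin this process to "its" GPU so several ranks on one node do not collide
    gpus = tf.config.list_physical_devices("GPU")
    if gpus:
        tf.config.set_visible_devices(gpus[local_rank % len(gpus)], "GPU")
    print(f"rank {rank}/{world_size} pinned to local GPU {local_rank}")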
diff --git a/tensorflow/cifar10/submit_job_container.sh b/tensorflow/cifar10/submit_job_container.sh
new file mode 100644
index 0000000..fa9b7b5
--- /dev/null
+++ b/tensorflow/cifar10/submit_job_container.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for TensorFlow container
+module load TensorFlow/nvcr-24.01-tf2-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+export NCCL_DEBUG=INFO
+export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
+export TF_GPU_THREAD_MODE='gpu_private'
+export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# tensorflow in container often needs a tmp directory
+NEWTMP=$(pwd)/tmp
+mkdir -p ${NEWTMP}
+
+# run the python script inside the container
+source set_vars.sh
+apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
+    bash -c "python -W ignore train_model.py"
diff --git a/tensorflow/cifar10/submit_job_venv.sh b/tensorflow/cifar10/submit_job_venv.sh
new file mode 100644
index 0000000..cc27dd6
--- /dev/null
+++ b/tensorflow/cifar10/submit_job_venv.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+#SBATCH --account=supp0001
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: activate your desired virtual environment
+module purge
+module load GCC/11.3.0
+module load OpenMPI/4.1.4
+module load CMake/3.21.1
+module load Python/3.9.6
+module load NCCL/2.20.5-CUDA-12.4.0
+module load cuDNN/8.9.7.29-CUDA-12.3.0
+
+source /work/jk869269/venvs/tensorflow-2.17_CUDA-12.3/bin/activate
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+export NCCL_DEBUG=INFO
+export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
+export TF_GPU_THREAD_MODE='gpu_private'
+export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# run the python script
+source set_vars.sh
+python -W ignore train_model.py
\ No newline at end of file
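One detail both submit scripts rely on: TF_CPP_MIN_LOG_LEVEL is read when TensorFlow is imported, which is why it is exported in the job script before Python even starts. A small illustrative sketch of the equivalent in-process ordering and a quick check that the job sees the requested GPU:

    import os

    # must be set before the tensorflow import; afterwards it has no effect
    os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "1")  # 1 = filter INFO messages

    import tensorflow as tf

    # quick sanity check that the allocation matches the #SBATCH request
    print(tf.config.list_physical_devices("GPU"))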
diff --git a/tensorflow/cifar10/train_model.py b/tensorflow/cifar10/train_model.py
new file mode 100644
index 0000000..1251e61
--- /dev/null
+++ b/tensorflow/cifar10/train_model.py
@@ -0,0 +1,122 @@
+from __future__ import print_function
+import numpy as np
+import os, sys
+import argparse
+import datetime
+import tensorflow as tf
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras import backend as K
+from tensorflow.keras.datasets import cifar10
+import tensorflow.keras.applications as applications
+
+def parse_command_line():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device", required=False, type=str, choices=["cpu", "cuda"], default="cuda")
+    parser.add_argument("--num_epochs", required=False, type=int, default=5)
+    parser.add_argument("--batch_size", required=False, type=int, default=128)
+    parser.add_argument("--verbosity", required=False, help="Keras verbosity level for training/evaluation", type=int, default=2)
+    parser.add_argument("--num_intraop_threads", required=False, help="Number of intra-op threads", type=int, default=None)
+    parser.add_argument("--num_interop_threads", required=False, help="Number of inter-op threads", type=int, default=None)
+    parser.add_argument("--tensorboard", required=False, help="Whether to use tensorboard callback", action="store_true", default=False)
+    parser.add_argument("--profile_batches", required=False, help='Batches to profile for tensorboard. Format "batch_start,batch_end"', type=str, default="2,5")
+    args = parser.parse_args()
+
+    # specific to cifar 10 dataset
+    args.num_classes = 10
+
+    print("Settings:")
+    settings_map = vars(args)
+    for name in sorted(settings_map.keys()):
+        print("--" + str(name) + ": " + str(settings_map[name]))
+    print("")
+    sys.stdout.flush()
+
+    return args
+
+def load_dataset(args):
+    K.set_image_data_format("channels_last")
+
+    # load the cifar10 data
+    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
+
+    # convert class vectors to binary class matrices.
+    y_train = tf.keras.utils.to_categorical(y_train, args.num_classes)
+    y_test = tf.keras.utils.to_categorical(y_test, args.num_classes)
+
+    # normalize base data
+    x_train = x_train.astype("float32") / 255
+    x_test = x_test.astype("float32") / 255
+    x_train_mean = np.mean(x_train, axis=0)
+    x_train -= x_train_mean
+    x_test -= x_train_mean
+
+    print("x_train shape:", x_train.shape)
+    print("y_train shape:", y_train.shape)
+    print(x_train.shape[0], "train samples")
+    print(x_test.shape[0], "test samples")
+    sys.stdout.flush()
+
+    return (x_train, y_train), (x_test, y_test)
+
+def setup(args):
+    if args.num_intraop_threads:
+        tf.config.threading.set_intra_op_parallelism_threads(args.num_intraop_threads)
+    if args.num_interop_threads:
+        tf.config.threading.set_inter_op_parallelism_threads(args.num_interop_threads)
+
+    print(f"Tensorflow get_intra_op_parallelism_threads: {tf.config.threading.get_intra_op_parallelism_threads()}")
+    print(f"Tensorflow get_inter_op_parallelism_threads: {tf.config.threading.get_inter_op_parallelism_threads()}")
+
+    l_gpu_devices = [] if args.device == "cpu" else tf.config.list_physical_devices("GPU")
+    print("List of GPU devices found:")
+    for dev in l_gpu_devices:
+        print(str(dev.device_type) + ": " + dev.name)
+    print("")
+    sys.stdout.flush()
+
+    tf.keras.backend.clear_session()
+    tf.config.optimizer.set_jit(True)
+
+def main():
+    # parse command line arguments
+    args = parse_command_line()
+
+    # run setup (e.g., create distributed environment if desired)
+    setup(args)
+
+    # data set loading
+    (x_train, y_train), (x_test, y_test) = load_dataset(args)
+    n_train, n_test = x_train.shape[0], x_test.shape[0]
+    input_shape = x_train.shape[1:]
+
+    # Generating input pipelines
+    ds_train = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(n_train).cache().batch(args.batch_size).prefetch(tf.data.AUTOTUNE)
+    ds_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).shuffle(n_test).cache().batch(args.batch_size).prefetch(tf.data.AUTOTUNE)
+
+    # callbacks to register
+    callbacks = []
+
+    model = applications.ResNet50(weights=None, input_shape=input_shape, classes=args.num_classes)
+    # model.summary() # display the model architecture
+    cur_optimizer = Adam(0.001)
+    model.compile(loss="categorical_crossentropy", optimizer=cur_optimizer, metrics=["accuracy"])
+
+    if args.tensorboard:
+        tensorboard_callback = tf.keras.callbacks.TensorBoard(
+            log_dir=os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
+            histogram_freq=1,
+            profile_batch=args.profile_batches,
+        )
+        callbacks.append(tensorboard_callback)
+
+    # train the model
+    model.fit(ds_train, epochs=args.num_epochs, verbose=args.verbosity, callbacks=callbacks)
+
+    # evaluate model
+    scores = model.evaluate(ds_test, verbose=args.verbosity)
+    print(f"Test Evaluation: Accuracy: {scores[1]}")
+    sys.stdout.flush()
+
+if __name__ == "__main__":
+    main()
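For reference, the pipeline built in train_model.py yields image batches and one-hot labels; a quick self-contained way to check the shapes it produces (illustrative sketch using the same tf.data calls):

    import tensorflow as tf
    from tensorflow.keras.datasets import cifar10

    (x_train, y_train), _ = cifar10.load_data()
    y_train = tf.keras.utils.to_categorical(y_train, 10)
    ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(128)

    for images, labels in ds.take(1):
        print(images.shape)  # (128, 32, 32, 3)
        print(labels.shape)  # (128, 10)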
diff --git a/tensorflow/cifar10_distributed/execution_wrapper.sh b/tensorflow/cifar10_distributed/execution_wrapper.sh
index ae1df9b..997737c 100644
--- a/tensorflow/cifar10_distributed/execution_wrapper.sh
+++ b/tensorflow/cifar10_distributed/execution_wrapper.sh
@@ -1,12 +1,4 @@
 #!/usr/bin/zsh
-############################################################
-### Parameters & Directories
-############################################################
-
-export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
-export TF_GPU_THREAD_MODE='gpu_private'
-export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
-
 ############################################################
 ### Set TF_CONFIG
 ############################################################
diff --git a/tensorflow/cifar10_distributed/set_vars.sh b/tensorflow/cifar10_distributed/set_vars.sh
index 19405a4..6333adb 100644
--- a/tensorflow/cifar10_distributed/set_vars.sh
+++ b/tensorflow/cifar10_distributed/set_vars.sh
@@ -1,16 +1,21 @@
-#!/usr/local_rwth/bin/zsh
+#!/usr/bin/zsh
 
 export RANK=${SLURM_PROCID}
 export LOCAL_RANK=${SLURM_LOCALID}
 export WORLD_SIZE=${SLURM_NTASKS}
 
-# make variables also available inside singularity container
+# make variables also available inside container
 export APPTAINERENV_RANK=${RANK}
 export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
 export APPTAINERENV_WORLD_SIZE=${WORLD_SIZE}
 export APPTAINERENV_TMP="/tmp"
 
-# make additional SLURM variables available to container
+export APPTAINERENV_TF_CPP_MIN_LOG_LEVEL=${TF_CPP_MIN_LOG_LEVEL}
+export APPTAINERENV_TF_GPU_THREAD_MODE=${TF_GPU_THREAD_MODE}
+export APPTAINERENV_NCCL_SOCKET_NTHREADS=${NCCL_SOCKET_NTHREADS}
+export APPTAINERENV_NCCL_DEBUG=${NCCL_DEBUG}
+
+# make additional SLURM variables available inside container
 export APPTAINERENV_SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
 export APPTAINERENV_SLURM_NTASKS_PER_NODE=${SLURM_NTASKS_PER_NODE}
 export APPTAINERENV_SLURM_NNODES=${SLURM_NNODES}
diff --git a/tensorflow/cifar10_distributed/submit_job_container.sh b/tensorflow/cifar10_distributed/submit_job_container.sh
index 3ffe0d1..8fc8291 100644
--- a/tensorflow/cifar10_distributed/submit_job_container.sh
+++ b/tensorflow/cifar10_distributed/submit_job_container.sh
@@ -28,6 +28,9 @@ echo "Current machine: $(hostname)"
 nvidia-smi
 
 export NCCL_DEBUG=INFO
+export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
+export TF_GPU_THREAD_MODE='gpu_private'
+export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
 
 ############################################################
 ### Execution (Model Training)
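The "### Set TF_CONFIG" step that execution_wrapper.sh keeps (its body lies outside this patch) is what MultiWorkerMirroredStrategy reads to discover its peers. A hedged sketch of what such a step typically computes from the SLURM variables exported by set_vars.sh; the port number and the assembly logic are assumptions for illustration, not taken from the wrapper itself:

    import json
    import os
    import subprocess

    def build_tf_config(port: int = 29500) -> dict:
        # expand the compressed nodelist (e.g. "node[01-02]") into hostnames
        hosts = subprocess.run(
            ["scontrol", "show", "hostnames", os.environ["SLURM_JOB_NODELIST"]],
            capture_output=True, text=True, check=True,
        ).stdout.split()
        tasks_per_node = int(os.environ.get("SLURM_NTASKS_PER_NODE", "1"))
        workers = [f"{h}:{port + i}" for h in hosts for i in range(tasks_per_node)]
        return {
            "cluster": {"worker": workers},
            "task": {"type": "worker", "index": int(os.environ["RANK"])},
        }

    os.environ["TF_CONFIG"] = json.dumps(build_tf_config())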
diff --git a/tensorflow/cifar10_distributed/submit_job_venv.sh b/tensorflow/cifar10_distributed/submit_job_venv.sh
new file mode 100644
index 0000000..be3ff76
--- /dev/null
+++ b/tensorflow/cifar10_distributed/submit_job_venv.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+#SBATCH --account=supp0001
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: activate your desired virtual environment
+module purge
+module load GCC/11.3.0
+module load OpenMPI/4.1.4
+module load CMake/3.21.1
+module load Python/3.9.6
+module load NCCL/2.20.5-CUDA-12.4.0
+module load cuDNN/8.9.7.29-CUDA-12.3.0
+
+source /work/jk869269/venvs/tensorflow-2.17_CUDA-12.3/bin/activate
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+export NCCL_DEBUG=INFO
+export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
+export TF_GPU_THREAD_MODE='gpu_private'
+export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# each process sets required environment variables and
+# runs the python script
+srun zsh -c '\
+    source set_vars.sh && \
+    zsh ./execution_wrapper.sh'
\ No newline at end of file
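With MultiWorkerMirroredStrategy, the batch size used when building the tf.data pipeline is the global batch that gets split across all workers. If a fixed per-worker batch is intended, it has to be scaled by the number of workers; a sketch under that assumption (the patched script keeps args.batch_size unscaled):

    import os

    def global_batch_size(per_worker_batch: int) -> int:
        # WORLD_SIZE is exported by set_vars.sh, one count per SLURM task
        world_size = int(os.environ.get("WORLD_SIZE", "1"))
        return per_worker_batch * world_size

    print(global_batch_size(128))  # e.g. 256 with 2 workers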
diff --git a/tensorflow/cifar10_distributed/train_model.py b/tensorflow/cifar10_distributed/train_model.py
index 41c29e8..ab0c3c0 100644
--- a/tensorflow/cifar10_distributed/train_model.py
+++ b/tensorflow/cifar10_distributed/train_model.py
@@ -14,7 +14,6 @@ def parse_command_line():
     parser.add_argument("--device", required=False, type=str, choices=["cpu", "cuda"], default="cuda")
     parser.add_argument("--num_epochs", required=False, type=int, default=5)
     parser.add_argument("--batch_size", required=False, type=int, default=128)
-    parser.add_argument("--num_workers", required=False, type=int, default=1)
     parser.add_argument("--distributed", required=False, action="store_true", default=False)
     parser.add_argument("--verbosity", required=False, help="Keras verbosity level for training/evaluation", type=int, default=2)
     parser.add_argument("--num_intraop_threads", required=False, help="Number of intra-op threads", type=int, default=None)
@@ -78,19 +77,18 @@ def load_dataset(args):
 
     return (x_train, y_train), (x_test, y_test)
 
-def setup(args) -> None:
+def setup(args):
     if args.num_intraop_threads:
         tf.config.threading.set_intra_op_parallelism_threads(args.num_intraop_threads)
     if args.num_interop_threads:
         tf.config.threading.set_inter_op_parallelism_threads(args.num_interop_threads)
 
+    l_gpu_devices = [] if args.device == "cpu" else tf.config.list_physical_devices("GPU")
+
     if args.world_rank == 0:
         print(f"Tensorflow get_intra_op_parallelism_threads: {tf.config.threading.get_intra_op_parallelism_threads()}")
         print(f"Tensorflow get_inter_op_parallelism_threads: {tf.config.threading.get_inter_op_parallelism_threads()}")
-        sys.stdout.flush()
 
-    l_gpu_devices = [] if args.device == "cpu" else tf.config.list_physical_devices("GPU")
-    if args.world_rank == 0:
         print("List of GPU devices found:")
         for dev in l_gpu_devices:
             print(str(dev.device_type) + ": " + dev.name)
@@ -101,13 +99,6 @@ def setup(args) -> None:
     tf.keras.backend.clear_session()
     tf.config.optimizer.set_jit(True)
 
-def main():
-    # parse command line arguments
-    args = parse_command_line()
-
-    # run setup (e.g., create distributed environment if desired)
-    setup(args)
-
     # define data parallel strategy for distributed training
     strategy = tf.distribute.MultiWorkerMirroredStrategy(
         communication_options=tf.distribute.experimental.CommunicationOptions(
@@ -115,6 +106,15 @@ def main():
         )
     )
 
+    return strategy
+
+def main():
+    # parse command line arguments
+    args = parse_command_line()
+
+    # run setup (e.g., create distributed environment if desired)
+    strategy = setup(args)
+
     # data set loading
     (x_train, y_train), (x_test, y_test) = load_dataset(args)
     n_train, n_test = x_train.shape[0], x_test.shape[0]
@@ -133,22 +133,23 @@ def main():
     cur_optimizer = Adam(0.001)
     model.compile(loss="categorical_crossentropy", optimizer=cur_optimizer, metrics=["accuracy"])
 
-    # callbacks to register
-    if args.tensorboard:
-        tensorboard_callback = tf.keras.callbacks.TensorBoard(
-            log_dir=os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
-            histogram_freq=1,
-            profile_batch=args.profile_batches,
-        )
-        callbacks.append(tensorboard_callback)
+    # callbacks to register
+    if args.tensorboard:
+        tensorboard_callback = tf.keras.callbacks.TensorBoard(
+            log_dir=os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
+            histogram_freq=1,
+            profile_batch=args.profile_batches,
+        )
+        callbacks.append(tensorboard_callback)
 
     # train the model
     model.fit(ds_train, epochs=args.num_epochs, verbose=args.verbosity, callbacks=callbacks)
 
     # evaluate model
     scores = model.evaluate(ds_test, verbose=args.verbosity)
-    print(f"Test Evaluation: Accuracy: {scores[1]}")
-    sys.stdout.flush()
+    if args.world_rank == 0:
+        print(f"Test Evaluation: Accuracy: {scores[1]}")
+        sys.stdout.flush()
 
 if __name__ == "__main__":
     main()
-- 
GitLab
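The hunks above move strategy creation into setup() and return it to main(); the unchanged middle of the file is not shown, but with MultiWorkerMirroredStrategy the Keras model and optimizer must be created under the strategy's scope so their variables are mirrored across workers. A minimal self-contained sketch of that pattern, reusing the model settings visible in the patch (not the patched file itself):

    import tensorflow as tf
    from tensorflow.keras.optimizers import Adam

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    with strategy.scope():
        # variables created here are mirrored and kept in sync across workers
        model = tf.keras.applications.ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)
        model.compile(loss="categorical_crossentropy", optimizer=Adam(0.001), metrics=["accuracy"])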