diff --git a/tensorflow/cifar10_distributed/limit_gpu_visibility.sh b/tensorflow/cifar10_distributed/limit_gpu_visibility.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7a7704f3d33025f2f21cb2a18a92c0c0e6d366ba
--- /dev/null
+++ b/tensorflow/cifar10_distributed/limit_gpu_visibility.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/zsh
+
+# limit visible devices to ensure correct device selection and number of replicas in TensorFlow MultiWorkerMirroredStrategy and Horovod
+export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
\ No newline at end of file
diff --git a/tensorflow/cifar10_distributed/set_vars.sh b/tensorflow/cifar10_distributed/set_vars.sh
index 1decdb2024fd34543ab5dda4b2f66d4dbf9a0097..e3c81d15e9ed37729d231e3abe3b0d69879b0a05 100644
--- a/tensorflow/cifar10_distributed/set_vars.sh
+++ b/tensorflow/cifar10_distributed/set_vars.sh
@@ -4,9 +4,6 @@ export RANK=${SLURM_PROCID}
 export LOCAL_RANK=${SLURM_LOCALID}
 export WORLD_SIZE=${SLURM_NTASKS}
 
-# limit visible devices to ensure correct device selection and number of replicas in TensorFlow MultiWorkerMirroredStrategy
-export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
-
 # make variables also available inside container
 export APPTAINERENV_RANK=${RANK}
 export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
diff --git a/tensorflow/cifar10_distributed/submit_job_container.sh b/tensorflow/cifar10_distributed/submit_job_container.sh
index 3e1a1e7204f9c43f0dd7320548eb8c0366f47e88..9eea23940286ca210a0048f9989283ae773489e0 100644
--- a/tensorflow/cifar10_distributed/submit_job_container.sh
+++ b/tensorflow/cifar10_distributed/submit_job_container.sh
@@ -43,6 +43,7 @@ mkdir -p ${NEWTMP}
 # each process sets required environment variables and
 # runs the python script inside the container
 srun zsh -c '\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
         bash -c "bash ./execution_wrapper.sh"'
diff --git a/tensorflow/cifar10_distributed/submit_job_container_horovod.sh b/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
index 0b62a97120d9ae1997c5824653c1c7d4de0e9af2..99f24f11355d30e0ca30472c40dd511aa5a8c9de 100644
--- a/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
+++ b/tensorflow/cifar10_distributed/submit_job_container_horovod.sh
@@ -43,6 +43,7 @@ mkdir -p ${NEWTMP}
 # each process sets required environment variables and
 # runs the python script inside the container
 srun zsh -c '\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
-    apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
+    apptainer exec --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
         bash -c "python -W ignore train_model_horovod.py"'
diff --git a/tensorflow/cifar10_distributed/submit_job_container_single-node.sh b/tensorflow/cifar10_distributed/submit_job_container_single-node.sh
new file mode 100644
index 0000000000000000000000000000000000000000..61184cc214852649be4bf2a5bf24ae32e9a69595
--- /dev/null
+++ b/tensorflow/cifar10_distributed/submit_job_container_single-node.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+#SBATCH --account=supp0001
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for TensorFlow container
+module load TensorFlow/nvcr-24.01-tf2-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+export NCCL_DEBUG=INFO
+export TF_CPP_MIN_LOG_LEVEL=1 # disable info messages
+export TF_GPU_THREAD_MODE='gpu_private'
+export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# TensorFlow in container often needs a tmp directory
+NEWTMP=$(pwd)/tmp
+mkdir -p ${NEWTMP}
+
+# each process sets required environment variables and
+# runs the python script inside the container
+source set_vars.sh
+apptainer exec -e --nv -B ${NEWTMP}:/tmp ${TENSORFLOW_IMAGE} \
+    bash -c "python -W ignore train_model.py --strategy 'mirrored'"
diff --git a/tensorflow/cifar10_distributed/submit_job_venv.sh b/tensorflow/cifar10_distributed/submit_job_venv.sh
index c42ac3f394efa35b1c9e2fde633f20ca2cfc627b..4e413382c1902056713c8586677f609a42dd4183 100644
--- a/tensorflow/cifar10_distributed/submit_job_venv.sh
+++ b/tensorflow/cifar10_distributed/submit_job_venv.sh
@@ -37,5 +37,6 @@ export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
 # each process sets required environment variables and
 # runs the python script
 srun zsh -c "\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     zsh ./execution_wrapper.sh"
\ No newline at end of file
diff --git a/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh b/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
index 90b41fc3b321f5e1d6fa6a024b97789da119e8f9..4284dcdd505a9b0e1c29797bd1e0e0a74095d0bd 100644
--- a/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
+++ b/tensorflow/cifar10_distributed/submit_job_venv_horovod.sh
@@ -37,5 +37,6 @@ export NCCL_SOCKET_NTHREADS=8 # multi-threading for NCCL communication
 # each process sets required environment variables and
 # runs the python script
 srun zsh -c "\
+    source limit_gpu_visibility.sh && \
     source set_vars.sh && \
     python -W ignore train_model_horovod.py"
diff --git a/tensorflow/cifar10_distributed/train_model.py b/tensorflow/cifar10_distributed/train_model.py
index 812eea8d77ca031235497c7b7aa357101df06b2d..4fc24e1ffee20523cdac908feda4b531a3cf158f 100644
--- a/tensorflow/cifar10_distributed/train_model.py
+++ b/tensorflow/cifar10_distributed/train_model.py
@@ -12,6 +12,7 @@ import tensorflow.keras.applications as applications
 def parse_command_line():
     parser = argparse.ArgumentParser()
     parser.add_argument("--device", required=False, type=str, choices=["cpu", "cuda"], default="cuda")
+    parser.add_argument("--strategy", required=False, type=str, choices=["mirrored", "multi-worker"], default="multi-worker")
     parser.add_argument("--num_epochs", required=False, type=int, default=5)
     parser.add_argument("--batch_size", required=False, type=int, default=128)
     parser.add_argument("--tensorboard", required=False, help="Whether to use tensorboard callback", action="store_true", default=False)
@@ -79,14 +80,16 @@ def setup(args):
     tf.config.optimizer.set_jit(True)
 
     # define data parallel strategy for distrbuted training
-    strategy = tf.distribute.MultiWorkerMirroredStrategy(
-        communication_options=tf.distribute.experimental.CommunicationOptions(
-            implementation=tf.distribute.experimental.CollectiveCommunication.NCCL
+    if args.strategy == "mirrored":
+        strategy = tf.distribute.MirroredStrategy()
+    else:
+        strategy = tf.distribute.MultiWorkerMirroredStrategy(
+            communication_options=tf.distribute.experimental.CommunicationOptions(
+                implementation=tf.distribute.experimental.CollectiveCommunication.NCCL
+            )
         )
-    )
-
-    print("MultiWorkerMirroredStrategy.num_replicas_in_sync:", strategy.num_replicas_in_sync)
-    print("MultiWorkerMirroredStrategy.worker_index:", strategy.cluster_resolver.task_id)
+        print("MultiWorkerMirroredStrategy.num_replicas_in_sync:", strategy.num_replicas_in_sync)
+        print("MultiWorkerMirroredStrategy.worker_index:", strategy.cluster_resolver.task_id)
 
     return strategy
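
Note on the intent of this change (editorial remark, not part of the patch): the CUDA_VISIBLE_DEVICES export moves out of set_vars.sh into the new limit_gpu_visibility.sh, so only the multi-worker and Horovod launch scripts pin each Slurm task to the GPU matching its SLURM_LOCALID, while the new single-node script leaves every allocated GPU visible to a single process and selects MirroredStrategy through the new --strategy flag. The sketch below illustrates that interplay; it is a standalone, hypothetical example that assumes set_vars.sh has already exported WORLD_SIZE, and it is not code from the repository.

# illustrative_strategy_check.py -- hypothetical sketch, not part of the patch
import os
import tensorflow as tf

visible = os.environ.get("CUDA_VISIBLE_DEVICES", "<unset: all GPUs visible>")
gpus = tf.config.list_physical_devices("GPU")
print(f"CUDA_VISIBLE_DEVICES={visible} -> {len(gpus)} visible GPU(s)")

if os.environ.get("WORLD_SIZE", "1") == "1":
    # single process (as in submit_job_container_single-node.sh):
    # limit_gpu_visibility.sh is NOT sourced, so MirroredStrategy sees every
    # allocated GPU and creates one replica per GPU inside this one process
    strategy = tf.distribute.MirroredStrategy()
else:
    # one Slurm task per GPU (multi-worker / Horovod scripts): after sourcing
    # limit_gpu_visibility.sh each task sees exactly one GPU and contributes
    # exactly one replica; multi-worker cluster discovery is assumed to be
    # configured by the surrounding scripts
    strategy = tf.distribute.MultiWorkerMirroredStrategy()

print("num_replicas_in_sync:", strategy.num_replicas_in_sync)

For example, the venv and multi-worker container scripts rely on the default --strategy multi-worker, whereas the single-node container script passes --strategy 'mirrored'.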