diff --git a/basic_mpi.sh b/basic_mpi.sh deleted file mode 100644 index 4f0e4229c049caa8273ce0e9e8f763aca0db72a4..0000000000000000000000000000000000000000 --- a/basic_mpi.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/zsh - -### Job Parameters -#SBATCH --ntasks=8 # Ask for 8 MPI tasks -#SBATCH --time=00:15:00 # Run time of 15 minutes -#SBATCH --job-name=example_job # Sets the job name -#SBATCH --output=stdout.txt # redirects stdout and stderr to stdout.txt -#SBATCH --account=<project-id> # Insertg your project-id or delete this line - -### Program Code -srun hostname \ No newline at end of file diff --git a/pytorch/cifar10/submit_job_container.sh b/pytorch/cifar10/submit_job_container.sh new file mode 100644 index 0000000000000000000000000000000000000000..adbb8987b021adf0754e36eecd197a6c82471bcb --- /dev/null +++ b/pytorch/cifar10/submit_job_container.sh @@ -0,0 +1,36 @@ +#!/usr/bin/zsh +############################################################ +### Slurm flags +############################################################ + +#SBATCH --time=00:15:00 +#SBATCH --partition=c23g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=24 +#SBATCH --gres=gpu:1 + +############################################################ +### Load modules or software +############################################################ + +# load module for PyTorch container +module load PyTorch/nvcr-24.01-py3 +module list + +############################################################ +### Parameters and Settings +############################################################ + +# print some information about current system +echo "Job nodes: ${SLURM_JOB_NODELIST}" +echo "Current machine: $(hostname)" +nvidia-smi + +############################################################ +### Execution (Model Training) +############################################################ + +# run the python script inside the container +apptainer exec -e --nv ${PYTORCH_IMAGE} \ + bash -c "python train_model.py" \ No newline at end of file diff --git a/pytorch/cifar10/submit_job_utilization_monitoring.sh b/pytorch/cifar10/submit_job_utilization_monitoring.sh new file mode 100644 index 0000000000000000000000000000000000000000..db7c1d7a199f8e5b96b3e573b2b6035f1c7f0b74 --- /dev/null +++ b/pytorch/cifar10/submit_job_utilization_monitoring.sh @@ -0,0 +1,53 @@ +#!/usr/bin/zsh +############################################################ +### Slurm flags +############################################################ + +#SBATCH --time=00:15:00 +#SBATCH --partition=c23g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=24 +#SBATCH --gres=gpu:1 + +############################################################ +### Load modules or software +############################################################ + +# load module for PyTorch container +module load PyTorch/nvcr-24.01-py3 +module list + +############################################################ +### Parameters and Settings +############################################################ + +# print some information about current system +echo "Job nodes: ${SLURM_JOB_NODELIST}" +echo "Current machine: $(hostname)" +nvidia-smi + +# specify that utilization monitoring via nvidia-smi should be done +export ENABLE_MONITORING=1 + +############################################################ +### Execution (Model Training) +############################################################ + +if [ "${ENABLE_MONITORING:-0}" = "1" ]; then + # start monitoring process in the 
background (every 2 sec) + nvidia-smi --query-gpu=timestamp,index,compute_mode,pstate,utilization.gpu,utilization.memory,memory.used,temperature.gpu,power.draw \ + --format=csv --loop=2 &> gpu_monitoring_${SLURM_JOBID}.txt & + # remember ID of process that has just been started in background + export proc_id_monitor=$! +fi + +# run the python script inside the container +apptainer exec -e --nv ${PYTORCH_IMAGE} \ + bash -c "python train_model.py" + +if [ "${ENABLE_MONITORING:-0}" = "1" ]; then + # end monitoring process again + kill -2 ${proc_id_monitor} + sleep 5 +fi \ No newline at end of file diff --git a/pytorch/cifar10/submit_job_venv.sh b/pytorch/cifar10/submit_job_venv.sh new file mode 100644 index 0000000000000000000000000000000000000000..982188fdb77b6fa78b3686cfb4fc04ef8b1ccc95 --- /dev/null +++ b/pytorch/cifar10/submit_job_venv.sh @@ -0,0 +1,33 @@ +#!/usr/bin/zsh +############################################################ +### Slurm flags +############################################################ + +#SBATCH --time=00:15:00 +#SBATCH --partition=c23g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=24 +#SBATCH --gres=gpu:1 + +############################################################ +### Load modules or software +############################################################ + +# TODO: activate your desired virtual environment + +############################################################ +### Parameters and Settings +############################################################ + +# print some information about current system +echo "Job nodes: ${SLURM_JOB_NODELIST}" +echo "Current machine: $(hostname)" +nvidia-smi + +############################################################ +### Execution (Model Training) +############################################################ + +# run the python script +python train_model.py \ No newline at end of file diff --git a/pytorch/cifar10/train_model.py b/pytorch/cifar10/train_model.py new file mode 100644 index 0000000000000000000000000000000000000000..7a86f612bd59a9283913ac4a5423708fb8a9c839 --- /dev/null +++ b/pytorch/cifar10/train_model.py @@ -0,0 +1,150 @@ +import argparse +import os, sys +import time +import typing + +import torch +from torchvision.models import resnet50 +from torchvision.datasets import CIFAR10 +import torchvision.transforms as transforms +from torch.utils.data import DataLoader + +def parse_command_line(): + parser = argparse.ArgumentParser() + parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda") + parser.add_argument("--num_epochs", required=False, type=int, default=2) + parser.add_argument("--batch_size", required=False, type=int, default=128) + parser.add_argument("--num_workers", required=False, type=int, default=1) + args = parser.parse_args() + return args + +def load_dataset(args): + # standardization values for CIFAR10 dataset + mean = (0.4919, 0.4827, 0.4472) + std = (0.2022, 0.1994, 0.2010) + + # define the following transformations + # - resize to 224x224 + # - transform back to tensor + # - normalize data using mean and std from above + trans = transforms.Compose([ + transforms.Resize(224), + transforms.ToTensor(), + transforms.Normalize(mean, std) + ]) + + # load CIFAR10 dataset splits for train and test and apply the transformations + # note: you need to download the dataset once at the beginning + ds_train = CIFAR10("datasets", train=True, download=True, transform=trans) + ds_test = CIFAR10("datasets", train=False, download=False, 
transform=trans) + + # finally create separate data loaders for train and test with the following common arguments + common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True} + loader_train = DataLoader(ds_train, **(common_kwargs)) + loader_test = DataLoader(ds_test, **(common_kwargs)) + + return loader_train, loader_test + +def train(args, model, loader_train, optimizer, epoch): + # use a CrossEntropyLoss loss function + loss_func = torch.nn.CrossEntropyLoss() + + # set model into train mode + model.train() + + # track accuracy for complete epoch + total, correct = 0, 0 + total_steps = len(loader_train) + + elapsed_time = time.time() + for i, (x_batch, y_batch) in enumerate(loader_train): + # transfer data to the device + x_batch = x_batch.to(args.device, non_blocking=True) + y_batch = y_batch.to(args.device, non_blocking=True) + + # run forward pass + y_pred = model(x_batch) + + # calculate loss + loss = loss_func(y_pred, y_batch) + + # run backward pass and optimizer to update weights + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # track training accuracy + _, predicted = y_pred.max(1) + total += y_batch.size(0) + correct += predicted.eq(y_batch).sum().item() + + if i % 20 == 0: + print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}") + sys.stdout.flush() + elapsed_time = time.time() - elapsed_time + + print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}") + sys.stdout.flush() + +def test(args, model, loader_test, epoch): + # set model into evaluation mode + model.eval() + + with torch.no_grad(): + correct, total = 0, 0 + for i, (x_batch, y_batch) in enumerate(loader_test): + # transfer data to the device + x_batch = x_batch.to(args.device, non_blocking=True) + y_batch = y_batch.to(args.device, non_blocking=True) + + # predict class + outputs = model(x_batch) + _, predicted = outputs.max(1) + + # track test accuracy + total += y_batch.size(0) + correct += (predicted == y_batch).sum().item() + + print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}") + sys.stdout.flush() + +def setup(args) -> None: + + # set gpu device on local machine + if args.device == 'cuda': + # optimization hint for torch runtime + torch.backends.cudnn.benchmark = True + + print("Current configuration:") + for arg in vars(args): + print(f" --{arg}, {getattr(args, arg)}") + +def cleanup(args: typing.Dict[str, typing.Any]): + pass + +def main(): + # parse command line arguments + args = parse_command_line() + + # run setup (e.g., create distributed environment if desired) + setup(args) + + # get data loaders for train and test split + loader_train, loader_test = load_dataset(args) + + # create resnet50 with random weights + model = resnet50().to(args.device) + + # initialize optimizer with model parameters + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + + # train and test model for configured number of epochs + for epoch in range(args.num_epochs): + train(args, model, loader_train, optimizer, epoch) + test (args, model, loader_test, epoch) + + # cleaup env + cleanup(args) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pytorch/cifar10_distributed/set_vars.sh b/pytorch/cifar10_distributed/set_vars.sh new file mode 100644 index 0000000000000000000000000000000000000000..7b1b74a86d70bc5e4218d9c5b5f4817a2867e44a --- /dev/null +++ b/pytorch/cifar10_distributed/set_vars.sh @@ -0,0 +1,11 @@ +#!/usr/local_rwth/bin/zsh + +export 
RANK=${SLURM_PROCID}
+export LOCAL_RANK=${SLURM_LOCALID}
+export WORLD_SIZE=${SLURM_NTASKS}
+
+export APPTAINERENV_MASTER_ADDR=${MASTER_ADDR}
+export APPTAINERENV_MASTER_PORT=${MASTER_PORT}
+export APPTAINERENV_RANK=${RANK}
+export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
+export APPTAINERENV_WORLD_SIZE=${WORLD_SIZE}
\ No newline at end of file
diff --git a/pytorch/cifar10_distributed/submit_job_container.sh b/pytorch/cifar10_distributed/submit_job_container.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1fcf497edf5e944d2060c619556adffbed34a448
--- /dev/null
+++ b/pytorch/cifar10_distributed/submit_job_container.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for PyTorch container
+module load PyTorch/nvcr-24.01-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# variables required for distributed runs with DDP
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=12340   # if this port is already in use, change to a different one
+export NCCL_DEBUG=INFO
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# each process sets required environment variables and
+# runs the python script inside the container
+srun zsh -c '\
+    source set_vars.sh && \
+    apptainer exec -e --nv ${PYTORCH_IMAGE} \
+        bash -c "python train_model.py --distributed"'
\ No newline at end of file
diff --git a/pytorch/cifar10_distributed/submit_job_venv.sh b/pytorch/cifar10_distributed/submit_job_venv.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ff2fe9252d20c0440cf697b33541609e5113f9ec
--- /dev/null
+++ b/pytorch/cifar10_distributed/submit_job_venv.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: activate your desired virtual environment
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# variables required for distributed runs with DDP
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=12340   # if this port is already in use, change to a different one
+export NCCL_DEBUG=INFO
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# each process sets required environment variables and +# runs the python script +srun zsh -c '\ + source set_vars.sh && \ + python train_model.py --distributed' \ No newline at end of file diff --git a/pytorch/cifar10_distributed/train_model.py b/pytorch/cifar10_distributed/train_model.py new file mode 100644 index 0000000000000000000000000000000000000000..873ca0785aa6a4f421253daa7c529e13e017f34e --- /dev/null +++ b/pytorch/cifar10_distributed/train_model.py @@ -0,0 +1,189 @@ +import argparse +import os, sys +import time +import typing + +import torch +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +from torchvision.models import resnet50 +from torchvision.datasets import CIFAR10 +import torchvision.transforms as transforms +from torch.utils.data import DistributedSampler, DataLoader + +def parse_command_line(): + parser = argparse.ArgumentParser() + parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda") + parser.add_argument("--num_epochs", required=False, type=int, default=2) + parser.add_argument("--batch_size", required=False, type=int, default=128) + parser.add_argument("--num_workers", required=False, type=int, default=1) + parser.add_argument("--distributed", required=False, action="store_true", default=False) + args = parser.parse_args() + + # default args for distributed + args.world_size = 1 + args.world_rank = 0 + args.local_rank = 0 + + return args + +def load_dataset(args): + # standardization values for CIFAR10 dataset + mean = (0.4919, 0.4827, 0.4472) + std = (0.2022, 0.1994, 0.2010) + + # define the following transformations + # - resize to 224x224 + # - transform back to tensor + # - normalize data using mean and std from above + trans = transforms.Compose([ + transforms.Resize(224), + transforms.ToTensor(), + transforms.Normalize(mean, std) + ]) + + # load CIFAR10 dataset splits for train and test and apply the transformations + # note: you need to download the dataset once at the beginning + ds_train = CIFAR10("datasets", train=True, download=True, transform=trans) + ds_test = CIFAR10("datasets", train=False, download=False, transform=trans) + + # define distributed samplers (only for distributed version) + sampler_train, sampler_test = None, None + if args.distributed: + sampler_train = DistributedSampler(dataset=ds_train, shuffle=True, + num_replicas=args.world_size, rank=args.world_rank) + sampler_test = DistributedSampler(dataset=ds_test, shuffle=False, + num_replicas=args.world_size, rank=args.world_rank) + + # finally create separate data loaders for train and test with the following common arguments + common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True} + loader_train = DataLoader(ds_train, sampler=sampler_train, **(common_kwargs)) + loader_test = DataLoader(ds_test, sampler=sampler_test, **(common_kwargs)) + + return loader_train, loader_test + +def train(args, model, loader_train, optimizer, epoch): + # use a CrossEntropyLoss loss function + loss_func = torch.nn.CrossEntropyLoss() + + # set model into train mode + model.train() + + # track accuracy for complete epoch + total, correct = 0, 0 + total_steps = len(loader_train) + + elapsed_time = time.time() + for i, (x_batch, y_batch) in enumerate(loader_train): + # transfer data to the device + x_batch = x_batch.to(args.device, non_blocking=True) + y_batch = y_batch.to(args.device, non_blocking=True) + + # run forward pass + y_pred = model(x_batch) + + # calculate loss + 
loss = loss_func(y_pred, y_batch) + + # run backward pass and optimizer to update weights + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # track training accuracy + _, predicted = y_pred.max(1) + total += y_batch.size(0) + correct += predicted.eq(y_batch).sum().item() + + if args.world_rank == 0 and i % 20 == 0: + print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}") + sys.stdout.flush() + elapsed_time = time.time() - elapsed_time + + if args.world_rank == 0: + print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}") + sys.stdout.flush() + +def test(args, model, loader_test, epoch): + # set model into evaluation mode + model.eval() + + with torch.no_grad(): + correct, total = 0, 0 + for i, (x_batch, y_batch) in enumerate(loader_test): + # transfer data to the device + x_batch = x_batch.to(args.device, non_blocking=True) + y_batch = y_batch.to(args.device, non_blocking=True) + + # predict class + outputs = model(x_batch) + _, predicted = outputs.max(1) + + # track test accuracy + total += y_batch.size(0) + correct += (predicted == y_batch).sum().item() + + if args.world_rank == 0: + print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}") + sys.stdout.flush() + +def setup(args) -> None: + + if args.distributed: + args.world_size = int(os.environ['WORLD_SIZE']) + args.world_rank = int(os.environ['RANK']) + args.local_rank = int(os.environ['LOCAL_RANK']) + # initialize process group and wait for completion (only for distributed version) + dist.init_process_group(backend='nccl', init_method="env://", world_size=args.world_size, rank=args.world_rank) + dist.barrier() + + # set gpu device on local machine + if args.device == 'cuda': + if args.distributed: + # assign GPUs to processes on the local system (only for distributed version) + torch.cuda.set_device(args.local_rank) + args.device = torch.device(f'cuda:{args.local_rank}') + # optimization hint for torch runtime + torch.backends.cudnn.benchmark = True + + if args.world_rank == 0: + print("Current configuration:") + for arg in vars(args): + print(f" --{arg}, {getattr(args, arg)}") + +def cleanup(args: typing.Dict[str, typing.Any]): + if args.distributed: + # wait for processes and destory group again (only for distributed version) + dist.barrier() + dist.destroy_process_group() + +def main(): + # parse command line arguments + args = parse_command_line() + + # run setup (e.g., create distributed environment if desired) + setup(args) + + # get data loaders for train and test split + loader_train, loader_test = load_dataset(args) + + # create resnet50 with random weights + model = resnet50().to(args.device) + if args.distributed: + # use DDP to wrap model (only for distributed version) + dist.barrier() + model = DDP(model, device_ids=[args.local_rank]) + + # initialize optimizer with model parameters + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + + # train and test model for configured number of epochs + for epoch in range(args.num_epochs): + train(args, model, loader_train, optimizer, epoch) + test (args, model, loader_test, epoch) + + # cleaup env + cleanup(args) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pytorch/mnist/submit_job_container.sh b/pytorch/mnist/submit_job_container.sh new file mode 100644 index 0000000000000000000000000000000000000000..adbb8987b021adf0754e36eecd197a6c82471bcb --- /dev/null +++ b/pytorch/mnist/submit_job_container.sh @@ -0,0 +1,36 @@ +#!/usr/bin/zsh 
+############################################################ +### Slurm flags +############################################################ + +#SBATCH --time=00:15:00 +#SBATCH --partition=c23g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=24 +#SBATCH --gres=gpu:1 + +############################################################ +### Load modules or software +############################################################ + +# load module for PyTorch container +module load PyTorch/nvcr-24.01-py3 +module list + +############################################################ +### Parameters and Settings +############################################################ + +# print some information about current system +echo "Job nodes: ${SLURM_JOB_NODELIST}" +echo "Current machine: $(hostname)" +nvidia-smi + +############################################################ +### Execution (Model Training) +############################################################ + +# run the python script inside the container +apptainer exec -e --nv ${PYTORCH_IMAGE} \ + bash -c "python train_model.py" \ No newline at end of file diff --git a/pytorch/mnist/submit_job_utilization_monitoring.sh b/pytorch/mnist/submit_job_utilization_monitoring.sh new file mode 100644 index 0000000000000000000000000000000000000000..db7c1d7a199f8e5b96b3e573b2b6035f1c7f0b74 --- /dev/null +++ b/pytorch/mnist/submit_job_utilization_monitoring.sh @@ -0,0 +1,53 @@ +#!/usr/bin/zsh +############################################################ +### Slurm flags +############################################################ + +#SBATCH --time=00:15:00 +#SBATCH --partition=c23g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=24 +#SBATCH --gres=gpu:1 + +############################################################ +### Load modules or software +############################################################ + +# load module for PyTorch container +module load PyTorch/nvcr-24.01-py3 +module list + +############################################################ +### Parameters and Settings +############################################################ + +# print some information about current system +echo "Job nodes: ${SLURM_JOB_NODELIST}" +echo "Current machine: $(hostname)" +nvidia-smi + +# specify that utilization monitoring via nvidia-smi should be done +export ENABLE_MONITORING=1 + +############################################################ +### Execution (Model Training) +############################################################ + +if [ "${ENABLE_MONITORING:-0}" = "1" ]; then + # start monitoring process in the background (every 2 sec) + nvidia-smi --query-gpu=timestamp,index,compute_mode,pstate,utilization.gpu,utilization.memory,memory.used,temperature.gpu,power.draw \ + --format=csv --loop=2 &> gpu_monitoring_${SLURM_JOBID}.txt & + # remember ID of process that has just been started in background + export proc_id_monitor=$! 
+fi + +# run the python script inside the container +apptainer exec -e --nv ${PYTORCH_IMAGE} \ + bash -c "python train_model.py" + +if [ "${ENABLE_MONITORING:-0}" = "1" ]; then + # end monitoring process again + kill -2 ${proc_id_monitor} + sleep 5 +fi \ No newline at end of file diff --git a/pytorch/mnist/submit_job_venv.sh b/pytorch/mnist/submit_job_venv.sh new file mode 100644 index 0000000000000000000000000000000000000000..982188fdb77b6fa78b3686cfb4fc04ef8b1ccc95 --- /dev/null +++ b/pytorch/mnist/submit_job_venv.sh @@ -0,0 +1,33 @@ +#!/usr/bin/zsh +############################################################ +### Slurm flags +############################################################ + +#SBATCH --time=00:15:00 +#SBATCH --partition=c23g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=24 +#SBATCH --gres=gpu:1 + +############################################################ +### Load modules or software +############################################################ + +# TODO: activate your desired virtual environment + +############################################################ +### Parameters and Settings +############################################################ + +# print some information about current system +echo "Job nodes: ${SLURM_JOB_NODELIST}" +echo "Current machine: $(hostname)" +nvidia-smi + +############################################################ +### Execution (Model Training) +############################################################ + +# run the python script +python train_model.py \ No newline at end of file diff --git a/pytorch/mnist/train_model.py b/pytorch/mnist/train_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8c60aff314a3df0921d1d766efbda789cf382354 --- /dev/null +++ b/pytorch/mnist/train_model.py @@ -0,0 +1,170 @@ +import argparse +import os, sys +import time +import typing + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision.datasets import MNIST +import torchvision.transforms as transforms +from torch.utils.data import DataLoader + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + +def parse_command_line(): + parser = argparse.ArgumentParser() + parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda") + parser.add_argument("--num_epochs", required=False, type=int, default=2) + parser.add_argument("--batch_size", required=False, type=int, default=128) + parser.add_argument("--num_workers", required=False, type=int, default=1) + args = parser.parse_args() + return args + +def load_dataset(args): + # define the following transformations + # - transform back to tensor + # - normalize data using mean and std + trans = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + # load MNIST dataset splits for train and test and apply the transformations + # note: you need to download the dataset once at the beginning + ds_train = MNIST("datasets", 
train=True, download=True, transform=trans) + ds_test = MNIST("datasets", train=False, download=False, transform=trans) + + # finally create separate data loaders for train and test with the following common arguments + common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True} + loader_train = DataLoader(ds_train, **(common_kwargs)) + loader_test = DataLoader(ds_test, **(common_kwargs)) + + return loader_train, loader_test + +def train(args, model, loader_train, optimizer, epoch): + # use a CrossEntropyLoss loss function + loss_func = torch.nn.CrossEntropyLoss() + + # set model into train mode + model.train() + + # track accuracy for complete epoch + total, correct = 0, 0 + total_steps = len(loader_train) + + elapsed_time = time.time() + for i, (x_batch, y_batch) in enumerate(loader_train): + # transfer data to the device + x_batch = x_batch.to(args.device, non_blocking=True) + y_batch = y_batch.to(args.device, non_blocking=True) + + # run forward pass + y_pred = model(x_batch) + + # calculate loss + loss = loss_func(y_pred, y_batch) + + # run backward pass and optimizer to update weights + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # track training accuracy + _, predicted = y_pred.max(1) + total += y_batch.size(0) + correct += predicted.eq(y_batch).sum().item() + + if i % 20 == 0: + print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}") + sys.stdout.flush() + elapsed_time = time.time() - elapsed_time + + print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}") + sys.stdout.flush() + +def test(args, model, loader_test, epoch): + # set model into evaluation mode + model.eval() + + with torch.no_grad(): + correct, total = 0, 0 + for i, (x_batch, y_batch) in enumerate(loader_test): + # transfer data to the device + x_batch = x_batch.to(args.device, non_blocking=True) + y_batch = y_batch.to(args.device, non_blocking=True) + + # predict class + outputs = model(x_batch) + _, predicted = outputs.max(1) + + # track test accuracy + total += y_batch.size(0) + correct += (predicted == y_batch).sum().item() + + print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}") + sys.stdout.flush() + +def setup(args) -> None: + + # set gpu device on local machine + if args.device == 'cuda': + # optimization hint for torch runtime + torch.backends.cudnn.benchmark = True + + print("Current configuration:") + for arg in vars(args): + print(f" --{arg}, {getattr(args, arg)}") + +def cleanup(args: typing.Dict[str, typing.Any]): + pass + +def main(): + # parse command line arguments + args = parse_command_line() + + # run setup (e.g., create distributed environment if desired) + setup(args) + + # get data loaders for train and test split + loader_train, loader_test = load_dataset(args) + + # create model with random weights + model = Net().to(args.device) + + # initialize optimizer with model parameters + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + + # train and test model for configured number of epochs + for epoch in range(args.num_epochs): + train(args, model, loader_train, optimizer, epoch) + test (args, model, loader_test, epoch) + + # cleaup env + cleanup(args) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pytorch/mnist_distributed/set_vars.sh b/pytorch/mnist_distributed/set_vars.sh new file mode 100644 index 0000000000000000000000000000000000000000..7b1b74a86d70bc5e4218d9c5b5f4817a2867e44a --- /dev/null +++ 
b/pytorch/mnist_distributed/set_vars.sh
@@ -0,0 +1,11 @@
+#!/usr/local_rwth/bin/zsh
+
+export RANK=${SLURM_PROCID}
+export LOCAL_RANK=${SLURM_LOCALID}
+export WORLD_SIZE=${SLURM_NTASKS}
+
+export APPTAINERENV_MASTER_ADDR=${MASTER_ADDR}
+export APPTAINERENV_MASTER_PORT=${MASTER_PORT}
+export APPTAINERENV_RANK=${RANK}
+export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
+export APPTAINERENV_WORLD_SIZE=${WORLD_SIZE}
\ No newline at end of file
diff --git a/pytorch/mnist_distributed/submit_job_container.sh b/pytorch/mnist_distributed/submit_job_container.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1fcf497edf5e944d2060c619556adffbed34a448
--- /dev/null
+++ b/pytorch/mnist_distributed/submit_job_container.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for PyTorch container
+module load PyTorch/nvcr-24.01-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# variables required for distributed runs with DDP
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=12340   # if this port is already in use, change to a different one
+export NCCL_DEBUG=INFO
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# each process sets required environment variables and
+# runs the python script inside the container
+srun zsh -c '\
+    source set_vars.sh && \
+    apptainer exec -e --nv ${PYTORCH_IMAGE} \
+        bash -c "python train_model.py --distributed"'
\ No newline at end of file
diff --git a/pytorch/mnist_distributed/submit_job_venv.sh b/pytorch/mnist_distributed/submit_job_venv.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ff2fe9252d20c0440cf697b33541609e5113f9ec
--- /dev/null
+++ b/pytorch/mnist_distributed/submit_job_venv.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: activate your desired virtual environment
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# variables required for distributed runs with DDP
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=12340   # if this port is already in use, change to a different one
+export NCCL_DEBUG=INFO
+
+############################################################
+### Execution
(Model Training) +############################################################ + +# each process sets required environment variables and +# runs the python script +srun zsh -c '\ + source set_vars.sh && \ + python train_model.py --distributed' \ No newline at end of file diff --git a/pytorch/mnist_distributed/train_model.py b/pytorch/mnist_distributed/train_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ceeb81f8ec6e189a143598cd4453feee1b7b7517 --- /dev/null +++ b/pytorch/mnist_distributed/train_model.py @@ -0,0 +1,210 @@ +import argparse +import os, sys +import time +import typing + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP +from torchvision.models import resnet50 +from torchvision.datasets import MNIST +import torchvision.transforms as transforms +from torch.utils.data import DistributedSampler, DataLoader + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + +def parse_command_line(): + parser = argparse.ArgumentParser() + parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda") + parser.add_argument("--num_epochs", required=False, type=int, default=2) + parser.add_argument("--batch_size", required=False, type=int, default=128) + parser.add_argument("--num_workers", required=False, type=int, default=1) + parser.add_argument("--distributed", required=False, action="store_true", default=False) + args = parser.parse_args() + + # default args for distributed + args.world_size = 1 + args.world_rank = 0 + args.local_rank = 0 + + return args + +def load_dataset(args): + # define the following transformations + # - transform back to tensor + # - normalize data using mean and std + trans = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + # load MNIST dataset splits for train and test and apply the transformations + # note: you need to download the dataset once at the beginning + ds_train = MNIST("datasets", train=True, download=True, transform=trans) + ds_test = MNIST("datasets", train=False, download=False, transform=trans) + + # define distributed samplers (only for distributed version) + sampler_train, sampler_test = None, None + if args.distributed: + sampler_train = DistributedSampler(dataset=ds_train, shuffle=True, + num_replicas=args.world_size, rank=args.world_rank) + sampler_test = DistributedSampler(dataset=ds_test, shuffle=False, + num_replicas=args.world_size, rank=args.world_rank) + + # finally create separate data loaders for train and test with the following common arguments + common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True} + loader_train = DataLoader(ds_train, sampler=sampler_train, **(common_kwargs)) + loader_test = DataLoader(ds_test, sampler=sampler_test, **(common_kwargs)) + + return loader_train, loader_test + +def train(args, model, loader_train, 
optimizer, epoch): + # use a CrossEntropyLoss loss function + loss_func = torch.nn.CrossEntropyLoss() + + # set model into train mode + model.train() + + # track accuracy for complete epoch + total, correct = 0, 0 + total_steps = len(loader_train) + + elapsed_time = time.time() + for i, (x_batch, y_batch) in enumerate(loader_train): + # transfer data to the device + x_batch = x_batch.to(args.device, non_blocking=True) + y_batch = y_batch.to(args.device, non_blocking=True) + + # run forward pass + y_pred = model(x_batch) + + # calculate loss + loss = loss_func(y_pred, y_batch) + + # run backward pass and optimizer to update weights + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # track training accuracy + _, predicted = y_pred.max(1) + total += y_batch.size(0) + correct += predicted.eq(y_batch).sum().item() + + if args.world_rank == 0 and i % 20 == 0: + print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}") + sys.stdout.flush() + elapsed_time = time.time() - elapsed_time + + if args.world_rank == 0: + print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}") + sys.stdout.flush() + +def test(args, model, loader_test, epoch): + # set model into evaluation mode + model.eval() + + with torch.no_grad(): + correct, total = 0, 0 + for i, (x_batch, y_batch) in enumerate(loader_test): + # transfer data to the device + x_batch = x_batch.to(args.device, non_blocking=True) + y_batch = y_batch.to(args.device, non_blocking=True) + + # predict class + outputs = model(x_batch) + _, predicted = outputs.max(1) + + # track test accuracy + total += y_batch.size(0) + correct += (predicted == y_batch).sum().item() + + if args.world_rank == 0: + print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}") + sys.stdout.flush() + +def setup(args) -> None: + + if args.distributed: + args.world_size = int(os.environ['WORLD_SIZE']) + args.world_rank = int(os.environ['RANK']) + args.local_rank = int(os.environ['LOCAL_RANK']) + # initialize process group and wait for completion (only for distributed version) + dist.init_process_group(backend='nccl', init_method="env://", world_size=args.world_size, rank=args.world_rank) + dist.barrier() + + # set gpu device on local machine + if args.device == 'cuda': + if args.distributed: + # assign GPUs to processes on the local system (only for distributed version) + torch.cuda.set_device(args.local_rank) + args.device = torch.device(f'cuda:{args.local_rank}') + # optimization hint for torch runtime + torch.backends.cudnn.benchmark = True + + if args.world_rank == 0: + print("Current configuration:") + for arg in vars(args): + print(f" --{arg}, {getattr(args, arg)}") + +def cleanup(args: typing.Dict[str, typing.Any]): + if args.distributed: + # wait for processes and destory group again (only for distributed version) + dist.barrier() + dist.destroy_process_group() + +def main(): + # parse command line arguments + args = parse_command_line() + + # run setup (e.g., create distributed environment if desired) + setup(args) + + # get data loaders for train and test split + loader_train, loader_test = load_dataset(args) + + # create model with random weights + model = Net().to(args.device) + if args.distributed: + # use DDP to wrap model (only for distributed version) + dist.barrier() + model = DDP(model, device_ids=[args.local_rank]) + + # initialize optimizer with model parameters + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + + # train and test model for configured 
number of epochs
+    for epoch in range(args.num_epochs):
+        train(args, model, loader_train, optimizer, epoch)
+        test (args, model, loader_test, epoch)
+
+    # clean up environment
+    cleanup(args)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/slurm/basic_mpi.sh b/slurm/basic_mpi.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b1bc9cbd9335b3f5385caa4211a2cec65465f03d
--- /dev/null
+++ b/slurm/basic_mpi.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --ntasks=8               # Ask for 8 MPI tasks
+#SBATCH --time=00:15:00          # Run time of 15 minutes
+#SBATCH --job-name=example_job   # Sets the job name
+#SBATCH --output=stdout_%j.txt   # Redirects stdout and stderr to stdout_<jobid>.txt
+#SBATCH --account=<project-id>   # Insert your project-id or delete this line
+
+############################################################
+### Execution / Commands
+############################################################
+srun hostname
\ No newline at end of file
diff --git a/slurm/beeond.sh b/slurm/beeond.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6afac44883645f43afeb9b3a2e44fbb953a04d22
--- /dev/null
+++ b/slurm/beeond.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+# request Beeond
+#SBATCH --beeond
+
+# specify other Slurm commands
+#SBATCH --time=01:00:00
+
+############################################################
+### Execution / Commands
+############################################################
+
+# copy files to Beeond
+cp -r $WORK/yourfiles $BEEOND
+
+# navigate to Beeond
+cd $BEEOND/yourfiles
+
+# perform your job, which has high I/O metadata and bandwidth demands
+echo "hello world" > result
+
+# afterwards: copy results back to persistent storage
+cp -r $BEEOND/yourfiles/result $WORK/yourfiles/
\ No newline at end of file
diff --git a/tensorflow/.gitkeep b/tensorflow/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391