From 43473322d24d790b877c49c3a75f674a437b51e5 Mon Sep 17 00:00:00 2001
From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de>
Date: Wed, 30 Oct 2024 16:36:54 +0100
Subject: [PATCH] transferred existing examples

---
 basic_mpi.sh                                  |  11 -
 pytorch/cifar10/submit_job_container.sh       |  36 +++
 .../submit_job_utilization_monitoring.sh      |  53 +++++
 pytorch/cifar10/submit_job_venv.sh            |  33 +++
 pytorch/cifar10/train_model.py                | 150 +++++++++++++
 pytorch/cifar10_distributed/set_vars.sh       |  11 +
 .../submit_job_container.sh                   |  44 ++++
 .../cifar10_distributed/submit_job_venv.sh    |  41 ++++
 pytorch/cifar10_distributed/train_model.py    | 189 ++++++++++++++++
 pytorch/mnist/submit_job_container.sh         |  36 +++
 .../submit_job_utilization_monitoring.sh      |  53 +++++
 pytorch/mnist/submit_job_venv.sh              |  33 +++
 pytorch/mnist/train_model.py                  | 170 ++++++++++++++
 pytorch/mnist_distributed/set_vars.sh         |  11 +
 .../mnist_distributed/submit_job_container.sh |  44 ++++
 pytorch/mnist_distributed/submit_job_venv.sh  |  41 ++++
 pytorch/mnist_distributed/train_model.py      | 210 ++++++++++++++++++
 slurm/basic_mpi.sh                            |  15 ++
 slurm/beeond.sh                               |  26 +++
 tensorflow/.gitkeep                           |   0
 20 files changed, 1196 insertions(+), 11 deletions(-)
 delete mode 100644 basic_mpi.sh
 create mode 100644 pytorch/cifar10/submit_job_container.sh
 create mode 100644 pytorch/cifar10/submit_job_utilization_monitoring.sh
 create mode 100644 pytorch/cifar10/submit_job_venv.sh
 create mode 100644 pytorch/cifar10/train_model.py
 create mode 100644 pytorch/cifar10_distributed/set_vars.sh
 create mode 100644 pytorch/cifar10_distributed/submit_job_container.sh
 create mode 100644 pytorch/cifar10_distributed/submit_job_venv.sh
 create mode 100644 pytorch/cifar10_distributed/train_model.py
 create mode 100644 pytorch/mnist/submit_job_container.sh
 create mode 100644 pytorch/mnist/submit_job_utilization_monitoring.sh
 create mode 100644 pytorch/mnist/submit_job_venv.sh
 create mode 100644 pytorch/mnist/train_model.py
 create mode 100644 pytorch/mnist_distributed/set_vars.sh
 create mode 100644 pytorch/mnist_distributed/submit_job_container.sh
 create mode 100644 pytorch/mnist_distributed/submit_job_venv.sh
 create mode 100644 pytorch/mnist_distributed/train_model.py
 create mode 100644 slurm/basic_mpi.sh
 create mode 100644 slurm/beeond.sh
 create mode 100644 tensorflow/.gitkeep

diff --git a/basic_mpi.sh b/basic_mpi.sh
deleted file mode 100644
index 4f0e422..0000000
--- a/basic_mpi.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/zsh 
-
-### Job Parameters 
-#SBATCH --ntasks=8              # Ask for 8 MPI tasks
-#SBATCH --time=00:15:00         # Run time of 15 minutes
-#SBATCH --job-name=example_job  # Sets the job name
-#SBATCH --output=stdout.txt     # redirects stdout and stderr to stdout.txt
-#SBATCH --account=<project-id>  # Insertg your project-id or delete this line
-
-### Program Code
-srun hostname
\ No newline at end of file
diff --git a/pytorch/cifar10/submit_job_container.sh b/pytorch/cifar10/submit_job_container.sh
new file mode 100644
index 0000000..adbb898
--- /dev/null
+++ b/pytorch/cifar10/submit_job_container.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for PyTorch container
+module load PyTorch/nvcr-24.01-py3
+module list
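+# note: ${PYTORCH_IMAGE} (used below) is assumed to be set by the loaded PyTorch module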
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# run the python script inside the container
+apptainer exec -e --nv ${PYTORCH_IMAGE} \
+    bash -c "python train_model.py"
\ No newline at end of file
diff --git a/pytorch/cifar10/submit_job_utilization_monitoring.sh b/pytorch/cifar10/submit_job_utilization_monitoring.sh
new file mode 100644
index 0000000..db7c1d7
--- /dev/null
+++ b/pytorch/cifar10/submit_job_utilization_monitoring.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for PyTorch container
+module load PyTorch/nvcr-24.01-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# specify that utilization monitoring via nvidia-smi should be done
+export ENABLE_MONITORING=1
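+# set ENABLE_MONITORING=0 (or remove the export) to skip the GPU monitoring below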
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+if [ "${ENABLE_MONITORING:-0}" = "1" ]; then
+    # start monitoring process in the background (every 2 sec)
+    nvidia-smi --query-gpu=timestamp,index,compute_mode,pstate,utilization.gpu,utilization.memory,memory.used,temperature.gpu,power.draw \
+        --format=csv --loop=2 &> gpu_monitoring_${SLURM_JOBID}.txt &
+    # remember ID of process that has just been started in background
+    export proc_id_monitor=$!
+fi
+
+# run the python script inside the container
+apptainer exec -e --nv ${PYTORCH_IMAGE} \
+    bash -c "python train_model.py"
+
+if [ "${ENABLE_MONITORING:-0}" = "1" ]; then
+    # stop the monitoring process
+    kill -2 ${proc_id_monitor}
+    sleep 5
+fi
\ No newline at end of file
diff --git a/pytorch/cifar10/submit_job_venv.sh b/pytorch/cifar10/submit_job_venv.sh
new file mode 100644
index 0000000..982188f
--- /dev/null
+++ b/pytorch/cifar10/submit_job_venv.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: activate your desired virtual environment
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# run the python script
+python train_model.py
\ No newline at end of file
diff --git a/pytorch/cifar10/train_model.py b/pytorch/cifar10/train_model.py
new file mode 100644
index 0000000..7a86f61
--- /dev/null
+++ b/pytorch/cifar10/train_model.py
@@ -0,0 +1,150 @@
+import argparse
+import os, sys
+import time
+import typing
+
+import torch
+from torchvision.models import resnet50
+from torchvision.datasets import CIFAR10
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader
+
+def parse_command_line():
+    parser = argparse.ArgumentParser()    
+    parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda")
+    parser.add_argument("--num_epochs", required=False, type=int, default=2)
+    parser.add_argument("--batch_size", required=False, type=int, default=128)
+    parser.add_argument("--num_workers", required=False, type=int, default=1)
+    args = parser.parse_args()
+    return args
+
+def load_dataset(args):
+    # standardization values for CIFAR10 dataset
+    mean = (0.4919, 0.4827, 0.4472)
+    std = (0.2022, 0.1994, 0.2010)
+    
+    # define the following transformations
+    # - resize to 224x224
+    # - convert to a tensor
+    # - normalize data using mean and std from above
+    trans = transforms.Compose([
+        transforms.Resize(224),
+        transforms.ToTensor(),
+        transforms.Normalize(mean, std)
+    ])
+    
+    # load CIFAR10 dataset splits for train and test and apply the transformations
+    # note: you need to download the dataset once at the beginning
+    ds_train = CIFAR10("datasets", train=True,  download=True,  transform=trans)
+    ds_test  = CIFAR10("datasets", train=False, download=False, transform=trans)
+    
+    # finally create separate data loaders for train and test with the following common arguments
+    common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True}
+    loader_train = DataLoader(ds_train, **(common_kwargs))
+    loader_test  = DataLoader(ds_test,  **(common_kwargs))
+    
+    return loader_train, loader_test
+
+def train(args, model, loader_train, optimizer, epoch):
+    # use a CrossEntropyLoss loss function
+    loss_func = torch.nn.CrossEntropyLoss()
+
+    # set model into train mode
+    model.train()
+    
+    # track accuracy for complete epoch
+    total, correct = 0, 0
+    total_steps = len(loader_train)
+    
+    elapsed_time = time.time()
+    for i, (x_batch, y_batch) in enumerate(loader_train):
+        # transfer data to the device
+        x_batch = x_batch.to(args.device, non_blocking=True)
+        y_batch = y_batch.to(args.device, non_blocking=True)
+        
+        # run forward pass
+        y_pred = model(x_batch)
+        
+        # calculate loss
+        loss = loss_func(y_pred, y_batch)
+        
+        # run backward pass and optimizer to update weights
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        
+        # track training accuracy
+        _, predicted = y_pred.max(1)
+        total += y_batch.size(0)
+        correct += predicted.eq(y_batch).sum().item()
+        
+        if i % 20 == 0:
+            print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}")
+            sys.stdout.flush()
+    elapsed_time = time.time() - elapsed_time
+
+    print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}")
+    sys.stdout.flush()
+
+def test(args, model, loader_test, epoch):
+    # set model into evaluation mode
+    model.eval()
+    
+    with torch.no_grad():
+        correct, total = 0, 0
+        for i, (x_batch, y_batch) in enumerate(loader_test):
+            # transfer data to the device
+            x_batch = x_batch.to(args.device, non_blocking=True)
+            y_batch = y_batch.to(args.device, non_blocking=True)
+            
+            # predict class
+            outputs = model(x_batch)
+            _, predicted = outputs.max(1)
+            
+            # track test accuracy
+            total += y_batch.size(0)
+            correct += (predicted == y_batch).sum().item()
+        
+        print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}")
+        sys.stdout.flush()
+
+def setup(args) -> None:
+
+    # set gpu device on local machine
+    if args.device == 'cuda':
+        # optimization hint for torch runtime
+        torch.backends.cudnn.benchmark = True
+
+    print("Current configuration:")
+    for arg in vars(args):
+        print(f"  --{arg}, {getattr(args, arg)}")
+
+def cleanup(args: argparse.Namespace):
+    pass
+
+def main():
+    # parse command line arguments
+    args = parse_command_line()
+
+    # run setup (e.g., create distributed environment if desired)
+    setup(args)
+    
+    # get data loaders for train and test split
+    loader_train, loader_test = load_dataset(args)
+    
+    # create resnet50 with random weights
+    model = resnet50().to(args.device)
+
+    # initialize optimizer with model parameters
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+
+    # train and test model for configured number of epochs
+    for epoch in range(args.num_epochs):
+        train(args, model, loader_train, optimizer, epoch)
+        test (args, model, loader_test, epoch)
+
+    # clean up environment
+    cleanup(args)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/pytorch/cifar10_distributed/set_vars.sh b/pytorch/cifar10_distributed/set_vars.sh
new file mode 100644
index 0000000..7b1b74a
--- /dev/null
+++ b/pytorch/cifar10_distributed/set_vars.sh
@@ -0,0 +1,11 @@
+#!/usr/local_rwth/bin/zsh
+
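+# map the Slurm-provided rank information to the environment variables
+# expected by torch.distributed (env:// initialization)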
+export RANK=${SLURM_PROCID}
+export LOCAL_RANK=${SLURM_LOCALID}
+export WORLD_SIZE=${SLURM_NTASKS}
+
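+# the APPTAINERENV_ prefix makes Apptainer pass these variables into the container environment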
+export APPTAINERENV_MASTER_ADDR=${MASTER_ADDR}
+export APPTAINERENV_MASTER_PORT=${MASTER_PORT}
+export APPTAINERENV_RANK=${RANK}
+export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
+export APPTAINERENV_WORLD_SIZE=${WORLD_SIZE}
\ No newline at end of file
diff --git a/pytorch/cifar10_distributed/submit_job_container.sh b/pytorch/cifar10_distributed/submit_job_container.sh
new file mode 100644
index 0000000..1fcf497
--- /dev/null
+++ b/pytorch/cifar10_distributed/submit_job_container.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for PyTorch container
+module load PyTorch/nvcr-24.01-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# variables required for distributed runs with DDP
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=12340 # if this port is already in use, change to a different one
+export NCCL_DEBUG=INFO
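+# MASTER_ADDR and MASTER_PORT are read by torch.distributed's env:// initialization in train_model.py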
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# each process sets required environment variables and
+# runs the python script inside the container
+srun zsh -c '\
+    source set_vars.sh && \
+    apptainer exec -e --nv ${PYTORCH_IMAGE} \
+        bash -c "python train_model.py --distributed"'
\ No newline at end of file
diff --git a/pytorch/cifar10_distributed/submit_job_venv.sh b/pytorch/cifar10_distributed/submit_job_venv.sh
new file mode 100644
index 0000000..ff2fe92
--- /dev/null
+++ b/pytorch/cifar10_distributed/submit_job_venv.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: activate your desired virtual environment
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# variables required for distributed runs with DDP
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=12340 # if this port is already in use, change to a different one
+export NCCL_DEBUG=INFO
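+# MASTER_ADDR and MASTER_PORT are read by torch.distributed's env:// initialization in train_model.py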
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# each process sets required environment variables and
+# runs the python script
+srun zsh -c '\
+    source set_vars.sh && \
+    python train_model.py --distributed'
\ No newline at end of file
diff --git a/pytorch/cifar10_distributed/train_model.py b/pytorch/cifar10_distributed/train_model.py
new file mode 100644
index 0000000..873ca07
--- /dev/null
+++ b/pytorch/cifar10_distributed/train_model.py
@@ -0,0 +1,189 @@
+import argparse
+import os, sys
+import time
+import typing
+
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torchvision.models import resnet50
+from torchvision.datasets import CIFAR10
+import torchvision.transforms as transforms
+from torch.utils.data import DistributedSampler, DataLoader
+
+def parse_command_line():
+    parser = argparse.ArgumentParser()    
+    parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda")
+    parser.add_argument("--num_epochs", required=False, type=int, default=2)
+    parser.add_argument("--batch_size", required=False, type=int, default=128)
+    parser.add_argument("--num_workers", required=False, type=int, default=1)
+    parser.add_argument("--distributed", required=False, action="store_true", default=False)
+    args = parser.parse_args()
+    
+    # default args for distributed
+    args.world_size = 1
+    args.world_rank = 0
+    args.local_rank = 0
+   
+    return args
+
+def load_dataset(args):
+    # standardization values for CIFAR10 dataset
+    mean = (0.4919, 0.4827, 0.4472)
+    std = (0.2022, 0.1994, 0.2010)
+    
+    # define the following transformations
+    # - resize to 224x224
+    # - convert to a tensor
+    # - normalize data using mean and std from above
+    trans = transforms.Compose([
+        transforms.Resize(224),
+        transforms.ToTensor(),
+        transforms.Normalize(mean, std)
+    ])
+    
+    # load CIFAR10 dataset splits for train and test and apply the transformations
+    # note: you need to download the dataset once at the beginning
+    ds_train = CIFAR10("datasets", train=True,  download=True,  transform=trans)
+    ds_test  = CIFAR10("datasets", train=False, download=False, transform=trans)
+    
+    # define distributed samplers (only for distributed version)
+    sampler_train, sampler_test = None, None
+    if args.distributed:
+        sampler_train = DistributedSampler(dataset=ds_train, shuffle=True,
+                                           num_replicas=args.world_size, rank=args.world_rank)
+        sampler_test  = DistributedSampler(dataset=ds_test, shuffle=False,
+                                           num_replicas=args.world_size, rank=args.world_rank)
+    
+    # finally create separate data loaders for train and test with the following common arguments
+    common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True}
+    loader_train = DataLoader(ds_train, sampler=sampler_train, **(common_kwargs))
+    loader_test  = DataLoader(ds_test,  sampler=sampler_test, **(common_kwargs))
+    
+    return loader_train, loader_test
+
+def train(args, model, loader_train, optimizer, epoch):
+    # use a CrossEntropyLoss loss function
+    loss_func = torch.nn.CrossEntropyLoss()
+
+    # set model into train mode
+    model.train()
+    
+    # track accuracy for complete epoch
+    total, correct = 0, 0
+    total_steps = len(loader_train)
+    
+    elapsed_time = time.time()
+    for i, (x_batch, y_batch) in enumerate(loader_train):
+        # transfer data to the device
+        x_batch = x_batch.to(args.device, non_blocking=True)
+        y_batch = y_batch.to(args.device, non_blocking=True)
+        
+        # run forward pass
+        y_pred = model(x_batch)
+        
+        # calculate loss
+        loss = loss_func(y_pred, y_batch)
+        
+        # run backward pass and optimizer to update weights
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        
+        # track training accuracy
+        _, predicted = y_pred.max(1)
+        total += y_batch.size(0)
+        correct += predicted.eq(y_batch).sum().item()
+        
+        if args.world_rank == 0 and i % 20 == 0:
+            print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}")
+            sys.stdout.flush()
+    elapsed_time = time.time() - elapsed_time
+
+    if args.world_rank == 0:
+        print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}")
+        sys.stdout.flush()
+
+def test(args, model, loader_test, epoch):
+    # set model into evaluation mode
+    model.eval()
+    
+    with torch.no_grad():
+        correct, total = 0, 0
+        for i, (x_batch, y_batch) in enumerate(loader_test):
+            # transfer data to the device
+            x_batch = x_batch.to(args.device, non_blocking=True)
+            y_batch = y_batch.to(args.device, non_blocking=True)
+            
+            # predict class
+            outputs = model(x_batch)
+            _, predicted = outputs.max(1)
+            
+            # track test accuracy
+            total += y_batch.size(0)
+            correct += (predicted == y_batch).sum().item()
+        
+        if args.world_rank == 0:
+            print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}")
+            sys.stdout.flush()
+
+def setup(args) -> None:
+
+    if args.distributed:
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.world_rank = int(os.environ['RANK'])
+        args.local_rank = int(os.environ['LOCAL_RANK'])
+        # initialize process group and wait for completion (only for distributed version)
+        dist.init_process_group(backend='nccl', init_method="env://", world_size=args.world_size, rank=args.world_rank)
+        dist.barrier()
+
+    # set gpu device on local machine
+    if args.device == 'cuda':
+        if args.distributed:
+            # assign GPUs to processes on the local system (only for distributed version)
+            torch.cuda.set_device(args.local_rank)
+            args.device = torch.device(f'cuda:{args.local_rank}')
+        # optimization hint for torch runtime
+        torch.backends.cudnn.benchmark = True
+
+    if args.world_rank == 0:
+        print("Current configuration:")
+        for arg in vars(args):
+            print(f"  --{arg}, {getattr(args, arg)}")
+
+def cleanup(args: argparse.Namespace):
+    if args.distributed:
+        # wait for processes and destroy the process group again (only for distributed version)
+        dist.barrier()
+        dist.destroy_process_group()
+
+def main():
+    # parse command line arguments
+    args = parse_command_line()
+
+    # run setup (e.g., create distributed environment if desired)
+    setup(args)
+    
+    # get data loaders for train and test split
+    loader_train, loader_test = load_dataset(args)
+    
+    # create resnet50 with random weights
+    model = resnet50().to(args.device)
+    if args.distributed:
+        # use DDP to wrap model (only for distributed version)
+        dist.barrier()
+        model = DDP(model, device_ids=[args.local_rank])
+
+    # initialize optimizer with model parameters
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+
+    # train and test model for configured number of epochs
+    for epoch in range(args.num_epochs):
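+        if args.distributed:
+            # set the epoch on the DistributedSampler so data is reshuffled differently each epoch (only for distributed version)
+            loader_train.sampler.set_epoch(epoch)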
+        train(args, model, loader_train, optimizer, epoch)
+        test (args, model, loader_test, epoch)
+
+    # clean up environment
+    cleanup(args)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/pytorch/mnist/submit_job_container.sh b/pytorch/mnist/submit_job_container.sh
new file mode 100644
index 0000000..adbb898
--- /dev/null
+++ b/pytorch/mnist/submit_job_container.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for PyTorch container
+module load PyTorch/nvcr-24.01-py3
+module list
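+# note: ${PYTORCH_IMAGE} (used below) is assumed to be set by the loaded PyTorch module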
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# run the python script inside the container
+apptainer exec -e --nv ${PYTORCH_IMAGE} \
+    bash -c "python train_model.py"
\ No newline at end of file
diff --git a/pytorch/mnist/submit_job_utilization_monitoring.sh b/pytorch/mnist/submit_job_utilization_monitoring.sh
new file mode 100644
index 0000000..db7c1d7
--- /dev/null
+++ b/pytorch/mnist/submit_job_utilization_monitoring.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for PyTorch container
+module load PyTorch/nvcr-24.01-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# specify that utilization monitoring via nvidia-smi should be done
+export ENABLE_MONITORING=1
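+# set ENABLE_MONITORING=0 (or remove the export) to skip the GPU monitoring below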
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+if [ "${ENABLE_MONITORING:-0}" = "1" ]; then
+    # start monitoring process in the background (every 2 sec)
+    nvidia-smi --query-gpu=timestamp,index,compute_mode,pstate,utilization.gpu,utilization.memory,memory.used,temperature.gpu,power.draw \
+        --format=csv --loop=2 &> gpu_monitoring_${SLURM_JOBID}.txt &
+    # remember ID of process that has just been started in background
+    export proc_id_monitor=$!
+fi
+
+# run the python script inside the container
+apptainer exec -e --nv ${PYTORCH_IMAGE} \
+    bash -c "python train_model.py"
+
+if [ "${ENABLE_MONITORING:-0}" = "1" ]; then
+    # stop the monitoring process
+    kill -2 ${proc_id_monitor}
+    sleep 5
+fi
\ No newline at end of file
diff --git a/pytorch/mnist/submit_job_venv.sh b/pytorch/mnist/submit_job_venv.sh
new file mode 100644
index 0000000..982188f
--- /dev/null
+++ b/pytorch/mnist/submit_job_venv.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:1
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: activate your desired virtual environment
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# run the python script
+python train_model.py
\ No newline at end of file
diff --git a/pytorch/mnist/train_model.py b/pytorch/mnist/train_model.py
new file mode 100644
index 0000000..8c60aff
--- /dev/null
+++ b/pytorch/mnist/train_model.py
@@ -0,0 +1,170 @@
+import argparse
+import os, sys
+import time
+import typing
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.datasets import MNIST
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 32, 3, 1)
+        self.conv2 = nn.Conv2d(32, 64, 3, 1)
+        self.dropout1 = nn.Dropout(0.25)
+        self.dropout2 = nn.Dropout(0.5)
+        self.fc1 = nn.Linear(9216, 128)
+        self.fc2 = nn.Linear(128, 10)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu(x)
+        x = self.conv2(x)
+        x = F.relu(x)
+        x = F.max_pool2d(x, 2)
+        x = self.dropout1(x)
+        x = torch.flatten(x, 1)
+        x = self.fc1(x)
+        x = F.relu(x)
+        x = self.dropout2(x)
+        x = self.fc2(x)
+        output = F.log_softmax(x, dim=1)
+        return output
+
+def parse_command_line():
+    parser = argparse.ArgumentParser()    
+    parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda")
+    parser.add_argument("--num_epochs", required=False, type=int, default=2)
+    parser.add_argument("--batch_size", required=False, type=int, default=128)
+    parser.add_argument("--num_workers", required=False, type=int, default=1)
+    args = parser.parse_args()
+    return args
+
+def load_dataset(args):
+    # define the following transformations
+    # - convert to a tensor
+    # - normalize data using mean and std
+    trans = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,))
+    ])
+    
+    # load MNIST dataset splits for train and test and apply the transformations
+    # note: you need to download the dataset once at the beginning
+    ds_train = MNIST("datasets", train=True,  download=True,  transform=trans)
+    ds_test  = MNIST("datasets", train=False, download=False, transform=trans)
+    
+    # finally create separate data loaders for train and test with the following common arguments
+    common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True}
+    loader_train = DataLoader(ds_train, **(common_kwargs))
+    loader_test  = DataLoader(ds_test,  **(common_kwargs))
+    
+    return loader_train, loader_test
+
+def train(args, model, loader_train, optimizer, epoch):
+    # the model outputs log-probabilities (log_softmax), so use a negative log-likelihood loss
+    loss_func = torch.nn.NLLLoss()
+
+    # set model into train mode
+    model.train()
+    
+    # track accuracy for complete epoch
+    total, correct = 0, 0
+    total_steps = len(loader_train)
+    
+    elapsed_time = time.time()
+    for i, (x_batch, y_batch) in enumerate(loader_train):
+        # transfer data to the device
+        x_batch = x_batch.to(args.device, non_blocking=True)
+        y_batch = y_batch.to(args.device, non_blocking=True)
+        
+        # run forward pass
+        y_pred = model(x_batch)
+        
+        # calculate loss
+        loss = loss_func(y_pred, y_batch)
+        
+        # run backward pass and optimizer to update weights
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        
+        # track training accuracy
+        _, predicted = y_pred.max(1)
+        total += y_batch.size(0)
+        correct += predicted.eq(y_batch).sum().item()
+        
+        if i % 20 == 0:
+            print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}")
+            sys.stdout.flush()
+    elapsed_time = time.time() - elapsed_time
+
+    print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}")
+    sys.stdout.flush()
+
+def test(args, model, loader_test, epoch):
+    # set model into evaluation mode
+    model.eval()
+    
+    with torch.no_grad():
+        correct, total = 0, 0
+        for i, (x_batch, y_batch) in enumerate(loader_test):
+            # transfer data to the device
+            x_batch = x_batch.to(args.device, non_blocking=True)
+            y_batch = y_batch.to(args.device, non_blocking=True)
+            
+            # predict class
+            outputs = model(x_batch)
+            _, predicted = outputs.max(1)
+            
+            # track test accuracy
+            total += y_batch.size(0)
+            correct += (predicted == y_batch).sum().item()
+        
+        print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}")
+        sys.stdout.flush()
+
+def setup(args) -> None:
+
+    # set gpu device on local machine
+    if args.device == 'cuda':
+        # optimization hint for torch runtime
+        torch.backends.cudnn.benchmark = True
+
+    print("Current configuration:")
+    for arg in vars(args):
+        print(f"  --{arg}, {getattr(args, arg)}")
+
+def cleanup(args: argparse.Namespace):
+    pass
+
+def main():
+    # parse command line arguments
+    args = parse_command_line()
+
+    # run setup (e.g., create distributed environment if desired)
+    setup(args)
+    
+    # get data loaders for train and test split
+    loader_train, loader_test = load_dataset(args)
+    
+    # create model with random weights
+    model = Net().to(args.device)
+
+    # initialize optimizer with model parameters
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+
+    # train and test model for configured number of epochs
+    for epoch in range(args.num_epochs):
+        train(args, model, loader_train, optimizer, epoch)
+        test (args, model, loader_test, epoch)
+
+    # clean up environment
+    cleanup(args)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/pytorch/mnist_distributed/set_vars.sh b/pytorch/mnist_distributed/set_vars.sh
new file mode 100644
index 0000000..7b1b74a
--- /dev/null
+++ b/pytorch/mnist_distributed/set_vars.sh
@@ -0,0 +1,11 @@
+#!/usr/local_rwth/bin/zsh
+
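+# map the Slurm-provided rank information to the environment variables
+# expected by torch.distributed (env:// initialization)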
+export RANK=${SLURM_PROCID}
+export LOCAL_RANK=${SLURM_LOCALID}
+export WORLD_SIZE=${SLURM_NTASKS}
+
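+# the APPTAINERENV_ prefix makes Apptainer pass these variables into the container environment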
+export APPTAINERENV_MASTER_ADDR=${MASTER_ADDR}
+export APPTAINERENV_MASTER_PORT=${MASTER_PORT}
+export APPTAINERENV_RANK=${RANK}
+export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
+export APPTAINERENV_WORLD_SIZE=${WORLD_SIZE}
\ No newline at end of file
diff --git a/pytorch/mnist_distributed/submit_job_container.sh b/pytorch/mnist_distributed/submit_job_container.sh
new file mode 100644
index 0000000..1fcf497
--- /dev/null
+++ b/pytorch/mnist_distributed/submit_job_container.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for PyTorch container
+module load PyTorch/nvcr-24.01-py3
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# variables required for distributed runs with DDP
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=12340 # if this port is already in use, change to a different one
+export NCCL_DEBUG=INFO
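+# MASTER_ADDR and MASTER_PORT are read by torch.distributed's env:// initialization in train_model.py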
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# each process sets required environment variables and
+# runs the python script inside the container
+srun zsh -c '\
+    source set_vars.sh && \
+    apptainer exec -e --nv ${PYTORCH_IMAGE} \
+        bash -c "python train_model.py --distributed"'
\ No newline at end of file
diff --git a/pytorch/mnist_distributed/submit_job_venv.sh b/pytorch/mnist_distributed/submit_job_venv.sh
new file mode 100644
index 0000000..ff2fe92
--- /dev/null
+++ b/pytorch/mnist_distributed/submit_job_venv.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23g
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=24
+#SBATCH --gres=gpu:2
+
+############################################################
+### Load modules or software
+############################################################
+
+# TODO: activate your desired virtual environment
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+nvidia-smi
+
+# variables required for distributed runs with DDP
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=12340 # if this port is already in use, change to a different one
+export NCCL_DEBUG=INFO
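+# MASTER_ADDR and MASTER_PORT are read by torch.distributed's env:// initialization in train_model.py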
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# each process sets required environment variables and
+# runs the python script
+srun zsh -c '\
+    source set_vars.sh && \
+    python train_model.py --distributed'
\ No newline at end of file
diff --git a/pytorch/mnist_distributed/train_model.py b/pytorch/mnist_distributed/train_model.py
new file mode 100644
index 0000000..ceeb81f
--- /dev/null
+++ b/pytorch/mnist_distributed/train_model.py
@@ -0,0 +1,210 @@
+import argparse
+import os, sys
+import time
+import typing
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torchvision.models import resnet50
+from torchvision.datasets import MNIST
+import torchvision.transforms as transforms
+from torch.utils.data import DistributedSampler, DataLoader
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 32, 3, 1)
+        self.conv2 = nn.Conv2d(32, 64, 3, 1)
+        self.dropout1 = nn.Dropout(0.25)
+        self.dropout2 = nn.Dropout(0.5)
+        self.fc1 = nn.Linear(9216, 128)
+        self.fc2 = nn.Linear(128, 10)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu(x)
+        x = self.conv2(x)
+        x = F.relu(x)
+        x = F.max_pool2d(x, 2)
+        x = self.dropout1(x)
+        x = torch.flatten(x, 1)
+        x = self.fc1(x)
+        x = F.relu(x)
+        x = self.dropout2(x)
+        x = self.fc2(x)
+        output = F.log_softmax(x, dim=1)
+        return output
+
+def parse_command_line():
+    parser = argparse.ArgumentParser()    
+    parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda")
+    parser.add_argument("--num_epochs", required=False, type=int, default=2)
+    parser.add_argument("--batch_size", required=False, type=int, default=128)
+    parser.add_argument("--num_workers", required=False, type=int, default=1)
+    parser.add_argument("--distributed", required=False, action="store_true", default=False)
+    args = parser.parse_args()
+    
+    # default args for distributed
+    args.world_size = 1
+    args.world_rank = 0
+    args.local_rank = 0
+   
+    return args
+
+def load_dataset(args):    
+    # define the following transformations
+    # - convert to a tensor
+    # - normalize data using mean and std
+    trans = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,))
+    ])
+    
+    # load MNIST dataset splits for train and test and apply the transformations
+    # note: you need to download the dataset once at the beginning
+    ds_train = MNIST("datasets", train=True,  download=True,  transform=trans)
+    ds_test  = MNIST("datasets", train=False, download=False, transform=trans)
+    
+    # define distributed samplers (only for distributed version)
+    sampler_train, sampler_test = None, None
+    if args.distributed:
+        sampler_train = DistributedSampler(dataset=ds_train, shuffle=True,
+                                           num_replicas=args.world_size, rank=args.world_rank)
+        sampler_test  = DistributedSampler(dataset=ds_test, shuffle=False,
+                                           num_replicas=args.world_size, rank=args.world_rank)
+    
+    # finally create separate data loaders for train and test with the following common arguments
+    common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True}
+    loader_train = DataLoader(ds_train, sampler=sampler_train, **(common_kwargs))
+    loader_test  = DataLoader(ds_test,  sampler=sampler_test, **(common_kwargs))
+    
+    return loader_train, loader_test
+
+def train(args, model, loader_train, optimizer, epoch):
+    # the model outputs log-probabilities (log_softmax), so use a negative log-likelihood loss
+    loss_func = torch.nn.NLLLoss()
+
+    # set model into train mode
+    model.train()
+    
+    # track accuracy for complete epoch
+    total, correct = 0, 0
+    total_steps = len(loader_train)
+    
+    elapsed_time = time.time()
+    for i, (x_batch, y_batch) in enumerate(loader_train):
+        # transfer data to the device
+        x_batch = x_batch.to(args.device, non_blocking=True)
+        y_batch = y_batch.to(args.device, non_blocking=True)
+        
+        # run forward pass
+        y_pred = model(x_batch)
+        
+        # calculate loss
+        loss = loss_func(y_pred, y_batch)
+        
+        # run backward pass and optimizer to update weights
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        
+        # track training accuracy
+        _, predicted = y_pred.max(1)
+        total += y_batch.size(0)
+        correct += predicted.eq(y_batch).sum().item()
+        
+        if args.world_rank == 0 and i % 20 == 0:
+            print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}")
+            sys.stdout.flush()
+    elapsed_time = time.time() - elapsed_time
+
+    if args.world_rank == 0:
+        print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}")
+        sys.stdout.flush()
+
+def test(args, model, loader_test, epoch):
+    # set model into evaluation mode
+    model.eval()
+    
+    with torch.no_grad():
+        correct, total = 0, 0
+        for i, (x_batch, y_batch) in enumerate(loader_test):
+            # transfer data to the device
+            x_batch = x_batch.to(args.device, non_blocking=True)
+            y_batch = y_batch.to(args.device, non_blocking=True)
+            
+            # predict class
+            outputs = model(x_batch)
+            _, predicted = outputs.max(1)
+            
+            # track test accuracy
+            total += y_batch.size(0)
+            correct += (predicted == y_batch).sum().item()
+        
+        if args.world_rank == 0:
+            print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}")
+            sys.stdout.flush()
+
+def setup(args) -> None:
+
+    if args.distributed:
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.world_rank = int(os.environ['RANK'])
+        args.local_rank = int(os.environ['LOCAL_RANK'])
+        # initialize process group and wait for completion (only for distributed version)
+        dist.init_process_group(backend='nccl', init_method="env://", world_size=args.world_size, rank=args.world_rank)
+        dist.barrier()
+
+    # set gpu device on local machine
+    if args.device == 'cuda':
+        if args.distributed:
+            # assign GPUs to processes on the local system (only for distributed version)
+            torch.cuda.set_device(args.local_rank)
+            args.device = torch.device(f'cuda:{args.local_rank}')
+        # optimization hint for torch runtime
+        torch.backends.cudnn.benchmark = True
+
+    if args.world_rank == 0:
+        print("Current configuration:")
+        for arg in vars(args):
+            print(f"  --{arg}, {getattr(args, arg)}")
+
+def cleanup(args: argparse.Namespace):
+    if args.distributed:
+        # wait for processes and destroy the process group again (only for distributed version)
+        dist.barrier()
+        dist.destroy_process_group()
+
+def main():
+    # parse command line arguments
+    args = parse_command_line()
+
+    # run setup (e.g., create distributed environment if desired)
+    setup(args)
+    
+    # get data loaders for train and test split
+    loader_train, loader_test = load_dataset(args)
+    
+    # create model with random weights
+    model = Net().to(args.device)
+    if args.distributed:
+        # use DDP to wrap model (only for distributed version)
+        dist.barrier()
+        model = DDP(model, device_ids=[args.local_rank])
+
+    # initialize optimizer with model parameters
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+
+    # train and test model for configured number of epochs
+    for epoch in range(args.num_epochs):
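+        if args.distributed:
+            # set the epoch on the DistributedSampler so data is reshuffled differently each epoch (only for distributed version)
+            loader_train.sampler.set_epoch(epoch)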
+        train(args, model, loader_train, optimizer, epoch)
+        test (args, model, loader_test, epoch)
+
+    # clean up environment
+    cleanup(args)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/slurm/basic_mpi.sh b/slurm/basic_mpi.sh
new file mode 100644
index 0000000..b1bc9cb
--- /dev/null
+++ b/slurm/basic_mpi.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/zsh 
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --ntasks=8              # Ask for 8 MPI tasks
+#SBATCH --time=00:15:00         # Run time of 15 minutes
+#SBATCH --job-name=example_job  # Sets the job name
+#SBATCH --output=stdout_%j.txt  # Redirects stdout and stderr to stdout_<jobid>.txt
+#SBATCH --account=<project-id>  # Insert your project ID or delete this line
+
+############################################################
+### Execution / Commands
+############################################################
+srun hostname
\ No newline at end of file
diff --git a/slurm/beeond.sh b/slurm/beeond.sh
new file mode 100644
index 0000000..6afac44
--- /dev/null
+++ b/slurm/beeond.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+# request Beeond
+#SBATCH --beeond
+
+# specify other Slurm commands
+#SBATCH --time=01:00:00
+
+############################################################
+### Execution / Commands
+############################################################
+
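+# note: $BEEOND is assumed to point to the temporary job-local BeeOND file system,
+# which is discarded when the job ends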
+# copy files to Beeond
+cp -r $WORK/yourfiles $BEEOND
+
+# navigate to Beeond
+cd $BEEOND/yourfiles
+
+# perform your job, which has high I/O metadata and bandwidth demands
+echo "hello world" > result
+
+# afterwards: copy results back to a persistent storage
+cp -r $BEEOND/yourfiles/result $WORK/yourfiles/
\ No newline at end of file
diff --git a/tensorflow/.gitkeep b/tensorflow/.gitkeep
new file mode 100644
index 0000000..e69de29
-- 
GitLab