Verified Commit 43473322 authored by Jannis Klinkenberg

transferred existing examples

parent 27313610
Showing 1196 additions and 0 deletions
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:1
############################################################
### Load modules or software
############################################################
# load module for PyTorch container
module load PyTorch/nvcr-24.01-py3
module list
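# note (added comment): loading this module is assumed to set ${PYTORCH_IMAGE}, the container
# image used by "apptainer exec" further below; if the variable is not set on your system,
# check the module definition (e.g. via "module show PyTorch/nvcr-24.01-py3").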
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
############################################################
### Execution (Model Training)
############################################################
# run the python script inside the container
apptainer exec -e --nv ${PYTORCH_IMAGE} \
bash -c "python train_model.py"
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:1
############################################################
### Load modules or software
############################################################
# load module for PyTorch container
module load PyTorch/nvcr-24.01-py3
module list
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
# specify that utilization monitoring via nvidia-smi should be done
export ENABLE_MONITORING=1
############################################################
### Execution (Model Training)
############################################################
if [ "${ENABLE_MONITORING:-0}" = "1" ]; then
# start monitoring process in the background (every 2 sec)
nvidia-smi --query-gpu=timestamp,index,compute_mode,pstate,utilization.gpu,utilization.memory,memory.used,temperature.gpu,power.draw \
--format=csv --loop=2 &> gpu_monitoring_${SLURM_JOBID}.txt &
# remember ID of process that has just been started in background
export proc_id_monitor=$!
fi
# run the python script inside the container
apptainer exec -e --nv ${PYTORCH_IMAGE} \
bash -c "python train_model.py"
if [ "${ENABLE_MONITORING:-0}" = "1" ]; then
# end monitoring process again
kill -2 ${proc_id_monitor}
sleep 5
fi
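# note (addition): the sampled GPU metrics are written to gpu_monitoring_<jobid>.txt in CSV
# format and can be inspected or plotted once the job has finished.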
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:1
############################################################
### Load modules or software
############################################################
# TODO: activate your desired virtual environment
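# minimal sketch (assumption: a Python venv created beforehand, e.g. under $HOME/venvs/pytorch;
# adjust the module name and path to your own setup):
#   module load Python
#   source $HOME/venvs/pytorch/bin/activate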
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
############################################################
### Execution (Model Training)
############################################################
# run the python script
python train_model.py
import argparse
import os, sys
import time
import typing
import torch
from torchvision.models import resnet50
from torchvision.datasets import CIFAR10
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
def parse_command_line():
parser = argparse.ArgumentParser()
parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda")
parser.add_argument("--num_epochs", required=False, type=int, default=2)
parser.add_argument("--batch_size", required=False, type=int, default=128)
parser.add_argument("--num_workers", required=False, type=int, default=1)
args = parser.parse_args()
return args
def load_dataset(args):
# standardization values for CIFAR10 dataset
mean = (0.4919, 0.4827, 0.4472)
std = (0.2022, 0.1994, 0.2010)
# define the following transformations
# - resize to 224x224
# - transform back to tensor
# - normalize data using mean and std from above
trans = transforms.Compose([
transforms.Resize(224),
transforms.ToTensor(),
transforms.Normalize(mean, std)
])
# load CIFAR10 dataset splits for train and test and apply the transformations
# note: you need to download the dataset once at the beginning
ds_train = CIFAR10("datasets", train=True, download=True, transform=trans)
ds_test = CIFAR10("datasets", train=False, download=False, transform=trans)
# finally create separate data loaders for train and test with the following common arguments
common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True}
loader_train = DataLoader(ds_train, **(common_kwargs))
loader_test = DataLoader(ds_test, **(common_kwargs))
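    # note (addition, not in the original code): shuffle is left at its default (False) here;
    # for actual training runs one would typically pass shuffle=True for the training loader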
return loader_train, loader_test
def train(args, model, loader_train, optimizer, epoch):
# use a CrossEntropyLoss loss function
loss_func = torch.nn.CrossEntropyLoss()
# set model into train mode
model.train()
# track accuracy for complete epoch
total, correct = 0, 0
total_steps = len(loader_train)
elapsed_time = time.time()
for i, (x_batch, y_batch) in enumerate(loader_train):
# transfer data to the device
x_batch = x_batch.to(args.device, non_blocking=True)
y_batch = y_batch.to(args.device, non_blocking=True)
# run forward pass
y_pred = model(x_batch)
# calculate loss
loss = loss_func(y_pred, y_batch)
# run backward pass and optimizer to update weights
optimizer.zero_grad()
loss.backward()
optimizer.step()
# track training accuracy
_, predicted = y_pred.max(1)
total += y_batch.size(0)
correct += predicted.eq(y_batch).sum().item()
if i % 20 == 0:
print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}")
sys.stdout.flush()
elapsed_time = time.time() - elapsed_time
print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}")
sys.stdout.flush()
def test(args, model, loader_test, epoch):
# set model into evaluation mode
model.eval()
with torch.no_grad():
correct, total = 0, 0
for i, (x_batch, y_batch) in enumerate(loader_test):
# transfer data to the device
x_batch = x_batch.to(args.device, non_blocking=True)
y_batch = y_batch.to(args.device, non_blocking=True)
# predict class
outputs = model(x_batch)
_, predicted = outputs.max(1)
# track test accuracy
total += y_batch.size(0)
correct += (predicted == y_batch).sum().item()
print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}")
sys.stdout.flush()
def setup(args) -> None:
# set gpu device on local machine
if args.device == 'cuda':
# optimization hint for torch runtime
torch.backends.cudnn.benchmark = True
print("Current configuration:")
for arg in vars(args):
print(f" --{arg}, {getattr(args, arg)}")
def cleanup(args: argparse.Namespace):
pass
def main():
# parse command line arguments
args = parse_command_line()
# run setup (e.g., create distributed environment if desired)
setup(args)
# get data loaders for train and test split
loader_train, loader_test = load_dataset(args)
# create resnet50 with random weights
model = resnet50().to(args.device)
# initialize optimizer with model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# train and test model for configured number of epochs
for epoch in range(args.num_epochs):
train(args, model, loader_train, optimizer, epoch)
        test(args, model, loader_test, epoch)
    # clean up environment
    cleanup(args)
if __name__ == "__main__":
main()
#!/usr/local_rwth/bin/zsh
export RANK=${SLURM_PROCID}
export LOCAL_RANK=${SLURM_LOCALID}
export WORLD_SIZE=${SLURM_NTASKS}
export APPTAINERENV_MASTER_ADDR=${MASTER_ADDR}
export APPTAINERENV_MASTER_PORT=${MASTER_PORT}
export APPTAINERENV_RANK=${RANK}
export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
export APPTAINERENV_WORLD_SIZE=${WORLD_SIZE}
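# note (added comment): variables prefixed with APPTAINERENV_ are injected into the Apptainer
# container environment, so the training script inside the container sees MASTER_ADDR,
# MASTER_PORT, RANK, LOCAL_RANK and WORLD_SIZE as required for DDP initialization.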
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:2
############################################################
### Load modules or software
############################################################
# load module for PyTorch container
module load PyTorch/nvcr-24.01-py3
module list
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
# variables required for distributed runs with DDP
export MASTER_ADDR=$(hostname)
export MASTER_PORT=12340 # if this port is already in use and startup fails, switch to a different one (see optional sketch below)
export NCCL_DEBUG=INFO
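# optional sketch (assumption, not part of the original script): derive the port from the job id
# to reduce the chance of collisions when several jobs share a node, e.g.
#   export MASTER_PORT=$((12000 + SLURM_JOBID % 2000))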
############################################################
### Execution (Model Training)
############################################################
# each process sets required environment variables and
# runs the python script inside the container
srun zsh -c '\
source set_vars.sh && \
apptainer exec -e --nv ${PYTORCH_IMAGE} \
bash -c "python train_model.py --distributed"'
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:2
############################################################
### Load modules or software
############################################################
# TODO: activate your desired virtual environment
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
# variables required for distributed runs with DDP
export MASTER_ADDR=$(hostname)
export MASTER_PORT=12340 # if this port is already in use and startup fails, switch to a different one
export NCCL_DEBUG=INFO
############################################################
### Execution (Model Training)
############################################################
# each process sets required environment variables and
# runs the python script
srun zsh -c '\
source set_vars.sh && \
python train_model.py --distributed'
import argparse
import os, sys
import time
import typing
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision.models import resnet50
from torchvision.datasets import CIFAR10
import torchvision.transforms as transforms
from torch.utils.data import DistributedSampler, DataLoader
def parse_command_line():
parser = argparse.ArgumentParser()
parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda")
parser.add_argument("--num_epochs", required=False, type=int, default=2)
parser.add_argument("--batch_size", required=False, type=int, default=128)
parser.add_argument("--num_workers", required=False, type=int, default=1)
parser.add_argument("--distributed", required=False, action="store_true", default=False)
args = parser.parse_args()
# default args for distributed
args.world_size = 1
args.world_rank = 0
args.local_rank = 0
return args
def load_dataset(args):
# standardization values for CIFAR10 dataset
mean = (0.4919, 0.4827, 0.4472)
std = (0.2022, 0.1994, 0.2010)
# define the following transformations
# - resize to 224x224
# - transform back to tensor
# - normalize data using mean and std from above
trans = transforms.Compose([
transforms.Resize(224),
transforms.ToTensor(),
transforms.Normalize(mean, std)
])
# load CIFAR10 dataset splits for train and test and apply the transformations
# note: you need to download the dataset once at the beginning
ds_train = CIFAR10("datasets", train=True, download=True, transform=trans)
ds_test = CIFAR10("datasets", train=False, download=False, transform=trans)
# define distributed samplers (only for distributed version)
sampler_train, sampler_test = None, None
if args.distributed:
sampler_train = DistributedSampler(dataset=ds_train, shuffle=True,
num_replicas=args.world_size, rank=args.world_rank)
sampler_test = DistributedSampler(dataset=ds_test, shuffle=False,
num_replicas=args.world_size, rank=args.world_rank)
# finally create separate data loaders for train and test with the following common arguments
common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True}
loader_train = DataLoader(ds_train, sampler=sampler_train, **(common_kwargs))
loader_test = DataLoader(ds_test, sampler=sampler_test, **(common_kwargs))
return loader_train, loader_test
def train(args, model, loader_train, optimizer, epoch):
# use a CrossEntropyLoss loss function
loss_func = torch.nn.CrossEntropyLoss()
# set model into train mode
model.train()
# track accuracy for complete epoch
total, correct = 0, 0
total_steps = len(loader_train)
elapsed_time = time.time()
for i, (x_batch, y_batch) in enumerate(loader_train):
# transfer data to the device
x_batch = x_batch.to(args.device, non_blocking=True)
y_batch = y_batch.to(args.device, non_blocking=True)
# run forward pass
y_pred = model(x_batch)
# calculate loss
loss = loss_func(y_pred, y_batch)
# run backward pass and optimizer to update weights
optimizer.zero_grad()
loss.backward()
optimizer.step()
# track training accuracy
_, predicted = y_pred.max(1)
total += y_batch.size(0)
correct += predicted.eq(y_batch).sum().item()
if args.world_rank == 0 and i % 20 == 0:
print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}")
sys.stdout.flush()
elapsed_time = time.time() - elapsed_time
if args.world_rank == 0:
print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}")
sys.stdout.flush()
def test(args, model, loader_test, epoch):
# set model into evaluation mode
model.eval()
with torch.no_grad():
correct, total = 0, 0
for i, (x_batch, y_batch) in enumerate(loader_test):
# transfer data to the device
x_batch = x_batch.to(args.device, non_blocking=True)
y_batch = y_batch.to(args.device, non_blocking=True)
# predict class
outputs = model(x_batch)
_, predicted = outputs.max(1)
# track test accuracy
total += y_batch.size(0)
correct += (predicted == y_batch).sum().item()
if args.world_rank == 0:
print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}")
sys.stdout.flush()
def setup(args) -> None:
if args.distributed:
args.world_size = int(os.environ['WORLD_SIZE'])
args.world_rank = int(os.environ['RANK'])
args.local_rank = int(os.environ['LOCAL_RANK'])
# initialize process group and wait for completion (only for distributed version)
dist.init_process_group(backend='nccl', init_method="env://", world_size=args.world_size, rank=args.world_rank)
dist.barrier()
# set gpu device on local machine
if args.device == 'cuda':
if args.distributed:
# assign GPUs to processes on the local system (only for distributed version)
torch.cuda.set_device(args.local_rank)
args.device = torch.device(f'cuda:{args.local_rank}')
# optimization hint for torch runtime
torch.backends.cudnn.benchmark = True
if args.world_rank == 0:
print("Current configuration:")
for arg in vars(args):
print(f" --{arg}, {getattr(args, arg)}")
def cleanup(args: argparse.Namespace):
if args.distributed:
        # wait for processes and destroy group again (only for distributed version)
dist.barrier()
dist.destroy_process_group()
def main():
# parse command line arguments
args = parse_command_line()
# run setup (e.g., create distributed environment if desired)
setup(args)
# get data loaders for train and test split
loader_train, loader_test = load_dataset(args)
# create resnet50 with random weights
model = resnet50().to(args.device)
if args.distributed:
# use DDP to wrap model (only for distributed version)
dist.barrier()
model = DDP(model, device_ids=[args.local_rank])
# initialize optimizer with model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# train and test model for configured number of epochs
for epoch in range(args.num_epochs):
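        # note (addition, not in the original code): when DistributedSampler is used with
        # shuffle=True, it is commonly recommended to call loader_train.sampler.set_epoch(epoch)
        # here so that the shuffling order differs between epochs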
train(args, model, loader_train, optimizer, epoch)
        test(args, model, loader_test, epoch)
    # clean up environment
    cleanup(args)
if __name__ == "__main__":
main()
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:1
############################################################
### Load modules or software
############################################################
# load module for PyTorch container
module load PyTorch/nvcr-24.01-py3
module list
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
############################################################
### Execution (Model Training)
############################################################
# run the python script inside the container
apptainer exec -e --nv ${PYTORCH_IMAGE} \
bash -c "python train_model.py"
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:1
############################################################
### Load modules or software
############################################################
# load module for PyTorch container
module load PyTorch/nvcr-24.01-py3
module list
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
# specify that utilization monitoring via nvidia-smi should be done
export ENABLE_MONITORING=1
############################################################
### Execution (Model Training)
############################################################
if [ "${ENABLE_MONITORING:-0}" = "1" ]; then
# start monitoring process in the background (every 2 sec)
nvidia-smi --query-gpu=timestamp,index,compute_mode,pstate,utilization.gpu,utilization.memory,memory.used,temperature.gpu,power.draw \
--format=csv --loop=2 &> gpu_monitoring_${SLURM_JOBID}.txt &
# remember ID of process that has just been started in background
export proc_id_monitor=$!
fi
# run the python script inside the container
apptainer exec -e --nv ${PYTORCH_IMAGE} \
bash -c "python train_model.py"
if [ "${ENABLE_MONITORING:-0}" = "1" ]; then
# end monitoring process again
kill -2 ${proc_id_monitor}
sleep 5
fi
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:1
############################################################
### Load modules or software
############################################################
# TODO: activate your desired virtual environment
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
############################################################
### Execution (Model Training)
############################################################
# run the python script
python train_model.py
import argparse
import os, sys
import time
import typing
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(9216, 128)
self.fc2 = nn.Linear(128, 10)
def forward(self, x):
x = self.conv1(x)
x = F.relu(x)
x = self.conv2(x)
x = F.relu(x)
x = F.max_pool2d(x, 2)
x = self.dropout1(x)
x = torch.flatten(x, 1)
x = self.fc1(x)
x = F.relu(x)
x = self.dropout2(x)
x = self.fc2(x)
output = F.log_softmax(x, dim=1)
return output
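# note (added comment): forward() returns log-probabilities via log_softmax, while the training
# loop below uses CrossEntropyLoss, which applies log-softmax internally; the usual pairing would
# be raw logits with CrossEntropyLoss, or log_softmax with NLLLoss.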
def parse_command_line():
parser = argparse.ArgumentParser()
parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda")
parser.add_argument("--num_epochs", required=False, type=int, default=2)
parser.add_argument("--batch_size", required=False, type=int, default=128)
parser.add_argument("--num_workers", required=False, type=int, default=1)
args = parser.parse_args()
return args
def load_dataset(args):
# define the following transformations
# - transform back to tensor
# - normalize data using mean and std
trans = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
# load MNIST dataset splits for train and test and apply the transformations
# note: you need to download the dataset once at the beginning
ds_train = MNIST("datasets", train=True, download=True, transform=trans)
ds_test = MNIST("datasets", train=False, download=False, transform=trans)
# finally create separate data loaders for train and test with the following common arguments
common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True}
loader_train = DataLoader(ds_train, **(common_kwargs))
loader_test = DataLoader(ds_test, **(common_kwargs))
return loader_train, loader_test
def train(args, model, loader_train, optimizer, epoch):
# use a CrossEntropyLoss loss function
loss_func = torch.nn.CrossEntropyLoss()
# set model into train mode
model.train()
# track accuracy for complete epoch
total, correct = 0, 0
total_steps = len(loader_train)
elapsed_time = time.time()
for i, (x_batch, y_batch) in enumerate(loader_train):
# transfer data to the device
x_batch = x_batch.to(args.device, non_blocking=True)
y_batch = y_batch.to(args.device, non_blocking=True)
# run forward pass
y_pred = model(x_batch)
# calculate loss
loss = loss_func(y_pred, y_batch)
# run backward pass and optimizer to update weights
optimizer.zero_grad()
loss.backward()
optimizer.step()
# track training accuracy
_, predicted = y_pred.max(1)
total += y_batch.size(0)
correct += predicted.eq(y_batch).sum().item()
if i % 20 == 0:
print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}")
sys.stdout.flush()
elapsed_time = time.time() - elapsed_time
print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}")
sys.stdout.flush()
def test(args, model, loader_test, epoch):
# set model into evaluation mode
model.eval()
with torch.no_grad():
correct, total = 0, 0
for i, (x_batch, y_batch) in enumerate(loader_test):
# transfer data to the device
x_batch = x_batch.to(args.device, non_blocking=True)
y_batch = y_batch.to(args.device, non_blocking=True)
# predict class
outputs = model(x_batch)
_, predicted = outputs.max(1)
# track test accuracy
total += y_batch.size(0)
correct += (predicted == y_batch).sum().item()
print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}")
sys.stdout.flush()
def setup(args) -> None:
# set gpu device on local machine
if args.device == 'cuda':
# optimization hint for torch runtime
torch.backends.cudnn.benchmark = True
print("Current configuration:")
for arg in vars(args):
print(f" --{arg}, {getattr(args, arg)}")
def cleanup(args: argparse.Namespace):
pass
def main():
# parse command line arguments
args = parse_command_line()
# run setup (e.g., create distributed environment if desired)
setup(args)
# get data loaders for train and test split
loader_train, loader_test = load_dataset(args)
# create model with random weights
model = Net().to(args.device)
# initialize optimizer with model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# train and test model for configured number of epochs
for epoch in range(args.num_epochs):
train(args, model, loader_train, optimizer, epoch)
        test(args, model, loader_test, epoch)
    # clean up environment
    cleanup(args)
if __name__ == "__main__":
main()
#!/usr/local_rwth/bin/zsh
export RANK=${SLURM_PROCID}
export LOCAL_RANK=${SLURM_LOCALID}
export WORLD_SIZE=${SLURM_NTASKS}
export APPTAINERENV_MASTER_ADDR=${MASTER_ADDR}
export APPTAINERENV_MASTER_PORT=${MASTER_PORT}
export APPTAINERENV_RANK=${RANK}
export APPTAINERENV_LOCAL_RANK=${LOCAL_RANK}
export APPTAINERENV_WORLD_SIZE=${WORLD_SIZE}
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:2
############################################################
### Load modules or software
############################################################
# load module for PyTorch container
module load PyTorch/nvcr-24.01-py3
module list
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
# variables required for distributed runs with DDP
export MASTER_ADDR=$(hostname)
export MASTER_PORT=12340 # if this port is already in use and startup fails, switch to a different one
export NCCL_DEBUG=INFO
############################################################
### Execution (Model Training)
############################################################
# each process sets required environment variables and
# runs the python script inside the container
srun zsh -c '\
source set_vars.sh && \
apptainer exec -e --nv ${PYTORCH_IMAGE} \
bash -c "python train_model.py --distributed"'
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --time=00:15:00
#SBATCH --partition=c23g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=24
#SBATCH --gres=gpu:2
############################################################
### Load modules or software
############################################################
# TODO: activate your desired virtual environment
############################################################
### Parameters and Settings
############################################################
# print some information about current system
echo "Job nodes: ${SLURM_JOB_NODELIST}"
echo "Current machine: $(hostname)"
nvidia-smi
# variables required for distributed runs with DDP
export MASTER_ADDR=$(hostname)
export MASTER_PORT=12340 # if this port is already in use and startup fails, switch to a different one
export NCCL_DEBUG=INFO
############################################################
### Execution (Model Training)
############################################################
# each process sets required environment variables and
# runs the python script
srun zsh -c '\
source set_vars.sh && \
python train_model.py --distributed'
import argparse
import os, sys
import time
import typing
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision.models import resnet50
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from torch.utils.data import DistributedSampler, DataLoader
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(9216, 128)
self.fc2 = nn.Linear(128, 10)
def forward(self, x):
x = self.conv1(x)
x = F.relu(x)
x = self.conv2(x)
x = F.relu(x)
x = F.max_pool2d(x, 2)
x = self.dropout1(x)
x = torch.flatten(x, 1)
x = self.fc1(x)
x = F.relu(x)
x = self.dropout2(x)
x = self.fc2(x)
output = F.log_softmax(x, dim=1)
return output
def parse_command_line():
parser = argparse.ArgumentParser()
parser.add_argument("--device", required=False, type=str, choices=['cpu', 'cuda'], default="cuda")
parser.add_argument("--num_epochs", required=False, type=int, default=2)
parser.add_argument("--batch_size", required=False, type=int, default=128)
parser.add_argument("--num_workers", required=False, type=int, default=1)
parser.add_argument("--distributed", required=False, action="store_true", default=False)
args = parser.parse_args()
# default args for distributed
args.world_size = 1
args.world_rank = 0
args.local_rank = 0
return args
def load_dataset(args):
# define the following transformations
# - transform back to tensor
# - normalize data using mean and std
trans = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
# load MNIST dataset splits for train and test and apply the transformations
# note: you need to download the dataset once at the beginning
ds_train = MNIST("datasets", train=True, download=True, transform=trans)
ds_test = MNIST("datasets", train=False, download=False, transform=trans)
# define distributed samplers (only for distributed version)
sampler_train, sampler_test = None, None
if args.distributed:
sampler_train = DistributedSampler(dataset=ds_train, shuffle=True,
num_replicas=args.world_size, rank=args.world_rank)
sampler_test = DistributedSampler(dataset=ds_test, shuffle=False,
num_replicas=args.world_size, rank=args.world_rank)
# finally create separate data loaders for train and test with the following common arguments
common_kwargs = {"batch_size": args.batch_size, "num_workers": args.num_workers, "pin_memory": True}
loader_train = DataLoader(ds_train, sampler=sampler_train, **(common_kwargs))
loader_test = DataLoader(ds_test, sampler=sampler_test, **(common_kwargs))
return loader_train, loader_test
def train(args, model, loader_train, optimizer, epoch):
# use a CrossEntropyLoss loss function
loss_func = torch.nn.CrossEntropyLoss()
# set model into train mode
model.train()
# track accuracy for complete epoch
total, correct = 0, 0
total_steps = len(loader_train)
elapsed_time = time.time()
for i, (x_batch, y_batch) in enumerate(loader_train):
# transfer data to the device
x_batch = x_batch.to(args.device, non_blocking=True)
y_batch = y_batch.to(args.device, non_blocking=True)
# run forward pass
y_pred = model(x_batch)
# calculate loss
loss = loss_func(y_pred, y_batch)
# run backward pass and optimizer to update weights
optimizer.zero_grad()
loss.backward()
optimizer.step()
# track training accuracy
_, predicted = y_pred.max(1)
total += y_batch.size(0)
correct += predicted.eq(y_batch).sum().item()
if args.world_rank == 0 and i % 20 == 0:
print(f"Epoch {epoch+1}/{args.num_epochs}\tStep {i:4d} / {total_steps:4d}")
sys.stdout.flush()
elapsed_time = time.time() - elapsed_time
if args.world_rank == 0:
print(f"Epoch {epoch+1}/{args.num_epochs}\tElapsed: {elapsed_time:.3f} sec\tAcc: {(correct/total):.3f}")
sys.stdout.flush()
def test(args, model, loader_test, epoch):
# set model into evaluation mode
model.eval()
with torch.no_grad():
correct, total = 0, 0
for i, (x_batch, y_batch) in enumerate(loader_test):
# transfer data to the device
x_batch = x_batch.to(args.device, non_blocking=True)
y_batch = y_batch.to(args.device, non_blocking=True)
# predict class
outputs = model(x_batch)
_, predicted = outputs.max(1)
# track test accuracy
total += y_batch.size(0)
correct += (predicted == y_batch).sum().item()
if args.world_rank == 0:
print(f"Epoch {epoch+1}/{args.num_epochs}\tTest Acc: {(correct/total):.3f}")
sys.stdout.flush()
def setup(args) -> None:
if args.distributed:
args.world_size = int(os.environ['WORLD_SIZE'])
args.world_rank = int(os.environ['RANK'])
args.local_rank = int(os.environ['LOCAL_RANK'])
# initialize process group and wait for completion (only for distributed version)
dist.init_process_group(backend='nccl', init_method="env://", world_size=args.world_size, rank=args.world_rank)
dist.barrier()
# set gpu device on local machine
if args.device == 'cuda':
if args.distributed:
# assign GPUs to processes on the local system (only for distributed version)
torch.cuda.set_device(args.local_rank)
args.device = torch.device(f'cuda:{args.local_rank}')
# optimization hint for torch runtime
torch.backends.cudnn.benchmark = True
if args.world_rank == 0:
print("Current configuration:")
for arg in vars(args):
print(f" --{arg}, {getattr(args, arg)}")
def cleanup(args: argparse.Namespace):
if args.distributed:
        # wait for processes and destroy group again (only for distributed version)
dist.barrier()
dist.destroy_process_group()
def main():
# parse command line arguments
args = parse_command_line()
# run setup (e.g., create distributed environment if desired)
setup(args)
# get data loaders for train and test split
loader_train, loader_test = load_dataset(args)
# create model with random weights
model = Net().to(args.device)
if args.distributed:
# use DDP to wrap model (only for distributed version)
dist.barrier()
model = DDP(model, device_ids=[args.local_rank])
# initialize optimizer with model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# train and test model for configured number of epochs
for epoch in range(args.num_epochs):
train(args, model, loader_train, optimizer, epoch)
        test(args, model, loader_test, epoch)
    # clean up environment
    cleanup(args)
if __name__ == "__main__":
main()
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
#SBATCH --ntasks=8               # Ask for 8 MPI tasks
#SBATCH --time=00:15:00          # Run time of 15 minutes
#SBATCH --job-name=example_job   # Sets the job name
#SBATCH --output=stdout_%j.txt   # Redirects stdout and stderr to stdout_<jobid>.txt
#SBATCH --account=<project-id>   # Insert your project-id or delete this line
############################################################
### Execution / Commands
############################################################
srun hostname
#!/usr/bin/zsh
############################################################
### Slurm flags
############################################################
# request Beeond
#SBATCH --beeond
# specify other Slurm commands
#SBATCH --time=01:00:00
############################################################
### Execution / Commands
############################################################
# copy files to Beeond
cp -r $WORK/yourfiles $BEEOND
# navigate to Beeond
cd $BEEOND/yourfiles
# run your job, which has high I/O metadata and bandwidth demands
echo "hello world" > result
# afterwards: copy results back to a persistent storage
cp -r $BEEOND/yourfiles/result $WORK/yourfiles/