diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7579995a9c7fe1241b5360ef9785a3b3a268801
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,5 @@
+{
+    "recommendations": [
+        "genieai.chatgpt-vscode"
+    ]
+}
\ No newline at end of file
diff --git a/01_train_model.py b/01_train_model.py
index 53541b26f8ff398a523d680d2dcdeb7d65d835f2..aecab4018ce28f692f7d25999d157cee2b291840 100644
--- a/01_train_model.py
+++ b/01_train_model.py
@@ -1,82 +1,129 @@
+import hydra
+from omegaconf import DictConfig
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 import torch.optim as optim
 from torchvision import datasets, transforms
 from torch.optim.lr_scheduler import StepLR
 from pathlib import Path
-import fire
 from modules.models.simple_net import Net
 from modules.training.training import train, test
-from modules.utils.parallelize import pex
-
-def main(batch_size: int = 64, test_batch_size: int = 1000, epochs: int = 14, lr: float = 1.0,
-         gamma: float = 0.7, no_cuda: bool = False, no_mps: bool = False,
-         dry_run: bool = False, seed: int = 1, log_interval: int = 10, save_model: bool = False) -> None:
+# Registering the config path with Hydra
+@hydra.main(config_path="./data/config", config_name="train_model", version_base="1.3")
+def main(cfg: DictConfig) -> None:
     """
     Main function for training and evaluating a neural network on the MNIST dataset.
+    Utilizes Hydra for configuration management, separating model and training configurations.
 
     Args:
-        batch_size (int): Input batch size for training. Default: 64.
-        test_batch_size (int): Input batch size for testing. Default: 1000.
-        epochs (int): Number of epochs to train. Default: 14.
-        lr (float): Learning rate. Default: 1.0.
-        gamma (float): Learning rate step gamma. Default: 0.7.
-        no_cuda (bool): Flag to disable CUDA training. Default: False.
-        no_mps (bool): Flag to disable macOS GPU training. Default: False.
-        dry_run (bool): Flag for a quick single pass. Default: False.
-        seed (int): Random seed. Default: 1.
-        log_interval (int): Interval for logging training status. Default: 10.
-        save_model (bool): Flag to save the current model. Default: False.
+        cfg (DictConfig): Configuration object containing all parameters and sub-configurations.
+            Structure and default values of cfg are as follows:
+            ```
+            model:
+              num_layers: 2          # Default: 2, number of layers in the neural network model.
+            training:
+              batch_size: 64         # Default: 64, input batch size for training.
+              test_batch_size: 1000  # Default: 1000, input batch size for testing.
+              epochs: 14             # Default: 14, number of epochs to train.
+              lr: 1.0                # Default: 1.0, learning rate.
+              gamma: 0.7             # Default: 0.7, learning rate step gamma.
+              no_cuda: False         # Default: False, flag to disable CUDA training.
+              no_mps: False          # Default: False, flag to disable macOS GPU training.
+              dry_run: False         # Default: False, flag for a quick single pass.
+              seed: 1                # Default: 1, random seed for reproducibility.
+              log_interval: 10       # Default: 10, interval for logging training status.
+              save_model: True       # Default: True, flag to save the trained model.
+              data_dir: "./data"     # Default: "./data", directory for storing dataset files.
+              model_dir: "./models"  # Default: "./models", directory for saving trained model files.
+            ```
 
     Returns:
         None: This function does not return any value.
+
+    Examples:
+        To run training with the default configuration specified in `./data/config/train_model.yaml`:
+        ```bash
+        $ python 01_train_model.py
+        ```
+
+        To change the number of epochs to 20:
+        ```bash
+        $ python 01_train_model.py training.epochs=20
+        ```
+
+        To override the configuration with another file `alternative.yaml`:
+        ```bash
+        $ python 01_train_model.py +config=alternative.yaml
+        ```
+
+        To perform multiple runs with different model sizes using Hydra's multirun feature:
+        ```bash
+        $ python 01_train_model.py --multirun model.num_layers=1,2,3
+        ```
+
+        Using Hydra with the submitit Slurm launcher for cluster job submissions:
+        ```bash
+        $ python 01_train_model.py --multirun model.num_layers=1,2,3 hydra/launcher=submitit_slurm \
+            hydra.launcher.partition=my_partition \
+            hydra.launcher.comment='MNIST training runs' \
+            hydra.launcher.nodes=1 \
+            hydra.launcher.tasks_per_node=1 \
+            hydra.launcher.mem_per_cpu=4G
+        ```
+
+        Note: Integrating Hydra with Slurm may require additional configuration; check the Hydra submitit launcher documentation and your Slurm setup.
     """
-    use_cuda = not no_cuda and torch.cuda.is_available()
-    use_mps = not no_mps and torch.backends.mps.is_available()
-    torch.manual_seed(seed)
+    # Determine whether CUDA or MPS should be used, based on configuration and availability
+    use_cuda: bool = not cfg.training.no_cuda and torch.cuda.is_available()
+    use_mps: bool = not cfg.training.no_mps and torch.backends.mps.is_available()
+    torch.manual_seed(cfg.training.seed)
+
+    device: torch.device = torch.device("cuda") if use_cuda else torch.device("mps") if use_mps else torch.device("cpu")
+
+    # Set up DataLoader arguments based on device availability
+    train_kwargs: dict = {'batch_size': cfg.training.batch_size}
+    test_kwargs: dict = {'batch_size': cfg.training.test_batch_size}
     if use_cuda:
-        device = torch.device("cuda")
-    elif use_mps:
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
-
-    train_kwargs = {'batch_size': batch_size}
-    test_kwargs = {'batch_size': test_batch_size}
-    if use_cuda:
-        cuda_kwargs = {'num_workers': 1,
-                       'pin_memory': True,
-                       'shuffle': True}
+        cuda_kwargs: dict = {'num_workers': 1, 'pin_memory': True, 'shuffle': True}
         train_kwargs.update(cuda_kwargs)
         test_kwargs.update(cuda_kwargs)
 
+    # Image transformation pipeline
     transform = transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))
     ])
-    dataset1 = datasets.MNIST('./data', train=True, download=True, transform=transform)
-    dataset2 = datasets.MNIST('./data', train=False, transform=transform)
-    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
-    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
-    model = Net().to(device)
-    optimizer = optim.Adadelta(model.parameters(), lr=lr)
+    # Dataset preparation
+    dataset1: datasets.MNIST = datasets.MNIST(cfg.training.data_dir, train=True, download=True, transform=transform)
+    dataset2: datasets.MNIST = datasets.MNIST(cfg.training.data_dir, train=False, transform=transform)
+
+    # DataLoaders for training and testing
+    train_loader: torch.utils.data.DataLoader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
+    test_loader: torch.utils.data.DataLoader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
 
-    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)
-    for epoch in range(1, epochs + 1):
-        train(model, device, train_loader, optimizer, epoch, log_interval, dry_run)
+    # Model initialization
+    model: Net = Net(num_layers=cfg.model.num_layers).to(device)
+
+    # Optimizer setup
+    optimizer: optim.Optimizer = optim.Adadelta(model.parameters(), lr=cfg.training.lr)
+
+    # Learning rate scheduler
+    scheduler: StepLR = StepLR(optimizer, step_size=1, gamma=cfg.training.gamma)
+
+    # Training loop
+    for epoch in range(1, cfg.training.epochs + 1):
+        train(model, device, train_loader, optimizer, epoch, cfg.training.log_interval, cfg.training.dry_run)
         test(model, device, test_loader)
         scheduler.step()
 
-    if save_model:
-        Path("./data/models").mkdir(parents=True, exist_ok=True)
-        torch.save(model.state_dict(), f"./data/models/mnist_cnn_{seed}.pt")
-
+    # Save the model checkpoint if configured to do so
+    if cfg.training.save_model:
+        Path(cfg.training.model_dir).mkdir(parents=True, exist_ok=True)
+        torch.save(model.state_dict(), f"{cfg.training.model_dir}/mnist_cnn_{cfg.training.seed}.pt")
 
 if __name__ == '__main__':
-    fire.Fire()
\ No newline at end of file
+    main()
\ No newline at end of file
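The Hydra configuration file `./data/config/train_model.yaml` referenced by the decorator above is not included in this diff. Purely as a hypothetical sketch, based only on the defaults documented in the docstring, the same tree can be written as OmegaConf structured configs and dumped to YAML; the actual file in the repository may differ:

```python
# Sketch only: mirrors the config tree documented in the 01_train_model.py docstring.
# All field names and defaults are taken from that docstring, not from the real YAML file.
from dataclasses import dataclass, field
from omegaconf import OmegaConf


@dataclass
class ModelConfig:
    num_layers: int = 2


@dataclass
class TrainingConfig:
    batch_size: int = 64
    test_batch_size: int = 1000
    epochs: int = 14
    lr: float = 1.0
    gamma: float = 0.7
    no_cuda: bool = False
    no_mps: bool = False
    dry_run: bool = False
    seed: int = 1
    log_interval: int = 10
    save_model: bool = True
    data_dir: str = "./data"
    model_dir: str = "./models"


@dataclass
class TrainModelConfig:
    model: ModelConfig = field(default_factory=ModelConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)


if __name__ == "__main__":
    cfg = OmegaConf.structured(TrainModelConfig)
    print(OmegaConf.to_yaml(cfg))  # prints the nested YAML tree described in the docstring
```

Keeping the defaults in one tree like this makes it easy to see exactly which value a CLI override such as `training.epochs=20` replaces.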
diff --git a/README.md b/README.md
index 78e5cbf66b072d78cf624800d98945c2c8768b8c..ec778ab5e1a8c0c762482c84a6769dc2953d3534 100644
--- a/README.md
+++ b/README.md
@@ -7,20 +7,20 @@ The docker image that we are going to use is the one on 'env_setup/Dockerfile'.
 
 ```bash
 # build image
-docker build -t andresfp14/xaicu122 ./env_setup
+docker build -t andresfp14/xaicu118 ./env_setup
 
 # push image to docker repo (if you want to make it available in general)
-docker push andresfp14/xaicu122
+docker push andresfp14/xaicu118
 
 # Examples of how to launch it in windows
-docker run -it --rm --name xaicu122 --gpus all -p 8888:8888 -p 6007:6007 -v %cd%:/home/example andresfp14/xaicu122
-docker run -d --rm --name xaicu122 --gpus all -p 8888:8888 -p 6007:6007 -v %cd%:/home/example andresfp14/xaicu122 bash
+docker run -it --rm --name xaicu118 --gpus all -p 8888:8888 -p 6007:6007 -v %cd%:/home/example andresfp14/xaicu118
+docker run -d --rm --name xaicu118 --gpus all -p 8888:8888 -p 6007:6007 -v %cd%:/home/example andresfp14/xaicu118 bash
 
 # Examples of how to launch it in linux
-docker run -it --rm --name xaicu122 --shm-size 100G --gpus all -p 8888:8888 -p 6007:6007 -v $(pwd):/home/example andresfp14/xaicu122 bash
-docker run -d --rm --name xaicu122 --shm-size 50G --gpus all -p 8888:8888 -p 6007:6007 -v $(pwd):/home/example andresfp14/xaicu122 bash
-docker run -idt --rm --name xai_1 --shm-size 50G --gpus '"device=0:0"' -v ~/data/datasets:/home/example/data/datasets -v $(pwd):/home/example andresfp14/xaicu122
-docker run -idt --rm --name xai_2 --shm-size 50G --gpus '"device=0:0"' -v $(pwd):/home/example andresfp14/xaicu122
+docker run -it --rm --name xaicu118 --shm-size 100G --gpus all -p 8888:8888 -p 6007:6007 -v $(pwd):/home/example andresfp14/xaicu118 bash
+docker run -d --rm --name xaicu118 --shm-size 50G --gpus all -p 8888:8888 -p 6007:6007 -v $(pwd):/home/example andresfp14/xaicu118 bash
+docker run -idt --rm --name xai_1 --shm-size 50G --gpus '"device=0:0"' -v ~/data/datasets:/home/example/data/datasets -v $(pwd):/home/example andresfp14/xaicu118
+docker run -idt --rm --name xai_2 --shm-size 50G --gpus '"device=0:0"' -v $(pwd):/home/example andresfp14/xaicu118
 ```
 
 
@@ -34,9 +34,9 @@ In general, this is defined in the file 'env/requirements.txt'.
 # with conda
 ###############################
 # create environment
-conda create --prefix ./venv python=3.11
+conda create --prefix ./.venv python=3.11
 # activate environment
-conda activate ./venv
+conda activate ./.venv
 # install requirements
 pip install -r ./env_setup/requirements.txt
 # export environment (if you want to update it)
@@ -66,23 +66,37 @@ Now, with the environment setup, we can run the needed code from the base direct
 
 ```bash
 ###############################
-# Getting help with fire
+# Getting help
 ###############################
-python 01_train_model.py main --help
+python 01_train_model.py --help
 
 ###############################
 # Executing with default arguments
 ###############################
-python 01_train_model.py main
+python 01_train_model.py
 
 ###############################
 # Executing and changing an argument
 ###############################
-python 01_train_model.py main --seed=7
+python 01_train_model.py training.seed=7
 
 ###############################
-# Executing the function main for multiple arguments
-# See helper function pex (parallel execution).
+# Executing with an alternative configuration file
 ###############################
-python 01_train_model.py pex main --seed=[0,1,2,3,4,5,6,7,8,9] --num_processes=4
-```
+python 01_train_model.py +config=alternative.yaml
+
+###############################
+# Executing multiple runs with different model sizes using Hydra's multirun feature
+###############################
+python 01_train_model.py --multirun model.num_layers=1,2,3
+
+###############################
+# Using Hydra with the submitit Slurm launcher for cluster job submissions
+###############################
+python 01_train_model.py --multirun model.num_layers=1,2,3 hydra/launcher=submitit_slurm \
+    hydra.launcher.partition=my_partition \
+    hydra.launcher.comment='MNIST training runs' \
+    hydra.launcher.nodes=1 \
+    hydra.launcher.tasks_per_node=1 \
+    hydra.launcher.mem_per_cpu=4G
+```
\ No newline at end of file
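The README examples above drive everything through the command line. For notebooks or quick tests, the same overrides can be composed programmatically; this is only a sketch using Hydra's compose API and assumes the `data/config/train_model.yaml` layout described in `01_train_model.py`:

```python
# Sketch only: programmatic equivalent of
#   python 01_train_model.py training.seed=7 model.num_layers=3
# Assumes the config directory layout described above; config_path is relative to this file.
from hydra import compose, initialize

with initialize(config_path="data/config", version_base="1.3"):
    cfg = compose(
        config_name="train_model",
        overrides=["training.seed=7", "model.num_layers=3"],
    )
    print(cfg.training.seed, cfg.model.num_layers)  # -> 7 3
```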
diff --git a/env_setup/requirements.txt b/env_setup/requirements.txt
index e9fe0701c2fcd01dbbddba62f733566c561536fc..885ea99020bbf34f98e0a64403469e3f5feaddcd 100644
--- a/env_setup/requirements.txt
+++ b/env_setup/requirements.txt
@@ -8,6 +8,8 @@ torchaudio==2.2.1
 numpy==1.26.3
 matplotlib==3.8.3
-fire==0.6.0
 pyyaml==6.0.1
 tqdm==4.66.2
+hydra-core==1.3.2
+hydra-submitit-launcher==1.2.0
+hydra-joblib-launcher==1.2.0
\ No newline at end of file
diff --git a/modules/models/simple_net.py b/modules/models/simple_net.py
index 2476c1f2ce796aa4895bb633819a067c58ada966..c7187cfb7eba18d24326d261155c2431acd6b995 100644
--- a/modules/models/simple_net.py
+++ b/modules/models/simple_net.py
@@ -3,13 +3,18 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 class Net(nn.Module):
-    def __init__(self):
+    def __init__(self, num_layers=1):
         super(Net, self).__init__()
+        self.num_layers = num_layers
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
         self.dropout1 = nn.Dropout(0.25)
         self.dropout2 = nn.Dropout(0.5)
         self.fc1 = nn.Linear(9216, 128)
+
+        # Optional intermediate fully connected layer, used when num_layers > 1
+        self.fc_intermediate = nn.Linear(128, 128) if self.num_layers > 1 else None
+
         self.fc2 = nn.Linear(128, 10)
 
     def forward(self, x):
@@ -22,7 +27,13 @@ class Net(nn.Module):
         x = torch.flatten(x, 1)
         x = self.fc1(x)
         x = F.relu(x)
+
+        # Apply the intermediate fully connected layer if it is present
+        if self.fc_intermediate is not None:
+            x = self.fc_intermediate(x)
+            x = F.relu(x)
+
         x = self.dropout2(x)
         x = self.fc2(x)
         output = F.log_softmax(x, dim=1)
-        return output
\ No newline at end of file
+        return output
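A quick way to sanity-check the new `num_layers` option is a forward pass on a dummy MNIST-shaped batch. This smoke test is not part of the diff; it only assumes 1x28x28 inputs, as used by the training script:

```python
# Hypothetical smoke test (not included in the repository): check the modified Net's output shape.
import torch
from modules.models.simple_net import Net

model = Net(num_layers=2)      # num_layers > 1 enables the extra 128->128 layer
model.eval()                   # disable dropout for a deterministic check
x = torch.randn(8, 1, 28, 28)  # batch of 8 fake MNIST images
with torch.no_grad():
    out = model(x)
print(out.shape)               # expected: torch.Size([8, 10]) of log-probabilities
```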
diff --git a/modules/training/training.py b/modules/training/training.py
index 50d4c7806004083ac329cc0a9fd203c4a4b544ab..6d5c2ca31cf57c34ca53e75eaf2320df39f565bc 100644
--- a/modules/training/training.py
+++ b/modules/training/training.py
@@ -1,8 +1,10 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from modules.utils.loggers import create_logger
 
 def train(model, device, train_loader, optimizer, epoch, log_interval, dry_run):
+    logger = create_logger(name="training")
     model.train()
     for batch_idx, (data, target) in enumerate(train_loader):
         data, target = data.to(device), target.to(device)
@@ -12,13 +14,14 @@ def train(model, device, train_loader, optimizer, epoch, log_interval, dry_run)
         loss.backward()
         optimizer.step()
         if batch_idx % log_interval == 0:
-            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+            logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                 epoch, batch_idx * len(data), len(train_loader.dataset),
                 100. * batch_idx / len(train_loader), loss.item()))
             if dry_run:
                 break
 
 def test(model, device, test_loader):
+    logger = create_logger(name="test")
     model.eval()
     test_loss = 0
     correct = 0
@@ -32,6 +35,6 @@ def test(model, device, test_loader):
 
     test_loss /= len(test_loader.dataset)
 
-    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+    logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
         test_loss, correct, len(test_loader.dataset),
         100. * correct / len(test_loader.dataset)))
\ No newline at end of file
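The `create_logger` helper from `modules/utils/loggers` is imported above but not part of this diff, so its exact behavior is unknown. Purely as an assumption, a minimal implementation consistent with these call sites could look like the sketch below; the real helper may differ, for example by adding file handlers or writing into Hydra's run directory:

```python
# Hypothetical sketch of modules/utils/loggers.py -- NOT taken from the repository.
import logging


def create_logger(name: str = "default", level: int = logging.INFO) -> logging.Logger:
    """Return a named logger with a single stream handler attached."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    # Guard against attaching duplicate handlers when train()/test() call this repeatedly
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
    return logger
```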