diff --git a/.gitignore b/.gitignore
index 55fa36617188b63244ab0250ecd1084393140c64..7e5d9dd62873a85181a66ffd20485e13f429c120 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,9 @@ rclone.exe
 tmp/*
 # don't commit the virtual environment
 venv
+.venv
+# don't commit vscode configs
+.vscode
 
 ###############################################################################
 # The part below was generated automatically
diff --git a/01_train_model.py b/01_train_model.py
deleted file mode 100644
index 870db0ffef290a5c12ff3099c852afe19e512228..0000000000000000000000000000000000000000
--- a/01_train_model.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import hydra
-from omegaconf import DictConfig
-import torch
-import torch.optim as optim
-from torchvision import datasets, transforms
-from torch.optim.lr_scheduler import StepLR
-from pathlib import Path
-
-from modules.models.simple_net import Net
-from modules.training.training import train, test
-
-# Registering the config path with Hydra
-@hydra.main(config_path="./data/config", config_name="train_model", version_base="1.3")
-def main(cfg: DictConfig) -> None:
-    """
-    Main function for training and evaluating a neural network on the MNIST dataset.
-    Utilizes Hydra for configuration management, separating model and training configurations.
-
-    Args:
-        cfg (DictConfig): Configuration object containing all parameters and sub-configurations.
-            Structure and default values of cfg are as follows:
-            ```
-            model:
-                num_layers: 2 # Default: 2, Number of layers in the neural network model.
-            training:
-                batch_size: 64 # Default: 64, Input batch size for training.
-                test_batch_size: 1000 # Default: 1000, input batch size for testing.
-                epochs: 14 # Default: 14, number of epochs to train.
-                lr: 1.0 # Default: 1.0, learning rate.
-                gamma: 0.7 # Default: 0.7, learning rate step gamma.
-                no_cuda: False # Default: False, flag to disable CUDA training.
-                no_mps: False # Default: False, flag to disable mps training.
-                dry_run: False # Default: False, flag for a quick single pass.
-                seed: 1 # Default: 1, random seed for reproducibility.
-                log_interval: 10 # Default: 10, interval for logging training status.
-                save_model: True # Default: True, flag to save the trained model.
-                data_dir: "./data" # Default: "./data", directory for storing dataset files.
-                model_dir: "./models" # Default: "./models", directory for saving trained model files.
-            ```
-
-    Returns:
-        None: This function does not return any value.
-
-    Examples:
-        To run training with the default configuration specified in `./data/config/train_model.yaml`:
-        ```bash
-        $ python train.py
-        ```
-
-        To change the number of epochs to 20:
-        ```bash
-        $ python train.py training.epochs=20
-        ```
-
-        To override configuration with another file `alternative.yaml`:
-        ```bash
-        $ python train.py +config=alternative.yaml
-        ```
-
-        To perform multiple runs with different model sizes using Hydra's multirun feature:
-        ```bash
-        $ python train.py --multirun model.num_layers=1,2,3
-        ```
-
-        Using Hydra and Slurm for cluster job submissions:
-        ```bash
-        $ python train.py --multirun model.num_layers=1,2,3 hydra/launcher=slurm \
-            hydra.launcher.partition=my_partition \
-            hydra.launcher.comment='MNIST training runs' \
-            hydra.launcher.nodes=1 \
-            hydra.launcher.tasks_per_node=1 \
-            hydra.launcher.mem_per_cpu=4G
-        ```
-
-    Note: For integrating Hydra with Slurm, additional configuration may be required and should be checked against Hydra's documentation and your Slurm setup.
-    """
-
-    # Determine if CUDA or MPS should be used based on configuration and availability
-    use_cuda: bool = not cfg.training.no_cuda and torch.cuda.is_available()
-    use_mps: bool = not cfg.training.no_mps and torch.backends.mps.is_available()
-
-    torch.manual_seed(cfg.training.seed)
-
-    device: torch.device = torch.device("cuda") if use_cuda else torch.device("mps") if use_mps else torch.device("cpu")
-
-    # Setup DataLoader arguments based on device availability
-    train_kwargs: dict = {'batch_size': cfg.training.batch_size}
-    test_kwargs: dict = {'batch_size': cfg.training.test_batch_size}
-    if use_cuda:
-        cuda_kwargs: dict = {'num_workers': 1, 'pin_memory': True, 'shuffle': True}
-        train_kwargs.update(cuda_kwargs)
-        test_kwargs.update(cuda_kwargs)
-
-    # Image transformation pipeline
-    transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize((0.1307,), (0.3081,))
-    ])
-
-    # Dataset preparation
-    dataset1: datasets.MNIST = datasets.MNIST(cfg.training.data_dir, train=True, download=True, transform=transform)
-    dataset2: datasets.MNIST = datasets.MNIST(cfg.training.data_dir, train=False, transform=transform)
-
-    # DataLoaders for training and testing
-    train_loader: torch.utils.data.DataLoader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
-    test_loader: torch.utils.data.DataLoader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
-
-    # Model initialization
-    model: Net = Net(num_layers=cfg.model.num_layers).to(device)
-
-    # Optimizer setup
-    optimizer: optim.Optimizer = optim.Adadelta(model.parameters(), lr=cfg.training.lr)
-
-    # Learning rate scheduler
-    scheduler: StepLR = StepLR(optimizer, step_size=1, gamma=cfg.training.gamma)
-
-    # Training loop
-    for epoch in range(1, cfg.training.epochs + 1):
-        train(model, device, train_loader, optimizer, epoch, cfg.training.log_interval, cfg.training.dry_run)
-        test(model, device, test_loader)
-        scheduler.step()
-
-    # Save the model checkpoint if configured to do so
-    if cfg.training.save_model:
-        Path(cfg.training.model_dir).mkdir(parents=True, exist_ok=True)
-        torch.save(model.state_dict(), f"{cfg.training.model_dir}/mnist_cnn_{cfg.training.seed}.pt")
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
diff --git a/README.md b/README.md
index dbcd60195a7d4783db7e45cf6e14c3384bd0b508..276cfaf0087f1d4abc1ca391d79ffe4813c8a5b4 100644
--- a/README.md
+++ b/README.md
@@ -62,17 +62,17 @@ module load Python/3.10.4
 
 ```bash
 # Display help message with all available options and arguments
-python 01_train_model.py --help
+python runs/01_train_model.py --help
 
 # Execute the script with default configuration settings
-python 01_train_model.py
+python runs/01_train_model.py
 ```
 
 ### Manual run
 
 ```bash
 # Execute the script with specific arguments, changing the number of epochs to 2 and the seed to 7
-python 01_train_model.py training.epochs=2 training.seed=7
+python runs/01_train_model.py training.epochs=2 training.seed=7
 ```
 
 ### Sweep with Hydra
@@ -80,10 +80,10 @@ python 01_train_model.py training.epochs=2 training.seed=7
 ```bash
 # Execute multiple runs with different model sizes using Hydra's multirun feature
 # This command will run the script for each combination of the specified values
-python 01_train_model.py --multirun training.epochs=2 model.num_layers=1,2,3
+python runs/01_train_model.py --multirun training.epochs=2 model.object.num_layers=1,2,3
 
 # Execute multiple runs as defined in a configuration file
-python 01_train_model.py +experiment=sweep_models_lr
+python runs/01_train_model.py +experiment=sweep_models_lr
 ```
 
 ### Launchers
@@ -91,13 +91,13 @@ python 01_train_model.py +experiment=sweep_models_lr
 ```bash
 # Execute multiple runs with Hydra's joblib launcher
 # This will run the script for each combination of the specified values using joblib for parallel execution
-python 01_train_model.py --multirun training.epochs=2 model.num_layers=1,2,3 +launcher=joblib
+python runs/01_train_model.py --multirun training.epochs=2 model.object.num_layers=1,2,3 +launcher=joblib
 
 # Or use Hydra's slurm launcher for running on a Slurm-based cluster
-python 01_train_model.py --multirun training.epochs=2 model.num_layers=1,2,3 +launcher=slurm
+python runs/01_train_model.py --multirun training.epochs=2 model.object.num_layers=1,2,3 +launcher=slurm
 
 # Or use Slurm with GPU support, running the script with multiple seed values
-python 01_train_model.py --multirun training.epochs=2 training.seed=0,1,2,3,4 +launcher=slurmgpu
+python runs/01_train_model.py --multirun training.epochs=2 training.seed=0,1,2,3,4 +launcher=slurmgpu
 ```
 
 ## Run Code with Docker (GPU Server)
diff --git a/data/config/dataset/mnist.yaml b/data/config/dataset/mnist.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..15a59e23f40bc6c1eaf5b6b884ea801f529ea985
--- /dev/null
+++ b/data/config/dataset/mnist.yaml
@@ -0,0 +1,14 @@
+name: mnist
+train:
+  _target_: torchvision.datasets.MNIST
+  root: './data/datasets'
+  train: True
+  download: True
+  transform: null
+
+test:
+  _target_: torchvision.datasets.MNIST
+  root: './data/datasets'
+  train: False
+  download: True
+  transform: null
diff --git a/data/config/loader/default.yaml b/data/config/loader/default.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a13aaa886f5489290661a77e0429b41546a43a9a
--- /dev/null
+++ b/data/config/loader/default.yaml
@@ -0,0 +1,14 @@
+name: default
+train:
+  _target_: torch.utils.data.DataLoader
+  batch_size: 64
+  shuffle: True
+  num_workers: 2
+  pin_memory: True
+
+test:
+  _target_: torch.utils.data.DataLoader
+  batch_size: 1000
+  shuffle: False
+  num_workers: 2
+  pin_memory: True
\ No newline at end of file
diff --git a/data/config/model/default.yaml b/data/config/model/default.yaml
deleted file mode 100644
index 660f6a7445fd6ef3b4d3504da0b7475d6b900190..0000000000000000000000000000000000000000
--- a/data/config/model/default.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-# Model configuration for SimpleNet
-num_layers: 3 # Number of layers in the neural network (int)
diff --git a/data/config/model/net2.yaml b/data/config/model/net2.yaml
index a7417f879aaa773ef721d4f3079fb8553f6de7f3..e3f1d1b296c02e581db95f114c9b681a3c254da9 100644
--- a/data/config/model/net2.yaml
+++ b/data/config/model/net2.yaml
@@ -1,2 +1,6 @@
 # Model configuration for SimpleNet
-num_layers: 2 # Number of layers in the neural network (int)
+name: net2 # name of the model configuration
+object:
+  _target_: modules.models.simple_net.Net
+  num_layers: 2
+  latent_dim: 128
\ No newline at end of file
diff --git a/data/config/model/net5.yaml b/data/config/model/net5.yaml
index 31dc62de0ddbb14a934e0c385f3e80d3eb5f02af..355c573145e8af80ae3bcdeda9358e51436fde4f 100644
--- a/data/config/model/net5.yaml
+++ b/data/config/model/net5.yaml
@@ -1,2 +1,6 @@
 # Model configuration for SimpleNet
-num_layers: 5 # Number of layers in the neural network (int)
+name: net5 # name of the model configuration
+object:
+  _target_: modules.models.simple_net.Net
+  num_layers: 5
+  latent_dim: 128
\ No newline at end of file
diff --git a/data/config/model/net7.yaml b/data/config/model/net7.yaml
index 5591ce7d76bf5bdfd3ed3b3ccc633184897c17e2..6c01cb0224359a5cac6e62d833cfcf0dd32713e2 100644
--- a/data/config/model/net7.yaml
+++ b/data/config/model/net7.yaml
@@ -1,2 +1,6 @@
 # Model configuration for SimpleNet
-num_layers: 7 # Number of layers in the neural network (int)
+name: net7 # name of the model configuration
+object:
+  _target_: modules.models.simple_net.Net
+  num_layers: 7
+  latent_dim: 128
\ No newline at end of file
diff --git a/data/config/train_model.yaml b/data/config/train_model.yaml
index 6e2d8f5a237bb4a1a3eb0fe74ebbc39e353ae66c..97937a41585fa92055576caf6519caceeaaa51f6 100644
--- a/data/config/train_model.yaml
+++ b/data/config/train_model.yaml
@@ -1,7 +1,10 @@
 defaults:
   - _self_
-  - model: default
-  - training: default
+  - model: net2
+  - training: short
+  - dataset: mnist
+  - transforms: default
+  - loader: default
 
 hydra:
   run:
diff --git a/data/config/training/default.yaml b/data/config/training/default.yaml
deleted file mode 100644
index 42a67d5ff430e6575c7c90affbf4d848d89cf59d..0000000000000000000000000000000000000000
--- a/data/config/training/default.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Training configuration for MNIST model
-batch_size: 512 # Input batch size for training (int)
-test_batch_size: 1000 # Input batch size for testing (int)
-epochs: 14 # Number of epochs to train (int)
-lr: 1.0 # Learning rate (float)
-gamma: 0.7 # Factor for the learning rate scheduler (float)
-no_cuda: false # Disable CUDA (bool)
-no_mps: false # Disable mps training (bool)
-dry_run: false # Perform a dry run (do not update weights) (bool)
-seed: 1 # Seed for random number generation (int)
-log_interval: 10 # How often to log progress (int)
-save_model: true # Whether to save the model to disk (bool)
-data_dir: './data/datasets' # Directory to store the dataset (str)
-model_dir: './data/models' # Directory to save trained models (str)
diff --git a/data/config/training/short.yaml b/data/config/training/short.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..16bad7c2d978b7b92484b0d6b0ef7c0bfb94b672
--- /dev/null
+++ b/data/config/training/short.yaml
@@ -0,0 +1,22 @@
+# Short training configuration for the model
+name: short # name of the training configuration
+batch_size: 512 # Input batch size for training (int)
+test_batch_size: 1000 # Input batch size for testing (int)
+epochs: 2 # Number of epochs to train (int)
+lr: 1.0 # Learning rate (float)
+gamma: 0.7 # Factor for the learning rate scheduler (float)
+use_cuda: True # Default: True, flag to enable CUDA training.
+use_mps: True # Default: True, flag to enable macOS GPU training.
+dry_run: false # Perform a dry run (do not update weights) (bool)
+seed: 1 # Seed for random number generation (int)
+log_interval: 10 # How often to log progress (int)
+save_model: true # Whether to save the model to disk (bool)
+base_data_dir: './data/datasets' # Directory to store the datasets (str)
+base_model_dir: './data/models' # Directory to save trained models (str)
+optimizer:
+  _target_: torch.optim.Adadelta
+  lr: 1.0
+scheduler:
+  _target_: torch.optim.lr_scheduler.StepLR
+  step_size: 1
+  gamma: 0.7
\ No newline at end of file
diff --git a/data/config/transforms/default.yaml b/data/config/transforms/default.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f56b6c37cfa306ff43a2cbdb06e929d86bf5423f
--- /dev/null
+++ b/data/config/transforms/default.yaml
@@ -0,0 +1,16 @@
+name: default
+train:
+  _target_: torchvision.transforms.Compose
+  transforms:
+    - _target_: torchvision.transforms.ToTensor
+    - _target_: torchvision.transforms.Normalize
+      mean: [0.1307]
+      std: [0.3081]
+
+test:
+  _target_: torchvision.transforms.Compose
+  transforms:
+    - _target_: torchvision.transforms.ToTensor
+    - _target_: torchvision.transforms.Normalize
+      mean: [0.1307]
+      std: [0.3081]
\ No newline at end of file
diff --git a/modules/datasets/__init__.py b/modules/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/modules/fun.py b/modules/fun.py
deleted file mode 100644
index 76e5ca8e912329ca87ef104253f498aa95938688..0000000000000000000000000000000000000000
--- a/modules/fun.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import fire
-
-def fun():
-    print("a")
-
-def fund(d):
-    print(d)
-
-if __name__ == "__main":
-    fire.Fire()
\ No newline at end of file
diff --git a/modules/models/simple_net.py b/modules/models/simple_net.py
index c7187cfb7eba18d24326d261155c2431acd6b995..1b7810b2cd1f43d80edb268e846e647329dd79cb 100644
--- a/modules/models/simple_net.py
+++ b/modules/models/simple_net.py
@@ -3,19 +3,29 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 class Net(nn.Module):
-    def __init__(self, num_layers=1):
+    def __init__(self, num_layers=1, latent_dim=128):
         super(Net, self).__init__()
         self.num_layers = num_layers
+        self.latent_dim = latent_dim
+
+        # Convolutional layers
         self.conv1 = nn.Conv2d(1, 32, 3, 1)
         self.conv2 = nn.Conv2d(32, 64, 3, 1)
+
+        # Dropout layers
         self.dropout1 = nn.Dropout(0.25)
         self.dropout2 = nn.Dropout(0.5)
-        self.fc1 = nn.Linear(9216, 128)
-
+
+        # First fully connected layer
+        self.fc1 = nn.Linear(9216, latent_dim)
+
         # Intermediate fully connected layers
-        self.fc_intermediate = nn.Linear(128, 128) if self.num_layers > 1 else None
-
-        self.fc2 = nn.Linear(128, 10)
+        self.fc_intermediates = nn.ModuleList(
+            [nn.Linear(latent_dim, latent_dim) for _ in range(num_layers - 1)]
+        )
+
+        # Output layer
+        self.fc2 = nn.Linear(latent_dim, 10)
 
     def forward(self, x):
         x = self.conv1(x)
@@ -27,12 +37,12 @@ class Net(nn.Module):
         x = torch.flatten(x, 1)
         x = self.fc1(x)
         x = F.relu(x)
-
-        # Apply the intermediate fully connected layer if it exists
-        if self.fc_intermediate:
-            x = self.fc_intermediate(x)
+
+        # Apply the intermediate fully connected layers
+        for fc in self.fc_intermediates:
+            x = fc(x)
             x = F.relu(x)
-
+
         x = self.dropout2(x)
         x = self.fc2(x)
         output = F.log_softmax(x, dim=1)
diff --git a/modules/utils/parallelize.py b/modules/utils/parallelize.py
deleted file mode 100644
index 73331e60cc111b2d179e92588c4e832f44705e60..0000000000000000000000000000000000000000
--- a/modules/utils/parallelize.py
+++ /dev/null
@@ -1,221 +0,0 @@
-import multiprocessing
-import subprocess
-import threading
-import time
-import random
-from pathlib import Path
-from tqdm import tqdm
-from itertools import product
-import sys
-from .loggers import create_logger
-
-def stream_to_file(pipe, file_object):
-    """
-    Reads lines continuously from a given pipe (stdout or stderr) and writes them to a file object in real-time.
-    This function is designed to be run in a separate thread, allowing asynchronous logging of subprocess output
-    without blocking the main execution thread.
-
-    Args:
-        pipe (io.TextIOWrapper): The pipe object from which to read the output. This is typically obtained
-            from the stdout or stderr of a subprocess.Popen object, configured to be in text mode.
-
-        file_object (file): An open file object where the pipe's content will be written. The file must be
-            opened in a mode that supports writing (e.g., 'w', 'a').
-
-    This function does not return a value but writes output directly to the provided file_object until the pipe
-    is closed. It's important to ensure that the file_object and pipe are correctly closed by the caller to
-    avoid resource leaks.
-
-    Example usage:
-        # Assuming process is a subprocess.Popen object
-        with open('log.txt', 'w') as f:
-            stdout_thread = threading.Thread(target=stream_to_file, args=(process.stdout, f))
-            stdout_thread.start()
-            # Other operations can be performed here while the thread captures the output in the background.
-            stdout_thread.join()  # Ensure the logging thread has completed before closing the file.
-    """
-    for line in iter(pipe.readline, ''):
-        file_object.write(line)
-    pipe.close()
-
-def p_run(func_name, log_name=None, base_file='run.py', **kwargs):
-    """
-    Executes a specified Python function as a subprocess from a given script, capturing its standard output (stdout)
-    and standard error (stderr) to a log file in real-time. Allows for specifying a custom script file from which
-    the function will be called, instead of the default 'run.py'.
-
-    Args:
-        func_name (str): The name of the Python function to execute. This function should be accessible
-            within the script specified by the 'base_file' parameter.
-
-        log_name (str, optional): Custom name for the log file. If provided, the log file will be named
-            according to this parameter. If not provided, a unique name is generated based on the function
-            name, current timestamp, and a random number.
-
-        base_file (str, optional): The Python script from which the function will be executed. Defaults
-            to 'run.py'. This script must be located in a directory accessible by the Python interpreter
-            executing the `p_run` function.
-
-        **kwargs: Arbitrary keyword arguments that are passed to the function being executed. These arguments
-            are converted to command-line arguments in the format '--key=value'.
-
-    The function creates a log file in the './data/logs/' directory, ensuring the directory exists. It then
-    initiates the subprocess with stdout and stderr piped. Two separate threads are spawned to asynchronously
-    capture the output from these pipes to the log file.
-
-    This function does not return any value but logs the execution status. In case of a subprocess failure
-    (non-zero return code), an informative message is logged using a custom logger.
-
-    Example usage:
-        p_run('data_processing_function', arg1='value1', arg2='value2', base_file='alternative_script.py')
-        # This would execute the 'data_processing_function' from 'alternative_script.py' with the specified arguments and capture its output to a log file.
-    """
-    # Initialize logger
-    logger = create_logger("p_run")
-
-    # Convert keyword arguments to command-line arguments
-    args = [f"--{key}={value}" for key, value in kwargs.items()]
-
-    # Construct the command to be executed
-    cmd = [sys.executable, base_file, func_name] + args
-
-    # Ensure the logs directory exists
-    Path("./data/logs/").mkdir(parents=True, exist_ok=True)
-
-    # Generate log file name
-    logfile = f"./data/logs/{log_name}.log" if log_name else f"./data/logs/log_{func_name}_{int(time.time())}_{random.randint(0,999):03d}.log"
-
-    # Open the log file
-    with open(logfile, "w") as output_file:
-        # Start the subprocess
-        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-
-        # Create threads to capture stdout and stderr
-        stdout_thread = threading.Thread(target=stream_to_file, args=(process.stdout, output_file))
-        stderr_thread = threading.Thread(target=stream_to_file, args=(process.stderr, output_file))
-
-        # Start the threads
-        stdout_thread.start()
-        stderr_thread.start()
-
-        # Wait for the output capture to complete
-        stdout_thread.join()
-        stderr_thread.join()
-
-        # Ensure the subprocess has finished
-        process.wait()
-
-        # Log if the process did not complete successfully
-        if process.returncode != 0:
-            logger.info(f"Failed: {' '.join(cmd)}")
-
-def parallelize_function(func, parallel_args_list, num_processes=None, base_file='run.py'):
-    """
-    Runs a function in parallel across multiple processes, using a specified script file. This function is designed
-    to parallelize execution of tasks that are encapsulated in Python functions, with arguments specified for each
-    task. It captures the output of these functions to log files in real-time.
-
-    Args:
-        func (function): The function to be parallelized. Note that this function should be accessible within
-            the script specified by the 'base_file' parameter.
-
-        parallel_args_list (list): A list of tuples, where each tuple contains a tuple of positional arguments
-            and a dictionary of keyword arguments for each invocation of the parallelized function. The keyword
-            arguments can include a 'log_name' key to specify custom log file names for each process.
-
-        num_processes (int, optional): The number of processes to use for parallel execution. If None, the
-            function defaults to using the number of available CPU cores on the system.
-
-        base_file (str, optional): The Python script from which the function will be executed in each subprocess.
-            Defaults to 'run.py'. This allows for specifying different scripts for different tasks.
-
-    Returns:
-        list: A list of return values from the function for each set of arguments. Note that in the context of
-            this function, the primary purpose is to execute parallel tasks rather than collecting return values,
-            as the output is directed to log files.
-
-    Example usage:
-        def my_func(arg1, arg2):
-            # Function body here
-
-        args_list = [((arg1_value, arg2_value), {'log_name': 'custom_log_for_this_invocation'})]
-        results = parallelize_function(my_func, args_list, num_processes=4, base_file='my_script.py')
-        # This will run 'my_func' in parallel, using 'my_script.py', with specified arguments and log names.
-    """
-    results = []
-
-    # Determine the number of processes
-    if num_processes is None:
-        num_processes = multiprocessing.cpu_count()
-
-    # Initialize multiprocessing pool and tqdm progress bar
-    with multiprocessing.Pool(processes=num_processes) as pool, tqdm(total=len(parallel_args_list), desc="Processing", unit="task") as pbar:
-        async_results = []
-
-        # Submit tasks to the pool
-        for arg_set in parallel_args_list:
-            args, kwargs = arg_set
-            kwargs['base_file'] = base_file  # Add the base_file to kwargs for each task
-            result = pool.apply_async(func, args=args, kwds=kwargs, callback=lambda _: pbar.update(1))
-            async_results.append(result)
-
-        # Collect results
-        for async_result in async_results:
-            results.append(async_result.get())
-
-    return results
-
-def pex(func_name, *args, num_processes=None, base_file=None, **kwargs):
-    """
-    Executes a specified function in parallel across multiple processes, with the ability to expand and
-    combine list-type keyword arguments into multiple sets of arguments for the function. This allows for
-    comprehensive and efficient experimentation or task execution with varied parameters.
-
-    Args:
-        func_name (str): The name of the function to execute in parallel. This function must be accessible
-            within the script specified by the 'base_file' parameter.
-
-        *args: Positional arguments to be passed directly to the function. These are not expanded or varied
-            and are passed as-is to every invocation of the function.
-
-        num_processes (int, optional): The number of processes to use for the parallel execution. Defaults to
-            the number of available CPU cores if None.
-
-        base_file (str, optional): The Python script from which the function will be executed. If None, the
-            script that is currently being executed (where this function is called) is used. This allows for
-            different scripts to be used for different parallel execution tasks.
-
-        **kwargs: Keyword arguments for the function. If a keyword argument is a list, this function will
-            generate combinations of these lists, running the target function for each combination alongside any
-            non-list arguments.
-
-    Returns:
-        list: A list of results from each process. Note that in this context, the primary purpose is to
-            execute parallel tasks, and the actual return values might be less relevant if outputs are captured
-            in log files or external systems.
-
-    Example usage:
-        pex('process_data', data_id=42, filters=['filter1', 'filter2'], num_processes=4)
-        # This would parallelize 'process_data' function calls over the combinations of 'filters' with
-        # 'data_id' as a constant argument, using the current script as the base file for execution.
-    """
-    # Set base_file to the current script if not specified
-    if base_file is None:
-        base_file = sys.argv[0]
-
-    # Separate list and non-list kwargs
-    list_args = {k: v for k, v in kwargs.items() if isinstance(v, list)}
-    non_list_args = {k: v for k, v in kwargs.items() if k not in list_args}
-
-    # Generate combinations of list arguments
-    combinations = [dict(zip(list_args, prod)) for prod in product(*list_args.values())]
-
-    # Prepare arguments for parallel execution
-    parallel_args_list = []
-    for comb in combinations:
-        combined_args = {**non_list_args, **comb, 'func_name': func_name, 'base_file': base_file}
-        parallel_args_list.append((args, combined_args))
-
-    # Execute in parallel
-    return parallelize_function(p_run, parallel_args_list, num_processes=num_processes, base_file=base_file)
-
diff --git a/runs/01_train_model.py b/runs/01_train_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1acc31fbbb5dadafd84e488598aa22cf1a13f68
--- /dev/null
+++ b/runs/01_train_model.py
@@ -0,0 +1,167 @@
+# Add the parent directory to the Python path (because we are executing Hydra from within runs)
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).resolve().parents[1]))
+
+import hydra
+from omegaconf import DictConfig, OmegaConf
+from hydra.utils import instantiate
+import torch
+import json
+import time
+
+from modules.training.training import train, test
+from modules.utils.seeds import seed_everything
+
+# Registering the config path with Hydra
+@hydra.main(config_path="../data/config", config_name="train_model", version_base="1.3")
+def main(cfg: DictConfig) -> None:
+    """
+    Main function for training and evaluating a neural network on the MNIST dataset.
+    Utilizes Hydra for configuration management, separating model, training, dataset, transforms, and loader configurations.
+
+    Args:
+        cfg (DictConfig): Configuration object containing all parameters and sub-configurations.
+            Structure and default values of cfg are as follows:
+            ```
+            model:
+                name: net2 # Default: net2; the network itself is defined under model.object (_target_, num_layers, latent_dim).
+            training:
+                batch_size: 512 # Default: 512, input batch size for training.
+                test_batch_size: 1000 # Default: 1000, input batch size for testing.
+                epochs: 2 # Default: 2, number of epochs to train.
+                lr: 1.0 # Default: 1.0, learning rate.
+                gamma: 0.7 # Default: 0.7, learning rate step gamma.
+                use_cuda: True # Default: True, flag to enable CUDA training.
+                use_mps: True # Default: True, flag to enable macOS GPU training.
+                dry_run: False # Default: False, flag for a quick single pass.
+                seed: 1 # Default: 1, random seed for reproducibility.
+                log_interval: 10 # Default: 10, interval for logging training status.
+                save_model: True # Default: True, flag to save the trained model.
+                base_data_dir: './data/datasets' # Default, base directory for storing dataset files.
+                base_model_dir: './data/models' # Default, base directory for saving trained models and results.
+            ```
+
+    Returns:
+        None: This function does not return any value.
+
+    Examples:
+        To run training with the default configuration specified in `../data/config/train_model.yaml`:
+        ```bash
+        $ python runs/01_train_model.py
+        ```
+
+        To change the number of epochs to 2 and seed to 7:
+        ```bash
+        $ python runs/01_train_model.py training.epochs=2 training.seed=7
+        ```
+
+        To override configuration with another file `sweep_models_lr.yaml`:
+        ```bash
+        $ python runs/01_train_model.py +experiment=sweep_models_lr
+        ```
+
+        To perform multiple runs with different model sizes and training epochs using Hydra's multirun feature:
+        ```bash
+        $ python runs/01_train_model.py --multirun training.epochs=2 model.object.num_layers=1,2,3
+        ```
+
+        Using Hydra's joblib launcher for multiple runs:
+        ```bash
+        $ python runs/01_train_model.py --multirun training.epochs=2 model.object.num_layers=1,2,3 +launcher=joblib
+        ```
+
+        Or using Slurm for cluster job submissions:
+        ```bash
+        $ python runs/01_train_model.py --multirun training.epochs=2 model.object.num_layers=1,2,3 +launcher=slurm
+        ```
+
+        Or using Slurm with GPU for multiple seeds:
+        ```bash
+        $ python runs/01_train_model.py --multirun training.epochs=2 training.seed=0,1,2,3,4 +launcher=slurmgpu
+        ```
+
+    Note: For integrating Hydra with Slurm, additional configuration may be required and should be checked against Hydra's documentation and your Slurm setup.
+    """
+
+    ##############################
+    # Preliminaries
+    ##############################
+
+    # Create a directory for saving models and results
+    model_save_dir = Path(cfg.training.base_model_dir) / f"{cfg.dataset.name}_{cfg.model.name}_{cfg.training.name}_{cfg.training.seed}"
+    model_save_dir.mkdir(parents=True, exist_ok=True)
+
+    # Determine if CUDA or MPS should be used based on configuration and availability
+    use_cuda = cfg.training.use_cuda and torch.cuda.is_available()
+    use_mps = cfg.training.use_mps and torch.backends.mps.is_available()
+
+    # Set the random seed for reproducibility
+    seed_everything(cfg.training.seed)
+
+    # Select the device for computation (CUDA, MPS, or CPU)
+    device = torch.device("cuda") if use_cuda else torch.device("mps") if use_mps else torch.device("cpu")
+
+    ##############################
+    # Object Instantiation
+    ##############################
+
+    # Instantiate transforms for training and testing
+    train_transform = instantiate(cfg.transforms.train)
+    test_transform = instantiate(cfg.transforms.test)
+
+    # Instantiate datasets with transforms
+    train_dataset = instantiate(cfg.dataset.train, transform=train_transform)
+    test_dataset = instantiate(cfg.dataset.test, transform=test_transform)
+
+    # Instantiate data loaders
+    train_loader = instantiate(cfg.loader.train, dataset=train_dataset)
+    test_loader = instantiate(cfg.loader.test, dataset=test_dataset)
+
+    # Instantiate model
+    model = instantiate(cfg.model.object).to(device)
+
+    # Instantiate optimizer
+    optimizer = instantiate(cfg.training.optimizer, params=model.parameters())
+
+    # Instantiate scheduler
+    scheduler = instantiate(cfg.training.scheduler, optimizer=optimizer)
+
+    ##############################
+    # Actual Task: Training Loop
+    ##############################
+
+    # Training loop
+    for epoch in range(1, cfg.training.epochs + 1):
+        train(model, device, train_loader, optimizer, epoch, cfg.training.log_interval, cfg.training.dry_run)
+        test(model, device, test_loader)
+        scheduler.step()
+
+    ##############################
+    # Saving Results
+    ##############################
+
+    # Save the model checkpoint if configured to do so
+    model_path = model_save_dir / "checkpoint.pt"
+    if cfg.training.save_model:
+        torch.save(model.state_dict(), model_path)
+
+    # Save the result dictionary
+    results = {
+        "model_name": cfg.model.name,
+        "training_name": cfg.training.name,
+        "epochs": cfg.training.epochs,
+        "seed": cfg.training.seed,
+        "final_model_path": str(model_path),
+        "timestamp": time.time()
+    }
+    with open(model_save_dir / "results.json", "w") as f:
+        json.dump(results, f, indent=4)
+
+    # Save the configuration
+    config_path = model_save_dir / "config.yaml"
+    with open(config_path, "w") as f:
+        OmegaConf.save(config=cfg, f=f)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
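
With this refactor, a finished run directory under `./data/models/` contains both the composed `config.yaml` and the `checkpoint.pt` written by `runs/01_train_model.py`. A minimal sketch of how that output can be consumed afterwards, assuming the default `mnist`/`net2`/`short`/seed-1 run-directory name (adjust the path to your own run):

```python
# Sketch: rebuild a trained Net from a run directory written by runs/01_train_model.py.
# Assumes the directory contains the config.yaml and checkpoint.pt saved by that script.
from pathlib import Path

import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf

run_dir = Path("./data/models/mnist_net2_short_1")  # <dataset>_<model>_<training>_<seed>
cfg = OmegaConf.load(run_dir / "config.yaml")       # composed config saved after training

model = instantiate(cfg.model.object)               # builds modules.models.simple_net.Net from _target_
model.load_state_dict(torch.load(run_dir / "checkpoint.pt", map_location="cpu"))
model.eval()
```

Because the full composed config is saved next to the checkpoint, the same `instantiate` call used during training reproduces the exact architecture (`num_layers`, `latent_dim`) without importing the model class by hand.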
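
The new script also imports `seed_everything` from `modules.utils.seeds`, which is not part of this diff. A hypothetical sketch of what such a helper typically looks like is below; the actual implementation in the repository may seed additional libraries or configure deterministic backends differently:

```python
# Hypothetical sketch of modules/utils/seeds.py; the real helper may differ.
import random

import numpy as np
import torch


def seed_everything(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch RNGs so runs are reproducible."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
```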