diff --git a/.gitignore b/.gitignore index 2ad508e5fe12efb2c14357fb48d04c6c604bacf3..55fa36617188b63244ab0250ecd1084393140c64 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ # don't commit stuff in data, it may be too large and it's better to move it in other ways. # except the configs of your experiments data/* -!data/configs/ +!data/config/ # Don't commit rclone in case you are using it rclone.exe diff --git a/README.md b/README.md index ec778ab5e1a8c0c762482c84a6769dc2953d3534..3c9e9aa1326f560356a2380b1201e3638e174326 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,15 @@ python 01_train_model.py +config=alternative.yaml ############################### python 01_train_model.py --multirun model.num_layers=1,2,3 +############################### +# Executing multiple runs with launchers +############################### +python 01_train_model.py --multirun model.num_layers=1,2,3 +launcher=joblib + +# or + +python 01_train_model.py --multirun model.num_layers=1,2,3 +launcher=slurm + ############################### # Using Hydra and Slurm for cluster job submissions ############################### diff --git a/data/config/experiment/sweep_models_lr.yaml b/data/config/experiment/sweep_models_lr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bda5f0f4ba9137db2cba731ef9e7899ee80164a --- /dev/null +++ b/data/config/experiment/sweep_models_lr.yaml @@ -0,0 +1,11 @@ +# @package _global_ +defaults: + - override /training: default + +hydra: + mode: MULTIRUN + sweeper: + params: + model: net2,net5 + training.lr: 1.0,0.1,0.001 + training.epochs: 1 diff --git a/data/config/experiment/train_model_B.yaml b/data/config/experiment/train_model_B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3cc1e875ca353d1a6042f2b1bfa54d1794fe6d6 --- /dev/null +++ b/data/config/experiment/train_model_B.yaml @@ -0,0 +1,4 @@ +# @package _global_ +defaults: + - override /model: net2 + - override /training: default diff --git 
a/data/config/launcher/joblib.yaml b/data/config/launcher/joblib.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c59459178d9bd26c44f57fac39600cd492f272da --- /dev/null +++ b/data/config/launcher/joblib.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - override /hydra/launcher: joblib + +hydra: + launcher: # https://hydra.cc/docs/plugins/joblib_launcher/ + n_jobs: 5 + diff --git a/data/config/launcher/slurm.yaml b/data/config/launcher/slurm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03162202439a2231b63f1bcd98e03c10dc4b5707 --- /dev/null +++ b/data/config/launcher/slurm.yaml @@ -0,0 +1,12 @@ +# @package _global_ +defaults: + - override /hydra/launcher: submitit_slurm + +hydra: + callbacks: + log_job_return: + _target_: hydra.experimental.callbacks.LogJobReturnCallback + launcher: + setup: [which python, echo 1] + submitit_folder: ${hydra.sweep.dir}/.submitit/%j + diff --git a/data/config/launcher/submitit_local.yaml b/data/config/launcher/submitit_local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a383cb93c199785e8364b0ded05c58c3db8761a --- /dev/null +++ b/data/config/launcher/submitit_local.yaml @@ -0,0 +1,12 @@ +# @package _global_ +defaults: + - override /hydra/launcher: submitit_local + +hydra: + callbacks: + log_job_return: + _target_: hydra.experimental.callbacks.LogJobReturnCallback + launcher: # https://hydra.cc/docs/plugins/submitit_launcher/ + submitit_folder: ${hydra.sweep.dir}/.submitit/%j + nodes: 1 + diff --git a/data/config/model/default.yaml b/data/config/model/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..660f6a7445fd6ef3b4d3504da0b7475d6b900190 --- /dev/null +++ b/data/config/model/default.yaml @@ -0,0 +1,2 @@ +# Model configuration for SimpleNet +num_layers: 3 # Number of layers in the neural network (int) diff --git a/data/config/model/net2.yaml b/data/config/model/net2.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..a7417f879aaa773ef721d4f3079fb8553f6de7f3 --- /dev/null +++ b/data/config/model/net2.yaml @@ -0,0 +1,2 @@ +# Model configuration for SimpleNet +num_layers: 2 # Number of layers in the neural network (int) diff --git a/data/config/model/net5.yaml b/data/config/model/net5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..31dc62de0ddbb14a934e0c385f3e80d3eb5f02af --- /dev/null +++ b/data/config/model/net5.yaml @@ -0,0 +1,2 @@ +# Model configuration for SimpleNet +num_layers: 5 # Number of layers in the neural network (int) diff --git a/data/config/model/net7.yaml b/data/config/model/net7.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5591ce7d76bf5bdfd3ed3b3ccc633184897c17e2 --- /dev/null +++ b/data/config/model/net7.yaml @@ -0,0 +1,2 @@ +# Model configuration for SimpleNet +num_layers: 7 # Number of layers in the neural network (int) diff --git a/data/config/train_model.yaml b/data/config/train_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb29364a1eceb52053c2987f15d18538fc9e9c5d --- /dev/null +++ b/data/config/train_model.yaml @@ -0,0 +1,10 @@ +defaults: + - model: default + - training: default + +hydra: + run: + dir: ./data/outputs/${now:%Y-%m-%d_%H-%M-%S}_${hydra.job.name} + sweep: + dir: ./data/multirun/${now:%Y-%m-%d_%H-%M-%S}_${hydra.job.name} + subdir: ${hydra.job.num} \ No newline at end of file diff --git a/data/config/training/default.yaml b/data/config/training/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55721428203cc665cb0ef1265c5e2bdb65657f23 --- /dev/null +++ b/data/config/training/default.yaml @@ -0,0 +1,14 @@ +# Training configuration for MNIST model +batch_size: 512 # Input batch size for training (int) +test_batch_size: 1000 # Input batch size for testing (int) +epochs: 14 # Number of epochs to train (int) +lr: 1.0 # Learning rate (float) +gamma: 0.7 # Factor for the learning rate scheduler 
(float) +no_cuda: false # Disable CUDA (bool) +no_mps: false # Disable macOS MPS GPU support (bool) +dry_run: false # Perform a dry run (do not update weights) (bool) +seed: 1 # Seed for random number generation (int) +log_interval: 10 # How often to log progress (int) +save_model: true # Whether to save the model to disk (bool) +data_dir: './data/datasets' # Directory to store the dataset (str) +model_dir: './data/models' # Directory to save trained models (str)