diff --git a/01_train_model.py b/01_train_model.py
index aecab4018ce28f692f7d25999d157cee2b291840..870db0ffef290a5c12ff3099c852afe19e512228 100644
--- a/01_train_model.py
+++ b/01_train_model.py
@@ -29,7 +29,7 @@ def main(cfg: DictConfig) -> None:
         lr: 1.0 # Default: 1.0, learning rate.
         gamma: 0.7 # Default: 0.7, learning rate step gamma.
         no_cuda: False # Default: False, flag to disable CUDA training.
-        no_mps: False # Default: False, flag to disable macOS GPU training.
+        no_mps: False # Default: False, flag to disable MPS (macOS GPU) training.
         dry_run: False # Default: False, flag for a quick single pass.
         seed: 1 # Default: 1, random seed for reproducibility.
         log_interval: 10 # Default: 10, interval for logging training status.
diff --git a/README.md b/README.md
index 3c9e9aa1326f560356a2380b1201e3638e174326..8d4b18a0591b5e6fd081358ceea45aa1470b74aa 100644
--- a/README.md
+++ b/README.md
@@ -17,10 +17,9 @@ docker run -it --rm --name xaicu118 --gpus all -p 8888:8888 -p 6007:6007 -v %cd%
 docker run -d --rm --name xaicu118 --gpus all -p 8888:8888 -p 6007:6007 -v %cd%:/home/example andresfp14/xaicu118 bash
 
 # Examples of how to launch it in linux
-docker run -it --rm --name xaicu118 --shm-size 100G --gpus all -p 8888:8888 -p 6007:6007 -v $(pwd):/home/example andresfp14/xaicu118 bash
-docker run -d --rm --name xaicu118 --shm-size 50G --gpus all -p 8888:8888 -p 6007:6007 -v $(pwd):/home/example andresfp14/xaicu118 bash
-docker run -idt --rm --name xai_1 --shm-size 50G --gpus '"device=0:0"' -v ~/data/datasets:/home/example/data/datasets -v $(pwd):/home/example andresfp14/xaicu118
-docker run -idt --rm --name xai_2 --shm-size 50G --gpus '"device=0:0"' -v $(pwd):/home/example andresfp14/xaicu118
+docker run -itd --rm --name xaicu118 --shm-size 5G --gpus all -p 8888:8888 -p 6007:6007 -v $(pwd):/home/example andresfp14/xaicu118 bash
+docker run -idt --rm --name xai_1 --shm-size 5G --gpus '"device=0:0"' -v ~/data/datasets:/home/example/data/datasets -v $(pwd):/home/example andresfp14/xaicu118
+docker run -idt --rm --name xai_2 --shm-size 5G --gpus '"device=0:0"' -v $(pwd):/home/example andresfp14/xaicu118
 ```
 
 
@@ -48,16 +47,20 @@ conda deactivate
 # with virtualenv
 ###############################
 # creates a virtualenv
-python -m venv envname
+python -m venv .venv
 # activates the virtualenv
-source envname/bin/activate
-. envname/bin/activate
+source .venv/bin/activate
+. .venv/bin/activate
 # install requirements
 pip install -r ./env_setup/requirements.txt
 # export environment (if you want to update it)
 pip freeze > ./env_setup/requirements2.txt
 # deactivate virtual environment
 deactivate
+
+
+# if you are using the HPC, consider:
+module load Python/3.10.4
 ```
 
 ## 3) Run code
@@ -78,34 +81,28 @@ python 01_train_model.py
 ###############################
 # Executing and changing an argument
 ###############################
-python 01_train_model.py training.seed=7
-
-###############################
-# Executing with an alternative configuration file
-###############################
-python 01_train_model.py +config=alternative.yaml
+python 01_train_model.py training.epochs=2 training.seed=7
 
 
 ###############################
 # Executing multiple runs with different model sizes using Hydra's multirun feature
 ###############################
-python 01_train_model.py --multirun model.num_layers=1,2,3
+python 01_train_model.py --multirun training.epochs=2 model.num_layers=1,2,3
 
 
 ###############################
 # Executing multiple runs with launchers
 ###############################
-python 01_train_model.py --multirun model.num_layers=1,2,3 +launcher=joblib
+python 01_train_model.py --multirun training.epochs=2 model.num_layers=1,2,3 +launcher=joblib
 # or
-python 01_train_model.py --multirun model.num_layers=1,2,3 +launcher=slurm
+python 01_train_model.py --multirun training.epochs=2 model.num_layers=1,2,3 +launcher=slurm
+
+# or
+
+python 01_train_model.py --multirun training.epochs=2 training.seed=0,1,2,3,4 +launcher=slurmgpu
+
+# or
+
+python 01_train_model.py --multirun +experiment=sweep_models_lr +launcher=slurm
 
-###############################
-# Using Hydra and Slurm for cluster job submissions
-###############################
-python 01_train_model.py --multirun model.num_layers=1,2,3 hydra/launcher=slurm \
-    hydra.launcher.partition=my_partition \
-    hydra.launcher.comment='MNIST training runs' \
-    hydra.launcher.nodes=1 \
-    hydra.launcher.tasks_per_node=1 \
-    hydra.launcher.mem_per_cpu=4G
 ```
\ No newline at end of file
diff --git a/data/config/training/default.yaml b/data/config/training/default.yaml
index 55721428203cc665cb0ef1265c5e2bdb65657f23..42a67d5ff430e6575c7c90affbf4d848d89cf59d 100644
--- a/data/config/training/default.yaml
+++ b/data/config/training/default.yaml
@@ -5,7 +5,7 @@ epochs: 14 # Number of epochs to train (int)
 lr: 1.0 # Learning rate (float)
 gamma: 0.7 # Factor for the learning rate scheduler (float)
 no_cuda: false # Disable CUDA (bool)
-no_mps: false # Disable MacOS MPS GPU support (bool)
+no_mps: false # Disable MPS (macOS GPU) training (bool)
 dry_run: false # Perform a dry run (do not update weights) (bool)
 seed: 1 # Seed for random number generation (int)
 log_interval: 10 # How often to log progress (int)