Commit 9d6350b1 authored by andres

adding autocast

parent f337a95c
 name: basic
 train:
   _target_: torch.utils.data.DataLoader
-  batch_size: 64
+  batch_size: 256 #64
   shuffle: True
   num_workers: 2
   pin_memory: True
...
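The `_target_` key indicates that the loader is built from this config node at runtime. Below is a minimal sketch of how such a node is commonly turned into a live DataLoader with Hydra's `instantiate`; the use of Hydra here is an assumption based on the `_target_`/`DictConfig` conventions in this repository, and the dataset is a hypothetical stand-in, not a project object.

import torch
from torch.utils.data import TensorDataset
from hydra.utils import instantiate          # assumption: Hydra is used for instantiation
from omegaconf import OmegaConf

# Hypothetical stand-in dataset so the sketch runs end to end.
train_dataset = TensorDataset(torch.randn(1024, 10), torch.randint(0, 2, (1024,)))

loader_cfg = OmegaConf.create({
    "_target_": "torch.utils.data.DataLoader",
    "batch_size": 256,
    "shuffle": True,
    "num_workers": 2,
    "pin_memory": True,
})

# The dataset is supplied at call time; everything else comes from the config node.
train_loader = instantiate(loader_cfg, dataset=train_dataset)
x, y = next(iter(train_loader))              # x.shape == (256, 10)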
 # Training configuration for the model model
 name: basic # name of the training configuration
-use_cuda: True # Default: True, flag to enable CUDA training.
-use_mps: True # Default: True, flag to enable macOS GPU training.
-dry_run: false # Perform a dry run (do not update weights) (bool)
-seed: 1 # Seed for random number generation (int)
-save_model: true # Whether to save the model to disk (bool)
+seed: 1
+device: cuda
 loss:
   _target_: torch.nn.NLLLoss
 optimizer:
@@ -15,19 +12,18 @@ scheduler:
   _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
   mode: 'min'
   factor: 0.1
-  patience: 10
+  patience: 3
-scheduler_monitor: train_loss_epoch
-min_epochs: 50
+min_epochs: 10
 max_epochs: 300
 early_stopping_config:
   monitor: valid_acc_epoch
   min_delta: 0.001
-  patience: 20
+  patience: 10
   verbose: False
   mode: max
 gradient_clip_val: 1
 metrics: ['acc', 'f1']
-gradient_accumulation_steps: 1
+gradient_accumulation_steps: 5
 loggers:
   tensorboard:
     _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
...
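For scale: with `batch_size: 256` and `gradient_accumulation_steps: 5`, the optimizer now sees an effective batch of 256 × 5 = 1280 samples per update step. The scheduler block maps directly onto `torch.optim.lr_scheduler.ReduceLROnPlateau`; the sketch below shows how the tightened `patience: 3` behaves. The linear model, optimizer, and loss values are placeholders, not taken from the project.

import torch

model = torch.nn.Linear(10, 2)                               # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)    # placeholder optimizer

# Mirrors the config: mode='min', factor=0.1, patience=3
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3
)

# Once the monitored value fails to improve for more than `patience`
# consecutive epochs, the learning rate is multiplied by `factor`.
for epoch, val_loss in enumerate([1.0, 0.9, 0.9, 0.9, 0.9, 0.9]):
    scheduler.step(val_loss)
    print(epoch, optimizer.param_groups[0]["lr"])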
@@ -9,7 +9,7 @@ from torch.utils.data import DataLoader
 from modules.utils import MetricAggregator, EarlyStoppingCustom
 import logging
-def train_model(model: nn.Module, train_loader: DataLoader, valid_loader: DataLoader, cfg: DictConfig, device="cpu"):
+def train_model(model: nn.Module, train_loader: DataLoader, valid_loader: DataLoader, cfg: DictConfig):
     """
     Trains and evaluates a neural network model.
@@ -27,7 +27,7 @@ def train_model(model: nn.Module, train_loader: DataLoader, valid_loader: DataLo
     ##############################
     # Device Setup
     ##############################
-    device = torch.device(cfg.device)
+    device = "cuda" if (cfg.device=="cuda" and torch.cuda.is_available()) else "cpu"
     model.to(device)
     ##############################
@@ -58,6 +58,9 @@ def train_model(model: nn.Module, train_loader: DataLoader, valid_loader: DataLo
     # Early stopping and checkpointing
     early_stopping = EarlyStoppingCustom(**cfg.early_stopping_config)
+    # scaler for automatic mixed precision
+    scaler = torch.cuda.amp.GradScaler()
     ##############################
     # Epoch Loop
     ##############################
@@ -67,24 +70,30 @@ def train_model(model: nn.Module, train_loader: DataLoader, valid_loader: DataLo
         ##############################
         model.train()
         lr = torch.tensor(optimizer.param_groups[0]['lr'])
-        for batch_idx, batch in enumerate(train_loader):
-            x, y = batch
+        for batch_idx, (x, y) in enumerate(train_loader):
             x, y = x.to(device), y.to(device)
+            with torch.autocast(device_type=device):
                 out = model(x)
                 logprob = F.log_softmax(out, dim=1)
                 y_hat_prob = torch.exp(logprob)
                 loss = criterion(logprob, y)
-            loss.backward()
+                loss = loss / cfg.gradient_accumulation_steps
+            # Accumulates scaled gradients.
+            scaler.scale(loss).backward()
             # Gradient accumulation
             if (batch_idx + 1) % cfg.gradient_accumulation_steps == 0 or (batch_idx + 1) == len(train_loader):
-                optimizer.step()
+                scaler.step(optimizer)
+                scaler.update()
                 optimizer.zero_grad()
             # Update metrics
+            with torch.autocast(device_type=device):
                 metric_aggregator.step(y_hat_prob=y_hat_prob, y=y, loss=loss, epoch=torch.tensor(epoch+1), lr=lr, phase="train")
         # Compute and log metrics
+        with torch.autocast(device_type=device):
             train_results = metric_aggregator.compute(phase="train")
         logger.info(f"Epoch {epoch+1} Train: {' '.join([f'{k}:{v:.3E}'.replace('_epoch','').replace('train_','') for k,v in train_results.items() if isinstance(v,float)])}")
@@ -93,18 +102,20 @@ def train_model(model: nn.Module, train_loader: DataLoader, valid_loader: DataLo
         ##############################
         model.eval()
         with torch.no_grad():
-            for batch in valid_loader:
-                x, y = batch
+            for batch_idx, (x, y) in enumerate(valid_loader):
                 x, y = x.to(device), y.to(device)
+                with torch.autocast(device_type=device):
                     out = model(x)
                     logprob = F.log_softmax(out, dim=1)
                     y_hat_prob = torch.exp(logprob)
                     val_loss = criterion(logprob, y)
                 # Update metrics
+                with torch.autocast(device_type=device):
                     metric_aggregator.step(y_hat_prob=y_hat_prob, y=y, loss=loss, epoch=torch.tensor(epoch+1), lr=lr, phase="valid")
         # Compute and log metrics
+        with torch.autocast(device_type=device):
             valid_results = metric_aggregator.compute(phase="valid")
         logger.info(f"Epoch {epoch+1} Valid: {' '.join([f'{k}:{v:.3E}'.replace('_epoch','').replace('valid_','') for k,v in valid_results.items() if isinstance(v,float)])}")
...
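Taken together, the training-loop changes follow the standard PyTorch recipe for automatic mixed precision combined with gradient accumulation: run the forward pass and loss under `autocast`, divide the loss by the number of accumulation steps, backpropagate through `GradScaler.scale`, and only step the optimizer and update the scaler every `gradient_accumulation_steps` batches. Below is a minimal self-contained sketch of that pattern, with a placeholder model and random data standing in for the project's objects; `accum_steps` plays the role of `cfg.gradient_accumulation_steps`.

import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(10, 2).to(device)                    # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# GradScaler only does real work on CUDA; enabled=False makes it a no-op on CPU.
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))
accum_steps = 5

# Random batches standing in for a DataLoader.
data = [(torch.randn(32, 10), torch.randint(0, 2, (32,))) for _ in range(12)]

for batch_idx, (x, y) in enumerate(data):
    x, y = x.to(device), y.to(device)
    with torch.autocast(device_type=device, enabled=(device == "cuda")):
        out = model(x)
        loss = F.cross_entropy(out, y) / accum_steps         # scale for accumulation
    scaler.scale(loss).backward()                            # accumulate scaled gradients
    if (batch_idx + 1) % accum_steps == 0 or (batch_idx + 1) == len(data):
        scaler.step(optimizer)                               # unscales grads, then steps
        scaler.update()                                      # adjusts the scale factor
        optimizer.zero_grad()

`scaler.step(optimizer)` unscales the accumulated gradients before applying them and skips the update if any are non-finite, and `scaler.update()` then adapts the scale factor, which is why both calls sit inside the accumulation branch rather than running every batch.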
@@ -41,10 +41,10 @@ class MetricAggregator:
             self.init_agg(phase=phase, metric=metric)
         for metric in ["acc", "cm", "f1"]:
             if metric in self.metrics:
-                self.aggregators[phase][metric](y_hat_prob.to(self.device), y.to(self.device))
+                self.aggregators[phase][metric].update(y_hat_prob.to(self.device), y.to(self.device))
         for k, v in kwargs.items():
             if k in self.aggregators[phase]:
-                self.aggregators[phase][k](v.to(self.device))
+                self.aggregators[phase][k].update(v.to(self.device))
         for logger in self.loggers:
             logger.log_metrics({f"{phase}_{k}_step":v.detach().cpu().tolist() for k,v in kwargs.items()}, step=self.step_num)
         self.step_num+=1
...
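Replacing the direct call with `.update()` matches the torchmetrics idiom of accumulating per-batch state and reading it out once per epoch via `.compute()`; that the aggregators here are torchmetrics objects is an assumption based on the `acc`/`f1`/`cm` names and the `compute(phase=...)` call elsewhere in the diff. A minimal sketch of that idiom with a stand-alone metric:

import torch
from torchmetrics.classification import MulticlassAccuracy

acc = MulticlassAccuracy(num_classes=3)

for _ in range(4):                           # pretend these are training steps
    probs = torch.softmax(torch.randn(8, 3), dim=1)
    target = torch.randint(0, 3, (8,))
    acc.update(probs, target)                # accumulate state only

epoch_acc = acc.compute()                    # aggregate over all updates
acc.reset()                                  # start fresh for the next epoch

Calling a torchmetrics metric directly, as in `acc(probs, target)`, also updates its state, but additionally computes and returns the batch-level value, which is wasted work when only the epoch aggregate is logged.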
@@ -91,15 +91,11 @@ def main(cfg: DictConfig) -> None:
     model_save_dir = Path(cfg.path.base_path_models) / cfg.path.results
     model_save_dir.mkdir(parents=True, exist_ok=True)
-    # Determine if CUDA or MPS should be used based on configuration and availability
-    use_cuda = cfg.training.use_cuda and torch.cuda.is_available()
-    use_mps = cfg.training.use_mps and torch.backends.mps.is_available()
     # Set the random seed for reproducibility
     seed_everything(cfg.training.seed)
     # Select the device for computation (CUDA, MPS, or CPU)
-    device = torch.device("cuda") if use_cuda else torch.device("mps") if use_mps else torch.device("cpu")
+    device = "cuda" if (cfg.training.device=="cuda" and torch.cuda.is_available()) else "cpu"
     ##############################
     # Object Instantiation
@@ -125,29 +121,16 @@
     ##############################
     # Training loop
-    train_model(model, train_loader, test_loader, cfg.training, device)
+    train_model(model, train_loader, test_loader, cfg.training)
     ##############################
     # Saving Results
     ##############################
     # Save the model checkpoint if configured to do so
-    if cfg.training.save_model:
-        model_path = model_save_dir / f"checkpoint.pt"
+    model_path = model_save_dir / f"checkpoint.ckpt"
     torch.save(model.state_dict(), model_path)
-    # Save the result dictionary
-    results = {
-        "model_name": cfg.model.name,
-        "training_name": cfg.training.name,
-        "epochs": cfg.training.epochs,
-        "seed": cfg.training.seed,
-        "final_model_path": str(model_path),
-        "timestamp": time.time()
-    }
-    with open(model_save_dir / "results.json", "w") as f:
-        json.dump(results, f, indent=4)
     # Save the configuration
     config_path = model_save_dir / "config.yaml"
     with open(config_path, "w") as f:
...
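Because only the `state_dict` is written to `checkpoint.ckpt`, loading it later requires rebuilding the model class first and then restoring the weights. A minimal sketch of that round trip; the `Net` class and the `runs/example` directory are placeholders, not names from this repository.

import torch
import torch.nn as nn
from pathlib import Path

class Net(nn.Module):                        # placeholder architecture
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(10, 2)
    def forward(self, x):
        return self.fc(x)

model_save_dir = Path("runs/example")        # hypothetical save directory
model_save_dir.mkdir(parents=True, exist_ok=True)
model_path = model_save_dir / "checkpoint.ckpt"

model = Net()
torch.save(model.state_dict(), model_path)   # weights only, no architecture

restored = Net()                             # must rebuild the same class
restored.load_state_dict(torch.load(model_path, map_location="cpu"))
restored.eval()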