From 28797f8bbbe1abf9a127ca0b24c4bb2b3e1695c7 Mon Sep 17 00:00:00 2001
From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de>
Date: Thu, 31 Oct 2024 12:42:18 +0100
Subject: [PATCH] added jobs and README.md

---
 .../scikit-learn_regression/README.md         | 23 +++++++
 .../scikit-learn_regression.py                | 62 +++----------------
 .../submit_job_container.sh                   | 34 ++++++++++
 3 files changed, 66 insertions(+), 53 deletions(-)
 create mode 100644 machine-learning/scikit-learn_regression/README.md
 create mode 100644 machine-learning/scikit-learn_regression/submit_job_container.sh

diff --git a/machine-learning/scikit-learn_regression/README.md b/machine-learning/scikit-learn_regression/README.md
new file mode 100644
index 0000000..b247d1c
--- /dev/null
+++ b/machine-learning/scikit-learn_regression/README.md
@@ -0,0 +1,23 @@
+# Scikit-Learn - Regression Example
+
+## Interactive usage
+
+To work, debug and execute code interactively, you can either use our [HPC JupyterHub](https://jupyterhub.hpc.itc.rwth-aachen.de:9651/) (more information on that in our [Help](https://help.itc.rwth-aachen.de/service/rhr4fjjutttf/article/689934fec5a34c909c54606f6bc2e827/)) or use regular shell sessions on the cluster frontends or within interactive batch jobs.
+
+As an example, in an interactive session on our HPC cluster, execute the following:
+
+```bash
+# load the container module on the cluster
+module load datascience-notebook
+
+# run the container and get a shell inside
+# more information on Apptainer flags: https://apptainer.org/docs/user/main/cli/apptainer_shell.html
+apptainer shell -e ${DATASCIENCENOTEBOOK_IMAGE}
+
+# within the newly opened shell in the container
+Apptainer> python scikit-learn_regression.py
+```
+
+## Batch usage
+
+To execute scripts asynchronously in batch, this folder contains an example Slurm job script, `submit_job_container.sh`, which runs `scikit-learn_regression.py` inside the container.
\ No newline at end of file
diff --git a/machine-learning/scikit-learn_regression/scikit-learn_regression.py b/machine-learning/scikit-learn_regression/scikit-learn_regression.py
index 693050a..64dee8f 100644
--- a/machine-learning/scikit-learn_regression/scikit-learn_regression.py
+++ b/machine-learning/scikit-learn_regression/scikit-learn_regression.py
@@ -1,15 +1,6 @@
-# %% [markdown]
-# # PPCES - Task: Regression with California Housing Dataset
-# In this task, participants should apply preprocessing and regression techniques to the California housing (real world) dataset from scikit-learn, which lists several houses, specific attributes of the houses and their prices. The task involves
-# - Loading the dataset
-# - Applying preprocessing techniques such as feature standardization
-# - Training and evaluating the regression model
-# - Visualization results and scores
-
-# %% [markdown]
-# ### Step 1: Load desired Python modules
-
-# %%
+# Regression with California Housing Dataset
+
+### Step 1: Load desired Python modules
 import time
 import matplotlib.pyplot as plt
 
@@ -18,16 +9,8 @@ from sklearn.ensemble import RandomForestRegressor
 from sklearn.svm import SVR
 from sklearn.model_selection import train_test_split
 
-# %% [markdown]
-# ### Step 2: Loading the dataset
-# Load the dataset and print out the following information
-# - Description of the dataset
-# - Array shape of the feature data that is used for training the model
-# - Array shape of the label / target data
-# - List of the feature names
-# - List of the target names
+### Step 2: Loading the dataset
 
-# %%
 # load the dataset
 dataset = datasets.fetch_california_housing()
 
@@ -38,17 +21,8 @@ print(f"Shape of label data: {dataset.target.shape}")
 print(f"Feature names: {dataset.feature_names}")
 print(f"Target names: {dataset.target_names}")
 
-# %% [markdown]
-# ### Step 3: Data preprocessing
-# Run the following:
-# - Determine the min, max and avg values per feature
-#   - You should see that the value ranges are quite different per feature
-#   - Some models work better if all features have a similar order of magnitude
-# - Split the data into train and test splits
-# - Apply standardization to the training data split
-#   - Note: of course you also need to apply the same standardization to the test split later!
-
-# %%
+### Step 3: Data preprocessing
+
 # determine min and max values per feature
 min_vals = list(dataset.data.min(axis=0))
 avg_vals = list(dataset.data.mean(axis=0))
@@ -72,19 +46,8 @@ print(f"Scaler scale values (based on training set):\n{scaler.scale_}")
 # dont forget to apply the same scaler to the test data split
 X_test = scaler.transform(X_test)
 
-# %% [markdown]
-# ### Step 4: Train a Support Vector Regression (SVR) or RandomForest Regression model
-# For SVR, use the following parameters:
-# - `C=1.0` (regularization parameter)
-# - `epsilon=0.2` (specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value)
-# 
-# For RandomForest, use the following parameters:
-# - `n_estimators=10` (Number of different trees)
-# - `random_state=42`
-# - `max_depth=None` (depth of the trees, `None` will figure it out on its own)
-# - You can also play around with number of trees and depth
-
-# %%
+### Step 4: Train a Support Vector Regression (SVR) or RandomForest Regression model
+
 # create and intialize the model
 model = SVR(C=1.0, epsilon=0.2)
 # model = RandomForestRegressor(n_estimators=10, max_depth=None, random_state=42)
@@ -96,15 +59,8 @@ elapsed_time = time.time() - elapsed_time
 
 print(f"Elapsed time for training: {elapsed_time} sec")
 
-# %% [markdown]
-# ### Step 5: Evaluate the model using typical scoring functions
-# To evaluate the model performance, do the following:
-# - Predict the housing prices for the test split with the trained model (applying on unseen data)
-# - Plot both original (ground truth) and predicted values
-# - Determine `r2_score`, `mean_absolute_error` and `mean_absolute_percentage_error` for the prediction
-#   - Note: There are multiple scoring functions for different purposes. You can find more information here: https://scikit-learn.org/stable/modules/model_evaluation.html
+### Step 5: Evaluate the model using typical scoring functions
 
-# %%
 # predict housing prices for test split
 y_pred = model.predict(X_test)
 
diff --git a/machine-learning/scikit-learn_regression/submit_job_container.sh b/machine-learning/scikit-learn_regression/submit_job_container.sh
new file mode 100644
index 0000000..baac602
--- /dev/null
+++ b/machine-learning/scikit-learn_regression/submit_job_container.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags (15 min time limit; 1 node, 1 task per node, 4 CPUs per task)
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23ms
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=4
+
+############################################################
+### Load modules or software
+############################################################
+
+# load the datascience-notebook container module (presumably sets DATASCIENCENOTEBOOK_IMAGE used below - confirm) and list loaded modules
+module load datascience-notebook
+module list
+
+############################################################
+### Parameters and Settings (currently only prints job information)
+############################################################
+
+# print the Slurm node list of this job and the hostname of the machine executing this script
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# run the python script inside the container (-e: clean environment); relative script path, so this assumes the job's working directory contains the script
+apptainer exec -e ${DATASCIENCENOTEBOOK_IMAGE} \
+    bash -c "python scikit-learn_regression.py"
\ No newline at end of file
-- 
GitLab