From 28797f8bbbe1abf9a127ca0b24c4bb2b3e1695c7 Mon Sep 17 00:00:00 2001 From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de> Date: Thu, 31 Oct 2024 12:42:18 +0100 Subject: [PATCH] added jobs and README.md --- .../scikit-learn_regression/README.md | 23 +++++++ .../scikit-learn_regression.py | 62 +++---------------- .../submit_job_container.sh | 34 ++++++++++ 3 files changed, 66 insertions(+), 53 deletions(-) create mode 100644 machine-learning/scikit-learn_regression/README.md create mode 100644 machine-learning/scikit-learn_regression/submit_job_container.sh diff --git a/machine-learning/scikit-learn_regression/README.md b/machine-learning/scikit-learn_regression/README.md new file mode 100644 index 0000000..b247d1c --- /dev/null +++ b/machine-learning/scikit-learn_regression/README.md @@ -0,0 +1,23 @@ +# Scikit-Learn - Regression Example + +## Interactive usage + +To interactively work, debug and execute code you can either use our [HPC JupyterHub](https://jupyterhub.hpc.itc.rwth-aachen.de:9651/) (more information on that in our [Help](https://help.itc.rwth-aachen.de/service/rhr4fjjutttf/article/689934fec5a34c909c54606f6bc2e827/)) or use regular shell sessions on the cluster frontends or within interactive batch jobs. 
+ +As an example, in an interactive session on our HPC cluster, execute the following: + +```bash +# load the container module on the cluster +module load datascience-notebook + +# run the container and get a shell inside +# more information on Apptainer flags: https://apptainer.org/docs/user/main/cli/apptainer_shell.html +apptainer shell -e ${DATASCIENCENOTEBOOK_IMAGE} + +# within the newly opened shell in the container +Apptainer> python scikit-learn_regression.py +``` + +## Batch usage + +To asynchronously execute scripts in batch, there is an example job script (`submit_job_container.sh`) contained in this folder. \ No newline at end of file diff --git a/machine-learning/scikit-learn_regression/scikit-learn_regression.py b/machine-learning/scikit-learn_regression/scikit-learn_regression.py index 693050a..64dee8f 100644 --- a/machine-learning/scikit-learn_regression/scikit-learn_regression.py +++ b/machine-learning/scikit-learn_regression/scikit-learn_regression.py @@ -1,15 +1,6 @@ -# %% [markdown] -# # PPCES - Task: Regression with California Housing Dataset -# In this task, participants should apply preprocessing and regression techniques to the California housing (real world) dataset from scikit-learn, which lists several houses, specific attributes of the houses and their prices. 
The task involves -# - Loading the dataset -# - Applying preprocessing techniques such as feature standardization -# - Training and evaluating the regression model -# - Visualization results and scores - -# %% [markdown] -# ### Step 1: Load desired Python modules - -# %% +# Regression with California Housing Dataset + +### Step 1: Load desired Python modules import time import matplotlib.pyplot as plt @@ -18,16 +9,8 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR from sklearn.model_selection import train_test_split -# %% [markdown] -# ### Step 2: Loading the dataset -# Load the dataset and print out the following information -# - Description of the dataset -# - Array shape of the feature data that is used for training the model -# - Array shape of the label / target data -# - List of the feature names -# - List of the target names +### Step 2: Loading the dataset -# %% # load the dataset dataset = datasets.fetch_california_housing() @@ -38,17 +21,8 @@ print(f"Shape of label data: {dataset.target.shape}") print(f"Feature names: {dataset.feature_names}") print(f"Target names: {dataset.target_names}") -# %% [markdown] -# ### Step 3: Data preprocessing -# Run the following: -# - Determine the min, max and avg values per feature -# - You should see that the value ranges are quite different per feature -# - Some models work better if all features have a similar order of magnitude -# - Split the data into train and test splits -# - Apply standardization to the training data split -# - Note: of course you also need to apply the same standardization to the test split later! 
- -# %% +### Step 3: Data preprocessing + # determine min and max values per feature min_vals = list(dataset.data.min(axis=0)) avg_vals = list(dataset.data.mean(axis=0)) @@ -72,19 +46,8 @@ print(f"Scaler scale values (based on training set):\n{scaler.scale_}") # dont forget to apply the same scaler to the test data split X_test = scaler.transform(X_test) -# %% [markdown] -# ### Step 4: Train a Support Vector Regression (SVR) or RandomForest Regression model -# For SVR, use the following parameters: -# - `C=1.0` (regularization parameter) -# - `epsilon=0.2` (specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value) -# -# For RandomForest, use the following parameters: -# - `n_estimators=10` (Number of different trees) -# - `random_state=42` -# - `max_depth=None` (depth of the trees, `None` will figure it out on its own) -# - You can also play around with number of trees and depth - -# %% +### Step 4: Train a Support Vector Regression (SVR) or RandomForest Regression model + # create and intialize the model model = SVR(C=1.0, epsilon=0.2) # model = RandomForestRegressor(n_estimators=10, max_depth=None, random_state=42) @@ -96,15 +59,8 @@ elapsed_time = time.time() - elapsed_time print(f"Elapsed time for training: {elapsed_time} sec") -# %% [markdown] -# ### Step 5: Evaluate the model using typical scoring functions -# To evaluate the model performance, do the following: -# - Predict the housing prices for the test split with the trained model (applying on unseen data) -# - Plot both original (ground truth) and predicted values -# - Determine `r2_score`, `mean_absolute_error` and `mean_absolute_percentage_error` for the prediction -# - Note: There are multiple scoring functions for different purposes. 
You can find more information here: https://scikit-learn.org/stable/modules/model_evaluation.html +### Step 5: Evaluate the model using typical scoring functions -# %% # predict housing prices for test split y_pred = model.predict(X_test) diff --git a/machine-learning/scikit-learn_regression/submit_job_container.sh b/machine-learning/scikit-learn_regression/submit_job_container.sh new file mode 100644 index 0000000..baac602 --- /dev/null +++ b/machine-learning/scikit-learn_regression/submit_job_container.sh @@ -0,0 +1,34 @@ +#!/usr/bin/zsh +############################################################ +### Slurm flags +############################################################ + +#SBATCH --time=00:15:00 +#SBATCH --partition=c23ms +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=4 + +############################################################ +### Load modules or software +############################################################ + +# load module for datascience-notebook container +module load datascience-notebook +module list + +############################################################ +### Parameters and Settings +############################################################ + +# print some information about current system +echo "Job nodes: ${SLURM_JOB_NODELIST}" +echo "Current machine: $(hostname)" + +############################################################ +### Execution (Model Training) +############################################################ + +# run the python script inside the container +apptainer exec -e ${DATASCIENCENOTEBOOK_IMAGE} \ + bash -c "python scikit-learn_regression.py" \ No newline at end of file -- GitLab