From ce7459449dc3c2eda3642305bd2f6ff6d34d5bda Mon Sep 17 00:00:00 2001
From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de>
Date: Thu, 7 Nov 2024 08:43:07 +0100
Subject: [PATCH] added clustering example using scikit-learn

---
 .../scikit-learn_clustering/README.md         | 23 +++++
 .../scikit-learn_clustering.py                | 84 +++++++++++++++++++
 .../submit_job_container.sh                   | 34 ++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 machine-learning/scikit-learn_clustering/README.md
 create mode 100644 machine-learning/scikit-learn_clustering/scikit-learn_clustering.py
 create mode 100644 machine-learning/scikit-learn_clustering/submit_job_container.sh

diff --git a/machine-learning/scikit-learn_clustering/README.md b/machine-learning/scikit-learn_clustering/README.md
new file mode 100644
index 0000000..fc82fda
--- /dev/null
+++ b/machine-learning/scikit-learn_clustering/README.md
@@ -0,0 +1,23 @@
+# Scikit-Learn - Clustering Example
+
+## Interactive usage
+
+To work, debug and execute code interactively, you can either use our [HPC JupyterHub](https://jupyterhub.hpc.itc.rwth-aachen.de:9651/) (more information is available in our [Help](https://help.itc.rwth-aachen.de/service/rhr4fjjutttf/article/689934fec5a34c909c54606f6bc2e827/)) or use regular shell sessions on the cluster frontends or within interactive batch jobs.
+
+As an example, in an interactive session on our HPC cluster, execute the following:
+
+```bash
+# load the container module on the cluster
+module load datascience-notebook
+
+# run the container and get a shell inside
+# more information on Apptainer flags: https://apptainer.org/docs/user/main/cli/apptainer_shell.html
+apptainer shell -e ${DATASCIENCENOTEBOOK_IMAGE}
+
+# within the newly opened shell in the container
+Apptainer> python scikit-learn_clustering.py
+```
+
+## Batch usage
+
+To execute scripts asynchronously as batch jobs, use the example job script `submit_job_container.sh` contained in this folder (e.g., submit it with `sbatch submit_job_container.sh`).
\ No newline at end of file
diff --git a/machine-learning/scikit-learn_clustering/scikit-learn_clustering.py b/machine-learning/scikit-learn_clustering/scikit-learn_clustering.py
new file mode 100644
index 0000000..3f8bc6d
--- /dev/null
+++ b/machine-learning/scikit-learn_clustering/scikit-learn_clustering.py
@@ -0,0 +1,84 @@
+# Clustering Iris Dataset
+
+### Step 1: Load desired Python modules
+import time
+import matplotlib.pyplot as plt
+
+from sklearn import cluster, datasets
+from sklearn.decomposition import PCA
+
+### Step 2: Loading the dataset
+
+# load the dataset
+dataset = datasets.load_iris()
+
+# print desired information
+print(f"Data set description:\n{dataset.DESCR}")
+print(f"Shape of feature / training data: {dataset.data.shape}")
+print(f"Shape of label data: {dataset.target.shape}")
+print(f"Feature names: {dataset.feature_names}")
+print(f"Target names: {dataset.target_names}")
+
+### Step 3: Train a KMeans clustering model
+
+# create and initialize the clustering model
+model = cluster.KMeans(n_clusters=3, init="k-means++", random_state=42)
+
+# train / fit the model
+elapsed_time = time.time()
+model = model.fit(dataset.data)
+elapsed_time = time.time() - elapsed_time
+
+print(f"Elapsed time for preprocessing and training (original data): {elapsed_time} sec")
+
+### Step 4: Visualization of results + comparison to original classes
+
+# transform data to new 2D feature space
+pca = PCA(n_components=2)
+X_pca = pca.fit(dataset.data).transform(dataset.data)
+
+# define class colors
+colors = ["navy", "turquoise", "darkorange"]
+
+# =================================================================
+# == plot original classes
+# == using 2D feature space representation
+# =================================================================
+
+fig1 = plt.figure()
+for color, i, target_name in zip(colors, [0, 1, 2], dataset.target_names):
+    plt.scatter(
+        X_pca[dataset.target == i, 0], # x coordinates in new 2D feature space
+        X_pca[dataset.target == i, 1], # y coordinates in new 2D feature space
+        color=color, alpha=0.8, lw=2,
+        label=target_name
+    )
+plt.title("Original IRIS dataset classes (after applying PCA)")
+plt.legend(loc="best", shadow=False, scatterpoints=1)
+fig1.savefig("plot_original_classes_2D_PCA_representation.png", dpi=None, facecolor='w', edgecolor='w', 
+            format="png", transparent=False, bbox_inches='tight', pad_inches=0, metadata=None)
+
+# =================================================================
+# == plot classes resulting from clustering
+# == using 2D feature space representation
+# =================================================================
+
+# get cluster numbers for the different data samples
+y_pred = model.predict(dataset.data)
+
+# plot
+fig2 = plt.figure()
+for color, i, target_name in zip(colors, [0, 1, 2], ["Cluster 0", "Cluster 1", "Cluster 2"]):
+    plt.scatter(
+        X_pca[y_pred == i, 0], # x coordinates in new 2D feature space
+        X_pca[y_pred == i, 1], # y coordinates in new 2D feature space
+        color=color, alpha=0.8, lw=2,
+        label=target_name
+    )
+plt.title("Clustered IRIS dataset classes (after applying PCA)")
+plt.legend(loc="best", shadow=False, scatterpoints=1)
+# plt.show()
+fig2.savefig("plot_clustering_classes_2D_PCA_representation.png", dpi=None, facecolor='w', edgecolor='w', 
+            format="png", transparent=False, bbox_inches='tight', pad_inches=0, metadata=None)
+
+
diff --git a/machine-learning/scikit-learn_clustering/submit_job_container.sh b/machine-learning/scikit-learn_clustering/submit_job_container.sh
new file mode 100644
index 0000000..35b7e2b
--- /dev/null
+++ b/machine-learning/scikit-learn_clustering/submit_job_container.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23ms
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=4
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for datascience-notebook container
+module load datascience-notebook
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# run the python script inside the container
+apptainer exec -e ${DATASCIENCENOTEBOOK_IMAGE} \
+    bash -c "python scikit-learn_clustering.py"
\ No newline at end of file
-- 
GitLab