From ce7459449dc3c2eda3642305bd2f6ff6d34d5bda Mon Sep 17 00:00:00 2001
From: Jannis Klinkenberg <j.klinkenberg@itc.rwth-aachen.de>
Date: Thu, 7 Nov 2024 08:43:07 +0100
Subject: [PATCH] added clustering example using scikit-learn

---
 .../scikit-learn_clustering/README.md | 23 +++++
 .../scikit-learn_clustering.py        | 84 +++++++++++++++++++
 .../submit_job_container.sh           | 34 ++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 machine-learning/scikit-learn_clustering/README.md
 create mode 100644 machine-learning/scikit-learn_clustering/scikit-learn_clustering.py
 create mode 100644 machine-learning/scikit-learn_clustering/submit_job_container.sh

diff --git a/machine-learning/scikit-learn_clustering/README.md b/machine-learning/scikit-learn_clustering/README.md
new file mode 100644
index 0000000..fc82fda
--- /dev/null
+++ b/machine-learning/scikit-learn_clustering/README.md
@@ -0,0 +1,23 @@
+# Scikit-Learn - Clustering Example
+
+## Interactive usage
+
+To work, debug and execute code interactively, you can either use our [HPC JupyterHub](https://jupyterhub.hpc.itc.rwth-aachen.de:9651/) (more information on that in our [Help](https://help.itc.rwth-aachen.de/service/rhr4fjjutttf/article/689934fec5a34c909c54606f6bc2e827/)) or use regular shell sessions on the cluster frontends or within interactive batch jobs.
+
+As an example, in an interactive session on our HPC cluster, execute the following:
+
+```bash
+# load the container module on the cluster
+module load datascience-notebook
+
+# run the container and get a shell inside
+# more information on Apptainer flags: https://apptainer.org/docs/user/main/cli/apptainer_shell.html
+apptainer shell -e ${DATASCIENCENOTEBOOK_IMAGE}
+
+# within the newly opened shell in the container
+Apptainer> python scikit-learn_clustering.py
+```
+
+## Batch usage
+
+To execute the script asynchronously as a batch job, use the example Slurm job script `submit_job_container.sh` contained in this folder (submit it with `sbatch submit_job_container.sh`).
diff --git a/machine-learning/scikit-learn_clustering/scikit-learn_clustering.py b/machine-learning/scikit-learn_clustering/scikit-learn_clustering.py
new file mode 100644
index 0000000..3f8bc6d
--- /dev/null
+++ b/machine-learning/scikit-learn_clustering/scikit-learn_clustering.py
@@ -0,0 +1,84 @@
+"""Cluster the Iris dataset with KMeans and compare against the true classes.
+
+Writes two PNG scatter plots (2D PCA projection) to the current directory:
+one colored by the original classes, one colored by the KMeans clusters.
+"""
+
+### Step 1: Load desired Python modules
+import time
+
+import matplotlib.pyplot as plt
+from sklearn import cluster, datasets
+from sklearn.decomposition import PCA
+
+# one color per class / cluster
+COLORS = ["navy", "turquoise", "darkorange"]
+
+
+def save_scatter_plot(points_2d, group_ids, group_names, title, filename):
+    """Plot 2D points colored by group and save the figure as a PNG file.
+
+    Args:
+        points_2d: Array of shape (n_samples, 2) with the plot coordinates.
+        group_ids: Array of shape (n_samples,) with one group index per sample.
+        group_names: Legend labels, indexed by group index.
+        title: Title shown above the plot.
+        filename: Path of the PNG file to write.
+    """
+    fig = plt.figure()
+    for group_id, (color, group_name) in enumerate(zip(COLORS, group_names)):
+        in_group = group_ids == group_id
+        plt.scatter(
+            points_2d[in_group, 0],  # x coordinates in new 2D feature space
+            points_2d[in_group, 1],  # y coordinates in new 2D feature space
+            color=color, alpha=0.8, lw=2,
+            label=group_name,
+        )
+    plt.title(title)
+    plt.legend(loc="best", shadow=False, scatterpoints=1)
+    fig.savefig(filename, dpi=None, facecolor="w", edgecolor="w", format="png",
+                transparent=False, bbox_inches="tight", pad_inches=0, metadata=None)
+    # close the figure so it does not stay alive in pyplot's figure registry
+    plt.close(fig)
+
+
+def main():
+    """Load the data, fit the KMeans model and write both comparison plots."""
+    ### Step 2: Loading the dataset
+    dataset = datasets.load_iris()
+
+    # print desired information
+    print(f"Data set description:\n{dataset.DESCR}")
+    print(f"Shape of feature / training data: {dataset.data.shape}")
+    print(f"Shape of label data: {dataset.target.shape}")
+    print(f"Feature names: {dataset.feature_names}")
+    print(f"Target names: {dataset.target_names}")
+
+    ### Step 3: Train a KMeans clustering model
+    model = cluster.KMeans(n_clusters=3, init="k-means++", random_state=42)
+
+    # time only the fit; perf_counter is monotonic and meant for measuring intervals
+    start_time = time.perf_counter()
+    model.fit(dataset.data)
+    elapsed_time = time.perf_counter() - start_time
+    print(f"Elapsed time for preprocessing and training (original data): {elapsed_time} sec")
+
+    ### Step 4: Visualization of results + comparison to original classes
+
+    # project the data onto its first two principal components for plotting
+    X_pca = PCA(n_components=2).fit(dataset.data).transform(dataset.data)
+
+    # plot the original classes using the 2D feature space representation
+    save_scatter_plot(X_pca, dataset.target, dataset.target_names,
+                      "Original IRIS dataset classes (after applying PCA)",
+                      "plot_original_classes_2D_PCA_representation.png")
+
+    # plot the clusters found by KMeans using the same 2D representation
+    y_pred = model.predict(dataset.data)  # cluster number per data sample
+    save_scatter_plot(X_pca, y_pred, ["Cluster 0", "Cluster 1", "Cluster 2"],
+                      "Clustered IRIS dataset classes (after applying PCA)",
+                      "plot_clustering_classes_2D_PCA_representation.png")
+
+
+if __name__ == "__main__":
+    main()
diff --git 
a/machine-learning/scikit-learn_clustering/submit_job_container.sh b/machine-learning/scikit-learn_clustering/submit_job_container.sh
new file mode 100644
index 0000000..35b7e2b
--- /dev/null
+++ b/machine-learning/scikit-learn_clustering/submit_job_container.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/zsh
+############################################################
+### Slurm flags
+############################################################
+
+#SBATCH --time=00:15:00
+#SBATCH --partition=c23ms
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=4
+
+############################################################
+### Load modules or software
+############################################################
+
+# load module for datascience-notebook container (expected to define DATASCIENCENOTEBOOK_IMAGE)
+module load datascience-notebook
+module list
+
+############################################################
+### Parameters and Settings
+############################################################
+
+# print some information about current system
+echo "Job nodes: ${SLURM_JOB_NODELIST}"
+echo "Current machine: $(hostname)"
+
+############################################################
+### Execution (Model Training)
+############################################################
+
+# run the python script inside the container (-e: clean environment); abort if the image path is unset
+apptainer exec -e "${DATASCIENCENOTEBOOK_IMAGE:?not set, check that the datascience-notebook module loaded}" \
+    bash -c "python scikit-learn_clustering.py"
-- 
GitLab