From 966d8258faf9f44f301e72f74c638c6b73a89f1e Mon Sep 17 00:00:00 2001
From: Ann-Kathrin Edrich <edrich@mbd.rwth-aachen.de>
Date: Wed, 16 Oct 2024 15:09:42 +0200
Subject: [PATCH] Fix categorical_classes for ordinal encoding

---
 src/gui_version/RandomForest_gui.py | 36 ++++++++++++++++++++++++-----
 src/plain_scripts/RandomForest.py   |  4 ++--
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/src/gui_version/RandomForest_gui.py b/src/gui_version/RandomForest_gui.py
index 1113c57..debdd6e 100644
--- a/src/gui_version/RandomForest_gui.py
+++ b/src/gui_version/RandomForest_gui.py
@@ -140,7 +140,7 @@ class prepare_data:
                 classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
                 self.distibuish_encoding[feat] = 'ohe'
             else:
-                classes.append(list(set(df_sub[feat + '_encode'].tolist())))
+                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
                 self.distibuish_encoding[feat] = 'ordinal'
             self.categorical_classes[feat] = {}
             self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
@@ -197,7 +197,7 @@ class prepare_data:
             if cat_feat.count(feat)>1:
                 classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
             else:
-                classes.append(list(set(df_sub[feat + '_encode'].tolist())))
+                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
             self.categorical_classes[feat] = {}
             self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
             self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat)
@@ -240,8 +240,14 @@ class prepare_data:
 
 
 class RandomForest(prepare_data):
+    
+    """
+        This class conducts the training of the Random Forest model and the 
+        generation of the landslide susceptibility and hazard map.
+    """
 
     def __init__(self, master, aim, parallel=False, log=None, retrain=None):
+        
         super().__init__(master, aim, log=log, retrain=retrain)
         self.aim = aim
         self.logger = log
@@ -371,6 +377,13 @@ class RandomForest(prepare_data):
 
         """
         Split a NumPy array into chunks without changing the number of columns.
+        
+        Input:
+            pred: prediction dataset, varies depending on if the current run 
+                  is for model training or map generation
+                  
+        Output:
+            None
 
         """
 
@@ -506,12 +519,22 @@ class RandomForest(prepare_data):
             not included in the training dataset are either set to no_value or
             nevertheless considered in the prediction. The surplus additional
             features are removed either way to achieve the same set of features
-            as in the training dataset
+            as in the training dataset. 
+            
+            The prediction dataset is furthermore checked to verify that all features
+            that are included in the training dataset also appear in the prediction
+            dataset. If that is not the case, the training process is relaunched
+            with an adapted training dataset from which the feature(s) that is/are
+            not contained in the prediction dataset are removed. The second
+            trained model will be stored in a separate folder which is named
+            <old_folder_name>_retrain.
             
             Input:
-                train_classes: dictionary containing for each categorical feature
-                               all classes and the number of total classes
-                               contained in the training dataset
+                train_classes:      dictionary containing for each categorical feature
+                                    all classes and the number of total classes
+                                    contained in the training dataset
+                training_features:  Complete feature names of the features
+                                    contained in the training dataset
                 
             Output:
                 None
@@ -539,6 +562,7 @@ class RandomForest(prepare_data):
                 self.master.update()
 
                 self.logger.error('Error: Categorical feature ' + feat + ' not in prediction dataset')
+                self.logger.error('Error: cannot proceed with mapping')
                 self.error = True
                 self.retrain = True
                 self.features_not_in_training.append(feat)
diff --git a/src/plain_scripts/RandomForest.py b/src/plain_scripts/RandomForest.py
index 8f48ed5..02d8002 100644
--- a/src/plain_scripts/RandomForest.py
+++ b/src/plain_scripts/RandomForest.py
@@ -107,7 +107,7 @@ class prepare_data:
                 classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
                 self.distibuish_encoding[feat] = 'ohe'
             else:
-                classes.append(list(set(df_sub[feat + '_encoded'].tolist())))
+                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
                 self.distibuish_encoding[feat] = 'ordinal'
             self.categorical_classes[feat] = {}
             self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
@@ -171,7 +171,7 @@ class prepare_data:
             if cat_feat.count(feat)>1:
                 classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
             else:
-                classes.append(list(set(df_sub[feat + '_encoded'].tolist())))
+                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
             self.categorical_classes[feat] = {}
             self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
             self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat)
-- 
GitLab