Fix categorical_classes for ordinal encoding

966d8258 · Ann-Kathrin Margarete Edrich · b5441d0b · 966d8258 · 966d8258
Commit 966d8258 authored 9 months ago by Ann-Kathrin Margarete Edrich
--- a/src/gui_version/RandomForest_gui.py
+++ b/src/gui_version/RandomForest_gui.py
@@ -140,7 +140,7 @@ class prepare_data:
                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
                self.distibuish_encoding[feat] = 'ohe'
            else:
-                classes.append(list(set(df_sub[feat + '_encode'].tolist())))
+                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
                self.distibuish_encoding[feat] = 'ordinal'
            self.categorical_classes[feat] = {}
            self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
@@ -197,7 +197,7 @@ class prepare_data:
            if cat_feat.count(feat)>1:
                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
            else:
-                classes.append(list(set(df_sub[feat + '_encode'].tolist())))
+                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
            self.categorical_classes[feat] = {}
            self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
            self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat)
@@ -241,7 +241,13 @@ class prepare_data:
 class RandomForest(prepare_data):
+    """
+        This class conducts the training of the Random Forest model and the 
+        generation of the landslide susceptibility and hazard map.
+    """
    def __init__(self, master, aim, parallel=False, log=None, retrain=None):
        super().__init__(master, aim, log=log, retrain=retrain)
        self.aim = aim
        self.logger = log
@@ -372,6 +378,13 @@ class RandomForest(prepare_data):
        """
        Split a NumPy array into chunks without changing the number of columns.
+        Input:
+            pred: prediction dataset, varies depending on if the current run 
+                  is for model training or map generation
+        Output:
+            None
        """
        # Calculate the number of rows in each chunk
@@ -506,12 +519,22 @@ class RandomForest(prepare_data):
            not included in the training dataset are either set to no_value or
            nevertheless considered in the prediction. The surplus additional
            features are removed either way to achieve the same set of features
-            as in the training dataset
+            as in the training dataset. 
+            The prediction dataset is furthermore assessed if all features
+            that are included in the training dataset also appear in the prediction
+            dataset. If that is not the case, the training process is relaunched
+            with an adapted training dataset where the feature(s) that is/are
+            not contained in the prediction dataset are removed. The second
+            trained model will be stored in a separate folder which is named
+            <old_folder_name>_retrain.
            Input:
                train_classes:      dictionary containing for each categorical feature
                                    all classes and the number of total classes
                                    contained in the training dataset
+                training_features:  Complete feature names of the features
+                                    contained in the training dataset
            Output:
                None
@@ -539,6 +562,7 @@ class RandomForest(prepare_data):
                self.master.update()
                self.logger.error('Error: Categorical feature ' + feat + ' not in prediction dataset')
+                self.logger.error('Error: cannot proceed with mapping')
                self.error = True
                self.retrain = True
                self.features_not_in_training.append(feat)

--- a/src/plain_scripts/RandomForest.py
+++ b/src/plain_scripts/RandomForest.py
@@ -107,7 +107,7 @@ class prepare_data:
                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
                self.distibuish_encoding[feat] = 'ohe'
            else:
-                classes.append(list(set(df_sub[feat + '_encoded'].tolist())))
+                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
                self.distibuish_encoding[feat] = 'ordinal'
            self.categorical_classes[feat] = {}
            self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
@@ -171,7 +171,7 @@ class prepare_data:
            if cat_feat.count(feat)>1:
                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
            else:
-                classes.append(list(set(df_sub[feat + '_encoded'].tolist())))
+                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
            self.categorical_classes[feat] = {}
            self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
            self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat)