Commit dc91f7b1 authored by Ann-Kathrin Margarete Edrich

Restructure comparison of training and prediction dataset

parent e5145ddb
@@ -25,12 +25,12 @@ class prepare_data:
     used in the Random Forest classifier.
     """

-    def __init__(self, aim, logger, retrain):
+    def __init__(self, aim, logger):

         invalid = False
         self.aim = aim
         self.logger = logger
-        self.retrain = retrain

         if aim == 'train_test':
             print('Train the model')
             invalid = False
@@ -67,19 +67,11 @@ class prepare_data:
         else:
             path_pred = settings.path_pred

-        if path_pred.split('.')[-1] == 'csv':
-            self.features = pd.read_csv(path_pred)
-        elif path_pred.split('.')[-1] == 'nc':
-            ds = nc.Dataset(path_pred)
-            pred = ds['Result'][:, :].data
-            pred_features = ds['features'][:].data
-            self.feature_list = char_to_string(pred_features)
-            if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list:
-                self.features = pd.DataFrame(pred, columns=self.feature_list)
-            else:
-                self.features = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list)
-            self.dropped = ds['Dropped'][:].data
-            self.dropped = [int(x) for x in self.dropped]
+        ds = nc.Dataset(path_pred)
+        pred = ds['Result'][:, :].data
+        pred_features = ds['features'][:].data
+        self.feature_list = char_to_string(pred_features)
+        self.features = pd.DataFrame(pred, columns=self.feature_list)
+        self.dropped = ds['Dropped'][:].data
+        self.dropped = [int(x) for x in self.dropped]
@@ -88,33 +80,9 @@ class prepare_data:
         self.xy['ycoord'] = self.features['ycoord']
         self.xy['xcoord'] = self.features['xcoord']

-        # Remove all features that shall not be included in
-        # prediction from DataFrame (see settings!)
-        if len(settings.not_included_pred_data) > 0:
-            for dataset in settings.not_included_pred_data:
-                self.features = self.features.drop(dataset, axis=1)
-        # Determine which classes are contained in the categorical features
-        # It is distinguished between one-hot and ordinal encoded features
-        self.categorical_classes = {}
-        cat_subset = [feat for feat in self.features.columns.tolist() if '_encoded' in feat]
-        df_sub = self.features[cat_subset]
-        cat_feat = ['_'.join(col.split('_')[:len(col.split('_'))-1]) for col in df_sub.columns.tolist()]
-        self.distibuish_encoding = {}
-        for feat in list(set(cat_feat)):
-            classes = []
-            if cat_feat.count(feat)>1:
-                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
-                self.distibuish_encoding[feat] = 'ohe'
-            else:
-                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
-                self.distibuish_encoding[feat] = 'ordinal'
-            self.categorical_classes[feat] = {}
-            self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
-            self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat)
+        self.features = self.features.drop(['xcoord', 'ycoord'], axis=1)

         self.feature_list = list(self.features.columns)
-        self.features_org = self.features.copy()
+        self.features = np.array(self.features)

         self.logger.info('Features for prediction were imported')
         self.logger.info('The following '
@@ -133,6 +101,7 @@ class prepare_data:
             self.features = pd.read_csv(settings.path_train + 'training.csv')
         else:
             self.features = pd.read_csv(settings.path_train)
+
         # Extract and remove labels from training dataset
         self.labels = np.array(self.features[self.label_name]).reshape(
             [np.shape(self.features[self.label_name])[0], 1])
@@ -142,47 +111,14 @@ class prepare_data:
         self.xy['ycoord'] = self.features['ycoord']
         self.xy['xcoord'] = self.features['xcoord']

-        # Drop ID from training data
-        self.features = self.features.drop('ID', axis=1)
-        self.features = self.features.drop(['xcoord', 'ycoord'], axis=1)
-        # Remove all features that shall not be included in
-        # training from DataFrame (see settings!)
-        if self.retrain:
-            features_to_remove = pd.read_csv(settings.path_ml + settings.model_to_save + '/feature_mismatch_training.csv')['to_drop'].to_list()
-            not_included_train_data = settings.not_included_train_data + features_to_remove
-        else:
-            not_included_train_data = settings.not_included_train_data
-        if len(not_included_train_data) > 0:
-            for dataset in not_included_train_data:
-                self.features = self.features.drop(dataset, axis=1)
-        # Determine which classes are contained in the categorical features
-        # It is distinguished between one-hot and ordinal encoded features
-        self.categorical_classes = {}
-        cat_subset = [feat for feat in self.features.columns.tolist() if '_encoded' in feat]
-        df_sub = self.features[cat_subset]
-        cat_feat = ['_'.join(col.split('_')[:-2]) for col in df_sub.columns.tolist()]
-        for feat in list(set(cat_feat)):
-            classes = []
-            if cat_feat.count(feat)>1:
-                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
-            else:
-                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
-            self.categorical_classes[feat] = {}
-            self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
-            self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat)
+        self.features = self.features.drop(['xcoord', 'ycoord', 'ID'], axis=1)

         self.feature_list = list(self.features.columns)
+        self.features = np.array(self.features)

         self.logger.info('Features for training were imported')
         self.logger.info('The following ' + str(len(self.feature_list))
                          + ' features are included in the training dataset: '
                          + str(self.feature_list))
-        self.features = np.array(self.features)
     def split_training_testing(self):
@@ -196,31 +132,29 @@ class prepare_data:
             test_size=self.test_size,
             random_state=settings.random_seed,
             stratify=self.labels)

         print('Data split')
         self.logger.info('Training data split in training and test dataset')

 class RandomForest(prepare_data):

-    def __init__(self, aim, parallel=False, log=None, retrain=None):
-        super().__init__(aim, log, retrain)
+    def __init__(self, aim, parallel=False, log=None):
+        super().__init__(aim, log)

         self.aim = aim
         self.parallel = parallel
-        self.retrain = retrain
         self.logger = log
         self.num_chunks = 10

         # Random Forest settings
         self.criterion = settings.criterion
         self.n_estimators = settings.num_trees
         self.max_depth = settings.depth
         self.model_dir = settings.model_database_dir
-        if self.retrain:
-            self.model_to_load = settings.model_to_load + '_retrain'
-            self.model_to_save = settings.model_to_save + '_retrain'
-        else:
-            self.model_to_load = settings.model_to_load
-            self.model_to_save = settings.model_to_save
+        self.model_to_load = settings.model_to_load
+        self.model_to_save = settings.model_to_save

         self.output_dir = None
@@ -240,7 +174,6 @@ class RandomForest(prepare_data):
             print('Prediction is performed')
             self.create_output_dir()
             self.load_model()
-            if not self.error:
-                self.predict()
-                self.extract_pos_neg_predictions()
-                self.reshape_prediction()
+            self.predict()
+            self.extract_pos_neg_predictions()
+            self.reshape_prediction()
...@@ -375,8 +308,7 @@ class RandomForest(prepare_data): ...@@ -375,8 +308,7 @@ class RandomForest(prepare_data):
'roc_tpr': self.tpr, 'roc_tpr': self.tpr,
'roc_auc': self.roc_auc, 'roc_auc': self.roc_auc,
'accuracy': self.acc, 'accuracy': self.acc,
'fbeta': self.fbeta, 'fbeta': self.fbeta
'categories': self.categorical_classes
} }
with open(settings.model_database_dir with open(settings.model_database_dir
...@@ -386,77 +318,6 @@ class RandomForest(prepare_data): ...@@ -386,77 +318,6 @@ class RandomForest(prepare_data):
self.logger.info('Parameters are saved') self.logger.info('Parameters are saved')
-    def adapt_categorical_features(self, train_classes, training_features):
-        """
-        The encoded features in the training and prediction dataset are
-        compared regarding the contained classes. Depending on the user
-        input, instances in the prediction dataset with classes that are
-        not included in the training dataset are either set to no_value or
-        nevertheless considered in the prediction. The surplus additional
-        features are removed either way to achieve the same set of features
-        as in the training dataset
-        """
-        self.instances_to_drop = []
-        self.features_not_in_training = []
-        for feat in [val for val in training_features if '_encode' in val]:
-            if feat not in self.feature_list:
-                print('Error: cannot proceed with mapping')
-                print('Error: Categorical feature ' + feat + ' not in prediction dataset')
-                self.logger.error('Error: Categorical feature ' + feat + ' not in prediction dataset')
-                self.error = True
-                self.retrain = True
-                self.features_not_in_training.append(feat)
-        if len(self.features_not_in_training) > 0:
-            pd.DataFrame(self.features_not_in_training, columns=['to_drop']).to_csv(self.model_dir + self.model_to_load + 'feature_mismatch_training.csv', index=False)
-        if not self.retrain:
-            if list(set([val for val in training_features if '_encode' in val])) != list(set(self.feature_list)):
-                for feat in list(set(['_'.join(val.split('_')[:-2]) for val in self.feature_list if '_encode' in val])):
-                    if feat in list(self.distibuish_encoding.keys()):
-                        if self.distibuish_encoding[feat] == 'ohe':
-                            if (train_classes[feat]['num_cols'] < self.categorical_classes[feat]['num_cols']) or (set(train_classes[feat]['classes']) != set(self.categorical_classes[feat]['classes'])):
-                                print(feat + ': Prediction dataset contains more or other classes than training dataset')
-                                self.logger.warning(feat + ': Prediction dataset contains more classes than training dataset')
-                                self.logger.info('Apply user defined handling approach')
-                                common_elements = set(train_classes[feat]['classes']).intersection(set(self.categorical_classes[feat]['classes']))
-                                if self.properties_map['keep']:
-                                    if len(common_elements) == 0:
-                                        print('Error: no common classes for ' + feat + ' in training and prediction dataset')
-                                        self.logger.error('Error: no common classes for ' + feat + ' in training and prediction dataset')
-                                        self.error = True
-                                    else:
-                                        to_drop = [feat + '_' + str(f) + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements]
-                                        self.features = self.features.drop(to_drop, axis=1)
-                                        self.feature_list = self.features.columns.tolist()
-                                elif self.properties_map['remove_instances']:
-                                    to_drop_col = [feat + '_' + str(f) + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements]
-                                    to_drop_row = []
-                                    for col in to_drop_col:
-                                        to_drop_row = to_drop_row + self.features.index[self.features[col] == 1].tolist()
-                                    self.features = self.features.drop(to_drop_col, axis=1)
-                                    print('Not matching features have been removed')
-                                    self.logger.info('Not matching features have been removed')
-                                    self.feature_list = self.features.columns.tolist()
-                                    self.instances_to_drop = self.instances_to_drop + to_drop_row
-                                    print('Instances to consider during mapping have been adapted')
-                                    self.logger.info('Instances to consider during mapping have been adapted')
-        print('Categorical features have been handled and hamonised')
-        self.logger.info('Categorical features have been handled and hamonised')
-        self.logger.info('Remaining features: ' + str(self.feature_list))
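Note: the removed method above survives conceptually in the new compatibility module added further below. As a minimal standalone sketch of the underlying idea, assuming toy column names that follow the project's <feature>_<class>_encoded convention (none of these names come from the repository):

    import pandas as pd

    # Toy one-hot encoded prediction data (invented columns).
    pred = pd.DataFrame({
        'slope': [10, 20, 30],
        'geology_A_encoded': [1, 0, 0],
        'geology_B_encoded': [0, 1, 0],
        'geology_C_encoded': [0, 0, 1],  # class C was never seen in training
    })
    train_cols = ['slope', 'geology_A_encoded', 'geology_B_encoded']

    # Drop one-hot columns for classes the model never saw ...
    extra = [c for c in pred.columns if c not in train_cols]
    # ... and remember instances that belonged only to those classes,
    # since no reliable prediction is possible for them.
    rows_to_drop = pred.index[(pred[extra] == 1).any(axis=1)].tolist()
    pred = pred.drop(columns=extra)
    print(rows_to_drop)              # [2]
    print(pred.columns.tolist())     # ['slope', 'geology_A_encoded', 'geology_B_encoded']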
     def load_model(self):
         """
@@ -478,63 +339,7 @@ class RandomForest(prepare_data):
                   + '/model_params.pkl', 'rb') as f:
             params = pkl.load(f)
         features = params['features']
-        self.error = False
-        self.adapt_categorical_features(params['categories'], features)
-        if not self.error:
-            if len(self.feature_list) == len(features):
-                if set(self.feature_list) != set(features):
-                    print('Error: Not all features of the model are contained in the prediction dataset')
-                    self.logger.error('Error: Not all features of the model are contained in the prediction dataset')
-                    self.error = True
-                elif self.feature_list != features:
-                    print('The order or features differs. Prediction features are reordered')
-                    self.logger.info('The order or features differs. Prediction features are reordered')
-                    self.features = self.features[features]
-                    if self.features.columns.tolist() != features:
-                        print('There is still something wrong with the order of the features!')
-                elif self.feature_list == features:
-                    print('Prediction and training dataset have the same order')
-                    self.logger.info('Prediction and training dataset have the same order')
-            elif len(self.feature_list) < len(features):
-                print('Error: Not all features of the model are contained in the prediction dataset')
-                self.logger.error('Error: Not all features of the model are contained in the prediction dataset')
-                self.error = True
-            elif len(self.feature_list) > len(features):
-                if set(features).issubset(self.feature_list):
-                    to_drop = list(set(self.feature_list)-set(features))
-                    self.features = self.features.drop(to_drop, axis=1)
-                    self.feature = self.features[features]
-                    if self.features.columns.tolist() != features:
-                        print('There is still something wrong with the order of the features!')
-                        self.error = True
-                    else:
-                        print('Features in the prediction dataset which were not used for training were removed')
-                        print('Features in the prediction dataset were sorted to match the training features')
-                        self.logger.warning('Features in the prediction dataset which were not used for training were removed')
-                        self.logger.info('Features left: ' + str(self.feature_list))
-                        self.logger.info('Features in the prediction dataset were sorted to match the training features')
-                else:
-                    Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid(
-                        row=self.row, column=1)
-                    self.row = self.row + 1
-                    self.master.update()
-                    self.logger.error('Error: Not all features of the model are contained in the prediction dataset')
-                    self.error = True
-        if not self.error:
-            self.feature_list = self.features.columns.tolist()
-            self.features = self.features.to_numpy()
         self.logger.info('Model loaded from '
                          + self.model_dir
@@ -572,9 +377,8 @@ class RandomForest(prepare_data):
         Reshape the individual predictions into a map.
         """

-        dropped = list(set(self.dropped + self.instances_to_drop))
         arr_xy = np.array(self.xy)
-        arr_xy[dropped, :] = settings.no_value#*np.shape(arr_xy)[1]
+        arr_xy[self.dropped, :] = settings.no_value#*np.shape(arr_xy)[1]
         result = np.reshape(list(arr_xy[:, 2]),
                             (len(list(set(self.xy['ycoord']))),
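The reshape relies on the instances being ordered row by row on a regular grid, with the map dimensions recovered from the unique coordinate values. A minimal sketch of that logic with invented coordinates:

    import numpy as np

    # Toy stand-in for self.xy plus a prediction column: each row is
    # (ycoord, xcoord, predicted value) for one pixel on a regular grid.
    ycoord = np.repeat([46.0, 46.1], 3)        # 2 unique row coordinates
    xcoord = np.tile([7.0, 7.1, 7.2], 2)       # 3 unique column coordinates
    pred = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
    arr_xy = np.column_stack([ycoord, xcoord, pred])

    no_value = -999
    dropped = [4]                  # indices without a reliable prediction
    arr_xy[dropped, :] = no_value  # mask them before reshaping

    # Flat prediction column -> (rows, cols) map; assumes row-major order.
    result = arr_xy[:, 2].reshape(len(set(ycoord)), len(set(xcoord)))
    print(result)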
...
@@ -14,16 +14,16 @@ class check_general_settings():
         if training_dataset or map_generation:
             if os.path.isdir(path_train):
-                save_path = path_train + 'check_user_input.log'
+                save_path = path_train + '/check_user_input.log'
             else:
-                save_path = os.path.dirname(path_train) + 'check_user_input.log'
+                save_path = os.path.dirname(path_train) + '/check_user_input.log'
         elif prediction_dataset:
             if os.path.isdir(path_pred):
-                save_path = path_pred + 'check_user_input.log'
+                save_path = path_pred + '/check_user_input.log'
             else:
-                save_path = os.path.dirname(path_pred) + 'check_user_input.log'
+                save_path = os.path.dirname(path_pred) + '/check_user_input.log'
         else:
-            save_path = 'check_user_input.log'
+            save_path = '/check_user_input.log'

         if os.path.exists(save_path):
             os.remove(save_path)
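A side note on this fix: prepending '/' by hand leaves a double slash when the path already ends in a separator (harmless on POSIX) and makes the final else-branch write to the filesystem root. A more robust variant of the same logic, shown only as a suggestion rather than the project's code, delegates separator handling to os.path.join:

    import os

    def log_path(base=None):
        """Hypothetical helper: place check_user_input.log next to `base`."""
        if base is None:
            return 'check_user_input.log'   # current working directory
        if os.path.isdir(base):
            return os.path.join(base, 'check_user_input.log')
        return os.path.join(os.path.dirname(base), 'check_user_input.log')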
...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 29 13:20:59 2025

@author: aedrich
"""

import numpy as np
import pandas as pd
import netCDF4 as nc
import pickle as pkl
import os
import logging
import settings
import re

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, fbeta_score
from joblib import delayed, Parallel
from tkinter import Label

from utilities.ncfile_generation import generate_basic_ncfile
from utilities.strings_for_ncfile import char_to_string, features_to_char

class comparison_training_prediction_dataset:

    def __init__(self, logger):

        self.logger = logger
        self.error = False

        self.import_prediction_dataset()
        self.import_training_dataset()
        self.compare_features()

        if not self.error:
            self.additional_instances_to_drop()
            self.save_prediction_dataset()
            self.save_training_dataset()
    def import_prediction_dataset(self):

        ds = nc.Dataset(settings.path_pred)
        pred = ds['Result'][:, :].data
        pred_features = ds['features'][:].data
        self.feature_list = char_to_string(pred_features)

        if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list:
            self.pred = pd.DataFrame(pred, columns=self.feature_list)
        else:
            self.pred = pd.DataFrame(pred, columns=['xcoord', 'ycoord'] + self.feature_list)

        self.xy = pd.DataFrame()
        self.xy['ycoord'] = self.pred['ycoord']
        self.xy['xcoord'] = self.pred['xcoord']

        self.idx = ds['Dropped'][:].data
        self.idx = [int(x) for x in self.idx]

        if len(settings.not_included_pred_data) > 0:
            for dataset in settings.not_included_pred_data:
                if dataset in self.pred.columns.tolist():
                    self.pred = self.pred.drop(dataset, axis=1)

        self.logger.info('Prediction dataset imported')
        self.logger.info('The following ' + str(len(self.pred.columns.tolist()))
                         + ' features are included in the prediction dataset: '
                         + str(self.pred.columns.tolist()))
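char_to_string comes from utilities.strings_for_ncfile and is not part of this commit; since the 'features' variable is stored as a netCDF 'S1' character array (see save_prediction_dataset below), a decoder along the following lines is presumably what it does. The separator and the helper's exact behaviour are assumptions, illustrative only:

    import numpy as np

    def char_to_string_sketch(char_arr, sep='/'):
        """Illustrative decoder: join an S1 char array into feature names.

        Assumes the writer stored all names as one character sequence with
        a separator between them; the real helper may differ in detail.
        """
        chars = [c.decode() if isinstance(c, bytes) else str(c) for c in char_arr]
        return ''.join(chars).split(sep)

    arr = np.frombuffer(b'slope/aspect', dtype='S1')
    print(char_to_string_sketch(arr))   # ['slope', 'aspect']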
    def import_training_dataset(self):

        # Import training dataset as csv file
        self.train = pd.read_csv(settings.path_train)

        # Extract and remove labels from training dataset
        self.labels = np.array(
            self.train['label']).reshape(
                [np.shape(self.train['label'])[0], 1])

        self.xy_train = pd.DataFrame()
        self.xy_train['ID'] = self.train['ID']
        self.xy_train['label'] = self.train['label']
        self.xy_train['ycoord'] = self.train['ycoord']
        self.xy_train['xcoord'] = self.train['xcoord']

        self.train = self.train.drop(['xcoord', 'ycoord', 'ID', 'label'], axis=1)

        if len(settings.not_included_train_data) > 0:
            for dataset in settings.not_included_train_data:
                if dataset in self.train.columns.tolist():
                    self.train = self.train.drop(dataset, axis=1)

        self.logger.info('Training dataset imported')
        self.logger.info('The following ' + str(len(self.train.columns.tolist()))
                         + ' features are included in the training dataset: '
                         + str(self.train.columns.tolist()))
    def compare_features(self):
        """
        It is assessed whether all features in the training dataset also
        appear in the prediction dataset. If that is not the case, the
        training process will be relaunched with an adapted training
        dataset from which the feature(s) not contained in the prediction
        dataset are removed. The second trained model will be stored in a
        separate folder which is named <old_folder_name>_retrain.
        If more features appear in the prediction dataset, the additional
        features are removed.
        """
        self.logger.info('Features are compared between training and prediction dataset')

        if set(self.train.columns) == set(self.pred.columns):
            self.logger.info('Features are identical in both training and prediction dataset')
            self.pred = self.pred[self.train.columns]
            self.logger.info('Potentially varying order of features has been fixed')
            self.error = False
        else:
            self.logger.warning('Features are not identical in the training and prediction dataset')
            extra_in_pred = set(self.pred.columns) - set(self.train.columns)
            extra_in_train = set(self.train.columns) - set(self.pred.columns)

            if len(extra_in_pred) > 0 and len(extra_in_train) == 0:
                self.logger.warning('More features in prediction dataset, additional features are removed')
                self.pred = self.pred[self.train.columns]
                self.error = False
            elif len(extra_in_train) > 0 and len(extra_in_pred) == 0:
                self.logger.warning('More features in training dataset, additional features are removed')
                self.train = self.train[self.pred.columns]
                self.error = False
            elif len(extra_in_train) > 0 and len(extra_in_pred) > 0:
                self.logger.warning('There are mismatching features in both datasets')
                self.common_columns = self.train.columns.intersection(self.pred.columns)

                if len(self.common_columns.tolist()) == 0:
                    self.logger.error('Error: No common columns in training and prediction dataset')
                    self.error = True
                elif len(self.common_columns.tolist()) < 6:
                    self.logger.warning('Warning: only ' + str(len(self.common_columns.tolist()))
                                        + ' common columns in training and prediction dataset')
                    self.error = False
                    self.train = self.train[self.common_columns]
                    self.pred = self.pred[self.common_columns]
                else:
                    self.logger.info(str(len(self.common_columns.tolist()))
                                     + ' common columns in training and prediction dataset')
                    self.error = False
                    self.train = self.train[self.common_columns]
                    self.pred = self.pred[self.common_columns]
            else:
                self.logger.error('Error: Unknown issue detected. Check features manually!')
                self.error = True

        self.logger.info('Feature comparison completed')
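A quick standalone check of the alignment logic above, with invented feature names; this corresponds to the branch where both datasets contain features the other lacks:

    import pandas as pd

    train = pd.DataFrame(columns=['slope', 'aspect', 'curvature'])
    pred = pd.DataFrame(columns=['aspect', 'slope', 'rainfall'])

    extra_in_pred = set(pred.columns) - set(train.columns)    # {'rainfall'}
    extra_in_train = set(train.columns) - set(pred.columns)   # {'curvature'}
    common = train.columns.intersection(pred.columns)

    # Both datasets are reduced to the shared features, in the same order.
    train, pred = train[common], pred[common]
    print(train.columns.tolist())   # ['slope', 'aspect']
    print(pred.columns.tolist())    # ['slope', 'aspect']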
    def additional_instances_to_drop(self):
        """
        All instances that have a value of zero in all columns of a
        categorical feature are identified and appended to the list of
        instances for which a reliable prediction is not possible.

        Input:
            idx: previously identified instances for which prediction is
                 not possible, list
            pred: prediction dataset, pandas DataFrame

        Output:
            idx: updated list of instances for which prediction is not
                 possible, list
        """
        self.logger.info('Start identification of instances that are not represented by at least one categorical feature')
        columns = self.pred.columns

        # Regular expression to match "<feature>_<value>_encoded"
        pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")
        encoded_features = {pattern.match(col).group(1) for col in columns if pattern.match(col)}
        self.logger.info('Identified encoded features: ' + str(encoded_features))

        count = 0
        for feature in encoded_features:
            feature_cols = [col for col in self.pred.columns
                            if col.startswith(feature) and col.endswith("_encoded")]
            # Rows that are zero in every one-hot column of this feature
            # belong to a class the encoding does not represent.
            all_zero_rows = (self.pred[feature_cols] == 0).all(axis=1)
            all_zero_rows = self.pred.index[all_zero_rows].tolist()
            self.idx = list(set(self.idx + all_zero_rows))
            count = count + len(all_zero_rows)

        self.logger.info(str(count) + ' instances have been identified that are not represented by at least one categorical feature')
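The regular expression strips an optional trailing class id, so geology_1_encoded and geology_2_encoded both map to the feature geology. A short demonstration with invented columns, including one instance whose class is not represented by any one-hot column:

    import re
    import pandas as pd

    pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")
    cols = ['geology_1_encoded', 'geology_2_encoded', 'slope']
    print({pattern.match(c).group(1) for c in cols if pattern.match(c)})
    # {'geology'}

    pred = pd.DataFrame({'geology_1_encoded': [1, 0, 0],
                         'geology_2_encoded': [0, 1, 0],
                         'slope': [3.0, 4.0, 5.0]})
    geo = [c for c in pred.columns
           if c.startswith('geology') and c.endswith('_encoded')]
    all_zero = (pred[geo] == 0).all(axis=1)
    print(pred.index[all_zero].tolist())  # [2]: instance of an unrepresented class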
    def save_prediction_dataset(self):
        """
        Save prediction dataset and information on dropped rows as nc-file
        """
        self.pred = pd.concat([self.xy, self.pred], axis=1)
        pred = self.pred.to_numpy()
        char_features = features_to_char(self.pred.columns)

        outfile = settings.path_pred
        self.logger.info('Prediction dataset is saved to ' + outfile)

        if os.path.exists(outfile):
            os.remove(outfile)

        ds = generate_basic_ncfile(outfile, crs=None)
        ds.createDimension('lat', (np.shape(pred)[0]))
        ds.createDimension('lon', (np.shape(pred)[1]))
        ds.createDimension('ix', (len(self.idx)))
        ds.createDimension('feat', len(char_features))
        result = ds.createVariable('Result', 'f4', ('lat', 'lon'))
        dropped = ds.createVariable('Dropped', 'u8', 'ix')
        Features = ds.createVariable('features', 'S1', 'feat')
        result[:, :] = pred
        dropped[:] = np.array(self.idx)
        Features[:] = char_features
        ds.close()
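For orientation, the stored file can be read back with plain netCDF4 exactly as import_prediction_dataset does above; the file name here is a placeholder:

    import netCDF4 as nc

    ds = nc.Dataset('prediction_dataset.nc')           # placeholder path
    table = ds['Result'][:, :].data                    # instances x features
    dropped = [int(i) for i in ds['Dropped'][:].data]  # unreliable instances
    raw_names = ds['features'][:].data                 # S1 char array, see above
    ds.close()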
    def save_training_dataset(self):
        """
        Save dataframe as csv. If necessary folder is created.
        """
        self.logger.info('Saving of training data in progress')

        outfile = settings.path_train

        # If outfile exists already, delete
        if os.path.exists(outfile):
            os.remove(outfile)

        self.train = pd.concat([self.xy_train, self.train], axis=1)

        # Save dataframe as csv
        self.train.to_csv(outfile, sep=',', index=False)
        self.logger.info('Training dataset saved')
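Usage mirrors the call added to the main script further below; only the logger setup is sketched here, and settings.path_pred/settings.path_train must point to existing files:

    import logging
    from compatibility_of_input_datasets import comparison_training_prediction_dataset

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('shire')

    s = comparison_training_prediction_dataset(logger)
    if not s.error:
        print('Datasets are compatible, mapping can proceed')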
...

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This is a template file for settings.py
Either duplicate and rename or fill out and rename.
More information on the individual meaning and what to consider can be
found in the user manual.
"""

import logging
import json
import types

def export_variables(logger):

    variables = globals()

    # Filter out non-serializable objects
    defined_vars = {}
    for k, v in variables.items():
        if not k.startswith('__') and not callable(v) and not isinstance(v, types.ModuleType):
            try:
                # Test if the value is JSON serializable
                json.dumps(v)
                defined_vars[k] = v
            except (TypeError, OverflowError):
                # Skip non-serializable values
                pass

    # Convert the dictionary to a JSON string
    vars_json = json.dumps(defined_vars, indent=4)
    logger.info("Exported variables: %s", vars_json)
# Mandatory parameters
days = 2
approach = 'statistical'
# Steps
training_dataset = False # Boolean, if training dataset shall be created
preprocessing = 'no_interpolation' # Defines preprocessing approach: 'cluster', 'interpolation', 'no_interpolation'
train_from_scratch = True
train_delete = None
prediction_dataset = False # Boolean, if prediction dataset shall be created
pred_from_scratch = True
pred_delete = None
map_generation = True # Boolean, if mapping shall be performed
# General
crs = 'wgs84' # Coordinate reference system, string
no_value = -999 # No data value, integer, suggestion -999
random_seed = 42 # Random seed, integer
resolution = 25 # Resolution in m of the final map, integer, all datasets will be interpolated to this resolution
path_ml = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/' # Path to where shire framework related parameters/files will be stored
data_summary_path = None # Path to the data summary file, string, relevant only for training/prediction dataset generation
key_to_include_path = None # Path to keys_to_include file, string, relevant only for training/prediction dataset generation
# Training dataset generation
size = None # Size of the validation dataset, float number between 0 and 1
path_train = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/training_datasets/{days}/training_statistical_{days}d.csv' # Path to directory where the training dataset is/shall be stored
ohe = None # One-hot encoding, bool
path_landslide_database = None # Path to where the landslide database is stored, string
ID = 'ID' # Name of the column containing landslide ID, string
landslide_database_x = 'xcoord' # Name of the column containing longitude values, string
landslide_database_y = 'ycoord' # Name of the column containing latitude values, string
path_nonls_locations = None # Path to where the non-landslide database is stored, string
num_nonls = None # Number of non-landslide locations to include in the training dataset, integer
nonls_database_x = None # Name of the column containing longitude values, string
nonls_database_y = None # Name of the column containing latitude values, string
#cluster = False # Use clustering for training dataset generation, bool
#interpolation = False # Use interpolation for training dataset generation, bool
# Prediction dataset generation
bounding_box = None # Coordinates of the edges of the bounding box of the area of interest, list, [<ymax>, <ymin>, <xmin>, <xmax>]
path_pred = None # Path to directory where the prediction dataset is/shall be stored
# Map generation
RF_training = True # Train the RF, bool
RF_prediction = True # Make a prediction using the RF, bool
not_included_pred_data = ['xcoord', 'ycoord'] # List of features in the prediction dataset not to be considered in prediction
not_included_train_data = [] # List of features in the training dataset not to be considered in model training
num_trees = 100 # Number of trees in the Random Forest, integer
criterion = 'gini' # Criterion for the Random Forest, string
depth = 20 # Maximum depth of the trees in the RF, integer
model_to_save = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/{approach}/RF_{days}' # Folder name for storage of the RF results, string
model_to_load = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/{approach}/RF_{days}' # Folder where RF model is stored, string, identical to model_to_save if training and prediction is done at the same time
model_database_dir = path_ml # Directory where models should be stored
parallel = True # Boolean, true if prediction data shall be split to predict in parallel
keep_cat_features = False # bool, true if categorical features shall be kept even if some instances in the prediction dataset have classes not covered by the training dataset
remove_instances = True # bool, true if instances in the prediction dataset shall be removed if they have different classes than the instances in the training dataset
\ No newline at end of file
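Note that path_train, model_to_save and model_to_load above contain {days} and {approach} placeholders but are plain strings, so they presumably need to be expanded before use, e.g.:

    # Hypothetical expansion of the template placeholders before use:
    days = 2
    approach = 'statistical'
    path = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/{approach}/RF_{days}'
    print(path.format(approach=approach, days=days))
    # /Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/statistical/RF_2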
@@ -9,6 +9,8 @@ from create_training_data import create_training_data
 from create_prediction_data import create_prediction_data
 from RandomForest import RandomForest
 from check_user_input import check_general_settings
+from compatibility_of_input_datasets import comparison_training_prediction_dataset
+
 from utilities.initialise_log import save_log

 """
@@ -78,6 +80,12 @@ else:
     print('Map will be generated')
     logger.info('Map generation started')
-    if settings.parallel:
-        print('Prediction will run in parallel')
-        logger.info('Prediction will run in parallel')
+    print('Training and prediction dataset will be assessed for compatibility')
+    logger.info('Training and prediction dataset will be assessed for compatibility')
+    s = comparison_training_prediction_dataset(logger)
+
+    if not s.error:
+        if settings.parallel:
+            print('Prediction will run in parallel')
+            logger.info('Prediction will run in parallel')
...