diff --git a/src/gui_version/RandomForest_gui.py b/src/gui_version/RandomForest_gui.py index 2057521e6e538a13eaab42fa09002521bcdcd419..79bcca73490e03a15cd0e0c70ca4608caf9f4882 100644 --- a/src/gui_version/RandomForest_gui.py +++ b/src/gui_version/RandomForest_gui.py @@ -17,6 +17,7 @@ from tkinter import Label from utilities.ncfile_generation import generate_ncfile from utilities.strings_for_ncfile import char_to_string + class prepare_data: """ @@ -24,17 +25,14 @@ class prepare_data: used in the Random Forest classifier. """ - def __init__(self, master, aim, log=None, retrain=False): + def __init__(self, master, aim, log=None): self.master = master self.logger = log self.row = 0 - self.retrain = retrain + self.import_parameters() - if self.retrain: - self.logger.info("Model is retrained") - else: - self.logger.info("Susceptibility/hazard map generation started") + self.logger.info("Susceptibility/hazard map generation started") self.master.geometry() self.master.winfo_toplevel().title("Map generation") @@ -70,32 +68,18 @@ class prepare_data: self.split_training_testing() elif aim == 'prediction': self.import_features() - + def import_parameters(self): + + """ + User-defined parameters are imported. + """ with open('tmp_map.pkl', 'rb') as handle: self.properties_map = pkl.load(handle) with open('tmp_settings.pkl', 'rb') as handle: self.properties_settings = pkl.load(handle) - - if self.properties_map['drop_pred'] == '': - self.not_included_pred_data = [] - else: - self.not_included_pred_data = self.properties_map[ - 'drop_pred'].split(',') - - if self.properties_map['drop_train'] == '': - self.not_included_train_data = [] - else: - self.not_included_train_data = self.properties_map[ - 'drop_train'].split(',') - - if self.retrain: - self.features_to_remove = pd.read_csv(self.properties_map['model_path'] + '/' + self.properties_map['model_to_save'] + '/feature_mismatch_training.csv')['to_drop'].to_list() - self.not_included_train_data = self.not_included_train_data + self.features_to_remove - self.properties_map['model_to_save'] = self.properties_map['model_to_save'] + '_retrain' - self.properties_map['model_to_load'] = self.properties_map['model_to_load'] + '_retrain' def import_features(self): @@ -103,53 +87,23 @@ class prepare_data: Imports the features for prediction. 
""" - # Import prediction dataset either as csv file or nc file - if self.properties_map['pred_path'].split('.')[-1] == 'csv': - self.features = pd.read_csv(self.properties_map['pred_path']) - - elif self.properties_map['pred_path'].split('.')[-1] == 'nc': - ds = nc.Dataset(self.properties_map['pred_path']) - pred = ds['Result'][:, :].data - pred_features = ds['features'][:].data - self.feature_list = char_to_string(pred_features) - if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list: - self.features = pd.DataFrame(pred, columns=self.feature_list) - else: - self.features = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list) - - self.dropped = ds['Dropped'][:].data - self.dropped = [int(x) for x in self.dropped] + ds = nc.Dataset(self.properties_map['pred_path']) + pred = ds['Result'][:, :].data + pred_features = ds['features'][:].data + self.feature_list = char_to_string(pred_features) + self.features = pd.DataFrame(pred, columns=self.feature_list) + + self.dropped = ds['Dropped'][:].data + self.dropped = [int(x) for x in self.dropped] # Save the prediction coordinates in the prediction dataset self.xy['ycoord'] = self.features['ycoord'] self.xy['xcoord'] = self.features['xcoord'] - - # Remove all features that shall not be included - # in prediction from DataFrame (see settings!) - if len(self.not_included_pred_data) > 0: - for dataset in self.not_included_pred_data: - self.features = self.features.drop(dataset, axis=1) - - # Determine which classes are contained in the categorical features - # It is distinguished between one-hot and ordinal encoded features - self.categorical_classes = {} - cat_subset = [feat for feat in self.features.columns.tolist() if '_encode' in feat] - df_sub = self.features[cat_subset] - cat_feat = ['_'.join(col.split('_')[:(len(col.split('_'))-2)]) for col in df_sub.columns.tolist()] - self.distibuish_encoding = {} - for feat in list(set(cat_feat)): - classes = [] - if cat_feat.count(feat)>1: - classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) - self.distibuish_encoding[feat] = 'ohe' - else: - classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) - self.distibuish_encoding[feat] = 'ordinal' - self.categorical_classes[feat] = {} - self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist] - self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat) + + self.features = self.features.drop(['xcoord', 'ycoord'], axis=1) self.feature_list = list(self.features.columns) - self.features_org = self.features.copy() + self.features = np.array(self.features) + self.logger.info('Features imported') self.logger.info('The following ' + str(len(self.feature_list)) + ' features are included in the prediction dataset: ' @@ -159,7 +113,7 @@ class prepare_data: row=self.row, column=1) self.row = self.row + 1 self.master.update() - + def import_features_labels(self): """ @@ -178,35 +132,8 @@ class prepare_data: self.xy['ycoord'] = self.features['ycoord'] self.xy['xcoord'] = self.features['xcoord'] - self.features = self.features.drop(['xcoord', 'ycoord'], axis=1) - - # Drop ID from training data - self.features = self.features.drop('ID', axis=1) - - # Remove all features that shall not be included in - # training from DataFrame (see settings!) 
- if len(self.not_included_train_data) > 0: - for dataset in self.not_included_train_data: - self.features = self.features.drop(dataset, axis=1) - - # Determine which classes are contained in the categorical features - # It is distinguished between one-hot and ordinal encoded features - self.categorical_classes = {} - cat_subset = [feat for feat in self.features.columns.tolist() if '_encode' in feat] - df_sub = self.features[cat_subset] - cat_feat = ['_'.join(col.split('_')[:(len(col.split('_'))-2)]) for col in df_sub.columns.tolist()] - for feat in list(set(cat_feat)): - classes = [] - if cat_feat.count(feat)>1: - classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) - else: - classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) - self.categorical_classes[feat] = {} - self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist] - self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat) - + self.features = self.features.drop(['xcoord', 'ycoord', 'ID'], axis=1) self.feature_list = list(self.features.columns) - self.features_backup = self.features.copy() self.features = np.array(self.features) self.logger.info('Features imported') @@ -219,7 +146,7 @@ class prepare_data: row=self.row, column=1) self.row = self.row + 1 self.master.update() - + def split_training_testing(self): """ @@ -240,8 +167,7 @@ class prepare_data: row=self.row, column=1) self.row = self.row + 1 self.master.update() - - + class RandomForest(prepare_data): """ @@ -249,14 +175,14 @@ class RandomForest(prepare_data): generation of the landslide susceptibility and hazard map. """ - def __init__(self, master, aim, parallel=False, log=None, retrain=None): + def __init__(self, master, aim, parallel=False, log=None): - super().__init__(master, aim, log=log, retrain=retrain) + super().__init__(master, aim, log=log) self.aim = aim self.logger = log self.parallel = parallel self.num_chunks = 10 - self.retrain = retrain + # Random Forest settings self.criterion = self.properties_map['criterion'] self.n_estimators = self.properties_map['num_trees'] @@ -297,12 +223,11 @@ class RandomForest(prepare_data): self.create_output_dir() self.load_model() - if not self.error: - self.predict() - self.extract_pos_neg_predictions() - self.reshape_prediction() - self.save_prediction() - + self.predict() + self.extract_pos_neg_predictions() + self.reshape_prediction() + self.save_prediction() + def define(self): """ @@ -375,7 +300,7 @@ class RandomForest(prepare_data): self.master.update() self.logger.info('Validation data predicted') - + def split_array_into_chunks(self, pred): """ @@ -491,8 +416,7 @@ class RandomForest(prepare_data): 'roc_tpr': self.tpr, 'roc_auc': self.roc_auc, 'accuracy': self.acc, - 'fbeta': self.fbeta, - 'categories': self.categorical_classes + 'fbeta': self.fbeta } with open(self.model_dir @@ -510,136 +434,6 @@ class RandomForest(prepare_data): self.row = self.row + 1 self.master.update() - def adapt_categorical_features(self, train_classes, training_features): - - """ - Assure that identical categorical features are used in training - and prediction dataset - - The encoded features in the training and prediction dataset are - compared regarding the contained classes. Depending on the user - input, instances in the prediction dataset with classes that are - not included in the training dataset are either set to no_value or - nevertheless considered in the prediction. 
The surplus additional - features are removed either way to achieve the same set of features - as in the training dataset. - - The prediction dataset is furthermore assessed if all features - that are included in the training dataset also appear in the prediction - dataset. If that is not the case, the training process is relaunched - with an adapted training dataset where the feature(s) that is/are - not contrained in the training dataset are removed. The second - trained model will be stored in a seperate folder which is named - <old_folder_name>_retrain. - - Input: - train_classes: dictionary containing for each categorical feature - all classes and the number of total classes - contained in the training dataset - training_features: Complete feature names of the features - contained in the training dataset - - Output: - None - """ - - Label(self.master, text="Categorical features are compared between training and prediction dataset").grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.instances_to_drop = [] - self.features_not_in_training = [] - - for feat in [val for val in training_features if '_encode' in val]: - if feat not in self.feature_list: - - Label(self.master, text='Categorical feature ' + feat + ' not in prediction dataset').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - Label(self.master, text='Error: cannot proceed with mapping').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.error('Error: Categorical feature ' + feat + ' not in prediction dataset') - self.logger.error('Error: cannot proceed with mapping') - self.error = True - self.retrain = True - self.features_not_in_training.append(feat) - - if len(self.features_not_in_training) > 0: - pd.DataFrame(self.features_not_in_training, columns=['to_drop']).to_csv(self.model_dir + self.model_to_load + 'feature_mismatch_training.csv', index=False) - - if not self.retrain: - if list(set([val for val in training_features if '_encode' in val])) != list(set(self.feature_list)): - for feat in list(set(['_'.join(val.split('_')[:-2]) for val in self.feature_list if '_encode' in val])): - if feat in list(self.distibuish_encoding.keys()): - if self.distibuish_encoding[feat] == 'ohe': - if (train_classes[feat]['num_cols'] < self.categorical_classes[feat]['num_cols']) or (set(train_classes[feat]['classes']) != set(self.categorical_classes[feat]['classes'])): - Label(self.master, text=feat + ': Prediction dataset contains more or other classes than training dataset').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - Label(self.master, text='Apply user defined handling approach').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.warning(feat + ': Prediction dataset contains more classes than training dataset') - self.logger.info('Apply user defined handling approach') - - common_elements = set(train_classes[feat]['classes']).intersection(set(self.categorical_classes[feat]['classes'])) - - if self.properties_map['keep']: - if len(common_elements) == 0: - - Label(self.master, text='Error: no common classes for ' + feat + ' in training and prediction dataset').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.error('Error: no common classes for ' + feat + ' in training and prediction dataset') - self.error = True - else: - to_drop = [feat + '_' + str(f) + '_encode' for f in 
self.categorical_classes[feat]['classes'] if f not in common_elements] - self.features = self.features.drop(to_drop, axis=1) - self.feature_list = self.features.columns.tolist() - elif self.properties_map['remove_instances']: - to_drop_col = [feat + '_' + str(f) + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements] - to_drop_row = [] - for col in to_drop_col: - to_drop_row = to_drop_row + self.features.index[self.features[col] == 1].tolist() - self.features = self.features.drop(to_drop_col, axis=1) - - Label(self.master, text='Not matching features have been removed').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.info('Not matching features have been removed') - - self.feature_list = self.features.columns.tolist() - self.instances_to_drop = self.instances_to_drop + to_drop_row - - Label(self.master, text='Instances to consider during mapping have been adapted').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.info('Instances to consider during mapping have been adapted') - - Label(self.master, text='Categorical features have been handled and hamonised').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.info('Categorical features have been handled and hamonised') - self.logger.info('Remaining features: ' + str(self.feature_list)) - def load_model(self): """ @@ -659,97 +453,8 @@ class RandomForest(prepare_data): + self.properties_map['model_to_load'] + '/model_params.pkl', 'rb') as f: params = pkl.load(f) - features = params['features'] - self.error = False - self.adapt_categorical_features(params['categories'], features) - - if not self.error: - if len(self.feature_list) == len(features): - if set(self.feature_list) != set(features): - - Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.error('Error: Not all features of the model are contained in the prediction dataset') - - self.error = True - elif self.feature_list != features: - - Label(self.master, text='The order or features differs. Prediction features are reordered').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.info('The order or features differs. 
Prediction features are reordered') - - self.features = self.features[features] - if self.features.columns.tolist() != features: - - Label(self.master, text='There is still something wrong with the order of the features!').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - self.error = True - elif self.feature_list == features: - - Label(self.master, text='Prediction and training dataset have the same order').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.info('Prediction and training dataset have the same order') - elif len(self.feature_list) < len(features): - - Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.error('Error: Not all features of the model are contained in the prediction dataset') - - self.error = True - elif len(self.feature_list) > len(features): - if set(features).issubset(self.feature_list): - to_drop = list(set(self.feature_list)-set(features)) - self.features_org = self.features_org.drop(to_drop, axis=1) - self.features_org = self.features_org[features] - if self.features_org.columns.tolist() != features: - Label(self.master, text='There is still something wrong with the order of the features!').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - self.error = True - else: - Label(self.master, text='Features in the prediction dataset which were not used for training were removed').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - Label(self.master, text='Features in the prediction dataset were sorted to match the training features').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.warning('Features in the prediction dataset which were not used for training were removed') - self.logger.info('Features left: ' + str(self.feature_list)) - self.logger.info('Features in the prediction dataset were sorted to match the training features') - else: - Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.error('Error: Not all features of the model are contained in the prediction dataset') - - self.error = True - if not self.error: - self.feature_list = self.features.columns.tolist() - self.features = self.features.to_numpy() - - self.logger.info('Model loaded from ' + self.logger.info('Model succesfully loaded from ' + self.model_dir + self.model_to_load) @@ -758,12 +463,7 @@ class RandomForest(prepare_data): + self.model_to_load)).grid(row=self.row, column=1) self.row = self.row + 1 self.master.update() - - Label(self.master, text="Model successfully loaded").grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - + def save_prediction(self): """ @@ -792,15 +492,15 @@ class RandomForest(prepare_data): row=self.row, column=1) self.row = self.row + 1 self.master.update() - + def reshape_prediction(self): """ Reshape the individual predictions into a map. 
""" - dropped = list(set(self.dropped + self.instances_to_drop)) + arr_xy = np.array(self.xy) - arr_xy[dropped, :] = [self.properties_settings['no_value']] + arr_xy[self.dropped, :] = [self.properties_settings['no_value']] result = np.reshape(list(arr_xy[:, 2]), (len(list(set(self.xy['ycoord']))), diff --git a/src/gui_version/compatibility_of_input_datasets.py b/src/gui_version/compatibility_of_input_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..61e0405611c069575d3abe3552c62f0d51758c2c --- /dev/null +++ b/src/gui_version/compatibility_of_input_datasets.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Jan 29 13:20:59 2025 + +@author: aedrich +""" + +import numpy as np +import pandas as pd +import netCDF4 as nc +import pickle as pkl +import os +import logging +import re + +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, fbeta_score +from joblib import delayed, Parallel +from tkinter import Label + +from utilities.ncfile_generation import generate_basic_ncfile +from utilities.strings_for_ncfile import char_to_string, features_to_char + + +class comparison_training_prediction_dataset: + + def __init__(self, logger): + + self.logger = logger + self.error = False + + self.import_parameters() + self.import_prediction_dataset() + self.import_training_dataset() + self.compare_features() + if not self.error: + self.additional_instances_to_drop() + self.save_prediction_dataset() + self.save_training_dataset() + + def import_parameters(self): + + with open('tmp_map.pkl', 'rb') as handle: + self.properties_map = pkl.load(handle) + + with open('tmp_settings.pkl', 'rb') as handle: + self.properties_settings = pkl.load(handle) + + if self.properties_map['drop_pred'] == '': + self.not_included_pred_data = [] + else: + self.not_included_pred_data = self.properties_map[ + 'drop_pred'].split(',') + + if self.properties_map['drop_train'] == '': + self.not_included_train_data = [] + else: + self.not_included_train_data = self.properties_map[ + 'drop_train'].split(',') + + def import_prediction_dataset(self): + + ds = nc.Dataset(self.properties_map['pred_path']) + pred = ds['Result'][:, :].data + pred_features = ds['features'][:].data + self.feature_list = char_to_string(pred_features) + + if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list: + self.pred = pd.DataFrame(pred, columns=self.feature_list) + else: + self.pred = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list) + + self.xy = pd.DataFrame() + self.xy['ycoord'] = self.pred['ycoord'] + self.xy['xcoord'] = self.pred['xcoord'] + + self.idx = ds['Dropped'][:].data + self.idx = [int(x) for x in self.idx] + + if len(self.not_included_pred_data) > 0: + for dataset in self.not_included_pred_data: + if dataset in self.pred.columns.tolist(): + self.pred = self.pred.drop(dataset, axis=1) + + self.logger.info('Prediction dataset imported') + self.logger.info('The following ' + str(len(self.pred.columns.tolist())) + + ' features are included in the prediction dataset: ' + + str(self.pred.columns.tolist())) + + def import_training_dataset(self): + + # Import training dataset as csv file + self.train = pd.read_csv(self.properties_map['train_path']) + # Extract and remove labels from training dataset + self.labels = np.array( + self.train[self.properties_map['name_label']]).reshape( + 
[np.shape(self.train[self.properties_map['name_label']])[0], 1])
+
+        self.xy_train = pd.DataFrame()
+        self.xy_train['ID'] = self.train['ID']
+        self.xy_train[self.properties_map['name_label']] = self.train[self.properties_map['name_label']]
+        self.xy_train['ycoord'] = self.train['ycoord']
+        self.xy_train['xcoord'] = self.train['xcoord']
+
+        self.train = self.train.drop(['xcoord', 'ycoord', 'ID', self.properties_map['name_label']], axis=1)
+
+        if len(self.not_included_train_data) > 0:
+            for dataset in self.not_included_train_data:
+                if dataset in self.train.columns.tolist():
+                    self.train = self.train.drop(dataset, axis=1)
+
+        self.logger.info('Training dataset imported')
+        self.logger.info('The following ' + str(len(self.train.columns.tolist()))
+                         + ' features are included in the training dataset: '
+                         + str(self.train.columns.tolist()))
+
+    def compare_features(self):
+
+        """
+        It is assessed whether the training and the prediction dataset
+        contain the same features. Features that appear in only one of the
+        two datasets are removed so that both datasets end up with an
+        identical, identically ordered set of features. If the datasets
+        share no common features, the error flag is set and map generation
+        cannot proceed; if only few common features remain, a warning is
+        logged.
+        """
+
+        self.logger.info('Features are compared between training and prediction dataset')
+
+        if set(self.train.columns) == set(self.pred.columns):
+            self.logger.info('Features are identical in both training and prediction dataset')
+            self.pred = self.pred[self.train.columns]
+
+            self.logger.info('Potentially varying order of features has been fixed')
+            self.error = False
+
+        else:
+            self.logger.warning('Features are not identical in the training and prediction dataset')
+
+            extra_in_pred = set(self.pred.columns) - set(self.train.columns)
+            extra_in_train = set(self.train.columns) - set(self.pred.columns)
+
+            if len(extra_in_pred) > 0 and len(extra_in_train) == 0:
+                self.logger.warning('More features in prediction dataset, additional features are removed')
+
+                self.pred = self.pred[self.train.columns]
+                self.error = False
+
+            elif len(extra_in_train) > 0 and len(extra_in_pred) == 0:
+                self.logger.warning('More features in training dataset, additional features are removed')
+
+                self.train = self.train[self.pred.columns]
+                self.error = False
+
+            elif len(extra_in_train) > 0 and len(extra_in_pred) > 0:
+                self.logger.warning('There are mismatching features in both datasets')
+
+                self.common_columns = self.train.columns.intersection(self.pred.columns)
+
+                if len(self.common_columns.tolist()) == 0:
+                    self.logger.error('Error: No common columns in training and prediction dataset')
+                    self.error = True
+
+                elif len(self.common_columns.tolist()) < 6:
+                    self.logger.warning('Warning: only ' + str(len(self.common_columns.tolist())) + ' common columns in training and prediction dataset')
+                    self.error = False
+
+                    self.train = self.train[self.common_columns]
+                    self.pred = self.pred[self.common_columns]
+
+                else:
+                    self.logger.info(str(len(self.common_columns.tolist())) + ' common columns in training and prediction dataset')
+                    self.error = False
+
+                    self.train = self.train[self.common_columns]
+                    self.pred = self.pred[self.common_columns]
+            else:
+                self.logger.error('Error: Unknown issue detected. Check features manually!')
+                self.error = True
+
+        self.logger.info('Feature comparison completed')
+
+    def additional_instances_to_drop(self):
+
+        """
+        All instances that have a value of zero in all columns of a categorical
+        feature are identified and appended to the list of instances for which
+        a reliable prediction is not possible.
+
+        Input:
+            self.pred: prediction dataset, pandas DataFrame
+            self.idx: previously identified instances for which a reliable
+                      prediction is not possible, list
+
+        Output:
+            self.idx: updated list of instances for which a reliable
+                      prediction is not possible, list
+        """
+
+        self.logger.info('Start identification of instances that are not represented by at least one categorical feature')
+
+        columns = self.pred.columns
+        # Regular expression to match "<feature>_<value>_encoded"
+        pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")
+        encoded_features = {pattern.match(col).group(1) for col in columns if pattern.match(col)}
+
+        self.logger.info('Identified encoded features: ' + str(encoded_features))
+        count = 0
+        for feature in encoded_features:
+
+            feature_cols = [col for col in self.pred.columns if col.startswith(feature) and col.endswith("_encoded")]
+            all_zero_rows = (self.pred[feature_cols] == 0).all(axis=1)
+            all_zero_rows = self.pred.index[all_zero_rows].tolist()
+            self.idx = list(set(self.idx + all_zero_rows))
+            count = count + len(all_zero_rows)
+
+        self.logger.info(str(count) + ' instances have been identified that are not represented by at least one categorical feature')
+
+    def save_prediction_dataset(self):
+
+        """
+        Save prediction dataset and information on dropped rows as nc-file
+        """
+
+        self.pred = pd.concat([self.xy, self.pred], axis=1)
+        pred = self.pred.to_numpy()
+        char_features = features_to_char(self.pred.columns)
+
+        outfile = self.properties_map['pred_path']
+        self.logger.info('Prediction dataset is saved to ' + outfile)
+
+        if os.path.exists(outfile):
+            os.remove(outfile)
+
+        ds = generate_basic_ncfile(outfile, crs=None)
+        ds.createDimension('lat', (np.shape(pred)[0]))
+        ds.createDimension('lon', (np.shape(pred)[1]))
+        ds.createDimension('ix', (len(self.idx)))
+        ds.createDimension('feat', len(char_features))
+        result = ds.createVariable('Result', 'f4', ('lat', 'lon'))
+        dropped = ds.createVariable('Dropped', 'u8', 'ix')
+        Features = ds.createVariable('features', 'S1', 'feat')
+        result[:, :] = pred
+        dropped[:] = np.array(self.idx)
+        Features[:] = char_features
+        ds.close()
+
+    def save_training_dataset(self):
+
+        """
+        Save the training dataset as a csv file. An existing file is overwritten.
+ """ + + self.logger.info('Saving of training data in progress') + + outfile = self.properties_map['train_path'] + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + self.train = pd.concat([self.xy_train, self.train], axis=1) + + # Save dataframe as csv + self.train.to_csv(outfile, sep=',', index=False) + self.logger.info('Training dataset saved') + diff --git a/src/gui_version/shire.py b/src/gui_version/shire.py index cd1c9a6e927e4ff79264414ececc3a1fff4771fe..59f5fdab1a9a500ecd465ad804e99b5c667bfdc1 100644 --- a/src/gui_version/shire.py +++ b/src/gui_version/shire.py @@ -9,6 +9,7 @@ import tkinter as tk from create_training_data_gui import * from create_prediction_data_gui import * from RandomForest_gui import * +from compatibility_of_input_datasets import * from check_user_input import check_general_settings from utilities.initialise_log import save_log @@ -84,34 +85,26 @@ else: logger.info('Map generation started') with open('tmp_map.pkl', 'rb') as handle: properties_map = pickle.load(handle) + + s = comparison_training_prediction_dataset(logger) + if not s.error: - if properties_map['training'] == 1 and properties_map['prediction'] == 1: - for mode in ['train_test', 'prediction']: - if mode == 'train_test': - s = RandomForest(master, mode, log=logger) - else: - if properties_map['parallel'] == 1: - s = RandomForest(master, mode, parallel=True, log=logger) - else: + if properties_map['training'] == 1 and properties_map['prediction'] == 1: + for mode in ['train_test', 'prediction']: + if mode == 'train_test': s = RandomForest(master, mode, log=logger) - elif properties_map['training'] == 1 and properties_map['prediction'] == 0: - s = RandomForest(master, 'train_test', log=logger) - elif properties_map['prediction'] == 1 and properties_map['training'] == 0: - if properties_map['parallel'] == 1: - s = RandomForest(master, 'prediction', parallel=True, log=logger) - else: - s = RandomForest(master, 'prediction', log=logger) - - if s.retrain: - print('Retrain necessary') - for mode in ['train_test', 'prediction']: - if mode == 'train_test': - s = RandomForest(master, mode, log=logger, retrain=True) - else: - if properties_map['parallel'] == 1: - s = RandomForest(master, mode, parallel=True, log=logger, retrain=True) else: - s = RandomForest(master, mode, log=logger, retrain=True) + if properties_map['parallel'] == 1: + s = RandomForest(master, mode, parallel=True, log=logger) + else: + s = RandomForest(master, mode, log=logger) + elif properties_map['training'] == 1 and properties_map['prediction'] == 0: + s = RandomForest(master, 'train_test', log=logger) + elif properties_map['prediction'] == 1 and properties_map['training'] == 0: + if properties_map['parallel'] == 1: + s = RandomForest(master, 'prediction', parallel=True, log=logger) + else: + s = RandomForest(master, 'prediction', log=logger) os.remove('tmp_map.pkl') logger = s.logger diff --git a/src/gui_version/utilities/gui.py b/src/gui_version/utilities/gui.py index 6e66dd949504f3fe09452261598896802de11b1d..5fa64eca3a99a6dd7aab8498e0c3e91614779875 100644 --- a/src/gui_version/utilities/gui.py +++ b/src/gui_version/utilities/gui.py @@ -672,40 +672,40 @@ class settings_map: global all_buttons all_buttons = [] - Label(self.master, text="How to treat mismatching categories?", anchor='w', justify='left').grid( - row=self.row, column=0, sticky='w') - - self.button_pressed = tk.StringVar() - self.keep = tk.IntVar() - self.keep.set(0) - i = 'keep' - self.keep.trace_add("write", lambda name, index, 
mode, - var=self.keep, i=i: self.callback(var, i)) - self.b1 = tk.Radiobutton(self.master, - text="Keep instances of\n matching classes", - variable=self.keep, - value=1, - command=lambda: self.combined_command(self.b1, 'keep'), - anchor='w', justify='left') - self.b1.grid(row=self.row, column=1, columnspan=1, sticky='w') - - - self.remove_instances = tk.IntVar() - self.remove_instances.set(0) - i = 'remove_instances' - self.remove_instances.trace_add("write", lambda name, index, mode, - var=self.remove_instances, i=i: self.callback(var, i)) - self.b2 = tk.Radiobutton(self.master, - text="Remove instances of\n mismatching classes", - variable=self.remove_instances, - value=1, - command=lambda: self.combined_command(self.b2, 'remove'), anchor='w', justify='left') - self.b2.grid(row=self.row, column=2, columnspan=1, sticky='w') - - all_buttons.append(self.b1) - all_buttons.append(self.b2) - - self.row = self.row + 1 + # Label(self.master, text="How to treat mismatching categories?", anchor='w', justify='left').grid( + # row=self.row, column=0, sticky='w') + + # self.button_pressed = tk.StringVar() + # self.keep = tk.IntVar() + # self.keep.set(0) + # i = 'keep' + # self.keep.trace_add("write", lambda name, index, mode, + # var=self.keep, i=i: self.callback(var, i)) + # self.b1 = tk.Radiobutton(self.master, + # text="Keep instances of\n matching classes", + # variable=self.keep, + # value=1, + # command=lambda: self.combined_command(self.b1, 'keep'), + # anchor='w', justify='left') + # self.b1.grid(row=self.row, column=1, columnspan=1, sticky='w') + + + # self.remove_instances = tk.IntVar() + # self.remove_instances.set(0) + # i = 'remove_instances' + # self.remove_instances.trace_add("write", lambda name, index, mode, + # var=self.remove_instances, i=i: self.callback(var, i)) + # self.b2 = tk.Radiobutton(self.master, + # text="Remove instances of\n mismatching classes", + # variable=self.remove_instances, + # value=1, + # command=lambda: self.combined_command(self.b2, 'remove'), anchor='w', justify='left') + # self.b2.grid(row=self.row, column=2, columnspan=1, sticky='w') + + # all_buttons.append(self.b1) + # all_buttons.append(self.b2) + + # self.row = self.row + 1 Label(self.master).grid(row=self.row, column=0) self.row = self.row + 1 @@ -876,8 +876,8 @@ class settings_map: dic['depth_trees'] = self.depth_trees.get() dic['name_label'] = self.name_label.get() dic['criterion'] = self.criterion.get() - dic['keep'] = self.keep.get() - dic['remove_instances'] = self.remove_instances.get() + # dic['keep'] = self.keep.get() + # dic['remove_instances'] = self.remove_instances.get() sourceDir = filedialog.askdirectory( parent=self.master, initialdir="/", title='Choose path') @@ -899,8 +899,8 @@ class settings_map: dic['depth_trees'] = self.properties['depth_trees'] dic['name_label'] = self.properties['name_label'] dic['criterion'] = self.properties['criterion'] - dic['keep'] = self.properties['keep'] - dic['remove_instances'] = self.properties['remove_instances'] + # dic['keep'] = self.properties['keep'] + # dic['remove_instances'] = self.properties['remove_instances'] for key in self.dic_change: if self.dic_change[key] not in placeholders:
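
The feature harmonisation introduced in compatibility_of_input_datasets.py reduces to a few pandas operations on the column indexes of the two tables. The following is a minimal, self-contained sketch of that idea; the DataFrames, column names and the raised ValueError are invented stand-ins for the class attributes and the error/logging flags used in compare_features.

import pandas as pd

# Invented stand-ins for the training and prediction tables; only the column
# handling matters here, not the values.
train = pd.DataFrame({'slope': [1, 2], 'aspect': [3, 4], 'litho_1_encoded': [0, 1]})
pred = pd.DataFrame({'aspect': [5, 6], 'slope': [7, 8], 'ndvi': [9, 10]})

if set(train.columns) == set(pred.columns):
    # Same features: only align the column order of the prediction table.
    pred = pred[train.columns]
else:
    extra_in_pred = set(pred.columns) - set(train.columns)
    extra_in_train = set(train.columns) - set(pred.columns)
    if extra_in_pred and not extra_in_train:
        pred = pred[train.columns]        # drop surplus prediction features
    elif extra_in_train and not extra_in_pred:
        train = train[pred.columns]       # drop surplus training features
    else:
        # Mismatches on both sides: fall back to the common feature subset.
        common = train.columns.intersection(pred.columns)
        if len(common) == 0:
            raise ValueError('No common features in training and prediction data')
        train, pred = train[common], pred[common]

print(list(train.columns), list(pred.columns))   # ['slope', 'aspect'] ['slope', 'aspect']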
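
additional_instances_to_drop flags every row in which all one-hot columns of a categorical feature are zero, i.e. rows whose class was not present when the feature was encoded, so they cannot be predicted reliably. A small sketch with an invented prediction table and a pre-filled list of flagged indices:

import re
import pandas as pd

# Invented one-hot encoded prediction table; row 2 belongs to a lithology
# class that was never encoded, row 0 to an unseen landcover class.
pred = pd.DataFrame({
    'lithology_1_encoded': [1, 0, 0],
    'lithology_2_encoded': [0, 1, 0],
    'landcover_3_encoded': [0, 1, 1],
})
idx = [0]  # instances already flagged as not predictable

pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")
encoded_features = {pattern.match(c).group(1) for c in pred.columns if pattern.match(c)}

for feature in encoded_features:
    cols = [c for c in pred.columns if c.startswith(feature) and c.endswith('_encoded')]
    all_zero = pred.index[(pred[cols] == 0).all(axis=1)].tolist()
    idx = sorted(set(idx + all_zero))

print(idx)  # [0, 2]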
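
Because the compatibility check merges its additional unreliable instances into the same Dropped variable of the prediction nc-file, reshape_prediction only has to mask the indices it reads back from that file before folding the flat prediction vector onto the coordinate grid. A sketch of that masking-and-reshaping step, assuming an invented 2 x 3 grid and no_value:

import numpy as np
import pandas as pd

no_value = -9999          # stand-in for properties_settings['no_value']
dropped = [3]             # flat indices without a reliable prediction

# Invented 2 x 3 grid, flattened row-wise: ycoord, xcoord and the prediction.
xy = pd.DataFrame({
    'ycoord': [50.0, 50.0, 50.0, 49.0, 49.0, 49.0],
    'xcoord': [10.0, 11.0, 12.0, 10.0, 11.0, 12.0],
    'pred':   [0.1, 0.7, 0.4, 0.9, 0.2, 0.6],
})

arr_xy = np.array(xy)                     # copy, the DataFrame stays untouched
arr_xy[dropped, :] = no_value             # mask rows without a valid prediction

n_rows = len(set(xy['ycoord']))           # number of distinct y coordinates
n_cols = len(set(xy['xcoord']))           # number of distinct x coordinates
result = np.reshape(arr_xy[:, 2], (n_rows, n_cols))
print(result)                             # [[ 0.1  0.7  0.4] [-9999.  0.2  0.6]]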