Commit dc91f7b1 authored by Ann-Kathrin Margarete Edrich

Restructure comparison of training and prediction dataset

parent e5145ddb
@@ -25,12 +25,12 @@ class prepare_data:
     used in the Random Forest classifier.
     """

-    def __init__(self, aim, logger, retrain):
+    def __init__(self, aim, logger):

         invalid = False
         self.aim = aim
         self.logger = logger
-        self.retrain = retrain

         if aim == 'train_test':
             print('Train the model')
             invalid = False
@@ -67,19 +67,11 @@ class prepare_data:
         else:
             path_pred = settings.path_pred

-        if path_pred.split('.')[-1] == 'csv':
-            self.features = pd.read_csv(path_pred)
-        elif path_pred.split('.')[-1] == 'nc':
-            ds = nc.Dataset(path_pred)
-            pred = ds['Result'][:, :].data
-            pred_features = ds['features'][:].data
-            self.feature_list = char_to_string(pred_features)
-            if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list:
-                self.features = pd.DataFrame(pred, columns=self.feature_list)
-            else:
-                self.features = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list)
-            self.dropped = ds['Dropped'][:].data
-            self.dropped = [int(x) for x in self.dropped]
+        ds = nc.Dataset(path_pred)
+        pred = ds['Result'][:, :].data
+        pred_features = ds['features'][:].data
+        self.feature_list = char_to_string(pred_features)
+        self.features = pd.DataFrame(pred, columns=self.feature_list)
+        self.dropped = ds['Dropped'][:].data
+        self.dropped = [int(x) for x in self.dropped]
@@ -88,33 +80,9 @@ class prepare_data:
         self.xy['ycoord'] = self.features['ycoord']
         self.xy['xcoord'] = self.features['xcoord']

-        # Remove all features that shall not be included in
-        # prediction from DataFrame (see settings!)
-        if len(settings.not_included_pred_data) > 0:
-            for dataset in settings.not_included_pred_data:
-                self.features = self.features.drop(dataset, axis=1)
-        # Determine which classes are contained in the categorical features
-        # It is distinguished between one-hot and ordinal encoded features
-        self.categorical_classes = {}
-        cat_subset = [feat for feat in self.features.columns.tolist() if '_encoded' in feat]
-        df_sub = self.features[cat_subset]
-        cat_feat = ['_'.join(col.split('_')[:len(col.split('_'))-1]) for col in df_sub.columns.tolist()]
-        self.distibuish_encoding = {}
-        for feat in list(set(cat_feat)):
-            classes = []
-            if cat_feat.count(feat)>1:
-                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
-                self.distibuish_encoding[feat] = 'ohe'
-            else:
-                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
-                self.distibuish_encoding[feat] = 'ordinal'
-            self.categorical_classes[feat] = {}
-            self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
-            self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat)
+        self.features = self.features.drop(['xcoord', 'ycoord'], axis=1)

         self.feature_list = list(self.features.columns)
-        self.features_org = self.features.copy()
+        self.features = np.array(self.features)

         self.logger.info('Features for prediction were imported')
         self.logger.info('The following '
@@ -133,6 +101,7 @@ class prepare_data:
             self.features = pd.read_csv(settings.path_train + 'training.csv')
         else:
             self.features = pd.read_csv(settings.path_train)
+
         # Extract and remove labels from training dataset
         self.labels = np.array(self.features[self.label_name]).reshape(
             [np.shape(self.features[self.label_name])[0], 1])
@@ -142,47 +111,14 @@ class prepare_data:
         self.xy['ycoord'] = self.features['ycoord']
         self.xy['xcoord'] = self.features['xcoord']

-        # Drop ID from training data
-        self.features = self.features.drop('ID', axis=1)
-        self.features = self.features.drop(['xcoord', 'ycoord'], axis=1)
-        # Remove all features that shall not be included in
-        # training from DataFrame (see settings!)
-        if self.retrain:
-            features_to_remove = pd.read_csv(settings.path_ml + settings.model_to_save + '/feature_mismatch_training.csv')['to_drop'].to_list()
-            not_included_train_data = settings.not_included_train_data + features_to_remove
-        else:
-            not_included_train_data = settings.not_included_train_data
-        if len(not_included_train_data) > 0:
-            for dataset in not_included_train_data:
-                self.features = self.features.drop(dataset, axis=1)
-        # Determine which classes are contained in the categorical features
-        # It is distinguished between one-hot and ordinal encoded features
-        self.categorical_classes = {}
-        cat_subset = [feat for feat in self.features.columns.tolist() if '_encoded' in feat]
-        df_sub = self.features[cat_subset]
-        cat_feat = ['_'.join(col.split('_')[:-2]) for col in df_sub.columns.tolist()]
-        for feat in list(set(cat_feat)):
-            classes = []
-            if cat_feat.count(feat)>1:
-                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
-            else:
-                classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f])
-            self.categorical_classes[feat] = {}
-            self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist]
-            self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat)
+        self.features = self.features.drop(['xcoord', 'ycoord', 'ID'], axis=1)

         self.feature_list = list(self.features.columns)
+        self.features = np.array(self.features)

         self.logger.info('Features for training were imported')
         self.logger.info('The following ' + str(len(self.feature_list))
                          + ' features are included in the training dataset: '
                          + str(self.feature_list))
-        self.features = np.array(self.features)
     def split_training_testing(self):
@@ -196,31 +132,29 @@ class prepare_data:
             test_size=self.test_size,
             random_state=settings.random_seed,
             stratify=self.labels)

         print('Data split')
         self.logger.info('Training data split in training and test dataset')

 class RandomForest(prepare_data):

-    def __init__(self, aim, parallel=False, log=None, retrain=None):
-        super().__init__(aim, log, retrain)
+    def __init__(self, aim, parallel=False, log=None):
+        super().__init__(aim, log)

         self.aim = aim
         self.parallel = parallel
-        self.retrain = retrain
         self.logger = log
         self.num_chunks = 10

         # Random Forest settings
         self.criterion = settings.criterion
         self.n_estimators = settings.num_trees
         self.max_depth = settings.depth
         self.model_dir = settings.model_database_dir
-        if self.retrain:
-            self.model_to_load = settings.model_to_load + '_retrain'
-            self.model_to_save = settings.model_to_save + '_retrain'
-        else:
-            self.model_to_load = settings.model_to_load
-            self.model_to_save = settings.model_to_save
+        self.model_to_load = settings.model_to_load
+        self.model_to_save = settings.model_to_save

         self.output_dir = None
@@ -240,7 +174,6 @@ class RandomForest(prepare_data):
             print('Prediction is performed')
             self.create_output_dir()
             self.load_model()
-            if not self.error:
-                self.predict()
-                self.extract_pos_neg_predictions()
-                self.reshape_prediction()
+            self.predict()
+            self.extract_pos_neg_predictions()
+            self.reshape_prediction()
...@@ -375,8 +308,7 @@ class RandomForest(prepare_data): ...@@ -375,8 +308,7 @@ class RandomForest(prepare_data):
'roc_tpr': self.tpr, 'roc_tpr': self.tpr,
'roc_auc': self.roc_auc, 'roc_auc': self.roc_auc,
'accuracy': self.acc, 'accuracy': self.acc,
'fbeta': self.fbeta, 'fbeta': self.fbeta
'categories': self.categorical_classes
} }
with open(settings.model_database_dir with open(settings.model_database_dir
...@@ -386,77 +318,6 @@ class RandomForest(prepare_data): ...@@ -386,77 +318,6 @@ class RandomForest(prepare_data):
self.logger.info('Parameters are saved') self.logger.info('Parameters are saved')
-    def adapt_categorical_features(self, train_classes, training_features):
-        """
-        The encoded features in the training and prediction dataset are
-        compared regarding the contained classes. Depending on the user
-        input, instances in the prediction dataset with classes that are
-        not included in the training dataset are either set to no_value or
-        nevertheless considered in the prediction. The surplus additional
-        features are removed either way to achieve the same set of features
-        as in the training dataset
-        """
-        self.instances_to_drop = []
-        self.features_not_in_training = []
-        for feat in [val for val in training_features if '_encode' in val]:
-            if feat not in self.feature_list:
-                print('Error: cannot proceed with mapping')
-                print('Error: Categorical feature ' + feat + ' not in prediction dataset')
-                self.logger.error('Error: Categorical feature ' + feat + ' not in prediction dataset')
-                self.error = True
-                self.retrain = True
-                self.features_not_in_training.append(feat)
-        if len(self.features_not_in_training) > 0:
-            pd.DataFrame(self.features_not_in_training, columns=['to_drop']).to_csv(self.model_dir + self.model_to_load + 'feature_mismatch_training.csv', index=False)
-        if not self.retrain:
-            if list(set([val for val in training_features if '_encode' in val])) != list(set(self.feature_list)):
-                for feat in list(set(['_'.join(val.split('_')[:-2]) for val in self.feature_list if '_encode' in val])):
-                    if feat in list(self.distibuish_encoding.keys()):
-                        if self.distibuish_encoding[feat] == 'ohe':
-                            if (train_classes[feat]['num_cols'] < self.categorical_classes[feat]['num_cols']) or (set(train_classes[feat]['classes']) != set(self.categorical_classes[feat]['classes'])):
-                                print(feat + ': Prediction dataset contains more or other classes than training dataset')
-                                self.logger.warning(feat + ': Prediction dataset contains more classes than training dataset')
-                                self.logger.info('Apply user defined handling approach')
-                                common_elements = set(train_classes[feat]['classes']).intersection(set(self.categorical_classes[feat]['classes']))
-                                if self.properties_map['keep']:
-                                    if len(common_elements) == 0:
-                                        print('Error: no common classes for ' + feat + ' in training and prediction dataset')
-                                        self.logger.error('Error: no common classes for ' + feat + ' in training and prediction dataset')
-                                        self.error = True
-                                    else:
-                                        to_drop = [feat + '_' + str(f) + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements]
-                                        self.features = self.features.drop(to_drop, axis=1)
-                                        self.feature_list = self.features.columns.tolist()
-                                elif self.properties_map['remove_instances']:
-                                    to_drop_col = [feat + '_' + str(f) + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements]
-                                    to_drop_row = []
-                                    for col in to_drop_col:
-                                        to_drop_row = to_drop_row + self.features.index[self.features[col] == 1].tolist()
-                                    self.features = self.features.drop(to_drop_col, axis=1)
-                                    print('Not matching features have been removed')
-                                    self.logger.info('Not matching features have been removed')
-                                    self.feature_list = self.features.columns.tolist()
-                                    self.instances_to_drop = self.instances_to_drop + to_drop_row
-                                    print('Instances to consider during mapping have been adapted')
-                                    self.logger.info('Instances to consider during mapping have been adapted')
-        print('Categorical features have been handled and hamonised')
-        self.logger.info('Categorical features have been handled and hamonised')
-        self.logger.info('Remaining features: ' + str(self.feature_list))
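Note: the removed method above survives conceptually in the new compatibility module added further below. As a minimal standalone sketch of the underlying idea, assuming toy column names that follow the project's <feature>_<class>_encoded convention (none of these names come from the repository):

    import pandas as pd

    # Toy one-hot encoded prediction data (invented columns).
    pred = pd.DataFrame({
        'slope': [10, 20, 30],
        'geology_A_encoded': [1, 0, 0],
        'geology_B_encoded': [0, 1, 0],
        'geology_C_encoded': [0, 0, 1],  # class C was never seen in training
    })
    train_cols = ['slope', 'geology_A_encoded', 'geology_B_encoded']

    # Drop one-hot columns for classes the model never saw ...
    extra = [c for c in pred.columns if c not in train_cols]
    # ... and remember instances that belonged only to those classes,
    # since no reliable prediction is possible for them.
    rows_to_drop = pred.index[(pred[extra] == 1).any(axis=1)].tolist()
    pred = pred.drop(columns=extra)
    print(rows_to_drop)              # [2]
    print(pred.columns.tolist())     # ['slope', 'geology_A_encoded', 'geology_B_encoded']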
     def load_model(self):
         """
@@ -478,63 +339,7 @@ class RandomForest(prepare_data):
                   + '/model_params.pkl', 'rb') as f:
             params = pkl.load(f)
         features = params['features']
-        self.error = False
-        self.adapt_categorical_features(params['categories'], features)
-        if not self.error:
-            if len(self.feature_list) == len(features):
-                if set(self.feature_list) != set(features):
-                    print('Error: Not all features of the model are contained in the prediction dataset')
-                    self.logger.error('Error: Not all features of the model are contained in the prediction dataset')
-                    self.error = True
-                elif self.feature_list != features:
-                    print('The order or features differs. Prediction features are reordered')
-                    self.logger.info('The order or features differs. Prediction features are reordered')
-                    self.features = self.features[features]
-                    if self.features.columns.tolist() != features:
-                        print('There is still something wrong with the order of the features!')
-                elif self.feature_list == features:
-                    print('Prediction and training dataset have the same order')
-                    self.logger.info('Prediction and training dataset have the same order')
-            elif len(self.feature_list) < len(features):
-                print('Error: Not all features of the model are contained in the prediction dataset')
-                self.logger.error('Error: Not all features of the model are contained in the prediction dataset')
-                self.error = True
-            elif len(self.feature_list) > len(features):
-                if set(features).issubset(self.feature_list):
-                    to_drop = list(set(self.feature_list)-set(features))
-                    self.features = self.features.drop(to_drop, axis=1)
-                    self.feature = self.features[features]
-                    if self.features.columns.tolist() != features:
-                        print('There is still something wrong with the order of the features!')
-                        self.error = True
-                    else:
-                        print('Features in the prediction dataset which were not used for training were removed')
-                        print('Features in the prediction dataset were sorted to match the training features')
-                        self.logger.warning('Features in the prediction dataset which were not used for training were removed')
-                        self.logger.info('Features left: ' + str(self.feature_list))
-                        self.logger.info('Features in the prediction dataset were sorted to match the training features')
-                else:
-                    Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid(
-                        row=self.row, column=1)
-                    self.row = self.row + 1
-                    self.master.update()
-                    self.logger.error('Error: Not all features of the model are contained in the prediction dataset')
-                    self.error = True
-        if not self.error:
-            self.feature_list = self.features.columns.tolist()
-            self.features = self.features.to_numpy()
         self.logger.info('Model loaded from '
                          + self.model_dir
@@ -572,9 +377,8 @@ class RandomForest(prepare_data):
         Reshape the individual predictions into a map.
         """

-        dropped = list(set(self.dropped + self.instances_to_drop))
         arr_xy = np.array(self.xy)
-        arr_xy[dropped, :] = settings.no_value#*np.shape(arr_xy)[1]
+        arr_xy[self.dropped, :] = settings.no_value#*np.shape(arr_xy)[1]
         result = np.reshape(list(arr_xy[:, 2]),
                             (len(list(set(self.xy['ycoord']))),
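The reshape relies on the instances being ordered row by row on a regular grid, with the map dimensions recovered from the unique coordinate values. A minimal sketch of that logic with invented coordinates:

    import numpy as np

    # Toy stand-in for self.xy plus a prediction column: each row is
    # (ycoord, xcoord, predicted value) for one pixel on a regular grid.
    ycoord = np.repeat([46.0, 46.1], 3)        # 2 unique row coordinates
    xcoord = np.tile([7.0, 7.1, 7.2], 2)       # 3 unique column coordinates
    pred = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
    arr_xy = np.column_stack([ycoord, xcoord, pred])

    no_value = -999
    dropped = [4]                  # indices without a reliable prediction
    arr_xy[dropped, :] = no_value  # mask them before reshaping

    # Flat prediction column -> (rows, cols) map; assumes row-major order.
    result = arr_xy[:, 2].reshape(len(set(ycoord)), len(set(xcoord)))
    print(result)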
...
@@ -14,16 +14,16 @@ class check_general_settings():
         if training_dataset or map_generation:
             if os.path.isdir(path_train):
-                save_path = path_train + 'check_user_input.log'
+                save_path = path_train + '/check_user_input.log'
             else:
-                save_path = os.path.dirname(path_train) + 'check_user_input.log'
+                save_path = os.path.dirname(path_train) + '/check_user_input.log'
         elif prediction_dataset:
             if os.path.isdir(path_pred):
-                save_path = path_pred + 'check_user_input.log'
+                save_path = path_pred + '/check_user_input.log'
             else:
-                save_path = os.path.dirname(path_pred) + 'check_user_input.log'
+                save_path = os.path.dirname(path_pred) + '/check_user_input.log'
         else:
-            save_path = 'check_user_input.log'
+            save_path = '/check_user_input.log'

         if os.path.exists(save_path):
             os.remove(save_path)
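A side note on this fix: prepending '/' by hand leaves a double slash when the path already ends in a separator (harmless on POSIX) and makes the final else-branch write to the filesystem root. A more robust variant of the same logic, shown only as a suggestion rather than the project's code, delegates separator handling to os.path.join:

    import os

    def log_path(base=None):
        """Hypothetical helper: place check_user_input.log next to `base`."""
        if base is None:
            return 'check_user_input.log'   # current working directory
        if os.path.isdir(base):
            return os.path.join(base, 'check_user_input.log')
        return os.path.join(os.path.dirname(base), 'check_user_input.log')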
...
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 29 13:20:59 2025

@author: aedrich
"""

import numpy as np
import pandas as pd
import netCDF4 as nc
import pickle as pkl
import os
import logging
import settings
import re

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, fbeta_score
from joblib import delayed, Parallel
from tkinter import Label

from utilities.ncfile_generation import generate_basic_ncfile
from utilities.strings_for_ncfile import char_to_string, features_to_char

class comparison_training_prediction_dataset:

    def __init__(self, logger):

        self.logger = logger
        self.error = False

        self.import_prediction_dataset()
        self.import_training_dataset()
        self.compare_features()

        if not self.error:
            self.additional_instances_to_drop()
            self.save_prediction_dataset()
            self.save_training_dataset()
    def import_prediction_dataset(self):

        ds = nc.Dataset(settings.path_pred)
        pred = ds['Result'][:, :].data
        pred_features = ds['features'][:].data
        self.feature_list = char_to_string(pred_features)

        if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list:
            self.pred = pd.DataFrame(pred, columns=self.feature_list)
        else:
            self.pred = pd.DataFrame(pred, columns=['xcoord', 'ycoord'] + self.feature_list)

        self.xy = pd.DataFrame()
        self.xy['ycoord'] = self.pred['ycoord']
        self.xy['xcoord'] = self.pred['xcoord']

        self.idx = ds['Dropped'][:].data
        self.idx = [int(x) for x in self.idx]

        if len(settings.not_included_pred_data) > 0:
            for dataset in settings.not_included_pred_data:
                if dataset in self.pred.columns.tolist():
                    self.pred = self.pred.drop(dataset, axis=1)

        self.logger.info('Prediction dataset imported')
        self.logger.info('The following ' + str(len(self.pred.columns.tolist()))
                         + ' features are included in the prediction dataset: '
                         + str(self.pred.columns.tolist()))
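char_to_string comes from utilities.strings_for_ncfile and is not part of this commit; since the 'features' variable is stored as a netCDF 'S1' character array (see save_prediction_dataset below), a decoder along the following lines is presumably what it does. The separator and the helper's exact behaviour are assumptions, illustrative only:

    import numpy as np

    def char_to_string_sketch(char_arr, sep='/'):
        """Illustrative decoder: join an S1 char array into feature names.

        Assumes the writer stored all names as one character sequence with
        a separator between them; the real helper may differ in detail.
        """
        chars = [c.decode() if isinstance(c, bytes) else str(c) for c in char_arr]
        return ''.join(chars).split(sep)

    arr = np.frombuffer(b'slope/aspect', dtype='S1')
    print(char_to_string_sketch(arr))   # ['slope', 'aspect']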
    def import_training_dataset(self):

        # Import training dataset as csv file
        self.train = pd.read_csv(settings.path_train)

        # Extract and remove labels from training dataset
        self.labels = np.array(
            self.train['label']).reshape(
                [np.shape(self.train['label'])[0], 1])

        self.xy_train = pd.DataFrame()
        self.xy_train['ID'] = self.train['ID']
        self.xy_train['label'] = self.train['label']
        self.xy_train['ycoord'] = self.train['ycoord']
        self.xy_train['xcoord'] = self.train['xcoord']

        self.train = self.train.drop(['xcoord', 'ycoord', 'ID', 'label'], axis=1)

        if len(settings.not_included_train_data) > 0:
            for dataset in settings.not_included_train_data:
                if dataset in self.train.columns.tolist():
                    self.train = self.train.drop(dataset, axis=1)

        self.logger.info('Training dataset imported')
        self.logger.info('The following ' + str(len(self.train.columns.tolist()))
                         + ' features are included in the training dataset: '
                         + str(self.train.columns.tolist()))
    def compare_features(self):
        """
        It is assessed whether all features in the training dataset also
        appear in the prediction dataset. If that is not the case, the
        training process will be relaunched with an adapted training
        dataset from which the feature(s) not contained in the prediction
        dataset are removed. The second trained model will be stored in a
        separate folder which is named <old_folder_name>_retrain.
        If more features appear in the prediction dataset, the additional
        features are removed.
        """
        self.logger.info('Features are compared between training and prediction dataset')

        if set(self.train.columns) == set(self.pred.columns):
            self.logger.info('Features are identical in both training and prediction dataset')
            self.pred = self.pred[self.train.columns]
            self.logger.info('Potentially varying order of features has been fixed')
            self.error = False
        else:
            self.logger.warning('Features are not identical in the training and prediction dataset')
            extra_in_pred = set(self.pred.columns) - set(self.train.columns)
            extra_in_train = set(self.train.columns) - set(self.pred.columns)

            if len(extra_in_pred) > 0 and len(extra_in_train) == 0:
                self.logger.warning('More features in prediction dataset, additional features are removed')
                self.pred = self.pred[self.train.columns]
                self.error = False
            elif len(extra_in_train) > 0 and len(extra_in_pred) == 0:
                self.logger.warning('More features in training dataset, additional features are removed')
                self.train = self.train[self.pred.columns]
                self.error = False
            elif len(extra_in_train) > 0 and len(extra_in_pred) > 0:
                self.logger.warning('There are mismatching features in both datasets')
                self.common_columns = self.train.columns.intersection(self.pred.columns)

                if len(self.common_columns.tolist()) == 0:
                    self.logger.error('Error: No common columns in training and prediction dataset')
                    self.error = True
                elif len(self.common_columns.tolist()) < 6:
                    self.logger.warning('Warning: only ' + str(len(self.common_columns.tolist()))
                                        + ' common columns in training and prediction dataset')
                    self.error = False
                    self.train = self.train[self.common_columns]
                    self.pred = self.pred[self.common_columns]
                else:
                    self.logger.info(str(len(self.common_columns.tolist()))
                                     + ' common columns in training and prediction dataset')
                    self.error = False
                    self.train = self.train[self.common_columns]
                    self.pred = self.pred[self.common_columns]
            else:
                self.logger.error('Error: Unknown issue detected. Check features manually!')
                self.error = True

        self.logger.info('Feature comparison completed')
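A quick standalone check of the alignment logic above, with invented feature names; this corresponds to the branch where both datasets contain features the other lacks:

    import pandas as pd

    train = pd.DataFrame(columns=['slope', 'aspect', 'curvature'])
    pred = pd.DataFrame(columns=['aspect', 'slope', 'rainfall'])

    extra_in_pred = set(pred.columns) - set(train.columns)    # {'rainfall'}
    extra_in_train = set(train.columns) - set(pred.columns)   # {'curvature'}
    common = train.columns.intersection(pred.columns)

    # Both datasets are reduced to the shared features, in the same order.
    train, pred = train[common], pred[common]
    print(train.columns.tolist())   # ['slope', 'aspect']
    print(pred.columns.tolist())    # ['slope', 'aspect']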
    def additional_instances_to_drop(self):
        """
        All instances that have a value of zero in all columns of a
        categorical feature are identified and appended to the list of
        instances for which a reliable prediction is not possible.

        Input:
            idx: previously identified instances for which prediction is
                 not possible, list
            pred: prediction dataset, pandas DataFrame

        Output:
            idx: updated list of instances for which prediction is not
                 possible, list
        """
        self.logger.info('Start identification of instances that are not represented by at least one categorical feature')
        columns = self.pred.columns

        # Regular expression to match "<feature>_<value>_encoded"
        pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")
        encoded_features = {pattern.match(col).group(1) for col in columns if pattern.match(col)}
        self.logger.info('Identified encoded features: ' + str(encoded_features))

        count = 0
        for feature in encoded_features:
            feature_cols = [col for col in self.pred.columns
                            if col.startswith(feature) and col.endswith("_encoded")]
            # Rows that are zero in every one-hot column of this feature
            # belong to a class the encoding does not represent.
            all_zero_rows = (self.pred[feature_cols] == 0).all(axis=1)
            all_zero_rows = self.pred.index[all_zero_rows].tolist()
            self.idx = list(set(self.idx + all_zero_rows))
            count = count + len(all_zero_rows)

        self.logger.info(str(count) + ' instances have been identified that are not represented by at least one categorical feature')
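The regular expression strips an optional trailing class id, so geology_1_encoded and geology_2_encoded both map to the feature geology. A short demonstration with invented columns, including one instance whose class is not represented by any one-hot column:

    import re
    import pandas as pd

    pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")
    cols = ['geology_1_encoded', 'geology_2_encoded', 'slope']
    print({pattern.match(c).group(1) for c in cols if pattern.match(c)})
    # {'geology'}

    pred = pd.DataFrame({'geology_1_encoded': [1, 0, 0],
                         'geology_2_encoded': [0, 1, 0],
                         'slope': [3.0, 4.0, 5.0]})
    geo = [c for c in pred.columns
           if c.startswith('geology') and c.endswith('_encoded')]
    all_zero = (pred[geo] == 0).all(axis=1)
    print(pred.index[all_zero].tolist())  # [2]: instance of an unrepresented class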
    def save_prediction_dataset(self):
        """
        Save prediction dataset and information on dropped rows as nc-file
        """
        self.pred = pd.concat([self.xy, self.pred], axis=1)
        pred = self.pred.to_numpy()
        char_features = features_to_char(self.pred.columns)

        outfile = settings.path_pred
        self.logger.info('Prediction dataset is saved to ' + outfile)

        if os.path.exists(outfile):
            os.remove(outfile)

        ds = generate_basic_ncfile(outfile, crs=None)
        ds.createDimension('lat', (np.shape(pred)[0]))
        ds.createDimension('lon', (np.shape(pred)[1]))
        ds.createDimension('ix', (len(self.idx)))
        ds.createDimension('feat', len(char_features))
        result = ds.createVariable('Result', 'f4', ('lat', 'lon'))
        dropped = ds.createVariable('Dropped', 'u8', 'ix')
        Features = ds.createVariable('features', 'S1', 'feat')
        result[:, :] = pred
        dropped[:] = np.array(self.idx)
        Features[:] = char_features
        ds.close()
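For orientation, the stored file can be read back with plain netCDF4 exactly as import_prediction_dataset does above; the file name here is a placeholder:

    import netCDF4 as nc

    ds = nc.Dataset('prediction_dataset.nc')           # placeholder path
    table = ds['Result'][:, :].data                    # instances x features
    dropped = [int(i) for i in ds['Dropped'][:].data]  # unreliable instances
    raw_names = ds['features'][:].data                 # S1 char array, see above
    ds.close()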
    def save_training_dataset(self):
        """
        Save dataframe as csv. If necessary folder is created.
        """
        self.logger.info('Saving of training data in progress')

        outfile = settings.path_train

        # If outfile exists already, delete
        if os.path.exists(outfile):
            os.remove(outfile)

        self.train = pd.concat([self.xy_train, self.train], axis=1)

        # Save dataframe as csv
        self.train.to_csv(outfile, sep=',', index=False)
        self.logger.info('Training dataset saved')
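Usage mirrors the call added to the main script further below; only the logger setup is sketched here, and settings.path_pred/settings.path_train must point to existing files:

    import logging
    from compatibility_of_input_datasets import comparison_training_prediction_dataset

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('shire')

    s = comparison_training_prediction_dataset(logger)
    if not s.error:
        print('Datasets are compatible, mapping can proceed')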
...

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This is a template file for settings.py
Either duplicate and rename or fill out and rename.
More information on the individual meaning and what to consider can be
found in the user manual.
"""

import logging
import json
import types

def export_variables(logger):

    variables = globals()

    # Filter out non-serializable objects
    defined_vars = {}
    for k, v in variables.items():
        if not k.startswith('__') and not callable(v) and not isinstance(v, types.ModuleType):
            try:
                # Test if the value is JSON serializable
                json.dumps(v)
                defined_vars[k] = v
            except (TypeError, OverflowError):
                # Skip non-serializable values
                pass

    # Convert the dictionary to a JSON string
    vars_json = json.dumps(defined_vars, indent=4)
    logger.info("Exported variables: %s", vars_json)
# Mandatory parameters
days = 2
approach = 'statistical'
# Steps
training_dataset = False # Boolean, if training dataset shall be created
preprocessing = 'no_interpolation' # Defines preprocessing approach: 'cluster', 'interpolation', 'no_interpolation'
train_from_scratch = True
train_delete = None
prediction_dataset = False # Boolean, if prediction dataset shall be created
pred_from_scratch = True
pred_delete = None
map_generation = True # Boolean, if mapping shall be performed
# General
crs = 'wgs84' # Coordinate reference system, string
no_value = -999 # No data value, integer, suggestion -999
random_seed = 42 # Random seed, integer
resolution = 25 # Resolution in m of the final map, integer, all datasets will be interpolated to this resolution
path_ml = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/' # Path to where shire framework related parameters/files will be stored
data_summary_path = None # Path to the data summary file, string, relevant only for training/prediction dataset generation
key_to_include_path = None # Path to keys_to_include file, string, relevant only for training/prediction dataset generation
# Training dataset generation
size = None # Size of the validation dataset, float number between 0 and 1
path_train = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/training_datasets/{days}/training_statistical_{days}d.csv' # Path to directory where the training dataset is/shall be stored
ohe = None # One-hot encoding, bool
path_landslide_database = None # Path to where the landslide database is stored, string
ID = 'ID' # Name of the column containing landslide ID, string
landslide_database_x = 'xcoord' # Name of the column containing longitude values, string
landslide_database_y = 'ycoord' # Name of the column containing latitude values, string
path_nonls_locations = None # Path to where the non-landslide database is stored, string
num_nonls = None # Number of non-landslide locations to include in the training dataset, integer
nonls_database_x = None # Name of the column containing longitude values, string
nonls_database_y = None # Name of the column containing latitude values, string
#cluster = False # Use clustering for training dataset generation, bool
#interpolation = False # Use interpolation for training dataset generation, bool
# Prediction dataset generation
bounding_box = None # Coordinates of the edges of the bounding box of the area of interest, list, [<ymax>, <ymin>, <xmin>, <xmax>]
path_pred = None # Path to directory where the prediction dataset is/shall be stored
# Map generation
RF_training = True # Train the RF, bool
RF_prediction = True # Make a prediction using the RF, bool
not_included_pred_data = ['xcoord', 'ycoord'] # List of features in the prediction dataset not to be considered in prediction
not_included_train_data = [] # List of features in the training dataset not to be considered in model training
num_trees = 100 # Number of trees in the Random Forest, integer
criterion = 'gini' # Criterion for the Random Forest, string
depth = 20 # Maximum depth of the trees in the RF, integer
model_to_save = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/{approach}/RF_{days}' # Folder name for storage of the RF results, string
model_to_load = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/{approach}/RF_{days}' # Folder where RF model is stored, string, identical to model_to_save if training and prediction is done at the same time
model_database_dir = path_ml # Directory where models should be stored
parallel = True # Boolean, true if prediction data shall be split to predict in parallel
keep_cat_features = False # bool, true if categorical features shall be kept even if some instances in the prediction dataset have classes not covered by the training dataset
remove_instances = True # bool, true if instances in the prediction dataset shall be removed if they have different classes than the instances in the training dataset
\ No newline at end of file
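Note that path_train, model_to_save and model_to_load above contain {days} and {approach} placeholders but are plain strings, so they presumably need to be expanded before use, e.g.:

    # Hypothetical expansion of the template placeholders before use:
    days = 2
    approach = 'statistical'
    path = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/{approach}/RF_{days}'
    print(path.format(approach=approach, days=days))
    # /Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/statistical/RF_2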
@@ -9,6 +9,8 @@ from create_training_data import create_training_data
 from create_prediction_data import create_prediction_data
 from RandomForest import RandomForest
 from check_user_input import check_general_settings
+from compatibility_of_input_datasets import comparison_training_prediction_dataset
+
 from utilities.initialise_log import save_log

 """
@@ -78,6 +80,12 @@ else:
     print('Map will be generated')
     logger.info('Map generation started')
-    if settings.parallel:
-        print('Prediction will run in parallel')
-        logger.info('Prediction will run in parallel')
+    print('Training and prediction dataset will be assessed for compatibility')
+    logger.info('Training and prediction dataset will be assessed for compatibility')
+    s = comparison_training_prediction_dataset(logger)
+
+    if not s.error:
+        if settings.parallel:
+            print('Prediction will run in parallel')
+            logger.info('Prediction will run in parallel')
...