diff --git a/src/gui_version/create_prediction_data_gui.py b/src/gui_version/create_prediction_data_gui.py index 499fa91f3071b0b3a15b7ca51c347da9e44177df..2b220e0d8a8f639a74b257ac60b41448fe5a98fb 100644 --- a/src/gui_version/create_prediction_data_gui.py +++ b/src/gui_version/create_prediction_data_gui.py @@ -543,6 +543,7 @@ class create_prediction_data: self.datasets_summary, self.properties_pred['ohe'], basic, + self.properties_settings['no_value'], var) to_drop = [] diff --git a/src/gui_version/create_training_data_gui.py b/src/gui_version/create_training_data_gui.py index 2c36e2d678cea10b4dab0ca81d0e611c319a7e55..feccdf30478d486b9e6437ba5fb7d0a048a3f4d2 100644 --- a/src/gui_version/create_training_data_gui.py +++ b/src/gui_version/create_training_data_gui.py @@ -1137,6 +1137,7 @@ class create_training_data: self.datasets_summary, self.properties_train['ohe'], basic, + self.properties_settings['no_value'], var) def save_training_data(self): diff --git a/src/gui_version/utilities/handle_categorical_values.py b/src/gui_version/utilities/handle_categorical_values.py index 9ceacf7f67d566cc11aec181f8c2e615dfd40562..166c077e5f40c039b734881ae497d4eeb9b74c91 100644 --- a/src/gui_version/utilities/handle_categorical_values.py +++ b/src/gui_version/utilities/handle_categorical_values.py @@ -6,7 +6,7 @@ import numpy as np from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): +def handle_categorical_values(df, datasets_summary, ohe, basic, no_data_value, var=None): """ Categorical features in the training dataset are either one hot @@ -46,6 +46,7 @@ def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): if ohe: encoder = OneHotEncoder(sparse=False) encoded_data = encoder.fit_transform(df[cat]) + unique_categories = {col: df[col].unique() for col in cat} custom_column_names = [] for col in cat: @@ -57,14 +58,27 @@ def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): df = pd.concat([df.drop(columns=cat), encoded_df], axis=1) else: - for feat in cat: - df[feat] = df[feat].apply(str) + cat = [feat for feat in cat if df[feat][df[feat] != no_data_value].apply(lambda x: isinstance(x, str)).all()] - columns_to_encode = cat.copy()#df.select_dtypes(include=['object', 'category']).columns.tolist() - encoder = OrdinalEncoder() - encoded_data = encoder.fit_transform(df[columns_to_encode]) - encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encoded" for col in columns_to_encode]) - df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) + if len(cat) > 0: + columns_to_encode = cat.copy() + encoder = OrdinalEncoder() + + # Mask: Identify rows where values are NOT no_data_value + mask = df[columns_to_encode] != no_data_value + + # Apply encoding only to valid values + encoded_data = df[columns_to_encode].copy() + encoded_data[mask] = encoder.fit_transform(df[columns_to_encode][mask]) + + # Explicitly set no_data_value where it was before + encoded_data[~mask] = no_data_value # Keep original no_data_value + + # Convert back to DataFrame + encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encode" for col in columns_to_encode]) + + # Merge encoded columns back into df + df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) return df diff --git a/src/plain_scripts/create_prediction_data.py b/src/plain_scripts/create_prediction_data.py index 820adaebec8bba17489da985a436ea17ba4b626b..655e5314ccfd6dd7779ea7880292c901024b2db2 100644 --- a/src/plain_scripts/create_prediction_data.py +++ b/src/plain_scripts/create_prediction_data.py @@ -131,6 +131,7 @@ class create_prediction_data: self.data_properties, settings.ohe, basic, + settings.no_value, var) to_drop = [] @@ -342,7 +343,7 @@ class create_prediction_data: ds.createDimension('ix', (len(self.idx))) ds.createDimension('feat', len(self.char_features)) result = ds.createVariable('Result', 'f4', ('lat', 'lon')) - dropped = ds.createVariable('Dropped', 'f4', 'ix') + dropped = ds.createVariable('Dropped', 'i4', 'ix') Features = ds.createVariable('features', 'S1', 'feat') result[:, :] = df_pred dropped[:] = self.idx diff --git a/src/plain_scripts/create_training_data.py b/src/plain_scripts/create_training_data.py index 03f251cb1e996c98ed236e1e60448d4668dec250..b2d7b7354f8122beeefb862ba01bc169307455f9 100644 --- a/src/plain_scripts/create_training_data.py +++ b/src/plain_scripts/create_training_data.py @@ -112,6 +112,7 @@ class create_training_data: self.data_properties, settings.ohe, basic, + settings.no_value, var) def delete_feature(self): diff --git a/src/plain_scripts/utilities/handle_categorical_values.py b/src/plain_scripts/utilities/handle_categorical_values.py index 65cc349478055c58c7b0db39e124ba4098b47edd..166c077e5f40c039b734881ae497d4eeb9b74c91 100644 --- a/src/plain_scripts/utilities/handle_categorical_values.py +++ b/src/plain_scripts/utilities/handle_categorical_values.py @@ -6,7 +6,7 @@ import numpy as np from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): +def handle_categorical_values(df, datasets_summary, ohe, basic, no_data_value, var=None): """ Categorical features in the training dataset are either one hot @@ -58,14 +58,27 @@ def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): df = pd.concat([df.drop(columns=cat), encoded_df], axis=1) else: - for feat in cat: - df[feat] = df[feat].apply(str) + cat = [feat for feat in cat if df[feat][df[feat] != no_data_value].apply(lambda x: isinstance(x, str)).all()] - columns_to_encode = cat.copy()#df.select_dtypes(include=['object', 'category']).columns.tolist() - encoder = OrdinalEncoder() - encoded_data = encoder.fit_transform(df[columns_to_encode]) - encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encoded" for col in columns_to_encode]) - df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) + if len(cat) > 0: + columns_to_encode = cat.copy() + encoder = OrdinalEncoder() + + # Mask: Identify rows where values are NOT no_data_value + mask = df[columns_to_encode] != no_data_value + + # Apply encoding only to valid values + encoded_data = df[columns_to_encode].copy() + encoded_data[mask] = encoder.fit_transform(df[columns_to_encode][mask]) + + # Explicitly set no_data_value where it was before + encoded_data[~mask] = no_data_value # Keep original no_data_value + + # Convert back to DataFrame + encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encode" for col in columns_to_encode]) + + # Merge encoded columns back into df + df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) return df