diff --git a/mitm_tooling/data_types/convert.py b/mitm_tooling/data_types/convert.py
index 7f9abded500792f7b8551e57e8b73086e80c9f31..24ad2b06cf66ea033935dec6e17c90a181f7103e 100644
--- a/mitm_tooling/data_types/convert.py
+++ b/mitm_tooling/data_types/convert.py
@@ -22,13 +22,14 @@ def convert_df(df: pd.DataFrame, data_types: dict[str, MITMDataType], inplace=Fa
     res = pd.DataFrame(data=data, index=df.index)
 
     for col, dt in data_types.items():
-        try:
-            # if inplace:
-            #     df[col] = convert_df_col(df, col, dt, inplace=False)
-            # else:
-            res[col] = convert_df_col(df, col, dt, inplace=False)
-        except Exception as e:
-            raise ColumnDataTypeConversionException(f'Conversion of feature \'{col}\' to {dt} failed:\n{e}')
+        if col in df.columns:
+            try:
+                # if inplace:
+                #     df[col] = convert_df_col(df, col, dt, inplace=False)
+                # else:
+                res[col] = convert_df_col(df, col, dt, inplace=False)
+            except Exception as e:
+                raise ColumnDataTypeConversionException(f'Conversion of feature \'{col}\' to {dt} failed:\n{e}')
 
     return res
 
diff --git a/mitm_tooling/extraction/sql/mapping/export.py b/mitm_tooling/extraction/sql/mapping/export.py
index ae9a22176fad4d8096c871a4356664664125d129..0f5a7635d7df8189f759b086a7e2fe58f71e125d 100644
--- a/mitm_tooling/extraction/sql/mapping/export.py
+++ b/mitm_tooling/extraction/sql/mapping/export.py
@@ -82,15 +82,18 @@ class Exportable(pydantic.BaseModel):
 
             chunk_iterators = []
             for dp in dps:
-                def local_iter(dp: DataProvider, columns=tuple(concept_file_columns)) -> Iterator[
+                def local_iter(dp: DataProvider = dp, columns=tuple(concept_file_columns)) -> Iterator[
                     tuple[pd.DataFrame, list[HeaderEntry]]]:
                     for df_chunk in dp.instance_provider.apply_db_chunked(bind, STREAMING_CHUNK_SIZE):
                         if validate:
                             raise NotImplementedError
+                        df_chunk = dp.instance_postprocessor.apply_df(df_chunk)
+                        hes = dp.header_entry_provider.apply_df(df_chunk)
+                        # this does nothing more than add NaN columns to pad up to the number of attributes (k) in the concept file
                         df_chunk = df_chunk.reindex(columns=list(columns), copy=False)
-                        yield dp.instance_postprocessor.apply_df(df_chunk), dp.header_entry_provider.apply_df(df_chunk)
+                        yield df_chunk, hes
 
-                chunk_iterators.append(local_iter(dp))
+                chunk_iterators.append(local_iter())
 
             data_sources[main_concept] = StreamingConceptData(structure_df=structure_df, chunk_iterators=chunk_iterators)
 
diff --git a/mitm_tooling/representation/intermediate/deltas.py b/mitm_tooling/representation/intermediate/deltas.py
index eb2759fa9611e8edb34a8416f62bc39d9aa9a8d9..a7d9f6969ec5a518394156cda91df8dd0d423137 100644
--- a/mitm_tooling/representation/intermediate/deltas.py
+++ b/mitm_tooling/representation/intermediate/deltas.py
@@ -83,7 +83,7 @@ def diff_header_entry(a: HeaderEntry, b: HeaderEntry) -> TypeUpdate | None:
             zip(zip(a.attributes, a.attribute_dtypes), zip(b.attributes, b.attribute_dtypes))):
         if attr_a != attr_b or dt_a != dt_b:
             deltas.append(AttributeUpdate(index=i, name=attr_b, dt=dt_b))
-    k_a, k_b = a.get_k(), b.get_k()
+    k_a, k_b = a.attr_k, b.attr_k
     x = k_b - k_a
     if x > 0:
         for j, (attr_b, dt_b) in zip(range(k_a, k_b), zip(b.attributes[k_a:], b.attribute_dtypes[k_a:])):
diff --git a/mitm_tooling/representation/intermediate/header.py b/mitm_tooling/representation/intermediate/header.py
index 31936ae7c3df544795be97de955ae63faa944e4d..9d9a52d225097c279cca25d47cf0649fb570fcf9 100644
--- a/mitm_tooling/representation/intermediate/header.py
+++ b/mitm_tooling/representation/intermediate/header.py
@@ -12,7 +12,9 @@ from pydantic import ConfigDict
 from mitm_tooling.data_types import MITMDataType
 from mitm_tooling.definition import get_mitm_def
 from mitm_tooling.definition.definition_representation import ConceptName, MITM, TypeName
+from .. import mk_attr_columns
 from ..common import mk_header_file_columns, ColumnName
+from mitm_tooling.definition.definition_tools import map_col_groups
 
 
 class HeaderEntry(pydantic.BaseModel):
@@ -50,17 +52,39 @@ class HeaderEntry(pydantic.BaseModel):
         return HeaderEntry(concept=concept, kind=kind, type_name=type_name, attributes=tuple(attrs),
                            attribute_dtypes=tuple(attr_dts))
 
-    @property
+    @cached_property
     def attr_dtype_pairs(self) -> Iterable[tuple[TypeName, MITMDataType]]:
         return zip(self.attributes, self.attribute_dtypes)
 
-    def get_k(self) -> int:
+    @cached_property
+    def attr_k(self) -> int:
         return len(self.attributes)
 
     def to_row(self) -> list[str | None]:
         return [self.kind, self.type_name] + list(
             itertools.chain(*zip(self.attributes, map(str, self.attribute_dtypes))))
 
+    @cached_property
+    def attr_name_map(self) -> dict[ColumnName, ColumnName]:
+        return {a_anon: a for a_anon, a in zip(mk_attr_columns(self.attr_k), self.attributes)}
+
+
+def mk_typed_df_columns(mitm: MITM, he: HeaderEntry) -> tuple[list[str], dict[str, MITMDataType]]:
+    mitm_def = get_mitm_def(mitm)
+    concept = he.concept
+    _, dts = map_col_groups(mitm_def, concept, {
+        'kind': lambda: ('kind', MITMDataType.Text),
+        'type': lambda: (mitm_def.get_properties(concept).typing_concept, MITMDataType.Text),
+        'identity': lambda: mitm_def.resolve_identity_type(concept).items(),
+        'inline': lambda: mitm_def.resolve_inlined_types(concept).items(),
+        'foreign': lambda: [
+            (name, dt) for fk_types in mitm_def.resolve_foreign_types(concept).values() for name, dt in
+            fk_types.items()],
+        'attributes': lambda: list(he.attr_dtype_pairs),
+    })
+
+    return list(dts.keys()), dict(dts)
+
 
 class Header(pydantic.BaseModel):
     model_config = ConfigDict(frozen=True)
@@ -75,7 +99,7 @@ class Header(pydantic.BaseModel):
 
     @property
     def max_k(self) -> int:
-        return max(map(lambda he: he.get_k(), self.header_entries), default=0)
+        return max(map(lambda he: he.attr_k, self.header_entries), default=0)
 
     def generate_header_df(self) -> pd.DataFrame:
         k = self.max_k
@@ -98,3 +122,8 @@ class Header(pydantic.BaseModel):
         for he in self.header_entries:
             res[he.concept][he.type_name] = he
         return dict(res)
+
+    @cached_property
+    def typed_df_columns(self) -> dict[ConceptName, dict[TypeName, tuple[list[str], dict[str, MITMDataType]]]]:
+        return {c: {tp: mk_typed_df_columns(self.mitm, he) for tp, he in tps.items()} for c, tps in
+                self.as_dict.items()}
diff --git a/mitm_tooling/transformation/df/from_exportable.py b/mitm_tooling/transformation/df/from_exportable.py
index 46326012e587d9304fd2b6a7e3fd9ff5726844cd..f82a9257261ff0be54a3779e6e922f0d7c67790c 100644
--- a/mitm_tooling/transformation/df/from_exportable.py
+++ b/mitm_tooling/transformation/df/from_exportable.py
@@ -4,8 +4,10 @@ import pandas as pd
 
 from mitm_tooling.definition import get_mitm_def, TypeName
 from mitm_tooling.extraction.sql.mapping import Exportable
+from mitm_tooling.representation import mk_header_file_columns
 from mitm_tooling.representation.df import TypedMitMDataFrameStream, StreamingMITMDataFrames, MitMDataFrameStream
 from mitm_tooling.representation.intermediate import HeaderEntry
+from mitm_tooling.representation.intermediate.header import mk_typed_df_columns
 from mitm_tooling.utilities.sql_utils import AnyDBBind, use_db_bind
 
 
@@ -25,7 +27,11 @@ def exportable_to_mitm_dataframes_stream(
             for df_chunk in chunks:
                 df_chunk = dp.instance_postprocessor.apply_df(df_chunk)
                 for type_name, type_idx in df_chunk.groupby(mitm_def.get_properties(c).typing_concept).groups.items():
-                    yield str(type_name), (df_chunk.loc[type_idx],)
+                    hes = dp.header_entry_provider.apply_df(df_chunk.loc[type_idx])
+                    assert len(hes) == 1, f'expected exactly one header entry per type, got {len(hes)}'
+                    he = hes[0]
+                    typed_df = df_chunk.loc[type_idx].rename(columns=he.attr_name_map)
+                    yield str(type_name), (typed_df,)
 
         yield c, df_chunks_iter(c, dps)
 
@@ -50,8 +56,9 @@ def exportable_to_typed_mitm_dataframes_stream(
                     hes = dp.header_entry_provider.apply_df(df_chunk.loc[type_idx])
                     assert len(hes) == 1, f'expected exactly one header entry per type, got {len(hes)}'
                     he = hes[0]
-
-                    yield str(type_name), he, (df_chunk.loc[type_idx],)
+                    # de-anonymize the columns a_i -> actual attribute names
+                    typed_df = df_chunk.loc[type_idx].rename(columns=he.attr_name_map)
+                    yield str(type_name), he, (typed_df,)
 
         yield c, typed_df_chunks_iter(c, dps)
 
diff --git a/mitm_tooling/transformation/df/from_intermediate.py b/mitm_tooling/transformation/df/from_intermediate.py
index 78672ae0948d6d7b0749df27d41a2c2bbc225948..2cccd36a15bfe3d66bd91b7039c7bb66e7705a5b 100644
--- a/mitm_tooling/transformation/df/from_intermediate.py
+++ b/mitm_tooling/transformation/df/from_intermediate.py
@@ -1,10 +1,12 @@
+import itertools
+
 import pandas as pd
 
 from mitm_tooling.data_types import convert
 from mitm_tooling.definition import get_mitm_def, ConceptName, TypeName
-from mitm_tooling.representation.intermediate import MITMData, Header, HeaderEntry
-from mitm_tooling.representation.df import MITMDataFrames
 from mitm_tooling.representation import mk_concept_file_header
+from mitm_tooling.representation.df import MITMDataFrames
+from mitm_tooling.representation.intermediate import MITMData, Header, HeaderEntry
 
 
 def unpack_concept_table_as_typed_dfs(header: Header, concept: ConceptName, df: pd.DataFrame) -> dict[
@@ -29,16 +31,15 @@ def unpack_concept_table_as_typed_dfs(header: Header, concept: ConceptName, df:
 
     res = {}
     for (concept, type_name), (he, type_df) in with_header_entry.items():
-        k = he.get_k()
-        base_cols, base_dts = mk_concept_file_header(header.mitm, concept, 0)
-        normal_form_cols, _ = mk_concept_file_header(header.mitm, concept, k)
+        k = he.attr_k
+        normal_form_cols, normal_form_dts = mk_concept_file_header(header.mitm, concept, k)
         type_df = type_df.reindex(columns=normal_form_cols)
-
-        unpacked_cols = list(base_cols) + list(he.attributes)
-        unpacked_dts = base_dts | dict(he.attr_dtype_pairs)
-        type_df.columns = unpacked_cols
-
-        res[he.type_name] = convert.convert_df(type_df, unpacked_dts)
+        type_df = type_df.rename(columns=he.attr_name_map)
+        dt_map = dict(itertools.chain(
+            ((a, dt) for a, dt in normal_form_dts.items() if a in set(type_df.columns)),
+            he.attr_dtype_pairs
+        ))
+        res[he.type_name] = convert.convert_df(type_df, dt_map)
     return res
 
 
diff --git a/mitm_tooling/transformation/superset/visualizations/abstract/base.py b/mitm_tooling/transformation/superset/visualizations/abstract/base.py
index 4dee588eb49746ad6373e60c2696554e8561d92c..33510b4afa6997718ef6c3be9d5d590868f64436 100644
--- a/mitm_tooling/transformation/superset/visualizations/abstract/base.py
+++ b/mitm_tooling/transformation/superset/visualizations/abstract/base.py
@@ -49,6 +49,7 @@ class ChartCollectionCreator(ABC):
     @classmethod
     def cls_from_dict(cls: type[Self], arg: dict[str, tuple[TableName, ChartCreator]]) -> type[Self]:
         arg_ = arg
+
         class ConcreteChartCollectionCreator(cls):
 
             @property
@@ -57,6 +58,23 @@ class ChartCollectionCreator(ABC):
 
         return ConcreteChartCollectionCreator
 
+    @classmethod
+    def prefixed(cls, prefix: str, wrapped_obj: Self) -> type[Self]:
+        prefix_ = prefix
+
+        class PrefixedChartCollectionCreator(cls):
+
+            @property
+            def chart_creators(self) -> dict[str, tuple[TableName, ChartCreator]]:
+                return {}
+
+            def mk_chart_collection(self,
+                                    ds_id_map: DatasetIdentifierMap,
+                                    ch_id_map: NamedChartIdentifierMap) -> ChartDefCollection:
+                return {f'{prefix_}-{k}': v for k, v in wrapped_obj.mk_chart_collection(ds_id_map, ch_id_map).items()}
+
+        return PrefixedChartCollectionCreator
+
 
 class DashboardCreator(ABC):
 
diff --git a/mitm_tooling/transformation/superset/visualizations/maed/dashboards.py b/mitm_tooling/transformation/superset/visualizations/maed/dashboards.py
index 4a8271a53545407c50972dfb7c0daf1f349e839c..e1b98bf205caa718e309e7da701d8ac0484f1830 100644
--- a/mitm_tooling/transformation/superset/visualizations/maed/dashboards.py
+++ b/mitm_tooling/transformation/superset/visualizations/maed/dashboards.py
@@ -15,7 +15,7 @@ def mk_header_markdown(header: Header) -> tuple[str, str]:
            + '\n'.join((
         f'| {he.kind} | {he.type_name} ' + ''.join(
             (f'| {a} (_{dt}_) ' for a, dt in
-             he.attr_dtype_pairs)) + ('| ' * max(0, header.max_k - he.get_k())) + '|' for he
+             he.attr_dtype_pairs)) + ('| ' * max(0, header.max_k - he.attr_k)) + '|' for he
         in header.header_entries))) + '\n'
 
     mitm_def = header.mitm_def
@@ -47,7 +47,7 @@ class BaselineMAEDDashboard(MitMDashboardCreator):
 
     @property
     def chart_collection_creator(self) -> ChartCollectionCreator:
-        return BaselineMAEDCharts(self.header, self.sql_rep_schema)
+        return ChartCollectionCreator.prefixed('baseline', BaselineMAEDCharts(self.header, self.sql_rep_schema))()
 
     @property
     def dashboard_title(self) -> str:
@@ -85,7 +85,7 @@ class ExperimentalMAEDDashboard(MitMDashboardCreator):
 
     @property
     def chart_collection_creator(self) -> ChartCollectionCreator:
-        return BaselineMAEDCharts(self.header, self.sql_rep_schema)
+        return ChartCollectionCreator.prefixed('experimental', BaselineMAEDCharts(self.header, self.sql_rep_schema))()
 
     @property
     def dashboard_title(self) -> str:
@@ -150,7 +150,7 @@ class CustomChartMAEDDashboard(MitMDashboardCreator):
     def chart_collection_creator(self) -> ChartCollectionCreator:
         mitm_dataset_identifier = self.mitm_dataset_identifier
         return ChartCollectionCreator.cls_from_dict({
-            'custom': ('observations',
+            'custom': ('custom-observations',
                        MAEDCustomChart(mitm_dataset_identifier))})()
 
     def build_dashboard(self,
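Reviewer note: below is a minimal sketch of the column de-anonymization that `HeaderEntry.attr_name_map` introduces and that the `from_exportable` / `from_intermediate` code paths now apply via `DataFrame.rename`. It assumes `mk_attr_columns(k)` yields the anonymous names `a_1, ..., a_k`; the concept, kind, type name, and attribute values are purely illustrative, not taken from the repository.

```python
# Illustrative sketch only (not part of the diff): the concept/kind/type_name values and the
# assumption that mk_attr_columns(k) produces 'a_1', ..., 'a_k' are hypothetical.
import pandas as pd

from mitm_tooling.data_types import MITMDataType
from mitm_tooling.representation.intermediate import HeaderEntry

he = HeaderEntry(concept='observation', kind='O', type_name='temperature',
                 attributes=('sensor', 'value'),
                 attribute_dtypes=(MITMDataType.Text, MITMDataType.Text))

# anonymous concept-file columns a_i are mapped back to the declared attribute names
print(he.attr_name_map)  # expected (under the assumption above): {'a_1': 'sensor', 'a_2': 'value'}

df_chunk = pd.DataFrame({'a_1': ['s1', 's2'], 'a_2': ['20.5', '21.0']})
typed_df = df_chunk.rename(columns=he.attr_name_map)
print(list(typed_df.columns))  # expected: ['sensor', 'value']
```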