From f9284d809b4e494b476cca0a08b49b004e567cf7 Mon Sep 17 00:00:00 2001 From: Leah Tacke genannt Unterberg <leah.tgu@pads.rwth-aachen.de> Date: Tue, 21 Jan 2025 14:01:26 +0100 Subject: [PATCH] more work on sql and superset representation --- mitm_tooling/data_types/data_types.py | 36 +++++++----- mitm_tooling/extraction/__init__.py | 4 +- .../intermediate_representation.py | 24 ++++---- .../representation/sql_representation.py | 58 ++++++++++--------- mitm_tooling/transformation/__init__.py | 4 +- .../df/__init__.py | 0 .../df/intermediate_transformation.py | 7 +-- .../superset/dataset_definition.py | 2 +- .../superset/superset_representation.py | 4 +- test/something.py | 6 +- test/test_to_df.py | 22 +++---- 11 files changed, 93 insertions(+), 74 deletions(-) rename mitm_tooling/{extraction => transformation}/df/__init__.py (100%) rename mitm_tooling/{extraction => transformation}/df/intermediate_transformation.py (96%) diff --git a/mitm_tooling/data_types/data_types.py b/mitm_tooling/data_types/data_types.py index 056114f..9936664 100644 --- a/mitm_tooling/data_types/data_types.py +++ b/mitm_tooling/data_types/data_types.py @@ -7,7 +7,9 @@ import pydantic import sqlalchemy as sa from sqlalchemy.sql import sqltypes -SA_SQLTypeClass = type[sa.types.TypeEngine] +SA_SQLType = sa.types.TypeEngine +SA_SQLTypeInstanceBuilder = Callable[[], SA_SQLType] +SA_SQLTypeClass = type[SA_SQLType] SA_SQLTypeName = str PandasCast = Callable[[pd.Series], pd.Series] @@ -26,8 +28,14 @@ class MITMDataType(enum.StrEnum): Infer = 'infer' @property - def sa_sql_type(self) -> SA_SQLTypeClass | None: - return mitm_sql_type_map.get(self) + def sa_sql_type(self) -> SA_SQLType | None: + if pair := mitm_sql_type_map.get(self): + return pair[1]() + + @property + def sa_sql_type_cls(self) -> SA_SQLTypeClass | None: + if pair := mitm_sql_type_map.get(self): + return pair[0] @property def pandas_cast(self) -> PandasCast | None: @@ -35,7 +43,7 @@ class MITMDataType(enum.StrEnum): @property def sql_type_str(self) -> str: - return self.sa_sql_type.__name__ + return self.sa_sql_type_cls.__name__ def wrap(self) -> 'WrappedMITMDataType': return WrappedMITMDataType(mitm=self) @@ -57,7 +65,7 @@ def sa_sql_to_mitm_type(sa_type: SA_SQLTypeClass) -> MITMDataType: def mitm_to_sql_type(mitm_type: MITMDataType) -> SA_SQLTypeClass | None: - return mitm_type.sa_sql_type + return mitm_type.sa_sql_type_cls def mitm_to_pandas(mitm_type: MITMDataType) -> PandasCast | None: @@ -66,9 +74,9 @@ def mitm_to_pandas(mitm_type: MITMDataType) -> PandasCast | None: def get_sa_sql_type(type_name: EitherDataType | WrappedMITMDataType) -> SA_SQLTypeClass | None: if isinstance(type_name, MITMDataType): - return type_name.sa_sql_type + return type_name.sa_sql_type_cls elif isinstance(type_name, WrappedMITMDataType): - return type_name.mitm.sa_sql_type + return type_name.mitm.sa_sql_type_cls else: if type_name and (t := getattr(sqltypes, type_name, None)): if isinstance(t, type): @@ -108,13 +116,13 @@ sql_mitm_type_map: dict[SA_SQLTypeClass, MITMDataType] = { # sqltypes.BINARY: MITMDataType.Binary, } -mitm_sql_type_map: dict[MITMDataType, SA_SQLTypeClass] = { - MITMDataType.Text: sqltypes.String, - MITMDataType.Datetime: sqltypes.DATETIME_TIMEZONE, - MITMDataType.Json: sqltypes.JSON, - MITMDataType.Boolean: sqltypes.Boolean, - MITMDataType.Integer: sqltypes.Integer, - MITMDataType.Numeric: sqltypes.Float, +mitm_sql_type_map: dict[MITMDataType, None | tuple[SA_SQLTypeClass, SA_SQLTypeInstanceBuilder]] = { + MITMDataType.Text: (sqltypes.String, sqltypes.String), + MITMDataType.Datetime: (sqltypes.DATETIME, lambda: sqltypes.DATETIME_TIMEZONE), + MITMDataType.Json: (sqltypes.JSON, sqltypes.JSON), + MITMDataType.Boolean: (sqltypes.Boolean, sqltypes.Boolean), + MITMDataType.Integer: (sqltypes.Integer, sqltypes.Integer), + MITMDataType.Numeric: (sqltypes.Float, sqltypes.Float), MITMDataType.Unknown: None, MITMDataType.Infer: None, # MITMDataType.Binary: sqltypes.LargeBinary, diff --git a/mitm_tooling/extraction/__init__.py b/mitm_tooling/extraction/__init__.py index 10e03c2..932d0a6 100644 --- a/mitm_tooling/extraction/__init__.py +++ b/mitm_tooling/extraction/__init__.py @@ -1,2 +1,4 @@ -from . import sql, df +from . import sql +from transformation import df + __all__ = ['sql', 'df'] \ No newline at end of file diff --git a/mitm_tooling/representation/intermediate_representation.py b/mitm_tooling/representation/intermediate_representation.py index b39f046..54c58d4 100644 --- a/mitm_tooling/representation/intermediate_representation.py +++ b/mitm_tooling/representation/intermediate_representation.py @@ -4,7 +4,7 @@ import itertools import logging from collections import defaultdict from collections.abc import Iterator, Iterable, Sequence, Mapping -from typing import TYPE_CHECKING, Self, Any +from typing import TYPE_CHECKING, Self, Any, Annotated import pandas as pd import pydantic @@ -63,9 +63,10 @@ class HeaderEntry(pydantic.BaseModel): itertools.chain(*zip(self.attributes, map(str, self.attribute_dtypes)))) + class Header(pydantic.BaseModel): mitm: MITM - header_entries: list[HeaderEntry] = pydantic.Field(default_factory=list) + header_entries: Annotated[list[HeaderEntry], pydantic.Field(default_factory=list)] @classmethod def from_df(cls, df: pd.DataFrame, mitm: MITM) -> Self: @@ -95,7 +96,7 @@ class MITMData(Iterable[tuple[ConceptName, pd.DataFrame]], pydantic.BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) header: Header - concept_dfs: dict[ConceptName, pd.DataFrame] = pydantic.Field(default_factory=dict) + concept_dfs: Annotated[dict[ConceptName, pd.DataFrame], pydantic.Field(default_factory=dict)] def __iter__(self): return iter(self.concept_dfs.items()) @@ -103,22 +104,25 @@ class MITMData(Iterable[tuple[ConceptName, pd.DataFrame]], pydantic.BaseModel): def as_generalized(self) -> Self: mitm_def = get_mitm_def(self.header.mitm) dfs = defaultdict(list) - for c, df in self: + for c, df in self.concept_dfs.items(): c = mitm_def.get_parent(c) dfs[c].append(df) - return MITMData(header=self.header, dfs=dict(dfs)) + dfs = {c : pd.concat(dfs_, axis='rows', ignore_index=True) for c, dfs_ in dfs.items()} + return MITMData(header=self.header, concept_dfs=dfs) def as_specialized(self) -> Self: mitm_def = get_mitm_def(self.header.mitm) - dfs = defaultdict(list) + dfs = {} for c, df in self: if mitm_def.get_properties(c).is_abstract: leaf_concepts = mitm_def.get_leafs(c) - for sub_c, idx in df.groupby('kind').groups.items(): - dfs[sub_c].append(df.loc[idx]) + + for sub_c_key, idx in df.groupby('kind').groups.items(): + sub_c = mitm_def.inverse_concept_key_map[str(sub_c_key)] + dfs[sub_c] = df.loc[idx] else: - dfs[c].append(df) - return MITMData(header=self.header, dfs=dict(dfs)) + dfs[c] = df + return MITMData(header=self.header, concept_dfs=dfs) class StreamingConceptData(pydantic.BaseModel): diff --git a/mitm_tooling/representation/sql_representation.py b/mitm_tooling/representation/sql_representation.py index 120998e..3a111d4 100644 --- a/mitm_tooling/representation/sql_representation.py +++ b/mitm_tooling/representation/sql_representation.py @@ -1,19 +1,18 @@ -from collections import defaultdict -from collections.abc import Callable, Iterator, Generator, Mapping +from collections.abc import Callable, Generator, Mapping import pydantic import sqlalchemy as sa import sqlalchemy.sql.schema -from pydantic import AnyUrl +from pydantic import AnyUrl, ConfigDict from mitm_tooling.data_types import MITMDataType from mitm_tooling.definition import MITMDefinition, ConceptProperties, OwnedRelations, ConceptName, MITM, get_mitm_def, \ - ConceptKind, ConceptLevel, RelationName + RelationName from mitm_tooling.definition.definition_tools import map_col_groups, ColGroupMaps -from mitm_tooling.extraction.sql.data_models import Queryable, TableName, ColumnName -from .df_representation import MITMDataset +from mitm_tooling.extraction.sql.data_models import Queryable, TableName from .intermediate_representation import Header, MITMData from mitm_tooling.utilities.sql_utils import create_sa_engine, qualify from mitm_tooling.utilities import python_utils +from mitm_tooling.utilities.io_utils import FilePath from sqlalchemy_utils.view import create_view @@ -74,7 +73,6 @@ def mk_table(meta: sa.MetaData, mitm: MITM, concept: ConceptName, table_name: Ta created_columns, ref_columns) constraints.extend(schema_items) - print(constraints) return sa.Table(table_name, meta, schema=SQL_REPRESENTATION_DEFAULT_SCHEMA, *columns, *constraints), created_columns, ref_columns @@ -104,6 +102,8 @@ ConceptTypeTablesDict = dict[ConceptName, dict[TableName, sa.Table]] class SQLRepresentationSchema(pydantic.BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + meta: sa.MetaData concept_tables: ConceptTablesDict type_tables: ConceptTypeTablesDict @@ -141,6 +141,7 @@ def mk_db_schema(header: Header, gen_views: Callable[ mitm_def.resolve_foreign_types(concept).items() for name, dt in resolved_fk.items()] }) + concept_tables[concept] = t for he in header.header_entries: he_concept = he.concept @@ -185,35 +186,40 @@ def mk_db_schema(header: Header, gen_views: Callable[ return SQLRepresentationSchema(meta=meta, concept_tables=concept_tables, type_tables=type_tables, views=views) -def insert_db_instances(engine: sa.Engine, meta: sa.MetaData, mitm_data: MITMData): +def insert_db_instances(engine: sa.Engine, sql_rep_schema: SQLRepresentationSchema, mitm_data: MITMData): + from mitm_tooling.transformation.df import pack_mitm_dataset, unpack_mitm_data + h = mitm_data.header + mitm = mitm_data.header.mitm + mitm_def = get_mitm_def(mitm) + mitm_dataset = unpack_mitm_data(mitm_data) with engine.connect() as conn: - h = mitm_data.header - mitm = mitm_data.header.mitm - - for concept, df in mitm_data.as_specialized(): - concept_table = mk_concept_table_name(mitm, concept) - t_concept = meta.tables[concept_table] - ref_cols = pick_table_pk(mitm, concept, t_concept.columns) - conn.execute(t_concept.insert(), df[[c.name for c in t_concept.columns]].to_dict('records')) - - if has_type_tables(mitm, concept): - concept_properties, concept_relations = get_mitm_def(mitm).get(concept) - for typ, idx in df.groupby(concept_properties.typing_concept).groups.items(): - type_df = df.loc[idx] - t_type = meta.tables[mk_type_table_name(mitm, concept, str(typ))] - conn.execute(t_type.insert(), type_df[[c.name for c in t_type.columns]].to_dict('records')) + for concept, typed_dfs in mitm_dataset: + concept_properties, concept_relations = mitm_def.get(concept) + for type_name, type_df in typed_dfs.items(): + + t_concept = sql_rep_schema.concept_tables[mitm_def.get_parent(concept)] + ref_cols = pick_table_pk(mitm, concept, t_concept.columns) + conn.execute(t_concept.insert(), type_df[[c.name for c in t_concept.columns]].to_dict('records')) + + if has_type_tables(mitm, concept): + #for typ, idx in df.groupby(concept_properties.typing_concept).groups.items(): + # type_df = df.loc[idx] + t_type = sql_rep_schema.type_tables[concept][type_name] + to_dict = type_df[[c.name for c in t_type.columns]].to_dict('records') + conn.execute(t_type.insert(), to_dict) + conn.commit() def insert_mitm_data(engine: sa.Engine, mitm_data: MITMData) -> SQLRepresentationSchema: sql_rep_schema = mk_db_schema(mitm_data.header) sql_rep_schema.meta.create_all(engine) - insert_db_instances(engine, sql_rep_schema.meta, mitm_data) + insert_db_instances(engine, sql_rep_schema, mitm_data) return sql_rep_schema -def mk_sqlite(mitm_data: MITMData, file_path: str | None = ':memory:') -> tuple[sa.Engine, SQLRepresentationSchema]: - engine = create_sa_engine(AnyUrl(f'sqlite:///{file_path}')) +def mk_sqlite(mitm_data: MITMData, file_path: FilePath | None = ':memory:') -> tuple[sa.Engine, SQLRepresentationSchema]: + engine = create_sa_engine(AnyUrl(f'sqlite:///{str(file_path)}')) sql_rep_schema = insert_mitm_data(engine, mitm_data) # print([f'{t.name}: {t.columns} {t.constraints}' for ts in sql_rep_schema.type_tables.values() for t in ts.values()]) return engine, sql_rep_schema diff --git a/mitm_tooling/transformation/__init__.py b/mitm_tooling/transformation/__init__.py index 81683a1..0bf5d16 100644 --- a/mitm_tooling/transformation/__init__.py +++ b/mitm_tooling/transformation/__init__.py @@ -1,2 +1,2 @@ -from . import superset -__all__ = ['superset'] \ No newline at end of file +from . import df, superset +__all__ = ['df','superset'] \ No newline at end of file diff --git a/mitm_tooling/extraction/df/__init__.py b/mitm_tooling/transformation/df/__init__.py similarity index 100% rename from mitm_tooling/extraction/df/__init__.py rename to mitm_tooling/transformation/df/__init__.py diff --git a/mitm_tooling/extraction/df/intermediate_transformation.py b/mitm_tooling/transformation/df/intermediate_transformation.py similarity index 96% rename from mitm_tooling/extraction/df/intermediate_transformation.py rename to mitm_tooling/transformation/df/intermediate_transformation.py index 8d67f04..7562a4b 100644 --- a/mitm_tooling/extraction/df/intermediate_transformation.py +++ b/mitm_tooling/transformation/df/intermediate_transformation.py @@ -1,6 +1,6 @@ import itertools from collections import defaultdict -from collections.abc import Sequence +from collections.abc import Sequence, Iterable import pandas as pd @@ -12,9 +12,7 @@ from mitm_tooling.representation import mk_concept_file_header from mitm_tooling.representation.common import guess_k_of_header_df, mk_header_file_columns -def pack_typed_dfs_as_concept_table(mitm: MITM, concept: ConceptName, dfs: Sequence[pd.DataFrame]) -> pd.DataFrame: - assert len(dfs) > 0 - +def pack_typed_dfs_as_concept_table(mitm: MITM, concept: ConceptName, dfs: Iterable[pd.DataFrame]) -> pd.DataFrame: normalized_dfs = [] for df in dfs: base_cols, col_dts = mk_concept_file_header(mitm, concept, 0) @@ -27,6 +25,7 @@ def pack_typed_dfs_as_concept_table(mitm: MITM, concept: ConceptName, dfs: Seque df.columns = squashed_form_cols normalized_dfs.append((df, k)) + assert len(normalized_dfs) > 0 max_k = max(normalized_dfs, key=lambda x: x[1])[1] squashed_form_cols = mk_concept_file_header(mitm, concept, max_k)[0] diff --git a/mitm_tooling/transformation/superset/dataset_definition.py b/mitm_tooling/transformation/superset/dataset_definition.py index ce83a6f..92190d4 100644 --- a/mitm_tooling/transformation/superset/dataset_definition.py +++ b/mitm_tooling/transformation/superset/dataset_definition.py @@ -71,7 +71,7 @@ class SupersetColumnDef(pydantic.BaseModel): expression: str | None = None description: str | None = None python_date_format: str = None - extra: dict[str, Any] = pydantic.Field(default_factory=dict) + extra: Annotated[dict[str, Any], pydantic.Field(default_factory=dict)] class SupersetTableDef(SupersetDefFile): diff --git a/mitm_tooling/transformation/superset/superset_representation.py b/mitm_tooling/transformation/superset/superset_representation.py index 4806cde..7fe29aa 100644 --- a/mitm_tooling/transformation/superset/superset_representation.py +++ b/mitm_tooling/transformation/superset/superset_representation.py @@ -70,13 +70,15 @@ def infer_superset_dataset_def(sqlite_file_path: FilePath) -> SupersetDef: cols = [] for c in table.columns: dt = table.column_properties[c].mitm_data_type + cols.append( SupersetColumnDef(column_name=c, is_dttm=dt is MITMDataType.Datetime, groupby=dt not in {MITMDataType.Json, MITMDataType.Numeric, MITMDataType.Datetime}, - type=str(dt.sa_sql_type) # .as_generic()) #.dialect_impl(sa.Dialect.get_dialect_cls(sa.URL.create(drivername='sqlite', database=':memory:'))() + type=(dt.sa_sql_type or MITMDataType.Text.sa_sql_type).compile( + dialect=engine.dialect) )) datasets.append( SupersetTableDef(table_name=table_name, schema_name=schema_name, uuid=uuid.uuid4(), columns=cols)) diff --git a/test/something.py b/test/something.py index 5b6b60d..069b720 100644 --- a/test/something.py +++ b/test/something.py @@ -1,9 +1,6 @@ import os import unittest -from Tools.scripts.generate_opcode_h import header - -from representation.sql_representation import mk_sqlite class MyTestCase(unittest.TestCase): @@ -30,7 +27,7 @@ class MyTestCase(unittest.TestCase): print() def test_writing_sqlite(self): - from mitm_tooling.representation import Header, HeaderEntry, mk_db_schema, MITMData + from mitm_tooling.representation import Header, HeaderEntry, mk_db_schema, MITMData, mk_sqlite from mitm_tooling.definition import MITM from mitm_tooling.data_types import MITMDataType h = Header(mitm=MITM.MAED, header_entries=[ @@ -44,6 +41,7 @@ class MyTestCase(unittest.TestCase): mk_sqlite(MITMData(header=h), file_path='gendb.sqlite') def test_with_synthetic(self): + from mitm_tooling.representation import mk_sqlite from mitm_tooling.io import importing from mitm_tooling.definition import MITM syn = importing.read_zip('synthetic.maed', MITM.MAED) diff --git a/test/test_to_df.py b/test/test_to_df.py index 7e39ba4..9e1b9ce 100644 --- a/test/test_to_df.py +++ b/test/test_to_df.py @@ -1,14 +1,14 @@ -import pandas as pd +import unittest -from mitm_tooling.extraction.df import unpack_mitm_data +from transformation.df import unpack_mitm_data +class MyTestCase(unittest.TestCase): + def test_to_df(self): + from mitm_tooling.io import importing + from mitm_tooling.definition import MITM + syn = importing.read_zip('synthetic.maed', MITM.MAED) + mitm_dataset = unpack_mitm_data(syn) -def test_to_df(): - from mitm_tooling.io import importing - from mitm_tooling.definition import MITM - syn = importing.read_zip('synthetic.maed', MITM.MAED) - mitm_dataset = unpack_mitm_data(syn) - - for c, typed_dfs in mitm_dataset: - for type_name, df in typed_dfs.items(): - print(df.head()) + for c, typed_dfs in mitm_dataset: + for type_name, df in typed_dfs.items(): + print(df.head()) -- GitLab