diff --git a/mitm_tooling/transformation/superset/__init__.py b/mitm_tooling/transformation/superset/__init__.py index ca799a78438129c6cacb0fe970503798f1b07c24..92ec2129694f11853f453b2205a984425de7e1e9 100644 --- a/mitm_tooling/transformation/superset/__init__.py +++ b/mitm_tooling/transformation/superset/__init__.py @@ -1,2 +1,2 @@ -from .superset_representation import mk_inferred_superset_dataset_def, mk_superset_dataset_def +from .superset_representation import mk_inferred_superset_dataset_def, mk_superset_dataset_def, mk_inferred_superset_defs from . import dataset_definition, superset_representation \ No newline at end of file diff --git a/mitm_tooling/transformation/superset/dataset_definition.py b/mitm_tooling/transformation/superset/dataset_definition.py index 7f0a6dc4b7cf48c6df7a7fee573320e4325af11b..5eb9c405187cba11d3ed0e330f9ff85562720e80 100644 --- a/mitm_tooling/transformation/superset/dataset_definition.py +++ b/mitm_tooling/transformation/superset/dataset_definition.py @@ -1,9 +1,12 @@ from abc import ABC, abstractmethod from datetime import datetime, tzinfo -from typing import Any, Annotated +from typing import Any, Annotated, Literal import pydantic from uuid import UUID + +from pydantic import Field + from mitm_tooling.data_types import MITMDataType BetterUUID = Annotated[ @@ -41,6 +44,7 @@ class SupersetDatabaseDef(SupersetDefFile): }) impersonate_user: bool = False version: str = '1.0.0' + ssh_tunnel: None = None @property def filename(self): @@ -55,7 +59,7 @@ class SupersetMetricDef(pydantic.BaseModel): description: str | None = None d3format: str | None = None currency: str | None = None - extra: dict[str, Any] = pydantic.Field(default_factory=dict) + extra: dict[str, Any] = Field(default_factory=dict) warning_text: str | None = None @@ -71,10 +75,10 @@ class SupersetColumnDef(pydantic.BaseModel): expression: str | None = None description: str | None = None python_date_format: str = None - extra: Annotated[dict[str, Any], pydantic.Field(default_factory=dict)] + extra: dict[str, Any] = pydantic.Field(default_factory=dict) -class SupersetTableDef(SupersetDefFile): +class SupersetDatasetDef(SupersetDefFile): model_config = pydantic.ConfigDict(populate_by_name=True) table_name: str @@ -87,7 +91,7 @@ class SupersetTableDef(SupersetDefFile): offset: int = 0 cache_timeout: str | None = None catalog: str | None = None - sql: str = '' + sql: str | None = None params: Any = None template_params: Any = None filter_select_enabled: bool = True @@ -97,24 +101,27 @@ class SupersetTableDef(SupersetDefFile): always_filter_main_dttm: bool = False metrics: list[SupersetMetricDef] = pydantic.Field(default_factory=list) columns: list[SupersetColumnDef] = pydantic.Field(default_factory=list) + version: str = '1.0.0' @property def filename(self): return self.table_name -BetterDatetime = Annotated[datetime, +StrDatetime = Annotated[datetime, pydantic.BeforeValidator(lambda x: datetime.fromisoformat(x) if isinstance(x, str) else x), pydantic.PlainSerializer(lambda x: str(x)), pydantic.Field( description="Better annotation for datetime, parses from string format. Serializes to string format." )] +MetadataType = Literal['Database', 'SqlaTable', 'Slice'] + class SupersetMetadataDef(SupersetDefFile): version: str = '1.0.0' - type: str = 'Database' - timestamp: BetterDatetime = pydantic.Field(default_factory=datetime.utcnow) + type: MetadataType = 'SqlaTable' + timestamp: StrDatetime = pydantic.Field(default_factory=datetime.utcnow) @property def filename(self) -> str: @@ -123,12 +130,11 @@ class SupersetMetadataDef(SupersetDefFile): class SupersetDef(pydantic.BaseModel): database: SupersetDatabaseDef - datasets: list[SupersetTableDef] + datasets: list[SupersetDatasetDef] metadata: SupersetMetadataDef = pydantic.Field(default_factory=SupersetMetadataDef) def to_folder_structure(self) -> dict[str, Any]: db_name = self.database.database_name - db_folder = {db_name: self.database} - datasets = list(self.datasets) - dataset_folder = {db_name: datasets} - return {'databases': [self.database], 'datasets': dataset_folder, '.': self.metadata} + folder = {'.': self.metadata, 'databases': [{db_name: self.database}], + 'datasets': {db_name: list(self.datasets)}} + return {'my_import': folder} diff --git a/mitm_tooling/transformation/superset/superset_representation.py b/mitm_tooling/transformation/superset/superset_representation.py index b40cda47e78b627684ae5f3545c18ff611933881..44d2368512f5c665fda9e22284f19651d2d21755 100644 --- a/mitm_tooling/transformation/superset/superset_representation.py +++ b/mitm_tooling/transformation/superset/superset_representation.py @@ -14,7 +14,8 @@ from mitm_tooling.representation.sql_representation import MITMData, mk_sqlite, from mitm_tooling.data_types import MITMDataType -from .dataset_definition import SupersetTableDef, SupersetColumnDef, SupersetDatabaseDef, SupersetDef, SupersetDefFile +from .dataset_definition import SupersetDatasetDef, SupersetColumnDef, SupersetDatabaseDef, SupersetDef, \ + SupersetDefFile, SupersetMetadataDef, SupersetMetricDef def tentative_superset_mount_url(db_name: str) -> AnyUrl: @@ -69,20 +70,26 @@ def infer_superset_dataset_def(sqlite_file_path: FilePath) -> SupersetDef: for schema_name, schema_tables in db_meta.db_structure.items(): for table_name, table in schema_tables.items(): cols = [] + metrics = [SupersetMetricDef(metric_name='COUNT(*)', verbose_name='Count', expression='COUNT(*)')] for c in table.columns: dt = table.column_properties[c].mitm_data_type - cols.append( SupersetColumnDef(column_name=c, is_dttm=dt is MITMDataType.Datetime, groupby=dt not in {MITMDataType.Json, - MITMDataType.Numeric, - MITMDataType.Datetime}, + MITMDataType.Numeric}, type=(dt.sa_sql_type or MITMDataType.Text.sa_sql_type).compile( dialect=engine.dialect) )) + if dt in {MITMDataType.Numeric, MITMDataType.Integer}: + metrics.extend(( + SupersetMetricDef(metric_name=f'AVG({c})', verbose_name=f'AVG({c})', expression=f'AVG({c})'), + SupersetMetricDef(metric_name=f'SUM({c})', verbose_name=f'SUM({c})', expression=f'SUM({c})') + )) + datasets.append( - SupersetTableDef(table_name=table_name, schema_name=schema_name, uuid=uuid.uuid4(), database_uuid=database_uuid, columns=cols)) + SupersetDatasetDef(table_name=table_name, schema_name=schema_name, uuid=uuid.uuid4(), + database_uuid=database_uuid, columns=cols, metrics=metrics)) db_name = os.path.splitext(os.path.basename(sqlite_file_path))[0] return SupersetDef( @@ -93,7 +100,16 @@ def infer_superset_dataset_def(sqlite_file_path: FilePath) -> SupersetDef: def mk_inferred_superset_dataset_def(output_path: FilePath, sqlite_file_path: FilePath): - write_superset_def(output_path, infer_superset_dataset_def(sqlite_file_path)) + dataset_def = infer_superset_dataset_def(sqlite_file_path) + write_superset_def(output_path, dataset_def) + + +def mk_inferred_superset_defs(output_path_base: FilePath, sqlite_file_path: FilePath): + dataset_def = infer_superset_dataset_def(sqlite_file_path) + a = dataset_def.model_copy(update={'metadata': SupersetMetadataDef(type='Database')}) + b = dataset_def.model_copy(update={'metadata': SupersetMetadataDef(type='SqlaTable')}) + write_superset_def(output_path_base + '_db.zip', a) + write_superset_def(output_path_base + '_ds.zip', b) def mk_superset_dataset_def(mitm_data: MITMData, sqlite_file_path: str | None = ':memory:', diff --git a/pyproject.toml b/pyproject.toml index c331ee1dc0d20319406378c926c32639bff80a71..5fc080f98ccba4e4cdd4e37b23d7aa17037941e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "mitm-tooling" -version = "0.3.1" +version = "0.3.2" description = "" authors = ["Leah Tacke genannt Unterberg <leah.tgu@pads.rwth-aachen.de>"] readme = "README.md" @@ -12,9 +12,10 @@ pydantic = "^2.9.2" pyyaml = "6.0.2" genson = "^1.3.0" -sqlalchemy = { version = "^2.0.34", extras = ["postgresql-psycopg", "postgresql"] } +sqlalchemy = { version = ">=2", extras = ["postgresql-psycopg", "postgresql"] } sqlalchemy-utils = ">=0.38.0" -pandas = { version = "^2.0", extras = ["performance", "excel", "hdf5", "output_formatting", "computation", "postgresql", "mysql", "sql-other", "plot", "compression"] } +numpy = ">=2" +pandas = { version = ">=2.0", extras = ["performance", "excel", "hdf5", "output_formatting", "computation", "postgresql", "mysql", "sql-other", "plot", "compression"] } # visualization matplotlib = "*" diff --git a/test/something.py b/test/something.py index 069b7203927948e29c3cf4c2eb4bd6ffb4301b4f..41ca5d18a652bdeb9aafb1c54c0fe2e85be73521 100644 --- a/test/something.py +++ b/test/something.py @@ -20,10 +20,11 @@ class MyTestCase(unittest.TestCase): HeaderEntry(concept='segment_data', kind='SD', type_name='annotation_info', attributes=['y'], attribute_dtypes=[MITMDataType.Json]), ]) - meta, tables = mk_db_schema(h) - print(meta) + sql_rep = mk_db_schema(h) + print(sql_rep.meta) print() - print(tables) + print(sql_rep.concept_tables) + print(sql_rep.type_tables) print() def test_writing_sqlite(self): @@ -49,8 +50,9 @@ class MyTestCase(unittest.TestCase): mk_sqlite(syn, 'synthetic.sqlite') def test_superset(self): - from mitm_tooling.transformation.superset import mk_inferred_superset_dataset_def - mk_inferred_superset_dataset_def('superset_import.zip', 'synthetic.sqlite') + from mitm_tooling.transformation.superset import mk_inferred_superset_dataset_def, mk_inferred_superset_defs + mk_inferred_superset_dataset_def('superset_import', 'synthetic.sqlite') + mk_inferred_superset_defs('superset_import', 'synthetic.sqlite') if __name__ == '__main__':