Commit f9284d80 authored by Leah Tacke genannt Unterberg

more work on sql and superset representation

parent b2754f3a
@@ -7,7 +7,9 @@ import pydantic
import sqlalchemy as sa
from sqlalchemy.sql import sqltypes
SA_SQLTypeClass = type[sa.types.TypeEngine]
SA_SQLType = sa.types.TypeEngine
SA_SQLTypeInstanceBuilder = Callable[[], SA_SQLType]
SA_SQLTypeClass = type[SA_SQLType]
SA_SQLTypeName = str
PandasCast = Callable[[pd.Series], pd.Series]
@@ -26,8 +28,14 @@ class MITMDataType(enum.StrEnum):
Infer = 'infer'
@property
def sa_sql_type(self) -> SA_SQLTypeClass | None:
return mitm_sql_type_map.get(self)
def sa_sql_type(self) -> SA_SQLType | None:
if pair := mitm_sql_type_map.get(self):
return pair[1]()
@property
def sa_sql_type_cls(self) -> SA_SQLTypeClass | None:
if pair := mitm_sql_type_map.get(self):
return pair[0]
@property
def pandas_cast(self) -> PandasCast | None:
@@ -35,7 +43,7 @@ class MITMDataType(enum.StrEnum):
@property
def sql_type_str(self) -> str:
return self.sa_sql_type.__name__
return self.sa_sql_type_cls.__name__
def wrap(self) -> 'WrappedMITMDataType':
return WrappedMITMDataType(mitm=self)
@@ -57,7 +65,7 @@ def sa_sql_to_mitm_type(sa_type: SA_SQLTypeClass) -> MITMDataType:
def mitm_to_sql_type(mitm_type: MITMDataType) -> SA_SQLTypeClass | None:
return mitm_type.sa_sql_type
return mitm_type.sa_sql_type_cls
def mitm_to_pandas(mitm_type: MITMDataType) -> PandasCast | None:
@@ -66,9 +74,9 @@ def mitm_to_pandas(mitm_type: MITMDataType) -> PandasCast | None:
def get_sa_sql_type(type_name: EitherDataType | WrappedMITMDataType) -> SA_SQLTypeClass | None:
if isinstance(type_name, MITMDataType):
return type_name.sa_sql_type
return type_name.sa_sql_type_cls
elif isinstance(type_name, WrappedMITMDataType):
return type_name.mitm.sa_sql_type
return type_name.mitm.sa_sql_type_cls
else:
if type_name and (t := getattr(sqltypes, type_name, None)):
if isinstance(t, type):
@@ -108,13 +116,13 @@ sql_mitm_type_map: dict[SA_SQLTypeClass, MITMDataType] = {
# sqltypes.BINARY: MITMDataType.Binary,
}
mitm_sql_type_map: dict[MITMDataType, SA_SQLTypeClass] = {
MITMDataType.Text: sqltypes.String,
MITMDataType.Datetime: sqltypes.DATETIME_TIMEZONE,
MITMDataType.Json: sqltypes.JSON,
MITMDataType.Boolean: sqltypes.Boolean,
MITMDataType.Integer: sqltypes.Integer,
MITMDataType.Numeric: sqltypes.Float,
mitm_sql_type_map: dict[MITMDataType, None | tuple[SA_SQLTypeClass, SA_SQLTypeInstanceBuilder]] = {
MITMDataType.Text: (sqltypes.String, sqltypes.String),
MITMDataType.Datetime: (sqltypes.DATETIME, lambda: sqltypes.DATETIME_TIMEZONE),
MITMDataType.Json: (sqltypes.JSON, sqltypes.JSON),
MITMDataType.Boolean: (sqltypes.Boolean, sqltypes.Boolean),
MITMDataType.Integer: (sqltypes.Integer, sqltypes.Integer),
MITMDataType.Numeric: (sqltypes.Float, sqltypes.Float),
MITMDataType.Unknown: None,
MITMDataType.Infer: None,
# MITMDataType.Binary: sqltypes.LargeBinary,
......
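A minimal usage sketch of the split accessors, assuming the MITMDataType enum and the tuple-valued mitm_sql_type_map from the hunk above (illustration only, not part of the module):

# dt.sa_sql_type_cls returns the bare TypeEngine subclass (pair[0]);
# dt.sa_sql_type builds a usable instance via pair[1]().
dt = MITMDataType.Text
dt.sa_sql_type_cls                  # -> sqltypes.String, the type class
dt.sa_sql_type                      # -> sqltypes.String(), a fresh instance
MITMDataType.Datetime.sa_sql_type   # -> the timezone-aware DATETIME_TIMEZONE instance
MITMDataType.Unknown.sa_sql_type    # -> None, since the map holds no (class, builder) pair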
from . import sql, df
from . import sql
from transformation import df
__all__ = ['sql', 'df']
\ No newline at end of file
@@ -4,7 +4,7 @@ import itertools
import logging
from collections import defaultdict
from collections.abc import Iterator, Iterable, Sequence, Mapping
from typing import TYPE_CHECKING, Self, Any
from typing import TYPE_CHECKING, Self, Any, Annotated
import pandas as pd
import pydantic
@@ -63,9 +63,10 @@ class HeaderEntry(pydantic.BaseModel):
itertools.chain(*zip(self.attributes, map(str, self.attribute_dtypes))))
class Header(pydantic.BaseModel):
mitm: MITM
header_entries: list[HeaderEntry] = pydantic.Field(default_factory=list)
header_entries: Annotated[list[HeaderEntry], pydantic.Field(default_factory=list)]
@classmethod
def from_df(cls, df: pd.DataFrame, mitm: MITM) -> Self:
@@ -95,7 +96,7 @@ class MITMData(Iterable[tuple[ConceptName, pd.DataFrame]], pydantic.BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
header: Header
concept_dfs: dict[ConceptName, pd.DataFrame] = pydantic.Field(default_factory=dict)
concept_dfs: Annotated[dict[ConceptName, pd.DataFrame], pydantic.Field(default_factory=dict)]
def __iter__(self):
return iter(self.concept_dfs.items())
@@ -103,22 +104,25 @@ class MITMData(Iterable[tuple[ConceptName, pd.DataFrame]], pydantic.BaseModel):
def as_generalized(self) -> Self:
mitm_def = get_mitm_def(self.header.mitm)
dfs = defaultdict(list)
for c, df in self:
for c, df in self.concept_dfs.items():
c = mitm_def.get_parent(c)
dfs[c].append(df)
return MITMData(header=self.header, dfs=dict(dfs))
dfs = {c : pd.concat(dfs_, axis='rows', ignore_index=True) for c, dfs_ in dfs.items()}
return MITMData(header=self.header, concept_dfs=dfs)
def as_specialized(self) -> Self:
mitm_def = get_mitm_def(self.header.mitm)
dfs = defaultdict(list)
dfs = {}
for c, df in self:
if mitm_def.get_properties(c).is_abstract:
leaf_concepts = mitm_def.get_leafs(c)
for sub_c, idx in df.groupby('kind').groups.items():
dfs[sub_c].append(df.loc[idx])
for sub_c_key, idx in df.groupby('kind').groups.items():
sub_c = mitm_def.inverse_concept_key_map[str(sub_c_key)]
dfs[sub_c] = df.loc[idx]
else:
dfs[c].append(df)
return MITMData(header=self.header, dfs=dict(dfs))
dfs[c] = df
return MITMData(header=self.header, concept_dfs=dfs)
class StreamingConceptData(pydantic.BaseModel):
......
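A round-trip sketch of the reworked MITMData helpers, assuming a header h and a dict of per-concept DataFrames dfs already exist (both names are placeholders):

data = MITMData(header=h, concept_dfs=dfs)
general = data.as_generalized()     # frames re-keyed and concatenated under their parent concepts
special = general.as_specialized()  # abstract concepts split back out via the 'kind' column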
from collections import defaultdict
from collections.abc import Callable, Iterator, Generator, Mapping
from collections.abc import Callable, Generator, Mapping
import pydantic
import sqlalchemy as sa
import sqlalchemy.sql.schema
from pydantic import AnyUrl
from pydantic import AnyUrl, ConfigDict
from mitm_tooling.data_types import MITMDataType
from mitm_tooling.definition import MITMDefinition, ConceptProperties, OwnedRelations, ConceptName, MITM, get_mitm_def, \
ConceptKind, ConceptLevel, RelationName
RelationName
from mitm_tooling.definition.definition_tools import map_col_groups, ColGroupMaps
from mitm_tooling.extraction.sql.data_models import Queryable, TableName, ColumnName
from .df_representation import MITMDataset
from mitm_tooling.extraction.sql.data_models import Queryable, TableName
from .intermediate_representation import Header, MITMData
from mitm_tooling.utilities.sql_utils import create_sa_engine, qualify
from mitm_tooling.utilities import python_utils
from mitm_tooling.utilities.io_utils import FilePath
from sqlalchemy_utils.view import create_view
@@ -74,7 +73,6 @@ def mk_table(meta: sa.MetaData, mitm: MITM, concept: ConceptName, table_name: Ta
created_columns,
ref_columns)
constraints.extend(schema_items)
print(constraints)
return sa.Table(table_name, meta, schema=SQL_REPRESENTATION_DEFAULT_SCHEMA, *columns,
*constraints), created_columns, ref_columns
@@ -104,6 +102,8 @@ ConceptTypeTablesDict = dict[ConceptName, dict[TableName, sa.Table]]
class SQLRepresentationSchema(pydantic.BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
meta: sa.MetaData
concept_tables: ConceptTablesDict
type_tables: ConceptTypeTablesDict
@@ -141,6 +141,7 @@ def mk_db_schema(header: Header, gen_views: Callable[
mitm_def.resolve_foreign_types(concept).items() for name, dt in
resolved_fk.items()]
})
concept_tables[concept] = t
for he in header.header_entries:
he_concept = he.concept
@@ -185,35 +186,40 @@ def mk_db_schema(header: Header, gen_views: Callable[
return SQLRepresentationSchema(meta=meta, concept_tables=concept_tables, type_tables=type_tables, views=views)
def insert_db_instances(engine: sa.Engine, meta: sa.MetaData, mitm_data: MITMData):
with engine.connect() as conn:
def insert_db_instances(engine: sa.Engine, sql_rep_schema: SQLRepresentationSchema, mitm_data: MITMData):
from mitm_tooling.transformation.df import pack_mitm_dataset, unpack_mitm_data
h = mitm_data.header
mitm = mitm_data.header.mitm
mitm_def = get_mitm_def(mitm)
mitm_dataset = unpack_mitm_data(mitm_data)
with engine.connect() as conn:
for concept, typed_dfs in mitm_dataset:
concept_properties, concept_relations = mitm_def.get(concept)
for type_name, type_df in typed_dfs.items():
for concept, df in mitm_data.as_specialized():
concept_table = mk_concept_table_name(mitm, concept)
t_concept = meta.tables[concept_table]
t_concept = sql_rep_schema.concept_tables[mitm_def.get_parent(concept)]
ref_cols = pick_table_pk(mitm, concept, t_concept.columns)
conn.execute(t_concept.insert(), df[[c.name for c in t_concept.columns]].to_dict('records'))
conn.execute(t_concept.insert(), type_df[[c.name for c in t_concept.columns]].to_dict('records'))
if has_type_tables(mitm, concept):
concept_properties, concept_relations = get_mitm_def(mitm).get(concept)
for typ, idx in df.groupby(concept_properties.typing_concept).groups.items():
type_df = df.loc[idx]
t_type = meta.tables[mk_type_table_name(mitm, concept, str(typ))]
conn.execute(t_type.insert(), type_df[[c.name for c in t_type.columns]].to_dict('records'))
#for typ, idx in df.groupby(concept_properties.typing_concept).groups.items():
# type_df = df.loc[idx]
t_type = sql_rep_schema.type_tables[concept][type_name]
to_dict = type_df[[c.name for c in t_type.columns]].to_dict('records')
conn.execute(t_type.insert(), to_dict)
conn.commit()
def insert_mitm_data(engine: sa.Engine, mitm_data: MITMData) -> SQLRepresentationSchema:
sql_rep_schema = mk_db_schema(mitm_data.header)
sql_rep_schema.meta.create_all(engine)
insert_db_instances(engine, sql_rep_schema.meta, mitm_data)
insert_db_instances(engine, sql_rep_schema, mitm_data)
return sql_rep_schema
def mk_sqlite(mitm_data: MITMData, file_path: str | None = ':memory:') -> tuple[sa.Engine, SQLRepresentationSchema]:
engine = create_sa_engine(AnyUrl(f'sqlite:///{file_path}'))
def mk_sqlite(mitm_data: MITMData, file_path: FilePath | None = ':memory:') -> tuple[sa.Engine, SQLRepresentationSchema]:
engine = create_sa_engine(AnyUrl(f'sqlite:///{str(file_path)}'))
sql_rep_schema = insert_mitm_data(engine, mitm_data)
# print([f'{t.name}: {t.columns} {t.constraints}' for ts in sql_rep_schema.type_tables.values() for t in ts.values()])
return engine, sql_rep_schema
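A usage sketch of the revised SQLite entry point, assuming an existing MITMData instance named mitm_data (placeholder name):

engine, sql_rep_schema = mk_sqlite(mitm_data, file_path='gendb.sqlite')
list(sql_rep_schema.concept_tables)  # concept -> sa.Table mapping populated by mk_db_schema
list(sql_rep_schema.type_tables)     # concept -> {type name -> sa.Table} for concepts with type tables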
from . import superset
__all__ = ['superset']
\ No newline at end of file
from . import df, superset
__all__ = ['df','superset']
\ No newline at end of file
import itertools
from collections import defaultdict
from collections.abc import Sequence
from collections.abc import Sequence, Iterable
import pandas as pd
@@ -12,9 +12,7 @@ from mitm_tooling.representation import mk_concept_file_header
from mitm_tooling.representation.common import guess_k_of_header_df, mk_header_file_columns
def pack_typed_dfs_as_concept_table(mitm: MITM, concept: ConceptName, dfs: Sequence[pd.DataFrame]) -> pd.DataFrame:
assert len(dfs) > 0
def pack_typed_dfs_as_concept_table(mitm: MITM, concept: ConceptName, dfs: Iterable[pd.DataFrame]) -> pd.DataFrame:
normalized_dfs = []
for df in dfs:
base_cols, col_dts = mk_concept_file_header(mitm, concept, 0)
@@ -27,6 +25,7 @@ def pack_typed_dfs_as_concept_table(mitm: MITM, concept: ConceptName, dfs: Seque
df.columns = squashed_form_cols
normalized_dfs.append((df, k))
assert len(normalized_dfs) > 0
max_k = max(normalized_dfs, key=lambda x: x[1])[1]
squashed_form_cols = mk_concept_file_header(mitm, concept, max_k)[0]
......
@@ -71,7 +71,7 @@ class SupersetColumnDef(pydantic.BaseModel):
expression: str | None = None
description: str | None = None
python_date_format: str = None
extra: dict[str, Any] = pydantic.Field(default_factory=dict)
extra: Annotated[dict[str, Any], pydantic.Field(default_factory=dict)]
class SupersetTableDef(SupersetDefFile):
......
@@ -70,13 +70,15 @@ def infer_superset_dataset_def(sqlite_file_path: FilePath) -> SupersetDef:
cols = []
for c in table.columns:
dt = table.column_properties[c].mitm_data_type
cols.append(
SupersetColumnDef(column_name=c,
is_dttm=dt is MITMDataType.Datetime,
groupby=dt not in {MITMDataType.Json,
MITMDataType.Numeric,
MITMDataType.Datetime},
type=str(dt.sa_sql_type) # .as_generic()) #.dialect_impl(sa.Dialect.get_dialect_cls(sa.URL.create(drivername='sqlite', database=':memory:'))()
type=(dt.sa_sql_type or MITMDataType.Text.sa_sql_type).compile(
dialect=engine.dialect)
))
datasets.append(
SupersetTableDef(table_name=table_name, schema_name=schema_name, uuid=uuid.uuid4(), columns=cols))
......
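A small sketch of the dialect-aware type rendering used above, substituting SQLAlchemy's SQLite dialect for engine.dialect (assumption for illustration):

from sqlalchemy.dialects import sqlite

# Compile the MITM type's SQL instance to a dialect-specific string;
# fall back to the Text mapping when a MITM type has no SQL counterpart.
rendered = (MITMDataType.Datetime.sa_sql_type or MITMDataType.Text.sa_sql_type).compile(dialect=sqlite.dialect())
# e.g. 'DATETIME' on SQLite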
import os
import unittest
from Tools.scripts.generate_opcode_h import header
from representation.sql_representation import mk_sqlite
class MyTestCase(unittest.TestCase):
@@ -30,7 +27,7 @@ class MyTestCase(unittest.TestCase):
print()
def test_writing_sqlite(self):
from mitm_tooling.representation import Header, HeaderEntry, mk_db_schema, MITMData
from mitm_tooling.representation import Header, HeaderEntry, mk_db_schema, MITMData, mk_sqlite
from mitm_tooling.definition import MITM
from mitm_tooling.data_types import MITMDataType
h = Header(mitm=MITM.MAED, header_entries=[
@@ -44,6 +41,7 @@ class MyTestCase(unittest.TestCase):
mk_sqlite(MITMData(header=h), file_path='gendb.sqlite')
def test_with_synthetic(self):
from mitm_tooling.representation import mk_sqlite
from mitm_tooling.io import importing
from mitm_tooling.definition import MITM
syn = importing.read_zip('synthetic.maed', MITM.MAED)
......
import pandas as pd
import unittest
from mitm_tooling.extraction.df import unpack_mitm_data
from transformation.df import unpack_mitm_data
def test_to_df():
class MyTestCase(unittest.TestCase):
def test_to_df(self):
from mitm_tooling.io import importing
from mitm_tooling.definition import MITM
syn = importing.read_zip('synthetic.maed', MITM.MAED)
......