diff --git a/justfile b/justfile new file mode 100644 index 0000000000000000000000000000000000000000..b39ef285316abbf6240f7b0b2d0e53aae589e91b --- /dev/null +++ b/justfile @@ -0,0 +1,18 @@ +set shell := ["pwsh", "-c"] + +lock: + @poetry lock --no-update + +update: + @poetry update + +build: + @poetry build + +release: + @poetry publish --build + +requirements: + @poetry export --without-hashes -f requirements.txt > requirements.txt + +preflight: lock requirements build diff --git a/mitm_tooling/data_types/convert.py b/mitm_tooling/data_types/convert.py index 922481ab60f08cf77ba771e5b6141b5105eb3b19..b4b50d06fb3dd76fa9765cdbb0df1324e4496f59 100644 --- a/mitm_tooling/data_types/convert.py +++ b/mitm_tooling/data_types/convert.py @@ -1,6 +1,6 @@ import pandas as pd -from data_types import MITMDataType +from .data_types import MITMDataType class ColumnDataTypeConversionException(Exception): @@ -12,18 +12,18 @@ def convert_df(df: pd.DataFrame, data_types: dict[str, MITMDataType], inplace=Fa for col, dt in data_types.items(): try: - if inplace: - convert_df_col(df, col, dt, inplace=True) - else: - res[col] = convert_df_col(df, col, dt, inplace=False) + #if inplace: + # df[col] = convert_df_col(df, col, dt, inplace=False) + #else: + res[col] = convert_df_col(df, col, dt, inplace=False) except Exception as e: - raise ColumnDataTypeConversionException(f'Conversion of feature \'{col}\' to {dt} failed:\n' + str(e)) + raise ColumnDataTypeConversionException(f'Conversion of feature \'{col}\' to {dt} failed:\n{e}') return res def convert_df_col(df: pd.DataFrame, col: str, data_type: MITMDataType, inplace=False): - cast = data_type.pandas_cast(df.loc[:, col]) + cast = data_type.pandas_cast(df[col]) if inplace: cast[col] = cast return cast diff --git a/mitm_tooling/data_types/data_types.py b/mitm_tooling/data_types/data_types.py index 28f6b3f2b246dc105152430014ef390184363816..056114f536f5c364b665985bddf029ab0bf8ad07 100644 --- a/mitm_tooling/data_types/data_types.py +++ b/mitm_tooling/data_types/data_types.py @@ -122,9 +122,9 @@ mitm_sql_type_map: dict[MITMDataType, SA_SQLTypeClass] = { } mitm_pandas_type_mappers: dict[MITMDataType, PandasCast] = { - MITMDataType.Text: lambda s: s.astype('str'), + MITMDataType.Text: lambda s: s.astype(pd.StringDtype()), MITMDataType.Datetime: lambda s: pd.to_datetime(s, utc=True, errors='coerce', format='mixed'), - MITMDataType.Json: lambda s: s.astype('str').apply(json.loads), + MITMDataType.Json: lambda s: s.astype(pd.StringDtype()).apply(json.loads), MITMDataType.Boolean: lambda s: s.astype('Boolean'), MITMDataType.Integer: lambda s: s.astype('Int64'), MITMDataType.Numeric: lambda s: s.astype('Float64'), diff --git a/mitm_tooling/definition/definition_representation.py b/mitm_tooling/definition/definition_representation.py index 2b9eda8b6561a8b45d12bcbdbf4b6def859d50ef..54e177b2b81e9dabe0cf63e488295e1bb3868d89 100644 --- a/mitm_tooling/definition/definition_representation.py +++ b/mitm_tooling/definition/definition_representation.py @@ -100,6 +100,10 @@ class MITMDefinition(pydantic.BaseModel): elif concept in (pm := self.parent_concepts_map): return pm[concept] + def get(self, concept: ConceptName) -> tuple[ConceptProperties, OwnedRelations]: + a, b = self.get_properties(concept), self.get_relations(concept) + return a, b + def get_properties(self, concept: ConceptName) -> ConceptProperties | None: return self.concept_properties.get(concept, None) diff --git a/mitm_tooling/definition/definition_tools.py b/mitm_tooling/definition/definition_tools.py index 269c3da2df3f2c7435ca3439d33f5b9bc5742cca..59178ed8f2ae5f350dffc8be292982a9b5716bb9 100644 --- a/mitm_tooling/definition/definition_tools.py +++ b/mitm_tooling/definition/definition_tools.py @@ -9,10 +9,10 @@ from mitm_tooling.utilities.python_utils import elem_wise_eq T = TypeVar('T') -def dummy(_): return None +def dummy(): return None -def dummy_list(_): return [] +def dummy_list(): return [] Mapper = Callable[[], T | tuple[str, T]] diff --git a/mitm_tooling/extraction/sql/data_models/__init__.py b/mitm_tooling/extraction/sql/data_models/__init__.py index 03aeee544523d5adaa5f3a493105ffafed9384bf..dfb226d78013b6c733300f94f53033691d08aefc 100644 --- a/mitm_tooling/extraction/sql/data_models/__init__.py +++ b/mitm_tooling/extraction/sql/data_models/__init__.py @@ -1,10 +1,10 @@ # noinspection PyUnresolvedReferences from .db_meta import Queryable, TableMetaInfo, DBMetaInfo, ForeignKeyConstraint, ExplicitTableSelection, \ - ExplicitColumnSelection, ExplicitSelectionUtils + ExplicitColumnSelection, ExplicitSelectionUtils, ColumnName # noinspection PyUnresolvedReferences from .db_probe import TableProbe, DBProbe, SampleSummary # noinspection PyUnresolvedReferences -from .table_identifiers import SourceDBType, SchemaName, TableName, ColumnName, TableIdentifier, AnyTableIdentifier, \ +from .table_identifiers import SourceDBType, SchemaName, TableName, TableIdentifier, AnyTableIdentifier, \ LocalTableIdentifier, AnyLocalTableIdentifier, ShortTableIdentifier, LongTableIdentifier # noinspection PyUnresolvedReferences from .virtual_view import TypedRawQuery, VirtualView, VirtualDB, CompiledVirtualView diff --git a/mitm_tooling/extraction/sql/data_models/db_meta.py b/mitm_tooling/extraction/sql/data_models/db_meta.py index e63f8f4390077684ec576f65090bc680619778f9..44903b5b9b923311d1f798fa6b5c071f650ea721 100644 --- a/mitm_tooling/extraction/sql/data_models/db_meta.py +++ b/mitm_tooling/extraction/sql/data_models/db_meta.py @@ -7,8 +7,9 @@ from pydantic import Field from sqlalchemy import Table, MetaData from mitm_tooling.data_types import MITMDataType, SA_SQLTypeName, sa_sql_to_mitm_type -from .table_identifiers import TableName, SchemaName, ColumnName, ShortTableIdentifier, LocalTableIdentifier, \ +from .table_identifiers import TableName, SchemaName, ShortTableIdentifier, LocalTableIdentifier, \ AnyLocalTableIdentifier +from mitm_tooling.representation.intermediate_representation import ColumnName from mitm_tooling.utilities.sql_utils import unqualify ExplicitTableSelection = dict[SchemaName, set[TableName]] diff --git a/mitm_tooling/extraction/sql/data_models/db_probe.py b/mitm_tooling/extraction/sql/data_models/db_probe.py index 6295754dd9434d6fde6ff20da001aef97750d295..09f49613cb5125ca8e025755f8dffc5ad4c625b3 100644 --- a/mitm_tooling/extraction/sql/data_models/db_probe.py +++ b/mitm_tooling/extraction/sql/data_models/db_probe.py @@ -5,9 +5,9 @@ import pydantic from pydantic import NonNegativeInt, Field from mitm_tooling.data_types.data_types import MITMDataType -from .db_meta import TableMetaInfoBase, DBMetaInfoBase, DBMetaInfo +from .db_meta import TableMetaInfoBase, DBMetaInfoBase, DBMetaInfo, ColumnName from .probe_models import SampleSummary -from .table_identifiers import ColumnName, ShortTableIdentifier +from .table_identifiers import ShortTableIdentifier logger = logging.getLogger('api') diff --git a/mitm_tooling/extraction/sql/data_models/table_identifiers.py b/mitm_tooling/extraction/sql/data_models/table_identifiers.py index 5892281214f55e32bb4d12c8ecd35be481249619..1f9f4d155274131c074dc16edc8e1b1c57a1b86e 100644 --- a/mitm_tooling/extraction/sql/data_models/table_identifiers.py +++ b/mitm_tooling/extraction/sql/data_models/table_identifiers.py @@ -11,7 +11,6 @@ if TYPE_CHECKING: TableName = str SchemaName = str -ColumnName = str ShortTableIdentifier = tuple[SchemaName, TableName] QualifiedTableName = str diff --git a/mitm_tooling/extraction/sql/db/__init__.py b/mitm_tooling/extraction/sql/db/__init__.py index 5c5c12179599c801e0f161964be4d67fbe5dfcde..ed6feed6c9c413c46cf47a3105abbd91b1bff0b7 100644 --- a/mitm_tooling/extraction/sql/db/__init__.py +++ b/mitm_tooling/extraction/sql/db/__init__.py @@ -1,5 +1,5 @@ # noinspection PyUnresolvedReferences -from .db_connection import create_sa_engine +from mitm_tooling.utilities.sql_utils import create_sa_engine # noinspection PyUnresolvedReferences from .db_reflection import connect_and_reflect, derive_table_meta_info # noinspection PyUnresolvedReferences diff --git a/mitm_tooling/extraction/sql/db/db_connection.py b/mitm_tooling/extraction/sql/db/db_connection.py index 3739d530afafdbab705f77057e94142732348c68..b28b04f643122b019e912540f228c8ed20be9eeb 100644 --- a/mitm_tooling/extraction/sql/db/db_connection.py +++ b/mitm_tooling/extraction/sql/db/db_connection.py @@ -1,14 +1,3 @@ -import sqlite3 -import typing -from pathlib import Path -import sqlalchemy as sa -from pydantic import AnyUrl -from sqlalchemy import Engine -def create_sa_engine(db_url: AnyUrl, sqlite_extensions: list[str] | None = None, test_engine: bool = False, - **engine_kwargs) -> Engine: - engine = sa.create_engine(str(db_url), **engine_kwargs) - - return engine diff --git a/mitm_tooling/extraction/sql/db/db_probing.py b/mitm_tooling/extraction/sql/db/db_probing.py index f9dadf6eb0db60de3bed1ea9e272b299b3c69e0b..008921f9acc4ac4c511757a5ea5ef0e3ac52704d 100644 --- a/mitm_tooling/extraction/sql/db/db_probing.py +++ b/mitm_tooling/extraction/sql/db/db_probing.py @@ -13,8 +13,7 @@ from sqlalchemy.orm import Session from sqlalchemy.sql import sqltypes from mitm_tooling.data_types import MITMDataType -from ..data_models import ColumnName -from ..data_models import TableMetaInfo, DBMetaInfo +from ..data_models import TableMetaInfo, DBMetaInfo, ColumnName from ..data_models import TableProbe, DBProbe from ..data_models.probe_models import NumericSummaryStatistics, DatetimeSummaryStatistics, \ CategoricalSummaryStatistics, SampleSummary diff --git a/mitm_tooling/extraction/sql/transformation/db_transformation.py b/mitm_tooling/extraction/sql/transformation/db_transformation.py index b36b1e85109037c966a02e15bffc69355814e09f..d8417858ad26e1c8c08b5a364b2d49904ed9ed54 100644 --- a/mitm_tooling/extraction/sql/transformation/db_transformation.py +++ b/mitm_tooling/extraction/sql/transformation/db_transformation.py @@ -15,7 +15,7 @@ from sqlalchemy.sql import sqltypes from mitm_tooling.data_types import get_sa_sql_type, SQL_DataType, WrappedMITMDataType, MITMDataType, get_pandas_cast from mitm_tooling.utilities.python_utils import ExtraInfoExc from ..data_models import DBMetaInfo, TypedRawQuery -from ..data_models import TableName, ColumnName, SourceDBType, TableIdentifier, AnyTableIdentifier +from ..data_models import ColumnName, TableName, SourceDBType, TableIdentifier, AnyTableIdentifier from .df_transformation import extract_json_path, PandasSeriesTransform, PandasCreation, PandasDataframeTransform logger = logging.getLogger('api') diff --git a/mitm_tooling/io/exporting.py b/mitm_tooling/io/exporting.py index f2cc7a237ff185a1796b3a02b9e36dd3369437c0..79329c514754fb28927deb41b85573458af2fae8 100644 --- a/mitm_tooling/io/exporting.py +++ b/mitm_tooling/io/exporting.py @@ -14,8 +14,8 @@ import pandas as pd from mitm_tooling.definition import MITM, ConceptName, get_mitm_def from mitm_tooling.representation.intermediate_representation import HeaderEntry, Header, StreamingConceptData, MITMData, \ StreamingMITMData -from representation.file_representation import write_header_file, write_data_file -from utilities.io_utils import DataSink, ByteSink, use_bytes_io, ensure_ext +from mitm_tooling.representation.file_representation import write_header_file, write_data_file +from mitm_tooling.utilities.io_utils import DataSink, ByteSink, use_bytes_io, ensure_ext logger = logging.getLogger('api') diff --git a/mitm_tooling/io/importing.py b/mitm_tooling/io/importing.py index 8be8486359f4954867e1d5a7bc4061bf4a46d4a8..d1cdb7c9a3fee3b59d2e1d145eba44650bb1d4b1 100644 --- a/mitm_tooling/io/importing.py +++ b/mitm_tooling/io/importing.py @@ -6,14 +6,12 @@ from abc import ABC, abstractmethod import pandas as pd import pydantic -from numba.cuda.cudaimpl import lower import io -from definition import MITM, get_mitm_def +from mitm_tooling.definition import MITM, get_mitm_def from mitm_tooling.representation.file_representation import read_header_file, read_data_file from mitm_tooling.representation.intermediate_representation import MITMData, Header -from mitm_tooling.utilities.io_utils import DataSource, use_for_pandas_io, use_bytes_io, ensure_ext -from utilities.io_utils import FilePath +from mitm_tooling.utilities.io_utils import DataSource, use_for_pandas_io, use_bytes_io, ensure_ext, FilePath logger = logging.getLogger('api') @@ -64,8 +62,7 @@ class FolderImport(FileImport): for concept in mitm_def.main_concepts: fn = ensure_ext(mitm_def.get_properties(concept).plural, '.csv') if fn in file_names: - with use_for_pandas_io(fn) as cf: - parts[concept] = read_data_file(cf, target_mitm=self.mitm, target_concept=concept, normalize=True) + parts[concept] = read_data_file(fn, target_mitm=self.mitm, target_concept=concept, normalize=True) return MITMData(header=Header.from_df(parts.pop('header'), self.mitm), concept_dfs=parts) diff --git a/mitm_tooling/representation/__init__.py b/mitm_tooling/representation/__init__.py index cdc6e474e7c94253a7f5d88d33a1cce9c2e98ad3..966933e7a4a5158c0f13ec2865b4cca29b4063eb 100644 --- a/mitm_tooling/representation/__init__.py +++ b/mitm_tooling/representation/__init__.py @@ -2,5 +2,5 @@ from . import intermediate_representation from . import file_representation from . import sql_representation from .file_representation import mk_concept_file_header, write_header_file, write_data_file, read_data_file, read_header_file -from .intermediate_representation import HeaderEntry, Header, MITMData, StreamingMITMData, StreamingConceptData -from .sql_representation import mk_db_schema +from .intermediate_representation import HeaderEntry, Header, MITMData, StreamingMITMData, StreamingConceptData, ColumnName +from .sql_representation import mk_db_schema, insert_mitm_data, mk_sqlite diff --git a/mitm_tooling/representation/common.py b/mitm_tooling/representation/common.py index 07c64da5dcfa1648b4c78acb3de2088de0818d0d..96e84dbc93bc0e90efa0bc2b9517afd1b6db9c93 100644 --- a/mitm_tooling/representation/common.py +++ b/mitm_tooling/representation/common.py @@ -1,2 +1,2 @@ def guess_k(df): - return sum((1 for c in df.columns if c.startswith('a_'))) + return sum((1 for c in df.columns if c.startswith('a_') and not c.startswith('a_dt'))) diff --git a/mitm_tooling/representation/file_representation.py b/mitm_tooling/representation/file_representation.py index 7b16e089c22046a7f5da4a530886fa5e8f839aba..2bddd8b24bf2c471e46b293c26f601f56a7e3bcc 100644 --- a/mitm_tooling/representation/file_representation.py +++ b/mitm_tooling/representation/file_representation.py @@ -5,15 +5,14 @@ from typing import BinaryIO, TextIO import pandas as pd -import _io -from data_types import MITMDataType -from data_types.convert import convert_df +from mitm_tooling.data_types import MITMDataType +from mitm_tooling.data_types.convert import convert_df from mitm_tooling.definition import get_mitm_def, MITM, ConceptName from mitm_tooling.definition.definition_tools import map_col_groups -from representation.common import guess_k -from utilities.io_utils import DataSink, DataSource, use_for_pandas_io, FilePath, ensure_directory_exists, ensure_ext -from utilities.python_utils import i_th +from mitm_tooling.representation.common import guess_k +from mitm_tooling.utilities.io_utils import DataSink, DataSource, use_for_pandas_io, FilePath, ensure_directory_exists, ensure_ext +from mitm_tooling.utilities.python_utils import i_th def mk_header_file_columns(k: int) -> list[str]: @@ -23,7 +22,7 @@ def mk_header_file_columns(k: int) -> list[str]: def mk_concept_file_header(mitm: MITM, concept: ConceptName, k: int) -> tuple[list[str], dict[str, MITMDataType]]: mitm_def = get_mitm_def(mitm) - dts, _ = map_col_groups(mitm_def, concept, { + _, dts = map_col_groups(mitm_def, concept, { 'kind': lambda: ('kind', MITMDataType.Text), 'type': lambda: (mitm_def.get_properties(concept).typing_concept, MITMDataType.Text), 'identity': lambda: mitm_def.resolve_identity_type(concept).items(), @@ -34,7 +33,7 @@ def mk_concept_file_header(mitm: MITM, concept: ConceptName, k: int) -> tuple[li 'attributes': lambda: [(f'a_{i}', MITMDataType.Unknown) for i in range(1, k + 1)], }) - return i_th(0, list)(dts), dict(dts) + return list(dts.keys()), dict(dts) def write_header_file(df: pd.DataFrame, sink: DataSink) -> None: @@ -54,7 +53,7 @@ def read_header_file(source: DataSource, normalize: bool = False) -> pd.DataFram df = pd.read_csv(f, sep=';') if normalize: k = guess_k(df) - df = df.reindex(columns=mk_header_file_columns(k)).astype('str') + df = df.astype(pd.StringDtype()).reindex(columns=mk_header_file_columns(k)) return df diff --git a/mitm_tooling/representation/intermediate_representation.py b/mitm_tooling/representation/intermediate_representation.py index 374b6b3e1b2ac24af5ddba8e64460ba4ea801873..96d4989b82eec92f2218c68953bf35994c2b7123 100644 --- a/mitm_tooling/representation/intermediate_representation.py +++ b/mitm_tooling/representation/intermediate_representation.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import itertools import logging from collections.abc import Iterator, Iterable, Sequence @@ -7,14 +9,14 @@ import pandas as pd import pydantic from pydantic import ConfigDict -from definition import get_mitm_def +from mitm_tooling.definition import get_mitm_def from mitm_tooling.data_types.data_types import MITMDataType from mitm_tooling.definition.definition_representation import ConceptName, MITM -from mitm_tooling.extraction.sql.data_models.table_identifiers import ColumnName -from representation.common import guess_k -from representation.file_representation import mk_header_file_columns +from .common import guess_k +from .file_representation import mk_header_file_columns logger = logging.getLogger('api') +ColumnName = str class HeaderEntry(pydantic.BaseModel): @@ -24,6 +26,12 @@ class HeaderEntry(pydantic.BaseModel): attributes: list[ColumnName] attribute_dtypes: list[MITMDataType] + @pydantic.model_validator(mode='after') + def attr_check(self): + if not len(self.attributes) == len(self.attribute_dtypes): + raise ValueError('Length of specified attributes and their data types differs.') + return self + @classmethod def from_row(cls, row: Sequence[str], mitm: MITM) -> Self | None: kind, type_name = row[0], row[1] @@ -33,14 +41,15 @@ class HeaderEntry(pydantic.BaseModel): return None attrs, attr_dts = [], [] - for a, a_dt in zip(row[2:], row[3:]): - attrs.append(a) - try: - mitm_dt = MITMDataType(a_dt.lower()) if a_dt else MITMDataType.Unknown - attr_dts.append(mitm_dt) - except ValueError: - logger.error(f'Encountered unrecognized data type during header import: {a_dt}.') - return None + for a, a_dt in zip(row[slice(2, None, 2)], row[slice(3, None, 2)]): + if pd.notna(a) and pd.notna(a_dt): + attrs.append(a) + try: + mitm_dt = MITMDataType(a_dt.lower()) if a_dt else MITMDataType.Unknown + attr_dts.append(mitm_dt) + except ValueError: + logger.error(f'Encountered unrecognized data type during header import: {a_dt}.') + return None return HeaderEntry(concept=concept, kind=kind, type_name=type_name, attributes=attrs, attribute_dtypes=attr_dts) def get_k(self) -> int: @@ -53,7 +62,7 @@ class HeaderEntry(pydantic.BaseModel): class Header(pydantic.BaseModel): mitm: MITM - header_entries: list[HeaderEntry] + header_entries: list[HeaderEntry] = pydantic.Field(default_factory=list) @classmethod def from_df(cls, df: pd.DataFrame, mitm: MITM) -> Self: @@ -73,23 +82,25 @@ class MITMData(Iterable[tuple[ConceptName, pd.DataFrame]], pydantic.BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) header: Header - concept_dfs: dict[ConceptName, pd.DataFrame] + concept_dfs: dict[ConceptName, pd.DataFrame] = pydantic.Field(default_factory=dict) def __iter__(self): - return iter(self.concept_dfs) + return iter(self.concept_dfs.items()) class StreamingConceptData(pydantic.BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) structure_df: pd.DataFrame - chunk_iterators: list[Iterator[tuple[pd.DataFrame, list[HeaderEntry]]]] + chunk_iterators: list[Iterator[tuple[pd.DataFrame, list[HeaderEntry]]]] = pydantic.Field(default_factory=list) class StreamingMITMData(Iterable[tuple[ConceptName, StreamingConceptData]], pydantic.BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - data_sources: dict[ConceptName, StreamingConceptData] + data_sources: dict[ConceptName, StreamingConceptData] = pydantic.Field(default_factory=dict) def __iter__(self): return iter(self.data_sources.items()) + + diff --git a/mitm_tooling/representation/sql_representation.py b/mitm_tooling/representation/sql_representation.py index 8491d9ce6e75cd27b1e220133e7271ab9bb2fda3..6e3a18bc434fad4a18c8c79c68a5db9d88817f3f 100644 --- a/mitm_tooling/representation/sql_representation.py +++ b/mitm_tooling/representation/sql_representation.py @@ -1,15 +1,17 @@ from collections import defaultdict +from collections.abc import Callable, Iterator, Generator, Mapping import sqlalchemy as sa +import sqlalchemy.sql.schema from pydantic_core import Url -from sqlalchemy_utils import view - +from mitm_tooling.definition import MITMDefinition, ConceptProperties, OwnedRelations from mitm_tooling.data_types import MITMDataType from mitm_tooling.definition import ConceptName, MITM, get_mitm_def, ConceptKind, ConceptLevel, RelationName from mitm_tooling.definition.definition_tools import map_col_groups, ColGroupMaps -from mitm_tooling.extraction.sql.db import create_sa_engine -from .intermediate_representation import Header +from .intermediate_representation import Header, MITMData, ColumnName +from mitm_tooling.utilities.sql_utils import create_sa_engine +from mitm_tooling.utilities import python_utils from mitm_tooling.utilities.sql_utils import qualify @@ -25,75 +27,167 @@ def mk_link_table_name(mitm: MITM, concept: ConceptName, type_name: RelationName return mk_type_table_name(mitm, concept, type_name) + '_' + fk_name.lower() -def mk_db_schema(header: Header): +def has_type_tables(mitm: MITM, concept: ConceptName) -> bool: + return get_mitm_def(mitm).get_properties(concept).permit_attributes + + +def pick_table_pk(mitm: MITM, concept: ConceptName, created_columns: Mapping[RelationName, sa.Column]) -> list[ + tuple[RelationName, sa.Column]]: + mitm_def = get_mitm_def(mitm) + concept_properties, concept_relations = mitm_def.get(concept) + + names, mapped_names = map_col_groups(mitm_def, concept, { + 'kind': lambda: 'kind', + 'type': lambda: concept_properties.typing_concept, + 'identity': lambda: list(concept_relations.identity) + }) + + return python_utils.pick_from_mapping(created_columns, names) + + +def mk_table(meta: sa.MetaData, mitm: MITM, concept: ConceptName, table_name: str, col_group_maps: ColGroupMaps, + additional_schema_item_maker: Callable[ + [MITM, ConceptName, ConceptProperties, OwnedRelations, + dict[RelationName, sa.Column], list[tuple[RelationName, sa.Column]]], + Generator[ + sqlalchemy.sql.schema.SchemaItem, None, None]] | None = None) -> \ + tuple[ + sa.Table, dict[RelationName, sa.Column], list[tuple[RelationName, sa.Column]]]: + mitm_def = get_mitm_def(mitm) + concept_properties, concept_relations = mitm_def.get(concept) + + columns, created_columns = map_col_groups(mitm_def, concept, col_group_maps, ensure_unique=True) + + ref_columns = pick_table_pk(mitm, concept, created_columns) + + constraints: list[sa.sql.schema.SchemaItem] = [] + if concept_relations.identity: + constraints.append(sa.PrimaryKeyConstraint(*python_utils.i_th(1)(ref_columns))) + + if additional_schema_item_maker: + constraints.extend(iter( + additional_schema_item_maker(mitm, concept, concept_properties, concept_relations, created_columns, + ref_columns))) + print(constraints) + + return sa.Table(table_name, meta, *columns, *constraints), created_columns, ref_columns + + +def mk_db_schema(header: Header) -> tuple[sa.MetaData, dict[ConceptName, dict[str, sa.Table]]]: mitm_def = get_mitm_def(header.mitm) meta = sa.MetaData() - concept_level_view_members: dict[ConceptName, list[list[sa.Column]]] = defaultdict(list) - tables: dict[ConceptName, list[sa.Table]] = {} + tables: dict[ConceptName, dict[str, sa.Table]] = {} views: dict[ConceptName, sa.Table] = {} - for he in header.header_entries: - he_concept = he.concept - concept_properties = mitm_def.get_properties(he_concept) - concept_relations = mitm_def.get_relations(he_concept) - assert concept_properties is not None and concept_relations is not None + for concept in mitm_def.main_concepts: + concept_properties, concept_relations = mitm_def.get(concept) - table_name = mk_type_table_name(header.mitm, he_concept, he.type_name) + table_name = mk_concept_table_name(header.mitm, concept) - columns, created_columns = map_col_groups(mitm_def, he_concept, { + t, t_columns, t_ref_columns = mk_table(meta, header.mitm, concept, table_name, { 'kind': lambda: ('kind', sa.Column('kind', MITMDataType.Text.sa_sql_type, nullable=False)), 'type': lambda: (concept_properties.typing_concept, sa.Column(concept_properties.typing_concept, MITMDataType.Text.sa_sql_type, nullable=False)), 'identity': lambda: [(name, sa.Column(name, dt.sa_sql_type, nullable=False)) for name, dt in - mitm_def.resolve_identity_type(he_concept).items()], + mitm_def.resolve_identity_type(concept).items()], 'inline': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for name, dt in - mitm_def.resolve_inlined_types(he_concept).items()], + mitm_def.resolve_inlined_types(concept).items()], 'foreign': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for _, resolved_fk in - mitm_def.resolve_foreign_types(he_concept).items() for name, dt in - resolved_fk.items()], - 'attributes': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for name, dt in - zip(he.attributes, he.attribute_dtypes)], - }, ensure_unique=True) - - constraints = [] - if concept_relations.identity: - constraints.append(sa.PrimaryKeyConstraint(*(created_columns[c] for c in concept_relations.identity))) - - for fk_name, fk_info in concept_relations.foreign.items(): - cols, refcols = zip(*fk_info.fk_relations.items()) - fkc = sa.ForeignKeyConstraint(name=fk_name, columns=[created_columns[c] for c in cols], refcolumns=[ - sa.literal_column(qualify(table=mk_concept_table_name(header.mitm, fk_info.target_concept), column=c)) - for c in refcols]) - # constraints.append(fkc) - - t = sa.Table(table_name, meta, *columns, *constraints) + mitm_def.resolve_foreign_types(concept).items() for name, dt in + resolved_fk.items()] + }) - if he_concept not in tables: - tables[he_concept] = [] - tables[he_concept].append(t) - - if concept_relations.identity: - outer_pk = [] - if not concept_properties.typing_concept in concept_relations.identity: - outer_pk.append(created_columns[concept_properties.typing_concept]) - outer_pk.extend((created_columns[identity_col] for identity_col in concept_relations.identity)) - concept_level_view_members[he_concept].append(outer_pk) - - for concept, members in concept_level_view_members.items(): - view_selection = sa.union_all(*(sa.select(*pk_cols) for pk_cols in members)) - views[concept] = view.create_materialized_view(mk_concept_table_name(header.mitm, concept), view_selection, - meta) - - return meta, tables, views + for he in header.header_entries: + he_concept = he.concept + if has_type_tables(header.mitm, he_concept): + concept_properties, concept_relations = mitm_def.get(he_concept) + + def foreign_key_constraints(mitm, concept, concept_properties, concept_relations, created_columns, + ref_columns): + # self_fk + parent_table = mk_concept_table_name(mitm, concept) + cols, refcols = zip( + *((c, qualify(table=parent_table, column=s)) for s, c in ref_columns)) + yield sa.ForeignKeyConstraint(name='parent', columns=cols, refcolumns=refcols) + for fk_name, fk_info in concept_relations.foreign.items(): + cols, refcols = zip(*fk_info.fk_relations.items()) + fkc = sa.ForeignKeyConstraint(name=fk_name, columns=[created_columns[c] for c in cols], refcolumns=[ + # sa.literal_column(qualify(table=mk_concept_table_name(mitm, fk_info.target_concept), column=c)) + qualify(table=mk_concept_table_name(mitm, fk_info.target_concept), column=c) + for c in refcols]) + yield fkc + + table_name = mk_type_table_name(header.mitm, he_concept, he.type_name) + + t, t_columns, t_ref_columns = mk_table(meta, header.mitm, he_concept, table_name, { + 'kind': lambda: ('kind', sa.Column('kind', MITMDataType.Text.sa_sql_type, nullable=False)), + 'type': lambda: (concept_properties.typing_concept, sa.Column(concept_properties.typing_concept, + MITMDataType.Text.sa_sql_type, + nullable=False)), + 'identity': lambda: [(name, sa.Column(name, dt.sa_sql_type, nullable=False)) for + name, dt in + mitm_def.resolve_identity_type(he_concept).items()], + 'inline': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for name, dt in + mitm_def.resolve_inlined_types(he_concept).items()], + 'foreign': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for _, resolved_fk in + mitm_def.resolve_foreign_types(he_concept).items() for name, dt in + resolved_fk.items()], + 'attributes': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for name, dt in + zip(he.attributes, he.attribute_dtypes)], + }, additional_schema_item_maker=foreign_key_constraints) + + if he_concept not in tables: + tables[he_concept] = {} + tables[he_concept][he.type_name] = t + + # for concept, members in concept_level_view_members.items(): + + # view_selection = sa.union_all(*(sa.select(*pk_cols) for pk_cols in members)) + + # views[concept] = view.create_materialized_view(mk_concept_table_name(header.mitm, concept), view_selection, + + # meta) + + return meta, tables # , views + + +def insert_db_instances(engine: sa.Engine, meta: sa.MetaData, mitm_data: MITMData): + with engine.connect() as conn: + mitm = mitm_data.header.mitm + + for concept, df in mitm_data: + concept_table = mk_concept_table_name(mitm, concept) + t_concept = meta.tables[concept_table] + ref_cols = pick_table_pk(mitm, concept, t_concept.columns) + parent_insert = t_concept.insert().values(df[[c.name for c in t_concept.columns]].to_dict('records')) + conn.execute(parent_insert) + + if has_type_tables(mitm, concept): + concept_properties, concept_relations = get_mitm_def(mitm).get(concept) + for typ, idx in df.groupby(concept_properties.typing_concept).groups.items(): + type_df = df.loc[idx] + t_type = meta.tables[mk_type_table_name(mitm, concept, str(typ))] + sub_insert = t_type.insert().values(type_df[[c.name for c in t_type.columns]].to_dict('records')) + conn.execute(sub_insert) + conn.commit() + + +def insert_mitm_data(engine: sa.Engine, mitm_data: MITMData) -> tuple[ + sa.MetaData, dict[ConceptName, dict[str, sa.Table]]]: + meta, tables = mk_db_schema(mitm_data.header) + meta.create_all(engine) + insert_db_instances(engine, meta, mitm_data) + return meta, tables -def mk_sqlite(header: Header, file_path: str | None = ':memory:'): +def mk_sqlite(mitm_data: MITMData, file_path: str | None = ':memory:') -> tuple[ + sa.Engine, sa.MetaData, dict[ConceptName, dict[str, sa.Table]]]: engine = create_sa_engine(Url(f'sqlite:///{file_path}')) - meta, tables, views = mk_db_schema(header) + meta, tables = insert_mitm_data(engine, mitm_data) print(meta.tables) - print([f'{t.name}: {t.columns} {t.constraints}' for ts in tables.values() for t in ts]) - print([f'{t.name}: {t.columns} {t.constraints}' for t in views.values()]) + print([f'{t.name}: {t.columns} {t.constraints}' for ts in tables.values() for t in ts.values()]) meta.create_all(engine) + return engine, meta, tables diff --git a/mitm_tooling/utilities/python_utils.py b/mitm_tooling/utilities/python_utils.py index dd3385b4dc43f7d16642e707762fd727d8f9be37..16f9be878379030f99db9f47f72f78f5b562cd74 100644 --- a/mitm_tooling/utilities/python_utils.py +++ b/mitm_tooling/utilities/python_utils.py @@ -1,3 +1,4 @@ +from collections.abc import Sequence, Mapping from typing import TypeVar, Hashable, Iterable, Callable, Any @@ -66,7 +67,7 @@ def elem_wise_eq(it1: Iterable, it2: Iterable) -> Iterable[bool]: return map(lambda elems: elems[0] == elems[1], zip(it1, it2)) -def grouped(it: Iterable[tuple[K, Any]]) -> dict[K, Any]: +def grouped(it: Iterable[tuple[K, T]]) -> dict[K, list[T]]: res = {} for k, v in it: if k not in res: @@ -84,6 +85,10 @@ def inner_list_concat(d1: dict[K, list[Any]], d2: dict[K, list[Any]]) -> dict[K, return res +def pick_from_mapping(d: Mapping[K, T], keys: Sequence[K]) -> list[tuple[K, T]]: + return [(k, d[k]) for k in keys] + + class ExtraInfoExc(Exception): def __init__(self, msg=None): super().__init__() diff --git a/mitm_tooling/utilities/sql_utils.py b/mitm_tooling/utilities/sql_utils.py index 3dff0c2602f130411aef7458ea0ab297e2caaf4e..9e6fbf859af979d79d2318c2397125732898713f 100644 --- a/mitm_tooling/utilities/sql_utils.py +++ b/mitm_tooling/utilities/sql_utils.py @@ -1,3 +1,8 @@ +import sqlalchemy as sa +from pydantic import AnyUrl +from sqlalchemy import Engine + + def qualify(*, table: str, schema: str | None = None, column: str | None = None): res = table if schema is not None: @@ -11,3 +16,8 @@ def unqualify(n: str) -> list[str]: return n.split('.') +def create_sa_engine(db_url: AnyUrl, sqlite_extensions: list[str] | None = None, test_engine: bool = False, + **engine_kwargs) -> Engine: + engine = sa.create_engine(str(db_url), **engine_kwargs) + + return engine diff --git a/pyproject.toml b/pyproject.toml index cacd92a12339a775d9da44abc3df7bc0baaa62d2..50ea5be887e024bd0c0c9e17fc61b7b8d5546484 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] name = "mitm-tooling" -version = "0.2.4" +version = "0.2.6" description = "" authors = ["Leah Tacke genannt Unterberg <leah.tgu@pads.rwth-aachen.de>"] readme = "README.md" packages = [{ include = "mitm_tooling" }] [tool.poetry.dependencies] -python = ">=3.11,<3.13" +python = ">=3.11,<3.14" pydantic = "^2.9.2" pyyaml = "6.0.2" genson = "^1.3.0" diff --git a/test/something.py b/test/something.py index 670b27b076d14489dcbf0cc6de06f2f1aa3afb4c..e2adb11ccd43e01ec1f509cbf364bd351843c7c8 100644 --- a/test/something.py +++ b/test/something.py @@ -16,32 +16,37 @@ class MyTestCase(unittest.TestCase): from mitm_tooling.data_types import MITMDataType h = Header(mitm=MITM.MAED, header_entries=[ HeaderEntry(concept='measurement', kind='M', type_name='A', attributes=['x'], - attribute_dtypes=[MITMDataType.Datetime]), + attribute_dtypes=[MITMDataType.Numeric]), HeaderEntry(concept='segment', kind='S', type_name='annotation', attributes=[], attribute_dtypes=[]), HeaderEntry(concept='segment_data', kind='SD', type_name='annotation_info', attributes=['y'], attribute_dtypes=[MITMDataType.Json]), ]) - meta, tables, views = mk_db_schema(h) + meta, tables = mk_db_schema(h) print(meta) print() print(tables) print() - print(views) def test_writing_sqlite(self): - from mitm_tooling.representation import Header, HeaderEntry, mk_db_schema + from mitm_tooling.representation import Header, HeaderEntry, mk_db_schema, MITMData from mitm_tooling.definition import MITM from mitm_tooling.data_types import MITMDataType h = Header(mitm=MITM.MAED, header_entries=[ HeaderEntry(concept='measurement', kind='M', type_name='A', attributes=['x'], - attribute_dtypes=[MITMDataType.Datetime]), + attribute_dtypes=[MITMDataType.Numeric]), HeaderEntry(concept='segment', kind='S', type_name='annotation', attributes=[], attribute_dtypes=[]), HeaderEntry(concept='segment_data', kind='SD', type_name='annotation_info', attributes=['y'], attribute_dtypes=[MITMDataType.Json]), ]) - mk_sqlite(h, file_path='gendb.sqlite') + mk_sqlite(MITMData(header=h), file_path='gendb.sqlite') + + def test_with_synthetic(self): + from mitm_tooling.io import importing + from mitm_tooling.definition import MITM + syn = importing.read_zip('synthetic.maed', MITM.MAED) + mk_sqlite(syn, 'synthetic.sqlite') if __name__ == '__main__': unittest.main()