diff --git a/justfile b/justfile
new file mode 100644
index 0000000000000000000000000000000000000000..b39ef285316abbf6240f7b0b2d0e53aae589e91b
--- /dev/null
+++ b/justfile
@@ -0,0 +1,18 @@
+set shell := ["pwsh", "-c"]
+
+lock:
+ @poetry lock --no-update
+
+update:
+ @poetry update
+
+build:
+ @poetry build
+
+release:
+ @poetry publish --build
+
+requirements:
+ @poetry export --without-hashes -f requirements.txt > requirements.txt
+
+preflight: lock requirements build
diff --git a/mitm_tooling/data_types/convert.py b/mitm_tooling/data_types/convert.py
index 922481ab60f08cf77ba771e5b6141b5105eb3b19..b4b50d06fb3dd76fa9765cdbb0df1324e4496f59 100644
--- a/mitm_tooling/data_types/convert.py
+++ b/mitm_tooling/data_types/convert.py
@@ -1,6 +1,6 @@
import pandas as pd
-from data_types import MITMDataType
+from .data_types import MITMDataType
class ColumnDataTypeConversionException(Exception):
@@ -12,18 +12,18 @@ def convert_df(df: pd.DataFrame, data_types: dict[str, MITMDataType], inplace=Fa
for col, dt in data_types.items():
try:
- if inplace:
- convert_df_col(df, col, dt, inplace=True)
- else:
- res[col] = convert_df_col(df, col, dt, inplace=False)
+ #if inplace:
+ # df[col] = convert_df_col(df, col, dt, inplace=False)
+ #else:
+ res[col] = convert_df_col(df, col, dt, inplace=False)
except Exception as e:
- raise ColumnDataTypeConversionException(f'Conversion of feature \'{col}\' to {dt} failed:\n' + str(e))
+ raise ColumnDataTypeConversionException(f'Conversion of feature \'{col}\' to {dt} failed:\n{e}')
return res
def convert_df_col(df: pd.DataFrame, col: str, data_type: MITMDataType, inplace=False):
- cast = data_type.pandas_cast(df.loc[:, col])
+ cast = data_type.pandas_cast(df[col])
if inplace:
cast[col] = cast
return cast
diff --git a/mitm_tooling/data_types/data_types.py b/mitm_tooling/data_types/data_types.py
index 28f6b3f2b246dc105152430014ef390184363816..056114f536f5c364b665985bddf029ab0bf8ad07 100644
--- a/mitm_tooling/data_types/data_types.py
+++ b/mitm_tooling/data_types/data_types.py
@@ -122,9 +122,9 @@ mitm_sql_type_map: dict[MITMDataType, SA_SQLTypeClass] = {
}
mitm_pandas_type_mappers: dict[MITMDataType, PandasCast] = {
- MITMDataType.Text: lambda s: s.astype('str'),
+ MITMDataType.Text: lambda s: s.astype(pd.StringDtype()),
MITMDataType.Datetime: lambda s: pd.to_datetime(s, utc=True, errors='coerce', format='mixed'),
- MITMDataType.Json: lambda s: s.astype('str').apply(json.loads),
+ MITMDataType.Json: lambda s: s.astype(pd.StringDtype()).apply(json.loads),
MITMDataType.Boolean: lambda s: s.astype('Boolean'),
MITMDataType.Integer: lambda s: s.astype('Int64'),
MITMDataType.Numeric: lambda s: s.astype('Float64'),
diff --git a/mitm_tooling/definition/definition_representation.py b/mitm_tooling/definition/definition_representation.py
index 2b9eda8b6561a8b45d12bcbdbf4b6def859d50ef..54e177b2b81e9dabe0cf63e488295e1bb3868d89 100644
--- a/mitm_tooling/definition/definition_representation.py
+++ b/mitm_tooling/definition/definition_representation.py
@@ -100,6 +100,10 @@ class MITMDefinition(pydantic.BaseModel):
elif concept in (pm := self.parent_concepts_map):
return pm[concept]
+ def get(self, concept: ConceptName) -> tuple[ConceptProperties, OwnedRelations]:
+ a, b = self.get_properties(concept), self.get_relations(concept)
+ return a, b
+
def get_properties(self, concept: ConceptName) -> ConceptProperties | None:
return self.concept_properties.get(concept, None)
diff --git a/mitm_tooling/definition/definition_tools.py b/mitm_tooling/definition/definition_tools.py
index 269c3da2df3f2c7435ca3439d33f5b9bc5742cca..59178ed8f2ae5f350dffc8be292982a9b5716bb9 100644
--- a/mitm_tooling/definition/definition_tools.py
+++ b/mitm_tooling/definition/definition_tools.py
@@ -9,10 +9,10 @@ from mitm_tooling.utilities.python_utils import elem_wise_eq
T = TypeVar('T')
-def dummy(_): return None
+def dummy(): return None
-def dummy_list(_): return []
+def dummy_list(): return []
Mapper = Callable[[], T | tuple[str, T]]
diff --git a/mitm_tooling/extraction/sql/data_models/__init__.py b/mitm_tooling/extraction/sql/data_models/__init__.py
index 03aeee544523d5adaa5f3a493105ffafed9384bf..dfb226d78013b6c733300f94f53033691d08aefc 100644
--- a/mitm_tooling/extraction/sql/data_models/__init__.py
+++ b/mitm_tooling/extraction/sql/data_models/__init__.py
@@ -1,10 +1,10 @@
# noinspection PyUnresolvedReferences
from .db_meta import Queryable, TableMetaInfo, DBMetaInfo, ForeignKeyConstraint, ExplicitTableSelection, \
- ExplicitColumnSelection, ExplicitSelectionUtils
+ ExplicitColumnSelection, ExplicitSelectionUtils, ColumnName
# noinspection PyUnresolvedReferences
from .db_probe import TableProbe, DBProbe, SampleSummary
# noinspection PyUnresolvedReferences
-from .table_identifiers import SourceDBType, SchemaName, TableName, ColumnName, TableIdentifier, AnyTableIdentifier, \
+from .table_identifiers import SourceDBType, SchemaName, TableName, TableIdentifier, AnyTableIdentifier, \
LocalTableIdentifier, AnyLocalTableIdentifier, ShortTableIdentifier, LongTableIdentifier
# noinspection PyUnresolvedReferences
from .virtual_view import TypedRawQuery, VirtualView, VirtualDB, CompiledVirtualView
diff --git a/mitm_tooling/extraction/sql/data_models/db_meta.py b/mitm_tooling/extraction/sql/data_models/db_meta.py
index e63f8f4390077684ec576f65090bc680619778f9..44903b5b9b923311d1f798fa6b5c071f650ea721 100644
--- a/mitm_tooling/extraction/sql/data_models/db_meta.py
+++ b/mitm_tooling/extraction/sql/data_models/db_meta.py
@@ -7,8 +7,9 @@ from pydantic import Field
from sqlalchemy import Table, MetaData
from mitm_tooling.data_types import MITMDataType, SA_SQLTypeName, sa_sql_to_mitm_type
-from .table_identifiers import TableName, SchemaName, ColumnName, ShortTableIdentifier, LocalTableIdentifier, \
+from .table_identifiers import TableName, SchemaName, ShortTableIdentifier, LocalTableIdentifier, \
AnyLocalTableIdentifier
+from mitm_tooling.representation.intermediate_representation import ColumnName
from mitm_tooling.utilities.sql_utils import unqualify
ExplicitTableSelection = dict[SchemaName, set[TableName]]
diff --git a/mitm_tooling/extraction/sql/data_models/db_probe.py b/mitm_tooling/extraction/sql/data_models/db_probe.py
index 6295754dd9434d6fde6ff20da001aef97750d295..09f49613cb5125ca8e025755f8dffc5ad4c625b3 100644
--- a/mitm_tooling/extraction/sql/data_models/db_probe.py
+++ b/mitm_tooling/extraction/sql/data_models/db_probe.py
@@ -5,9 +5,9 @@ import pydantic
from pydantic import NonNegativeInt, Field
from mitm_tooling.data_types.data_types import MITMDataType
-from .db_meta import TableMetaInfoBase, DBMetaInfoBase, DBMetaInfo
+from .db_meta import TableMetaInfoBase, DBMetaInfoBase, DBMetaInfo, ColumnName
from .probe_models import SampleSummary
-from .table_identifiers import ColumnName, ShortTableIdentifier
+from .table_identifiers import ShortTableIdentifier
logger = logging.getLogger('api')
diff --git a/mitm_tooling/extraction/sql/data_models/table_identifiers.py b/mitm_tooling/extraction/sql/data_models/table_identifiers.py
index 5892281214f55e32bb4d12c8ecd35be481249619..1f9f4d155274131c074dc16edc8e1b1c57a1b86e 100644
--- a/mitm_tooling/extraction/sql/data_models/table_identifiers.py
+++ b/mitm_tooling/extraction/sql/data_models/table_identifiers.py
@@ -11,7 +11,6 @@ if TYPE_CHECKING:
TableName = str
SchemaName = str
-ColumnName = str
ShortTableIdentifier = tuple[SchemaName, TableName]
QualifiedTableName = str
diff --git a/mitm_tooling/extraction/sql/db/__init__.py b/mitm_tooling/extraction/sql/db/__init__.py
index 5c5c12179599c801e0f161964be4d67fbe5dfcde..ed6feed6c9c413c46cf47a3105abbd91b1bff0b7 100644
--- a/mitm_tooling/extraction/sql/db/__init__.py
+++ b/mitm_tooling/extraction/sql/db/__init__.py
@@ -1,5 +1,5 @@
# noinspection PyUnresolvedReferences
-from .db_connection import create_sa_engine
+from mitm_tooling.utilities.sql_utils import create_sa_engine
# noinspection PyUnresolvedReferences
from .db_reflection import connect_and_reflect, derive_table_meta_info
# noinspection PyUnresolvedReferences
diff --git a/mitm_tooling/extraction/sql/db/db_connection.py b/mitm_tooling/extraction/sql/db/db_connection.py
index 3739d530afafdbab705f77057e94142732348c68..b28b04f643122b019e912540f228c8ed20be9eeb 100644
--- a/mitm_tooling/extraction/sql/db/db_connection.py
+++ b/mitm_tooling/extraction/sql/db/db_connection.py
@@ -1,14 +1,3 @@
-import sqlite3
-import typing
-from pathlib import Path
-import sqlalchemy as sa
-from pydantic import AnyUrl
-from sqlalchemy import Engine
-def create_sa_engine(db_url: AnyUrl, sqlite_extensions: list[str] | None = None, test_engine: bool = False,
- **engine_kwargs) -> Engine:
- engine = sa.create_engine(str(db_url), **engine_kwargs)
-
- return engine
diff --git a/mitm_tooling/extraction/sql/db/db_probing.py b/mitm_tooling/extraction/sql/db/db_probing.py
index f9dadf6eb0db60de3bed1ea9e272b299b3c69e0b..008921f9acc4ac4c511757a5ea5ef0e3ac52704d 100644
--- a/mitm_tooling/extraction/sql/db/db_probing.py
+++ b/mitm_tooling/extraction/sql/db/db_probing.py
@@ -13,8 +13,7 @@ from sqlalchemy.orm import Session
from sqlalchemy.sql import sqltypes
from mitm_tooling.data_types import MITMDataType
-from ..data_models import ColumnName
-from ..data_models import TableMetaInfo, DBMetaInfo
+from ..data_models import TableMetaInfo, DBMetaInfo, ColumnName
from ..data_models import TableProbe, DBProbe
from ..data_models.probe_models import NumericSummaryStatistics, DatetimeSummaryStatistics, \
CategoricalSummaryStatistics, SampleSummary
diff --git a/mitm_tooling/extraction/sql/transformation/db_transformation.py b/mitm_tooling/extraction/sql/transformation/db_transformation.py
index b36b1e85109037c966a02e15bffc69355814e09f..d8417858ad26e1c8c08b5a364b2d49904ed9ed54 100644
--- a/mitm_tooling/extraction/sql/transformation/db_transformation.py
+++ b/mitm_tooling/extraction/sql/transformation/db_transformation.py
@@ -15,7 +15,7 @@ from sqlalchemy.sql import sqltypes
from mitm_tooling.data_types import get_sa_sql_type, SQL_DataType, WrappedMITMDataType, MITMDataType, get_pandas_cast
from mitm_tooling.utilities.python_utils import ExtraInfoExc
from ..data_models import DBMetaInfo, TypedRawQuery
-from ..data_models import TableName, ColumnName, SourceDBType, TableIdentifier, AnyTableIdentifier
+from ..data_models import ColumnName, TableName, SourceDBType, TableIdentifier, AnyTableIdentifier
from .df_transformation import extract_json_path, PandasSeriesTransform, PandasCreation, PandasDataframeTransform
logger = logging.getLogger('api')
diff --git a/mitm_tooling/io/exporting.py b/mitm_tooling/io/exporting.py
index f2cc7a237ff185a1796b3a02b9e36dd3369437c0..79329c514754fb28927deb41b85573458af2fae8 100644
--- a/mitm_tooling/io/exporting.py
+++ b/mitm_tooling/io/exporting.py
@@ -14,8 +14,8 @@ import pandas as pd
from mitm_tooling.definition import MITM, ConceptName, get_mitm_def
from mitm_tooling.representation.intermediate_representation import HeaderEntry, Header, StreamingConceptData, MITMData, \
StreamingMITMData
-from representation.file_representation import write_header_file, write_data_file
-from utilities.io_utils import DataSink, ByteSink, use_bytes_io, ensure_ext
+from mitm_tooling.representation.file_representation import write_header_file, write_data_file
+from mitm_tooling.utilities.io_utils import DataSink, ByteSink, use_bytes_io, ensure_ext
logger = logging.getLogger('api')
diff --git a/mitm_tooling/io/importing.py b/mitm_tooling/io/importing.py
index 8be8486359f4954867e1d5a7bc4061bf4a46d4a8..d1cdb7c9a3fee3b59d2e1d145eba44650bb1d4b1 100644
--- a/mitm_tooling/io/importing.py
+++ b/mitm_tooling/io/importing.py
@@ -6,14 +6,12 @@ from abc import ABC, abstractmethod
import pandas as pd
import pydantic
-from numba.cuda.cudaimpl import lower
import io
-from definition import MITM, get_mitm_def
+from mitm_tooling.definition import MITM, get_mitm_def
from mitm_tooling.representation.file_representation import read_header_file, read_data_file
from mitm_tooling.representation.intermediate_representation import MITMData, Header
-from mitm_tooling.utilities.io_utils import DataSource, use_for_pandas_io, use_bytes_io, ensure_ext
-from utilities.io_utils import FilePath
+from mitm_tooling.utilities.io_utils import DataSource, use_for_pandas_io, use_bytes_io, ensure_ext, FilePath
logger = logging.getLogger('api')
@@ -64,8 +62,7 @@ class FolderImport(FileImport):
for concept in mitm_def.main_concepts:
fn = ensure_ext(mitm_def.get_properties(concept).plural, '.csv')
if fn in file_names:
- with use_for_pandas_io(fn) as cf:
- parts[concept] = read_data_file(cf, target_mitm=self.mitm, target_concept=concept, normalize=True)
+ parts[concept] = read_data_file(fn, target_mitm=self.mitm, target_concept=concept, normalize=True)
return MITMData(header=Header.from_df(parts.pop('header'), self.mitm), concept_dfs=parts)
diff --git a/mitm_tooling/representation/__init__.py b/mitm_tooling/representation/__init__.py
index cdc6e474e7c94253a7f5d88d33a1cce9c2e98ad3..966933e7a4a5158c0f13ec2865b4cca29b4063eb 100644
--- a/mitm_tooling/representation/__init__.py
+++ b/mitm_tooling/representation/__init__.py
@@ -2,5 +2,5 @@ from . import intermediate_representation
from . import file_representation
from . import sql_representation
from .file_representation import mk_concept_file_header, write_header_file, write_data_file, read_data_file, read_header_file
-from .intermediate_representation import HeaderEntry, Header, MITMData, StreamingMITMData, StreamingConceptData
-from .sql_representation import mk_db_schema
+from .intermediate_representation import HeaderEntry, Header, MITMData, StreamingMITMData, StreamingConceptData, ColumnName
+from .sql_representation import mk_db_schema, insert_mitm_data, mk_sqlite
diff --git a/mitm_tooling/representation/common.py b/mitm_tooling/representation/common.py
index 07c64da5dcfa1648b4c78acb3de2088de0818d0d..96e84dbc93bc0e90efa0bc2b9517afd1b6db9c93 100644
--- a/mitm_tooling/representation/common.py
+++ b/mitm_tooling/representation/common.py
@@ -1,2 +1,2 @@
def guess_k(df):
- return sum((1 for c in df.columns if c.startswith('a_')))
+ return sum((1 for c in df.columns if c.startswith('a_') and not c.startswith('a_dt')))
diff --git a/mitm_tooling/representation/file_representation.py b/mitm_tooling/representation/file_representation.py
index 7b16e089c22046a7f5da4a530886fa5e8f839aba..2bddd8b24bf2c471e46b293c26f601f56a7e3bcc 100644
--- a/mitm_tooling/representation/file_representation.py
+++ b/mitm_tooling/representation/file_representation.py
@@ -5,15 +5,14 @@ from typing import BinaryIO, TextIO
import pandas as pd
-import _io
-from data_types import MITMDataType
-from data_types.convert import convert_df
+from mitm_tooling.data_types import MITMDataType
+from mitm_tooling.data_types.convert import convert_df
from mitm_tooling.definition import get_mitm_def, MITM, ConceptName
from mitm_tooling.definition.definition_tools import map_col_groups
-from representation.common import guess_k
-from utilities.io_utils import DataSink, DataSource, use_for_pandas_io, FilePath, ensure_directory_exists, ensure_ext
-from utilities.python_utils import i_th
+from mitm_tooling.representation.common import guess_k
+from mitm_tooling.utilities.io_utils import DataSink, DataSource, use_for_pandas_io, FilePath, ensure_directory_exists, ensure_ext
+from mitm_tooling.utilities.python_utils import i_th
def mk_header_file_columns(k: int) -> list[str]:
@@ -23,7 +22,7 @@ def mk_header_file_columns(k: int) -> list[str]:
def mk_concept_file_header(mitm: MITM, concept: ConceptName, k: int) -> tuple[list[str], dict[str, MITMDataType]]:
mitm_def = get_mitm_def(mitm)
- dts, _ = map_col_groups(mitm_def, concept, {
+ _, dts = map_col_groups(mitm_def, concept, {
'kind': lambda: ('kind', MITMDataType.Text),
'type': lambda: (mitm_def.get_properties(concept).typing_concept, MITMDataType.Text),
'identity': lambda: mitm_def.resolve_identity_type(concept).items(),
@@ -34,7 +33,7 @@ def mk_concept_file_header(mitm: MITM, concept: ConceptName, k: int) -> tuple[li
'attributes': lambda: [(f'a_{i}', MITMDataType.Unknown) for i in range(1, k + 1)],
})
- return i_th(0, list)(dts), dict(dts)
+ return list(dts.keys()), dict(dts)
def write_header_file(df: pd.DataFrame, sink: DataSink) -> None:
@@ -54,7 +53,7 @@ def read_header_file(source: DataSource, normalize: bool = False) -> pd.DataFram
df = pd.read_csv(f, sep=';')
if normalize:
k = guess_k(df)
- df = df.reindex(columns=mk_header_file_columns(k)).astype('str')
+ df = df.astype(pd.StringDtype()).reindex(columns=mk_header_file_columns(k))
return df
diff --git a/mitm_tooling/representation/intermediate_representation.py b/mitm_tooling/representation/intermediate_representation.py
index 374b6b3e1b2ac24af5ddba8e64460ba4ea801873..96d4989b82eec92f2218c68953bf35994c2b7123 100644
--- a/mitm_tooling/representation/intermediate_representation.py
+++ b/mitm_tooling/representation/intermediate_representation.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import itertools
import logging
from collections.abc import Iterator, Iterable, Sequence
@@ -7,14 +9,14 @@ import pandas as pd
import pydantic
from pydantic import ConfigDict
-from definition import get_mitm_def
+from mitm_tooling.definition import get_mitm_def
from mitm_tooling.data_types.data_types import MITMDataType
from mitm_tooling.definition.definition_representation import ConceptName, MITM
-from mitm_tooling.extraction.sql.data_models.table_identifiers import ColumnName
-from representation.common import guess_k
-from representation.file_representation import mk_header_file_columns
+from .common import guess_k
+from .file_representation import mk_header_file_columns
logger = logging.getLogger('api')
+ColumnName = str
class HeaderEntry(pydantic.BaseModel):
@@ -24,6 +26,12 @@ class HeaderEntry(pydantic.BaseModel):
attributes: list[ColumnName]
attribute_dtypes: list[MITMDataType]
+ @pydantic.model_validator(mode='after')
+ def attr_check(self):
+ if not len(self.attributes) == len(self.attribute_dtypes):
+ raise ValueError('Length of specified attributes and their data types differs.')
+ return self
+
@classmethod
def from_row(cls, row: Sequence[str], mitm: MITM) -> Self | None:
kind, type_name = row[0], row[1]
@@ -33,14 +41,15 @@ class HeaderEntry(pydantic.BaseModel):
return None
attrs, attr_dts = [], []
- for a, a_dt in zip(row[2:], row[3:]):
- attrs.append(a)
- try:
- mitm_dt = MITMDataType(a_dt.lower()) if a_dt else MITMDataType.Unknown
- attr_dts.append(mitm_dt)
- except ValueError:
- logger.error(f'Encountered unrecognized data type during header import: {a_dt}.')
- return None
+ for a, a_dt in zip(row[slice(2, None, 2)], row[slice(3, None, 2)]):
+ if pd.notna(a) and pd.notna(a_dt):
+ attrs.append(a)
+ try:
+ mitm_dt = MITMDataType(a_dt.lower()) if a_dt else MITMDataType.Unknown
+ attr_dts.append(mitm_dt)
+ except ValueError:
+ logger.error(f'Encountered unrecognized data type during header import: {a_dt}.')
+ return None
return HeaderEntry(concept=concept, kind=kind, type_name=type_name, attributes=attrs, attribute_dtypes=attr_dts)
def get_k(self) -> int:
@@ -53,7 +62,7 @@ class HeaderEntry(pydantic.BaseModel):
class Header(pydantic.BaseModel):
mitm: MITM
- header_entries: list[HeaderEntry]
+ header_entries: list[HeaderEntry] = pydantic.Field(default_factory=list)
@classmethod
def from_df(cls, df: pd.DataFrame, mitm: MITM) -> Self:
@@ -73,23 +82,25 @@ class MITMData(Iterable[tuple[ConceptName, pd.DataFrame]], pydantic.BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
header: Header
- concept_dfs: dict[ConceptName, pd.DataFrame]
+ concept_dfs: dict[ConceptName, pd.DataFrame] = pydantic.Field(default_factory=dict)
def __iter__(self):
- return iter(self.concept_dfs)
+ return iter(self.concept_dfs.items())
class StreamingConceptData(pydantic.BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
structure_df: pd.DataFrame
- chunk_iterators: list[Iterator[tuple[pd.DataFrame, list[HeaderEntry]]]]
+ chunk_iterators: list[Iterator[tuple[pd.DataFrame, list[HeaderEntry]]]] = pydantic.Field(default_factory=list)
class StreamingMITMData(Iterable[tuple[ConceptName, StreamingConceptData]], pydantic.BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
- data_sources: dict[ConceptName, StreamingConceptData]
+ data_sources: dict[ConceptName, StreamingConceptData] = pydantic.Field(default_factory=dict)
def __iter__(self):
return iter(self.data_sources.items())
+
+
diff --git a/mitm_tooling/representation/sql_representation.py b/mitm_tooling/representation/sql_representation.py
index 8491d9ce6e75cd27b1e220133e7271ab9bb2fda3..6e3a18bc434fad4a18c8c79c68a5db9d88817f3f 100644
--- a/mitm_tooling/representation/sql_representation.py
+++ b/mitm_tooling/representation/sql_representation.py
@@ -1,15 +1,17 @@
from collections import defaultdict
+from collections.abc import Callable, Iterator, Generator, Mapping
import sqlalchemy as sa
+import sqlalchemy.sql.schema
from pydantic_core import Url
-from sqlalchemy_utils import view
-
+from mitm_tooling.definition import MITMDefinition, ConceptProperties, OwnedRelations
from mitm_tooling.data_types import MITMDataType
from mitm_tooling.definition import ConceptName, MITM, get_mitm_def, ConceptKind, ConceptLevel, RelationName
from mitm_tooling.definition.definition_tools import map_col_groups, ColGroupMaps
-from mitm_tooling.extraction.sql.db import create_sa_engine
-from .intermediate_representation import Header
+from .intermediate_representation import Header, MITMData, ColumnName
+from mitm_tooling.utilities.sql_utils import create_sa_engine
+from mitm_tooling.utilities import python_utils
from mitm_tooling.utilities.sql_utils import qualify
@@ -25,75 +27,167 @@ def mk_link_table_name(mitm: MITM, concept: ConceptName, type_name: RelationName
return mk_type_table_name(mitm, concept, type_name) + '_' + fk_name.lower()
-def mk_db_schema(header: Header):
+def has_type_tables(mitm: MITM, concept: ConceptName) -> bool:
+ return get_mitm_def(mitm).get_properties(concept).permit_attributes
+
+
+def pick_table_pk(mitm: MITM, concept: ConceptName, created_columns: Mapping[RelationName, sa.Column]) -> list[
+ tuple[RelationName, sa.Column]]:
+ mitm_def = get_mitm_def(mitm)
+ concept_properties, concept_relations = mitm_def.get(concept)
+
+ names, mapped_names = map_col_groups(mitm_def, concept, {
+ 'kind': lambda: 'kind',
+ 'type': lambda: concept_properties.typing_concept,
+ 'identity': lambda: list(concept_relations.identity)
+ })
+
+ return python_utils.pick_from_mapping(created_columns, names)
+
+
+def mk_table(meta: sa.MetaData, mitm: MITM, concept: ConceptName, table_name: str, col_group_maps: ColGroupMaps,
+ additional_schema_item_maker: Callable[
+ [MITM, ConceptName, ConceptProperties, OwnedRelations,
+ dict[RelationName, sa.Column], list[tuple[RelationName, sa.Column]]],
+ Generator[
+ sqlalchemy.sql.schema.SchemaItem, None, None]] | None = None) -> \
+ tuple[
+ sa.Table, dict[RelationName, sa.Column], list[tuple[RelationName, sa.Column]]]:
+ mitm_def = get_mitm_def(mitm)
+ concept_properties, concept_relations = mitm_def.get(concept)
+
+ columns, created_columns = map_col_groups(mitm_def, concept, col_group_maps, ensure_unique=True)
+
+ ref_columns = pick_table_pk(mitm, concept, created_columns)
+
+ constraints: list[sa.sql.schema.SchemaItem] = []
+ if concept_relations.identity:
+ constraints.append(sa.PrimaryKeyConstraint(*python_utils.i_th(1)(ref_columns)))
+
+ if additional_schema_item_maker:
+ constraints.extend(iter(
+ additional_schema_item_maker(mitm, concept, concept_properties, concept_relations, created_columns,
+ ref_columns)))
+    # NOTE(review): removed stray debug print of the constraints list before table creation
+
+ return sa.Table(table_name, meta, *columns, *constraints), created_columns, ref_columns
+
+
+def mk_db_schema(header: Header) -> tuple[sa.MetaData, dict[ConceptName, dict[str, sa.Table]]]:
mitm_def = get_mitm_def(header.mitm)
meta = sa.MetaData()
- concept_level_view_members: dict[ConceptName, list[list[sa.Column]]] = defaultdict(list)
- tables: dict[ConceptName, list[sa.Table]] = {}
+ tables: dict[ConceptName, dict[str, sa.Table]] = {}
views: dict[ConceptName, sa.Table] = {}
- for he in header.header_entries:
- he_concept = he.concept
- concept_properties = mitm_def.get_properties(he_concept)
- concept_relations = mitm_def.get_relations(he_concept)
- assert concept_properties is not None and concept_relations is not None
+ for concept in mitm_def.main_concepts:
+ concept_properties, concept_relations = mitm_def.get(concept)
- table_name = mk_type_table_name(header.mitm, he_concept, he.type_name)
+ table_name = mk_concept_table_name(header.mitm, concept)
- columns, created_columns = map_col_groups(mitm_def, he_concept, {
+ t, t_columns, t_ref_columns = mk_table(meta, header.mitm, concept, table_name, {
'kind': lambda: ('kind', sa.Column('kind', MITMDataType.Text.sa_sql_type, nullable=False)),
'type': lambda: (concept_properties.typing_concept, sa.Column(concept_properties.typing_concept,
MITMDataType.Text.sa_sql_type,
nullable=False)),
'identity': lambda: [(name, sa.Column(name, dt.sa_sql_type, nullable=False)) for
name, dt in
- mitm_def.resolve_identity_type(he_concept).items()],
+ mitm_def.resolve_identity_type(concept).items()],
'inline': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for name, dt in
- mitm_def.resolve_inlined_types(he_concept).items()],
+ mitm_def.resolve_inlined_types(concept).items()],
'foreign': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for _, resolved_fk in
- mitm_def.resolve_foreign_types(he_concept).items() for name, dt in
- resolved_fk.items()],
- 'attributes': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for name, dt in
- zip(he.attributes, he.attribute_dtypes)],
- }, ensure_unique=True)
-
- constraints = []
- if concept_relations.identity:
- constraints.append(sa.PrimaryKeyConstraint(*(created_columns[c] for c in concept_relations.identity)))
-
- for fk_name, fk_info in concept_relations.foreign.items():
- cols, refcols = zip(*fk_info.fk_relations.items())
- fkc = sa.ForeignKeyConstraint(name=fk_name, columns=[created_columns[c] for c in cols], refcolumns=[
- sa.literal_column(qualify(table=mk_concept_table_name(header.mitm, fk_info.target_concept), column=c))
- for c in refcols])
- # constraints.append(fkc)
-
- t = sa.Table(table_name, meta, *columns, *constraints)
+ mitm_def.resolve_foreign_types(concept).items() for name, dt in
+ resolved_fk.items()]
+ })
- if he_concept not in tables:
- tables[he_concept] = []
- tables[he_concept].append(t)
-
- if concept_relations.identity:
- outer_pk = []
- if not concept_properties.typing_concept in concept_relations.identity:
- outer_pk.append(created_columns[concept_properties.typing_concept])
- outer_pk.extend((created_columns[identity_col] for identity_col in concept_relations.identity))
- concept_level_view_members[he_concept].append(outer_pk)
-
- for concept, members in concept_level_view_members.items():
- view_selection = sa.union_all(*(sa.select(*pk_cols) for pk_cols in members))
- views[concept] = view.create_materialized_view(mk_concept_table_name(header.mitm, concept), view_selection,
- meta)
-
- return meta, tables, views
+ for he in header.header_entries:
+ he_concept = he.concept
+ if has_type_tables(header.mitm, he_concept):
+ concept_properties, concept_relations = mitm_def.get(he_concept)
+
+ def foreign_key_constraints(mitm, concept, concept_properties, concept_relations, created_columns,
+ ref_columns):
+ # self_fk
+ parent_table = mk_concept_table_name(mitm, concept)
+ cols, refcols = zip(
+ *((c, qualify(table=parent_table, column=s)) for s, c in ref_columns))
+ yield sa.ForeignKeyConstraint(name='parent', columns=cols, refcolumns=refcols)
+ for fk_name, fk_info in concept_relations.foreign.items():
+ cols, refcols = zip(*fk_info.fk_relations.items())
+ fkc = sa.ForeignKeyConstraint(name=fk_name, columns=[created_columns[c] for c in cols], refcolumns=[
+ # sa.literal_column(qualify(table=mk_concept_table_name(mitm, fk_info.target_concept), column=c))
+ qualify(table=mk_concept_table_name(mitm, fk_info.target_concept), column=c)
+ for c in refcols])
+ yield fkc
+
+ table_name = mk_type_table_name(header.mitm, he_concept, he.type_name)
+
+ t, t_columns, t_ref_columns = mk_table(meta, header.mitm, he_concept, table_name, {
+ 'kind': lambda: ('kind', sa.Column('kind', MITMDataType.Text.sa_sql_type, nullable=False)),
+ 'type': lambda: (concept_properties.typing_concept, sa.Column(concept_properties.typing_concept,
+ MITMDataType.Text.sa_sql_type,
+ nullable=False)),
+ 'identity': lambda: [(name, sa.Column(name, dt.sa_sql_type, nullable=False)) for
+ name, dt in
+ mitm_def.resolve_identity_type(he_concept).items()],
+ 'inline': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for name, dt in
+ mitm_def.resolve_inlined_types(he_concept).items()],
+ 'foreign': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for _, resolved_fk in
+ mitm_def.resolve_foreign_types(he_concept).items() for name, dt in
+ resolved_fk.items()],
+ 'attributes': lambda: [(name, sa.Column(name, dt.sa_sql_type)) for name, dt in
+ zip(he.attributes, he.attribute_dtypes)],
+ }, additional_schema_item_maker=foreign_key_constraints)
+
+ if he_concept not in tables:
+ tables[he_concept] = {}
+ tables[he_concept][he.type_name] = t
+
+ # for concept, members in concept_level_view_members.items():
+
+ # view_selection = sa.union_all(*(sa.select(*pk_cols) for pk_cols in members))
+
+ # views[concept] = view.create_materialized_view(mk_concept_table_name(header.mitm, concept), view_selection,
+
+ # meta)
+
+ return meta, tables # , views
+
+
+def insert_db_instances(engine: sa.Engine, meta: sa.MetaData, mitm_data: MITMData):
+ with engine.connect() as conn:
+ mitm = mitm_data.header.mitm
+
+ for concept, df in mitm_data:
+ concept_table = mk_concept_table_name(mitm, concept)
+ t_concept = meta.tables[concept_table]
+        # NOTE(review): pick_table_pk result was computed but never used here; dropped the dead assignment
+ parent_insert = t_concept.insert().values(df[[c.name for c in t_concept.columns]].to_dict('records'))
+ conn.execute(parent_insert)
+
+ if has_type_tables(mitm, concept):
+ concept_properties, concept_relations = get_mitm_def(mitm).get(concept)
+ for typ, idx in df.groupby(concept_properties.typing_concept).groups.items():
+ type_df = df.loc[idx]
+ t_type = meta.tables[mk_type_table_name(mitm, concept, str(typ))]
+ sub_insert = t_type.insert().values(type_df[[c.name for c in t_type.columns]].to_dict('records'))
+ conn.execute(sub_insert)
+ conn.commit()
+
+
+def insert_mitm_data(engine: sa.Engine, mitm_data: MITMData) -> tuple[
+ sa.MetaData, dict[ConceptName, dict[str, sa.Table]]]:
+ meta, tables = mk_db_schema(mitm_data.header)
+ meta.create_all(engine)
+ insert_db_instances(engine, meta, mitm_data)
+ return meta, tables
-def mk_sqlite(header: Header, file_path: str | None = ':memory:'):
+def mk_sqlite(mitm_data: MITMData, file_path: str | None = ':memory:') -> tuple[
+ sa.Engine, sa.MetaData, dict[ConceptName, dict[str, sa.Table]]]:
engine = create_sa_engine(Url(f'sqlite:///{file_path}'))
- meta, tables, views = mk_db_schema(header)
+ meta, tables = insert_mitm_data(engine, mitm_data)
print(meta.tables)
- print([f'{t.name}: {t.columns} {t.constraints}' for ts in tables.values() for t in ts])
- print([f'{t.name}: {t.columns} {t.constraints}' for t in views.values()])
+ print([f'{t.name}: {t.columns} {t.constraints}' for ts in tables.values() for t in ts.values()])
meta.create_all(engine)
+ return engine, meta, tables
diff --git a/mitm_tooling/utilities/python_utils.py b/mitm_tooling/utilities/python_utils.py
index dd3385b4dc43f7d16642e707762fd727d8f9be37..16f9be878379030f99db9f47f72f78f5b562cd74 100644
--- a/mitm_tooling/utilities/python_utils.py
+++ b/mitm_tooling/utilities/python_utils.py
@@ -1,3 +1,4 @@
+from collections.abc import Sequence, Mapping
from typing import TypeVar, Hashable, Iterable, Callable, Any
@@ -66,7 +67,7 @@ def elem_wise_eq(it1: Iterable, it2: Iterable) -> Iterable[bool]:
return map(lambda elems: elems[0] == elems[1], zip(it1, it2))
-def grouped(it: Iterable[tuple[K, Any]]) -> dict[K, Any]:
+def grouped(it: Iterable[tuple[K, T]]) -> dict[K, list[T]]:
res = {}
for k, v in it:
if k not in res:
@@ -84,6 +85,10 @@ def inner_list_concat(d1: dict[K, list[Any]], d2: dict[K, list[Any]]) -> dict[K,
return res
+def pick_from_mapping(d: Mapping[K, T], keys: Sequence[K]) -> list[tuple[K, T]]:
+ return [(k, d[k]) for k in keys]
+
+
class ExtraInfoExc(Exception):
def __init__(self, msg=None):
super().__init__()
diff --git a/mitm_tooling/utilities/sql_utils.py b/mitm_tooling/utilities/sql_utils.py
index 3dff0c2602f130411aef7458ea0ab297e2caaf4e..9e6fbf859af979d79d2318c2397125732898713f 100644
--- a/mitm_tooling/utilities/sql_utils.py
+++ b/mitm_tooling/utilities/sql_utils.py
@@ -1,3 +1,8 @@
+import sqlalchemy as sa
+from pydantic import AnyUrl
+from sqlalchemy import Engine
+
+
def qualify(*, table: str, schema: str | None = None, column: str | None = None):
res = table
if schema is not None:
@@ -11,3 +16,8 @@ def unqualify(n: str) -> list[str]:
return n.split('.')
+def create_sa_engine(db_url: AnyUrl, sqlite_extensions: list[str] | None = None, test_engine: bool = False,
+ **engine_kwargs) -> Engine:
+ engine = sa.create_engine(str(db_url), **engine_kwargs)
+
+ return engine
diff --git a/pyproject.toml b/pyproject.toml
index cacd92a12339a775d9da44abc3df7bc0baaa62d2..50ea5be887e024bd0c0c9e17fc61b7b8d5546484 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,13 +1,13 @@
[tool.poetry]
name = "mitm-tooling"
-version = "0.2.4"
+version = "0.2.6"
description = ""
authors = ["Leah Tacke genannt Unterberg <leah.tgu@pads.rwth-aachen.de>"]
readme = "README.md"
packages = [{ include = "mitm_tooling" }]
[tool.poetry.dependencies]
-python = ">=3.11,<3.13"
+python = ">=3.11,<3.14"
pydantic = "^2.9.2"
pyyaml = "6.0.2"
genson = "^1.3.0"
diff --git a/test/something.py b/test/something.py
index 670b27b076d14489dcbf0cc6de06f2f1aa3afb4c..e2adb11ccd43e01ec1f509cbf364bd351843c7c8 100644
--- a/test/something.py
+++ b/test/something.py
@@ -16,32 +16,37 @@ class MyTestCase(unittest.TestCase):
from mitm_tooling.data_types import MITMDataType
h = Header(mitm=MITM.MAED, header_entries=[
HeaderEntry(concept='measurement', kind='M', type_name='A', attributes=['x'],
- attribute_dtypes=[MITMDataType.Datetime]),
+ attribute_dtypes=[MITMDataType.Numeric]),
HeaderEntry(concept='segment', kind='S', type_name='annotation', attributes=[],
attribute_dtypes=[]),
HeaderEntry(concept='segment_data', kind='SD', type_name='annotation_info', attributes=['y'],
attribute_dtypes=[MITMDataType.Json]),
])
- meta, tables, views = mk_db_schema(h)
+ meta, tables = mk_db_schema(h)
print(meta)
print()
print(tables)
print()
- print(views)
def test_writing_sqlite(self):
- from mitm_tooling.representation import Header, HeaderEntry, mk_db_schema
+ from mitm_tooling.representation import Header, HeaderEntry, mk_db_schema, MITMData
from mitm_tooling.definition import MITM
from mitm_tooling.data_types import MITMDataType
h = Header(mitm=MITM.MAED, header_entries=[
HeaderEntry(concept='measurement', kind='M', type_name='A', attributes=['x'],
- attribute_dtypes=[MITMDataType.Datetime]),
+ attribute_dtypes=[MITMDataType.Numeric]),
HeaderEntry(concept='segment', kind='S', type_name='annotation', attributes=[],
attribute_dtypes=[]),
HeaderEntry(concept='segment_data', kind='SD', type_name='annotation_info', attributes=['y'],
attribute_dtypes=[MITMDataType.Json]),
])
- mk_sqlite(h, file_path='gendb.sqlite')
+ mk_sqlite(MITMData(header=h), file_path='gendb.sqlite')
+
+ def test_with_synthetic(self):
+ from mitm_tooling.io import importing
+ from mitm_tooling.definition import MITM
+ syn = importing.read_zip('synthetic.maed', MITM.MAED)
+ mk_sqlite(syn, 'synthetic.sqlite')
if __name__ == '__main__':
unittest.main()