diff --git a/Water quality calibration/optimization_prep.ipynb b/Water quality calibration/optimization_prep.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..22e9339b8670fbfa63461c9f9492b952a7d9a3f9
--- /dev/null
+++ b/Water quality calibration/optimization_prep.ipynb
@@ -0,0 +1,840 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prep work for optimization with mikepluspy.\n",
+    "Adjusts the time shift and generates a configuration table."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import typing\n",
+    "import datetime\n",
+    "from warnings import warn\n",
+    "from pathlib import Path\n",
+    "from dataclasses import dataclass\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "import hydroeval as hde\n",
+    "import sqlalchemy as sa\n",
+    "from scipy.ndimage import gaussian_filter1d\n",
+    "from scipy import signal\n",
+    "from tqdm.notebook import tqdm\n",
+    "from mikeio1d.res1d import Res1D"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# time string for output folder\n",
+    "time_path_part = datetime.datetime.now().strftime(\"%Y-%m-%dT%H.%M.%S\")\n",
+    "# output folder\n",
+    "out_folder = Path(\"out_\" + time_path_part)\n",
+    "model_folder = Path(\"model_folder_path_relative_to_script/\")\n",
+    "# path of .mupp file, relative to model_folder\n",
+    "db_path_rel = \"example.mupp\"\n",
+    "# define timezone of data\n",
+    "timezone = \"UTC+01:00\"\n",
+    "# define name of column in sensor data for flow\n",
+    "hd_col = \"q_lps\"\n",
+    "# define name of column in sensor data for concentration\n",
+    "conc_col = \"conc_mgpl\"\n",
+    "# this timestep is assumed for all reference/result timeseries\n",
+    "timestep_str = \"1min\"\n",
+    "timestep = pd.to_timedelta(timestep_str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create output folder\n",
+    "out_folder.mkdir(exist_ok=True, parents=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# In this project the data came from an SQL database; this can also be changed to table data further below.\n",
+    "# If you don't use an SQL database you need to change \"def calc_plot\"\n",
+    "# and also adjust the loop: \"for label in optim_parts.index:\"\n",
+    "# connection to the database used to fetch data\n",
+    "db_engine = sa.create_engine(\n",
+    "    \"insert connection to SQL database here\"\n",
+    ")"
+   ]
+  },
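+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The queries further below assume a table `readings_smoothed` with columns `sensor_name`, `datetime`, `q_lps` and `conc_mgpl`, plus a table `anomaly_scores` with `sensor_name`, `datetime`, `parameter` and `score`. The next cell is an optional, minimal sanity check under that assumption; adjust the table/column names and the `LIMIT` syntax to your database."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional sanity check: fetch a single row from the assumed readings_smoothed table.\n",
+    "# If this fails, the connection string or the assumed schema does not match your setup.\n",
+    "with db_engine.begin() as con:\n",
+    "    sample = pd.read_sql_query(\n",
+    "        sa.text(f\"select sensor_name, datetime, {hd_col}, {conc_col} from readings_smoothed limit 1\"),\n",
+    "        con,\n",
+    "    )\n",
+    "display(sample)"
+   ]
+  },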
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This excel table tells us which times are to be used for calibration for each sensor and event.\n",
+    "# The table only contains times if the (event,sensor) pair should be used; otherwise the values are empty.\n",
+    "# The original excel file contains the sensor in the first header line, the kind of column in the second header line\n",
+    "# and the event as the first column. Stack puts the sensor in a column and also removes values without times for us.\n",
+    "# This means that event_times only contains the needed (event,sensor) pairs after this line.\n",
+    "event_times = pd.read_excel(model_folder/\"calibration_event_times.xlsx\",header=[0,1],index_col=[0]).stack(level=0)\n",
+    "event_times.index.names = [\"event\",\"sensor\"]\n",
+    "display(event_times.reset_index().dtypes)\n",
+    "for dtcol in [\"start\",\"end\"]:\n",
+    "    event_times[dtcol]=event_times[dtcol].dt.tz_localize(timezone)\n",
+    "for idx in event_times.index:\n",
+    "    start=event_times.loc[idx,\"start\"]\n",
+    "    end=event_times.loc[idx,\"end\"]\n",
+    "    start_round = start.round(timestep)\n",
+    "    end_round = end.round(timestep)\n",
+    "    if start!=start_round:\n",
+    "        warn(f\"for {idx} start ({start}) does not fit timestep ({timestep}), rounding to {start_round}\")\n",
+    "        event_times.loc[idx,\"start\"]=start_round\n",
+    "    if end!=end_round:\n",
+    "        warn(f\"for {idx} end ({end}) does not fit timestep ({timestep}), rounding to {end_round}\")\n",
+    "        event_times.loc[idx,\"end\"]=end_round\n",
+    "event_times"
+   ]
+  },
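+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a minimal illustration of the stacking step above (with made-up sensor and event names): a two-level column header is reshaped so the first header level becomes part of the row index, and rows without times are dropped."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# toy example: columns are (sensor, kind-of-column), index is the event\n",
+    "toy = pd.DataFrame(\n",
+    "    {(\"sensorA\",\"start\"): [pd.Timestamp(\"2024-01-01 10:00\"), pd.NaT],\n",
+    "     (\"sensorA\",\"end\"):   [pd.Timestamp(\"2024-01-01 12:00\"), pd.NaT],\n",
+    "     (\"sensorB\",\"start\"): [pd.NaT, pd.Timestamp(\"2024-02-01 08:00\")],\n",
+    "     (\"sensorB\",\"end\"):   [pd.NaT, pd.Timestamp(\"2024-02-01 09:00\")]},\n",
+    "    index=pd.Index([\"event1\",\"event2\"], name=\"event\"),\n",
+    ")\n",
+    "# stack(level=0) moves the sensor level into the row index and drops all-NaT rows,\n",
+    "# leaving exactly the usable (event, sensor) pairs\n",
+    "toy.stack(level=0)"
+   ]
+  },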
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This table defines for each concentration sensor where it is positioned within the model.\n",
+    "# Make sure that the sensor name in the defined excel files corresponds to the names in the SQL database.\n",
+    "# It also contains the position and name of a discharge sensor, which is used to adjust the time shift later.\n",
+    "# Note that chainages are only approximate.\n",
+    "sensor_positions = pd.read_excel(model_folder/\"sensor_data.xlsx\",sheet_name=\"sensor_position\",index_col=\"sensor\")\n",
+    "sensor_positions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read the table defining which event is included in which result file\n",
+    "event_to_file = pd.read_excel(model_folder/\"event_dict.xlsx\",index_col=\"event\")[\"result file\"]\n",
+    "event_to_file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_result(file, reach, node, column, timestep, start, end) -> pd.Series:\n",
+    "    \"\"\"Read the result with specified parameters.\n",
+    "    returns:\n",
+    "        a pandas Series with Datetime as index.\n",
+    "    \"\"\"\n",
+    "    # file_path is read from the event table\n",
+    "    # reach/node is used to read only a part of the res1d file\n",
+    "    df = Res1D(file_path=str(model_folder / file), reaches=[reach], nodes=[node]).read_all()\n",
+    "    # reduce result to desired column\n",
+    "    series = df[column]\n",
+    "    series.index = series.index.tz_localize(timezone)\n",
+    "    # resample to be comparable to results\n",
+    "    # TODO: sensible resampling or raise error\n",
+    "    # Note: Linear interpolation with pandas first throws away all values where the index does not match the frequency (?!).\n",
+    "    full_idx = pd.date_range(start, end, freq=timestep)\n",
+    "    series = series.reindex(full_idx, method=\"nearest\")\n",
+    "    # set index name\n",
+    "    series.index.name = \"Datetime\"\n",
+    "    return series\n",
+    "\n",
+    "\n",
+    "def make_conc_result_spec(ev, sensor):\n",
+    "    \"\"\"Defines parameters which define how to read the simulation result for an event from the specified concentration sensor.\"\"\"\n",
+    "    file = f\"Results/{event_to_file[ev]}BaseDefault_Network_AD.res1d\"\n",
+    "    reach = sensor_positions.loc[sensor, \"reach\"]\n",
+    "    # Column names feature very exact chainages, which we do not know beforehand.\n",
+    "    # This tries to find the column with the most similar chainage.\n",
+    "    approx_chainage = sensor_positions.loc[sensor, \"chainage\"]\n",
+    "    col_start = f\"AFS:{reach}:\"\n",
+    "    possible_columns = [\n",
+    "        c\n",
+    "        for c in Res1D(str(model_folder / file), reaches=[reach]).read_all().columns\n",
+    "        if str(c).startswith(col_start)\n",
+    "    ]\n",
+    "\n",
+    "    def chainage_diff(col: str):\n",
+    "        return abs(approx_chainage - float(col.removeprefix(col_start)))\n",
+    "\n",
+    "    best_col = min(possible_columns, key=chainage_diff)\n",
+    "    print(\n",
+    "        f\"\"\"\n",
+    "        approx_chainage: {approx_chainage}\n",
+    "        possible_columns: {possible_columns}\n",
+    "        best_col: {best_col}\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "    return dict(\n",
+    "        file=file,\n",
+    "        reach=reach,\n",
+    "        node=\"\",\n",
+    "        column=best_col,\n",
+    "        timestep=timestep,\n",
+    "        start=event_times.loc[(ev, sensor), \"start\"],\n",
+    "        end=event_times.loc[(ev, sensor), \"end\"],\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "def make_hd_result_spec(ev, sensor):\n",
+    "    \"\"\"Defines parameters which define how to read the simulation result for an event from the specified discharge sensor.\"\"\"\n",
+    "    file = f\"Results/{event_to_file[ev]}BaseDefault_Network_HD.res1d\"\n",
+    "    reach = sensor_positions.loc[sensor, \"hd_reach\"]\n",
+    "    # Column names feature very exact chainages, which we do not know beforehand.\n",
+    "    # This tries to find the column with the most similar chainage.\n",
+    "    approx_chainage = sensor_positions.loc[sensor, \"hd_chainage\"]\n",
+    "    col_start = f\"Discharge:{reach}:\"\n",
+    "    possible_columns = [\n",
+    "        c\n",
+    "        for c in Res1D(str(model_folder / file), reaches=[reach]).read_all().columns\n",
+    "        if str(c).startswith(col_start)\n",
+    "    ]\n",
+    "\n",
+    "    def chainage_diff(col: str):\n",
+    "        return abs(approx_chainage - float(col.removeprefix(col_start)))\n",
+    "\n",
+    "    best_col = min(possible_columns, key=chainage_diff)\n",
+    "    print(\n",
+    "        f\"\"\"\n",
+    "        approx_chainage: {approx_chainage}\n",
+    "        possible_columns: {possible_columns}\n",
+    "        best_col: {best_col}\n",
+    "        \"\"\"\n",
+    "    )\n",
+    "    return dict(\n",
+    "        file=file,\n",
+    "        reach=reach,\n",
+    "        node=\"\",\n",
+    "        column=best_col,\n",
+    "        timestep=timestep,\n",
+    "        start=event_times.loc[(ev, sensor), \"start\"],\n",
+    "        end=event_times.loc[(ev, sensor), \"end\"],\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "def make_conc_reference_spec(ev, sensor):\n",
+    "    \"\"\"Defines parameters which define how to read the reference values for an event and specified concentration sensor from the database.\"\"\"\n",
+    "    return {\n",
+    "        \"sensor\": sensor,\n",
+    "        \"start\": event_times.loc[(ev, sensor), \"start\"],\n",
+    "        \"end\": event_times.loc[(ev, sensor), \"end\"],\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "def make_hd_reference_spec(ev, sensor):\n",
+    "    \"\"\"Defines parameters which define how to read the reference values for an event and specified discharge sensor from the database.\"\"\"\n",
+    "    return {\n",
+    "        \"sensor\": sensor_positions.loc[sensor, \"hd_sensor\"],\n",
+    "        \"start\": event_times.loc[(ev, sensor), \"start\"],\n",
+    "        \"end\": event_times.loc[(ev, sensor), \"end\"],\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "def normalize_max(series):\n",
+    "    \"\"\"Normalize a time series so that the maximum value is 1.\"\"\"\n",
+    "    max_val = np.max(series)\n",
+    "    if max_val == 0:\n",
+    "        max_val = 1\n",
+    "    return series / max_val"
+   ]
+  },
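+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The chainage matching above picks, among result columns like `AFS:reach:123.45`, the one whose chainage is numerically closest to the approximate sensor chainage. A tiny self-contained sketch with made-up column names:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# made-up result columns for one reach; the sensor sits at chainage ~120 m\n",
+    "demo_cols = [\"AFS:reach1:0.000\", \"AFS:reach1:118.732\", \"AFS:reach1:240.511\"]\n",
+    "demo_prefix = \"AFS:reach1:\"\n",
+    "approx = 120.0\n",
+    "best = min(demo_cols, key=lambda c: abs(approx - float(c.removeprefix(demo_prefix))))\n",
+    "print(best)  # -> AFS:reach1:118.732"
+   ]
+  },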
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create the result configuration table for optimization\n",
+    "# This table defines which simulations to run and where to find the result for each label.\n",
+    "optim_part_records = []\n",
+    "# event_times contains only the event-sensor pairs which have usable data\n",
+    "for event,sensor in tqdm(event_times.index):\n",
+    "    print(event,sensor)\n",
+    "    try:\n",
+    "        result_spec = make_conc_result_spec(event,sensor)\n",
+    "        optim_part_records.append({\n",
+    "            \"Label\":f\"{event}-{sensor}\",\n",
+    "            \"Event\":event,\n",
+    "            \"Sensor\":sensor,\n",
+    "            \"EventWeight\":1,\n",
+    "            \"Start\":event_times.loc[(event,sensor),\"start\"],\n",
+    "            \"End\":event_times.loc[(event,sensor),\"end\"],\n",
+    "            \"Timestep\":timestep_str,\n",
+    "            \"SimulationName\":f\"{event_to_file[event]}\",\n",
+    "            \"ResultFile\":result_spec[\"file\"],\n",
+    "            \"ResultReach\":result_spec[\"reach\"],\n",
+    "            \"ResultNode\":result_spec[\"node\"],\n",
+    "            \"ResultColumn\":result_spec[\"column\"],\n",
+    "        })\n",
+    "    except Exception as ex:\n",
+    "        print(ex)\n",
+    "\n",
+    "optim_parts = pd.DataFrame(optim_part_records)\n",
+    "optim_parts = optim_parts.set_index(\"Label\")\n",
+    "optim_parts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# The hydrodynamics simulation may be shifted in time. The water quality simulation depends on the hydrodynamics simulation and has the same lag.\n",
+    "# Since adjusting quality parameters cannot fix this time lag, we calculate the lag for the hydrodynamics\n",
+    "# and then shift the quality reference data by the same lag.\n",
+    "# The lag is calculated with cross correlation. Since the lag may vary over time,\n",
+    "# we use cross correlation over a window around each value to calculate a lag for each time point.\n",
+    "# The lag is then smoothed because it does not make sense for the lag to change suddenly.\n",
+    "\n",
+    "\n",
+    "def adjust_lag(series, lag):\n",
+    "    \"\"\"Shifts time in a series to account for lag.\n",
+    "    series: values we want to adjust for lag\n",
+    "    lag: time the series lags behind the result\n",
+    "    returns: series with time shifted according to lag.\n",
+    "    Note that the nearest value is used if a timestamp does not exist in the original data.\n",
+    "    \"\"\"\n",
+    "    assert isinstance(series.index, pd.DatetimeIndex), \"Need DatetimeIndex in series\"\n",
+    "    assert isinstance(lag.index, pd.DatetimeIndex), \"Need DatetimeIndex in lag\"\n",
+    "    assert (\n",
+    "        len(series.index.difference(lag.index)) == 0\n",
+    "    ), \"need lag value for each element of series\"\n",
+    "    assert lag.notna().all(), \"lag cannot be nan\"\n",
+    "    # Use .array to get the data without index, since we want to adjust time.\n",
+    "    # Otherwise we would shift the order of the data, but it would still be associated with the old timestamp.\n",
+    "    series_shifted = pd.Series(\n",
+    "        series.reindex(series.index - lag[series.index], method=\"nearest\").array,\n",
+    "        index=series.index,\n",
+    "    )\n",
+    "    # Modify the name of the series; this is useful when using plot() for both the original and the shifted data.\n",
+    "    series_shifted.name = series.name + \"_shift\"\n",
+    "    return series_shifted[series.index]\n",
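+    "\n",
+    "\n",
+    "# Note on semantics (illustrative): adjust_lag builds its output by reading, for each\n",
+    "# output timestamp t, the nearest available reference value at time t - lag[t].\n",
+    "# With a positive lag this moves reference features later in time, towards the simulated event.\n",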
+    "\n",
+    "\n",
+    "def adjust_start_end(start, end, lag):\n",
+    "    \"\"\"Adjust start and end according to lag. Also rounds to the timestep, so that all needed time points are included.\n",
+    "    This is used because the reference values contain some kind of notable features. After shifting the reference values in time,\n",
+    "    the reference values should still contain the same features.\n",
+    "    returns: a tuple (start,end) of adjusted times.\n",
+    "    \"\"\"\n",
+    "    start_lag, end_lag = lag.reindex([start, end], method=\"nearest\")\n",
+    "    start_shifted, end_shifted = start + start_lag, end + end_lag\n",
+    "    # round start down and end up\n",
+    "    start_rounded = start_shifted.round(timestep)\n",
+    "    if start_rounded > start_shifted:\n",
+    "        start_rounded -= timestep\n",
+    "    end_rounded = end_shifted.round(timestep)\n",
+    "    if end_rounded < end_shifted:\n",
+    "        end_rounded += timestep\n",
+    "    return start_rounded, end_rounded\n",
+    "\n",
+    "\n",
+    "# Cross correlation for a window around each point, to search where the part of the simulation result is located in the reference.\n",
+    "\n",
+    "# put lags for each event into a dictionary\n",
+    "reference_lags = {}\n",
+    "\n",
+    "# This procedure was written as a class to make it simpler to experiment with various parameters.\n",
+    "# Dataclasses are a Python feature which helps us avoid some boilerplate code, like writing an __init__ function.\n",
+    "@dataclass\n",
+    "class CcCalculator:\n",
+    "    \"\"\"Class to compute and plot cross correlation for an event. Label is the only required parameter, everything else has default values.\"\"\"\n",
+    "\n",
+    "    # event for which cross correlation is computed\n",
+    "    label: str\n",
+    "    # Size of the window for which cross correlation is computed. Is passed to pd.to_timedelta() for computation.\n",
+    "    # Used in plot annotation as is.\n",
+    "    window: str = \"2h\"\n",
+    "    # Tuple of times (as timestep indices) where a window and cross correlation may be plotted.\n",
+    "    # Invalid times cause a warning and are otherwise ignored.\n",
+    "    win_times: typing.Union[tuple[int, ...], typing.Literal[\"peak\"]] = ()\n",
+    "    # whether to plot window at win_times\n",
+    "    plt_win: bool = True\n",
+    "    # whether to plot correlation at win_times\n",
+    "    plt_corr: bool = True\n",
+    "    # whether to plot the lag\n",
+    "    plt_lag: bool = True\n",
+    "    # whether to plot the warped (lag corrected) reference with the result\n",
+    "    plt_warp: bool = True\n",
+    "    # Set a lower limit for flow; below the limit no lag is calculated.\n",
+    "    # If we assume that the time lag mostly comes from location differences of the rain measurement station, it makes no sense to calculate the time shift from dry weather.\n",
+    "    # Additionally the influence of noise may be larger for low flow.\n",
+    "    min_q_res: typing.Union[float, str] = 0.0\n",
+    "\n",
+    "    # Mode to use for cross correlation. \"full\" or \"same\" uses zero padding. Zero padding makes values at the edges smaller, which is desirable for our application.\n",
+    "    # When win_pad (see below) >= max_lag_i, \"same\" is an efficient mode; otherwise \"full\" is needed.\n",
+    "    # \"valid\" does not make sense here: cross correlation is a sum of products, it is largest where the largest values of one window match the largest values of the other window.\n",
+    "    # Therefore maximizing cross correlation only finds the lag if the compared arrays have a similar shape. If one of our windows is wider than the other and the wider window has a large peak\n",
+    "    # that is as wide as the thinner window, cross correlation will be maximized wherever the thinner window is inside the peak, even if the shape does not match at all.\n",
+    "    correlation_mode: str = \"same\"\n",
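+    "    # Illustrative example (made-up numbers) of the scipy calls used in calc_plot below:\n",
+    "    #   a = [0, 1, 2, 1, 0]; b = [1, 2, 1, 0, 0]  # a is b delayed by one step\n",
+    "    #   lags = signal.correlation_lags(len(a), len(b), mode=\"same\")\n",
+    "    #   lags[np.argmax(signal.correlate(a, b, mode=\"same\"))]  # -> 1: a lags b by one timestep\n",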
+    "    # width for the plot, ignored if a fig is passed\n",
+    "    width: float = 12\n",
+    "    # A height is effectively 0 if the corresponding part is not plotted.\n",
+    "    # height for a window plot, in inches.\n",
+    "    height_win: float = 1\n",
+    "    # height for a correlation plot, in inches.\n",
+    "    height_corr: float = 1\n",
+    "    # height for the lag plot, in inches.\n",
+    "    height_lag: float = 1.5\n",
+    "    # height for the warp plot, in inches.\n",
+    "    height_warp: float = 5\n",
+    "\n",
+    "    def estimate_height(self):\n",
+    "        \"\"\"Estimate how high the plot should be, in inches.\"\"\"\n",
+    "        # \"peak\" yields exactly one window example\n",
+    "        win_count = 1 if self.win_times == \"peak\" else len(self.win_times)\n",
+    "        return (\n",
+    "            self.height_win * self.plt_win * win_count\n",
+    "            + self.height_corr * self.plt_corr * win_count\n",
+    "            + self.plt_lag * self.height_lag\n",
+    "            + self.plt_warp * self.height_warp\n",
+    "        )\n",
+    "\n",
+    "    def calc_plot(self, fig=None):\n",
+    "        \"\"\"Calculates the lag and plots a figure. If a figure is passed, plots on new axes inside that figure, otherwise creates a new figure.\n",
+    "        Returns: The lag, a figure and some statistics.\"\"\"\n",
+    "        event = optim_parts.loc[self.label, \"Event\"]\n",
+    "        sensor = optim_parts.loc[self.label, \"Sensor\"]\n",
+    "        hd_result_spec = make_hd_result_spec(event, sensor)\n",
+    "        hd_reference_spec = make_hd_reference_spec(event, sensor)\n",
+    "        start = optim_parts.loc[self.label, \"Start\"]\n",
+    "        end = optim_parts.loc[self.label, \"End\"]\n",
+    "        # add padding, so that later start and end can be shifted\n",
+    "        pad = pd.to_timedelta(self.window)\n",
+    "        start_og = start\n",
+    "        end_og = end\n",
+    "        start -= pad\n",
+    "        end += pad\n",
+    "        hd_result_spec[\"start\"] = start\n",
+    "        hd_reference_spec[\"start\"] = start\n",
+    "        hd_result_spec[\"end\"] = end\n",
+    "        hd_reference_spec[\"end\"] = end\n",
+    "        # Just in case the database or simulation file misses some results, we use the index with all times for both.\n",
+    "        full_idx = pd.date_range(start, end, freq=timestep)\n",
+    "        result_hd = read_result(**hd_result_spec)\n",
+    "        result_hd = result_hd.rename(\"simulation result\")\n",
+    "\n",
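+    "        # Read the reference discharge over the padded window, so that the cross-correlation\n",
+    "        # windows near the event edges also have measured data to compare against.\n",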
+    "        with db_engine.begin() as con:\n",
+    "            reference_hd = pd.read_sql_query(\n",
+    "                sa.text(\n",
+    "                    f\"select datetime, {hd_col} from readings_smoothed where sensor_name = :sensor and datetime>=:start and datetime<=:end\"\n",
+    "                ),\n",
+    "                con,\n",
+    "                params=hd_reference_spec,\n",
+    "                index_col=\"datetime\",\n",
+    "            )[hd_col].reindex(full_idx)\n",
+    "        # TODO: check times in database vs event table\n",
+    "        if not reference_hd.notna().all():\n",
+    "            warn(f\"\"\"readings_smoothed should contain no missing values. Check database.\n",
+    "            Database may not contain enough padding. Needed: {self.window}.\n",
+    "            First to last valid needed: {start} to {end}\n",
+    "            First to last valid found: {reference_hd.first_valid_index()} to {reference_hd.last_valid_index()}\"\"\")\n",
+    "            reference_hd = reference_hd.interpolate().ffill().bfill()\n",
+    "        # adjust unit (q_lps is in l/s; dividing by 1000 gives m3/s to match the simulated discharge)\n",
+    "        reference_hd = reference_hd / 1000\n",
+    "        reference_hd = reference_hd.rename(\"reference\")\n",
+    "        assert reference_hd.index.equals(result_hd.index), \"Index needs to match\"\n",
+    "        result_numpy = result_hd.to_numpy().astype(\"double\")\n",
+    "        reference_numpy = reference_hd.to_numpy().astype(\"double\")\n",
+    "        if isinstance(self.min_q_res, str) and self.min_q_res.endswith(\"%\"):\n",
+    "            q = float(self.min_q_res.removesuffix(\"%\")) / 100\n",
+    "            min_q_res = result_hd.quantile(q)\n",
+    "        else:\n",
+    "            min_q_res = float(self.min_q_res)\n",
+    "        # set the maximum amount of lag in number of timesteps\n",
+    "        max_lag_i = int(np.rint(pd.to_timedelta(self.window) / timestep))\n",
+    "        # set the size of the window as distance from the middle to the edges\n",
+    "        win_pad = max_lag_i\n",
+    "        # save the lag for each point in a series\n",
+    "        lag = pd.Series(\n",
+    "            data=pd.to_timedelta(pd.NA),\n",
+    "            index=full_idx,\n",
+    "            name=\"lag\",\n",
+    "            dtype=\"timedelta64[ns]\",\n",
+    "        )\n",
+    "        # keep some examples of correlation for plotting\n",
+    "        if self.win_times == \"peak\":\n",
+    "            win_times = (np.argmax(result_numpy),)\n",
+    "        else:\n",
+    "            win_times = self.win_times\n",
+    "        cor_examples = []\n",
+    "        # check if examples are valid:\n",
+    "        for t_exp in win_times:\n",
+    "            if t_exp < 0 or t_exp >= len(lag):\n",
+    "                warn(\n",
+    "                    f\"t={t_exp} needs to be in range [0,{len(lag)-1}]. Label {self.label}\"\n",
+    "                )\n",
+    "        # calculate lag for each position\n",
+    "        for t in range(len(lag)):\n",
+    "            if result_numpy[t] >= min_q_res:\n",
+    "                # Start and end of window; make sure indices are inside the valid range.\n",
+    "                win_start = max(0, t - win_pad)\n",
+    "                win_end = min(t + win_pad, len(result_numpy) - 1)\n",
+    "                # extract window data\n",
+    "                res_part = result_numpy[win_start : win_end + 1]\n",
+    "                ref_part = reference_numpy[win_start : win_end + 1]\n",
+    "                # use scipy.signal to calculate the correlation\n",
+    "                correlation = signal.correlate(\n",
+    "                    res_part, ref_part, mode=self.correlation_mode\n",
+    "                )\n",
+    "                # signal.correlation_lags tells us how the index of the correlation result matches a lag in the original data.\n",
+    "                # For our cases the lags should just be an array of the form [-n,...,0,...,n], but this method handles edge cases\n",
+    "                # and different correlation_modes for us.\n",
+    "                all_lags = signal.correlation_lags(\n",
+    "                    res_part.size, ref_part.size, mode=self.correlation_mode\n",
+    "                )\n",
+    "                # use is_lag_valid to only search lags smaller than the limit for the best result\n",
+    "                is_lag_valid = np.abs(all_lags) <= max_lag_i\n",
+    "                idx_lag = all_lags[is_lag_valid][np.argmax(correlation[is_lag_valid])]\n",
+    "                lag.iloc[t] = idx_lag * timestep\n",
+    "                # Put data from some steps into cor_examples for plotting them later; examples are configured by self.win_times.\n",
+    "                if t in win_times:\n",
+    "                    cor_examples.append(\n",
+    "                        dict(\n",
+    "                            t=t,\n",
+    "                            win_start=win_start,\n",
+    "                            win_end=win_end,\n",
+    "                            res_part=res_part,\n",
+    "                            ref_part=ref_part,\n",
+    "                            correlation=correlation,\n",
+    "                            all_lags=all_lags,\n",
+    "                            is_lag_valid=is_lag_valid,\n",
+    "                            idx_lag=idx_lag,\n",
+    "                        )\n",
+    "                    )\n",
+    "        raw_lag = lag.copy().rename(\"raw lag\")\n",
+    "        # interpolate nans, fill start/end with nearest value\n",
+    "        lag = lag.interpolate().bfill().ffill().fillna(pd.to_timedelta(\"0s\"))\n",
+    "        # Smooth the lag; this requires conversion to float, use timestep for that.\n",
+    "        # The smoothing corrects random errors and sudden shifts in the lag.\n",
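+    "        # sigma = max_lag_i/6 ties the smoothing width to the search window:\n",
+    "        # about 99.7% of the Gaussian weight lies within +-max_lag_i/2 timesteps (3 sigma).\n",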
" lag = lag.interpolate().bfill().ffill().fillna(pd.to_timedelta(\"0s\"))\n", + " # smooth lag, requires conversion to float, use timestep for that\n", + " # the smooting corrects random errors and sudden shifts in the lag.\n", + " lag.loc[:] = (\n", + " gaussian_filter1d((lag / timestep).to_numpy(), max_lag_i / 6) * timestep\n", + " )\n", + " lag = lag.rename(\"smoothed lag\")\n", + " reference_hd_shifted = adjust_lag(reference_hd, lag)\n", + " # compute nse values for hd data\n", + " # this can be used to estimate wheter shifting improved the model\n", + " # and - after calibration - as reference for nse on concentration\n", + " # remember that start and end are padded, while _og are the oroginal values\n", + " start_shifted, end_shifted = adjust_start_end(start_og, end_og, lag)\n", + " hd_stats = {\n", + " \"nse_og\": hde.nse(\n", + " result_hd[start_og:end_og].to_numpy(), reference_hd[start_og:end_og].to_numpy()\n", + " ),\n", + " \"nse_shifted\": hde.nse(\n", + " result_hd[start_shifted:end_shifted].to_numpy(),\n", + " reference_hd_shifted[start_shifted:end_shifted].to_numpy(),\n", + " ),\n", + " \"neg_bounded_nse_og\": -hde.nse_c2m(\n", + " result_hd[start_og:end_og].to_numpy(), reference_hd[start_og:end_og].to_numpy()\n", + " ),\n", + " \"neg_bounded_nse_shifted\": -hde.nse_c2m(\n", + " result_hd[start_shifted:end_shifted].to_numpy(),\n", + " reference_hd_shifted[start_shifted:end_shifted].to_numpy(),\n", + " ),\n", + " }\n", + "\n", + " # now plot the result\n", + " if fig == None:\n", + " fig = plt.figure(figsize=(self.width, self.estimate_height()))\n", + " # note that parts that are not plotted still get an ax\n", + " # this ax gets a height_ratio of 0 and is later made invisible.\n", + " axs = fig.subplots(\n", + " 2 + 2 * len(cor_examples),\n", + " height_ratios=[\n", + " self.height_win * self.plt_win,\n", + " self.height_corr * self.plt_corr,\n", + " ]\n", + " * len(cor_examples)\n", + " + [self.plt_lag * self.height_lag]\n", + " + [self.plt_warp * self.height_warp],\n", + " )\n", + " # for each correlation example we plot the window with data and the correlation.\n", + " exp_axs = axs[: 2 * len(cor_examples)]\n", + " # go over axes in pairs\n", + " # ::2 mean we use steps of size 2, 1::2 means we start at idx 1 and uses step 2\n", + " for winax, corax, exp in zip(exp_axs[::2], exp_axs[1::2], cor_examples):\n", + " # plot windows\n", + " if self.plt_win:\n", + " win_x = np.arange(exp[\"win_start\"], exp[\"win_end\"] + 1)\n", + " winax.plot(\n", + " win_x, exp[\"res_part\"], label=\"simulation window\", color=\"blue\"\n", + " )\n", + " winax.plot(\n", + " win_x, exp[\"ref_part\"], label=\"reference window\", color=\"black\"\n", + " )\n", + " # mark the time point to which the window refers\n", + " winax.axvline(exp[\"t\"], label=f\"t={exp['t']}\", color=\"darkgray\")\n", + " # set xlims so that the window is positioned relatively to all data for that event.\n", + " winax.set_xlim(0, len(result_numpy))\n", + " winax.legend(ncol=2)\n", + " else:\n", + " winax.set_visible(False)\n", + " # plot correlation\n", + " if self.plt_corr:\n", + " corax.plot(\n", + " exp[\"all_lags\"],\n", + " exp[\"correlation\"],\n", + " label=\"cor(simulation,reference)\",\n", + " color=(0.2, 0.2, 0.6, 1),\n", + " )\n", + " # mark the lag\n", + " corax.axvline(\n", + " exp[\"idx_lag\"],\n", + " label=f\"lag={exp['idx_lag']} timesteps\",\n", + " color=\"orange\",\n", + " )\n", + " corax.legend()\n", + " else:\n", + " corax.set_visible(False)\n", + " # now plot the lag\n", + " ax_lag, ax_shifted = axs[2 
+    "        ax_lag, ax_shifted = axs[2 * len(cor_examples) :]\n",
+    "        if self.plt_lag:\n",
+    "            raw_lag_ts = raw_lag / timestep\n",
+    "            lag_ts = lag / timestep\n",
+    "            raw_lag_ts.plot(\n",
+    "                ax=ax_lag,\n",
+    "                color=(0.5, 0.1, 0.1, 1),\n",
+    "            )\n",
+    "            # first draw the non-interpolated part\n",
+    "            s_lag = lag_ts.copy().rename(\"smoothed lag\")\n",
+    "            s_lag[raw_lag.isna()] = np.nan\n",
+    "            s_lag.plot(ax=ax_lag, color=(0.8, 0.2, 0.2, 0.7))\n",
+    "            # draw the interpolated part, if any\n",
+    "            if raw_lag.isna().any():\n",
+    "                i_lag = lag_ts.copy().rename(\"interpolated lag\")\n",
+    "                i_lag[raw_lag.notna()] = np.nan\n",
+    "                i_lag.plot(ax=ax_lag, color=(0.8, 0.2, 0.2, 0.7), linestyle=\"--\")\n",
+    "            ax_lag.set_title(f\"lag, timestep={timestep}\")\n",
+    "            ax_lag.legend()\n",
+    "        else:\n",
+    "            ax_lag.set_visible(False)\n",
+    "\n",
+    "        # plot the original reference data and the shifted reference data\n",
+    "        if self.plt_warp:\n",
+    "            reference_hd.rename(\"original reference\").plot(\n",
+    "                ax=ax_shifted, color=\"black\", linestyle=\":\"\n",
+    "            )\n",
+    "            reference_hd_shifted.rename(\"warped reference\").plot(\n",
+    "                ax=ax_shifted, color=\"black\"\n",
+    "            )\n",
+    "            # Also plot the simulation result.\n",
+    "            # If our method for lag correction worked, the shifted reference should match the simulation results better than before.\n",
+    "            result_hd.rename(\"simulated\").plot(ax=ax_shifted, color=\"blue\")\n",
+    "            ax_shifted.legend()\n",
+    "            fig.suptitle(\"label \" + str(self.label) + \", window \" + self.window, y=0.95)\n",
+    "        else:\n",
+    "            ax_shifted.set_visible(False)\n",
+    "        return lag, fig, hd_stats\n",
+    "\n",
+    "\n",
+    "hd_stats_dict = {}\n",
+    "for label in tqdm(optim_parts.index):\n",
+    "    lag, fig, part_stats = CcCalculator(label, \"6h\", min_q_res=0.1).calc_plot()\n",
+    "    hd_stats_dict[label] = part_stats\n",
+    "    fig.suptitle(\"Label \" + str(label) + \" lag from cross correlation over window\")\n",
+    "    fig.savefig(out_folder / f\"lag_hd_cc_{label}_.png\")\n",
+    "    reference_lags[label] = lag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# adjust the start and end times of the events according to the lag\n",
+    "for label in optim_parts.index:\n",
+    "    start = optim_parts.loc[label, \"Start\"]\n",
+    "    end = optim_parts.loc[label, \"End\"]\n",
+    "    start_shifted, end_shifted = adjust_start_end(start,end,reference_lags[label])\n",
+    "    optim_parts.loc[label, \"Start\"] = start_shifted\n",
+    "    optim_parts.loc[label, \"End\"] = end_shifted\n",
+    "    optim_parts.loc[label, \"StartOriginal\"] = start\n",
+    "    optim_parts.loc[label, \"EndOriginal\"] = end"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optim_parts[[\"Event\",\"Start\",\"End\",\"StartOriginal\",\"EndOriginal\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# remove timezones for excel\n",
+    "optim_parts_c = optim_parts.copy()\n",
+    "for col in [\"Start\",\"End\",\"StartOriginal\",\"EndOriginal\"]:\n",
+    "    optim_parts_c[col] = optim_parts_c[col].dt.tz_localize(None)\n",
+    "optim_parts_c.to_excel(model_folder/\"optim_parts.xlsx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd_stats=pd.DataFrame(hd_stats_dict)\n",
+    "hd_stats[\"mean\"]=hd_stats.mean(axis=\"columns\")\n",
+    "hd_stats = hd_stats.transpose()\n",
+    "hd_stats.to_excel(model_folder/\"hd_stats.xlsx\")\n",
+    "hd_stats"
+   ]
+  },
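+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reading the table above: hydroeval's bounded NSE (`nse_c2m`) relates to the plain NSE via C2M = NSE / (2 - NSE). A quick synthetic check of that relation (illustrative only, made-up numbers):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sim = np.array([1.0, 2.0, 3.0, 2.5])\n",
+    "obs = np.array([1.1, 1.9, 3.2, 2.4])\n",
+    "nse = hde.nse(sim, obs)\n",
+    "# nse_c2m should equal nse / (2 - nse)\n",
+    "print(nse, hde.nse_c2m(sim, obs), nse / (2 - nse))"
+   ]
+  },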
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Adjusts the time lag for the reference values and puts result into a csv.\n", + "# Also plots original and shifted concentration reference values.\n", + "# The plot also contains the anomaly score (higher means more trustworthy data) for reference.\n", + "\n", + "# collect reference for each label in a dict.\n", + "ref_parts = {}\n", + "# the defines wheter the plots are created.\n", + "plot_reference_shift = True\n", + "for label in optim_parts.index:\n", + " # only look at time that is of interest:\n", + " sensor = optim_parts.loc[label,\"Sensor\"]\n", + " start = optim_parts.loc[label, \"Start\"]\n", + " end = optim_parts.loc[label, \"End\"]\n", + " start_read = reference_lags[label].index.min()\n", + " end_read = reference_lags[label].index.max()\n", + " timestep = optim_parts.loc[label, \"Timestep\"]\n", + " # use this before warping, index with original time and padding\n", + " read_idx = reference_lags[label].index\n", + " # use adjusted times without padding for final index\n", + " full_shifted_idx = pd.date_range(start, end, freq=timestep, tz=timezone)\n", + " # read the reference values and convert them so that they fit the results from simulation\n", + "\n", + " with db_engine.begin() as con:\n", + " ref_part = pd.read_sql_query(\n", + " sa.text(\n", + " f\"select datetime, {conc_col} from readings_smoothed where sensor_name = :sn and datetime>=:start and datetime<=:end\"\n", + " ),\n", + " con,\n", + " params={\"start\": start_read, \"end\": end_read, \"sn\": sensor},\n", + " index_col=\"datetime\",\n", + " )[conc_col].tz_convert(timezone)\n", + " ref_part_raw = ref_part.copy()\n", + " ref_part=ref_part.sort_index().reindex(read_idx, method=\"nearest\")\n", + " assert isinstance(\n", + " ref_part.index, pd.DatetimeIndex\n", + " ), \"Need DatetimeIndex (expected in first column)\"\n", + " assert ref_part.isna().any(axis=None) == False\n", + " # adjust lag\n", + " ref_part_shifted = adjust_lag(ref_part,reference_lags[label])\n", + " ref_part_shifted = ref_part_shifted.reindex(full_shifted_idx)\n", + " ref_parts[label] = ref_part_shifted\n", + " if plot_reference_shift:\n", + " fig,ax = plt.subplots(figsize=(12,3))\n", + " ref_part_shifted.plot(ax=ax,color=\"cyan\",label=\"shifted\")\n", + " ref_part_raw.plot(ax=ax,color=\"darkblue\",linewidth=0.3, label = \"not shifted\",marker=\"o\", markersize=0.8)\n", + " # also plot where score is low, may explain odd parts\n", + " with db_engine.begin() as con:\n", + " score = pd.read_sql(\n", + " sa.text(\n", + " \"\"\"\n", + " SELECT datetime,score\n", + " FROM anomaly_scores \n", + " WHERE sensor_name = :sn AND datetime>= :start AND datetime<= :end AND parameter= :param;\n", + " \"\"\"\n", + " ),\n", + " con,\n", + " params={\"start\": start, \"end\": end, \"sn\": sensor, \"param\":conc_col},\n", + " index_col=\"datetime\",\n", + " dtype={\n", + " \"score\": \"float16\",\n", + " },\n", + " )[\"score\"]\n", + " assert score.index.is_unique, \"check query if index is not unique\"\n", + " score_low = score<0.45\n", + " if score_low.any():\n", + " ax.fill_between(score.index, 0,1,where=score_low, facecolor=(0,0,0,0.2), transform=ax.get_xaxis_transform(), label=\"anomaly score<0.45\",\n", + " edgecolor=(0,0,0,0), linewidth=0.0)\n", + " data_missing=ref_part_raw.reindex(read_idx).isna()\n", + " if data_missing.any():\n", + " ax.fill_between(read_idx, 0,1,where=data_missing, facecolor=(1,0,0,0.2), transform=ax.get_xaxis_transform(), label=\"missing data\",\n", + " 
+    "        ax.legend()\n",
+    "        ax.set_title(label)\n",
+    "        display(fig)\n",
+    "        if not (ax.lines or ax.collections or ax.get_images()):\n",
+    "            warn(\"empty plot for label \"+label)\n",
+    "        plt.close(fig)\n",
+    "reference = pd.concat(ref_parts)\n",
+    "reference.index.names = [\"Label\", \"Datetime\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert reference.notna().all(), \"reference should not have nan, check the script for mistakes.\"\n",
+    "reference.to_csv(model_folder/\"reference.csv\")\n",
+    "reference"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (Spyder)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.21"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}