diff --git a/hydraulic_calibration/optimization_hydraulic.ipynb b/hydraulic_calibration/optimization_hydraulic.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..9894e00ddffe3d12f8d3f38f4397614ab37fdabd --- /dev/null +++ b/hydraulic_calibration/optimization_hydraulic.ipynb @@ -0,0 +1,1405 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\trojan\\AppData\\Local\\Temp\\ipykernel_14844\\805292381.py:21: DeprecationWarning: \n", + "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n", + "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n", + "but was not found to be installed on your system.\n", + "If this would cause problems for you,\n", + "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n", + " \n", + " import pandas as pd\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "Optimization with mikepluspy.\n", + "The nevergrad package is used for the optimization.\n", + "Note: mikepluspy requires a full licence for MIKE+ to run.\n", + "In many cases the licence seat is not released properly; closing Python and waiting some time should fix this.\n", + "\"\"\"\n", + "\n", + "import typing\n", + "import datetime\n", + "import shutil\n", + "import subprocess\n", + "from multiprocessing.connection import Client\n", + "from time import sleep\n", + "from warnings import warn\n", + "from pathlib import Path\n", + "import functools\n", + "\n", + "\n", + "from mikeplus import DataTableAccess\n", + "from mikeplus.engines import Engine1D\n", + "import numpy as np\n", + "import pandas as pd\n", + "import nevergrad as ng\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "import hydroeval as hde\n", + "import sqlalchemy as sa\n", + "from scipy.ndimage import gaussian_filter1d\n", + "from IPython.utils import io as ipyio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def file_to_df(path: Path) -> pd.DataFrame:\n", + "    \"\"\"Reads the file at path into a pandas DataFrame, dispatching on the file extension.\"\"\"\n", + "    if path.suffix == \".xlsx\":\n", + "        return pd.read_excel(path)\n", + "    elif path.suffix == \".dfs0\":\n", + "        # .dfs0 files are read by the external mikeio listener process (reader_c, connected below)\n", + "        reader_c.send({\"filename\": path})\n", + "        return reader_c.recv()\n", + "    elif path.suffix == \".csv\":\n", + "        return pd.read_csv(path)\n", + "    else:\n", + "        raise NotImplementedError(f\"no method for {path.suffix} implemented\")" + ] + },
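+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Illustrative only: how file_to_df is meant to be called.\n", + "# The file names below are hypothetical placeholders (kept commented out for that reason);\n", + "# .xlsx/.csv are read directly with pandas, .dfs0 goes through the reader process connected below.\n", + "# df = file_to_df(Path(\"some_measurements.csv\"))\n", + "# df = file_to_df(Path(\"some_timeseries.dfs0\"))  # needs the reader_c connection" + ] + },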
+ { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Set variables for later use\n", + "# delete result files (as specified by the event config) before a simulation; ensures that no stale file is re-read if a simulation does not run\n", + "DELETE_RESFILE_BEFORE_SIM = True\n", + "# output heaps of additional data, may be useful for debugging\n", + "VERBOSE_OUTPUT = False\n", + "# attempt a simulation at most this many times, used when a simulation fails\n", + "MAX_SIMULATION_TRY = 3\n", + "# consider a simulation failed (and retry) if this ratio of values is missing from the result compared to the reference\n", + "MAX_MISSING_RATIO = 0.6\n", + "# delete the temp folder at the end\n", + "DELETE_TEMP = True\n", + "# time string for the output folder\n", + "time_path_part = datetime.datetime.now().strftime(\"%Y-%m-%dT%H.%M.%S\")\n", + "# set the path of the temp folder\n", + "temp_folder = Path(\"tmp_\" + time_path_part)\n", + "# output folder\n", + "out_folder = Path(\"out_\" + time_path_part)\n", + "# whether output from the MIKE simulation gets shown on error\n", + "show_sim_out_on_error = True" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# variables which need to be set before each optimization\n", + "# number of simulations allowed for the optimization\n", + "optimization_budget = 60\n", + "# the folder where the used files are saved\n", + "# note: this should be at the same folder depth as the original folder, so that relative paths lead to the same files\n", + "model_folder = Path(\"Modell_Kalibrierung_AD/\")\n", + "# path of the .mupp file, relative to model_folder\n", + "db_path_rel = \"240304_Godorf_AD.mupp\"\n", + "# set the ids of the sensors and the timezone\n", + "timezone = \"UTC+01:00\"\n", + "sensor_conc = \"2209NP80036_RUEB0901\"\n", + "sensor_q = \"2216NF73722_RUEB0901\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# create the output folder\n", + "out_folder.mkdir(exist_ok=True,parents=True)\n", + "# make a copy of the model folder into the temp folder, since the program might mess with the model\n", + "assert not temp_folder.exists(), \"Make sure not to mess with existing folders\"\n", + "shutil.copytree(model_folder,temp_folder)\n", + "db_path = temp_folder / db_path_rel" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "WindowsPath('d:/entfrachten/tmp_2024-05-13T10.41.48')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print the path of the temp folder\n", + "temp_folder.absolute()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# connection to the database used to fetch the sensor data\n", + "db_engine = sa.create_engine(\n", + "    \"postgresql+psycopg2://postgres:P0stgr3SQL!@137.226.85.103:5432/entfrachten\"\n", + ")\n", + "'''\n", + "Alternatively, the reference data could be read from Excel files; a sketch follows the reference-reading cell below.\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "starting mike reader\n" + ] + } + ], + "source": [ + "# MIKE+Py imports its own version of mikeio1d, which does not work for our files and also has odd problems with mikeio.\n", + "# Therefore use a multiprocessing.Listener, which runs in its own process with its own imports, and reads the file.\n", + "# The script \"mikeio_listener.py\" needs to be in the same folder, or has to be started beforehand.\n", + "reader_address = R\"\\\\.\\pipe\\mikeio_pipe\"\n", + "auth_key = b\"res1dreader\"\n", + "try:\n", + "    # try to connect to an existing server\n", + "    reader_c = Client(reader_address, authkey=auth_key)\n", + "except (ConnectionRefusedError, FileNotFoundError) as e:\n", + "    print(\"starting mike reader\")\n", + "    # start a server in a new console window\n", + "    subprocess.Popen(\n", + "        [\"python\", \"mikeio_listener.py\"], creationflags=subprocess.CREATE_NEW_CONSOLE\n", + "    )\n", + "    # give the server some time to start\n", + "    sleep(10)\n", + "    reader_c = Client(reader_address, authkey=auth_key)" + ] + },
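+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Not executed here: a minimal sketch of what mikeio_listener.py might look like.\n", + "# The pipe address, auth key and message protocol are taken from this notebook;\n", + "# the reading helpers (read_dfs0_as_df, read_res1d_as_df) are assumed names that would\n", + "# wrap the actual mikeio/mikeio1d calls in the real script.\n", + "#\n", + "# from multiprocessing.connection import Listener\n", + "#\n", + "# with Listener(R\"\\\\.\\pipe\\mikeio_pipe\", authkey=b\"res1dreader\") as listener:\n", + "#     with listener.accept() as conn:\n", + "#         while True:\n", + "#             msg = conn.recv()\n", + "#             if msg == \"close\":\n", + "#                 break\n", + "#             if \"filename\" in msg:\n", + "#                 # dfs0 request from file_to_df\n", + "#                 conn.send(read_dfs0_as_df(msg[\"filename\"]))\n", + "#             else:\n", + "#                 # res1d request from _read_result_helper with file_path/reaches/nodes\n", + "#                 conn.send(read_res1d_as_df(msg[\"file_path\"], msg[\"reaches\"], msg[\"nodes\"]))" + ] + },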
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# open the MIKE+ database using MIKE+Py.\n", + "dta = DataTableAccess(db_path)\n", + "# open the database from MIKE+\n", + "dta.open_database()\n", + "# check if the database is open.\n", + "dta.is_database_open()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the Engine1D object that will be used to run MIKE 1D simulations.\n", + "engine = Engine1D(dta.datatables)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def activate_sim(sim_muid):\n", + " '''\n", + " Activates a simulation defined by sim_muid from MIKE+.\n", + " sim_muid is the name of a simulation in the simulation setup in MIKE+.\n", + " '''\n", + " # first deactivate all\n", + " for id in dta.get_muid_where(\"msm_Project\"):\n", + " dta.set_value(\"msm_Project\", id, \"ActiveProject\", False)\n", + " # activate correct simulation\n", + " dta.set_value(\"msm_Project\", sim_muid, \"ActiveProject\", True)\n", + " # check that the sim is active\n", + " if VERBOSE_OUTPUT:\n", + " display(\"active sim\")\n", + " display(dta.get_muid_where(\"msm_Project\", \"ActiveProject == 1\"))\n", + " assert dta.get_muid_where(\"msm_Project\", \"ActiveProject == 1\") == [\n", + " sim_muid\n", + " ], \"only selected sim should be active\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# specify which parameters are optimized and in which range they should be\n", + "# could make the table in python like this\n", + "# param_specs = pd.DataFrame([\n", + "# {\"Table\":\"msm_HParA\",\"Muid\":DEF_STRING,\"Column\":\"RedFactor\",\"Min\":.4,\"Max\":1,\"Steps\":4},\n", + "# {\"Table\":\"msm_HParA\",\"Muid\":DEF_STRING,\"Column\":\"InitLoss\",\"Min\":0,\"Max\":5*M_TO_MM,\"Steps\":4}\n", + "# ])\n", + "# or load from a file\n", + "param_specs = file_to_df(temp_folder / \"parameter_config.xlsx\")\n", + "# give params_specs a sensible string index, this is later used to pass optimization parameters as a dictionary\n", + "param_specs.index = pd.Index(\n", + " param_specs[[\"Table\", \"Column\", \"Muid\"]].agg(\";\".join, axis=\"columns\"),\n", + " name=\"ParamKey\",\n", + ")\n", + "assert (\n", + " param_specs.index.is_unique\n", + "), \"Index needs to be unique. 
Otherwise indicates dupplicates in the parameters.\"\n", + "param_specs" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Event</th>\n", + " <th>EventWeight</th>\n", + " <th>Start</th>\n", + " <th>End</th>\n", + " <th>Timestep</th>\n", + " <th>SimulationName</th>\n", + " <th>ReferenceFile</th>\n", + " <th>ReferenceColumn</th>\n", + " <th>ReferenceUnitFactor</th>\n", + " <th>ResultFile</th>\n", + " <th>ResultReach</th>\n", + " <th>ResultNode</th>\n", + " <th>ResultColumn</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Label</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>397_AFS</th>\n", + " <td>397</td>\n", + " <td>1</td>\n", + " <td>2022-09-13 21:36:00+01:00</td>\n", + " <td>2022-09-15 00:52:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_397</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_397BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>405_AFS</th>\n", + " <td>405</td>\n", + " <td>1</td>\n", + " <td>2022-09-27 22:22:00+01:00</td>\n", + " <td>2022-09-28 03:05:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_405</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_405BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>406_AFS</th>\n", + " <td>406</td>\n", + " <td>1</td>\n", + " <td>2022-10-01 02:36:00+01:00</td>\n", + " <td>2022-10-01 20:11:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_406</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_406BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>407_AFS</th>\n", + " <td>407</td>\n", + " <td>1</td>\n", + " <td>2022-10-02 04:43:00+01:00</td>\n", + " <td>2022-10-02 07:31:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_407</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_407BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>411_AFS</th>\n", + " <td>411</td>\n", + " <td>1</td>\n", + " <td>2022-10-17 21:54:00+01:00</td>\n", + " <td>2022-10-18 12:38:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_411</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_411BaseDefault_Net...</td>\n", + " 
<td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>431_AFS</th>\n", + " <td>431</td>\n", + " <td>1</td>\n", + " <td>2022-11-23 17:18:00+01:00</td>\n", + " <td>2022-11-24 00:29:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_431</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_431BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>439_AFS</th>\n", + " <td>439</td>\n", + " <td>1</td>\n", + " <td>2022-12-05 03:28:00+01:00</td>\n", + " <td>2022-12-05 09:12:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_439</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_439BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>490_AFS</th>\n", + " <td>490</td>\n", + " <td>1</td>\n", + " <td>2023-03-07 15:11:00+01:00</td>\n", + " <td>2023-03-10 19:48:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_490</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_490BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>494_AFS</th>\n", + " <td>494</td>\n", + " <td>1</td>\n", + " <td>2023-03-15 03:15:00+01:00</td>\n", + " <td>2023-03-15 11:05:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_494</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_494BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>501_AFS</th>\n", + " <td>501</td>\n", + " <td>1</td>\n", + " <td>2023-03-24 06:48:00+01:00</td>\n", + " <td>2023-03-25 01:55:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_501</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_501BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>528_AFS</th>\n", + " <td>528</td>\n", + " <td>1</td>\n", + " <td>2023-05-11 23:21:00+01:00</td>\n", + " <td>2023-05-12 09:56:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_528</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_528BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>589_AFS</th>\n", + " <td>589</td>\n", + " <td>1</td>\n", + " <td>2023-10-19 14:52:00+01:00</td>\n", + " <td>2023-10-19 19:00:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_589</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_589BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>597_AFS</th>\n", + " <td>597</td>\n", + " <td>1</td>\n", + " <td>2023-10-28 01:11:00+01:00</td>\n", + " <td>2023-10-28 05:58:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_597</td>\n", 
+ " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_597BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>614_AFS</th>\n", + " <td>614</td>\n", + " <td>1</td>\n", + " <td>2023-11-24 00:57:00+01:00</td>\n", + " <td>2023-11-24 23:59:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_614</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_614BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>615_AFS</th>\n", + " <td>615</td>\n", + " <td>1</td>\n", + " <td>2023-11-25 14:13:00+01:00</td>\n", + " <td>2023-11-25 21:38:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_615</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_615BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " <tr>\n", + " <th>623_AFS</th>\n", + " <td>623</td>\n", + " <td>1</td>\n", + " <td>2023-12-08 05:14:00+01:00</td>\n", + " <td>2023-12-08 08:35:00+01:00</td>\n", + " <td>1min</td>\n", + " <td>RW_Kal_623</td>\n", + " <td>***readings_smoothed***</td>\n", + " <td>conc_mgpl</td>\n", + " <td>1</td>\n", + " <td>Results Kalibrierung/RW_Kal_623BaseDefault_Net...</td>\n", + " <td>H0069212</td>\n", + " <td></td>\n", + " <td>AFS:H0069212:17.0052</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Event EventWeight Start \\\n", + "Label \n", + "397_AFS 397 1 2022-09-13 21:36:00+01:00 \n", + "405_AFS 405 1 2022-09-27 22:22:00+01:00 \n", + "406_AFS 406 1 2022-10-01 02:36:00+01:00 \n", + "407_AFS 407 1 2022-10-02 04:43:00+01:00 \n", + "411_AFS 411 1 2022-10-17 21:54:00+01:00 \n", + "431_AFS 431 1 2022-11-23 17:18:00+01:00 \n", + "439_AFS 439 1 2022-12-05 03:28:00+01:00 \n", + "490_AFS 490 1 2023-03-07 15:11:00+01:00 \n", + "494_AFS 494 1 2023-03-15 03:15:00+01:00 \n", + "501_AFS 501 1 2023-03-24 06:48:00+01:00 \n", + "528_AFS 528 1 2023-05-11 23:21:00+01:00 \n", + "589_AFS 589 1 2023-10-19 14:52:00+01:00 \n", + "597_AFS 597 1 2023-10-28 01:11:00+01:00 \n", + "614_AFS 614 1 2023-11-24 00:57:00+01:00 \n", + "615_AFS 615 1 2023-11-25 14:13:00+01:00 \n", + "623_AFS 623 1 2023-12-08 05:14:00+01:00 \n", + "\n", + " End Timestep SimulationName \\\n", + "Label \n", + "397_AFS 2022-09-15 00:52:00+01:00 1min RW_Kal_397 \n", + "405_AFS 2022-09-28 03:05:00+01:00 1min RW_Kal_405 \n", + "406_AFS 2022-10-01 20:11:00+01:00 1min RW_Kal_406 \n", + "407_AFS 2022-10-02 07:31:00+01:00 1min RW_Kal_407 \n", + "411_AFS 2022-10-18 12:38:00+01:00 1min RW_Kal_411 \n", + "431_AFS 2022-11-24 00:29:00+01:00 1min RW_Kal_431 \n", + "439_AFS 2022-12-05 09:12:00+01:00 1min RW_Kal_439 \n", + "490_AFS 2023-03-10 19:48:00+01:00 1min RW_Kal_490 \n", + "494_AFS 2023-03-15 11:05:00+01:00 1min RW_Kal_494 \n", + "501_AFS 2023-03-25 01:55:00+01:00 1min RW_Kal_501 \n", + "528_AFS 2023-05-12 09:56:00+01:00 1min RW_Kal_528 \n", + "589_AFS 2023-10-19 19:00:00+01:00 1min RW_Kal_589 \n", + "597_AFS 2023-10-28 05:58:00+01:00 1min RW_Kal_597 \n", + "614_AFS 2023-11-24 23:59:00+01:00 1min RW_Kal_614 \n", + "615_AFS 2023-11-25 21:38:00+01:00 1min RW_Kal_615 \n", + "623_AFS 2023-12-08 08:35:00+01:00 1min RW_Kal_623 \n", + "\n", + " ReferenceFile 
ReferenceColumn ReferenceUnitFactor \\\n", + "Label \n", + "397_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "405_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "406_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "407_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "411_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "431_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "439_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "490_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "494_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "501_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "528_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "589_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "597_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "614_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "615_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "623_AFS ***readings_smoothed*** conc_mgpl 1 \n", + "\n", + " ResultFile ResultReach \\\n", + "Label \n", + "397_AFS Results Kalibrierung/RW_Kal_397BaseDefault_Net... H0069212 \n", + "405_AFS Results Kalibrierung/RW_Kal_405BaseDefault_Net... H0069212 \n", + "406_AFS Results Kalibrierung/RW_Kal_406BaseDefault_Net... H0069212 \n", + "407_AFS Results Kalibrierung/RW_Kal_407BaseDefault_Net... H0069212 \n", + "411_AFS Results Kalibrierung/RW_Kal_411BaseDefault_Net... H0069212 \n", + "431_AFS Results Kalibrierung/RW_Kal_431BaseDefault_Net... H0069212 \n", + "439_AFS Results Kalibrierung/RW_Kal_439BaseDefault_Net... H0069212 \n", + "490_AFS Results Kalibrierung/RW_Kal_490BaseDefault_Net... H0069212 \n", + "494_AFS Results Kalibrierung/RW_Kal_494BaseDefault_Net... H0069212 \n", + "501_AFS Results Kalibrierung/RW_Kal_501BaseDefault_Net... H0069212 \n", + "528_AFS Results Kalibrierung/RW_Kal_528BaseDefault_Net... H0069212 \n", + "589_AFS Results Kalibrierung/RW_Kal_589BaseDefault_Net... H0069212 \n", + "597_AFS Results Kalibrierung/RW_Kal_597BaseDefault_Net... H0069212 \n", + "614_AFS Results Kalibrierung/RW_Kal_614BaseDefault_Net... H0069212 \n", + "615_AFS Results Kalibrierung/RW_Kal_615BaseDefault_Net... H0069212 \n", + "623_AFS Results Kalibrierung/RW_Kal_623BaseDefault_Net... H0069212 \n", + "\n", + " ResultNode ResultColumn \n", + "Label \n", + "397_AFS AFS:H0069212:17.0052 \n", + "405_AFS AFS:H0069212:17.0052 \n", + "406_AFS AFS:H0069212:17.0052 \n", + "407_AFS AFS:H0069212:17.0052 \n", + "411_AFS AFS:H0069212:17.0052 \n", + "431_AFS AFS:H0069212:17.0052 \n", + "439_AFS AFS:H0069212:17.0052 \n", + "490_AFS AFS:H0069212:17.0052 \n", + "494_AFS AFS:H0069212:17.0052 \n", + "501_AFS AFS:H0069212:17.0052 \n", + "528_AFS AFS:H0069212:17.0052 \n", + "589_AFS AFS:H0069212:17.0052 \n", + "597_AFS AFS:H0069212:17.0052 \n", + "614_AFS AFS:H0069212:17.0052 \n", + "615_AFS AFS:H0069212:17.0052 \n", + "623_AFS AFS:H0069212:17.0052 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# read the event_config.xlsx in which for each sim_muid=events variables are set.\n", + "events = file_to_df(temp_folder / \"event_config.xlsx\").set_index(\"Label\")\n", + "# make sure that empty strings are translated.\n", + "events[\"ResultReach\"] = events[\"ResultReach\"].fillna(\"\").astype(str)\n", + "events[\"ResultNode\"] = events[\"ResultNode\"].fillna(\"\").astype(str)\n", + "# check if start time of events is before the end time.\n", + "assert (events[\"Start\"] <= events[\"End\"]).all(\n", + " axis=None\n", + "), \"Event end needs to be after start.\"\n", + "# add timezone. 
Use the same timezone as the sensor data.\n", + "events[\"Start\"]=events[\"Start\"].dt.tz_localize(timezone)\n", + "events[\"End\"]=events[\"End\"].dt.tz_localize(timezone)\n", + "# check if there are duplicated events.\n", + "assert events.index.is_unique, \"Need unique labels\"\n", + "assert events.drop_duplicates(subset=\"Event\")[\"SimulationName\"].is_unique, \"Need exactly one simulation name per event\"\n", + "events" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def _read_result_helper(file, reach, node, column, timestep, start, end) -> pd.Series:\n", + "    \"\"\"Read the result with the specified parameters.\n", + "    returns:\n", + "        a pandas Series with Datetime as index.\n", + "    \"\"\"\n", + "    # file_path is read from the event table;\n", + "    # the reader should be executed in the same directory as this file, but it is safer to send absolute paths.\n", + "    # reaches/nodes are used to only read some parts of the result.\n", + "    # a reach can be any connection between two nodes.\n", + "    # loc may return a series or a dataframe.\n", + "    file_path: Path = temp_folder / file\n", + "    params = {\n", + "        \"file_path\": str(file_path.absolute()),\n", + "        \"reaches\": [reach],\n", + "        \"nodes\": [node],\n", + "    }\n", + "    # send the parameters to the mikeio listener process ...\n", + "    reader_c.send(params)\n", + "    # ... then receive the data from the result file\n", + "    df = reader_c.recv()\n", + "    # reduce the result to the desired column.\n", + "    series = df[column]\n", + "    # add the timezone to the index.\n", + "    series.index = series.index.tz_localize(timezone)\n", + "    # print the data before processing.\n", + "    if VERBOSE_OUTPUT:\n", + "        display(f\"{file_path}[{column}] before resample:\")\n", + "        display(series)\n", + "    # resample to be comparable to the reference.\n", + "    # Linear interpolation with pandas first throws away all values where the index does not match the frequency,\n", + "    # therefore use nearest instead of interpolate.\n", + "    series = series.resample(timestep).nearest()\n", + "    # reduce the result to the desired times\n", + "    series = series.loc[(series.index >= start) & (series.index <= end)]\n", + "    # set the index name\n", + "    series.index.name = \"Datetime\"\n", + "    return series\n", + "\n", + "\n", + "def read_result(label) -> pd.Series:\n", + "    \"\"\"\n", + "    Read the result of the part of the collection system with the corresponding label,\n", + "    assuming the simulation was run, using values from DataFrame events.\n", + "    returns:\n", + "        a pandas Series with Datetime as index.\n", + "    \"\"\"\n", + "    # uses the function above; gets the parameters from the event table.\n", + "    # note: the argument order must match the signature of _read_result_helper.\n", + "    return _read_result_helper(\n", + "        events.at[label, \"ResultFile\"],\n", + "        events.at[label, \"ResultReach\"],\n", + "        events.at[label, \"ResultNode\"],\n", + "        events.at[label, \"ResultColumn\"],\n", + "        events.at[label, \"Timestep\"],\n", + "        events.at[label, \"Start\"],\n", + "        events.at[label, \"End\"],\n", + "    )\n", + "\n", + "\n", + "def delete_result_file(label):\n", + "    '''\n", + "    Deletes an existing result file corresponding to label before a simulation.\n", + "    '''\n", + "    file_path: Path = temp_folder / events.at[label, \"ResultFile\"]\n", + "    file_path.unlink(missing_ok=True)" + ] + },
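+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Illustrative only: why .resample(...).nearest() is used for alignment.\n", + "# Simulation output may carry irregular timestamps; linear interpolation with pandas\n", + "# would first drop values whose index does not sit on the target grid, while\n", + "# nearest-resampling keeps them. Toy data, not from the model:\n", + "_demo = pd.Series(\n", + "    [1.0, 2.0, 4.0],\n", + "    index=pd.to_datetime([\"2022-01-01 00:00:10\", \"2022-01-01 00:01:40\", \"2022-01-01 00:03:05\"]),\n", + ")\n", + "_demo.resample(\"1min\").nearest()" + ] + },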
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a dictionary to save the parts of the reference corresponding to each label.\n", + "ref_parts = {}\n", + "\n", + "# read the data for each label.\n", + "for label in events.index:\n", + "    # filepath would be used if the reference were read from a file instead of the database\n", + "    filepath = temp_folder / events.loc[label, \"ReferenceFile\"]\n", + "    col = events.loc[label, \"ReferenceColumn\"]\n", + "    # only look at the time range that is of interest:\n", + "    start = events.loc[label, \"Start\"]\n", + "    end = events.loc[label, \"End\"]\n", + "    # read the reference values\n", + "    with db_engine.begin() as con:\n", + "        ref_part = (\n", + "            pd.read_sql_query(\n", + "                sa.text(\n", + "                    f\"select datetime, {col} from readings_smoothed where sensor_name = :sn and datetime>=:start and datetime<=:end\"\n", + "                ),\n", + "                con,\n", + "                params={\"start\": start, \"end\": end, \"sn\": sensor_conc},\n", + "                index_col=\"datetime\",\n", + "            )[col]\n", + "        )\n", + "    assert isinstance(\n", + "        ref_part.index, pd.DatetimeIndex\n", + "    ), \"Need DatetimeIndex (expected in first column)\"\n", + "    # take the column as defined in the event table\n", + "    if VERBOSE_OUTPUT:\n", + "        display(\"Ref part before resample\")\n", + "        display(ref_part)\n", + "    # the time step of the simulation output is one minute; also adjust the units\n", + "    ref_part = ref_part.resample(events.loc[label, \"Timestep\"]).nearest()\n", + "    ref_part = ref_part*events.loc[label, \"ReferenceUnitFactor\"]\n", + "    assert not ref_part.isna().any(axis=None)\n", + "    # store in the dictionary.\n", + "    ref_parts[label] = ref_part\n", + "\n", + "# write the dictionary into a pandas series with index levels Label and Datetime\n", + "reference = pd.concat(ref_parts)\n", + "reference.index.names = [\"Label\", \"Datetime\"]\n", + "reference" + ] + },
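+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Not executed: a sketch of the Excel alternative mentioned next to the database connection above.\n", + "# File and column names are hypothetical placeholders; a real file would need a datetime column plus a value column.\n", + "# for label in events.index:\n", + "#     ref_part = pd.read_excel(temp_folder / f\"reference_{sensor_conc}.xlsx\", index_col=\"datetime\")[\"conc_mgpl\"]\n", + "#     ref_part.index = ref_part.index.tz_localize(timezone)  # align the timezone as in the database path\n", + "#     ref_part = ref_part.loc[events.loc[label, \"Start\"] : events.loc[label, \"End\"]]\n", + "#     ref_parts[label] = ref_part.resample(events.loc[label, \"Timestep\"]).nearest()" + ] + },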
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read results which are already in the folder\n", + "# test run for read_result\n", + "_test_rs = pd.concat({ev:read_result(ev) for ev in events.index})\n", + "_test_rs.index.names = reference.index.names\n", + "# plot the results; if this looks very different from plots in MIKE, unit conversion may need to be added\n", + "for label in events.index:\n", + "    txs=_test_rs.xs(label,level=\"Label\")\n", + "    rxs=reference.xs(label,level=\"Label\")\n", + "    missing_ratio = txs.reindex_like(rxs).isna().mean()\n", + "    if missing_ratio>0.2:\n", + "        warn(f\"Label {label} has many missing values compared to the reference; missing ratio is {missing_ratio}\")\n", + "    pd.DataFrame({\"ref\":rxs,\"test\":txs}).plot(title=str(label),figsize=(7,2.5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define possible objective functions\n", + "# note: if your objective function should be maximized, use its negative as the loss function\n", + "\n", + "def rmse(ref: pd.Series, res: pd.Series) -> float:\n", + "    \"\"\"Root Mean Square Error.\"\"\"\n", + "    if not ref.index.equals(res.index):\n", + "        raise ValueError(\"Indices are not equal, cannot compare\")\n", + "    return ((ref - res) ** 2).mean(axis=None) ** 0.5\n", + "\n", + "\n", + "def vfp(ref: pd.Series, res: pd.Series) -> float:\n", + "    '''Volume error in percent.'''\n", + "    if not ref.index.equals(res.index):\n", + "        raise ValueError(\"Indices are not equal, cannot compare\")\n", + "    ref_volume = ref.sum()  # total volume of the reference data\n", + "    res_volume = res.sum()  # total volume of the result data\n", + "    dif = abs(res_volume - ref_volume)\n", + "    # if the reference volume is 0, use a replacement value; more useful for optimization.\n", + "    # replacement value for the divisor when ref_volume is zero:\n", + "    _vfp_divisor_on_zero = reference.groupby(level=\"Label\").sum().mean()\n", + "    if ref_volume == 0:\n", + "        ref_volume = _vfp_divisor_on_zero\n", + "    # ToDo: find out what else would be sensible to use here! * pd.to_timedelta(events[\"Timestep\"]).dt.total_seconds().to_numpy()\n", + "    return 100 * dif / ref_volume  # volume error in percent\n", + "\n", + "\n", + "def weighted_vfp(ref: pd.Series, res: pd.Series) -> float:\n", + "    '''Weighted volume error.'''\n", + "    labels = events.index.get_level_values(\"Label\")\n", + "    # read the event weights\n", + "    weights = events[\"EventWeight\"].to_numpy()\n", + "    # calculate the vfp for each label\n", + "    per_label = np.array(\n", + "        [vfp(ref.xs(l, level=\"Label\"), res.xs(l, level=\"Label\")) for l in labels]\n", + "    )\n", + "    # check whether any weights or per-label losses are missing\n", + "    nan_parts = np.isnan(weights)|np.isnan(per_label)\n", + "    if np.any(nan_parts):\n", + "        warn(\"weighted_vfp: some parts are nan, skipping those\")\n", + "    # return the weighted average vfp over all labels where no value is nan\n", + "    return np.average(per_label[~nan_parts], weights=weights[~nan_parts])\n", + "\n", + "\n", + "def mean_negative_bounded_nse(ref: pd.Series, res: pd.Series) -> float:\n", + "    \"\"\"\n", + "    Mean over events of the negative bounded NSE (nse_c2m), range [-1,1].\n", + "\n", + "    Args:\n", + "        ref (pandas.Series): reference values.\n", + "        res (pandas.Series): simulated values.\n", + "    Returns:\n", + "        float: cost of the solution.\n", + "    \"\"\"\n", + "    if not ref.index.equals(res.index):\n", + "        raise ValueError(\"Indices are not equal, cannot compare\")\n", + "    loss_parts = []\n", + "    for ev in ref.index.unique(\"Event\"):\n", + "        ref_part = ref.xs(ev, level=\"Event\")\n", + "        res_part = res.xs(ev, level=\"Event\")\n", + "        loss_parts.append(-hde.nse_c2m(res_part.to_numpy(), ref_part.to_numpy()))\n", + "    return np.mean(loss_parts)\n", + "\n", + "# define which objective function is used.\n", + "loss_fn = weighted_vfp\n", + "# use this value to penalize errors\n", + "loss_on_error = loss_fn(reference, -reference)\n", + "display(loss_on_error)" + ] + },
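+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Quick sanity check of the loss functions on toy series (made-up data, not from the model):\n", + "# two constant signals whose volumes differ by 10% should give vfp == 10.\n", + "_idx = pd.date_range(\"2022-01-01\", periods=4, freq=\"1min\")\n", + "_ref_toy = pd.Series([1.0, 1.0, 1.0, 1.0], index=_idx)\n", + "_res_toy = pd.Series([1.1, 1.1, 1.1, 1.1], index=_idx)\n", + "display(rmse(_ref_toy, _res_toy))\n", + "vfp(_ref_toy, _res_toy)" + ] + },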
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ParamSetError(Exception):\n", + "    \"\"\"Error raised when parameters were not correctly set.\"\"\"\n", + "    # class only needed to distinguish the error, no special behaviour needed.\n", + "    pass\n", + "\n", + "# the ** means that simulation_run may be called like so:\n", + "# result = simulation_run(key1=p1,key2=p2, ...)\n", + "# but kwparams will contain a dict of the form {\"key1\":p1,\"key2\":p2,...}\n", + "# if you called the simulation like simulation_run(key1=p1,key2=p2, ...),\n", + "# key1, key2, ... would need to be valid python names,\n", + "# but with simulation_run(**{\"key1\":p1,\"key2\":p2,...}) key1, key2 could be any string.\n", + "# Because the optimizer uses the second form, we can use the index from param_specs as a key,\n", + "# even though the index may not be a valid python name.\n", + "def single_sim_run(ev,**kwparams):\n", + "    \"\"\"Runs a single simulation for event ev.\"\"\"\n", + "    labels = events.index[events[\"Event\"]==ev]\n", + "    # delete previous result files\n", + "    if DELETE_RESFILE_BEFORE_SIM:\n", + "        for l in labels:\n", + "            delete_result_file(l)\n", + "    # multiple rows per event may exist; use the first row, values should be equal\n", + "    sim_name = events.loc[events[\"Event\"]==ev, \"SimulationName\"].iloc[0]\n", + "    # activate the event's simulation\n", + "    activate_sim(sim_name)\n", + "    # set all parameters to the values defined by kwparams.\n", + "    # param_specs defines where the parameter is located in the MIKE database\n", + "    for k in kwparams.keys():\n", + "        tab = param_specs.loc[k, \"Table\"]\n", + "        col = param_specs.loc[k, \"Column\"]\n", + "        muid = param_specs.loc[k, \"Muid\"]\n", + "        val = kwparams[k]\n", + "        dta.set_value(tab, muid, col, val)\n", + "        try:\n", + "            val_get = dta.get_field_values(tab,muid,[col])[0]\n", + "        except Exception as ex:\n", + "            val_get = ex\n", + "        if not val_get==val:\n", + "            raise ParamSetError(f\"\"\"Value not correctly set.\n", + "            set {val}, got {val_get}.\n", + "            tab: {tab}\n", + "            col: {col}\n", + "            muid: {muid}\"\"\")\n", + "    print(f\"Running simulation {sim_name}.\")\n", + "\n", + "    # usually the simulation would print a lot of text;\n", + "    # suppress the output, it would be too much for repeated calls\n", + "    with ipyio.capture_output() as captured:\n", + "        engine.run(sim_name)\n", + "    if (len(captured.stderr) > 0) or (\" ERROR: \" in captured.stdout):\n", + "        warn(f\"Error during simulation for Event {ev}. If show_sim_out_on_error is true, the simulation output follows.\")\n",
 + "        if show_sim_out_on_error:\n", + "            captured()\n", + "    if VERBOSE_OUTPUT:\n", + "        captured()\n", + "    # concatenate the event data for multiple labels.\n", + "    results_ev = pd.concat({l:read_result(l) for l in labels},names=[\"Label\"])\n", + "    # select the part of the reference that corresponds to the event\n", + "    references_ev = reference[reference.index.get_level_values('Label').isin(labels)]\n", + "    # calculate the ratio of values missing from the result compared to the reference\n", + "    missing_ratio = results_ev.reindex_like(references_ev).isna().mean()\n", + "    assert missing_ratio<MAX_MISSING_RATIO, \"Result is missing many values compared to the reference\"\n", + "    # return the results as a pandas series.\n", + "    return results_ev\n", + "\n", + "\n", + "def simulation_run(**kwparams):\n", + "    \"\"\"\n", + "    Runs single_sim_run for each event.\n", + "    Takes parameters as arguments, runs the simulations defined by events with them\n", + "    and returns the results as a series with MultiIndex [Label, Datetime].\n", + "\n", + "    Note that kwparams needs keys that match the index of DataFrame param_specs.\n", + "    \"\"\"\n", + "\n", + "    assert set(kwparams.keys()) == set(\n", + "        param_specs.index\n", + "    ), \"parameters need to match param_specs\"\n", + "\n", + "    # create a list to store the result parts for each event\n", + "    res_parts = []\n", + "    for ev in events[\"Event\"].unique():\n", + "        # try the MIKE+ simulation at most MAX_SIMULATION_TRY times\n", + "        for i in range(MAX_SIMULATION_TRY):\n", + "            try:\n", + "                res_parts.append(single_sim_run(ev,**kwparams))\n", + "                # exit the loop if we have a result;\n", + "                # if an error occurs, the try block is exited immediately and the break is not executed\n", + "                break\n", + "            except ParamSetError as pex:\n", + "                raise pex\n", + "            except Exception as ex:\n", + "                warn(f\"Simulation try {i} failed for event {ev}: {ex}\")\n", + "        else:\n", + "            # else is executed when the for loop completed without break,\n", + "            # i.e. the simulation did not complete even after multiple tries.\n", + "            # in that case append a nan result\n", + "            warn(f\"Simulation failed too many times, returning nan-result\")\n", + "            labels = events.index[events[\"Event\"]==ev]\n", + "            idx = pd.MultiIndex.from_product([labels,[pd.to_datetime(np.NaN)]])\n", + "            idx.names = reference.index.names\n", + "            res_parts.append(pd.Series(data=pd.NA,index=idx))\n", + "    # concatenate the results; note that single_sim_run has already added the label to the index\n", + "    result = pd.concat(res_parts)\n", + "    # return the results as a pandas series\n", + "    return result" + ] + },
" pd.DataFrame({\"ref\":rxs,\"test\":txs}).plot(title=str(label),figsize=(7,2.5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define optimization paramaters as needed by nevergrad\n", + "# first make a dictionary of scalar values with information from param_specs\n", + "ng_params = {}\n", + "for i in param_specs.index:\n", + " vmin = param_specs.at[i, \"Min\"]\n", + " vmax = param_specs.at[i, \"Max\"]\n", + " vdef = param_specs.at[i, \"Default\"]\n", + " ngp = ng.p.Scalar(init=vdef, lower=vmin, upper=vmax)\n", + " ng_params[i] = ngp\n", + "# now turn the list into an instrumentation\n", + "# the instrumentation is used to tell the optimizer how the paremeters are passed to our function\n", + "instrumentation = ng.p.Instrumentation(**ng_params)\n", + "instrumentation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define the parameter calibration as a minimization problem\n", + "def simulation_compare(**kwparams):\n", + " \"\"\"\n", + " Runs simulation with the specified parameters and uses the loss function on the results.\n", + " \"\"\"\n", + " # start simulation for all events with one parameter set defined by kwparams.\n", + " result = simulation_run(**kwparams)\n", + " # reindex the result according to the reference, important if values are missing in the reference.\n", + " # If there are missing values in results they are inserted as nan. Can be avoided by a proper fixed timestep definition in MIKE+.\n", + " result_matched = result.reindex_like(reference)\n", + " # calculate the objextive function for all results\n", + " plain_loss = loss_fn(reference, result_matched)\n", + " # values should not be missing, penalize by increasing the loss\n", + " missing_ratio = result_matched.isna().mean()\n", + " if missing_ratio>0:\n", + " warn(f\"Values are missing from result:{missing_ratio}\")\n", + " return (1-missing_ratio)*plain_loss + missing_ratio*loss_on_error" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define callback funtions for the optimizers\n", + "# we pass these to the optimizer and the optimizer then executes the function at each step of the optimization\n", + "def params_losses_callback():\n", + " \"\"\"returns two lists and a callback function.\n", + " When the callback function is executed it puts the parameters and loss into the lists.\n", + " !!Do not register this function itself as a callback, but the third returned parameter!!\n", + " \"\"\"\n", + " # create two lists to store the params and the corresponding losses\n", + " params = []\n", + " losses = []\n", + "\n", + " def callback(optim, par, loss):\n", + " params.append(par)\n", + " losses.append(loss)\n", + "\n", + " # returns two lists and the function\n", + " return params, losses, callback" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def optim_run(optim_name):\n", + " '''\n", + " Function to start the optimizer.\n", + " input = name of the optimizer\n", + " output = dataframe with details from optimization model\n", + " '''\n", + " # set the optimizer based on the optim_name and instrumentation from params_spec\n", + " optimizer = ng.optimizers.registry[optim_name](\n", + " parametrization=instrumentation, budget=optimization_budget, num_workers=1\n", + " )\n", + " # show progressbar. 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def optim_run(optim_name):\n", + "    '''\n", + "    Runs the optimizer with the given name.\n", + "    input = name of the optimizer\n", + "    output = dataframe with details from the optimization run\n", + "    '''\n", + "    # set up the optimizer based on optim_name and the instrumentation from param_specs\n", + "    optimizer = ng.optimizers.registry[optim_name](\n", + "        parametrization=instrumentation, budget=optimization_budget, num_workers=1\n", + "    )\n", + "    # show a progress bar, updated when the optimizer gets the result of a step\n", + "    optimizer.register_callback(\"tell\", ng.callbacks.ProgressBar())\n", + "    # put the params and losses into lists\n", + "    param_list, loss_list, plcb = params_losses_callback()\n", + "    optimizer.register_callback(\"tell\", plcb)\n", + "    # get the recommended parameters after the run is finished\n", + "    recommendation = optimizer.minimize(simulation_compare, verbosity=2)\n", + "    optim_run_df = pd.concat(\n", + "        [\n", + "            pd.DataFrame(data=[p.value[1] for p in param_list]),\n", + "            pd.DataFrame({\"Optimizer\": optimizer.name, \"Loss\": loss_list}),\n", + "        ],\n", + "        axis=\"columns\",\n", + "    )\n", + "    # also add a row for the recommendation\n", + "    rec_params = recommendation.value[1]\n", + "    optim_run_df.loc[\"recommendation\", rec_params.keys()] = rec_params\n", + "    optim_run_df.loc[\"recommendation\", \"Loss\"] = recommendation.loss\n", + "    optim_run_df.loc[\"recommendation\", \"Optimizer\"] = optimizer.name\n", + "    # save the results of the optimization as csv and excel files\n", + "    optim_run_df.to_csv(out_folder / f\"optim_run{optimizer.name}.csv\")\n", + "    optim_run_df.to_excel(out_folder / f\"optim_run{optimizer.name}.xlsx\")\n", + "\n", + "    return optim_run_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# choose optimizers from nevergrad\n", + "# https://en.wikipedia.org/wiki/CMA-ES\n", + "# create a list of optimizers;\n", + "# it is also possible to use multiple optimizers and compare the results\n", + "optimizers = [\"BayesOptimBO\"] #[\"CMA\",\"BayesOptimBO\"]\n", + "# create a dictionary to store the run data for each optimizer\n", + "optim_run_dfs = {}\n", + "for o in optimizers:\n", + "    # start the optimization\n", + "    optim_run_dfs[o] = optim_run(o)\n", + "    # display the results\n", + "    display(optim_run_dfs[o])\n", + "    optim_run_dfs[o].plot(subplots=True, figsize=(5, 2 * len(param_specs)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# only interesting if multiple optimizers are used.\n", + "# optim_run_df already has the optimizer included; use .values instead of passing the dict, so that the optimizer is not added again.\n", + "optim_run_df = pd.concat(optim_run_dfs.values())\n", + "optim_run_df.to_csv(out_folder / \"optim_run.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set the index of optim_run_df\n", + "optim_run_df.index.name = \"Step\"\n", + "optim_run_df = optim_run_df.reset_index().set_index([\"Optimizer\", \"Step\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot the loss at each step for all optimizers\n", + "optim_run_df[\"Loss\"].unstack(level=\"Optimizer\").plot(\n", + "    subplots=True, sharey=True, title=\"loss over steps\"\n", + ")[0].get_figure().savefig(out_folder / \"loss_over_steps.png\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot the recommendation as a bar plot\n", + "ax = optim_run_df[\"Loss\"].unstack(level=0).loc[\"recommendation\"].plot(kind=\"bar\")\n", + "ax.bar_label(ax.containers[0])\n", + "optim_run_df[\"Loss\"].xs(\"recommendation\", level=\"Step\")" + ] + },
step\n", + "optim_run_df[\"Loss\"].unstack(level=0).cummin().plot(\n", + " title=\"cummin loss over steps\"\n", + ").get_figure().savefig(out_folder / \"loss_cummin.png\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# store the simulation results for the recommended results\n", + "res_dict = {\"ref\": reference}\n", + "for key, df in optim_run_df.groupby(level=\"Optimizer\"):\n", + " ps = df.loc[(key, \"reccomendation\"), param_specs.index]\n", + " display(ps)\n", + " # start single simulation with recommended params\n", + " _test_res = simulation_run(**ps)\n", + " res_dict[key] = _test_res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# concatenate best results in one dataframe\n", + "best_res = pd.concat(res_dict, axis=\"columns\")\n", + "# save best results in a csv file\n", + "best_res.to_csv(out_folder / \"best_res.csv\")\n", + "# plot best result for each label\n", + "ev_group = best_res.groupby(level=\"Label\")\n", + "fig, axes = plt.subplots(nrows=len(ev_group), figsize=(6, 2.5 * len(ev_group)))\n", + "for (label, df), ax in zip(ev_group, axes):\n", + " df.droplevel(\"Label\").plot(ax=ax,title=str(label))\n", + "fig.suptitle(\"Optimizer Result and Reference\")\n", + "fig.tight_layout()\n", + "# save plot as png file\n", + "fig.savefig(out_folder / \"best_res.png\", bbox_inches=\"tight\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# measure how much the parameters are adjusted\n", + "# first normalize to range than take norm2 of difference to previous step\n", + "# geometric interpretation: project parameters into hyper unit cube and calculate the velocity\n", + "\n", + "# only look at params and exclude reccomendation\n", + "run_param_df = optim_run_df.loc[\n", + " optim_run_df.index.get_level_values(\"Step\") != \"reccomendation\", param_specs.index\n", + "]\n", + "normalized = (run_param_df - param_specs[\"Min\"]) / (\n", + " param_specs[\"Max\"] - param_specs[\"Min\"]\n", + ")\n", + "velocity = normalized.diff()\n", + "# norm2 (root of sum of squares)\n", + "velocity = (velocity**2).sum(axis=\"columns\") ** 0.5\n", + "# plot the velocity of the parameter adjustments\n", + "velocity.unstack(level=0).plot(title=\"velocity of parameter adjustments\")\n", + "velocity.unstack(level=0).plot(\n", + " subplots=True, sharey=True, title=\"velocity of parameter adjustments\"\n", + ")\n", + "velocity.unstack(level=0).plot(\n", + " subplots=True, sharey=False, title=\"velocity of parameter adjustments\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# close database\n", + "dta.close_database()\n", + "# close connection\n", + "reader_c.send(\"close\")\n", + "reader_c.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# delete the temporary folder\n", + "if DELETE_TEMP:\n", + " shutil.rmtree(temp_folder)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print the recommendations one last time\n", + "for key, df in optim_run_df.groupby(level=\"Optimizer\"):\n", + " display(key,df.loc[(key, \"reccomendation\"), param_specs.index])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}