From 564aa7fa1668f001a01b9801e3027ea141d32868 Mon Sep 17 00:00:00 2001
From: Florian Solbach <florian.solbach@rwth-aachen.de>
Date: Fri, 5 Jul 2024 13:59:08 +0000
Subject: [PATCH] Squashed commits for publication

---
 .gitignore                                    |  11 +
 CITATION.CFF                                  |  20 +
 CONTRIBUTING.md                               |  14 +
 LICENSE.md                                    |   1 +
 README.md                                     | 161 ++--
 examples/features.rmmd                        | 165 ++++
 examples/minimum.rmmd                         |   8 +
 python/LICENSE.md                             |   7 +
 python/pyproject.toml                         |  16 +
 python/src/rmmd/__init__.py                   |   0
 python/src/rmmd/cli.py                        | 131 ++++
 python/src/rmmd/update.py                     | 315 ++++++++
 python/src/rmmd/validate.py                   |  51 ++
 python/test/rmmd/test_update.py               | 161 ++++
 python/test/rmmd/test_update/cff/after.cff    |  24 +
 python/test/rmmd/test_update/cff/after.rmmd   |  31 +
 python/test/rmmd/test_update/cff/before.cff   |  17 +
 python/test/rmmd/test_update/cff/before.rmmd  |  18 +
 .../test/rmmd/test_update/ct_yaml/after.rmmd  |  32 +
 .../test_update/ct_yaml/after_add_part.rmmd   |  20 +
 .../test_update/ct_yaml/after_no_json.rmmd    |  24 +
 .../test/rmmd/test_update/ct_yaml/before.rmmd |  22 +
 .../ct_yaml/before_no_species.rmmd            |   8 +
 .../test/rmmd/test_update/ct_yaml/gri3.0.yaml | 187 +++++
 .../rmmd/test_update/ct_yaml/species.json     |  30 +
 .../rmmd/test_update/ct_yaml/species.yaml     |  18 +
 reaction-model-metadata/0.0.1/schema.json     | 707 ++++++++++++++++++
 27 files changed, 2139 insertions(+), 60 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 CITATION.CFF
 create mode 100644 CONTRIBUTING.md
 create mode 100644 LICENSE.md
 create mode 100644 examples/features.rmmd
 create mode 100644 examples/minimum.rmmd
 create mode 100644 python/LICENSE.md
 create mode 100644 python/pyproject.toml
 create mode 100644 python/src/rmmd/__init__.py
 create mode 100644 python/src/rmmd/cli.py
 create mode 100644 python/src/rmmd/update.py
 create mode 100644 python/src/rmmd/validate.py
 create mode 100644 python/test/rmmd/test_update.py
 create mode 100644 python/test/rmmd/test_update/cff/after.cff
 create mode 100644 python/test/rmmd/test_update/cff/after.rmmd
 create mode 100644 python/test/rmmd/test_update/cff/before.cff
 create mode 100644 python/test/rmmd/test_update/cff/before.rmmd
 create mode 100644 python/test/rmmd/test_update/ct_yaml/after.rmmd
 create mode 100644 python/test/rmmd/test_update/ct_yaml/after_add_part.rmmd
 create mode 100644 python/test/rmmd/test_update/ct_yaml/after_no_json.rmmd
 create mode 100644 python/test/rmmd/test_update/ct_yaml/before.rmmd
 create mode 100644 python/test/rmmd/test_update/ct_yaml/before_no_species.rmmd
 create mode 100644 python/test/rmmd/test_update/ct_yaml/gri3.0.yaml
 create mode 100644 python/test/rmmd/test_update/ct_yaml/species.json
 create mode 100644 python/test/rmmd/test_update/ct_yaml/species.yaml
 create mode 100644 reaction-model-metadata/0.0.1/schema.json

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..889a8a8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+# python
+__pycache__/
+*.py[cod]
+
+
+# IDEs
+.vscode/
+.idea/ 
+
+# other
+tmp/
diff --git a/CITATION.CFF b/CITATION.CFF
new file mode 100644
index 0000000..8eaddd5
--- /dev/null
+++ b/CITATION.CFF
@@ -0,0 +1,20 @@
+cff-version: 1.2.0
+title: Reaction Model Metadata
+message: >
+  If you use RMMD in your work, please cite it using the metadata 
+  from this file.
+authors:
+  - given-names: Florian
+    family-names: Solbach
+  - given-names: Sanket
+    family-names: Girhe
+  - given-names: Enia
+    family-names: Mudimu
+repository-code: 'https://git-ce.rwth-aachen.de/ltt/reaction-model-metadata'
+keywords:
+  - reaction model
+  - reaction mechanism
+  - RMMD
+  - provenance
+license: CC-BY-4.0
+version: 0.0.1
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..0fff3bd
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,14 @@
+Thank you for considering contributing to RMMD. There are several ways you can help this project. You can
+
+- tell others about RMMD,
+- make suggestions,
+- report bugs, or 
+- contribute directly to our documentation, examples, and codebase.
+
+To keep track of what needs to be done and who is doing what, we use GitLab issues. We encourage you to either create new issues for your contributions or check and contribute to existing ones. If you have suggestions or questions, feel free to open an issue or contact us via email.
+
+For contributions related to code, documentation, or examples, please follow these steps:
+
+1. Identify an issue to work on or create a new one and assign it to yourself.
+2. Fork the repository and make your contributions in your fork.
+3. Once you're ready, create a merge request and ask for a review.
\ No newline at end of file
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..f7d5ff2
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1 @@
+The Reaction Model Metadata format, documentation, examples, etc. (anything not in the python/ directory of this repository) is licensed under CC BY 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by/4.0/
\ No newline at end of file
diff --git a/README.md b/README.md
index 3cfcace..2eb1a7a 100644
--- a/README.md
+++ b/README.md
@@ -1,60 +1,101 @@
-# Let's FAIRify Reaction Mechanisms
-
-goal: design and establish a standard for sharing reaction mechanism data in a FAIR way
-
-## strategies
-
-- low activation energy
-- design for extendability
-- allow distribution but central index?
-    - pro distribution: do not insist on platform or way to publish (esp. if data is already published somewhere else, this could not be included)
-    - pro centrality: findability
-    - pro distribution: allow to hide some data behind authentication/authorization 
-- science is done on the shoulders of giants
-    - use existing infrastructure, standards and software
-
-
-## FAIR principles
-
-| principle      | subgoal | description                                                                                               |
-| -------------- | :------ | --------------------------------------------------------------------------------------------------------- |
-| findable       | F1      | (Meta) data are assigned globally unique and persistent identifiers                                       |
-|                | F2      | Data are described with rich metadata                                                                     |
-|                | F3      | Metadata clearly and explicitly include the identifier of the data they describe                          |
-|                | F4      | (Meta)data are registered or indexed in a searchable resource                                             |
-| accessible     | A1      | (Meta)data are retrievable by their identifier using a standardised communication protocol                |
-|                | A1.1    | The protocol is open, free and universally implementable                                                  |
-|                | A1.2    | The protocol allows for an authentication and authorisation procedure where necessary                     |
-|                | A2      | Metadata should be accessible even when the data is no longer available                                   |
-| interoperatble | I1      | (Meta)data use a formal, accessible, shared, and broadly applicable language for knowledge representation |
-|                | I2      | (Meta)data use vocabularies that follow the FAIR principles                                               |
-|                | I3      | (Meta)data include qualified references to other (meta)data                                               |
-| reusable       | R1      | (Meta)data are richly described with a plurality of accurate and relevant attributes                      |
-|                | R1.1    | (Meta)data are released with a clear and accessible data usage license                                    |
-|                | R1.2    | (Meta)data are associated with detailed provenance                                                        |
-|                | R1.3    | (Meta)data meet domain-relevant community standards                                                       |
-
-
-
-### General Measures
-
-- define standard
-    - store info in efficient way
-    - maybe define "levels of FAIRness" for a mechanism
-        - e.g. minimum: license + provenance data for mechanism as a whole, i.e., who created it
-- build tools
-    - tools to help add metadata to mechanisms
-        - need to be able to deal with fact that DOI become available much later than data
-        - need to be able to allow bulk edits -> assign the same provenance info to a lot of parameters
-        - compatible with existing data (e.g. output of ChemTraYzer; in first step, probably not fully automatically)
-    - tools for searching and indexing
-        - ability to do some inference -> e.g. parameter in mech1 was generated by activity that used paremeter in mech2 => mech1 was derived from mech2
-    - tools for visualization
-        - visualize provenance info?
-- build community
-
-### Our measures to achieve FAIRness
-
-- F1
-    - publish each dataset with a DOI and give each parameter a local id
-        - e.g. for reaction mechanisms: DOI + species/reaction/reaction_class (i.e. concrete model instance) + model (e.g. Arrhenius eqn) +  parameter name (e.g. E_a)
+# The Reaction Model Metadata Format (RMMD)
+
+... is a YAML-based file format that is defined using a JSON schema.
+It is similar to, and in parts based on, the [Citation File Format](https://citation-file-format.github.io/) [^CFF], but it is specific to the domain of reaction modeling.
+An RMMD file is essentially a YAML file with the extension `.rmmd` that follows a specific structure (check out the `examples/` directory).
+
+The RMMD format should enable researchers to add meaningful and detailed metadata to their reaction models, rate constant parameters, etc.
+It should make it easier for other researchers to find reaction model data, allow them to determine its usefulness to them and enable automated data collection, e.g., for machine learning.
+
+[^CFF]: published under CC-BY 4.0: Druskat, S., Spaaks, J. H., Chue Hong, N., Haines, R., Baker, J., Bliven, S., Willighagen, E., Pérez-Suárez, D., & Konovalov, O. (2021). Citation File Format (Version 1.2.0) [Computer software]. https://doi.org/10.5281/zenodo.5171937
+
+An RMMD file contains information about species, reactions and parameters.
+For each parameter (set), one can add provenance information by specifying the research activity that created the parameter.
+This way of providing provenance information is already used in the [PROV-O ontology](https://www.w3.org/TR/prov-o/) and has been applied to the problem of describing research processes, data and materials in the [Metadata4Ing ontology](https://nfdi4ing.pages.rwth-aachen.de/metadata4ing/metadata4ing/1.2.1/index.html).
+
+The concepts of describing the evolution of data/materials during a research activity as inputs and outputs of an activity/processing step, as well as the association of an activity with tools and methods, come from the Metadata4Ing ontology.
+It is much more general than this format, but also not as straightforward and easy to apply. In principle, the provenance mechanism in RMMD is compatible with Metadata4Ing.
+
+# Content
+
+<!--This is Gitlab specific syntax to generate a table of contents -->
+
+[[_TOC_]]
+
+# Challenges With Common Publication Practices
+
+In the past decades, researchers have produced a lot of reaction models[^rm-def] but often without relevant metadata.
+The emergence of automated tools for reaction model generation (e.g., RMG, Kinbot, ChemTraYzer, ...) led to the production and publication of even more data.
+The models are usually provided as SI to papers or uploaded to public repositories as Chemkin or Cantera YAML files.
+However, the principal goal of these file formats was not the publication of data in a FAIR[^fairprinciples] way (findable, accessible, interoperable and reusable) but rather to be used as input files for simulation codes.
+In practice this means that there are a bunch of problems other researchers can run into when trying to build upon this data:
+
+- No canonical IDs: Species and reactions are usually identified by names such as "CHFCHCF3", which are not always sufficient to identify the intended chemical species, sometimes not even by reading the associated publication.
+- Missing provenance: As it is not uncommon to use parametrization from other mechanisms, it is not always clear where a specific parameter originally came from, what method was used, or how it was changed over different publications.
+- Findability: Even if someone already determined accurate parameters for a species or reaction you are interested in, it can be hard to find that data and the associated publication, e.g., when a mechanism includes data on a reaction intermediate whose IUPAC name is not mentioned in the publication.
+- Many more: no data usage license, unknown uncertainties, ...
+
+[^rm-def]: Here, a reaction model is considered the set of reactions, species and model parameters for kinetics, material transport and thermochemistry, typically distributed as a dataset.
+[^fairprinciples]: [FAIR Principles](https://www.go-fair.org/fair-principles/) 
+
+## A Metadata Schema as Part of the Solution
+
+Some FAIR principles can be implemented fairly <!-- ;) --> easily, e.g., by adding a data usage license to one's model and uploading the model to a platform that provides DOIs.
+
+The above named problems, however, require the model creator to add additional information specific to the domain of reaction-modeling or chemistry (["rich" metadata](https://www.go-fair.org/fair-principles/r1-metadata-richly-described-plurality-accurate-relevant-attributes/)).
+For this rich metadata to be really useful to others, it needs to be machine readable which requires a standardized way of supplying this kind of data.
+This is where metadata schemas come into play.
+Since we could not find a suitable schema, e.g., on [https://fairsharing.org/](https://fairsharing.org/) or in the [Metadata Standards Catalog](https://rdamsc.bath.ac.uk/) we started developing a new one.
+
+# Creating Your Own RMMD File
+
+## Editing RMMD files
+
+Since RMMD files are just YAML and use JSON Schema, you can get syntax highlighting, autocompletion and validation in some editors.
+
+**For VSCode**: you can use the [YAML plugin by Red Hat](https://marketplace.visualstudio.com/items?itemName=redhat.vscode-yaml) and add the following configuration to your `settings.json`:
+
+```json
+"files.associations": {
+  "*.rmmd": "yaml"
+}
+```
+
+By adding a pointer to the current RMMD schema as a comment at the top of your RMMD file, you can also activate live validation:
+
+```yaml
+# yaml-language-server: $schema=../reaction-model-metadata/0.0.1/schema.json
+```
+
+
+**For PyCharm**: You can also use YAML syntax highlighting. Similarly, you can activate validation with a comment at the top of your RMMD file:
+
+```yaml
+# $schema:../reaction-model-metadata/0.0.1/schema.json
+```
+
+## Python Tools
+
+The easiest way to install the Python tools is currently in development mode using pip. We recommend creating a separate Python environment before doing this:
+
+``` shell
+git clone git@git-ce.rwth-aachen.de:ltt/reaction-model-metadata.git rmmd/
+cd rmmd/python
+pip install -e .
+cd ..
+```
+
+Now you can validate your RMMD files using:
+
+``` shell
+rmmd validate --schema reaction-model-metadata/0.0.1/schema.json examples/features.rmmd
+``` 
+
+You can also update an RMMD file from a CFF file (and vice versa) or from a reaction model (currently only Cantera's YAML input format is supported):
+
+``` shell
+rmmd update CITATION.CFF path/to/your/metadata.rmmd
+```
+
+<!-- footnotes -->
+---------------------------
diff --git a/examples/features.rmmd b/examples/features.rmmd
new file mode 100644
index 0000000..18657aa
--- /dev/null
+++ b/examples/features.rmmd
@@ -0,0 +1,165 @@
+# yaml-language-server: $schema=../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: "My cool reaction model"
+authors:
+  - family-names: Mustermann
+    given-names: Maximilian
+    email: m.mustermann@example.org
+  - family-names: Doe
+    given-names: Jane
+    orcid: https://orcid.org/0000-0000-0000-0000
+date-released: "2024-01-01"
+doi: 10.12345/6789
+license: CC-BY-4.0
+parts: # parts of this dataset (e.g. different files/reaction models)
+  - name: mychemkinfile
+    type: chemkin  # should be easy to extend by additional types
+    relative-path: ./my_model.chemkin
+    sha-1: 9c6e2367dfdae7e4dbb6221a37275c3b42265465
+    # remove format enum and set for each key as additional constraint (value is set implicitly in models)
+  - name: mythermodata  # type of model is used as key
+    type: chemkin-therm
+    relative-path: my_model.therm
+    sha-1: 0123456789abcdef0123456789abcdef01234567
+    format: chemkin-thermff
+  - name: mykineticstable
+    type: csv
+    relative-path: my_kinetics.csv
+species:
+  mythermodata:
+    - name: CH4
+      local-name: methane 
+      inchi: InChI=1S/CH4/h1H4
+      smiles: C
+      multiplicity: 1  # we could even provide charge and multiplicity, if relevant
+    - name: C3H7 # for lumped species, it looks like this
+      entities:
+        - inchi: InChI=1S/molecular_entity_one
+        - inchi: InChI=1S/other
+  # species are just listed by the part of the dataset that contains parameterizations for them
+  mychemkinfile:
+    - name: CH3
+      inchi: InChI=1S/CH3/h1H3
+      composition:    # optional because usually contained in model
+        C: 1
+        H: 3
+reactions:      # here, we can supply additional information about
+                # reactions or give them local names that can be used
+                # to easily reference them later
+  mychemkinfile:
+    - name: CH4 = CH3  # just nonsense example
+      local-name: rxn1 # define a new name so we do not have to use the equation
+      lineno: 121  # instead of giving the name (i.e. equation), we could also provide the line number
+      products: [CH3]  
+      reactants: [CH4] # as long as the names in this metadata file are unique, we can reference species contained in a different part of this dataset (here, CH4 is in mythermodata). In case of a name collision, we have to provide the local name
+  mykineticstable:
+    - name: "10"  # in a csv table, using the equation may not be the best approach to identify a reaction -> use a different identifier
+      identifier: reaction_id  # name of the column in the csv file
+      local-name: rxn2  # name we use here for the reaction
+      products:  {H2O: 2} # instead of a list of names, we can also supply the stoichiometry as a mapping from species name to coefficient
+      reactants: {H2: 2, O2: 1}
+activities:
+  my-theoretical-study:
+    references: # in the laziest case, I just supply the DOI to the study describing the parameterization
+      - type: article
+        doi: 10.12345/6789
+        journal: Journal of Interesting Research
+        title: A new study on some cool system.
+        year: 2024
+        authors:   # If we 
+          - family-names: Mustermann
+            given-names: Maximilian
+            email: m.mustermann@example.org
+          - family-names: Doe
+            given-names: Jane
+            orcid: https://orcid.org/0000-0000-0000-0000
+    description: computational study on system X. ...
+    # If we want to describe the study in more detail, we can also define
+    # subactivities, which can be nested arbitrarily deep and list methods,
+    # tools, and datasets used for each subactivity. Alternatively, we could
+    # simply list the methods and tools used for the entire study here.
+    subactivities:
+      - opt+freq
+      - single-point-calculations   # we can describe several calculations/
+                                    # experiments as one activity, or...
+      - ts-optimization     # ... we can define an activity for a single
+                            # calculation, if this degree of detail is preferred
+      - thermochemistry-calc
+  single-point-calculations:
+    part-of: my-theoretical-study # not really necessary, but helps when reading
+    # activities can realize/apply methods and employ tools, here, for a
+    # computational study
+    applied-methods:
+      - type: computational chemistry
+        method: DLPNO-CCSD(T)
+        basis-set: cc-pVTZ
+    employed-tools:
+      - software: Orca
+        version: 5.0.2
+    output: # here we can reference the dataset produced by this activity
+      - &qm_data  # use YAML anchor to reuse this later
+        type: doi
+        value: 10.19061/iochem-bd-123456
+        description: Dataset containing all QM results for this study
+  opt+freq:
+    output: [*qm_data]  # reuse the reference to the dataset defined above
+    applied-methods:
+      - &B3LYP_def2-TZVP
+        type: computational chemistry
+        method: DFT
+        basis-set: def2-TZVP  # TODO: how detailed do we want to describe lot?
+        functional: B3LYP  # "functional" is currently not hard-coded into the schema, but we could add different fields for different QM methods
+        disp-corr: D3-BJ    # optional, specify dispersion correction
+    employed-tools:
+      - &G16    # again, for easy reuse
+        software: Gaussian
+        version: "16"
+  ts-optimization:
+    applied-methods:  [*B3LYP_def2-TZVP]
+    employed-tools: [*G16]
+    output:
+      - <<: *qm_data
+        # here, we reuse the reference to the dataset above, but override the
+        # description
+        description: The TS computation results are titled "TS_data" within the dataset.
+  thermochemistry-calc:
+    description: Calculation of thermochemical properties using the RRHO model for multiple conformers
+    employed-tools:
+      - &CTY
+        software: ChemTraYzer
+        version: 3.0.1
+    input:
+      - *qm_data
+  TST-applictation:
+    description: Transition state theory calculations
+    input:
+      - output-of: thermochemistry-calc  # if we did not publish the full dataset for some reason, we can at least describe it via the activity that generated it
+      - *qm_data
+    employed-tools: [*CTY]
+    applied-methods:  # for methods/models that are not specified by the schema, we can use the general type and add a description. -> really useful? this is not really machine readable anyway
+      - type: custom
+        description: Eyring equation for rate constant calculation
+parameters:
+  mythermodata:  # use name of dataset part as key
+    - species: CH4
+      param-id: "..."  # in case the species name, model, symbols, T range, etc. is not enough or as additional info if available
+      # params: NASA7:a0,a1,a3-a7
+      params: {NASA7: "a1, a2, a3-a6"}
+      T-range: [200, 1000]
+      generated-by: my-theoretical-study
+    - species: [CH3,C5H6]
+      params: 
+        NASA7: a0
+      taken-from: # "taken-from" is basically a shortcut for describing the activity of taking a parameter directly from a literature source
+            # TODO: how to deal with analogies, etc. here? -> eg
+        # here, we could reference the ATcT
+        - type: doi 
+          value: 10.1021/jacs.12345678
+          description: "Paper in which they computed standard enthalpies of formation for several molecules"
+    - reaction: rxn1  # reaction name can be empty, if not available
+      params: {Arrhenius: "*"}  # * simply means all parameters of the model
+      generated-by: TST-applictation  # again, if we want, we can reference the
+                                      # specific (sub)activity
+    - reaction: [rxn2,rxn1]
+      params: {Arrhenius: "b"}
+      generated-by: TST-applictation
\ No newline at end of file
diff --git a/examples/minimum.rmmd b/examples/minimum.rmmd
new file mode 100644
index 0000000..537f9c0
--- /dev/null
+++ b/examples/minimum.rmmd
@@ -0,0 +1,8 @@
+# yaml-language-server: $schema=../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: "My cool reaction model"
+authors:
+  - family-names: Mustermann
+    given-names: Maximilian
+  - family-names: Doe
+    given-names: Jane
\ No newline at end of file
diff --git a/python/LICENSE.md b/python/LICENSE.md
new file mode 100644
index 0000000..02ac34b
--- /dev/null
+++ b/python/LICENSE.md
@@ -0,0 +1,7 @@
+Copyright 2024 The RMMD python package authors. 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..a82422b
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,16 @@
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+
+[project]
+name = "rmmd"
+version = "0.0.1"
+dependencies = [
+    "ruamel.yaml",
+    "jsonschema",   
+]
+
+requires-python = ">=3.10"
+
+[project.scripts]
+rmmd = "rmmd.cli:main"
diff --git a/python/src/rmmd/__init__.py b/python/src/rmmd/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/src/rmmd/cli.py b/python/src/rmmd/cli.py
new file mode 100644
index 0000000..82e9226
--- /dev/null
+++ b/python/src/rmmd/cli.py
@@ -0,0 +1,131 @@
+import argparse
+import logging
+from pathlib import Path
+import sys
+
+from rmmd.update import AbortedByUser, UpdateOptions, update
+from rmmd.validate import RmmdValidationError, validate_rmmd
+
+def _add_validate_args(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        '--schema',
+        help='Path to schema file',
+        metavar='SCHEMA',
+        required=True,
+        type=Path)
+    parser.add_argument(
+        'rmmd',
+        help='Path to RMMD file to validate',
+        metavar='RMMD',
+        type=Path)
+    
+
+def _add_update_args(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        'source',
+        help='File to take data from',
+        metavar='SOURCE',
+        type=Path)
+    parser.add_argument(
+        'target',
+        help='File to update (will be overwritten!)',
+        metavar='TARGET',
+        type=Path)
+    parser.add_argument(
+        '--root',
+        help='Root directory of the project. Default: current working directory.',
+        metavar='ROOT',
+        type=Path,
+        default=Path.cwd()
+    )
+    parser.add_argument(
+        '--species',
+        help='Path to JSON/YAML file containing species information',
+        metavar='SPECIES_FILE',
+        type=Path,
+        default=None
+    )
+    ## mutually exclusive group:
+    backup_grp = parser.add_mutually_exclusive_group()
+    backup_grp.add_argument(
+        '--backup',
+        help='Create a backup of the target file',
+        action='store_true',
+        dest='backup',
+        default=True
+    )
+    backup_grp.add_argument(
+        '--no-backup',
+        help='Do not create a backup of the target file',
+        action='store_false',
+        dest='backup',
+        default=True
+    )
+    
+def _parent_parser() -> argparse.ArgumentParser:
+    parent_parser = argparse.ArgumentParser(add_help=False)
+    parent_parser.add_argument(
+        '-v', '--verbose',
+        help='Enable verbose logging',
+        action='store_true'
+    )
+
+    return parent_parser
+
+def main():
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    formatter = logging.Formatter('%(message)s')
+    # use stdout because INFO-level messages are the regular output shown to the user
+    handler = logging.StreamHandler(stream=sys.stdout)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    parent = _parent_parser()   # contains arguments that should always be there
+    parser = argparse.ArgumentParser(parents=[parent])
+    subparsers = parser.add_subparsers(dest='_cmd_')
+
+    validate_parser = subparsers.add_parser('validate', parents= [parent])
+    _add_validate_args(validate_parser)
+
+    update_parser = subparsers.add_parser('update', parents= [parent])
+    _add_update_args(update_parser)
+
+    args = parser.parse_args()
+
+    if args.verbose:
+        logger.setLevel(logging.DEBUG)
+
+
+    if args._cmd_ == 'validate':
+        try:
+            validate_rmmd(args.rmmd, args.schema)
+        except RmmdValidationError as err:
+            logging.error(err)
+            exit(1)
+        except Exception as err:
+            logging.error(err)
+            logging.debug('An error occurred.', exc_info=err)
+            exit(1)
+        else:
+            logging.info('Validation successful.')
+    elif args._cmd_ == 'update':
+        try:
+            opts = UpdateOptions(
+                root=args.root,
+                species_json=args.species,
+                backup=args.backup
+            )
+            update(args.source, args.target, opts)
+        except AbortedByUser as err:
+            logging.info(err)
+            exit(0)
+        except Exception as err:
+            logging.error(err)
+            logging.debug('An error occurred.', exc_info=err)
+            exit(1)
+    else:
+        parser.print_help()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/python/src/rmmd/update.py b/python/src/rmmd/update.py
new file mode 100644
index 0000000..fc2fce3
--- /dev/null
+++ b/python/src/rmmd/update.py
@@ -0,0 +1,315 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+import json
+import logging
+import os
+from pathlib import Path
+import re
+import shutil
+from typing import Any, Iterable, Mapping
+from jsonschema import validate
+from ruamel.yaml import YAML
+
+@dataclass
+class UpdateOptions:
+    """stores options for how to update files"""
+    root: Path
+    """root directory of the dataset"""
+    species_json: Path|None
+    """path to the species names JSON file (or YAML)"""
+    backup: bool
+    """create backup of the target file before updating"""
+
+    def __post_init__(self):
+        self.root = Path(self.root).resolve()
+        if self.species_json is not None:
+            self.species_json = Path(self.species_json).resolve()
+
+class AbortedByUser(Exception):
+    """indicates that the user aborted the operation"""
+
+
+Object =  Mapping[str, Any]
+'''General type representing a JSON/YAML-serialized object'''
+
+_RMMD_CFF_COMMON = ['title', 'authors', 'references', 'license', 'license-url',
+                   'doi', 'date-released', 'version']
+'''fields (hyphenated keys, as in CFF) which the citation file format and the reaction model metadata format have in common'''
+
+def _log_changes(added: Iterable[str], updated: Iterable[str],
+                 removed: Iterable[str], target_path: Path):
+    def format_items(items: Iterable[str|tuple[str]]) -> list[str]:
+        new = []
+        for item in items:
+            if isinstance(item, str):
+                new.append(f'"{item}"')
+            else:
+                new.append('"' + '/'.join(item) +  '"')
+        
+        return new
+    
+    added = format_items(added)
+    updated = format_items(updated)
+    removed = format_items(removed)
+
+    msg = ''
+    if added:
+        msg += f' Added {", ".join(added)}.'
+    if updated:
+        msg += f' Updated {", ".join(updated)}.'
+    if removed:
+        msg += f' Removed {", ".join(removed)}.'
+    if msg == '':
+        msg = 'No changes made.'
+    else:
+        msg = f'Successfully updated {target_path.name}:' + msg
+
+    logging.info(msg)
+
+class Format(Enum):
+    CFF = 'citation file format'
+    RMMD = 'RMMD'
+    CT_YAML = 'Cantera YAML'
+
+class UnknownFormatError(Exception):
+    """raised when the format of a file cannot be determined"""
+
+def _determine_format(path: Path) -> Format:
+    '''crude heuristics to determine the format of a file
+
+    :raises: UnknownFormatError if none of the heuristics matches'''
+    if path.name.lower() == 'citation.cff':  # allow some slack with capitalization
+        return Format.CFF
+    if path.suffix == '.rmmd':
+        return Format.RMMD
+    if path.suffix == '.yaml':
+        # open file and check for rmmd-version or phases key
+        with open(path, 'r') as f:
+            for line in map(str.strip, f):
+                if line.startswith('rmmd-version:'):
+                    return Format.RMMD
+                if line.startswith('phases:'):
+                    return Format.CT_YAML
+    raise UnknownFormatError(f'Could not determine format of {path}')
+
+def _determine_part(rmmd_content: Object, ctyaml: Path, root: Path) -> str:
+    """determine name of the dataset part for a Cantera YAML file
+
+    creates an empty "parts" list in rmmd_content if there is none; the part
+    itself is not added here, only its (existing or new) name is returned"""
+    if 'parts' not in rmmd_content:
+        rmmd_content['parts'] = []
+        return 'cantera-yaml-1'
+
+    rel_path = ctyaml.relative_to(root)     # does not change inside the loop
+    existing_names: list[int] = []
+    for part in rmmd_content['parts']:
+        if Path(part['relative-path']) == rel_path:
+            for checksum in ['sha-1', 'md5']:
+                if checksum in part:
+                    logging.warning('Checksum %s found for %s. Will not be '
+                                    'updated.', checksum, ctyaml)
+            return part['name']
+
+        # collect names matching cantera-yaml-X to avoid duplicates
+        match = re.search(r'cantera-yaml-(\d+)', part['name'])
+        if match:
+            existing_names.append(int(match.group(1)))
+
+    # default=0 so that the first generated name is cantera-yaml-1
+    part_name = f'cantera-yaml-{max(existing_names, default=0) + 1}'
+    logging.info('Could not find %s in the RMMD file. It will be added as %s',
+                 str(rel_path), part_name)
+    return part_name
+
+def _load_species_def(species_json: Path) -> dict:
+    if species_json.suffix == '.json':
+        with open(species_json, 'r') as f:
+            # TODO validate against schema
+            return json.load(f)
+    elif species_json.suffix == '.yaml':
+        return YAML(typ='safe').load(species_json)
+    else:
+        raise ValueError(f'Unknown species JSON format {species_json.suffix}')
+    
+def _update_ctyaml2rmmd(ctyaml: Path, rmmd: Path, opts: UpdateOptions) -> tuple:
+    """update the "parts" and "species" sections of an RMMD file from a Cantera YAML file
+
+    :return: (added, updated, removed) - key paths of the changed RMMD entries"""
+    added, updated = [], []     # "removed" is assembled further below
+    ctyaml = ctyaml.resolve()
+    if not ctyaml.is_relative_to(opts.root):
+        raise ValueError(f'{ctyaml} is not in the dataset root {opts.root}')
+
+    ct_content = YAML(typ='safe').load(ctyaml)
+    rmmd_content = YAML(typ='rt').load(rmmd)
+
+    part_name = _determine_part(rmmd_content, ctyaml, opts.root)
+
+    ct_yaml_part = {            # data to plug into the parts section
+            'name': part_name,
+            'relative-path': str(ctyaml.relative_to(opts.root)),
+            'type': 'cantera-yaml'
+        }
+
+    existing_parts = {p['name']: i for i, p in enumerate(rmmd_content['parts'])}
+
+    if part_name not in existing_parts:
+        added.append(('parts', part_name))
+
+        rmmd_content['parts'].append(ct_yaml_part)
+    elif rmmd_content['parts'][existing_parts[part_name]] != ct_yaml_part:
+        updated.append(('parts', part_name))
+        rmmd_content['parts'][existing_parts[part_name]].update(ct_yaml_part)
+
+    if opts.species_json is not None:
+        species_info = _load_species_def(opts.species_json)
+    else:
+        species_info = {}
+
+    def get_species_def(name: str) -> dict:
+        if name in species_info:
+            return {'name': name, **species_info[name]}
+        else:
+            return {'name': name}
+
+    if 'species' not in rmmd_content:
+        rmmd_content['species'] = {}
+    if part_name not in rmmd_content['species']:
+        rmmd_content['species'][part_name] = []
+    rmmd_species = rmmd_content['species'][part_name]
+
+    # species id by name  (both are lists of dicts with the field "name")
+    rmmd_sid_by_n = {s['name']: i for i, s in enumerate(rmmd_species)}
+    ct_sid_by_n = {s['name']: i for i, s in enumerate(ct_content['species'])}
+
+    new_s = set(ct_sid_by_n.keys()) - set(rmmd_sid_by_n.keys())
+    # sort to ensure predictable output > testability
+    new_s = sorted(new_s, key=lambda n: ct_sid_by_n[n])
+    removed_s = set(rmmd_sid_by_n.keys()) - set(ct_sid_by_n.keys())
+    common_s = set(rmmd_sid_by_n.keys()) & set(ct_sid_by_n.keys())
+
+    for s_name in common_s:
+        current = rmmd_species[rmmd_sid_by_n[s_name]]
+        s_def = get_species_def(s_name)
+        if current != s_def:
+            updated.append(('species', part_name, s_name))
+            # if we switch from a single entity to a list of molecular entities,
+            # we have to remove the fields directly under the species
+            allowed_species_fields = {'name', 'composition', 'identifier', 'local-name'} # TODO: get from JSON schema
+            if 'entities' in s_def:
+                # iterate over a copy of the keys: deleting entries while
+                # iterating over the mapping itself raises a RuntimeError
+                for key in list(current.keys()):
+                    if key not in allowed_species_fields:
+                        del current[key]
+            elif 'entities' in current:  # but not in s_def!
+                del current['entities']
+            current.update(s_def)
+
+    remove_i = [rmmd_sid_by_n[n] for n in removed_s]
+    # iterate in reverse order to always remove from end of list
+    for i in sorted(remove_i, reverse=True):
+        del rmmd_species[i]
+
+    del rmmd_sid_by_n       # not valid anymore -> avoid bugs
+    removed = [('species', part_name, n) for n in removed_s]
+
+    for s_name in new_s:
+        added.append(('species', part_name, s_name))
+        rmmd_species.append(get_species_def(s_name))
+
+    yaml = YAML(typ='rt')
+    yaml.indent(mapping=2, sequence=4, offset=2)
+    yaml.dump(rmmd_content, rmmd)
+
+    return added, updated, removed
+
+
+def _update_cff2rmmd(cff: Path, rmmd: Path, _) -> tuple[list[str], list[str], list[str]]:
+    yaml = YAML(typ='rt')
+    yaml.indent(mapping=2, sequence=4, offset=2)
+
+    cff_content = yaml.load(cff)
+    rmmd_content = yaml.load(rmmd)
+
+    added, updated, removed = _update_cff_rmmd_common(cff_content, rmmd_content)
+
+    yaml.dump(rmmd_content, rmmd)
+
+    return added, updated, removed
+
+def _update_rmmd2cff(rmmd: Path, cff: Path, _):
+    yaml = YAML(typ='rt')
+    yaml.indent(mapping=2, sequence=4, offset=2)
+
+    rmmd_content = yaml.load(rmmd)
+    cff_content = yaml.load(cff)
+
+    added, updated, removed = _update_cff_rmmd_common(rmmd_content, cff_content)
+
+    yaml.dump(cff_content, cff)
+
+    return added, updated, removed
+
+def _update_cff_rmmd_common(source: Object, target: Object)\
+        -> tuple[list[str], list[str], list[str]]:
+    """synchronizes the fields listed in _RMMD_CFF_COMMON from source to target;
+    fields that are missing in source are deleted from target
+
+    :return: (added, updated, removed) - names of the fields that were added
+             to, changed in, or deleted from the target"""
+    added, updated, removed = [], [], []
+
+    for key in _RMMD_CFF_COMMON:
+        in_target = key in target
+        if key not in source:
+            if in_target:
+                removed.append(key)
+                del target[key]
+            continue
+        if not in_target:
+            added.append(key)
+        elif target[key] != source[key]:
+            updated.append(key)
+        else:
+            continue    # already identical -> leave target untouched
+        target[key] = source[key]
+
+    return added, updated, removed
+
+_UPDATER_FNCS = {
+    (Format.CFF, Format.RMMD): _update_cff2rmmd,
+    (Format.RMMD, Format.CFF): _update_rmmd2cff,
+    (Format.CT_YAML, Format.RMMD): _update_ctyaml2rmmd
+}
+
+def _backup_target(target: Path):
+    backup = target.with_suffix('.backup')
+
+    if backup.exists():
+        logging.warning(f'Backup file {backup} already exists.')
+        answer = input('overwrite? [y/n]')
+        if answer.lower() != 'y':
+            raise AbortedByUser('Exiting...')
+        backup.unlink()
+    
+    shutil.copyfile(target, backup)
+
+def update(source: Path, target: Path, opts: UpdateOptions = None):
+    '''updates a file based on another file
+
+    :param opts: update options; without them, no backup of target is created
+    :raises: ValueError if the combination of formats is not supported'''
+    source_format = _determine_format(source)
+    target_format = _determine_format(target)
+    format_def = (source_format, target_format)
+    if format_def not in _UPDATER_FNCS:
+        raise ValueError(f'Cannot update {target_format.value} with {source_format.value}')
+    # opts defaults to None -> must not be dereferenced unconditionally
+    if opts is not None and opts.backup:
+        _backup_target(target)
+    added, updated, removed = _UPDATER_FNCS[format_def](source, target, opts)
+    _log_changes(added, updated, removed, target)
diff --git a/python/src/rmmd/validate.py b/python/src/rmmd/validate.py
new file mode 100644
index 0000000..c5b3bdf
--- /dev/null
+++ b/python/src/rmmd/validate.py
@@ -0,0 +1,51 @@
+from datetime import date
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Mapping, TypeVar
+
+from jsonschema import ValidationError, validate
+from ruamel.yaml import YAML
+
+PathLike = TypeVar('PathLike', str, Path, os.PathLike)
+
+class RmmdValidationError(Exception):
+    """raised when an RMMD file fails validation"""
+
+def validate_rmmd(rmmd: PathLike, schema: PathLike):
+    """Validates an RMMD file against a JSON schema
+
+    :raises: RmmdValidationError if the content does not conform to the schema
+    """
+    rmmd = Path(rmmd)
+    schema = Path(schema)
+
+    logging.debug('Loading schema...')
+    with open(schema, 'rb') as f:
+        schema = json.load(f)
+
+    logging.debug('Loading RMMD file...')
+    yaml = YAML(typ='rt')
+    content = yaml.load(rmmd)
+
+    logging.info("Validating...")
+    try:
+        validate(instance=content, schema=schema)
+    except ValidationError as err:
+        raise RmmdValidationError(_prettyfy_validation_err(rmmd, content, err)) from err
+
+    
+def _prettyfy_validation_err(fpath: Path, content: Mapping,
+                            err: ValidationError) -> str:
+    """build a readable message that names the failing entry of the file"""
+    location = fpath.name + ''.join(f'[{el!r}]' for el in err.absolute_path)
+
+    # check for datetime objects (YAML feature)
+    offending = content
+    for step in err.absolute_path:
+        offending = offending[step]
+
+    if isinstance(offending, date):
+        return 'YAML date discovered. Surround by " to explicitly use strings.\nIn ' + location
+    return "Validation error in " + location + ":\n" + err.message
diff --git a/python/test/rmmd/test_update.py b/python/test/rmmd/test_update.py
new file mode 100644
index 0000000..b3e6055
--- /dev/null
+++ b/python/test/rmmd/test_update.py
@@ -0,0 +1,161 @@
+import shutil
+import pytest
+from pathlib import Path
+
+from rmmd.update import UpdateOptions, _load_species_def, update
+
+_HERE = Path(__file__).parent.resolve()
+_TEST_DATA = _HERE/'test_update'
+
+
+
+class TestCtYamlRmmd:
+    
+    _DATA = _TEST_DATA/'ct_yaml'
+
+    @pytest.fixture
+    def rmmd_file(self, tmp_path: Path) -> Path:
+        # copy _TEST_DATA/before.rmmd into tmp_path/model.rmmd
+        path = tmp_path/'model.rmmd'
+        shutil.copy(self._DATA/'before.rmmd', path)
+
+        return path
+    
+    @pytest.fixture
+    def rmmd_file_no_species(self, tmp_path: Path) -> Path:
+        path = tmp_path/'model.rmmd'
+        shutil.copy(self._DATA/'before_no_species.rmmd', path)
+
+        return path
+
+    @pytest.fixture
+    def expected_rmmd_file(self) -> Path:
+        return self._DATA/'after.rmmd'
+    
+    @pytest.fixture
+    def expected_rmmd_file_add_part(self) -> Path:
+        return self._DATA/'after_add_part.rmmd'
+    
+    @pytest.fixture
+    def expected_rmmd_file_no_json(self) -> Path:
+        return self._DATA/'after_no_json.rmmd'
+    
+    @pytest.fixture
+    def ct_yaml_file(self) -> Path:
+        return self._DATA/'gri3.0.yaml'
+       
+    @pytest.fixture
+    def opts(self) -> UpdateOptions:
+        return UpdateOptions(
+                    root=self._DATA,
+                    species_json=self._DATA/'species.json',
+                    backup = False)
+    
+    def test_load_species_def_with_yaml(self):
+        s_def = _load_species_def(self._DATA/'species.yaml')
+        # the JSON file is already used in other tests
+        expected_s_def = _load_species_def(self._DATA/'species.json')
+
+        assert s_def == expected_s_def
+    
+    def test_update_yaml2rmmd_w_backup(self, rmmd_file: Path,
+            expected_rmmd_file: Path,
+            ct_yaml_file: Path, opts: UpdateOptions):
+        opts.backup = True
+        expected_content = expected_rmmd_file.read_text()
+        before_content = rmmd_file.read_text()
+
+        update(ct_yaml_file, rmmd_file, opts)
+
+        after_content = rmmd_file.read_text()
+        backup_content = rmmd_file.with_suffix('.backup').read_text()
+
+        assert after_content == expected_content
+        assert before_content == backup_content
+
+    def test_update_yaml2rmmd_no_json(self, rmmd_file: Path,
+                expected_rmmd_file_no_json: Path, ct_yaml_file: Path,
+                opts: UpdateOptions):
+        opts.species_json = None
+        expected_content = expected_rmmd_file_no_json.read_text()
+
+        update(ct_yaml_file, rmmd_file, opts)
+
+        after_content = rmmd_file.read_text()
+
+        assert after_content == expected_content
+
+    def test_update_yaml2rmmd_no_json_add_part(self, rmmd_file_no_species: Path,
+                expected_rmmd_file_add_part: Path, ct_yaml_file: Path,
+                opts: UpdateOptions):
+        opts.species_json = None
+        expected_content = expected_rmmd_file_add_part.read_text()
+
+        update(ct_yaml_file, rmmd_file_no_species, opts)
+
+        after_content = rmmd_file_no_species.read_text()
+
+        assert after_content == expected_content
+
+class TestCffRmmd:
+
+    _DATA = _TEST_DATA/'cff'
+
+    @pytest.fixture
+    def opts(self) -> UpdateOptions:
+        return UpdateOptions(
+                root=self._DATA,
+                species_json=None,
+                backup=False)
+
+    @pytest.fixture
+    def cff_file(self, tmp_path: Path) -> Path:
+        # copy _TEST_DATA/before.cff into tmp_path/CITATION.cff
+        path = tmp_path/'CITATION.cff'
+        shutil.copy(self._DATA/'before.cff', path)
+
+        return path
+
+    @pytest.fixture
+    def expected_cff_file(self, tmp_path: Path) -> Path:        
+        # also used as input in one test -> needs proper name
+        cff = tmp_path/'CITATION.cff'
+        shutil.copy(self._DATA/'after.cff', cff)
+        return cff
+
+    @pytest.fixture
+    def rmmd_file(self, tmp_path: Path) -> Path:
+        path = tmp_path/'model.rmmd'
+        shutil.copy(self._DATA/'before.rmmd', path)
+
+        return path
+
+    @pytest.fixture
+    def expected_rmmd_file(self) -> Path:
+        return self._DATA/'after.rmmd'
+
+    def test_update_cff2rmmd(self, rmmd_file: Path, expected_cff_file: Path,
+                            expected_rmmd_file: Path, opts):
+        
+        update(expected_cff_file, rmmd_file, opts)
+
+        with open(rmmd_file, 'r') as f:
+            after = f.read()
+
+        with open(expected_rmmd_file, 'r') as f:
+            expected = f.read()
+
+        assert after == expected
+
+    def test_update_rmmd2cff(self, cff_file: Path, expected_cff_file: Path,
+                            expected_rmmd_file: Path, opts):
+        
+        update(expected_rmmd_file, cff_file, opts)
+        
+        with open(cff_file, 'r') as f:
+            after = f.read()
+
+        with open(expected_cff_file, 'r') as f:
+            expected = f.read()
+
+        assert after == expected
\ No newline at end of file
diff --git a/python/test/rmmd/test_update/cff/after.cff b/python/test/rmmd/test_update/cff/after.cff
new file mode 100644
index 0000000..876cdfd
--- /dev/null
+++ b/python/test/rmmd/test_update/cff/after.cff
@@ -0,0 +1,24 @@
+cff-version: 1.2.0
+message: If you use this software, please cite it as below.
+authors:
+  - family-names: John
+    given-names: Doe
+title: My Datset
+keywords: [reaction model, combustion]
+commit: 123456789abcdefghijklmnop   # commits are not part of RMMD
+date-released: 2024-01-01
+# this comment should remain here
+references:
+  - authors:
+      - name: Big Research Institution
+    date-released: '2021-07-26'
+    doi: 10.5281/zenodo.123456
+    title: Some software project
+    type: software
+    version: 0.1.0
+  - authors:
+      - given-names: Maximilian
+        family-names: Mustermann
+    title: Study on Things
+    type: article
+doi: 10.5281/zenodo.1234
diff --git a/python/test/rmmd/test_update/cff/after.rmmd b/python/test/rmmd/test_update/cff/after.rmmd
new file mode 100644
index 0000000..136aac5
--- /dev/null
+++ b/python/test/rmmd/test_update/cff/after.rmmd
@@ -0,0 +1,31 @@
+# yaml-language-server: $schema=../../../../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: My Datset
+authors:
+  - family-names: John
+    given-names: Doe
+date-released: 2024-01-01
+# this comment should remain here
+parts:    # this is not in the CFF file
+  - name: ckfile
+    type: chemkin
+    relative-path: ./mech/my_model.mech
+    sha-1: 9c6e2367dfdae7e4dbb6221a37275c3b42265465
+species:
+  ckfile:
+    - name: C3H6-4
+    - name: C4H8
+references:
+  - authors:
+      - name: Big Research Institution
+    date-released: '2021-07-26'
+    doi: 10.5281/zenodo.123456
+    title: Some software project
+    type: software
+    version: 0.1.0
+  - authors:
+      - given-names: Maximilian
+        family-names: Mustermann
+    title: Study on Things
+    type: article
+doi: 10.5281/zenodo.1234
diff --git a/python/test/rmmd/test_update/cff/before.cff b/python/test/rmmd/test_update/cff/before.cff
new file mode 100644
index 0000000..97b2568
--- /dev/null
+++ b/python/test/rmmd/test_update/cff/before.cff
@@ -0,0 +1,17 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - family-names: John
+    given-names: Doe
+title: "My Datset"
+keywords: ["reaction model", combustion]
+version: 1.0
+commit: 123456789abcdefghijklmnop   # commits are not part of RMMD
+date-released: 2024-01-01
+# this comment should remain here
+references:
+  - authors:
+      - given-names: Maximilian
+        family-names: Mustermann
+    title: "Study on Things"
+    type: article
diff --git a/python/test/rmmd/test_update/cff/before.rmmd b/python/test/rmmd/test_update/cff/before.rmmd
new file mode 100644
index 0000000..e0df9bf
--- /dev/null
+++ b/python/test/rmmd/test_update/cff/before.rmmd
@@ -0,0 +1,18 @@
+# yaml-language-server: $schema=../../../../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: "My Datset"
+authors:
+  - family-names: John
+    given-names: Doe
+version: 1.0
+date-released: 2024-01-01
+# this comment should remain here
+parts:    # this is not in the CFF file
+  - name: ckfile
+    type: chemkin
+    relative-path: ./mech/my_model.mech
+    sha-1: 9c6e2367dfdae7e4dbb6221a37275c3b42265465
+species:
+  ckfile:
+    - name: C3H6-4
+    - name: C4H8
\ No newline at end of file
diff --git a/python/test/rmmd/test_update/ct_yaml/after.rmmd b/python/test/rmmd/test_update/ct_yaml/after.rmmd
new file mode 100644
index 0000000..b9dbaf4
--- /dev/null
+++ b/python/test/rmmd/test_update/ct_yaml/after.rmmd
@@ -0,0 +1,32 @@
+# yaml-language-server: $schema=../../../../../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: My Datset
+authors:
+  - family-names: John
+    given-names: Doe
+version: 1.0
+date-released: 2024-01-01
+# this comment should remain here
+parts:    # this is not in the CFF file
+  - name: ckfile
+    type: cantera-yaml
+    relative-path: gri3.0.yaml
+species:
+  ckfile:
+    - name: H # add InChI, keep name
+      iupac: hydrogen
+      inchi: InChI=1S/H
+    - name: O2 # add entity
+      entities:
+        - inchi: InChI=1S/O2/c1-2
+          multiplicity: 3
+        - inchi: InChI=1S/O2/c1-2
+          multiplicity: 1
+    - name: OH # update InChI
+      inchi: InChI=1S/OH/h1H
+    - name: O
+    - name: CH3
+      inchi: InChI=1S/CH3/c1-2-3/h1H3
+    - name: CH4
+      inchi: InChI=1S/CH4/h1H4
+      iupac: methane
diff --git a/python/test/rmmd/test_update/ct_yaml/after_add_part.rmmd b/python/test/rmmd/test_update/ct_yaml/after_add_part.rmmd
new file mode 100644
index 0000000..364ffcd
--- /dev/null
+++ b/python/test/rmmd/test_update/ct_yaml/after_add_part.rmmd
@@ -0,0 +1,20 @@
+# yaml-language-server: $schema=../../../../../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: My Datset
+authors:
+  - family-names: John
+    given-names: Doe
+version: 1.0
+date-released: 2024-01-01
+parts:
+  - name: cantera-yaml-1
+    relative-path: gri3.0.yaml
+    type: cantera-yaml
+species:
+  cantera-yaml-1:
+    - name: H
+    - name: O
+    - name: O2
+    - name: OH
+    - name: CH3
+    - name: CH4
diff --git a/python/test/rmmd/test_update/ct_yaml/after_no_json.rmmd b/python/test/rmmd/test_update/ct_yaml/after_no_json.rmmd
new file mode 100644
index 0000000..4976785
--- /dev/null
+++ b/python/test/rmmd/test_update/ct_yaml/after_no_json.rmmd
@@ -0,0 +1,24 @@
+# yaml-language-server: $schema=../../../../../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: My Datset
+authors:
+  - family-names: John
+    given-names: Doe
+version: 1.0
+date-released: 2024-01-01
+# this comment should remain here
+parts:    # this is not in the CFF file
+  - name: ckfile
+    type: cantera-yaml
+    relative-path: gri3.0.yaml
+species:
+  ckfile:
+    - name: H # add InChI, keep name
+      iupac: hydrogen
+    - name: O2 # add entity
+      inchi: InChI=1S/O2/c1-2
+    - name: OH # update InChI
+      inchi: InChI=1S/H2/typo-whoops/c1-2/h1H2
+    - name: O
+    - name: CH3
+    - name: CH4
diff --git a/python/test/rmmd/test_update/ct_yaml/before.rmmd b/python/test/rmmd/test_update/ct_yaml/before.rmmd
new file mode 100644
index 0000000..b8e1daf
--- /dev/null
+++ b/python/test/rmmd/test_update/ct_yaml/before.rmmd
@@ -0,0 +1,22 @@
+# yaml-language-server: $schema=../../../../../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: "My Datset"
+authors:
+  - family-names: John
+    given-names: Doe
+version: 1.0
+date-released: 2024-01-01
+# this comment should remain here
+parts:    # this is not in the CFF file
+  - name: ckfile
+    type: chemkin
+    relative-path: gri3.0.yaml
+species:
+  ckfile:
+    - name: C4H8  # remove species
+    - name: H # add InChI, keep name
+      iupac: hydrogen
+    - name: O2 # add entity
+      inchi: InChI=1S/O2/c1-2
+    - name: OH # update InChI
+      inchi: InChI=1S/H2/typo-whoops/c1-2/h1H2
\ No newline at end of file
diff --git a/python/test/rmmd/test_update/ct_yaml/before_no_species.rmmd b/python/test/rmmd/test_update/ct_yaml/before_no_species.rmmd
new file mode 100644
index 0000000..b64d465
--- /dev/null
+++ b/python/test/rmmd/test_update/ct_yaml/before_no_species.rmmd
@@ -0,0 +1,8 @@
+# yaml-language-server: $schema=../../../../../reaction-model-metadata/0.0.1/schema.json
+rmmd-version: 0.0.1
+title: "My Datset"
+authors:
+  - family-names: John
+    given-names: Doe
+version: 1.0
+date-released: 2024-01-01
\ No newline at end of file
diff --git a/python/test/rmmd/test_update/ct_yaml/gri3.0.yaml b/python/test/rmmd/test_update/ct_yaml/gri3.0.yaml
new file mode 100644
index 0000000..f7122b5
--- /dev/null
+++ b/python/test/rmmd/test_update/ct_yaml/gri3.0.yaml
@@ -0,0 +1,187 @@
+# contents of this file were taken from the Cantera repository
+# source: https://github.com/Cantera/cantera/blob/main/data/gri30.yaml
+#
+#
+# Copyright (c) 2001-2009, California Institute of Technology
+# All rights reserved.
+
+# Copyright (c) 2009 Sandia Corporation. Under the terms of
+# Contract AC04-94AL85000 with Sandia Corporation, the U.S. Government
+# retains certain rights in this software.
+
+# Copyright (c) 2011-2024, Cantera Developers.
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+
+# - Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+
+# - Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+
+# - Neither the name of the California Institute of Technology, Sandia
+#   Corporation nor the names of other  contributors may be used to
+#   endorse or promote products derived from this software without
+#   specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+description: |-
+  GRI-Mech Version 3.0 7/30/99  CHEMKIN-II format
+  See README30 file at anonymous FTP site unix.sri.com, directory gri;
+  WorldWideWeb home page http://www.me.berkeley.edu/gri_mech/ or
+  through http://www.gri.org , under 'Basic  Research',
+  for additional information, contacts, and disclaimer
+
+  Updated webpage at http://combustion.berkeley.edu/gri-mech/version30/text30.html
+
+generator: ck2yaml
+input-files: [gri30.inp, gri30_thermo.dat, gri30_tran.dat]
+cantera-version: 2.5.0
+date: Wed, 11 Dec 2019 16:59:02 -0500
+
+units: {length: cm, time: s, quantity: mol, activation-energy: cal/mol}
+
+phases:
+- name: gri30
+  thermo: ideal-gas
+  elements: [O, H, C, N, Ar]
+  species: [H, O, O2, OH, CH3, CH4]
+  kinetics: gas
+  transport: mixture-averaged
+  state: {T: 300.0, P: 1 atm}
+
+species:
+- name: H
+  composition: {H: 1}
+  thermo:
+    model: NASA7
+    temperature-ranges: [200.0, 1000.0, 3500.0]
+    data:
+    - [2.5, 7.05332819e-13, -1.99591964e-15, 2.30081632e-18, -9.27732332e-22,
+      2.54736599e+04, -0.446682853]
+    - [2.50000001, -2.30842973e-11, 1.61561948e-14, -4.73515235e-18, 4.98197357e-22,
+      2.54736599e+04, -0.446682914]
+    note: L7/88
+  transport:
+    model: gas
+    geometry: atom
+    well-depth: 145.0
+    diameter: 2.05
+- name: O
+  composition: {O: 1}
+  thermo:
+    model: NASA7
+    temperature-ranges: [200.0, 1000.0, 3500.0]
+    data:
+    - [3.1682671, -3.27931884e-03, 6.64306396e-06, -6.12806624e-09, 2.11265971e-12,
+      2.91222592e+04, 2.05193346]
+    - [2.56942078, -8.59741137e-05, 4.19484589e-08, -1.00177799e-11, 1.22833691e-15,
+      2.92175791e+04, 4.78433864]
+    note: |-
+      L1/90
+       GRI-Mech Version 3.0 Thermodynamics released 7/30/99
+       NASA Polynomial format for CHEMKIN-II
+       see README file for disclaimer
+  transport:
+    model: gas
+    geometry: atom
+    well-depth: 80.0
+    diameter: 2.75
+- name: O2
+  composition: {O: 2}
+  thermo:
+    model: NASA7
+    temperature-ranges: [200.0, 1000.0, 3500.0]
+    data:
+    - [3.78245636, -2.99673416e-03, 9.84730201e-06, -9.68129509e-09, 3.24372837e-12,
+      -1063.94356, 3.65767573]
+    - [3.28253784, 1.48308754e-03, -7.57966669e-07, 2.09470555e-10, -2.16717794e-14,
+      -1088.45772, 5.45323129]
+    note: TPIS89
+  transport:
+    model: gas
+    geometry: linear
+    well-depth: 107.4
+    diameter: 3.458
+    polarizability: 1.6
+    rotational-relaxation: 3.8
+- name: OH
+  composition: {O: 1, H: 1}
+  thermo:
+    model: NASA7
+    temperature-ranges: [200.0, 1000.0, 3500.0]
+    data:
+    - [3.99201543, -2.40131752e-03, 4.61793841e-06, -3.88113333e-09, 1.3641147e-12,
+      3615.08056, -0.103925458]
+    - [3.09288767, 5.48429716e-04, 1.26505228e-07, -8.79461556e-11, 1.17412376e-14,
+      3858.657, 4.4766961]
+    note: RUS78
+  transport:
+    model: gas
+    geometry: linear
+    well-depth: 80.0
+    diameter: 2.75
+- name: CH3
+  composition: {C: 1, H: 3}
+  thermo:
+    model: NASA7
+    temperature-ranges: [200.0, 1000.0, 3500.0]
+    data:
+    - [3.6735904, 2.01095175e-03, 5.73021856e-06, -6.87117425e-09, 2.54385734e-12,
+      1.64449988e+04, 1.60456433]
+    - [2.28571772, 7.23990037e-03, -2.98714348e-06, 5.95684644e-10, -4.67154394e-14,
+      1.67755843e+04, 8.48007179]
+    note: L11/89
+  transport:
+    model: gas
+    geometry: linear
+    well-depth: 144.0
+    diameter: 3.8
+- name: CH4
+  composition: {C: 1, H: 4}
+  thermo:
+    model: NASA7
+    temperature-ranges: [200.0, 1000.0, 3500.0]
+    data:
+    - [5.14987613, -0.0136709788, 4.91800599e-05, -4.84743026e-08, 1.66693956e-11,
+      -1.02466476e+04, -4.64130376]
+    - [0.074851495, 0.0133909467, -5.73285809e-06, 1.22292535e-09, -1.0181523e-13,
+      -9468.34459, 18.437318]
+    note: L8/88
+  transport:
+    model: gas
+    geometry: nonlinear
+    well-depth: 141.4
+    diameter: 3.746
+    polarizability: 2.6
+    rotational-relaxation: 13.0
+
+reactions:
+- equation: 2 O + M <=> O2 + M  # Reaction 1
+  type: three-body
+  rate-constant: {A: 1.2e+17, b: -1.0, Ea: 0.0}
+  efficiencies: {H2: 2.4, H2O: 15.4, CH4: 2.0, CO: 1.75, CO2: 3.6, C2H6: 3.0,
+    AR: 0.83}
+- equation: O + CH4 <=> OH + CH3  # Reaction 11
+  rate-constant: {A: 1.02e+09, b: 1.5, Ea: 8600.0}
+- equation: H + CH3 (+M) <=> CH4 (+M)  # Reaction 52
+  type: falloff
+  low-P-rate-constant: {A: 2.62e+33, b: -4.76, Ea: 2440.0}
+  high-P-rate-constant: {A: 1.39e+16, b: -0.534, Ea: 536.0}
+  Troe: {A: 0.783, T3: 74.0, T1: 2941.0, T2: 6964.0}
+  efficiencies: {H2: 2.0, H2O: 6.0, CH4: 3.0, CO: 1.5, CO2: 2.0, C2H6: 3.0,
+    AR: 0.7}
\ No newline at end of file
diff --git a/python/test/rmmd/test_update/ct_yaml/species.json b/python/test/rmmd/test_update/ct_yaml/species.json
new file mode 100644
index 0000000..c1ebd5d
--- /dev/null
+++ b/python/test/rmmd/test_update/ct_yaml/species.json
@@ -0,0 +1,30 @@
+{
+    "H": {
+      "inchi": "InChI=1S/H"
+    },
+    "O2": {
+      "entities": [
+        {
+          "inchi": "InChI=1S/O2/c1-2",
+          "multiplicity": 3
+        },
+        {
+          "inchi": "InChI=1S/O2/c1-2",
+          "multiplicity": 1
+        }
+      ]
+    },
+    "OH": {
+      "inchi": "InChI=1S/OH/h1H"
+    },
+    "CH3": {
+      "inchi": "InChI=1S/CH3/c1-2-3/h1H3"
+    },
+    "CH4": {
+      "inchi": "InChI=1S/CH4/h1H4",
+      "iupac": "methane"
+    },
+    "CO": {
+      "inchi": "InChI=1S/CO/c1-2"
+    }
+  }
\ No newline at end of file
diff --git a/python/test/rmmd/test_update/ct_yaml/species.yaml b/python/test/rmmd/test_update/ct_yaml/species.yaml
new file mode 100644
index 0000000..19faa2f
--- /dev/null
+++ b/python/test/rmmd/test_update/ct_yaml/species.yaml
@@ -0,0 +1,18 @@
+H:
+    inchi: InChI=1S/H # no name here
+# no info on O -> just name will be added
+O2:
+  entities:
+    - inchi: InChI=1S/O2/c1-2
+      multiplicity: 3
+    - inchi: InChI=1S/O2/c1-2
+      multiplicity: 1
+OH:
+  inchi: InChI=1S/OH/h1H
+CH3:
+  inchi: InChI=1S/CH3/c1-2-3/h1H3
+CH4:
+  inchi: InChI=1S/CH4/h1H4
+  iupac: methane
+CO: # will be ignored
+  inchi: InChI=1S/CO/c1-2
diff --git a/reaction-model-metadata/0.0.1/schema.json b/reaction-model-metadata/0.0.1/schema.json
new file mode 100644
index 0000000..94bf475
--- /dev/null
+++ b/reaction-model-metadata/0.0.1/schema.json
@@ -0,0 +1,707 @@
+{
+    "$schema": "https://json-schema.org/draft/2019-09/schema",
+    "type": "object",
+    "$defs": {
+        "name-string": {
+            "type": "string",
+            "description": "Nonempty string with few allowed characters",
+            "pattern": "^[A-Za-z0-9_\\-\\.\\+]+$"
+        },
+        "file": {
+            "type": "object",
+            "properties": {
+                "media-type": {
+                    "type": "string",
+                    "description": "In case no more specific format can be supplied: The IANA media type of the distribution (https://www.iana.org/assignments/media-types/media-types.xhtml)"
+                },
+                "format": {
+                    "type": "string",
+                    "description": "The format of the distribution. Allowed values:\n- chemkin: chemistry input file as defined by [1]\n- chemkin-thermodata: file containing separate thermochemistry parameters in format defined by [1]\n- chemkin-transport: defined by [2]\n- cantera-yaml: Cantera input file in YAML format [3]\n- cantera-cti: *.cti Cantera legacy format\n- cantera-xml: *.ctml Cantera legacy format\n[1] Kee et al. Sandia National Laboratories Report SAND89-8009 (1989)\n[2] Kee et al. Sandia National Laboratories Report SAND86-8246B (1998)\n[3] https://cantera.org/documentation/docs-3.0/sphinx/html/yaml/index.html"
+                },
+                "contained-in": {
+                    "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/identifier"
+                },
+                "relative-path": {
+                    "type": "string",
+                    "pattern": "^[^\\\\:?*\"<>|]+$",
+                    "description": "Relative path to the distribution in the repository. Usually a file path relative to the root of the repository. Unix-style paths (e.g. \"path/to/my/file.txt\") should be used, even on Windows systems. The path should not contain any of the following characters: \\, :, ?, *, \", <, >, |. Whitespace characters are discouraged."
+                },
+                "sha-1": {
+                    "type": "string",
+                    "description": "SHA1 checksum of the file",
+                    "pattern": "^[0-9a-fA-F]{40}$"
+                },
+                "md5": {
+                    "type": "string",
+                    "description": "MD5 checksum of the file",
+                    "pattern": "^[0-9a-fA-F]{32}$"
+                }
+            },
+            "required": ["relative-path"]
+        },
+        "element": {
+            "type": "string",
+            "description": "Element symbol",
+            "enum": ["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na",
+                     "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", 
+                     "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga",
+                     "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb",
+                     "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb",
+                     "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm",
+                     "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
+                     "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl",
+                     "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa",
+                     "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md",
+                     "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg",
+                     "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"]
+        },
+        "cas-number": {
+            "type": "string",
+            "description": "CAS Registry Number",
+            "pattern": "\\d{2,}\\-\\d{2}\\-\\d",
+            "$comment": "JSON schema does not allow validating with the checksum"
+        },
+        "inchi": {
+            "type": "string",
+            "description": "standard InChI",
+            "pattern": "^InChI=1S/.*",
+            "$comment": "Since InChI strings can be very complex and it is really hard to impossible to check for chemical feasibility with a regexp, we do not do a detailed validation here."
+        },
+        "smiles": {
+            "type": "string",
+            "description": "SMILES string",
+            "pattern": "^[A-Za-z0-9@\\+\\-\\(\\)\\[\\]\\{\\}\\.#=\\\\%\\$:]+"
+        },
+        "molecular-entity-properties": {
+            "type": "object",
+            "$comment": "This is an extra definition used by molecular-entity and species.",
+            "properties": {
+                "inchi": { "$ref": "#/$defs/inchi" },
+                "smiles": { "$ref": "#/$defs/smiles" },
+                "cas": { "$ref": "#/$defs/cas-number" },
+                "charge": {
+                    "type": "integer"
+                },
+                "multiplicity": {
+                    "description": "Spin multiplicity.",
+                    "type": "integer"
+                }
+            }
+        },
+        "molecular-entity": {
+            "description": "Any constitutionally or isotopically distinct atom, molecule, ion, ion pair, radical, radical ion, complex, conformer etc., identifiable as a separately distinguishable entity. - 'molecular entity' in IUPAC Compendium of Chemical Terminology, 3rd ed. International Union of Pure and Applied Chemistry; 2006. Online version 3.0.1, 2019. https://doi.org/10.1351/goldbook.M03986",
+            "$ref": "#/$defs/molecular-entity-properties",
+            "unevaluatedProperties": false
+        },
+        "molecular-composition": {
+            "type": "object",
+            "patternProperties": {
+                "^[A-Z][a-z]*$": {
+                "type": "integer",
+                "minimum": 1
+                }
+            },
+            "propertyNames": {
+                "$ref": "#/$defs/element"
+            },
+            "minProperties": 1
+        },
+        "alt-locator": {
+            "type": "object",
+            "description": "Sometimes, the species/reaction name is not enough to uniquely identify a species/reaction/parameterization in the dataset. Then, different types of alternative locators can be used.",
+            "properties": {
+                "type": {
+                    "type": "string",
+                    "enum": ["column", "line", "other"]
+                }
+            },
+            "oneOf": [
+                {
+                    "properties": {
+                        "column": {
+                            "type": "string",
+                            "$comment": "we would probably also need to supply a second column that contains the value that we want to locate not just the column with the ids"},
+                        "value": {
+                            "type": ["string", "integer"]
+                        },
+                        "type": {
+                            "default": "column"
+                        }
+                    },
+                    "required": ["column", "value"]
+                }, {
+                    "properties": {
+                        "lineno": {"type": "integer"},
+                        "type": {
+                            "default": "line"
+                        }
+                    },
+                    "required": ["lineno"]
+                }, {
+                    "properties": {
+                        "type": {"const": "other"},
+                        "identifier": {"type": "string"},
+                        "value": {"type": "string"}
+                    }
+                }
+            ],
+            "unevaluatedProperties": false
+        },
+        "species-name": {
+            "type": "string",
+            "description": "Name of the species in the dataset/model and in the metadata file.",
+            "pattern": "\\S+",
+            "examples": [
+                "H2O", "CH3OH"
+            ]
+        },
+        "species": {
+            "type": "object",
+            "description": "A chemical species, i.e. an ensemble of molecular entities, with fixed elemental composition. See also: 'chemical species' in IUPAC Compendium of Chemical Terminology, 3rd ed. International Union of Pure and Applied Chemistry; 2006. Online version 3.0.1, 2019. https://doi.org/10.1351/goldbook.CT01038",
+            "$comment": "The restriction of a fixed composition is imposed for reaction models, so you can guarantee that the element balance holds for reactions. While a species is an ensemble of entities, often, it will only consist of a single entity. Hence, we allow users to specify the properties of the molecular entity directly, if only a single entity is in the ensemble.\nWhile we conceptually interpret a species as fixed composition, we do not enforce that users provide a composition as this information is usually contained in the reaction model.",
+            "properties": {
+                "composition": {
+                    "$ref": "#/$defs/molecular-composition"
+                },
+                "name": {
+                    "$ref": "#/$defs/species-name"
+                },
+                "identifier": {
+                    "type": "string",
+                    "description": "Name of the column or field in the dataset/model that contains the name referenced here (can be an id). Not relevant for reaction models distributed as Chemkin files.",
+                    "default": "species name"
+                },
+                "local-name": {
+                    "$ref": "#/$defs/species-name",
+                    "description": "Name of the species in the metadata file which might differ from the name used in the dataset/reaction model. If not provided, name is used as local-name."
+                }
+            },
+            "required": ["name"],
+            "if": {
+                "required": ["entities"]
+            }, "then": {
+                "properties": {
+                        "entities": {
+                            "type": "array",
+                            "items": {
+                                "$ref": "#/$defs/molecular-entity"
+                            },
+                            "minItems": 1,
+                            "uniqueItems": true
+                        }
+                }
+            }, "else": {
+                "$ref": "#/$defs/molecular-entity-properties"
+            },
+            "unevaluatedProperties": false
+        },
+        "reaction": {
+            "type": "object",
+            "properties": {
+                "name": {
+                    "description": "Name or id of the reaction in the dataset/model and in the metadata file.",
+                    "type": "string",
+                    "examples": [
+                        "H2 + O2 => H2O",
+                        "10"
+                    ]
+                },
+                "identifier": {
+                    "type": "string",
+                    "description": "Name of the column or field in the dataset/model that contains the name referenced here (can be an id).",
+                    "default": "equation"
+                },
+                "local-name": {
+                    "type": "string",
+                    "description": "Name of the reaction in the metadata file which might differ from the name used in the dataset/reaction model. If not provided, name is used as local-name.",
+                    "examples": [
+                        "reaction4", "H abstraction 1"
+                    ]
+                },
+                "reactants": {
+                    "description": "Species names or ids of the reactants.",
+                    "items": {
+                        "oneOf": [
+                            {"$ref": "#/$defs/species-name"},
+                            {
+                                "type": "object",
+                                "allOf": [
+                                    {
+                                        "$comment": "restrict name to definition above",
+                                        "propertyNames": {
+                                            "$ref": "#/$defs/species-name"
+                                        }
+                                    },
+                                    {
+                                        "$comment": "also restrict values to integers",
+                                        "patternProperties": {
+                                            ".*": {
+                                                "type": "integer",
+                                                "minimum": 1
+                                            }
+                                        }
+                                    }
+                                ],
+                                "minProperties": 1
+                            }
+                        ]
+                    },
+                    "minItems": 1,
+                    "uniqueItems": true
+                }
+            }
+        },
+        "activity": {
+            "type": "object",
+            "properties": {
+                "references": {
+                    "type": "array",
+                    "items": { "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/reference"},
+                    "minItems": 1,
+                    "uniqueItems": true,
+                    "description": "References describing this activity, e.g. a paper."
+                },
+                "description": {
+                    "type": "string",
+                    "description": "Description of the activity.",
+                    "examples": [
+                        "Computational study of dichlorobenzene.",
+                        "ignition delay time measurement."
+                    ]
+                },
+                "part-of": {
+                    "$ref": "#/$defs/name-string",
+                    "description": "Major activity that this activity is a part of."
+                },
+                "subactivities": {
+                    "type": "array",
+                    "description": "Smaller activities that are part of this activity.",
+                    "items": {
+                        "$ref": "#/$defs/name-string"
+                    },
+                    "minItems": 1,
+                    "uniqueItems": true
+                },
+                "applied-methods": {
+                    "type": "array",
+                    "description": "list of methods used.",
+                    "items": {
+                        "$ref": "#/$defs/method"
+                    },
+                    "minItems": 1
+                },
+                "employed-tools": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/$defs/tool"
+                    },
+                    "minItems": 1
+                },
+                "input": {
+                    "type": "array",
+                    "items": {
+                        "anyOf": [
+                            {
+                                "properties": {
+                                    "output-of": {
+                                        "$ref": "#/$defs/name-string"
+                                    }
+                                },
+                                "required": ["output-of"],
+                                "description": "output of another activity",
+                                "unevaluatedProperties": false
+                            },
+                            {
+                                "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/identifier"
+                            }
+                        ]
+                    },
+                    "minItems": 1
+                },
+                "output": {
+                    "type": "array",
+                    "$comment": "maybe using a CFF identifier is not flexible enough here and we should instead use a CFF reference, but that comes with other disadvantages such as having to supply authors and not having a description field",
+                    "items": {
+                        "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/identifier"
+                    },
+                    "minItems": 1
+                }
+            },
+            "additionalProperties": false
+        },
+        "method": {
+            "type": "object",
+            "properties": {
+                "type": { "type": "string" },
+                "description": {"type": "string"}
+            },
+            "$comment": "This part is not nearly finished. We still have to decide in which details we want to include methods and how restricted the structure/vocabulary should be.",
+            "required": ["type"],
+            "anyOf": [
+                {
+                    "properties": {
+                            "type": {"const": "computational chemistry"},
+                            "method": {
+                                "type": "string",
+                                "examples": ["DFT", "CCSD(T)", "Hartree-Fock", "SQM"]
+                            },
+                            "basis-set": {
+                                "type": "string",
+                                "examples": ["def2-SVP", "6-31++G*"]
+                            }
+                        },
+                    "required": ["method"]
+                },
+                {
+                    "properties": {
+                            "type": {"const": "experimental"}
+                        }
+                },
+                {
+                    "type": "object",
+                    "properties": {
+                        "type": {"const": "custom"}
+                    }            
+                }
+            ]
+        },
+        "tool": {
+            "$comment": "Currently only software type. Experimental tools such as sensors, devices, custom hardware assemblies, etc. may need to be added",
+            "anyOf": [
+                {
+                    "properties": {
+                        "software": {"type": "string"},
+                        "version": {
+                            "type": "string",
+                            "$comment": "Must be a string so that 2.10 is not accidentally converted to 2.1"}
+                    },
+                    "required": ["software"]
+                }
+            ]
+        },
+        "parameterization": {
+            "type": "object",
+            "$comment": "Base definition for sets of parameters for a model.",
+            "properties": {
+                "generated-by": {"$ref": "#/$defs/name-string"},
+                "taken-from": { 
+                    "description": "taken-from is essentially the same as an activity describing that something was copied with the references as input",
+                    "type": "array",
+                    "items": { "oneOf": [
+                        {"$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/identifier"},
+                        {"$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/reference"}
+                        ]},
+                    "minItems": 1,
+                    "uniqueItems": true
+                }
+            }
+        },
+        "species-parameterization": {
+            "allOf": [{"$ref": "#/$defs/parameterization"}],
+            "description": "Set of parameters for an equation for a specific species, e.g., thermochemistry models",
+            "oneOf":[ 
+                {
+                    "properties": {
+                        "species": { 
+                            "$comment": "For simplicity (for the users, not for implementers) we allow single items and lists",
+                            "oneOf": [
+                                {"$ref": "#/$defs/species-name"},
+                                {
+                                    "type": "array",
+                                    "items":  {"$ref": "#/$defs/species-name"},
+                                    "minItems": 1
+                                }
+                            ] }
+                    },
+                    "required": ["species"]
+                }, 
+                {
+                    "properties": {
+                        "param-id": {
+                            "type": "string",
+                            "description": "In case the combination of the species, part of the model (e.g., file), temperature range, parameter name, etc. is not enough to uniquely identify the parameterization, an explicit identifier for it can be supplied here."
+                        }
+                    },
+                    "required": ["param-id"]
+                }
+            ],
+            "properties": {
+                "params": {
+                    "type": "object",
+                    "examples": [
+                        {
+                            "NASA7": "a0-a6"
+                        }
+                    ],
+                    "properties": {
+                        "NASA7": {
+                            "type": "string",
+                            "description": "NASA7 polynomial with parameters a0-a6",
+                            "anyOf": [
+                                {"pattern": "^(a[0-6](-a[0-6])?)(, *a[0-6](-a[0-6])?)*$"},
+                                {"const": "*"}
+                            ]
+                        },
+                        "NASA9": {
+                            "type": "string",
+                            "description": "NASA9 polynomial with parameters a0-a8",
+                            "anyOf": [
+                                {"pattern": "^(a[0-8](-a[0-8])?)(, *a[0-8](-a[0-8])?)*$"},
+                                {"const": "*"}
+                            ]
+                        },
+                        "Shomate": {
+                            "type": "string",
+                            "description": "Shomate polynomial with parameters A-G",
+                            "anyOf": [
+                                {"pattern": "^([A-G](-[A-G])?)(, *[A-G](-[A-G])?)*$"},
+                                {"const": "*"}
+                            ]
+                        },
+                        "constant c_p": {
+                            "type": "string",
+                            "examples": [
+                                "T0, cp0, h0, s0"
+                            ],
+                            "anyOf": [
+                                {"pattern": "^(T0|cp0|h0|s0)(, *(T0|cp0|h0|s0)){0-3}"},
+                                {"const": "*"}
+                            ]
+                        },
+                        "*": {
+                            "type": "string",
+                            "const": "*"
+                        }
+                    },
+                    "additionalProperties": false,
+                    "description": "Model name and parameters the provenance of which is supplied here."
+                },
+                "T-range": {
+                    "type": "array",
+                    "items": {
+                        "type": "number",
+                        "minimum": 0
+                    },
+                    "minItems": 2,
+                    "maxItems": 2,
+                    "description": "Temperature range in K for which the parameterization is valid."
+                },
+                "p-range": {
+                    "type": "array",
+                    "items": {
+                        "type": "number",
+                        "minimum": 0
+                    },
+                    "minItems": 2,
+                    "maxItems": 2,
+                    "description": "Pressure range in Pa for which the parameterization is valid."
+                }
+            }
+        },
+        "reaction-parameterization": {
+            "allOf": [{"$ref": "#/$defs/parameterization"}],
+            "description": "Set of parameters for an equation for a specific reaction.",
+            "$comment": "This still needs to be worked on",
+            "properties": {
+                "reaction": { 
+                    "oneOf": [
+                        {"$ref": "#/$defs/species-name"},
+                        {
+                            "type": "array",
+                            "items": {"$ref": "#/$defs/species-name"},
+                            "minItems": 1
+                        }
+                    ]
+                    
+                },
+                "params": {
+                    "type": "object",
+                    "examples": [
+                        {
+                            "NASA7": "a0-a6"
+                        }
+                    ],
+                    "properties": {
+                        "Arrhenius": {
+                            "type": "string",
+                            "description": "Extended Arrhenius equation with parameters A, b and E_a",
+                            "anyOf": [
+                                {"pattern": "^(A|b|E_a)(, *A|b|E_a?)*$"},
+                                {"const": "*"}
+                            ]
+                        },
+                        "*": {
+                            "type": "string",
+                            "const": "*"
+                        }
+                    },
+                    "additionalProperties": false,
+                    "description": "Model name and parameters the provenance of which is supplied here."
+                },
+                "T-range": {
+                    "type": "array",
+                    "items": {
+                        "type": "number",
+                        "minimum": 0
+                    },
+                    "minItems": 2,
+                    "maxItems": 2,
+                    "description": "Temperature range in K for which the parameterization is valid."
+                },
+                "p-range": {
+                    "type": "array",
+                    "items": {
+                        "type": "number",
+                        "minimum": 0
+                    },
+                    "minItems": 2,
+                    "maxItems": 2,
+                    "description": "Pressure range in Pa for which the parameterization is valid."
+                }
+            },
+            "required": ["reaction"]
+        }
+    },
+    "properties": {
+        "title": {
+            "type": "string",
+            "description": "Title of the dataset or reaction model.",
+            "minimum": 1
+        },
+        "rmmd-version": {
+            "description": "Version of the RMMD schema used in this file",
+            "examples": [
+                "0.0.1"
+            ],
+            "pattern": "^0\\.0\\.1$",
+            "type": "string"
+        },
+        "references": {
+            "type": "array",
+            "items": { "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/reference"},
+            "minItems": 1,
+            "uniqueItems": true
+        },
+        "authors": {
+            "type": "array",
+            "description": "The author(s) of this dataset/reaction model.",
+            "items": {
+                "anyOf": [
+                    {
+                        "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/person"
+                    },
+                    {
+                        "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/entity"
+                    }
+                ]
+            },
+            "minItems": 1,
+            "uniqueItems": true
+        },
+        "version": {
+            "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/version"
+        },
+        "license": {
+            "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/license"
+        },
+        "license-url": {
+            "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/properties/license-url"
+        },
+        "doi": {
+            "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/doi"
+        },
+        "date-released": {
+            "$ref": "https://citation-file-format.github.io/1.2.0/schema.json#/definitions/date"
+        },
+        "parts": {
+            "type": "array",
+            "items": {
+                "properties": {
+                    "name": {
+                        "$ref": "#/$defs/name-string",
+                        "description": "Name of the part of the dataset/model within the metadata file",
+                        "examples": [
+                            "therm-file-1", "model-Li-et-al-2019"
+                        ]
+                    },
+                    "type": {
+                        "$comment": "This is an extension point for different subdatasets. For files, we consider the subdataset to be the same as its distribution, but if we were to add data services, the type for that would look very different and probably not correspond to a distribution. Using an enum here and listing all values below in anyOf seems redundant but helps editors with autocompletion.",
+                        "type": "string",
+                        "enum": ["chemkin", "chemkin-therm", "chemkin-transport","cantera-yaml", "cantera-cti", "cantera-xml", "csv"]
+                    }
+                },  
+                "required": ["name", "type"],
+                "$comment": "Currently, we support only file types, but different types could be supported in the future",
+                "allOf": [{"$ref": "#/$defs/file"}],
+                "unevaluatedProperties": false
+            }
+        },
+        "species": {
+            "type": "object",
+            "propertyNames": {
+                "$ref": "#/$defs/name-string"
+            },
+            "description": "Species in the dataset/model. The key is the name of the part of the dataset/model that contains the species.",
+            "patternProperties": {
+                ".*": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/$defs/species"
+                    }
+                }
+            },
+            "unevaluatedProperties": false
+        },
+        "reactions": {
+            "type": "object",
+            "propertyNames": {
+                "$ref": "#/$defs/name-string"
+            },
+            "patternProperties": {
+                ".*": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/$defs/reaction"
+                    }
+                }
+            },
+            "unevaluatedProperties": false
+        },
+        "parameters": {
+            "type": "object",
+            "propertyNames": {
+                "$ref": "#/$defs/name-string"
+            },
+            "patternProperties": {
+                ".*": {
+                    "type": "array",
+                    "items": {
+                        "anyOf": [
+                            { "$ref": "#/$defs/species-parameterization",
+                            "unevaluatedProperties": false },
+                            { "$ref": "#/$defs/reaction-parameterization",
+                            "unevaluatedProperties": false }
+                        ]
+                    }
+                }
+            }
+        },
+        "activities": {
+            "type": "object",
+            "propertyNames": {
+                "$ref": "#/$defs/name-string"
+            },
+            "patternProperties": {
+                ".*": {
+                    "type": "object",
+                    "anyOf": [
+                        { "$ref": "#/$defs/activity" }
+                    ],
+                    "unevaluatedProperties": false
+                }
+            }
+        }
+    },
+    "required": ["rmmd-version", "title", "authors"],
+    "additionalProperties": false
+}
\ No newline at end of file
-- 
GitLab