QuantumPioneer · craabreu · Mar 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,8 @@ slurm.*
 .old
 *.parquet
 
+data/kinetics/*.csv
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/data/kinetics/README.md b/data/kinetics/README.md
@@ -0,0 +1,12 @@
+# QuantumPioneer Kinetics Dataset
+
+| Column        | Type   | Units       | Description                                      |
+| ------------- | ------ | ----------- | ------------------------------------------------ |
+| **`rxn_smi`** | string | —           | Unmapped reaction SMILES (`r1.r2>>p1.p2`)        |
+| **`k_298`**   | number | m³/(mol·s)  | Bimolecular rate coefficient at 298 K            |
+| **`A_low`**   | number | m³/(mol·s)  | Arrhenius pre-exponential factor, 300–1000 K     |
+| **`Ea_low`**  | number | J/mol       | Activation energy, 300–1000 K                    |
+| **`A_high`**  | number | m³/(mol·s)  | Arrhenius pre-exponential factor, 1000–2000 K    |
+| **`Ea_high`** | number | J/mol       | Activation energy, 1000–2000 K                   |
+| **`barrier`** | number | kcal/mol    | Forward barrier (ZPE-scaled DLPNO/DFT)           |
+| **`Hrxn`**    | number | kcal/mol    | Forward reaction enthalpy (ZPE-scaled DLPNO/DFT) |
diff --git a/scripts/kinetics/collect_kinetic_data.ipynb b/scripts/kinetics/collect_kinetic_data.ipynb
@@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1be4c50a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "\n",
+    "from functools import lru_cache\n",
+    "\n",
+    "import pandas as pd\n",
+    "import swifter\n",
+    "\n",
+    "from rdkit import Chem"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60549dfc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cwd = pathlib.Path.cwd()\n",
+    "QUANTUM_GREEN_DIR = pathlib.Path(\"/home/shared/projects/quantum_green\")\n",
+    "PAPER_DIR = QUANTUM_GREEN_DIR / \"paper\" / \"figure\" / \"section_3_2_3_rate\"\n",
+    "DATABASE_DIR = QUANTUM_GREEN_DIR / \"datasets_for_publication\" / \"data\" / \"kinetics\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fabee62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option(\"display.max_columns\", None)\n",
+    "\n",
+    "\n",
+    "def head(df, n=2):\n",
+    "    display(df.head(n))\n",
+    "    print(f\"Contains {len(df)} rows\")\n",
+    "\n",
+    "\n",
+    "@lru_cache(maxsize=None)\n",
+    "def canonical_smiles(smiles):\n",
+    "    mol = Chem.MolFromSmiles(smiles)\n",
+    "    for atom in mol.GetAtoms():\n",
+    "        atom.SetAtomMapNum(0)\n",
+    "    return Chem.MolToSmiles(mol, isomericSmiles=True)\n",
+    "\n",
+    "\n",
+    "def clean_rxn_smi(rxn_smi):\n",
+    "    return \">>\".join(\n",
+    "        [\n",
+    "            \".\".join([canonical_smiles(smi) for smi in category.split(\".\")])\n",
+    "            for category in rxn_smi.split(\">>\")\n",
+    "        ]\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "def get_rxn_smi(row):\n",
+    "    return row[\"r1smi\"] + \".\" + row[\"r2smi\"] + \">>\" + row[\"p1smi\"] + \".\" + row[\"p2smi\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f1e23d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rate_data = pd.read_csv(\n",
+    "    PAPER_DIR / \"quantum_green_ts_data_24september17_dft_opted_dlpno_sp_rates.csv\"\n",
+    ")\n",
+    "rate_data[\"rxn_smi\"] = rate_data.swifter.apply(get_rxn_smi, axis=1)\n",
+    "head(rate_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77782139",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rate_data[\"clean_rxn_smi\"] = rate_data[\"rxn_smi\"].swifter.apply(clean_rxn_smi)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64722744",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zpe_data = pd.read_pickle(PAPER_DIR / \"ts_key_characteristics_july31a.pkl\")\n",
+    "head(zpe_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d15f61d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kinetics_df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"rxn_smi\": rate_data[\"clean_rxn_smi\"],\n",
+    "        \"k_298\": rate_data[\"k_298\"],\n",
+    "        \"A_low\": rate_data[\"low_A\"],\n",
+    "        \"Ea_low\": rate_data[\"low_Ea\"],\n",
+    "        \"A_high\": rate_data[\"high_A\"],\n",
+    "        \"Ea_high\": rate_data[\"high_Ea\"],\n",
+    "        \"barrier\": rate_data[\"rxn_smi\"].map(\n",
+    "            zpe_data.set_index(\"rxn_smi\")[\"fwd_barrier_dlpno_sp_dft_zpe_scaled_kcal\"]\n",
+    "        ),\n",
+    "        \"Hrxn\": rate_data[\"rxn_smi\"].map(\n",
+    "            zpe_data.set_index(\"rxn_smi\")[\"fwd_Hrxn_dlpno_sp_dft_zpe_scaled_kcal\"]\n",
+    "        ),\n",
+    "    }\n",
+    ")\n",
+    "head(kinetics_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5d17245",
+   "metadata": {},
+   "source": [
+    "Inspect reactions whose cleaned `rxn_smi` occurs more than once; every such row is dropped (`keep=False`) before export."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cec2bff0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kinetics_df[kinetics_df[\"rxn_smi\"].duplicated(keep=False)].sort_values(by=\"rxn_smi\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7402d0c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kinetics_df_no_duplicates = kinetics_df.drop_duplicates(subset=\"rxn_smi\", keep=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c02c1c6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kinetics_df_no_duplicates.to_csv(\n",
+    "    DATABASE_DIR / \"quantumpioneer_kinetics_dataset.csv\", index=False\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}