diff --git a/.gitignore b/.gitignore index b41f4b0..d2b20a4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ slurm.* .old *.parquet +data/kinetics/*.csv + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/data/kinetics/README.md b/data/kinetics/README.md new file mode 100644 index 0000000..4c1bf32 --- /dev/null +++ b/data/kinetics/README.md @@ -0,0 +1,12 @@ +# QuantumPioneer Kinetics Dataset + +| Column | Type | Units | Description | +| ------------- | ------ | ----------- | ------------------------------------------------ | +| **`rxn_smi`** | string | — | Reaction SMILES (`r1.r2>>p1.p2`) | +| **`k_298`** | number | m³/(mol·s) | Bimolecular rate coefficient at 298 K | +| **`A_low`** | number | m³/(mol·s) | Arrhenius pre-exponential factor, 300–1000 K | +| **`Ea_low`** | number | J/mol | Activation energy, 300–1000 K | +| **`A_high`** | number | m³/(mol·s) | Arrhenius pre-exponential factor, 1000–2000 K | +| **`Ea_high`** | number | J/mol | Activation energy, 1000–2000 K | +| **`barrier`** | number | kcal/mol | Forward barrier (ZPE-scaled DLPNO/DFT) | +| **`Hrxn`** | number | kcal/mol | Forward reaction enthalpy (ZPE-scaled DLPNO/DFT) | diff --git a/scripts/kinetics/collect_kinetic_data.ipynb b/scripts/kinetics/collect_kinetic_data.ipynb new file mode 100644 index 0000000..13e62b2 --- /dev/null +++ b/scripts/kinetics/collect_kinetic_data.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "1be4c50a", + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "\n", + "from functools import lru_cache\n", + "\n", + "import pandas as pd\n", + "import swifter\n", + "\n", + "from rdkit import Chem" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60549dfc", + "metadata": {}, + "outputs": [], + "source": [ + "cwd = pathlib.Path.cwd()\n", + "QUANTUM_GREEN_DIR = pathlib.Path(\"/home/shared/projects/quantum_green\")\n", + "PAPER_DIR = QUANTUM_GREEN_DIR / \"paper\" / \"figure\" / \"section_3_2_3_rate\"\n", + "DATABASE_DIR = QUANTUM_GREEN_DIR / \"datasets_for_publication\" / \"data\" / \"kinetics\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fabee62", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option(\"display.max_columns\", None)\n", + "\n", + "\n", + "def head(df, n=2):\n", + " display(df.head(n))\n", + " print(f\"Contains {len(df)} rows\")\n", + "\n", + "\n", + "@lru_cache(maxsize=None)\n", + "def canonical_smiles(smiles):\n", + " mol = Chem.MolFromSmiles(smiles)\n", + " for atom in mol.GetAtoms():\n", + " atom.SetAtomMapNum(0)\n", + " return Chem.MolToSmiles(mol, isomericSmiles=True)\n", + "\n", + "\n", + "def clean_rxn_smi(rxn_smi):\n", + " return \">>\".join(\n", + " [\n", + " \".\".join([canonical_smiles(smi) for smi in category.split(\".\")])\n", + " for category in rxn_smi.split(\">>\")\n", + " ]\n", + " )\n", + "\n", + "\n", + "def get_rxn_smi(row):\n", + " return row[\"r1smi\"] + \".\" + row[\"r2smi\"] + \">>\" + row[\"p1smi\"] + \".\" + row[\"p2smi\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f1e23d7", + "metadata": {}, + "outputs": [], + "source": [ + "rate_data = pd.read_csv(\n", + " PAPER_DIR / \"quantum_green_ts_data_24september17_dft_opted_dlpno_sp_rates.csv\"\n", + ")\n", + "rate_data[\"rxn_smi\"] = rate_data.swifter.apply(get_rxn_smi, axis=1)\n", + "head(rate_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77782139", + "metadata": {}, + "outputs": [], + "source": [ + "rate_data[\"clean_rxn_smi\"] = rate_data[\"rxn_smi\"].swifter.apply(clean_rxn_smi)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64722744", + "metadata": {}, + "outputs": [], + "source": [ + "zpe_data = pd.read_pickle(PAPER_DIR / \"ts_key_characteristics_july31a.pkl\")\n", + "head(zpe_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d15f61d6", + "metadata": {}, + "outputs": [], + "source": [ + "kinetics_df = pd.DataFrame(\n", + " {\n", + " \"rxn_smi\": rate_data[\"clean_rxn_smi\"],\n", + " \"k_298\": rate_data[\"k_298\"],\n", + " \"A_low\": rate_data[\"low_A\"],\n", + " \"Ea_low\": rate_data[\"low_Ea\"],\n", + " \"A_high\": rate_data[\"high_A\"],\n", + " \"Ea_high\": rate_data[\"high_Ea\"],\n", + " \"barrier\": rate_data[\"rxn_smi\"].map(\n", + " zpe_data.set_index(\"rxn_smi\")[\"fwd_barrier_dlpno_sp_dft_zpe_scaled_kcal\"]\n", + " ),\n", + " \"Hrxn\": rate_data[\"rxn_smi\"].map(\n", + " zpe_data.set_index(\"rxn_smi\")[\"fwd_Hrxn_dlpno_sp_dft_zpe_scaled_kcal\"]\n", + " ),\n", + " }\n", + ")\n", + "head(kinetics_df)" + ] + }, + { + "cell_type": "markdown", + "id": "f5d17245", + "metadata": {}, + "source": [ + "Looking for duplicates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cec2bff0", + "metadata": {}, + "outputs": [], + "source": [ + "kinetics_df[kinetics_df[\"rxn_smi\"].duplicated(keep=False)].sort_values(by=\"rxn_smi\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7402d0c2", + "metadata": {}, + "outputs": [], + "source": [ + "kinetics_df_no_duplicates = kinetics_df.drop_duplicates(subset=\"rxn_smi\", keep=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c02c1c6b", + "metadata": {}, + "outputs": [], + "source": [ + "kinetics_df_no_duplicates.to_csv(\n", + " DATABASE_DIR / \"quantumpioneer_kinetics_dataset.csv\", index=False\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}