Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ slurm.*
.old
*.parquet

data/kinetics/*.csv

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
12 changes: 12 additions & 0 deletions data/kinetics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
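# QuantumPioneer Kinetics Dataset

Columns of `quantumpioneer_kinetics_dataset.csv`, written by `scripts/kinetics/collect_kinetic_data.ipynb`: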

| Column | Type | Units | Description |
| ------------- | ------ | ----------- | ------------------------------------------------ |
| **`rxn_smi`** | string | — | Canonical reaction SMILES with atom-map numbers removed (`r1.r2>>p1.p2`) |
| **`k_298`** | number | m³/(mol·s) | Bimolecular rate coefficient at 298 K |
| **`A_low`** | number | m³/(mol·s) | Arrhenius pre-exponential factor, 300–1000 K |
| **`Ea_low`** | number | J/mol | Activation energy, 300–1000 K |
| **`A_high`** | number | m³/(mol·s) | Arrhenius pre-exponential factor, 1000–2000 K |
| **`Ea_high`** | number | J/mol | Activation energy, 1000–2000 K |
| **`barrier`** | number | kcal/mol | Forward barrier (ZPE-scaled DLPNO/DFT) |
| **`Hrxn`** | number | kcal/mol | Forward reaction enthalpy (ZPE-scaled DLPNO/DFT) |
192 changes: 192 additions & 0 deletions scripts/kinetics/collect_kinetic_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1be4c50a",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pathlib\n",
"\n",
"from functools import lru_cache\n",
"\n",
"import pandas as pd\n",
"import swifter\n",
"\n",
"from rdkit import Chem"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60549dfc",
"metadata": {},
"outputs": [],
"source": [
"cwd = pathlib.Path.cwd()\n",
"# Root of the shared project tree. Set the QUANTUM_GREEN_DIR environment variable to\n",
"# point the notebook at a copy stored elsewhere; the default is the original location.\n",
"QUANTUM_GREEN_DIR = pathlib.Path(\n",
"    os.environ.get(\"QUANTUM_GREEN_DIR\", \"/home/shared/projects/quantum_green\")\n",
")\n",
"# Directory the input files (rate CSV and TS pickle) are read from.\n",
"PAPER_DIR = QUANTUM_GREEN_DIR / \"paper\" / \"figure\" / \"section_3_2_3_rate\"\n",
"# Directory the final kinetics CSV is written to.\n",
"DATABASE_DIR = QUANTUM_GREEN_DIR / \"datasets_for_publication\" / \"data\" / \"kinetics\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fabee62",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option(\"display.max_columns\", None)\n",
"\n",
"\n",
"def head(df, n=2):\n",
"    \"\"\"Display the first ``n`` rows of ``df`` and print its total row count.\"\"\"\n",
"    preview = df.head(n)\n",
"    display(preview)\n",
"    print(f\"Contains {len(df)} rows\")\n",
"\n",
"\n",
"@lru_cache(maxsize=None)\n",
"def canonical_smiles(smiles):\n",
"    \"\"\"Return the RDKit SMILES of ``smiles`` with all atom-map numbers cleared.\n",
"\n",
"    Results are memoized with ``lru_cache``, so each distinct input string is\n",
"    parsed only once.\n",
"\n",
"    Raises:\n",
"        ValueError: if RDKit cannot parse ``smiles``.\n",
"    \"\"\"\n",
"    mol = Chem.MolFromSmiles(smiles)\n",
"    if mol is None:\n",
"        # MolFromSmiles signals a parse failure by returning None; fail with the\n",
"        # offending string instead of an AttributeError on the next line.\n",
"        raise ValueError(f\"RDKit could not parse SMILES: {smiles!r}\")\n",
"    for atom in mol.GetAtoms():\n",
"        atom.SetAtomMapNum(0)\n",
"    return Chem.MolToSmiles(mol, isomericSmiles=True)\n",
"\n",
"\n",
"def clean_rxn_smi(rxn_smi):\n",
"    \"\"\"Canonicalize every species of a reaction SMILES.\n",
"\n",
"    The string is split on ``>>`` into sides and each side on ``.`` into\n",
"    species. Every species is passed through ``canonical_smiles`` and the\n",
"    pieces are re-joined with the same separators in their original order.\n",
"    \"\"\"\n",
"    cleaned_sides = []\n",
"    for side in rxn_smi.split(\">>\"):\n",
"        species = [canonical_smiles(smi) for smi in side.split(\".\")]\n",
"        cleaned_sides.append(\".\".join(species))\n",
"    return \">>\".join(cleaned_sides)\n",
"\n",
"\n",
"def get_rxn_smi(row):\n",
"    \"\"\"Build ``r1.r2>>p1.p2`` from the ``r1smi``, ``r2smi``, ``p1smi``, ``p2smi`` entries of ``row``.\"\"\"\n",
"    reactants = \".\".join((row[\"r1smi\"], row[\"r2smi\"]))\n",
"    products = \".\".join((row[\"p1smi\"], row[\"p2smi\"]))\n",
"    return \">>\".join((reactants, products))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f1e23d7",
"metadata": {},
"outputs": [],
"source": [
"# Load the rate table and assemble one reaction SMILES per row.\n",
"rate_csv_path = (\n",
"    PAPER_DIR / \"quantum_green_ts_data_24september17_dft_opted_dlpno_sp_rates.csv\"\n",
")\n",
"rate_data = pd.read_csv(rate_csv_path)\n",
"rate_data[\"rxn_smi\"] = rate_data.swifter.apply(get_rxn_smi, axis=1)\n",
"head(rate_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77782139",
"metadata": {},
"outputs": [],
"source": [
"# Canonicalize each reaction SMILES; the raw `rxn_smi` column is kept alongside it.\n",
"cleaned_rxn_smiles = rate_data[\"rxn_smi\"].swifter.apply(clean_rxn_smi)\n",
"rate_data[\"clean_rxn_smi\"] = cleaned_rxn_smiles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "64722744",
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): unpickling can execute arbitrary code; only load this from a trusted location.\n",
"zpe_pickle_path = PAPER_DIR / \"ts_key_characteristics_july31a.pkl\"\n",
"zpe_data = pd.read_pickle(zpe_pickle_path)\n",
"head(zpe_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d15f61d6",
"metadata": {},
"outputs": [],
"source": [
"# Index the TS characteristics by reaction SMILES once so both lookups share it.\n",
"zpe_by_rxn = zpe_data.set_index(\"rxn_smi\")\n",
"raw_rxn_smi = rate_data[\"rxn_smi\"]\n",
"\n",
"kinetics_columns = {\n",
"    \"rxn_smi\": rate_data[\"clean_rxn_smi\"],\n",
"    \"k_298\": rate_data[\"k_298\"],\n",
"    \"A_low\": rate_data[\"low_A\"],\n",
"    \"Ea_low\": rate_data[\"low_Ea\"],\n",
"    \"A_high\": rate_data[\"high_A\"],\n",
"    \"Ea_high\": rate_data[\"high_Ea\"],\n",
"    # The lookups use the raw (uncanonicalized) `rxn_smi`; reactions without a\n",
"    # match in `zpe_data` get NaN in these two columns.\n",
"    \"barrier\": raw_rxn_smi.map(zpe_by_rxn[\"fwd_barrier_dlpno_sp_dft_zpe_scaled_kcal\"]),\n",
"    \"Hrxn\": raw_rxn_smi.map(zpe_by_rxn[\"fwd_Hrxn_dlpno_sp_dft_zpe_scaled_kcal\"]),\n",
"}\n",
"kinetics_df = pd.DataFrame(kinetics_columns)\n",
"head(kinetics_df)"
]
},
{
"cell_type": "markdown",
"id": "f5d17245",
"metadata": {},
"source": [
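"Look for duplicate reactions, i.e. rows that share the same canonical `rxn_smi`. Every copy of a duplicated reaction is dropped (`keep=False`) before the dataset is written."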
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cec2bff0",
"metadata": {},
"outputs": [],
"source": [
"# Show every row whose canonical reaction SMILES occurs more than once.\n",
"is_duplicated = kinetics_df[\"rxn_smi\"].duplicated(keep=False)\n",
"kinetics_df[is_duplicated].sort_values(by=\"rxn_smi\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7402d0c2",
"metadata": {},
"outputs": [],
"source": [
"# keep=False flags every copy, so duplicated reactions are removed entirely.\n",
"duplicated_rxn = kinetics_df[\"rxn_smi\"].duplicated(keep=False)\n",
"kinetics_df_no_duplicates = kinetics_df[~duplicated_rxn]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c02c1c6b",
"metadata": {},
"outputs": [],
"source": [
"# Create the output directory first so the export does not fail when it is missing.\n",
"DATABASE_DIR.mkdir(parents=True, exist_ok=True)\n",
"kinetics_df_no_duplicates.to_csv(\n",
"    DATABASE_DIR / \"quantumpioneer_kinetics_dataset.csv\", index=False\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}