Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 335 additions & 0 deletions scripts/semiempirical_split/nonts_analysis.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,335 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Standard library\n",
"import pathlib\n",
"import re\n",
"\n",
"# Third party\n",
"import numpy as np\n",
"import pandas as pd\n",
"from pyarrow import parquet as pq\n",
"\n",
"# Only the rmsd helper is needed from MDAnalysis; it is applied to\n",
"# plain numpy coordinate arrays below.\n",
"from MDAnalysis.analysis.rms import rmsd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Project root on the shared filesystem and the local parquet directory.\n",
"quantum_green = pathlib.Path(\"/home/shared/projects/quantum_green\")\n",
"# NOTE(review): 'paquets' looks like a typo for 'parquets', but later\n",
"# cells reference this name, so it is kept for compatibility.\n",
"paquets = pathlib.Path(\"./parquets\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def xyz_str_to_array(xyz_str):\n",
"    \"\"\"Parse an XYZ-format string into an (n_atoms, 3) coordinate array.\n",
"\n",
"    Lines that do not contain an element symbol followed by three\n",
"    numeric fields (e.g. the atom-count header and the comment line of\n",
"    an XYZ file) are skipped deliberately.\n",
"    \"\"\"\n",
"    rows = []\n",
"    for line in xyz_str.split(\"\\n\"):\n",
"        fields = line.split()\n",
"        try:\n",
"            # fields[0] is the element symbol; 1..3 are x, y, z.\n",
"            rows.append([float(fields[i]) for i in (1, 2, 3)])\n",
"        except (IndexError, ValueError):\n",
"            # Header, comment, or malformed line: skip (best effort).\n",
"            pass\n",
"    return np.array(rows)\n",
"\n",
"\n",
"def last_conformer_to_array(xyz):\n",
"    \"\"\"Return the coordinate columns of the last conformer in ``xyz``.\n",
"\n",
"    NOTE(review): assumes ``xyz`` is either a trajectory (a sequence of\n",
"    per-conformer arrays) or already a single float32 conformer; the\n",
"    first three columns are dropped (presumably per-atom metadata) --\n",
"    confirm against the parquet schema.\n",
"    \"\"\"\n",
"    if xyz[-1].dtype != np.dtype(\"float32\"):\n",
"        # Not yet a per-atom float array: descend to the last conformer.\n",
"        xyz = xyz[-1]\n",
"    return np.stack(xyz)[:, 3:]\n",
"\n",
"\n",
"# Rewrites that translate a semiempirical tar path into the matching\n",
"# DFT log path. Order matters: dict insertion order is preserved.\n",
"SEMIEMPIRICAL_DFT_MAPPING = {\n",
"    \".tar\": \".log\",\n",
"    \"/data1/\": \"/home/gridsan/\",\n",
"    \"semiempirical_opt\": \"DFT_opt_freq\",\n",
"}\n",
"\n",
"\n",
"# Matches '/<id>/<id>.log'. Bug fix: the dot before 'log' is now\n",
"# escaped so it no longer matches an arbitrary character.\n",
"REGEX = re.compile(r\"/(\\d+)/\\1\\.log\")\n",
"\n",
"\n",
"def postprocess(source):\n",
"    \"\"\"Normalise a raw path so semiempirical and DFT sources align.\n",
"\n",
"    Applies the path rewrites above, then collapses the redundant\n",
"    '/<id>/<id>.log' layout to '/<id>.log'.\n",
"    \"\"\"\n",
"    for old, new in SEMIEMPIRICAL_DFT_MAPPING.items():\n",
"        source = source.replace(old, new)\n",
"    result = REGEX.search(source)\n",
"    if result:\n",
"        g1 = result.group(1)\n",
"        return source.replace(f\"/{g1}/{g1}.\", f\"/{g1}.\")\n",
"    return source"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read and process the consolidated species data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): read_pickle can execute arbitrary code on load -- only\n",
"# use with trusted files such as this shared project artifact.\n",
"species_data = pd.read_pickle(\n",
"    quantum_green / \"reactants\" / \"quantum_green_species_data_24march12b.pkl\"\n",
")\n",
"# Normalise the DFT log paths so they can be joined against the parquet\n",
"# 'source' columns, and parse the geometry strings into arrays.\n",
"species_data[\"source\"] = species_data[\"species_dft_log_path\"].apply(postprocess)\n",
"species_data[\"xyz\"] = species_data[\"xyz_str\"].apply(xyz_str_to_array)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: species_id must be a unique key before it is used for joins.\n",
"assert species_data[\"species_id\"].nunique() == len(species_data)\n",
"species_data[\"species_id\"].min(), species_data[\"species_id\"].max()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read and process the parsed DFT data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Load only the columns needed for geometry matching.\n",
"dft = pq.read_table(\n",
"    paquets / \"dft_nonts_dft_v9.parquet\", columns=[\"source\", \"initial_xyz\", \"xyz\"]\n",
").to_pandas()\n",
"dft[\"source\"] = dft[\"source\"].apply(postprocess)\n",
"for col in [\"initial_xyz\", \"xyz\"]:\n",
"    dft[col] = dft[col].apply(last_conformer_to_array)\n",
"# Remember the original row position so matches can be traced back.\n",
"dft[\"dft_index\"] = dft.index"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the DFT rows whose source also appears in the species table.\n",
"filtered_dft = dft[dft[\"source\"].isin(species_data[\"source\"])].copy()\n",
"species_xyz_by_source = species_data.set_index(\"source\")[\"xyz\"]\n",
"filtered_dft[\"species_xyz\"] = filtered_dft[\"source\"].map(species_xyz_by_source)\n",
"# np.vectorize iterates over the object columns, calling rmsd on each\n",
"# (xyz, species_xyz) pair of coordinate arrays.\n",
"filtered_dft[\"rmsd\"] = np.vectorize(rmsd)(\n",
"    filtered_dft[\"xyz\"], filtered_dft[\"species_xyz\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# For each source keep the single DFT row whose geometry is closest to\n",
"# the species geometry, then attach the species_id.\n",
"best_rows = filtered_dft.groupby(\"source\")[\"rmsd\"].idxmin()\n",
"matched_dft = filtered_dft.loc[best_rows].reset_index(drop=True)\n",
"matched_dft[\"species_id\"] = matched_dft[\"source\"].map(\n",
"    species_data.set_index(\"source\")[\"species_id\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Largest residual RMSD among the accepted matches.\n",
"matched_dft[\"rmsd\"].max()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Assign the initial xyz to each species data point"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Attach each species' matched DFT starting geometry; species whose\n",
"# source had no DFT match receive NaN here.\n",
"species_data[\"initial_xyz\"] = species_data[\"source\"].map(\n",
"    matched_dft.set_index(\"source\")[\"initial_xyz\"]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read and process the semiempirical data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Load the semiempirical results; 'std_xyz' is the optimised geometry\n",
"# and is renamed to 'final_xyz' for clarity.\n",
"semiempirical = (\n",
"    pq.read_table(\n",
"        paquets / \"semiempirical_nonts_semiempirical_v9.parquet\",\n",
"        columns=[\"source\", \"initial_xyz\", \"std_xyz\"],\n",
"    )\n",
"    .to_pandas()\n",
"    .rename(columns={\"std_xyz\": \"final_xyz\"})\n",
")\n",
"semiempirical[\"source\"] = semiempirical[\"source\"].apply(postprocess)\n",
"for col in [\"initial_xyz\", \"final_xyz\"]:\n",
"    semiempirical[col] = semiempirical[col].apply(last_conformer_to_array)\n",
"# Remember the original row position so matches can be traced back.\n",
"semiempirical[\"semiempirical_index\"] = semiempirical.index"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"filtered_semiempirical = semiempirical[\n",
"    semiempirical[\"source\"].isin(species_data[\"source\"])\n",
"].copy()\n",
"df = filtered_semiempirical  # short alias used within this cell\n",
"df[\"dft_initial_xyz\"] = df[\"source\"].map(matched_dft.set_index(\"source\")[\"initial_xyz\"])\n",
"\n",
"\n",
"def msd_func(x, y):\n",
"    \"\"\"Mean squared deviation between two coordinate arrays.\n",
"\n",
"    Deviations below 1e-8 are clamped to exactly 0.0, presumably to\n",
"    absorb floating-point noise between identical geometries.\n",
"    \"\"\"\n",
"    msd = np.square(x - y).sum(-1).mean()\n",
"    return msd if msd > 1e-8 else 0.0\n",
"\n",
"\n",
"# Compare the matched DFT starting geometry against both ends of the\n",
"# semiempirical optimisation.\n",
"msd = {\n",
"    case: np.vectorize(msd_func)(df[f\"{case}_xyz\"], df[\"dft_initial_xyz\"])\n",
"    for case in (\"initial\", \"final\")\n",
"}\n",
"\n",
"df[\"rmsd\"] = np.sqrt(np.minimum(msd[\"initial\"], msd[\"final\"]))\n",
"# True when the pre-optimisation geometry is the closer of the two.\n",
"df[\"from_initial\"] = msd[\"initial\"] < msd[\"final\"]"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"# For each source keep the semiempirical row closest to the DFT start,\n",
"# then attach the species_id.\n",
"best_rows = filtered_semiempirical.groupby(\"source\")[\"rmsd\"].idxmin()\n",
"matched_semiempirical = filtered_semiempirical.loc[best_rows].reset_index(drop=True)\n",
"df = matched_semiempirical  # short alias, also reused by later cells\n",
"df[\"species_id\"] = df[\"source\"].map(species_data.set_index(\"source\")[\"species_id\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Distribution of the residual RMSD, including the extreme tail.\n",
"matched_semiempirical[\"rmsd\"].describe(percentiles=[0.999])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# How often the DFT start matched the semiempirical initial geometry\n",
"# rather than the optimised one.\n",
"matched_semiempirical[\"from_initial\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Break down the 'from_initial' matches by dataset tag in the source path.\n",
"df = matched_semiempirical[matched_semiempirical[\"from_initial\"]]\n",
"total = 0\n",
"for case in [\"aug11b\", \"sep1a_filtered\", \"co2_capture\"]:\n",
"    count = sum(case in src for src in df[\"source\"])\n",
"    total += count\n",
"    print(case, count)\n",
"print(\"total\", total, len(df))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same breakdown for rows whose best match still has nonzero RMSD.\n",
"df = matched_semiempirical[matched_semiempirical[\"rmsd\"] > 0.0].copy()\n",
"total = 0\n",
"for case in [\"aug11b\", \"sep1a_filtered\", \"co2_capture\"]:\n",
"    count = sum(case in src for src in df[\"source\"])\n",
"    total += count\n",
"    print(case, count)\n",
"print(\"total\", total, len(df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save the results"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"# Final match table: one row per species, linking species_id to the\n",
"# matched DFT and semiempirical row indices.\n",
"df = matched_semiempirical\n",
"df[\"dft_index\"] = df[\"source\"].map(matched_dft.set_index(\"source\")[\"dft_index\"])\n",
"columns = [\"species_id\", \"dft_index\", \"semiempirical_index\", \"rmsd\", \"from_initial\"]\n",
"df[columns].sort_values(\"species_id\").to_csv(\n",
"    \"quantum_green_species_data_match.csv\", index=False\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading