Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 335 additions & 0 deletions scripts/semiempirical_split/nonts_analysis.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,335 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Standard library\n",
"import pathlib\n",
"import re\n",
"\n",
"# Third party\n",
"import numpy as np\n",
"import pandas as pd\n",
"from pyarrow import parquet as pq\n",
"\n",
"# Only the rmsd helper is needed from MDAnalysis; it is applied to\n",
"# plain numpy coordinate arrays below.\n",
"from MDAnalysis.analysis.rms import rmsd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Project root on the shared filesystem and the local parquet directory.\n",
"quantum_green = pathlib.Path(\"/home/shared/projects/quantum_green\")\n",
"# NOTE(review): 'paquets' looks like a typo for 'parquets', but later\n",
"# cells reference this name, so it is kept for compatibility.\n",
"paquets = pathlib.Path(\"./parquets\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def xyz_str_to_array(xyz_str):\n",
"    \"\"\"Parse an XYZ-format string into an (n_atoms, 3) coordinate array.\n",
"\n",
"    Lines that do not contain an element symbol followed by three\n",
"    numeric fields (e.g. the atom-count header and the comment line of\n",
"    an XYZ file) are skipped deliberately.\n",
"    \"\"\"\n",
"    rows = []\n",
"    for line in xyz_str.split(\"\\n\"):\n",
"        fields = line.split()\n",
"        try:\n",
"            # fields[0] is the element symbol; 1..3 are x, y, z.\n",
"            rows.append([float(fields[i]) for i in (1, 2, 3)])\n",
"        except (IndexError, ValueError):\n",
"            # Header, comment, or malformed line: skip (best effort).\n",
"            pass\n",
"    return np.array(rows)\n",
"\n",
"\n",
"def last_conformer_to_array(xyz):\n",
"    \"\"\"Return the coordinate columns of the last conformer in ``xyz``.\n",
"\n",
"    NOTE(review): assumes ``xyz`` is either a trajectory (a sequence of\n",
"    per-conformer arrays) or already a single float32 conformer; the\n",
"    first three columns are dropped (presumably per-atom metadata) --\n",
"    confirm against the parquet schema.\n",
"    \"\"\"\n",
"    if xyz[-1].dtype != np.dtype(\"float32\"):\n",
"        # Not yet a per-atom float array: descend to the last conformer.\n",
"        xyz = xyz[-1]\n",
"    return np.stack(xyz)[:, 3:]\n",
"\n",
"\n",
"# Rewrites that translate a semiempirical tar path into the matching\n",
"# DFT log path. Order matters: dict insertion order is preserved.\n",
"SEMIEMPIRICAL_DFT_MAPPING = {\n",
"    \".tar\": \".log\",\n",
"    \"/data1/\": \"/home/gridsan/\",\n",
"    \"semiempirical_opt\": \"DFT_opt_freq\",\n",
"}\n",
"\n",
"\n",
"# Matches '/<id>/<id>.log'. Bug fix: the dot before 'log' is now\n",
"# escaped so it no longer matches an arbitrary character.\n",
"REGEX = re.compile(r\"/(\\d+)/\\1\\.log\")\n",
"\n",
"\n",
"def postprocess(source):\n",
"    \"\"\"Normalise a raw path so semiempirical and DFT sources align.\n",
"\n",
"    Applies the path rewrites above, then collapses the redundant\n",
"    '/<id>/<id>.log' layout to '/<id>.log'.\n",
"    \"\"\"\n",
"    for old, new in SEMIEMPIRICAL_DFT_MAPPING.items():\n",
"        source = source.replace(old, new)\n",
"    result = REGEX.search(source)\n",
"    if result:\n",
"        g1 = result.group(1)\n",
"        return source.replace(f\"/{g1}/{g1}.\", f\"/{g1}.\")\n",
"    return source"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read and process the consolidated species data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): read_pickle can execute arbitrary code on load -- only\n",
"# use with trusted files such as this shared project artifact.\n",
"species_data = pd.read_pickle(\n",
"    quantum_green / \"reactants\" / \"quantum_green_species_data_24march12b.pkl\"\n",
")\n",
"# Normalise the DFT log paths so they can be joined against the parquet\n",
"# 'source' columns, and parse the geometry strings into arrays.\n",
"species_data[\"source\"] = species_data[\"species_dft_log_path\"].apply(postprocess)\n",
"species_data[\"xyz\"] = species_data[\"xyz_str\"].apply(xyz_str_to_array)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: species_id must be a unique key before it is used for joins.\n",
"assert species_data[\"species_id\"].nunique() == len(species_data)\n",
"species_data[\"species_id\"].min(), species_data[\"species_id\"].max()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read and process the parsed DFT data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Load only the columns needed for geometry matching.\n",
"dft = pq.read_table(\n",
"    paquets / \"dft_nonts_dft_v9.parquet\", columns=[\"source\", \"initial_xyz\", \"xyz\"]\n",
").to_pandas()\n",
"dft[\"source\"] = dft[\"source\"].apply(postprocess)\n",
"for col in [\"initial_xyz\", \"xyz\"]:\n",
"    dft[col] = dft[col].apply(last_conformer_to_array)\n",
"# Remember the original row position so matches can be traced back.\n",
"dft[\"dft_index\"] = dft.index"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the DFT rows whose source also appears in the species table.\n",
"filtered_dft = dft[dft[\"source\"].isin(species_data[\"source\"])].copy()\n",
"species_xyz_by_source = species_data.set_index(\"source\")[\"xyz\"]\n",
"filtered_dft[\"species_xyz\"] = filtered_dft[\"source\"].map(species_xyz_by_source)\n",
"# np.vectorize iterates over the object columns, calling rmsd on each\n",
"# (xyz, species_xyz) pair of coordinate arrays.\n",
"filtered_dft[\"rmsd\"] = np.vectorize(rmsd)(\n",
"    filtered_dft[\"xyz\"], filtered_dft[\"species_xyz\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# For each source keep the single DFT row whose geometry is closest to\n",
"# the species geometry, then attach the species_id.\n",
"best_rows = filtered_dft.groupby(\"source\")[\"rmsd\"].idxmin()\n",
"matched_dft = filtered_dft.loc[best_rows].reset_index(drop=True)\n",
"matched_dft[\"species_id\"] = matched_dft[\"source\"].map(\n",
"    species_data.set_index(\"source\")[\"species_id\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Largest residual RMSD among the accepted matches.\n",
"matched_dft[\"rmsd\"].max()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Assign the initial xyz to each species data point"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Attach each species' matched DFT starting geometry; species whose\n",
"# source had no DFT match receive NaN here.\n",
"species_data[\"initial_xyz\"] = species_data[\"source\"].map(\n",
"    matched_dft.set_index(\"source\")[\"initial_xyz\"]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read and process the semiempirical data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Load the semiempirical results; 'std_xyz' is the optimised geometry\n",
"# and is renamed to 'final_xyz' for clarity.\n",
"semiempirical = (\n",
"    pq.read_table(\n",
"        paquets / \"semiempirical_nonts_semiempirical_v9.parquet\",\n",
"        columns=[\"source\", \"initial_xyz\", \"std_xyz\"],\n",
"    )\n",
"    .to_pandas()\n",
"    .rename(columns={\"std_xyz\": \"final_xyz\"})\n",
")\n",
"semiempirical[\"source\"] = semiempirical[\"source\"].apply(postprocess)\n",
"for col in [\"initial_xyz\", \"final_xyz\"]:\n",
"    semiempirical[col] = semiempirical[col].apply(last_conformer_to_array)\n",
"# Remember the original row position so matches can be traced back.\n",
"semiempirical[\"semiempirical_index\"] = semiempirical.index"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"filtered_semiempirical = semiempirical[\n",
"    semiempirical[\"source\"].isin(species_data[\"source\"])\n",
"].copy()\n",
"df = filtered_semiempirical  # short alias used within this cell\n",
"df[\"dft_initial_xyz\"] = df[\"source\"].map(matched_dft.set_index(\"source\")[\"initial_xyz\"])\n",
"\n",
"\n",
"def msd_func(x, y):\n",
"    \"\"\"Mean squared deviation between two coordinate arrays.\n",
"\n",
"    Deviations below 1e-8 are clamped to exactly 0.0, presumably to\n",
"    absorb floating-point noise between identical geometries.\n",
"    \"\"\"\n",
"    msd = np.square(x - y).sum(-1).mean()\n",
"    return msd if msd > 1e-8 else 0.0\n",
"\n",
"\n",
"# Compare the matched DFT starting geometry against both ends of the\n",
"# semiempirical optimisation.\n",
"msd = {\n",
"    case: np.vectorize(msd_func)(df[f\"{case}_xyz\"], df[\"dft_initial_xyz\"])\n",
"    for case in (\"initial\", \"final\")\n",
"}\n",
"\n",
"df[\"rmsd\"] = np.sqrt(np.minimum(msd[\"initial\"], msd[\"final\"]))\n",
"# True when the pre-optimisation geometry is the closer of the two.\n",
"df[\"from_initial\"] = msd[\"initial\"] < msd[\"final\"]"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"# For each source keep the semiempirical row closest to the DFT start,\n",
"# then attach the species_id.\n",
"best_rows = filtered_semiempirical.groupby(\"source\")[\"rmsd\"].idxmin()\n",
"matched_semiempirical = filtered_semiempirical.loc[best_rows].reset_index(drop=True)\n",
"df = matched_semiempirical  # short alias, also reused by later cells\n",
"df[\"species_id\"] = df[\"source\"].map(species_data.set_index(\"source\")[\"species_id\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Distribution of the residual RMSD, including the extreme tail.\n",
"matched_semiempirical[\"rmsd\"].describe(percentiles=[0.999])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# How often the DFT start matched the semiempirical initial geometry\n",
"# rather than the optimised one.\n",
"matched_semiempirical[\"from_initial\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Break down the 'from_initial' matches by dataset tag in the source path.\n",
"df = matched_semiempirical[matched_semiempirical[\"from_initial\"]]\n",
"total = 0\n",
"for case in [\"aug11b\", \"sep1a_filtered\", \"co2_capture\"]:\n",
"    count = sum(case in src for src in df[\"source\"])\n",
"    total += count\n",
"    print(case, count)\n",
"print(\"total\", total, len(df))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same breakdown for rows whose best match still has nonzero RMSD.\n",
"df = matched_semiempirical[matched_semiempirical[\"rmsd\"] > 0.0].copy()\n",
"total = 0\n",
"for case in [\"aug11b\", \"sep1a_filtered\", \"co2_capture\"]:\n",
"    count = sum(case in src for src in df[\"source\"])\n",
"    total += count\n",
"    print(case, count)\n",
"print(\"total\", total, len(df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save the results"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"# Final match table: one row per species, linking species_id to the\n",
"# matched DFT and semiempirical row indices.\n",
"df = matched_semiempirical\n",
"df[\"dft_index\"] = df[\"source\"].map(matched_dft.set_index(\"source\")[\"dft_index\"])\n",
"columns = [\"species_id\", \"dft_index\", \"semiempirical_index\", \"rmsd\", \"from_initial\"]\n",
"df[columns].sort_values(\"species_id\").to_csv(\n",
"    \"quantum_green_species_data_match.csv\", index=False\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading