Finished top-level notebook to analyze aggregated instance data

markcoletti · markcoletti · commit 1a356cfa7870 · 2026-02-13T11:36:51.000-05:00
diff --git a/examples-proposed/024-aggregated-compute-ensemble/input_dir/global_notebook.ipynb b/examples-proposed/024-aggregated-compute-ensemble/input_dir/global_notebook.ipynb
@@ -25,6 +25,7 @@
     "import json\n",
     "import csv\n",
     "from pathlib import Path\n",
+    "import pandas as pd\n",
     "\n",
     "%matplotlib notebook\n",
     "import matplotlib.pyplot as plt"
@@ -42,22 +43,96 @@
    },
    "cell_type": "code",
    "source": [
-    "a = ips_analysis_api.get_data()\n",
-    "b = ips_analysis_api.get_child_data()\n",
-    "c = ips_analysis_api.get_child_data_not_ensembles()\n",
-    "d = ips_analysis_api.get_child_data_by_ensemble_names()"
+    "# Gather ensemble instance data\n",
+    "ensemble_data = ips_analysis_api.get_child_data()"
    ],
    "id": "3d8fa1127a2ab4ef",
    "outputs": [],
    "execution_count": 1
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "`ensemble_data` is a nested Python dict: the outer dict is keyed by run number, each inner dict is keyed by timestamp, and each timestamp maps to a list of all the data that was registered for that instance run.",
+   "id": "a47d457006cb62c0"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "ensemble_data",
+   "id": "baab25e709153aec"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "For this example, we're interested in aggregating the instance data found in all the CSV files.  (The JSON files contain the solutions for each instance, which are covered by separate Jupyter notebooks, and so are outside the scope of this notebook.)  So, let's create a pandas DataFrame from all the CSV files.",
+   "id": "be9c1c201521542c"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "rows = []\n",
+    "for instance_data in ensemble_data.values():\n",
+    "    # NOTE(review): assumes the CSV path is item 1 of the list registered at timestamp 0.0 -- confirm\n",
+    "    csv_path = Path(instance_data[0.0][1])\n",
+    "    print(f'Reading {csv_path}')\n",
+    "    # newline='' per the csv module docs, so quoted fields with embedded newlines parse correctly\n",
+    "    with csv_path.open('r', newline='') as csv_file:\n",
+    "        rows.extend(csv.DictReader(csv_file))"
+   ],
+   "id": "94a8c3dce6c33ae6"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "df = pd.DataFrame(rows)",
+   "id": "3954d70afe1b1794"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "df",
+   "id": "4ddbb45fc4de7"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "# convert start and end times to floats to make new duration column\n",
+    "df['start'] = df['start'].astype(float)\n",
+    "df['end'] = df['end'].astype(float)"
+   ],
+   "id": "1bf3f724abe263db"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "df['duration'] = df['end'] - df['start']",
+   "id": "e1a339aacb56d29f"
+  },
   {
    "metadata": {},
    "cell_type": "code",
    "outputs": [],
    "execution_count": null,
-   "source": "",
-   "id": "c2e3d3e1518509f8"
+   "source": [
+    "# Show the final dataframe with new duration column\n",
+    "df"
+   ],
+   "id": "e2ec1f8dca7ad3e8"
   }
  ],
  "metadata": {