diff --git a/notebooks/modeling.ipynb b/notebooks/modeling.ipynb
index 24d3e2e..377a6e2 100644
--- a/notebooks/modeling.ipynb
+++ b/notebooks/modeling.ipynb
@@ -1,806 +1,773 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "d4f4e2b7",
- "metadata": {},
- "source": [
- "# Modeling\n",
- "\n",
- "In this tutorial, we will show you how to use `zephyr_ml`'s `Zephyr` class to train models. This tutorial builds on top of the previous one where we create EntitySets, generate label times, and do automated feature engineering. To do any of these previous steps, please refer to `feature_engineering` notebook.\n",
- "\n",
- "## 1) Load the Feature Matrix\n",
- "\n",
- "Load the feature matrix which is the result of the `feature_engineering` notebook. For the purpose of this tutorial, we use a dummy feature matrix stored in the `data/` folder."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "4a6724ad",
- "metadata": {},
- "outputs": [
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "d4f4e2b7",
+ "metadata": {},
+ "source": [
+ "# Modeling\n",
+ "\n",
+ "In this tutorial, we will show you how to use `zephyr_ml`'s `Zephyr` class to train models. This tutorial builds on top of the previous one where we create EntitySets, generate label times, and do automated feature engineering. To do any of these previous steps, please refer to the `feature_engineering` notebook.\n",
+ "\n",
+ "## 1) Load the Feature Matrix\n",
+ "\n",
+ "Load the feature matrix which is the result of the `feature_engineering` notebook. For the purpose of this tutorial, we use a dummy feature matrix stored in the `data/` folder."
+ ]
+ },
{
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " COUNT(alarms) \n",
- " MAX(alarms.IND_DURATION) \n",
- " MIN(alarms.IND_DURATION) \n",
- " SUM(alarms.IND_DURATION) \n",
- " COUNT(stoppages) \n",
- " MAX(stoppages.COD_WO) \n",
- " MAX(stoppages.IND_DURATION) \n",
- " MAX(stoppages.IND_LOST_GEN) \n",
- " MIN(stoppages.COD_WO) \n",
- " MIN(stoppages.IND_DURATION) \n",
- " ... \n",
- " DES_CORE_ELEMENT_T12 \n",
- " DES_CORE_ELEMENT_T13 \n",
- " DES_CORE_ELEMENT_T14 \n",
- " DES_CORE_ELEMENT_T15 \n",
- " SITE_LOCATION \n",
- " DES_CORE_PLANT_LOC \n",
- " COD_PLANT_SAP_ABC \n",
- " COD_PLANT_SAP_XYZ \n",
- " PI_COLLECTOR_SITE_NAME_LOC0 \n",
- " PI_LOCAL_SITE_NAME_LOC0 \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 1 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 12345.0 \n",
- " NaN \n",
- " NaN \n",
- " 12345.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 1 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 37452.0 \n",
- " NaN \n",
- " NaN \n",
- " 37452.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 2 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 23432.0 \n",
- " NaN \n",
- " NaN \n",
- " 23432.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 3 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 12452.0 \n",
- " NaN \n",
- " NaN \n",
- " 12452.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 4 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 32435.0 \n",
- " NaN \n",
- " NaN \n",
- " 32435.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 5 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 23534.0 \n",
- " NaN \n",
- " NaN \n",
- " 23534.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 6 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 65431.0 \n",
- " NaN \n",
- " NaN \n",
- " 65431.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 7 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 35742.0 \n",
- " NaN \n",
- " NaN \n",
- " 35742.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 8 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 21343.0 \n",
- " NaN \n",
- " NaN \n",
- " 21343.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 9 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 43565.0 \n",
- " NaN \n",
- " NaN \n",
- " 43565.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 10 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 24525.0 \n",
- " NaN \n",
- " NaN \n",
- " 24525.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 1 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 11 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 67432.0 \n",
- " NaN \n",
- " NaN \n",
- " 67432.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- " 12 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " 1 \n",
- " 21342.0 \n",
- " NaN \n",
- " NaN \n",
- " 21342.0 \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " 0 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " 0 \n",
- " 1 \n",
- " 1 \n",
- " 1 \n",
- " \n",
- " \n",
- "
\n",
- "
13 rows × 101 columns
\n",
- "
"
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "4a6724ad",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " COUNT(alarms) \n",
+ " MAX(alarms.IND_DURATION) \n",
+ " MIN(alarms.IND_DURATION) \n",
+ " SUM(alarms.IND_DURATION) \n",
+ " COUNT(stoppages) \n",
+ " MAX(stoppages.COD_WO) \n",
+ " MAX(stoppages.IND_DURATION) \n",
+ " MAX(stoppages.IND_LOST_GEN) \n",
+ " MIN(stoppages.COD_WO) \n",
+ " MIN(stoppages.IND_DURATION) \n",
+ " ... \n",
+ " DES_CORE_ELEMENT_T12 \n",
+ " DES_CORE_ELEMENT_T13 \n",
+ " DES_CORE_ELEMENT_T14 \n",
+ " DES_CORE_ELEMENT_T15 \n",
+ " SITE_LOCATION \n",
+ " DES_CORE_PLANT_LOC \n",
+ " COD_PLANT_SAP_ABC \n",
+ " COD_PLANT_SAP_XYZ \n",
+ " PI_COLLECTOR_SITE_NAME_LOC0 \n",
+ " PI_LOCAL_SITE_NAME_LOC0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 12345.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 12345.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 37452.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 37452.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 23432.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 23432.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 12452.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 12452.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 32435.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 32435.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 23534.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 23534.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 65431.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 65431.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 35742.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 35742.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 21343.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 21343.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 43565.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 43565.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 24525.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 24525.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 67432.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 67432.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " 0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 21342.0 \n",
+ " NaN \n",
+ " NaN \n",
+ " 21342.0 \n",
+ " NaN \n",
+ " ... \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
13 rows × 101 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " COUNT(alarms) MAX(alarms.IND_DURATION) MIN(alarms.IND_DURATION) \\\n",
+ "0 1 NaN NaN \n",
+ "1 0 NaN NaN \n",
+ "2 0 NaN NaN \n",
+ "3 0 NaN NaN \n",
+ "4 0 NaN NaN \n",
+ "5 0 NaN NaN \n",
+ "6 0 NaN NaN \n",
+ "7 0 NaN NaN \n",
+ "8 0 NaN NaN \n",
+ "9 0 NaN NaN \n",
+ "10 0 NaN NaN \n",
+ "11 0 NaN NaN \n",
+ "12 0 NaN NaN \n",
+ "\n",
+ " SUM(alarms.IND_DURATION) COUNT(stoppages) MAX(stoppages.COD_WO) \\\n",
+ "0 0.0 1 12345.0 \n",
+ "1 0.0 1 37452.0 \n",
+ "2 0.0 1 23432.0 \n",
+ "3 0.0 1 12452.0 \n",
+ "4 0.0 1 32435.0 \n",
+ "5 0.0 1 23534.0 \n",
+ "6 0.0 1 65431.0 \n",
+ "7 0.0 1 35742.0 \n",
+ "8 0.0 1 21343.0 \n",
+ "9 0.0 1 43565.0 \n",
+ "10 0.0 1 24525.0 \n",
+ "11 0.0 1 67432.0 \n",
+ "12 0.0 1 21342.0 \n",
+ "\n",
+ " MAX(stoppages.IND_DURATION) MAX(stoppages.IND_LOST_GEN) \\\n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN \n",
+ "5 NaN NaN \n",
+ "6 NaN NaN \n",
+ "7 NaN NaN \n",
+ "8 NaN NaN \n",
+ "9 NaN NaN \n",
+ "10 NaN NaN \n",
+ "11 NaN NaN \n",
+ "12 NaN NaN \n",
+ "\n",
+ " MIN(stoppages.COD_WO) MIN(stoppages.IND_DURATION) ... \\\n",
+ "0 12345.0 NaN ... \n",
+ "1 37452.0 NaN ... \n",
+ "2 23432.0 NaN ... \n",
+ "3 12452.0 NaN ... \n",
+ "4 32435.0 NaN ... \n",
+ "5 23534.0 NaN ... \n",
+ "6 65431.0 NaN ... \n",
+ "7 35742.0 NaN ... \n",
+ "8 21343.0 NaN ... \n",
+ "9 43565.0 NaN ... \n",
+ "10 24525.0 NaN ... \n",
+ "11 67432.0 NaN ... \n",
+ "12 21342.0 NaN ... \n",
+ "\n",
+ " DES_CORE_ELEMENT_T12 DES_CORE_ELEMENT_T13 DES_CORE_ELEMENT_T14 \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "5 0 0 0 \n",
+ "6 0 0 0 \n",
+ "7 0 0 0 \n",
+ "8 0 0 0 \n",
+ "9 0 0 0 \n",
+ "10 0 1 0 \n",
+ "11 0 0 1 \n",
+ "12 0 0 0 \n",
+ "\n",
+ " DES_CORE_ELEMENT_T15 SITE_LOCATION DES_CORE_PLANT_LOC \\\n",
+ "0 0 1 1 \n",
+ "1 0 1 1 \n",
+ "2 0 1 1 \n",
+ "3 0 1 1 \n",
+ "4 0 1 1 \n",
+ "5 0 1 1 \n",
+ "6 0 1 1 \n",
+ "7 0 1 1 \n",
+ "8 0 1 1 \n",
+ "9 0 1 1 \n",
+ "10 0 1 1 \n",
+ "11 0 1 1 \n",
+ "12 1 1 1 \n",
+ "\n",
+ " COD_PLANT_SAP_ABC COD_PLANT_SAP_XYZ PI_COLLECTOR_SITE_NAME_LOC0 \\\n",
+ "0 1 0 1 \n",
+ "1 1 0 1 \n",
+ "2 1 0 1 \n",
+ "3 1 0 1 \n",
+ "4 1 0 1 \n",
+ "5 1 0 1 \n",
+ "6 1 0 1 \n",
+ "7 0 1 1 \n",
+ "8 0 1 1 \n",
+ "9 0 1 1 \n",
+ "10 0 1 1 \n",
+ "11 0 1 1 \n",
+ "12 0 1 1 \n",
+ "\n",
+ " PI_LOCAL_SITE_NAME_LOC0 \n",
+ "0 1 \n",
+ "1 1 \n",
+ "2 1 \n",
+ "3 1 \n",
+ "4 1 \n",
+ "5 1 \n",
+ "6 1 \n",
+ "7 1 \n",
+ "8 1 \n",
+ "9 1 \n",
+ "10 1 \n",
+ "11 1 \n",
+ "12 1 \n",
+ "\n",
+ "[13 rows x 101 columns]"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
],
- "text/plain": [
- " COUNT(alarms) MAX(alarms.IND_DURATION) MIN(alarms.IND_DURATION) \\\n",
- "0 1 NaN NaN \n",
- "1 0 NaN NaN \n",
- "2 0 NaN NaN \n",
- "3 0 NaN NaN \n",
- "4 0 NaN NaN \n",
- "5 0 NaN NaN \n",
- "6 0 NaN NaN \n",
- "7 0 NaN NaN \n",
- "8 0 NaN NaN \n",
- "9 0 NaN NaN \n",
- "10 0 NaN NaN \n",
- "11 0 NaN NaN \n",
- "12 0 NaN NaN \n",
- "\n",
- " SUM(alarms.IND_DURATION) COUNT(stoppages) MAX(stoppages.COD_WO) \\\n",
- "0 0.0 1 12345.0 \n",
- "1 0.0 1 37452.0 \n",
- "2 0.0 1 23432.0 \n",
- "3 0.0 1 12452.0 \n",
- "4 0.0 1 32435.0 \n",
- "5 0.0 1 23534.0 \n",
- "6 0.0 1 65431.0 \n",
- "7 0.0 1 35742.0 \n",
- "8 0.0 1 21343.0 \n",
- "9 0.0 1 43565.0 \n",
- "10 0.0 1 24525.0 \n",
- "11 0.0 1 67432.0 \n",
- "12 0.0 1 21342.0 \n",
- "\n",
- " MAX(stoppages.IND_DURATION) MAX(stoppages.IND_LOST_GEN) \\\n",
- "0 NaN NaN \n",
- "1 NaN NaN \n",
- "2 NaN NaN \n",
- "3 NaN NaN \n",
- "4 NaN NaN \n",
- "5 NaN NaN \n",
- "6 NaN NaN \n",
- "7 NaN NaN \n",
- "8 NaN NaN \n",
- "9 NaN NaN \n",
- "10 NaN NaN \n",
- "11 NaN NaN \n",
- "12 NaN NaN \n",
- "\n",
- " MIN(stoppages.COD_WO) MIN(stoppages.IND_DURATION) ... \\\n",
- "0 12345.0 NaN ... \n",
- "1 37452.0 NaN ... \n",
- "2 23432.0 NaN ... \n",
- "3 12452.0 NaN ... \n",
- "4 32435.0 NaN ... \n",
- "5 23534.0 NaN ... \n",
- "6 65431.0 NaN ... \n",
- "7 35742.0 NaN ... \n",
- "8 21343.0 NaN ... \n",
- "9 43565.0 NaN ... \n",
- "10 24525.0 NaN ... \n",
- "11 67432.0 NaN ... \n",
- "12 21342.0 NaN ... \n",
- "\n",
- " DES_CORE_ELEMENT_T12 DES_CORE_ELEMENT_T13 DES_CORE_ELEMENT_T14 \\\n",
- "0 0 0 0 \n",
- "1 0 0 0 \n",
- "2 0 0 0 \n",
- "3 0 0 0 \n",
- "4 0 0 0 \n",
- "5 0 0 0 \n",
- "6 0 0 0 \n",
- "7 0 0 0 \n",
- "8 0 0 0 \n",
- "9 0 0 0 \n",
- "10 0 1 0 \n",
- "11 0 0 1 \n",
- "12 0 0 0 \n",
- "\n",
- " DES_CORE_ELEMENT_T15 SITE_LOCATION DES_CORE_PLANT_LOC \\\n",
- "0 0 1 1 \n",
- "1 0 1 1 \n",
- "2 0 1 1 \n",
- "3 0 1 1 \n",
- "4 0 1 1 \n",
- "5 0 1 1 \n",
- "6 0 1 1 \n",
- "7 0 1 1 \n",
- "8 0 1 1 \n",
- "9 0 1 1 \n",
- "10 0 1 1 \n",
- "11 0 1 1 \n",
- "12 1 1 1 \n",
- "\n",
- " COD_PLANT_SAP_ABC COD_PLANT_SAP_XYZ PI_COLLECTOR_SITE_NAME_LOC0 \\\n",
- "0 1 0 1 \n",
- "1 1 0 1 \n",
- "2 1 0 1 \n",
- "3 1 0 1 \n",
- "4 1 0 1 \n",
- "5 1 0 1 \n",
- "6 1 0 1 \n",
- "7 0 1 1 \n",
- "8 0 1 1 \n",
- "9 0 1 1 \n",
- "10 0 1 1 \n",
- "11 0 1 1 \n",
- "12 0 1 1 \n",
- "\n",
- " PI_LOCAL_SITE_NAME_LOC0 \n",
- "0 1 \n",
- "1 1 \n",
- "2 1 \n",
- "3 1 \n",
- "4 1 \n",
- "5 1 \n",
- "6 1 \n",
- "7 1 \n",
- "8 1 \n",
- "9 1 \n",
- "10 1 \n",
- "11 1 \n",
- "12 1 \n",
- "\n",
- "[13 rows x 101 columns]"
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "feature_matrix = pd.read_csv('data/feature_matrix.csv')\n",
+ "feature_matrix"
]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "feature_matrix = pd.read_csv('data/feature_matrix.csv')\n",
- "feature_matrix"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2be92488",
- "metadata": {},
- "source": [
- "## 2) Preparing Model Inputs\n",
- "\n",
- "Prepare the data for modeling. Depending on the data, you might need to: normalize the data, impute missing values, etc.\n",
- "\n",
- "In this part of the notebook, we do the following:\n",
- "* create `X` and `y` variables from the feature matrix\n",
- "* impute missing values using a SimpleImpute\n",
- "* pass the data into our `Zephyr` instance and split the data into training and testing"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "b3be626a",
- "metadata": {},
- "outputs": [
+ },
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/sklearn/impute/_base.py:555: UserWarning: Skipping features without any observed values: [ 1 2 6 7 9 10 15 16 17 18]. At least one non-missing value is needed for imputation with strategy='mean'.\n",
- " warnings.warn(\n",
- "[GUIDE] STALE WARNING: \n",
- "\tPerforming step 2 from step -1 with set_feature_matrix.\n",
- "\tThis is a forward step via a set method.\n",
- "\tThe current step is -1.\n",
- "\tAll previous steps will be considered stale.\n",
- "[GUIDE] Successfully performed set_feature_matrix.\n",
- "\tYou can perform the next step by calling generate_train_test_split.\n",
- "[GUIDE] Successfully performed generate_train_test_split.\n",
- "\tYou can perform the next step by calling fit_pipeline.\n"
- ]
+ "cell_type": "markdown",
+ "id": "2be92488",
+ "metadata": {},
+ "source": [
+ "## 2) Preparing Model Inputs\n",
+ "\n",
+ "Prepare the data for modeling. Depending on the data, you might need to: normalize the data, impute missing values, etc.\n",
+ "\n",
+ "In this part of the notebook, we do the following:\n",
+ "* create `X` and `y` variables from the feature matrix\n",
+ "* impute missing values using a `SimpleImputer`\n",
+ "* pass the data into our `Zephyr` instance and split the data into training and testing"
+ ]
},
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[True, False, True, False, True, False, False, False, False, True, False, False, False]\n"
- ]
- }
- ],
- "source": [
- "from sklearn.impute import SimpleImputer\n",
- "from zephyr_ml import Zephyr\n",
- "\n",
- "# pop the target labels\n",
- "y = list(feature_matrix.pop('label'))\n",
- "print(y)\n",
- "X = feature_matrix.values\n",
- "\n",
- "\n",
- "# impute missing values\n",
- "imputer = SimpleImputer()\n",
- "X = pd.DataFrame(imputer.fit_transform(X))\n",
- "\n",
- "zephyr = Zephyr()\n",
- "zephyr.set_feature_matrix(X, labels = y)\n",
- "X_train, X_test, y_train, y_test = zephyr.generate_train_test_split(test_size=0.2, random_state=33)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3c8b00e2",
- "metadata": {},
- "source": [
- "## 3) Train a Model\n",
- "\n",
- "We train a model using `Zephyr`'s `fit_pipeline` method.\n",
- "In this notebook, we use an `xgb_classifier` pipeline which consists of two primitives:\n",
- "\n",
- "```\n",
- " \"xgboost.XGBClassifier\"\n",
- " \"zephyr_ml.primitives.postprocessing.FindThreshold\"\n",
- "```\n",
- "\n",
- "An `XGBClassifier` primitive is an XGB model that returns the probability of each class, and `FindThreshold` primitive creates binary labels from the output of the XGB model by choosing a threshold that produces the best metric value (F1 Score by default)\n",
- "\n",
- "To use a pipeline, we simply pass the name of the pipeline to our `Zephyr` instance.\n",
- "Optionally, you can change the default settings of the primitive by passing a hyperparameter dictionary. For example, we can change the number of trees in the classifier to be 50 instead of the default value (100)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "edffee03",
- "metadata": {},
- "outputs": [
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "b3be626a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/sklearn/impute/_base.py:555: UserWarning: Skipping features without any observed values: [ 1 2 6 7 9 10 15 16 17 18]. At least one non-missing value is needed for imputation with strategy='mean'.\n",
+ " warnings.warn(\n",
+ "[GUIDE] Successfully performed set_feature_matrix.\n",
+ "\tYou can perform the next step by calling generate_train_test_split.\n",
+ "[GUIDE] Successfully performed generate_train_test_split.\n",
+ "\tYou can perform the next step by calling fit_pipeline.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[True, False, True, False, True, False, False, False, False, True, False, False, False]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.impute import SimpleImputer\n",
+ "from zephyr_ml import Zephyr\n",
+ "\n",
+ "# pop the target labels\n",
+ "y = list(feature_matrix.pop('label'))\n",
+ "print(y)\n",
+ "X = feature_matrix.values\n",
+ "\n",
+ "\n",
+ "# impute missing values\n",
+ "imputer = SimpleImputer()\n",
+ "X = pd.DataFrame(imputer.fit_transform(X))\n",
+ "\n",
+ "zephyr = Zephyr()\n",
+ "zephyr.set_feature_matrix(X, labels = y)\n",
+ "X_train, X_test, y_train, y_test = zephyr.generate_train_test_split(test_size=0.2, random_state=33)"
+ ]
+ },
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[GUIDE] Successfully performed fit_pipeline.\n",
- "\tYou can perform the next step by calling predict or evaluate.\n"
- ]
- }
- ],
- "source": [
- "hyperparameters = {\n",
- " \"xgboost.XGBClassifier#1\": {\n",
- " \"n_estimators\": 50\n",
- " }\n",
- "}\n",
- "\n",
- "zephyr.fit_pipeline(pipeline = \"xgb_classifier\", pipeline_hyperparameters = hyperparameters)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "445afd22",
- "metadata": {},
- "source": [
- "Now that the pipeline is trained, we can use it to predict the values of the test data using `predict` function\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "78187756",
- "metadata": {},
- "outputs": [
+ "cell_type": "markdown",
+ "id": "3c8b00e2",
+ "metadata": {},
+ "source": [
+ "## 3) Train a Model\n",
+ "\n",
+ "We train a model using `Zephyr`'s `fit_pipeline` method.\n",
+ "In this notebook, we use an `xgb_classifier` pipeline which consists of two primitives:\n",
+ "\n",
+ "```\n",
+ " \"xgboost.XGBClassifier\"\n",
+ " \"zephyr_ml.primitives.postprocessing.FindThreshold\"\n",
+ "```\n",
+ "\n",
+ "An `XGBClassifier` primitive is an XGB model that returns the probability of each class, and the `FindThreshold` primitive creates binary labels from the output of the XGB model by choosing a threshold that produces the best metric value (F1 Score by default).\n",
+ "\n",
+ "To use a pipeline, we simply pass the name of the pipeline to our `Zephyr` instance.\n",
+ "Optionally, you can change the default settings of the primitive by passing a hyperparameter dictionary. For example, we can change the number of trees in the classifier to be 50 instead of the default value (100)."
+ ]
+ },
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[GUIDE] Successfully performed predict.\n",
- "\tYou have reached the end of the predictive engineering workflow.\n",
- "\tYou can call predict or evaluate again or re-perform previous steps based on results.\n"
- ]
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "edffee03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hyperparameters = {\n",
+ " \"xgboost.XGBClassifier#1\": {\n",
+ " \"n_estimators\": 50\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "zephyr.fit_pipeline(pipeline = \"xgb_classifier\", pipeline_hyperparameters = hyperparameters)"
+ ]
},
{
- "data": {
- "text/plain": [
- "[1, 0, 1]"
+ "cell_type": "markdown",
+ "id": "445afd22",
+ "metadata": {},
+ "source": [
+ "Now that the pipeline is trained, we can use it to predict the values of the test data using the `predict` function.\n"
]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "zephyr.predict()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "24cda971",
- "metadata": {},
- "source": [
- "Lastly, we can evaluate the performance of the pipeline using `evaluate` function\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "cd097853",
- "metadata": {},
- "outputs": [
+ },
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[GUIDE] Successfully performed evaluate.\n",
- "\tYou have reached the end of the predictive engineering workflow.\n",
- "\tYou can call predict or evaluate again or re-perform previous steps based on results.\n"
- ]
- }
- ],
- "source": [
- "res = zephyr.evaluate()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "8df0f26c",
- "metadata": {},
- "outputs": [
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "78187756",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[1, 0, 1]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "zephyr.predict()"
+ ]
+ },
{
- "data": {
- "text/plain": [
- "{'sklearn.metrics.accuracy_score': 0.6666666666666666,\n",
- " 'sklearn.metrics.precision_score': 0.5,\n",
- " 'sklearn.metrics.f1_score': 0.6666666666666666,\n",
- " 'sklearn.metrics.recall_score': 1.0,\n",
- " 'zephyr_ml.primitives.postprocessing.confusion_matrix': (array([[1, 1],\n",
- " [0, 1]]),\n",
- " ),\n",
- " 'zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve': (0.5,\n",
- " )}"
+ "cell_type": "markdown",
+ "id": "24cda971",
+ "metadata": {},
+ "source": [
+ "Lastly, we can evaluate the performance of the pipeline using the `evaluate` function.\n"
]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "res"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e2657da3",
- "metadata": {},
- "source": [
- "The `confusion_matrix` and `roc_auc_score_and_curve` evaluation metrics return some `matplotlib.figure.Figure` objects, which we can display, as shown below."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "b74c3618",
- "metadata": {},
- "outputs": [
+ },
{
- "data": {
- "image/png": "",
- "text/plain": [
- ""
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "cd097853",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res = zephyr.evaluate()"
]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%matplotlib inline\n",
- "_, conf_matrix_fig = res[\"zephyr_ml.primitives.postprocessing.confusion_matrix\"]\n",
- "conf_matrix_fig"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "d59e86b1",
- "metadata": {},
- "outputs": [
+ },
{
- "data": {
- "image/png": "",
- "text/plain": [
- ""
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8df0f26c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'sklearn.metrics.accuracy_score': 0.6666666666666666,\n",
+ " 'sklearn.metrics.precision_score': 0.5,\n",
+ " 'sklearn.metrics.f1_score': 0.6666666666666666,\n",
+ " 'sklearn.metrics.recall_score': 1.0,\n",
+ " 'zephyr_ml.primitives.postprocessing.confusion_matrix': (array([[1, 1],\n",
+ " [0, 1]]),\n",
+ " ),\n",
+ " 'zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve': (0.5,\n",
+ " )}"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res"
]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e2657da3",
+ "metadata": {},
+ "source": [
+ "The `confusion_matrix` and `roc_auc_score_and_curve` evaluation metrics return some `matplotlib.figure.Figure` objects, which we can display, as shown below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "b74c3618",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%matplotlib inline\n",
+ "_, conf_matrix_fig = res[\"zephyr_ml.primitives.postprocessing.confusion_matrix\"]\n",
+ "conf_matrix_fig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "d59e86b1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "_, roc_fig = res[\"zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve\"]\n",
+ "\n",
+ "roc_fig\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.0"
}
- ],
- "source": [
- "\n",
- "_, roc_fig = res[\"zephyr_ml.primitives.postprocessing.roc_auc_score_and_curve\"]\n",
- "\n",
- "roc_fig\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "venv",
- "language": "python",
- "name": "python3"
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/notebooks/visualization.ipynb b/notebooks/visualization.ipynb
index 3e5f9fb..86e34b5 100644
--- a/notebooks/visualization.ipynb
+++ b/notebooks/visualization.ipynb
@@ -1,297 +1,283 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "3674a18e",
- "metadata": {},
- "source": [
- "# Visualization\n",
- "\n",
- "In this tutorial, we will show you how to use Zephyr class to obtain intermediate results of the pipeline for visualization purposes during the fitting stage. To know more about pipelines and Zephyr class please refer to the modeling notebook. We also used a demo feature matrix, to know how you can create features, please refer to feature_engineering notebook.\n",
- "\n",
- "## Load the Feature Matrix\n",
- "\n",
- "Load the feature matrix which is the result of the `feature_engineering` notebook. For the purpose of this tutorial, we use a dummy feature matrix stored in the `data/` folder."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "d6f954db",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "\n",
- "feature_matrix = pd.read_csv('data/feature_matrix.csv')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4ba7879c",
- "metadata": {},
- "source": [
- "## Prepare data\n",
- "\n",
- "Prepare the data for training by creating a `y` variable to hold the labels, imputing missing values, and normlizing the data. We then initialize a `Zephyr` instance, set our data, and split it into training and testing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "23ec49dd",
- "metadata": {},
- "outputs": [
+ "cells": [
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/sklearn/impute/_base.py:555: UserWarning: Skipping features without any observed values: [ 1 2 6 7 9 10 15 16 17 18]. At least one non-missing value is needed for imputation with strategy='mean'.\n",
- " warnings.warn(\n",
- "[GUIDE] STALE WARNING: \n",
- "\tPerforming step 2 from step -1 with set_feature_matrix.\n",
- "\tThis is a forward step via a set method.\n",
- "\tThe current step is -1.\n",
- "\tAll previous steps will be considered stale.\n",
- "[GUIDE] Successfully performed set_feature_matrix.\n",
- "\tYou can perform the next step by calling generate_train_test_split.\n",
- "[GUIDE] Successfully performed generate_train_test_split.\n",
- "\tYou can perform the next step by calling fit_pipeline.\n"
- ]
+ "cell_type": "markdown",
+ "id": "3674a18e",
+ "metadata": {},
+ "source": [
+ "# Visualization\n",
+ "\n",
+ "In this tutorial, we will show you how to use the `Zephyr` class to obtain intermediate results of the pipeline for visualization purposes during the fitting stage. To learn more about pipelines and the `Zephyr` class, please refer to the `modeling` notebook. We also use a demo feature matrix; to learn how to create features, please refer to the `feature_engineering` notebook.\n",
+ "\n",
+ "## Load the Feature Matrix\n",
+ "\n",
+ "Load the feature matrix which is the result of the `feature_engineering` notebook. For the purpose of this tutorial, we use a dummy feature matrix stored in the `data/` folder."
+ ]
},
{
- "data": {
- "text/plain": [
- "( 0 1 2 3 4 5 6 7 8 9 ... \\\n",
- " 10 -0.288675 0.0 0.0 -0.463185 -0.463185 -0.463185 0.0 0.0 0.0 0.0 ... \n",
- " 5 -0.288675 0.0 0.0 -0.521570 -0.521570 -0.521570 0.0 0.0 0.0 0.0 ... \n",
- " 3 -0.288675 0.0 0.0 -1.174466 -1.174466 -1.174466 0.0 0.0 0.0 0.0 ... \n",
- " 11 -0.288675 0.0 0.0 2.064680 2.064680 2.064680 0.0 0.0 0.0 0.0 ... \n",
- " 1 -0.288675 0.0 0.0 0.298409 0.298409 0.298409 0.0 0.0 0.0 0.0 ... \n",
- " 9 -0.288675 0.0 0.0 0.658556 0.658556 0.658556 0.0 0.0 0.0 0.0 ... \n",
- " 2 -0.288675 0.0 0.0 -0.527579 -0.527579 -0.527579 0.0 0.0 0.0 0.0 ... \n",
- " 8 -0.288675 0.0 0.0 -0.650653 -0.650653 -0.650653 0.0 0.0 0.0 0.0 ... \n",
- " 7 -0.288675 0.0 0.0 0.197664 0.197664 0.197664 0.0 0.0 0.0 0.0 ... \n",
- " 4 -0.288675 0.0 0.0 0.002832 0.002832 0.002832 0.0 0.0 0.0 0.0 ... \n",
- " \n",
- " 80 81 82 83 84 85 86 87 88 89 \n",
- " 10 0.0 3.464102 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
- " 5 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
- " 3 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
- " 11 0.0 -0.288675 3.464102 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
- " 1 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
- " 9 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
- " 2 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
- " 8 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
- " 7 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
- " 4 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
- " \n",
- " [10 rows x 90 columns],\n",
- " 0 1 2 3 4 5 6 7 8 9 ... \\\n",
- " 6 -0.288675 0.0 0.0 1.946791 1.946791 1.946791 0.0 0.0 0.0 0.0 ... \n",
- " 12 -0.288675 0.0 0.0 -0.650711 -0.650711 -0.650711 0.0 0.0 0.0 0.0 ... \n",
- " 0 3.464102 0.0 0.0 -1.180770 -1.180770 -1.180770 0.0 0.0 0.0 0.0 ... \n",
- " \n",
- " 80 81 82 83 84 85 86 87 88 89 \n",
- " 6 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
- " 12 0.0 -0.288675 -0.288675 3.464102 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
- " 0 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
- " \n",
- " [3 rows x 90 columns],\n",
- " 10 False\n",
- " 5 False\n",
- " 3 False\n",
- " 11 False\n",
- " 1 False\n",
- " 9 True\n",
- " 2 True\n",
- " 8 False\n",
- " 7 False\n",
- " 4 True\n",
- " Name: label, dtype: bool,\n",
- " 6 False\n",
- " 12 False\n",
- " 0 True\n",
- " Name: label, dtype: bool)"
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "d6f954db",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "feature_matrix = pd.read_csv('data/feature_matrix.csv')"
]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from sklearn.impute import SimpleImputer\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "from zephyr_ml import Zephyr\n",
- "\n",
- "# pop the target labels\n",
- "y = list(feature_matrix.pop('label'))\n",
- "X = feature_matrix.values\n",
- "\n",
- "# impute missing values\n",
- "imputer = SimpleImputer()\n",
- "X = imputer.fit_transform(X)\n",
- "\n",
- "# normalize the data\n",
- "scaler = StandardScaler()\n",
- "X = pd.DataFrame(scaler.fit_transform(X))\n",
- "\n",
- "zephyr = Zephyr()\n",
- "zephyr.set_feature_matrix(feature_matrix=X, labels = y)\n",
- "zephyr.generate_train_test_split(test_size=0.2, random_state=33)\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "733a854b",
- "metadata": {},
- "source": [
- "## Select Model\n",
- "\n",
- "Select a model that has a `visual` block such as the `xgb_classifier` pipeline.\n",
- "\n",
- "The visual block in the pipeline json defines what are the intermediate results you want to capture and return during the fitting process. For example, in the `xgb` pipeline, we are interested to see what is the best threshold it found. In addition, we are interested to see the different scores obtained at each threshold.\n",
- "Then the block would look something like:\n",
- "\n",
- "```\n",
- "\"visual\": [\n",
- " {\n",
- " \"name\": \"threshold\",\n",
- " \"variable\": \"zephyr_ml.primitives.postprocessing.FindThreshold#1.threshold\"\n",
- " },\n",
- " {\n",
- " \"name\": \"scores\",\n",
- " \"variable\": \"zephyr_ml.primitives.postprocessing.FindThreshold#1.scores\"\n",
- " }\n",
- "]\n",
- "```\n",
- "\n",
- "Where we have a _name_ and a _variable_ defining the intermediate outputs. "
- ]
- },
- {
- "cell_type": "markdown",
- "id": "531d157d",
- "metadata": {},
- "source": [
- "## Visualize\n",
- "\n",
- "When training the pipeline using the `fit` function, you can specify `zephyr.fit_pipeline(.., visual=True)` to indicate you are interested in obtaining the intermediate outputs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "683393df",
- "metadata": {},
- "outputs": [
+ },
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[GUIDE] Successfully performed fit_pipeline.\n",
- "\tYou can perform the next step by calling predict or evaluate.\n"
- ]
- }
- ],
- "source": [
- "output = zephyr.fit_pipeline(pipeline = \"xgb_classifier\", visual=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "13221b40",
- "metadata": {},
- "outputs": [
+ "cell_type": "markdown",
+ "id": "4ba7879c",
+ "metadata": {},
+ "source": [
+ "## Prepare data\n",
+ "\n",
+ "Prepare the data for training by creating a `y` variable to hold the labels, imputing missing values, and normalizing the data. We then initialize a `Zephyr` instance, set our data, and split it into training and testing."
+ ]
+ },
{
- "data": {
- "text/plain": [
- "dict_keys(['threshold', 'scores'])"
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "23ec49dd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/raymondpan/zephyr/Zephyr-repo/venv/lib/python3.8/site-packages/sklearn/impute/_base.py:555: UserWarning: Skipping features without any observed values: [ 1 2 6 7 9 10 15 16 17 18]. At least one non-missing value is needed for imputation with strategy='mean'.\n",
+ " warnings.warn(\n",
+ "[GUIDE] Successfully performed set_feature_matrix.\n",
+ "\tYou can perform the next step by calling generate_train_test_split.\n",
+ "[GUIDE] Successfully performed generate_train_test_split.\n",
+ "\tYou can perform the next step by calling fit_pipeline.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "( 0 1 2 3 4 5 6 7 8 9 ... \\\n",
+ " 10 -0.288675 0.0 0.0 -0.463185 -0.463185 -0.463185 0.0 0.0 0.0 0.0 ... \n",
+ " 5 -0.288675 0.0 0.0 -0.521570 -0.521570 -0.521570 0.0 0.0 0.0 0.0 ... \n",
+ " 3 -0.288675 0.0 0.0 -1.174466 -1.174466 -1.174466 0.0 0.0 0.0 0.0 ... \n",
+ " 11 -0.288675 0.0 0.0 2.064680 2.064680 2.064680 0.0 0.0 0.0 0.0 ... \n",
+ " 1 -0.288675 0.0 0.0 0.298409 0.298409 0.298409 0.0 0.0 0.0 0.0 ... \n",
+ " 9 -0.288675 0.0 0.0 0.658556 0.658556 0.658556 0.0 0.0 0.0 0.0 ... \n",
+ " 2 -0.288675 0.0 0.0 -0.527579 -0.527579 -0.527579 0.0 0.0 0.0 0.0 ... \n",
+ " 8 -0.288675 0.0 0.0 -0.650653 -0.650653 -0.650653 0.0 0.0 0.0 0.0 ... \n",
+ " 7 -0.288675 0.0 0.0 0.197664 0.197664 0.197664 0.0 0.0 0.0 0.0 ... \n",
+ " 4 -0.288675 0.0 0.0 0.002832 0.002832 0.002832 0.0 0.0 0.0 0.0 ... \n",
+ " \n",
+ " 80 81 82 83 84 85 86 87 88 89 \n",
+ " 10 0.0 3.464102 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
+ " 5 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
+ " 3 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
+ " 11 0.0 -0.288675 3.464102 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
+ " 1 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
+ " 9 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
+ " 2 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
+ " 8 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
+ " 7 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
+ " 4 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
+ " \n",
+ " [10 rows x 90 columns],\n",
+ " 0 1 2 3 4 5 6 7 8 9 ... \\\n",
+ " 6 -0.288675 0.0 0.0 1.946791 1.946791 1.946791 0.0 0.0 0.0 0.0 ... \n",
+ " 12 -0.288675 0.0 0.0 -0.650711 -0.650711 -0.650711 0.0 0.0 0.0 0.0 ... \n",
+ " 0 3.464102 0.0 0.0 -1.180770 -1.180770 -1.180770 0.0 0.0 0.0 0.0 ... \n",
+ " \n",
+ " 80 81 82 83 84 85 86 87 88 89 \n",
+ " 6 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
+ " 12 0.0 -0.288675 -0.288675 3.464102 0.0 0.0 -1.080123 1.080123 0.0 0.0 \n",
+ " 0 0.0 -0.288675 -0.288675 -0.288675 0.0 0.0 0.925820 -0.925820 0.0 0.0 \n",
+ " \n",
+ " [3 rows x 90 columns],\n",
+ " 10 False\n",
+ " 5 False\n",
+ " 3 False\n",
+ " 11 False\n",
+ " 1 False\n",
+ " 9 True\n",
+ " 2 True\n",
+ " 8 False\n",
+ " 7 False\n",
+ " 4 True\n",
+ " Name: label, dtype: bool,\n",
+ " 6 False\n",
+ " 12 False\n",
+ " 0 True\n",
+ " Name: label, dtype: bool)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from zephyr_ml import Zephyr\n",
+ "\n",
+ "# pop the target labels\n",
+ "y = list(feature_matrix.pop('label'))\n",
+ "X = feature_matrix.values\n",
+ "\n",
+ "# impute missing values\n",
+ "imputer = SimpleImputer()\n",
+ "X = imputer.fit_transform(X)\n",
+ "\n",
+ "# normalize the data\n",
+ "scaler = StandardScaler()\n",
+ "X = pd.DataFrame(scaler.fit_transform(X))\n",
+ "\n",
+ "zephyr = Zephyr()\n",
+ "zephyr.set_feature_matrix(feature_matrix=X, labels = y)\n",
+ "zephyr.generate_train_test_split(test_size=0.2, random_state=33)\n",
+ "\n"
]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "output.keys()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d1672fc6",
- "metadata": {},
- "source": [
- "The output is a dictionary containing two values: `threshold`, and `scores`. Let's visualize these results."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "c7a88d5b",
- "metadata": {},
- "outputs": [
+ },
{
- "data": {
- "image/png": "",
- "text/plain": [
- ""
+ "cell_type": "markdown",
+ "id": "733a854b",
+ "metadata": {},
+ "source": [
+ "## Select Model\n",
+ "\n",
+ "Select a model that has a `visual` block such as the `xgb_classifier` pipeline.\n",
+ "\n",
+ "The `visual` block in the pipeline JSON defines which intermediate results you want to capture and return during the fitting process. For example, in the `xgb_classifier` pipeline, we are interested in seeing the best threshold it found. In addition, we are interested in seeing the different scores obtained at each threshold.\n",
+ "Then the block would look something like:\n",
+ "\n",
+ "```\n",
+ "\"visual\": [\n",
+ " {\n",
+ " \"name\": \"threshold\",\n",
+ " \"variable\": \"zephyr_ml.primitives.postprocessing.FindThreshold#1.threshold\"\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"scores\",\n",
+ " \"variable\": \"zephyr_ml.primitives.postprocessing.FindThreshold#1.scores\"\n",
+ " }\n",
+ "]\n",
+ "```\n",
+ "\n",
+ "Where we have a _name_ and a _variable_ defining the intermediate outputs. "
]
- },
- "metadata": {},
- "output_type": "display_data"
+ },
+ {
+ "cell_type": "markdown",
+ "id": "531d157d",
+ "metadata": {},
+ "source": [
+ "## Visualize\n",
+ "\n",
+ "When training the pipeline using the `fit_pipeline` method, you can specify `zephyr.fit_pipeline(.., visual=True)` to indicate you are interested in obtaining the intermediate outputs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "683393df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output = zephyr.fit_pipeline(pipeline = \"xgb_classifier\", visual=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "13221b40",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['threshold', 'scores'])"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output.keys()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d1672fc6",
+ "metadata": {},
+ "source": [
+ "The output is a dictionary containing two keys: `threshold` and `scores`. Let's visualize these results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "c7a88d5b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "\n",
+ "scores = output['scores']\n",
+ "threshold = output['threshold']\n",
+ "\n",
+ "line = np.arange(0, 1.0, 0.01)\n",
+ "\n",
+ "%matplotlib inline\n",
+ "plt.plot(line, scores)\n",
+ "plt.axvline(threshold, color='r')\n",
+ "plt.ylabel(\"f1 score\")\n",
+ "plt.xlabel(\"threshold\")\n",
+ "plt.title(f\"Best obtained threshold at {threshold}\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa0b487c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.0"
}
- ],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "import numpy as np\n",
- "\n",
- "scores = output['scores']\n",
- "threshold = output['threshold']\n",
- "\n",
- "line = np.arange(0, 1.0, 0.01)\n",
- "\n",
- "%matplotlib inline\n",
- "plt.plot(line, scores)\n",
- "plt.axvline(threshold, color='r')\n",
- "plt.ylabel(\"f1 score\")\n",
- "plt.xlabel(\"threshold\")\n",
- "plt.title(f\"Best obtained threshold at {threshold}\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "aa0b487c",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "venv",
- "language": "python",
- "name": "python3"
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/zephyr_ml/core.py b/zephyr_ml/core.py
index 8b8e0c2..aa0e9ca 100644
--- a/zephyr_ml/core.py
+++ b/zephyr_ml/core.py
@@ -107,17 +107,20 @@ def perform_producer_step(self, zephyr, method,
return res
def try_log_forward_set_method_warning(self, name, next_step):
+ if self.current_step != -1:
+ from_str = (f"Going from step {self.current_step} to "
+ f"step {next_step} by performing {name}.")
+ else:
+ from_str = (f"Performing step {next_step} with {name}.")
LOGGER.warning((f"[GUIDE] STALE WARNING: \n"
- f"\tPerforming step {next_step} from "
- f"step {self.current_step} with {name}.\n"
+ f"\t{from_str}\n"
f"\tThis is a forward step via a set method.\n"
- f"\tThe current step is {self.current_step}.\n"
f"\tAll previous steps will be considered stale."))
def try_log_backwards_set_method_warning(self, name, next_step):
LOGGER.warning((f"[GUIDE] STALE WARNING: \n"
- f"\tPerforming step {next_step} from "
- f"step {self.current_step} with {name}.\n"
+ f"\tGoing from step {self.current_step} to "
+ f"step {next_step} by performing {name}.\n"
f"\tThis is a backwards step via a set method.\n"
f"\tAll other steps will be considered stale."))
@@ -126,8 +129,8 @@ def try_log_backwards_key_method_warning(self, name, next_step):
self.get_steps_in_between(
self.current_step, next_step))
LOGGER.warning((f"[GUIDE] STALE WARNING:\n"
- f"\tPerforming step {next_step} from "
- f"step {self.current_step} via {name}.\n"
+ f"\tGoing from step {self.current_step} to "
+ f"step {next_step} by performing {name}.\n"
f"\tThis is a backwards step via a key method.\n"
f"\tThe following steps will be considered stale:\n"
f"{steps_in_between}"))