cromano8 · rcsm89 · Feb 2, 2024 · Feb 5, 2024
diff --git a/notebooks/2_snowml.ipynb b/notebooks/2_snowml.ipynb
@@ -1,10 +1,10 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 2,
-  "metadata": {},
   "cells": [
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "import warnings\n",
         "\n",
@@ -15,87 +15,87 @@
         "from snowflake.ml.utils.connection_params import SnowflakeLoginOptions\n",
         "from snowflake.snowpark import Session\n",
         "from snowflake.snowpark import types as T\n",
-        "from snowflake.snowpark.functions import col\n",
+        "from snowflake.snowpark.functions import col, sum as snow_sum\n",
         "\n",
         "warnings.simplefilter(action=\"ignore\", category=UserWarning)"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "session = Session.builder.configs(SnowflakeLoginOptions()).getOrCreate()"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "titanic_df = session.table(\"titanic\")"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "titanic_df.show()"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "# Columns with null values and their respective counts\n",
         "null_counts = [\n",
         "    (col_name, titanic_df.where(col(col_name).isNull()).count())\n",
         "    for col_name in titanic_df.columns\n",
         "]\n",
         "null_counts"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "titanic_df = titanic_df.drop([\"AGE\", \"DECK\", \"ALIVE\", \"ADULT_MALE\", \"EMBARKED\"])"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "titanic_df = titanic_df.withColumn(\"FARE\", titanic_df[\"FARE\"].astype(T.FloatType()))\n",
         "\n",
         "titanic_df.show()"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "cat_cols = [\"SEX\", \"CLASS\", \"WHO\", \"EMBARK_TOWN\"]\n",
         "num_cols = [\"PCLASS\", \"SIBSP\", \"PARCH\", \"FARE\"]"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "impute_cat = SimpleImputer(\n",
         "    input_cols=cat_cols,\n",
@@ -106,13 +106,13 @@
         "\n",
         "titanic_df = impute_cat.fit(titanic_df).transform(titanic_df)\n",
         "titanic_df.show()"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "OHE = OneHotEncoder(\n",
         "    input_cols=cat_cols,\n",
@@ -124,22 +124,22 @@
         "\n",
         "titanic_df = OHE.fit(titanic_df).transform(titanic_df)\n",
         "titanic_df.show()"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "train_df, test_df = titanic_df.random_split(weights=[0.8, 0.2], seed=8)"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "xgb = XGBClassifier(\n",
         "    input_cols=train_df.drop(\"SURVIVED\").columns,\n",
@@ -149,22 +149,45 @@
         "\n",
         "# Train\n",
         "xgb.fit(train_df)"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "result = xgb.predict(test_df)"
-      ],
+      ]
+    },
+    {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
       "outputs": [],
-      "execution_count": null
+      "source": [
+        "def specificity_score(df, y_true_col_names, y_pred_col_names):\n",
+        "    \"\"\"Return the true-negative rate TN / (TN + FP) of the binary predictions in ``df``.\n",
+        "\n",
+        "    Args:\n",
+        "        df: Snowpark DataFrame holding the label and prediction columns.\n",
+        "        y_true_col_names: Name of the column with the actual 0/1 labels.\n",
+        "        y_pred_col_names: Name of the column with the predicted 0/1 labels.\n",
+        "\n",
+        "    Returns:\n",
+        "        Specificity as a float; 0.0 when ``df`` contains no actual negatives.\n",
+        "    \"\"\"\n",
+        "    is_negative = col(y_true_col_names) == 0\n",
+        "    # Cast each boolean condition to an integer before summing, and alias only the\n",
+        "    # top-level select columns: an aliased column cannot be reused inside arithmetic.\n",
+        "    tn, fp = df.select(\n",
+        "        snow_sum((is_negative & (col(y_pred_col_names) == 0)).cast(T.IntegerType())).alias(\"TN\"),\n",
+        "        snow_sum((is_negative & (col(y_pred_col_names) == 1)).cast(T.IntegerType())).alias(\"FP\"),\n",
+        "    ).collect()[0]\n",
+        "    # SUM yields NULL (None) over zero rows, so normalise to 0 before dividing.\n",
+        "    tn, fp = int(tn or 0), int(fp or 0)\n",
+        "    return tn / (tn + fp) if (tn + fp) else 0.0\n",
+        "\n",
+        "def sensitivity_score(df, y_true_col_names, y_pred_col_names):\n",
+        "    \"\"\"Return the true-positive rate TP / (TP + FN) of the binary predictions in ``df``.\n",
+        "\n",
+        "    Args:\n",
+        "        df: Snowpark DataFrame holding the label and prediction columns.\n",
+        "        y_true_col_names: Name of the column with the actual 0/1 labels.\n",
+        "        y_pred_col_names: Name of the column with the predicted 0/1 labels.\n",
+        "\n",
+        "    Returns:\n",
+        "        Sensitivity as a float; 0.0 when ``df`` contains no actual positives.\n",
+        "    \"\"\"\n",
+        "    is_positive = col(y_true_col_names) == 1\n",
+        "    # Cast each boolean condition to an integer before summing, and alias only the\n",
+        "    # top-level select columns: an aliased column cannot be reused inside arithmetic.\n",
+        "    tp, fn = df.select(\n",
+        "        snow_sum((is_positive & (col(y_pred_col_names) == 1)).cast(T.IntegerType())).alias(\"TP\"),\n",
+        "        snow_sum((is_positive & (col(y_pred_col_names) == 0)).cast(T.IntegerType())).alias(\"FN\"),\n",
+        "    ).collect()[0]\n",
+        "    # SUM yields NULL (None) over zero rows, so normalise to 0 before dividing.\n",
+        "    tp, fn = int(tp or 0), int(fn or 0)\n",
+        "    return tp / (tp + fn) if (tp + fn) else 0.0\n",
+        "\n",
+        "# Score the custom metrics on the predictions returned by xgb.predict(test_df)\n",
+        "specificity = specificity_score(result, y_true_col_names=\"SURVIVED\", y_pred_col_names=\"PRED_SURVIVED\")\n",
+        "sensitivity = sensitivity_score(result, y_true_col_names=\"SURVIVED\", y_pred_col_names=\"PRED_SURVIVED\")"
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": null,
       "metadata": {},
+      "outputs": [],
       "source": [
         "accuracy = accuracy_score(\n",
         "    df=result, y_true_col_names=\"SURVIVED\", y_pred_col_names=\"PRED_SURVIVED\"\n",
@@ -178,11 +201,22 @@
         "    df=result, y_true_col_names=\"SURVIVED\", y_pred_col_names=\"PRED_SURVIVED\"\n",
         ")\n",
         "\n",
-        "print(f\"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}\")"
-      ],
-      "cell_type": "code",
-      "outputs": [],
-      "execution_count": null
+        "specificity = specificity_score(\n",
+        "    df=result, y_true_col_names=\"SURVIVED\", y_pred_col_names=\"PRED_SURVIVED\"\n",
+        ")\n",
+        "\n",
+        "sensitivity = sensitivity_score(\n",
+        "    df=result, y_true_col_names=\"SURVIVED\", y_pred_col_names=\"PRED_SURVIVED\"\n",
+        ")\n",
+        "\n",
+        "# Report every metric computed in this cell so the evaluation is visible in the output\n",
+        "print(\n",
+        "    f\"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, \"\n",
+        "    f\"Specificity: {specificity}, Sensitivity: {sensitivity}\"\n",
+        ")"
+      ]
     }
-  ]
+  ],
+  "metadata": {
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
 }