From 03de56b1ab831ec2cbee462066a7cdb93e12f592 Mon Sep 17 00:00:00 2001 From: Nancy Amer Date: Fri, 11 Apr 2025 23:00:45 +0200 Subject: [PATCH] pipeline w/t scaling --- PM_pipeline.ipynb | 810 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 810 insertions(+) create mode 100644 PM_pipeline.ipynb diff --git a/PM_pipeline.ipynb b/PM_pipeline.ipynb new file mode 100644 index 0000000..458ce19 --- /dev/null +++ b/PM_pipeline.ipynb @@ -0,0 +1,810 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyPzZxEbLOYq1zjEDFBK8FRH", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#Libraries" + ], + "metadata": { + "id": "ylUFE2gAWyTB" + } + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "Bcap68Gv94i9" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.pipeline import Pipeline, make_pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "import xgboost as xgb\n", + "from xgboost import XGBRegressor\n", + "from scipy import stats\n", + "import joblib" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#Pipeline" + ], + "metadata": { + "id": "UpWtCyNJW2eu" + } + }, + { + "cell_type": "code", + "source": [ + "categorical_features = ['flight_phase']\n", + "numerical_features = ['flight_cycle',\t'egt_probe_average',\t'fuel_flw',\t'core_spd',\t'zpn12p',\t'vib_n1_#1_bearing',\t'vib_n2_#1_bearing',\t'vib_n2_turbine_frame']\n", + 
# Sensor columns screened for outliers. 'flight_cycle' is part of
# numerical_features but is not screened; 'corrected_fan_spd' is currently
# excluded as well.
numerical_features_for_outlier = [
    'egt_probe_average',
    'fuel_flw',
    'core_spd',
    'zpn12p',
    'vib_n1_#1_bearing',
    'vib_n2_#1_bearing',
    'vib_n2_turbine_frame',
]


class OutlierRemoverGroup(BaseEstimator, TransformerMixin):
    """Drop rows whose per-group z-score exceeds a threshold in any column.

    Z-scores are computed separately inside each ``groupby_col`` group, so a
    reading is only compared with readings from the same group.

    ``transform`` returns fewer rows than it receives and only ever sees
    ``X``. Apply it to a frame that still carries the target column instead
    of using it as a step inside a ``Pipeline``, where the target would not
    be filtered along with the features.

    Parameters
    ----------
    groupby_col : str
        Column whose values define the groups.
    columns : list of str
        Numeric columns to screen.
    threshold : float, default=2.5
        Largest absolute z-score a row may have in any screened column.
    """

    def __init__(self, groupby_col, columns, threshold=2.5):
        self.groupby_col = groupby_col
        self.columns = columns
        self.threshold = threshold

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a re-indexed copy of ``X`` without the outlier rows.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``groupby_col`` and every name in ``columns``.

        Returns
        -------
        pandas.DataFrame
            The rows of ``X`` with no screened column above ``threshold``,
            with a fresh 0..n-1 index.
        """
        z_scores = X.groupby(self.groupby_col)[self.columns].transform(
            lambda col: np.abs(stats.zscore(col, nan_policy='omit'))
        )
        # Flag only scores that are defined AND too large. A z-score is NaN
        # when the group is constant or has a single row (0 / 0) or when the
        # reading itself is missing; that is not evidence of an outlier.
        # The previous test `(z <= threshold).all()` is False for NaN and so
        # discarded all such rows, including entire constant-valued groups.
        is_outlier = (z_scores > self.threshold).any(axis=1)
        # Boolean indexing already yields a new frame, so no up-front copy
        # of X is needed.
        return X[~is_outlier].reset_index(drop=True)


# Encode the categorical column.
# NOTE(review): with drop='first' the dropped category is encoded as all
# zeros, and handle_unknown='ignore' encodes unseen categories as all zeros
# too, so the model cannot tell them apart - confirm this is acceptable.
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Numeric columns are forwarded untouched (no scaling); the categorical
# column goes through the one-hot pipeline.
col_transformer = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# Final estimator: column preprocessing followed by gradient-boosted trees.
xgb_model = XGBRegressor(random_state=42)
pipefinal = make_pipeline(col_transformer, xgb_model)
# Data importing
df = pd.read_csv('engines2_data_cleaned_no_outliers.csv')

# Columns handed to the model, in the order they are selected from the file.
feature_columns = [
    'flight_cycle',
    'flight_phase',
    'egt_probe_average',
    'fuel_flw',
    'core_spd',
    'zpn12p',
    'vib_n1_#1_bearing',
    'vib_n2_#1_bearing',
    'vib_n2_turbine_frame',
]

# Keep features and target in one frame while screening, so that a dropped
# row loses its RUL value together with its features.
df_all = df[feature_columns + ['RUL']].copy()
remover = OutlierRemoverGroup(
    groupby_col='flight_phase',
    columns=numerical_features_for_outlier,
)
df_filtered = remover.fit_transform(df_all)

# Data splitting
# The screening above ran on the full data set, i.e. before any split.
Y = df_filtered['RUL']
X = df_filtered.drop(columns='RUL')

# 80 % train; the remaining 20 % is halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fit on the training portion only; the validation split is not used here.
pipefinal.fit(X_train, y_train)
'passthrough',\n", + " ['flight_cycle',\n", + " 'egt_probe_average',\n", + " 'fuel_flw', 'core_spd',\n", + " 'zpn12p',\n", + " 'vib_n1_#1_bearing',\n", + " 'vib_n2_#1_bearing',\n", + " 'vib_n2_turbine_frame']),\n", + " ('cat',\n", + " Pipeline(steps=[('onehot',\n", + " OneHotEncoder(drop='first',\n", + " handle_unknown='ignore'))]),\n", + " ['flight_phase'])])),\n", + " ('xgbregressor',\n", + " XGB...\n", + " feature_types=None, gamma=None, grow_policy=None,\n", + " importance_type=None,\n", + " interaction_constraints=None, learning_rate=None,\n", + " max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None,\n", + " max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None,\n", + " n_estimators=None, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...))])" + ], + "text/html": [ + "
Pipeline(steps=[('columntransformer',\n",
+              "                 ColumnTransformer(transformers=[('num', 'passthrough',\n",
+              "                                                  ['flight_cycle',\n",
+              "                                                   'egt_probe_average',\n",
+              "                                                   'fuel_flw', 'core_spd',\n",
+              "                                                   'zpn12p',\n",
+              "                                                   'vib_n1_#1_bearing',\n",
+              "                                                   'vib_n2_#1_bearing',\n",
+              "                                                   'vib_n2_turbine_frame']),\n",
+              "                                                 ('cat',\n",
+              "                                                  Pipeline(steps=[('onehot',\n",
+              "                                                                   OneHotEncoder(drop='first',\n",
+              "                                                                                 handle_unknown='ignore'))]),\n",
+              "                                                  ['flight_phase'])])),\n",
+              "                ('xgbregressor',\n",
+              "                 XGB...\n",
+              "                              feature_types=None, gamma=None, grow_policy=None,\n",
+              "                              importance_type=None,\n",
+              "                              interaction_constraints=None, learning_rate=None,\n",
+              "                              max_bin=None, max_cat_threshold=None,\n",
+              "                              max_cat_to_onehot=None, max_delta_step=None,\n",
+              "                              max_depth=None, max_leaves=None,\n",
+              "                              min_child_weight=None, missing=nan,\n",
+              "                              monotone_constraints=None, multi_strategy=None,\n",
+              "                              n_estimators=None, n_jobs=None,\n",
+              "                              num_parallel_tree=None, random_state=42, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict and evaluate on the held-out test set.
y_pred = pipefinal.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
# mean_squared_error returns the MSE (squared units). Take the square root so
# the value printed as "RMSE" really is the root-mean-squared error and is on
# the same scale as the MAE. Previously the raw MSE was reported under that
# label.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"MAE: {mae}, RMSE: {rmse}")

# Persist the fitted pipeline (column transformer + XGBoost model).
# joblib.dump returns the list of file names it wrote.
joblib.dump(pipefinal, 'predictive_maintenance_pipeline.pkl')