diff --git a/PM_pipeline.ipynb b/PM_pipeline.ipynb
new file mode 100644
index 0000000..458ce19
--- /dev/null
+++ b/PM_pipeline.ipynb
@@ -0,0 +1,810 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "authorship_tag": "ABX9TyPzZxEbLOYq1zjEDFBK8FRH",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Libraries"
+ ],
+ "metadata": {
+ "id": "ylUFE2gAWyTB"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "id": "Bcap68Gv94i9"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "from sklearn.base import BaseEstimator, TransformerMixin\n",
+ "from sklearn.pipeline import Pipeline, make_pipeline\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "import xgboost as xgb\n",
+ "from xgboost import XGBRegressor\n",
+ "from scipy import stats\n",
+ "import joblib"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Pipeline"
+ ],
+ "metadata": {
+ "id": "UpWtCyNJW2eu"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Column groups used by the outlier filter and the ColumnTransformer.\n",
+ "categorical_features = ['flight_phase']\n",
+ "numerical_features = [\n",
+ "    'flight_cycle',\n",
+ "    'egt_probe_average',\n",
+ "    'fuel_flw',\n",
+ "    'core_spd',\n",
+ "    'zpn12p',\n",
+ "    'vib_n1_#1_bearing',\n",
+ "    'vib_n2_#1_bearing',\n",
+ "    'vib_n2_turbine_frame',\n",
+ "]\n",
+ "# Outlier screening uses every numeric column except 'flight_cycle'.\n",
+ "numerical_features_for_outlier = [\n",
+ "    name for name in numerical_features if name != 'flight_cycle'\n",
+ "]  # 'corrected_fan_spd' is not included"
+ ],
+ "metadata": {
+ "id": "MBRHt8W0Jvrf"
+ },
+ "execution_count": 23,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Custom Transformer for outliers removal: OutlierRemoverGroup\n",
+ "class OutlierRemoverGroup(BaseEstimator, TransformerMixin):\n",
+ "    \"\"\"Remove rows that are z-score outliers within their group.\n",
+ "\n",
+ "    Rows are grouped by ``groupby_col``. Within each group the absolute\n",
+ "    z-score of every column in ``columns`` is computed, and a row is removed\n",
+ "    when any of those z-scores is greater than ``threshold``.\n",
+ "\n",
+ "    The statistics come from the frame passed to ``transform``; ``fit`` is a\n",
+ "    no-op. The returned frame can have fewer rows than the input and gets a\n",
+ "    fresh 0..n-1 index.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    groupby_col : column label(s) handed to ``DataFrame.groupby``.\n",
+ "    columns : list of column labels screened for outliers.\n",
+ "    threshold : float, default 2.5. Largest absolute z-score that is kept.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, groupby_col, columns, threshold=2.5):\n",
+ "        self.groupby_col = groupby_col\n",
+ "        self.columns = columns\n",
+ "        self.threshold = threshold\n",
+ "\n",
+ "    def fit(self, X, y=None):\n",
+ "        \"\"\"Nothing is learned; return ``self`` unchanged.\"\"\"\n",
+ "        return self\n",
+ "\n",
+ "    def transform(self, X):\n",
+ "        \"\"\"Return a copy of ``X`` without the rows flagged as outliers.\"\"\"\n",
+ "        X_clean = X.copy()\n",
+ "        z_scores = X_clean.groupby(self.groupby_col)[self.columns].transform(\n",
+ "            lambda x: np.abs(stats.zscore(x, nan_policy='omit'))\n",
+ "        )\n",
+ "        # A z-score is NaN for a missing value and for a group with zero\n",
+ "        # variance (or a single row). NaN compares False, so testing\n",
+ "        # `<= threshold` would silently drop those rows or whole groups.\n",
+ "        # Flag only rows with a z-score that is actually above the threshold.\n",
+ "        is_outlier = (z_scores > self.threshold).any(axis=1)\n",
+ "        return X_clean[~is_outlier].reset_index(drop=True)"
+ ],
+ "metadata": {
+ "id": "R5PX9gsLVQCg"
+ },
+ "execution_count": 24,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Encode the categorical column with one indicator column per category.\n",
+ "# No category is dropped: together with handle_unknown='ignore', a dropped\n",
+ "# baseline would share the all-zero encoding with categories unseen during\n",
+ "# fit (and older scikit-learn releases reject that combination outright).\n",
+ "# The downstream tree model does not need a reference level.\n",
+ "categorical_pipeline = Pipeline(steps=[\n",
+ "    ('onehot', OneHotEncoder(handle_unknown='ignore'))\n",
+ "])"
+ ],
+ "metadata": {
+ "id": "toyCPtvUI9UV"
+ },
+ "execution_count": 25,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#merging the transformers together\n",
+ "# Assemble the preprocessing step: numeric columns are forwarded untouched,\n",
+ "# the categorical column goes through the one-hot pipeline.\n",
+ "col_transformer = ColumnTransformer([\n",
+ "    ('num', 'passthrough', numerical_features),\n",
+ "    ('cat', categorical_pipeline, categorical_features),\n",
+ "])"
+ ],
+ "metadata": {
+ "id": "Vjzd3EDFRWW2"
+ },
+ "execution_count": 26,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Final estimator: preprocessing followed by a gradient-boosted regressor.\n",
+ "# Step names match the lowercase class names that make_pipeline would assign.\n",
+ "xgb_model = XGBRegressor(random_state=42)\n",
+ "pipefinal = Pipeline(steps=[\n",
+ "    ('columntransformer', col_transformer),\n",
+ "    ('xgbregressor', xgb_model),\n",
+ "])"
+ ],
+ "metadata": {
+ "id": "dStezUWMWGIf"
+ },
+ "execution_count": 27,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Data Importing"
+ ],
+ "metadata": {
+ "id": "fWjuGoyrXZhH"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Load the dataset from a relative path (resolved against the kernel's working directory).\n",
+ "df = pd.read_csv('engines2_data_cleaned_no_outliers.csv')  # NOTE(review): a 'Data/' prefix was noted here, presumably an alternate folder; confirm"
+ ],
+ "metadata": {
+ "id": "Gzo0jNotXXFu"
+ },
+ "execution_count": 28,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#applying the above outlierRemover\n",
+ "# Run the group-wise outlier filter on features and target together so that\n",
+ "# a dropped row disappears from both.\n",
+ "X = df[[\n",
+ "    'flight_cycle', 'flight_phase', 'egt_probe_average',\n",
+ "    'fuel_flw', 'core_spd', 'zpn12p',\n",
+ "    'vib_n1_#1_bearing', 'vib_n2_#1_bearing', 'vib_n2_turbine_frame',\n",
+ "]]\n",
+ "Y = df['RUL']\n",
+ "df_all = X.assign(RUL=Y)\n",
+ "remover = OutlierRemoverGroup(\n",
+ "    groupby_col='flight_phase',\n",
+ "    columns=numerical_features_for_outlier,\n",
+ ")\n",
+ "df_filtered = remover.fit_transform(df_all)"
+ ],
+ "metadata": {
+ "id": "XeL_fjgRbL5o"
+ },
+ "execution_count": 29,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Data Splitting"
+ ],
+ "metadata": {
+ "id": "22b_NXdU98iX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Separate the target from the predictors of the filtered frame.\n",
+ "Y = df_filtered['RUL']\n",
+ "X = df_filtered.drop(columns='RUL')\n",
+ "# Hold out 20% of the rows; the remaining 80% is the training set.\n",
+ "# NOTE(review): this is a row-level random split; confirm that is intended for this data.\n",
+ "X_train, X_temp, y_train, y_temp = train_test_split(\n",
+ "    X, Y, test_size=0.2, random_state=42\n",
+ ")\n",
+ "# Halve the held-out rows into validation and test sets (10% of the data each).\n",
+ "X_val, X_test, y_val, y_test = train_test_split(\n",
+ "    X_temp, y_temp, test_size=0.5, random_state=42\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "BVEzw6JUW7cI"
+ },
+ "execution_count": 30,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Fit the full pipeline on the training split; the fitted pipeline is the cell's displayed value.\n",
+ "pipefinal.fit(X_train, y_train)"
+ ],
+ "metadata": {
+ "id": "ICjo6swRWkRq",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 245
+ },
+ "outputId": "fcaecb51-cb33-4e16-902a-a4e3b27192f7"
+ },
+ "execution_count": 31,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Pipeline(steps=[('columntransformer',\n",
+ " ColumnTransformer(transformers=[('num', 'passthrough',\n",
+ " ['flight_cycle',\n",
+ " 'egt_probe_average',\n",
+ " 'fuel_flw', 'core_spd',\n",
+ " 'zpn12p',\n",
+ " 'vib_n1_#1_bearing',\n",
+ " 'vib_n2_#1_bearing',\n",
+ " 'vib_n2_turbine_frame']),\n",
+ " ('cat',\n",
+ " Pipeline(steps=[('onehot',\n",
+ " OneHotEncoder(drop='first',\n",
+ " handle_unknown='ignore'))]),\n",
+ " ['flight_phase'])])),\n",
+ " ('xgbregressor',\n",
+ " XGB...\n",
+ " feature_types=None, gamma=None, grow_policy=None,\n",
+ " importance_type=None,\n",
+ " interaction_constraints=None, learning_rate=None,\n",
+ " max_bin=None, max_cat_threshold=None,\n",
+ " max_cat_to_onehot=None, max_delta_step=None,\n",
+ " max_depth=None, max_leaves=None,\n",
+ " min_child_weight=None, missing=nan,\n",
+ " monotone_constraints=None, multi_strategy=None,\n",
+ " n_estimators=None, n_jobs=None,\n",
+ " num_parallel_tree=None, random_state=42, ...))])"
+ ],
+ "text/html": [
+ "
Pipeline(steps=[('columntransformer',\n",
+ " ColumnTransformer(transformers=[('num', 'passthrough',\n",
+ " ['flight_cycle',\n",
+ " 'egt_probe_average',\n",
+ " 'fuel_flw', 'core_spd',\n",
+ " 'zpn12p',\n",
+ " 'vib_n1_#1_bearing',\n",
+ " 'vib_n2_#1_bearing',\n",
+ " 'vib_n2_turbine_frame']),\n",
+ " ('cat',\n",
+ " Pipeline(steps=[('onehot',\n",
+ " OneHotEncoder(drop='first',\n",
+ " handle_unknown='ignore'))]),\n",
+ " ['flight_phase'])])),\n",
+ " ('xgbregressor',\n",
+ " XGB...\n",
+ " feature_types=None, gamma=None, grow_policy=None,\n",
+ " importance_type=None,\n",
+ " interaction_constraints=None, learning_rate=None,\n",
+ " max_bin=None, max_cat_threshold=None,\n",
+ " max_cat_to_onehot=None, max_delta_step=None,\n",
+ " max_depth=None, max_leaves=None,\n",
+ " min_child_weight=None, missing=nan,\n",
+ " monotone_constraints=None, multi_strategy=None,\n",
+ " n_estimators=None, n_jobs=None,\n",
+ " num_parallel_tree=None, random_state=42, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('columntransformer',\n",
+ " ColumnTransformer(transformers=[('num', 'passthrough',\n",
+ " ['flight_cycle',\n",
+ " 'egt_probe_average',\n",
+ " 'fuel_flw', 'core_spd',\n",
+ " 'zpn12p',\n",
+ " 'vib_n1_#1_bearing',\n",
+ " 'vib_n2_#1_bearing',\n",
+ " 'vib_n2_turbine_frame']),\n",
+ " ('cat',\n",
+ " Pipeline(steps=[('onehot',\n",
+ " OneHotEncoder(drop='first',\n",
+ " handle_unknown='ignore'))]),\n",
+ " ['flight_phase'])])),\n",
+ " ('xgbregressor',\n",
+ " XGB...\n",
+ " feature_types=None, gamma=None, grow_policy=None,\n",
+ " importance_type=None,\n",
+ " interaction_constraints=None, learning_rate=None,\n",
+ " max_bin=None, max_cat_threshold=None,\n",
+ " max_cat_to_onehot=None, max_delta_step=None,\n",
+ " max_depth=None, max_leaves=None,\n",
+ " min_child_weight=None, missing=nan,\n",
+ " monotone_constraints=None, multi_strategy=None,\n",
+ " n_estimators=None, n_jobs=None,\n",
+ " num_parallel_tree=None, random_state=42, ...))])ColumnTransformer(transformers=[('num', 'passthrough',\n",
+ " ['flight_cycle', 'egt_probe_average',\n",
+ " 'fuel_flw', 'core_spd', 'zpn12p',\n",
+ " 'vib_n1_#1_bearing', 'vib_n2_#1_bearing',\n",
+ " 'vib_n2_turbine_frame']),\n",
+ " ('cat',\n",
+ " Pipeline(steps=[('onehot',\n",
+ " OneHotEncoder(drop='first',\n",
+ " handle_unknown='ignore'))]),\n",
+ " ['flight_phase'])])['flight_cycle', 'egt_probe_average', 'fuel_flw', 'core_spd', 'zpn12p', 'vib_n1_#1_bearing', 'vib_n2_#1_bearing', 'vib_n2_turbine_frame']
passthrough
['flight_phase']
OneHotEncoder(drop='first', handle_unknown='ignore')
XGBRegressor(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=None, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...)