Team-16-Machine-Learning-Project/p5-crossvalidating.py at main · derekogorry/Team-16-Machine-Learning-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# STEP 5: CROSS-VALIDATION & HYPERPARAMETER TUNING
# CROSS-VALIDATION:
#   > Splits training data into K equal "folds" (we're using K=5)
#   > Train on 4 folds, test on remaining 1 fold
#   > Rotate test fold, 5 times total
#   > Average the 5 results for a more reliable performance estimate

import numpy as np
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

# cross_val_score: runs K-fold cross-validation for us automatically
from sklearn.model_selection import cross_val_score, KFold

# make_scorer: converts our custom MAE function into a format sklearn accepts
from sklearn.metrics import make_scorer, mean_absolute_error

# LOAD DATA
# Read the saved split from disk; only the training portion is used in this step.
data = joblib.load("train_test_data.pkl")
X_train, y_train = data["X_train"], data["y_train"]

# The first two entries of .shape are (rows, columns), reported as (houses, features).
print("Training data loaded: {} houses, {} features".format(*X_train.shape[:2]))


# DEFINE CUSTOM SCORER (MAE in dollars)
# The targets are log-transformed, so a plain MAE would be in log-dollar units; this scorer reports real dollars instead
def dollar_mae(y_true_log, y_pred_log):
    """
    Mean absolute error expressed in real dollars.

    Both inputs are taken on the log1p scale; each is mapped back with
    expm1 before the error is measured, so the result is easy to interpret.
    """
    # expm1(x) = e^x - 1, which undoes log1p
    return mean_absolute_error(np.expm1(y_true_log), np.expm1(y_pred_log))

# DEFINE THE K-FOLD SPLITTER
# 5 shuffled folds; the fixed random_state keeps the fold assignment reproducible.
kf = KFold(random_state=42, shuffle=True, n_splits=5)

# make_scorer adapts dollar_mae to the scorer interface cross_val_score expects.
# greater_is_better=False marks MAE as a loss (lower is better).
scorer = make_scorer(dollar_mae, greater_is_better=False)

# RUN 5-FOLD CROSS-VALIDATION ON ALL THREE MODELS
# (cross_val_score trains fresh models internally and doesn't use the saved
#  .pkl files, so the three models are redefined here)
# Entries are added in the order they should be evaluated and reported.
models = {}

models["Linear Regression"] = LinearRegression()

models["Gradient Boosting"] = GradientBoostingRegressor(
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    min_samples_leaf=5,
)

models["Neural Network"] = MLPRegressor(
    random_state=42,
    hidden_layer_sizes=(128, 64, 32),
    activation="relu",
    solver="adam",
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
)

divider = "=" * 60

print(f"\n{divider}")
print("5-FOLD CROSS-VALIDATION RESULTS")
print(divider)
print("(Running all 5 folds × 3 models — may take 2–4 minutes total)")

# Maps each model name to its array of per-fold MAE values (positive dollars).
cv_results = {}

for label, estimator in models.items():
    print(f"\nEvaluating: {label}...")

    # The scorer hands back NEGATIVE MAE values, so the result is negated
    # straight away to get positive dollar amounts.
    # n_jobs=-1 lets the folds run in parallel on all available CPU cores.
    fold_mae = -cross_val_score(
        estimator,
        X_train,
        y_train,
        scoring=scorer,
        cv=kf,
        n_jobs=-1,
    )
    cv_results[label] = fold_mae

    # Report the average, the spread, and each individual fold
    per_fold = [f"${fold:,.0f}" for fold in fold_mae]
    print(f"  Mean MAE:  ${fold_mae.mean():>10,.0f}")
    print(f"  Std  MAE:  ${fold_mae.std():>10,.0f}  ← smaller std = more consistent")
    print(f"  Per-fold:  {per_fold}")

# Summary table, best (lowest mean MAE) first
print(f"\n{divider}")
print("SUMMARY — 5-Fold CV Mean MAE (lower = better)")
print(divider)
ranking = sorted(cv_results, key=lambda key: cv_results[key].mean())
for position, label in enumerate(ranking, start=1):
    fold_mae = cv_results[label]
    print(f"  #{position}  {label:<25}  ${fold_mae.mean():,.0f}  ±  ${fold_mae.std():,.0f}")

# GRID SEARCH (automated parameter tuning)
# Every combination in the grid is cross-validated and the best-scoring one is kept.
RUN_GRID_SEARCH = False   # flip to True to enable

if RUN_GRID_SEARCH:
    from sklearn.model_selection import GridSearchCV

    rule = "=" * 60
    print(f"\n{rule}")
    print("GRID SEARCH — Tuning Gradient Boosting hyperparameters")
    print(rule)
    print("This may take 10–30 minutes...")

    # 3 values per parameter -> 3 × 3 × 3 = 27 combinations,
    # each cross-validated on 5 folds = 135 model fits
    param_grid = dict(
        n_estimators=[200, 300, 500],
        learning_rate=[0.03, 0.05, 0.1],
        max_depth=[3, 4, 5],
    )

    # Parameters that are not being tuned stay fixed on the base estimator.
    base_gbr = GradientBoostingRegressor(random_state=42, min_samples_leaf=5)

    grid_search = GridSearchCV(
        estimator=base_gbr,
        param_grid=param_grid,
        scoring=scorer,
        cv=kf,
        n_jobs=-1,
        verbose=1,   # prints progress updates
    )
    grid_search.fit(X_train, y_train)

    # best_score_ uses the scorer's negated-MAE convention, so flip the sign for display
    best_mae = -grid_search.best_score_

    print("\nBest hyperparameters found:")
    print(f"  {grid_search.best_params_}")
    print(f"Best CV MAE: ${best_mae:,.0f}")
    print("\nUse these parameters in step4_train_models.py for better results!")

    # Persist the best model found by the search
    tuned_path = "model_gbr_tuned.pkl"
    joblib.dump(grid_search.best_estimator_, tuned_path)
    print("Saved: model_gbr_tuned.pkl")

print("\nStep 5 complete! Proceed to p6_finalevaluation.py")