-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathp5-crossvalidating.py
More file actions
149 lines (120 loc) · 5.06 KB
/
Copy pathp5-crossvalidating.py
File metadata and controls
149 lines (120 loc) · 5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# STEP 5: CROSS-VALIDATION & HYPERPARAMETER TUNING
# CROSS-VALIDATION:
# > Splits training data into K equal "folds" (we're using K=5)
# > Train on 4 folds, test on remaining 1 fold
# > Rotate test fold, 5 times total
# & Average 5 results for more reliable performance estimate
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
# cross_val_score: runs K-fold cross-validation for us automatically
from sklearn.model_selection import cross_val_score, KFold
# make_scorer: converts our custom MAE function into a format sklearn accepts
from sklearn.metrics import make_scorer, mean_absolute_error
# LOAD DATA
# Read the training split from the joblib file "train_test_data.pkl"
# (presumably written by an earlier step of this pipeline — verify).
# NOTE(review): joblib.load unpickles its input, so only load files that this
# pipeline produced itself.
data = joblib.load("train_test_data.pkl")
X_train, y_train = data["X_train"], data["y_train"]
print(
    f"Training data loaded: {X_train.shape[0]} houses, {X_train.shape[1]} features"
)
# DEFINE CUSTOM SCORER (MAE in dollars)
# Report MAE in real dollars rather than log units (the target is log-transformed, hence the expm1 below)
def dollar_mae(y_true_log, y_pred_log):
    """Return the mean absolute error measured in real dollars.

    Both arguments are passed through ``np.expm1`` before the error is
    computed. ``expm1(x) = e^x - 1`` is the inverse of ``np.log1p``, so
    targets and predictions given on the log scale are compared in dollars,
    which is easier to interpret.
    """
    actual_dollars, predicted_dollars = np.expm1(y_true_log), np.expm1(y_pred_log)
    return mean_absolute_error(actual_dollars, predicted_dollars)
# DEFINE THE K-FOLD SPLITTER
# Five shuffled folds; the fixed random_state keeps the fold assignment
# reproducible from run to run.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Wrap dollar_mae so sklearn's cross-validation utilities accept it as a scorer.
# greater_is_better=False marks it as a loss (lower is better): sklearn negates
# the value, so the scores it reports are negative MAE.
scorer = make_scorer(
    dollar_mae,
    greater_is_better=False,
)
# RUN 5-FOLD CROSS-VALIDATION ON ALL THREE MODELS
# cross_val_score trains fresh models internally and does not use the saved
# .pkl files, so the three estimators are redefined here.
models = {}
models["Linear Regression"] = LinearRegression()
models["Gradient Boosting"] = GradientBoostingRegressor(
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    min_samples_leaf=5,
)
models["Neural Network"] = MLPRegressor(
    random_state=42,
    hidden_layer_sizes=(128, 64, 32),
    activation="relu",
    solver="adam",
    max_iter=500,
    # early stopping monitors a held-out validation_fraction of the training data
    early_stopping=True,
    validation_fraction=0.1,
)
print("\n" + "=" * 60, "5-FOLD CROSS-VALIDATION RESULTS", "=" * 60, sep="\n")
print("(Running all 5 folds × 3 models — may take 2–4 minutes total)")

# Maps each model name to its array of per-fold MAE values (in dollars).
cv_results = {}
for model_name, model in models.items():
    print(f"\nEvaluating: {model_name}...")

    # The scorer was built with greater_is_better=False, so the scores come
    # back negated; flip the sign immediately to get positive MAE values.
    # n_jobs=-1 runs the folds in parallel on all available CPU cores.
    fold_mae = -cross_val_score(
        model, X_train, y_train, scoring=scorer, cv=kf, n_jobs=-1
    )
    cv_results[model_name] = fold_mae

    # Report mean, spread, and the individual fold errors.
    per_fold = ["${:,.0f}".format(fold_error) for fold_error in fold_mae]
    print(f" Mean MAE: ${fold_mae.mean():>10,.0f}")
    print(f" Std MAE: ${fold_mae.std():>10,.0f} ← smaller std = more consistent")
    print(f" Per-fold: {per_fold}")
# Summary table: models ranked by mean cross-validated MAE, best (lowest) first.
print(
    "\n" + "=" * 60,
    "SUMMARY — 5-Fold CV Mean MAE (lower = better)",
    "=" * 60,
    sep="\n",
)
sorted_cv = sorted(cv_results.items(), key=lambda entry: entry[1].mean())
for rank, (name, scores) in enumerate(sorted_cv, start=1):
    mean_mae, std_mae = scores.mean(), scores.std()
    print(f" #{rank} {name:<25} ${mean_mae:,.0f} ± ${std_mae:,.0f}")
# GRID SEARCH (automated parameter tuning)
# Tries every combination in the grid and keeps the one with the best
# cross-validation score. Disabled by default; set the flag to True to run it.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    from sklearn.model_selection import GridSearchCV

    print(
        "\n" + "=" * 60,
        "GRID SEARCH — Tuning Gradient Boosting hyperparameters",
        "=" * 60,
        "This may take 10–30 minutes...",
        sep="\n",
    )

    # 3 × 3 × 3 = 27 combinations, each cross-validated on 5 folds = 135 fits.
    param_grid = dict(
        n_estimators=[200, 300, 500],
        learning_rate=[0.03, 0.05, 0.1],
        max_depth=[3, 4, 5],
    )
    base_estimator = GradientBoostingRegressor(min_samples_leaf=5, random_state=42)
    grid_search = GridSearchCV(
        estimator=base_estimator,
        param_grid=param_grid,
        scoring=scorer,
        cv=kf,
        n_jobs=-1,
        verbose=1,  # prints progress updates
    )
    grid_search.fit(X_train, y_train)

    # best_score_ is in the scorer's negated units, so negate it for display.
    best_cv_mae = -grid_search.best_score_
    print("\nBest hyperparameters found:")
    print(f" {grid_search.best_params_}")
    print(f"Best CV MAE: ${best_cv_mae:,.0f}")
    print("\nUse these parameters in step4_train_models.py for better results!")

    # Persist the best estimator found by the grid search.
    tuned_model_path = "model_gbr_tuned.pkl"
    joblib.dump(grid_search.best_estimator_, tuned_model_path)
    print("Saved: model_gbr_tuned.pkl")

print("\nStep 5 complete! Proceed to p6_finalevaluation.py")