-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpolynomial_regression.py
More file actions
104 lines (79 loc) · 3.64 KB
/
polynomial_regression.py
File metadata and controls
104 lines (79 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset. The path is relative to the current working directory;
# a bare file name is used rather than a Windows-only '.\' prefix so the
# same script also runs on POSIX systems.
csv_path = 'courses_2020_fixed_distance_km_no_top5.csv'
df = pd.read_csv(csv_path)

# Derive the athlete's age in the reference year from the year of birth.
# When the column is absent, 'age' is filled with NaN for every row.
current_year = 2020
if 'Athlete year of birth' in df.columns:
    df['age'] = current_year - df['Athlete year of birth']
else:
    df['age'] = np.nan
# encode gender: male=1, female=0, unknown->nan
def encode_gender(g):
    """Map a gender label to a numeric code.

    Parameters
    ----------
    g : object
        Raw gender value. It is converted with ``str()``, stripped of
        surrounding whitespace and upper-cased before matching.

    Returns
    -------
    int or float
        ``1`` if the normalized label starts with ``'M'``, ``0`` if it
        starts with ``'F'``, and ``np.nan`` when the value is missing or
        matches neither prefix.
    """
    if pd.isna(g):
        return np.nan
    label = str(g).strip().upper()
    if label.startswith('M'):
        return 1
    if label.startswith('F'):
        return 0
    # Anything else (including an empty string) is treated as unknown.
    return np.nan
# Add the numeric gender column, then pull out the three predictors and the
# regression target as independent copies.
df['gender_code'] = df['Athlete gender'].apply(encode_gender)
X = df.loc[:, ['age', 'gender_code', 'Distance_km']].copy()
y = df.loc[:, 'Athlete average speed'].copy()

# Keep only the rows in which every predictor and the target are present,
# and renumber the surviving rows from 0.
mask = y.notna() & X.notna().all(axis=1)
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)
# Split into 70% training, 15% validation and 15% test: first hold out 30%
# of the rows, then divide that hold-out set in half.
# A fixed random_state makes the shuffling, and therefore the error values
# computed from these splits, reproducible from one run to the next.
RANDOM_STATE = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=RANDOM_STATE
)
# Maximum polynomial degrees to evaluate: 1 through 5.
degrees = [*range(1, 6)]
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Fit the scaler on X_train only (to avoid data leakage), then apply that
# same fitted scaling to the training, validation and test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# The polynomial feature expansion comes next and works on the scaled data.
# These lists collect one mean squared error per polynomial degree.
linear_tr_errors, linear_val_errors, linear_test_errors = [], [], []
# Fit one polynomial regression model per candidate degree and record its
# mean squared error on the training, validation and test splits.
for degree in degrees:
    # TRAINING
    lin_regr = LinearRegression(fit_intercept=True)
    # include_bias=False: the constant term is left to the regression's own
    # intercept (fit_intercept=True) instead of a column of ones.
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    # The expansion is fitted on the *scaled* training features.
    X_train_poly = poly.fit_transform(X_train_scaled)
    lin_regr.fit(X_train_poly, y_train)

    # TRAINING ERROR
    y_pred_train = lin_regr.predict(X_train_poly)
    tr_error = mean_squared_error(y_train, y_pred_train)

    # VALIDATION ERROR (reuses the expansion fitted on the training split)
    X_val_poly = poly.transform(X_val_scaled)
    y_pred_val = lin_regr.predict(X_val_poly)
    val_error = mean_squared_error(y_val, y_pred_val)

    # TEST ERROR (reuses the expansion fitted on the training split)
    X_test_poly = poly.transform(X_test_scaled)
    y_pred_test = lin_regr.predict(X_test_poly)
    test_error = mean_squared_error(y_test, y_pred_test)

    # Store this degree's errors; the lists stay aligned with `degrees`.
    linear_tr_errors.append(tr_error)
    linear_val_errors.append(val_error)
    linear_test_errors.append(test_error)
# Plot the training, validation and test mean squared errors against the
# polynomial degree on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 5))
error_curves = (
    (linear_tr_errors, 'Training Error'),
    (linear_val_errors, 'Validation Error'),
    (linear_test_errors, 'Test Error'),
)
for errors, curve_label in error_curves:
    ax.plot(degrees, errors, label=curve_label, marker='o')
ax.set_xlabel('Polynomial Degree')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Training, Validation and Test Errors vs Polynomial Degree')
ax.legend()
ax.grid(True)
plt.show()