-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_model_ebano.py
More file actions
129 lines (108 loc) · 4.82 KB
/
_model_ebano.py
File metadata and controls
129 lines (108 loc) · 4.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn.utils import check_random_state
from sklearn.base import BaseEstimator, ClassifierMixin
from dlordinal.metrics import amae
class EBANO(BaseEstimator, ClassifierMixin):
    """
    Weighted ensemble built from model predictions previously saved to disk.

    EBANO receives the paths to the saved results of the models and selects
    one weight per model by random search: weight vectors are sampled
    uniformly, normalised to sum to one, and the vector giving the lowest
    AMAE on the saved *training* predictions is kept.

    The path to the model must match the format:
    `{model_saved_results_path}/{dataset_name}/seed_{random_state}_{split}.csv`,
    where `split` can be either "train" or "test". The CSV file should contain
    a column named "y_proba" with the predicted probabilities for each class.
    Each cell of that column is parsed as a whitespace-separated list of
    floats, optionally wrapped in square brackets (e.g. ``[0.1 0.2 0.7]``).

    ``dataset_name`` is not a constructor argument: it must be assigned on the
    instance before calling ``fit``.

    Parameters
    ----------
    models_saved_results_paths : sequence of str
        Root directories holding the saved results, one per base model.
    weights_cv_n_iters : int, default=1000
        Number of random weight vectors evaluated during ``fit``.
    random_state : int, RandomState instance or None, default=None
        Seeds the weight search and is also interpolated into the result
        file name (``seed_{random_state}_{split}.csv``).
    """

    def __init__(
        self,
        models_saved_results_paths,
        weights_cv_n_iters=1000,
        random_state=None,
    ):
        self.weights_cv_n_iters = weights_cv_n_iters
        self.models_saved_results_paths = models_saved_results_paths
        self.random_state = random_state
        # Populated by fit().
        self.estimator_weights = None
        # Must be assigned by the caller before fit().
        self.dataset_name = None

    def _crossvalidate_ensemble_weights(self, targets, models_pred_probas):
        """
        Randomly search for the ensemble weights with the lowest AMAE.

        Parameters
        ----------
        targets : array-like
            True labels, passed to ``amae`` as ``y_true``.
        models_pred_probas : list of ndarray
            One probability matrix per base model.

        Returns
        -------
        ndarray
            The best weight vector found, normalised to sum to one.

        Raises
        ------
        ValueError
            If ``weights_cv_n_iters`` is not positive.
        """
        if self.weights_cv_n_iters <= 0:
            raise ValueError("weights_cv_n_iters must be positive")

        rng = check_random_state(self.random_state)
        num_estimators = len(models_pred_probas)

        # Fallback used only if no candidate ever scores below +inf (e.g. the
        # metric yields NaN). It is drawn first so the random stream matches
        # earlier behaviour, and normalised like every other candidate.
        best_weights = rng.uniform(size=num_estimators)
        best_weights /= np.sum(best_weights)
        best_amae_score = float("inf")

        for _ in range(self.weights_cv_n_iters):
            weights = rng.uniform(size=num_estimators)
            weights /= np.sum(weights)

            probas = self._compute_probas(models_pred_probas, weights)
            preds = np.argmax(probas, axis=1)
            amae_score = amae(
                y_true=targets,
                y_pred=preds,
            )

            # Strict comparison: on ties the earliest candidate is kept.
            if amae_score < best_amae_score:
                best_amae_score = amae_score
                best_weights = weights

        return best_weights

    def _compute_probas(self, models_pred_probas, weights):
        """
        Combine the per-model probabilities into ensemble probabilities.

        Each model's matrix is scaled by its weight, the scaled matrices are
        summed, and a row-wise softmax is applied to the sum.

        Parameters
        ----------
        models_pred_probas : list of ndarray
            One probability matrix per base model; the first one determines
            the number of rows of the result.
        weights : sequence of float
            One weight per entry of ``models_pred_probas``.

        Returns
        -------
        ndarray of shape (n_rows, n_classes_)
        """
        weighted_probas = np.zeros(
            (models_pred_probas[0].shape[0], self.n_classes_)
        )
        for i, estimator_pred_probas in enumerate(models_pred_probas):
            weighted_probas += estimator_pred_probas * weights[i]
        return softmax(weighted_probas, axis=1)

    def _load_model_pred_probas(self, model_saved_results_path, split):
        """
        Load the saved probabilities of one model for the given split.

        Parameters
        ----------
        model_saved_results_path : str
            Root directory of the model's saved results.
        split : {"train", "test"}
            Which result file to read.

        Returns
        -------
        ndarray
            One row per CSV row, one column per parsed probability.

        Raises
        ------
        ValueError
            If ``split`` is neither "train" nor "test".
        FileNotFoundError
            If the expected CSV file does not exist.
        """
        # Explicit raise rather than `assert`, which is removed under -O.
        if split not in ("train", "test"):
            raise ValueError("split must be 'train' or 'test'")

        seed_results_path = os.path.join(
            model_saved_results_path,
            self.dataset_name,
            f"seed_{self.random_state}_{split}.csv",
        )
        if not os.path.exists(seed_results_path):
            raise FileNotFoundError(f"Pretrained model not found at {seed_results_path}")

        raw_rows = pd.read_csv(seed_results_path)["y_proba"].to_numpy()
        # Each cell is text such as "[0.1 0.2 0.7]": drop the brackets and
        # parse the whitespace-separated floats.
        parsed_rows = [np.fromstring(row.strip("[]"), sep=" ") for row in raw_rows]
        return np.vstack(parsed_rows)

    def fit(self, X, y):
        """
        Load the saved training predictions and search the ensemble weights.

        ``X`` is used only for its ``shape``, which ``predict_proba`` later
        compares against to decide which saved split to load.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute
            Training inputs.
        y : array-like
            Training labels; the number of distinct values defines
            ``n_classes_``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``dataset_name`` has not been set, if no model paths were
            given, or if a model's saved predictions do not have shape
            ``(len(y), n_classes_)``.
        """
        self._train_shape = X.shape
        self.n_classes_ = len(np.unique(y))

        if self.dataset_name is None:
            raise ValueError("dataset_name must be set before fitting")

        models_pred_probas = []
        for model_saved_results_path in self.models_saved_results_paths:
            model_pred_probas = self._load_model_pred_probas(
                model_saved_results_path, split="train"
            )

            if model_pred_probas.shape[1] != self.n_classes_:
                raise ValueError(
                    f"Model predictions must have shape (n_samples, {self.n_classes_}), "
                    f"but got {model_pred_probas.shape}"
                )

            if model_pred_probas.shape[0] != len(y):
                raise ValueError(
                    f"Model predictions must have the same number of samples as y, "
                    f"but got {model_pred_probas.shape[0]} and {len(y)}"
                )

            models_pred_probas.append(model_pred_probas)

        # Without this guard an empty ensemble fails later with an opaque
        # IndexError inside _compute_probas.
        if not models_pred_probas:
            raise ValueError("models_saved_results_paths must not be empty")

        self.estimator_weights = self._crossvalidate_ensemble_weights(
            y,
            models_pred_probas=models_pred_probas,
        )

        return self

    def predict_proba(self, X):
        """
        Return the ensemble probabilities for the split matching ``X``.

        The values of ``X`` are never read. If ``X.shape`` equals the shape
        seen in ``fit`` the saved "train" predictions are combined; otherwise
        the saved "test" predictions are used.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute

        Returns
        -------
        ndarray of shape (n_rows, n_classes_)

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if (
            getattr(self, "estimator_weights", None) is None
            or not hasattr(self, "_train_shape")
        ):
            # NotFittedError subclasses both ValueError and AttributeError, so
            # callers that caught the former AttributeError keep working.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This EBANO instance is not fitted yet. Call 'fit' before "
                "using this estimator."
            )

        split = "train" if self._train_shape == X.shape else "test"

        models_pred_probas = [
            self._load_model_pred_probas(model_saved_results_path, split=split)
            for model_saved_results_path in self.models_saved_results_paths
        ]

        return self._compute_probas(models_pred_probas, self.estimator_weights)

    def predict(self, X):
        """
        Return, for each row, the index of the highest ensemble probability.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute

        Returns
        -------
        ndarray of int
            Column index of the row-wise maximum of ``predict_proba(X)``.
        """
        pred_probas = self.predict_proba(X)
        return np.argmax(pred_probas, axis=1)