From dda8fbe332b643bdb575eb2fea9f8421b19ac880 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Tue, 16 May 2023 12:37:59 -0300 Subject: [PATCH 01/28] Update link to make git clone --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a9b1b8..406cd3e 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ In order to contribute with `librep` you may want to: 1. Clone librep's repository: ``` -git clone https://github.com/otavioon/hiaac-librep.git +git clone https://github.com/discovery-unicamp/hiaac-librep.git ``` 2. Create a python virtual environment and activate it (requires Python >= 3.8): From 019f3f00d655f2731d7049dba94f32603562bb5c Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Tue, 16 May 2023 16:33:49 -0300 Subject: [PATCH 02/28] Fix documetation --- README.md | 4 ++-- src/librep/config/type_definitions.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 406cd3e..cc2c42e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ To install `librep`, you may use: ``` -pip install git+https://github.com/otavioon/hiaac-librep.git +pip install git+https://github.com/discovery-unicamp/hiaac-librep.git ``` ### Pip optional dependencies @@ -36,7 +36,7 @@ source .librep-venv/bin/activate 3. Install librep development packages, in editable mode ``` -pip install -e .[dev] +pip install -e .[dev] (we need to fix the flag [dev]) ``` 4. Run tests diff --git a/src/librep/config/type_definitions.py b/src/librep/config/type_definitions.py index 91e93f2..ef937dc 100644 --- a/src/librep/config/type_definitions.py +++ b/src/librep/config/type_definitions.py @@ -4,5 +4,5 @@ # PathLike: The PathLike type is used for defining a file path. PathLike = Union[str, os.PathLike] -ArrayLike = Union[numpy.ndarray] +ArrayLike = Union[numpy.ndarray, list] KeyType = Hashable \ No newline at end of file From 72b342bf088ef5823df8245ad61920f9d23aa752 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Thu, 5 Oct 2023 16:05:53 -0300 Subject: [PATCH 03/28] Add xai tools --- src/librep/xai/xai.py | 525 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 525 insertions(+) create mode 100644 src/librep/xai/xai.py diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py new file mode 100644 index 0000000..465e286 --- /dev/null +++ b/src/librep/xai/xai.py @@ -0,0 +1,525 @@ +from pathlib import Path +from typing import List, Tuple, Dict, Any + + +from sklearn.ensemble import RandomForestClassifier +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing import MinMaxScaler, StandardScaler + +from librep.datasets.multimodal.operations import ( + DatasetFitter, + PandasMultiModalDataset, +) + +# import xAI techniques +import shap +from lime import lime_tabular + +import pickle +import numpy as np +import pandas as pd +from tqdm import tqdm + + +############################################################################################################ +# Function to load the dataset +############################################################################################################ +def load_dataset( + dataset_name: str, reduce_on: str, normalization: str = None, path: Path = Path("../results/execution/output_files/reduced_data") +) -> Tuple[PandasMultiModalDataset, PandasMultiModalDataset]: + """This function loads the dataset from the path. In particular, it loads the train and test files from the path: + results/execution/output_files/reduced_data/{dataset_name}-{reduce_on}. + This directory contains the preprocessed data for the interested experiment. + + Parameters + ---------- + dataset_name: str + The name of the dataset + reduce_on: str + The name of the modality on which the dataset is reduced + normalization: str + The name of the normalization technique to apply to the dataset + path: Path + The path where the dataset is stored + + Returns + ------- + train: Dataset + The train dataset + test: Dataset + The test dataset + """ + path = path / f"{dataset_name}-{reduce_on}" + + # Let's read files from the directory path + with open(path / "train", "rb") as f: + train = pickle.load(f) + + with open(path / "test", "rb") as f: + test = pickle.load(f) + + # Let's normalize the dataset + train, test = normalize_dataset(train, test, normalization) + + return train, test + + +def normalize_dataset( + train: PandasMultiModalDataset, + test: PandasMultiModalDataset, + normalization: str = None, +) -> Tuple[PandasMultiModalDataset, PandasMultiModalDataset]: + """This function normalizes the dataset. + + Parameters + ---------- + train: Dataset + The train dataset + test: Dataset + The test dataset + normalization: str + The name of the normalization technique to apply to the dataset + If None, no normalization is applied + If "MinMaxScaler", MinMaxScaler is applied + If "StandardScaler", StandardScaler is applied + + Returns + ------- + train: Dataset + The normalized train dataset + test: Dataset + The normalized test dataset + """ + + if normalization == None: + pass + elif normalization == "MinMaxScaler": + scaler = MinMaxScaler() + scaler.fit(train.X) + train.X = scaler.transform(train.X) + test.X = scaler.transform(test.X) + elif normalization == "StandardScaler": + scaler = StandardScaler() + scaler.fit(train.X) + train.X = scaler.transform(train.X) + test.X = scaler.transform(test.X) + + return train, test + + +############################################################################################################ +# Functions to train the models +# Random Forest +# SVM +# KNN +############################################################################################################ +def train_rf(train: PandasMultiModalDataset) -> RandomForestClassifier: + """This function trains a Random Forest classifier on the train dataset. + + Parameters + ---------- + train: Dataset + The train dataset + + Returns + ------- + model: RandomForestClassifier + The trained Random Forest classifier + """ + + model = RandomForestClassifier(n_estimators=100, random_state=42) + DatasetFitter(model, use_y=True)(train) + + return model + + +def train_svm(train: PandasMultiModalDataset) -> SVC: + """This function trains a SVM classifier on the train dataset. + + Parameters + ---------- + train: Dataset + The train dataset + + Returns + ------- + model: SVC + The trained SVM classifier + """ + model = SVC(random_state=42, probability=True) + DatasetFitter(model, use_y=True)(train) + + return model + + +def train_knn(train: PandasMultiModalDataset) -> KNeighborsClassifier: + """This function trains a KNN classifier on the train dataset. + + Parameters + ---------- + train: Dataset + The train dataset + + Returns + ------- + model: KNeighborsClassifier + The trained KNN classifier + """ + model = KNeighborsClassifier(n_neighbors=5) + DatasetFitter(model, use_y=True)(train) + + return model + + +############################################################################################################ +# Functions to calculate the feature importance using SHAP +############################################################################################################ +def calc_shap_values_tree(model, test: PandasMultiModalDataset) -> np.ndarray: + """This function calculates the shap values for each sample in the test dataset + + Parameters + ---------- + model: sklearn model based on trees + The trained model + test: Dataset + The test dataset + + Returns + ------- + shap_values: np.ndarray + The shap values for each sample in the test dataset + """ + explainer = shap.TreeExplainer(model) + shap_values = explainer.shap_values(test.X) + + return shap_values + + +def calc_shap_values(model, test: PandasMultiModalDataset) -> np.ndarray: + """This function calculates the shap values for each sample in the test dataset + + Parameters + ---------- + model: machine learning model + The trained model + test: Dataset + The test dataset + """ + explainer = shap.KernelExplainer(model.predict_proba, test.X) + shap_values = explainer.shap_values(test.X, nsamples=100) + + return shap_values + + +############################################################################################################ +# Functions to filter the SHAP values +############################################################################################################ +def shap_values_per_feature( + shap_values, activities: List[int], num_features: int = 24 +) -> pd.DataFrame: + """This function calculates the shap values for each feature, for each activity. + For each activity, the shap values from a subset are the absolute average of the shap values of all samples in the subset. + After that, the feature importance for each feature are the sum of the shap values of all activities. + + Parameters + ---------- + shap_values: np.ndarray + The shap values for each sample in the test dataset + activities: List[int] + The list of activities + num_features: int + The number of features + + Returns + ------- + df: pd.DataFrame + The dataframe containing the shap values for each feature + """ + fi = { + f"feature {i}": np.sum( + [ + np.mean(np.abs(shap_values[j][:, i])) + for j, activity in enumerate(activities) + ] + ) + for i in range(num_features) + } + + df = pd.DataFrame(fi, index=[0]) + return df + + +def shap_values_per_class(shap_values, activities: List[int]) -> pd.DataFrame: + """This function calculates the shap values for each feature, for each activity. + For each activity, the shap values from a subset are the absolute average of the shap values of all samples in the subset. + + Parameters + ---------- + shap_values: np.ndarray + The shap values for each sample in the test dataset + activities: List[int] + The list of activities + + Returns + ------- + df: pd.DataFrame + The dataframe containing the shap values for each feature + """ + keys = ["activity"] + [f"feature {i}" for i in range(24)] + + fi = {key: None for key in keys} + fis = [] + for j, activity in enumerate(activities): + fi["activity"] = activity + for i in range(24): + fi[f"feature {i}"] = np.mean(np.abs(shap_values[j][:, i])) + fis.append(fi.copy()) + + df = pd.DataFrame(fis) + return df + + +############################################################################################################ +# Functions to calculate the feature importance using LIME +############################################################################################################ +def calc_lime_values( + model, + test: PandasMultiModalDataset, + train: PandasMultiModalDataset, + standartized_codes: Dict[int, str], +) -> List[Dict[str, Any]]: + """This function calculates the lime values for each sample in the test dataset and store in a dictionary + the class of the sample, the predicted class by lime, the feature importance and the explainer object created by lime. + + Parameters + ---------- + model: machine learning model + The trained model + test: Dataset + The test dataset + train: Dataset + The train dataset + standartized_codes: Dict[int, str] + The dictionary containing the mapping between the activity codes and the activity names + + Returns + ------- + lime_values: List[Dict[str, Any]] + The list of dictionaries containing the lime values for each sample in the test dataset + """ + # This function calculates the lime values for each sample in the test dataset + lime_values = [] + num_features = train.X.shape[1] + features_names = [f"feature_{i}" for i in range(num_features)] + explainer = lime_tabular.LimeTabularExplainer( + train.X, + feature_names=features_names, + class_names=standartized_codes.values(), + discretize_continuous=False, + ) + for sample, y in tqdm(zip(test.X, test.y)): + exp = explainer.explain_instance( + sample, + model.predict_proba, + num_features=num_features, + top_labels=1, + ) + + lime_value = { + "True class": y, + "Predicted class by lime:": list(exp.as_map().keys())[0], + "Lime values": exp.as_map(), + "Explainer": exp, + } + + lime_values.append(lime_value) + + return lime_values + + +############################################################################################################ +# Functions to filter the LIME values +############################################################################################################ +def lime_values_per_class( + lime_values: List[Dict[str, Any]], + dataset: str, + reduce: str, + model_name: str, + activities: List[int], + standartized_codes: Dict[int, str], + num_features: int = 24, +) -> pd.DataFrame: + """This function calculates the lime values for each feature, for each activity. For each activity, + the lime values from a subset are the absolute average of the feature importances of all samples in the subset. + After that, the feature importance for each feature are the sum of the lime values of all activities. + + Parameters + ---------- + lime_values: List[Dict[str, Any]] + The list of dictionaries containing the lime values for each sample in the test dataset + dataset: str + The name of the dataset + reduce: str + The name of the modality on which the dataset is reduced + model_name: str + The name of the model + activities: List[int] + The list of activities + standartized_codes: Dict[int, str] + The dictionary containing the mapping between the activity codes and the activity names + num_features: int + The number of features + + Returns + ------- + df: pd.DataFrame + The dataframe containing the feature importance for each feature for each activity by lime + """ + + columns = [f"feature {i}" for i in range(num_features)] + [ + "Classifier", + "Dataset", + "reduce on", + "activity", + ] + dfs = [] + + for activity in activities: + list = [] + df = {column: None for column in columns} + for j, lime_value in enumerate(lime_values): + if lime_value["True class"] == activity: + lime_predict = lime_value["Predicted class by lime:"] + sample = lime_value["Lime values"][lime_predict] + sample = sorted(sample) + sample = np.array(sample) + sample = sample[:, 1] + sample = np.abs(sample) + list.append(sample) + + list = np.array(list) + list = np.mean(list, axis=0) + list = np.array(list) + + df["Classifier"] = model_name + df["Dataset"] = dataset + df["reduce on"] = reduce + df["activity"] = standartized_codes[activity] + for i, value in enumerate(list[:num_features]): + df[f"feature {i}"] = value + dfs.append(df) + return pd.DataFrame(dfs).reset_index(drop=True) + + +def lime_values_per_feature( + lime_values: List[Dict[str, Any]], + dataset: str, + reduce: str, + model_name: str, + activities: List[int], + standartized_codes: Dict[int, str], + num_features: int = 24, +) -> pd.DataFrame: + """This function calculates the lime values for each feature, for each activity. For each activity, + the lime values from a subset are the absolute average of the feature importances of all samples in the subset. + After that, the feature importance for each feature are the sum of the lime values of all activities. + + Parameters + ---------- + lime_values: List[Dict[str, Any]] + The list of dictionaries containing the lime values for each sample in the test dataset + dataset: str + The name of the dataset + reduce: str + The name of the modality on which the dataset is reduced + model_name: str + The name of the model + activities: List[int] + The list of activities + standartized_codes: Dict[int, str] + The dictionary containing the mapping between the activity codes and the activity names + num_features: int + The number of features + + Returns + ------- + df: pd.DataFrame + The dataframe containing the feature importance for each feature for each activity by lime + """ + + df = lime_values_per_class( + lime_values, + dataset, + reduce, + model_name, + activities, + standartized_codes, + num_features, + ) + # Let's remove the columns that are not features + df = df.drop(columns=["Classifier", "Dataset", "reduce on", "activity"]) + + # Let's sum all lines + df = df.sum(axis=0).to_frame().T + + return df.reset_index(drop=True) + + +############################################################################################################ +# Functions to calculate the feature importance using Oracle thecnique +############################################################################################################ +def calc_oracle_values( + classifier: str, + dataset: str, + reduce: str, + latent_dim: int, + columns_to_remove: List[int] = [], + normalization: str = None, +): + """This function calculates the oracle values for each feature. + The oracle values are calculated by removing each feature from the dataset and then training a classifier on the reduced dataset, + so that the accuracy of the classifier is calculated + on the test dataset. The oracle values are calculated as 1 - accuracy. + + Parameters + ---------- + classifier: str + The name of the classifier + dataset: str + The name of the dataset + reduce: str + The name of the modality on which the dataset is reduced + latent_dim: int + The number of latent dimensions, that is, the number of features + columns_to_remove: List[int] + The list of columns to remove from the dataset, if needed + + Returns + ------- + fis: np.ndarray + The oracle values for each feature + """ + + accuracies = [] + for dim in range(latent_dim): + train, test = load_dataset(dataset, reduce, normalization=normalization) + if columns_to_remove != []: + train.X = np.delete(train.X, columns_to_remove, axis=1) + test.X = np.delete(test.X, columns_to_remove, axis=1) + # Let's remove the column dim from the train and test dataset + train.X = np.delete(train.X, dim, axis=1) + test.X = np.delete(test.X, dim, axis=1) + + model = ( + train_rf(train) + if classifier == "Radom Forest" + else train_svm(train) + if classifier == "SVM" + else train_knn(train) + ) + accuracy = model.score(test.X, test.y) + accuracies.append(accuracy) + + fis = 1 - np.array(accuracies) + return fis, accuracies From 94a50baf55d2c7637e144c4c2cb706bf75623623 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Thu, 26 Oct 2023 11:51:04 -0300 Subject: [PATCH 04/28] Add balanced accuracy --- src/librep/metrics/report.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/librep/metrics/report.py b/src/librep/metrics/report.py index ffff22f..9abc4a1 100644 --- a/src/librep/metrics/report.py +++ b/src/librep/metrics/report.py @@ -4,6 +4,7 @@ classification_report, accuracy_score, f1_score, + balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay ) @@ -18,6 +19,7 @@ def __init__( self, use_accuracy: bool = True, use_f1_score: bool = True, + use_balanced_accuracy: bool = False, use_confusion_matrix: bool = True, use_classification_report: bool = False, plot_confusion_matrix: bool = True, @@ -29,6 +31,7 @@ def __init__( ): self.use_accuracy = use_accuracy self.use_f1_score = use_f1_score + self.use_balanced_accuracy = use_balanced_accuracy self.use_confusion_matrix = use_confusion_matrix self.use_classification_report = use_classification_report self.plot_confusion_matrix = plot_confusion_matrix @@ -59,6 +62,10 @@ def evaluate( res = f1_score(y_true, y_pred, average="macro") result["f1 score (macro)"] = float(res) + + if self.use_balanced_accuracy: + res = balanced_accuracy_score(y_true, y_pred) + result["balanced accuracy"] = float(res) if self.use_confusion_matrix: res = confusion_matrix(y_true, y_pred) From 3fdb1952c86d11b9088ce9093e3e33616cc8a32c Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Wed, 8 Nov 2023 15:55:05 -0300 Subject: [PATCH 05/28] Update xai file --- src/librep/xai/xai.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 465e286..bf71146 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -3,6 +3,7 @@ from sklearn.ensemble import RandomForestClassifier +from sklearn.tree import DecisionTreeClassifier from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import MinMaxScaler, StandardScaler @@ -26,7 +27,10 @@ # Function to load the dataset ############################################################################################################ def load_dataset( - dataset_name: str, reduce_on: str, normalization: str = None, path: Path = Path("../results/execution/output_files/reduced_data") + dataset_name: str, + reduce_on: str, + normalization: str = None, + path: Path = Path("../results/execution/output_files/reduced_data"), ) -> Tuple[PandasMultiModalDataset, PandasMultiModalDataset]: """This function loads the dataset from the path. In particular, it loads the train and test files from the path: results/execution/output_files/reduced_data/{dataset_name}-{reduce_on}. @@ -172,6 +176,25 @@ def train_knn(train: PandasMultiModalDataset) -> KNeighborsClassifier: return model +def train_dt(train: PandasMultiModalDataset) -> DecisionTreeClassifier: + """This function trains a Decision Tree classifier on the train dataset. + + Parameters + ---------- + train: Dataset + The train dataset + + Returns + ------- + model: DecisionTreeClassifier + The trained Decision Tree classifier + """ + model = DecisionTreeClassifier(random_state=42) + DatasetFitter(model, use_y=True)(train) + + return model + + ############################################################################################################ # Functions to calculate the feature importance using SHAP ############################################################################################################ From 0fa63675ea97b1a4f46f368153bd3720ee462b96 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Fri, 10 Nov 2023 17:53:59 -0300 Subject: [PATCH 06/28] Fix xay documentation and functions to calculate global feature importance --- src/librep/xai/xai.py | 72 ++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index bf71146..b99ad1b 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -117,6 +117,7 @@ def normalize_dataset( # Random Forest # SVM # KNN +# Decision Tree ############################################################################################################ def train_rf(train: PandasMultiModalDataset) -> RandomForestClassifier: """This function trains a Random Forest classifier on the train dataset. @@ -351,7 +352,8 @@ def calc_lime_values( lime_value = { "True class": y, - "Predicted class by lime:": list(exp.as_map().keys())[0], + "LIME prediction": list(exp.as_map().keys())[0], + "Model prediction": model.predict([sample])[0], "Lime values": exp.as_map(), "Explainer": exp, } @@ -372,6 +374,7 @@ def lime_values_per_class( activities: List[int], standartized_codes: Dict[int, str], num_features: int = 24, + remove_misclassified: bool = True, ) -> pd.DataFrame: """This function calculates the lime values for each feature, for each activity. For each activity, the lime values from a subset are the absolute average of the feature importances of all samples in the subset. @@ -400,39 +403,42 @@ def lime_values_per_class( The dataframe containing the feature importance for each feature for each activity by lime """ - columns = [f"feature {i}" for i in range(num_features)] + [ - "Classifier", - "Dataset", - "reduce on", - "activity", + dfs = {activity: [] for activity in activities} + + fis = {activity: [] for activity in activities} + for j, lime_value in enumerate(lime_values): + # Calculate the feature importance for the sample + activity = lime_value["True class"] + lime_predict = lime_value["LIME prediction"] + sample = lime_value["Lime values"][lime_predict] + sample = sorted(sample) + sample = np.array(sample) + fi = np.abs(sample[:, 1]) + model_predict = lime_value["Model prediction"] + if remove_misclassified: + fis[activity].append(fi) + else: + # Check if the model predicted correctly the sample + if model_predict == activity: + fis[activity].append(fi) + + # Let's calculate the average of the feature importance for each activity + columns = ["Classifier", "Dataset", "reduce on", "activity"] + [ + f"feature_{i}" for i in range(num_features) ] - dfs = [] - for activity in activities: - list = [] - df = {column: None for column in columns} - for j, lime_value in enumerate(lime_values): - if lime_value["True class"] == activity: - lime_predict = lime_value["Predicted class by lime:"] - sample = lime_value["Lime values"][lime_predict] - sample = sorted(sample) - sample = np.array(sample) - sample = sample[:, 1] - sample = np.abs(sample) - list.append(sample) - - list = np.array(list) - list = np.mean(list, axis=0) - list = np.array(list) - - df["Classifier"] = model_name - df["Dataset"] = dataset - df["reduce on"] = reduce - df["activity"] = standartized_codes[activity] - for i, value in enumerate(list[:num_features]): - df[f"feature {i}"] = value - dfs.append(df) - return pd.DataFrame(dfs).reset_index(drop=True) + fi_class = np.array(fi_class) + fi_class = np.mean(fi_class, axis=0) + fi_class = fi_class.tolist() + data = [model_name, dataset, reduce, standartized_codes[activity]] + fi_class + df = pd.DataFrame([data], columns=columns) + dfs[activity].append(df) + + columns = dfs[activity][0].columns + # The dataframe containing the global feature importance for each class of the dataset + df = pd.concat(dfs.values(), ignore_index=True, columns=columns) + + return df def lime_values_per_feature( @@ -540,6 +546,8 @@ def calc_oracle_values( else train_svm(train) if classifier == "SVM" else train_knn(train) + if classifier == "KNN" + else train_dt(train) ) accuracy = model.score(test.X, test.y) accuracies.append(accuracy) From f59a57b406a12ebc93cde4b8e97e92420e562edf Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Fri, 10 Nov 2023 21:48:41 -0300 Subject: [PATCH 07/28] Update path to load data --- src/librep/xai/xai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index b99ad1b..1987039 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -30,7 +30,7 @@ def load_dataset( dataset_name: str, reduce_on: str, normalization: str = None, - path: Path = Path("../results/execution/output_files/reduced_data"), + path: Path = Path("../results/execution/transformed_data"), ) -> Tuple[PandasMultiModalDataset, PandasMultiModalDataset]: """This function loads the dataset from the path. In particular, it loads the train and test files from the path: results/execution/output_files/reduced_data/{dataset_name}-{reduce_on}. @@ -57,10 +57,10 @@ def load_dataset( path = path / f"{dataset_name}-{reduce_on}" # Let's read files from the directory path - with open(path / "train", "rb") as f: + with open(path / "train.pkl", "rb") as f: train = pickle.load(f) - with open(path / "test", "rb") as f: + with open(path / "test.pkl", "rb") as f: test = pickle.load(f) # Let's normalize the dataset From a411efdf284a3f8b7d1163251db9dfa7040fb982 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Fri, 10 Nov 2023 22:20:43 -0300 Subject: [PATCH 08/28] Update data_path --- src/librep/xai/xai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 1987039..57341c6 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -30,7 +30,7 @@ def load_dataset( dataset_name: str, reduce_on: str, normalization: str = None, - path: Path = Path("../results/execution/transformed_data"), + path: Path = Path("../reducer_experiments/results/execution/transformed_data/"), ) -> Tuple[PandasMultiModalDataset, PandasMultiModalDataset]: """This function loads the dataset from the path. In particular, it loads the train and test files from the path: results/execution/output_files/reduced_data/{dataset_name}-{reduce_on}. From 62ac18e3eaa51e5e6f47c33f1d8e2845a543f1f7 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Sat, 11 Nov 2023 02:03:25 -0300 Subject: [PATCH 09/28] Update --- src/librep/xai/xai.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 57341c6..b87bd4b 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -30,7 +30,7 @@ def load_dataset( dataset_name: str, reduce_on: str, normalization: str = None, - path: Path = Path("../reducer_experiments/results/execution/transformed_data/"), + path: Path = Path("../reducer_experiments/results/execution/transformed_data"), ) -> Tuple[PandasMultiModalDataset, PandasMultiModalDataset]: """This function loads the dataset from the path. In particular, it loads the train and test files from the path: results/execution/output_files/reduced_data/{dataset_name}-{reduce_on}. @@ -502,9 +502,10 @@ def calc_oracle_values( classifier: str, dataset: str, reduce: str, - latent_dim: int, + latent_dim: int = 24, columns_to_remove: List[int] = [], normalization: str = None, + data_path: str = "../reducer_experiments/results/execution/transformed_data", ): """This function calculates the oracle values for each feature. The oracle values are calculated by removing each feature from the dataset and then training a classifier on the reduced dataset, @@ -532,7 +533,9 @@ def calc_oracle_values( accuracies = [] for dim in range(latent_dim): - train, test = load_dataset(dataset, reduce, normalization=normalization) + train, test = load_dataset( + dataset, reduce, normalization=normalization, path=Path(data_path) + ) if columns_to_remove != []: train.X = np.delete(train.X, columns_to_remove, axis=1) test.X = np.delete(test.X, columns_to_remove, axis=1) From 3d7145e0aabdeb998a6c170ad44efff0117327ab Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Sat, 11 Nov 2023 02:36:24 -0300 Subject: [PATCH 10/28] Fix Lime --- src/librep/xai/xai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index b87bd4b..1b51977 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -427,7 +427,7 @@ def lime_values_per_class( f"feature_{i}" for i in range(num_features) ] for activity in activities: - fi_class = np.array(fi_class) + fi_class = np.array(fis[activity]) fi_class = np.mean(fi_class, axis=0) fi_class = fi_class.tolist() data = [model_name, dataset, reduce, standartized_codes[activity]] + fi_class From 24fdd25f0818dbdedd7ed7dedad85a5a15f54cc8 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Sat, 11 Nov 2023 02:41:25 -0300 Subject: [PATCH 11/28] Fix dataframe concatenate --- src/librep/xai/xai.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 1b51977..2e963ad 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -434,9 +434,8 @@ def lime_values_per_class( df = pd.DataFrame([data], columns=columns) dfs[activity].append(df) - columns = dfs[activity][0].columns # The dataframe containing the global feature importance for each class of the dataset - df = pd.concat(dfs.values(), ignore_index=True, columns=columns) + df = pd.concat([sub_df for sub_df in dfs.values()]) return df From 9bcc43b6cd59cb4c2aac4923c406082f7250c855 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Sat, 11 Nov 2023 02:46:41 -0300 Subject: [PATCH 12/28] Fix dataframe concatenate --- src/librep/xai/xai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 2e963ad..4c316bf 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -403,7 +403,7 @@ def lime_values_per_class( The dataframe containing the feature importance for each feature for each activity by lime """ - dfs = {activity: [] for activity in activities} + dfs = {activity: None for activity in activities} fis = {activity: [] for activity in activities} for j, lime_value in enumerate(lime_values): @@ -432,7 +432,7 @@ def lime_values_per_class( fi_class = fi_class.tolist() data = [model_name, dataset, reduce, standartized_codes[activity]] + fi_class df = pd.DataFrame([data], columns=columns) - dfs[activity].append(df) + dfs[activity] = df # The dataframe containing the global feature importance for each class of the dataset df = pd.concat([sub_df for sub_df in dfs.values()]) From 2b1d483ae6563707062a403bc5c316cd23a5e78a Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Sat, 11 Nov 2023 15:29:44 -0300 Subject: [PATCH 13/28] Fix functions to calculate feature importance with lime --- src/librep/xai/xai.py | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 4c316bf..9f62aba 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -396,6 +396,8 @@ def lime_values_per_class( The dictionary containing the mapping between the activity codes and the activity names num_features: int The number of features + misclassified: bool + If True, the samples misclassified by the model are removed when calculating the feature importance with lime Returns ------- @@ -423,15 +425,15 @@ def lime_values_per_class( fis[activity].append(fi) # Let's calculate the average of the feature importance for each activity - columns = ["Classifier", "Dataset", "reduce on", "activity"] + [ - f"feature_{i}" for i in range(num_features) - ] + columns = [f"feature {i}" for i in range(num_features)] for activity in activities: fi_class = np.array(fis[activity]) fi_class = np.mean(fi_class, axis=0) - fi_class = fi_class.tolist() - data = [model_name, dataset, reduce, standartized_codes[activity]] + fi_class - df = pd.DataFrame([data], columns=columns) + df = pd.DataFrame([fi_class], columns=columns) + df["Classifier"] = model_name + df["Dataset"] = dataset + df["reduce on"] = reduce + df["activity"] = standartized_codes[activity] dfs[activity] = df # The dataframe containing the global feature importance for each class of the dataset @@ -448,6 +450,7 @@ def lime_values_per_feature( activities: List[int], standartized_codes: Dict[int, str], num_features: int = 24, + remove_misclassified: bool = True, ) -> pd.DataFrame: """This function calculates the lime values for each feature, for each activity. For each activity, the lime values from a subset are the absolute average of the feature importances of all samples in the subset. @@ -469,6 +472,8 @@ def lime_values_per_feature( The dictionary containing the mapping between the activity codes and the activity names num_features: int The number of features + remove_misclassified: bool + If True, the samples misclassified by the model are removed when calculating the feature importance with lime Returns ------- @@ -484,14 +489,22 @@ def lime_values_per_feature( activities, standartized_codes, num_features, + remove_misclassified, ) - # Let's remove the columns that are not features - df = df.drop(columns=["Classifier", "Dataset", "reduce on", "activity"]) - # Let's sum all lines - df = df.sum(axis=0).to_frame().T + dfs = [] + df.drop(columns=["activity"], inplace=True) + for (classifier, dataset, reduce), sub_df in df.groupby( + ["Classifier", "Dataset", "reduce on"] + ): + # Let's sum all lines + sub_df = sub_df.sum(axis=0).to_frame().T + sub_df["Classifier"] = classifier + sub_df["Dataset"] = dataset + sub_df["reduce on"] = reduce + dfs.append(sub_df) - return df.reset_index(drop=True) + return pd.concat(dfs).reset_index(drop=True) ############################################################################################################ @@ -523,6 +536,13 @@ def calc_oracle_values( The number of latent dimensions, that is, the number of features columns_to_remove: List[int] The list of columns to remove from the dataset, if needed + normalization: str + The name of the normalization technique to apply to the dataset + If None, no normalization is applied + If "MinMaxScaler", MinMaxScaler is applied + If "StandardScaler", StandardScaler is applied + data_path: str + The path where the dataset is stored Returns ------- From d838797437b30ad4910218747b6fa285cf8780af Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Wed, 15 Nov 2023 12:59:27 -0300 Subject: [PATCH 14/28] Fix bug --- src/librep/xai/xai.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 9f62aba..16e37fb 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -7,6 +7,7 @@ from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import MinMaxScaler, StandardScaler +from sklearn.metrics import accuracy_score from librep.datasets.multimodal.operations import ( DatasetFitter, @@ -318,7 +319,7 @@ def calc_lime_values( Parameters ---------- - model: machine learning model + model: machine learning modelra The trained model test: Dataset The test dataset @@ -564,14 +565,15 @@ def calc_oracle_values( model = ( train_rf(train) - if classifier == "Radom Forest" + if classifier == "Random Forest" else train_svm(train) if classifier == "SVM" else train_knn(train) if classifier == "KNN" else train_dt(train) ) - accuracy = model.score(test.X, test.y) + predictions = model.predict(test.X) + accuracy = accuracy_score(test.y, predictions) accuracies.append(accuracy) fis = 1 - np.array(accuracies) From 6ca943bc65de730aa3a3eb81cc46d63a402aea3b Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Wed, 20 Dec 2023 10:42:59 -0300 Subject: [PATCH 15/28] Add condition to take only real part if we don't want to calculate the absolute value of fft --- src/librep/transforms/fft.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/librep/transforms/fft.py b/src/librep/transforms/fft.py index 8b79f84..ef7683c 100644 --- a/src/librep/transforms/fft.py +++ b/src/librep/transforms/fft.py @@ -61,6 +61,8 @@ def transform(self, X: ArrayLike) -> ArrayLike: data = fftpack.fft(data) if self.absolute: data = np.abs(data) + else: + data = np.real(data) if self.centered: data = data[:len(data)//2] datas.append(data) From 1f4a26a08740e8938ae46b083a7569f3fe9cc91c Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Mon, 15 Apr 2024 12:00:03 -0300 Subject: [PATCH 16/28] Add geometric mean in report --- src/librep/metrics/report.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/librep/metrics/report.py b/src/librep/metrics/report.py index 9abc4a1..4690a00 100644 --- a/src/librep/metrics/report.py +++ b/src/librep/metrics/report.py @@ -5,6 +5,8 @@ accuracy_score, f1_score, balanced_accuracy_score, + recall_score, + precision_score, confusion_matrix, ConfusionMatrixDisplay ) @@ -20,6 +22,9 @@ def __init__( use_accuracy: bool = True, use_f1_score: bool = True, use_balanced_accuracy: bool = False, + use_precision: bool = True, + use_recall: bool = True, + use_geometric_mean: bool = True, use_confusion_matrix: bool = True, use_classification_report: bool = False, plot_confusion_matrix: bool = True, @@ -32,6 +37,9 @@ def __init__( self.use_accuracy = use_accuracy self.use_f1_score = use_f1_score self.use_balanced_accuracy = use_balanced_accuracy + self.use_precision = use_precision + self.use_recall = use_recall + self.use_geometric_mean = use_geometric_mean self.use_confusion_matrix = use_confusion_matrix self.use_classification_report = use_classification_report self.plot_confusion_matrix = plot_confusion_matrix @@ -66,6 +74,28 @@ def evaluate( if self.use_balanced_accuracy: res = balanced_accuracy_score(y_true, y_pred) result["balanced accuracy"] = float(res) + if self.use_precision: + res = precision_score(y_true, y_pred, average="weighted") + result["precision (weighted)"] = float(res) + + res = precision_score(y_true, y_pred, average="micro") + result["precision (micro)"] = float(res) + + res = precision_score(y_true, y_pred, average="macro") + result["precision (macro)"] = float(res) + if self.use_recall: + res = recall_score(y_true, y_pred, average="weighted") + result["recall (weighted)"] = float(res) + + res = recall_score(y_true, y_pred, average="micro") + result["recall (micro)"] = float(res) + + res = recall_score(y_true, y_pred, average="macro") + result["recall (macro)"] = float(res) + + if self.use_geeometric_mean: + res = np.sqrt(recall_score(y_true, y_pred, average="weighted") * precision_score(y_true, y_pred, average="weighted")) + result["geometric mean"] = float(res) if self.use_confusion_matrix: res = confusion_matrix(y_true, y_pred) From 17f9df4ad449e5633a9535dc4e60db382710a1f3 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Wed, 17 Apr 2024 23:21:00 -0300 Subject: [PATCH 17/28] Update load data function --- src/librep/xai/xai.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 16e37fb..da1a546 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -28,7 +28,8 @@ # Function to load the dataset ############################################################################################################ def load_dataset( - dataset_name: str, + dst_train: str, + dst_test: str, reduce_on: str, normalization: str = None, path: Path = Path("../reducer_experiments/results/execution/transformed_data"), @@ -39,8 +40,10 @@ def load_dataset( Parameters ---------- - dataset_name: str - The name of the dataset + dst_train: str + The name of the train dataset + dst_test: str + The name of the test dataset reduce_on: str The name of the modality on which the dataset is reduced normalization: str @@ -55,7 +58,8 @@ def load_dataset( test: Dataset The test dataset """ - path = path / f"{dataset_name}-{reduce_on}" + path = path / f"{dst_train}_{dst_test}_{reduce_on}" + print(path) # Let's read files from the directory path with open(path / "train.pkl", "rb") as f: @@ -513,7 +517,8 @@ def lime_values_per_feature( ############################################################################################################ def calc_oracle_values( classifier: str, - dataset: str, + dst_train: str, + dst_test: str, reduce: str, latent_dim: int = 24, columns_to_remove: List[int] = [], @@ -529,8 +534,10 @@ def calc_oracle_values( ---------- classifier: str The name of the classifier - dataset: str - The name of the dataset + dst_train: str + The name of the train dataset + dst_test: str + The name of the test dataset reduce: str The name of the modality on which the dataset is reduced latent_dim: int @@ -554,7 +561,7 @@ def calc_oracle_values( accuracies = [] for dim in range(latent_dim): train, test = load_dataset( - dataset, reduce, normalization=normalization, path=Path(data_path) + dst_train, dst_test, reduce, normalization=None, path=Path(data_path) ) if columns_to_remove != []: train.X = np.delete(train.X, columns_to_remove, axis=1) From ef3893e3a6887c478a4d984030270e0b4dd8adf0 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Tue, 23 Apr 2024 09:06:55 -0300 Subject: [PATCH 18/28] Add more estimators --- src/librep/estimators/sklearn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/librep/estimators/sklearn.py b/src/librep/estimators/sklearn.py index 6ed5565..e5a8cfd 100644 --- a/src/librep/estimators/sklearn.py +++ b/src/librep/estimators/sklearn.py @@ -1,3 +1,5 @@ from sklearn.neighbors import * from sklearn.ensemble import * -from sklearn.svm import * \ No newline at end of file +from sklearn.svm import * +from sklearn.tree import * +from sklearn.linear_model import * \ No newline at end of file From a30cc1624831b02b24a4f4cb0533246f07852b27 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Tue, 23 Apr 2024 09:07:20 -0300 Subject: [PATCH 19/28] Fix geometric mean --- src/librep/metrics/report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/librep/metrics/report.py b/src/librep/metrics/report.py index 4690a00..fe107ad 100644 --- a/src/librep/metrics/report.py +++ b/src/librep/metrics/report.py @@ -93,7 +93,7 @@ def evaluate( res = recall_score(y_true, y_pred, average="macro") result["recall (macro)"] = float(res) - if self.use_geeometric_mean: + if self.use_geometric_mean: res = np.sqrt(recall_score(y_true, y_pred, average="weighted") * precision_score(y_true, y_pred, average="weighted")) result["geometric mean"] = float(res) From 5d9325785fda1ff17bd766aa354b7bde7bcba00a Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Tue, 23 Apr 2024 09:26:34 -0300 Subject: [PATCH 20/28] Add more transforms --- src/librep/transforms/__init__.py | 21 +++++++++++++++++---- src/librep/transforms/ica.py | 3 +++ src/librep/transforms/isomap.py | 3 +++ src/librep/transforms/kernel_pca.py | 3 +++ src/librep/transforms/lle.py | 3 +++ 5 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 src/librep/transforms/ica.py create mode 100644 src/librep/transforms/isomap.py create mode 100644 src/librep/transforms/kernel_pca.py create mode 100644 src/librep/transforms/lle.py diff --git a/src/librep/transforms/__init__.py b/src/librep/transforms/__init__.py index 5e42410..f7ca241 100644 --- a/src/librep/transforms/__init__.py +++ b/src/librep/transforms/__init__.py @@ -1,9 +1,22 @@ +from .autocorrelation import * +from .convaelstm import * +from .dimal import * from .fft import * +from .filter import * +from .ica import * +from .isomap import * +from .kernel_pca import * +from .lle import * from .lstm import * +from .pca import * +from .removeFrequencies import * from .resampler import * -from .umap import * -from .tsne import * -from .autocorrelation import * +from .reshaper import * +from .simclr_full import * +from .simclr_linear import * +from .simclr import * +from .spectrogram import * from .stats import * -from .removeFrequencies import * +from .topo_ae import * from .tsne import * +from .umap import * \ No newline at end of file diff --git a/src/librep/transforms/ica.py b/src/librep/transforms/ica.py new file mode 100644 index 0000000..d961529 --- /dev/null +++ b/src/librep/transforms/ica.py @@ -0,0 +1,3 @@ +from sklearn.decomposition import FastICA as reducer + +FastICA = reducer \ No newline at end of file diff --git a/src/librep/transforms/isomap.py b/src/librep/transforms/isomap.py new file mode 100644 index 0000000..3354e96 --- /dev/null +++ b/src/librep/transforms/isomap.py @@ -0,0 +1,3 @@ +from sklearn.manifold import Isomap as reducer + +Isomap = reducer \ No newline at end of file diff --git a/src/librep/transforms/kernel_pca.py b/src/librep/transforms/kernel_pca.py new file mode 100644 index 0000000..6ed3513 --- /dev/null +++ b/src/librep/transforms/kernel_pca.py @@ -0,0 +1,3 @@ +from sklearn.decomposition import KernelPCA as reducer + +KernelPCA = reducer \ No newline at end of file diff --git a/src/librep/transforms/lle.py b/src/librep/transforms/lle.py new file mode 100644 index 0000000..9327cf9 --- /dev/null +++ b/src/librep/transforms/lle.py @@ -0,0 +1,3 @@ +from sklearn.manifold import LocallyLinearEmbedding as reducer + +LocallyLinearEmbedding = reducer \ No newline at end of file From a20f660b5b5e395a2f6add0e9fb5e2496ed8bae3 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Tue, 23 Apr 2024 09:26:59 -0300 Subject: [PATCH 21/28] Import numpy --- src/librep/metrics/report.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/librep/metrics/report.py b/src/librep/metrics/report.py index fe107ad..c4aa025 100644 --- a/src/librep/metrics/report.py +++ b/src/librep/metrics/report.py @@ -11,6 +11,7 @@ ConfusionMatrixDisplay ) import matplotlib.pyplot as plt +import numpy as np from librep.base.evaluators import SupervisedEvaluator from librep.config.type_definitions import ArrayLike, PathLike From 25b4e3f66e3c10fbc5529a7432489539b2391328 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Tue, 23 Apr 2024 09:27:42 -0300 Subject: [PATCH 22/28] Remove print path --- src/librep/xai/xai.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index da1a546..5bf5963 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -59,7 +59,6 @@ def load_dataset( The test dataset """ path = path / f"{dst_train}_{dst_test}_{reduce_on}" - print(path) # Let's read files from the directory path with open(path / "train.pkl", "rb") as f: From 62284b0b1531d2ae012b381d213d3351a91966af Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Wed, 24 Apr 2024 09:56:49 -0300 Subject: [PATCH 23/28] Modify the shap values computation and generalize the dimensions --- src/librep/xai/xai.py | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 5bf5963..950342d 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -244,7 +244,7 @@ def calc_shap_values(model, test: PandasMultiModalDataset) -> np.ndarray: # Functions to filter the SHAP values ############################################################################################################ def shap_values_per_feature( - shap_values, activities: List[int], num_features: int = 24 + shap_values, activities: List[int] ) -> pd.DataFrame: """This function calculates the shap values for each feature, for each activity. For each activity, the shap values from a subset are the absolute average of the shap values of all samples in the subset. @@ -256,25 +256,36 @@ def shap_values_per_feature( The shap values for each sample in the test dataset activities: List[int] The list of activities - num_features: int - The number of features + Returns ------- df: pd.DataFrame The dataframe containing the shap values for each feature """ - fi = { - f"feature {i}": np.sum( - [ - np.mean(np.abs(shap_values[j][:, i])) - for j, activity in enumerate(activities) - ] + + # Define the number of features + num_features = shap_values[0].shape[1] + + values = {f'feature {i}': 0 for i in range(num_features)} + for i in range(num_features): + values[f'feature {i}'] = np.sum( + [np.abs(shap_values[j][:, i]).mean() for j in range(len(shap_values))] ) - for i in range(num_features) - } - df = pd.DataFrame(fi, index=[0]) + + # fi = { + # f"feature {i}": np.sum( + # [ + # np.mean(np.abs(shap_values[j][:, i])) + # for j, activity in enumerate(activities) + # ] + # ) + # for i in range(num_features) + # } + + # df = pd.DataFrame(fi, index=[0]) + df = pd.DataFrame(values, index=[0]) return df @@ -294,13 +305,17 @@ def shap_values_per_class(shap_values, activities: List[int]) -> pd.DataFrame: df: pd.DataFrame The dataframe containing the shap values for each feature """ - keys = ["activity"] + [f"feature {i}" for i in range(24)] + + # Define the number of features + num_features = shap_values[0].shape[1] + + keys = ["activity"] + [f"feature {i}" for i in range(num_features)] fi = {key: None for key in keys} fis = [] for j, activity in enumerate(activities): fi["activity"] = activity - for i in range(24): + for i in range(num_features): fi[f"feature {i}"] = np.mean(np.abs(shap_values[j][:, i])) fis.append(fi.copy()) From 6cf3086fcf5e168d9d20ad547ff42a5f7751caeb Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Thu, 25 Apr 2024 15:54:30 -0300 Subject: [PATCH 24/28] Update function --- src/librep/xai/xai.py | 44 ++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 950342d..08191f6 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -387,13 +387,14 @@ def calc_lime_values( ############################################################################################################ def lime_values_per_class( lime_values: List[Dict[str, Any]], - dataset: str, + dst_train: str, + dst_test: str, reduce: str, model_name: str, activities: List[int], standartized_codes: Dict[int, str], num_features: int = 24, - remove_misclassified: bool = True, + remove_misclassified: bool = False, ) -> pd.DataFrame: """This function calculates the lime values for each feature, for each activity. For each activity, the lime values from a subset are the absolute average of the feature importances of all samples in the subset. @@ -403,8 +404,10 @@ def lime_values_per_class( ---------- lime_values: List[Dict[str, Any]] The list of dictionaries containing the lime values for each sample in the test dataset - dataset: str - The name of the dataset + dst_train: str + The name of the train dataset + dst_test: str + The name of the test dataset reduce: str The name of the modality on which the dataset is reduced model_name: str @@ -427,7 +430,7 @@ def lime_values_per_class( dfs = {activity: None for activity in activities} fis = {activity: [] for activity in activities} - for j, lime_value in enumerate(lime_values): + for _, lime_value in enumerate(lime_values): # Calculate the feature importance for the sample activity = lime_value["True class"] lime_predict = lime_value["LIME prediction"] @@ -436,12 +439,13 @@ def lime_values_per_class( sample = np.array(sample) fi = np.abs(sample[:, 1]) model_predict = lime_value["Model prediction"] + + # If remove_misclassified is True, the samples misclassified by the model are removed if remove_misclassified: - fis[activity].append(fi) - else: - # Check if the model predicted correctly the sample if model_predict == activity: fis[activity].append(fi) + else: + fis[activity].append(fi) # Let's calculate the average of the feature importance for each activity columns = [f"feature {i}" for i in range(num_features)] @@ -450,7 +454,8 @@ def lime_values_per_class( fi_class = np.mean(fi_class, axis=0) df = pd.DataFrame([fi_class], columns=columns) df["Classifier"] = model_name - df["Dataset"] = dataset + df["Train"] = dst_train + df["Test"] = dst_test df["reduce on"] = reduce df["activity"] = standartized_codes[activity] dfs[activity] = df @@ -463,13 +468,14 @@ def lime_values_per_class( def lime_values_per_feature( lime_values: List[Dict[str, Any]], - dataset: str, + dst_train: str, + dst_test: str, reduce: str, model_name: str, activities: List[int], standartized_codes: Dict[int, str], num_features: int = 24, - remove_misclassified: bool = True, + remove_misclassified: bool = False, ) -> pd.DataFrame: """This function calculates the lime values for each feature, for each activity. For each activity, the lime values from a subset are the absolute average of the feature importances of all samples in the subset. @@ -479,8 +485,10 @@ def lime_values_per_feature( ---------- lime_values: List[Dict[str, Any]] The list of dictionaries containing the lime values for each sample in the test dataset - dataset: str - The name of the dataset + dst_train: str + The name of the train dataset + dst_test: str + The name of the test dataset reduce: str The name of the modality on which the dataset is reduced model_name: str @@ -502,7 +510,8 @@ def lime_values_per_feature( df = lime_values_per_class( lime_values, - dataset, + dst_train, + dst_test, reduce, model_name, activities, @@ -513,13 +522,14 @@ def lime_values_per_feature( dfs = [] df.drop(columns=["activity"], inplace=True) - for (classifier, dataset, reduce), sub_df in df.groupby( - ["Classifier", "Dataset", "reduce on"] + for (classifier, dst_train, dst_test, reduce), sub_df in df.groupby( + ["Classifier", "Train", "Test", "reduce on"] ): # Let's sum all lines sub_df = sub_df.sum(axis=0).to_frame().T sub_df["Classifier"] = classifier - sub_df["Dataset"] = dataset + sub_df["Train"] = dst_train + sub_df["Test"] = dst_test sub_df["reduce on"] = reduce dfs.append(sub_df) From fce27e0a8f46c5efc75ddfbc2a95423be5dd82de Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Thu, 25 Apr 2024 16:28:34 -0300 Subject: [PATCH 25/28] Rename feature name --- src/librep/xai/xai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index 08191f6..cf2f63f 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -354,7 +354,7 @@ def calc_lime_values( # This function calculates the lime values for each sample in the test dataset lime_values = [] num_features = train.X.shape[1] - features_names = [f"feature_{i}" for i in range(num_features)] + features_names = [f"f-{i}" for i in range(num_features)] explainer = lime_tabular.LimeTabularExplainer( train.X, feature_names=features_names, @@ -448,7 +448,7 @@ def lime_values_per_class( fis[activity].append(fi) # Let's calculate the average of the feature importance for each activity - columns = [f"feature {i}" for i in range(num_features)] + columns = [f"f-{i}" for i in range(num_features)] for activity in activities: fi_class = np.array(fis[activity]) fi_class = np.mean(fi_class, axis=0) From bca324c7cf3009013f4b5a1c3ec04a31c017f181 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Wed, 12 Jun 2024 17:01:45 -0300 Subject: [PATCH 26/28] Fix file --- src/librep/xai/xai.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index cf2f63f..d5cd837 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -273,18 +273,6 @@ def shap_values_per_feature( [np.abs(shap_values[j][:, i]).mean() for j in range(len(shap_values))] ) - - # fi = { - # f"feature {i}": np.sum( - # [ - # np.mean(np.abs(shap_values[j][:, i])) - # for j, activity in enumerate(activities) - # ] - # ) - # for i in range(num_features) - # } - - # df = pd.DataFrame(fi, index=[0]) df = pd.DataFrame(values, index=[0]) return df @@ -308,18 +296,17 @@ def shap_values_per_class(shap_values, activities: List[int]) -> pd.DataFrame: # Define the number of features num_features = shap_values[0].shape[1] - - keys = ["activity"] + [f"feature {i}" for i in range(num_features)] - - fi = {key: None for key in keys} fis = [] + values = {f'feature {i}': 0 for i in range(num_features)} for j, activity in enumerate(activities): - fi["activity"] = activity + values = {f'feature {i}': 0 for i in range(num_features)} for i in range(num_features): - fi[f"feature {i}"] = np.mean(np.abs(shap_values[j][:, i])) - fis.append(fi.copy()) + values[f'feature {i}'] = np.abs(shap_values[j][:, i]).mean() + values["activity"] = activity + fis.append(values) df = pd.DataFrame(fis) + return df From 6c78a31d6c44c9cb208ebd0de634c888d0a2ea80 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Thu, 27 Jun 2024 15:58:26 -0300 Subject: [PATCH 27/28] Remove unusen variabel --- src/librep/xai/xai.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index d5cd837..ea42783 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -32,7 +32,7 @@ def load_dataset( dst_test: str, reduce_on: str, normalization: str = None, - path: Path = Path("../reducer_experiments/results/execution/transformed_data"), + path: Path = Path(), ) -> Tuple[PandasMultiModalDataset, PandasMultiModalDataset]: """This function loads the dataset from the path. In particular, it loads the train and test files from the path: results/execution/output_files/reduced_data/{dataset_name}-{reduce_on}. @@ -244,9 +244,9 @@ def calc_shap_values(model, test: PandasMultiModalDataset) -> np.ndarray: # Functions to filter the SHAP values ############################################################################################################ def shap_values_per_feature( - shap_values, activities: List[int] + shap_values, ) -> pd.DataFrame: - """This function calculates the shap values for each feature, for each activity. + """This function calculates the shap values for each feature. For each activity, the shap values from a subset are the absolute average of the shap values of all samples in the subset. After that, the feature importance for each feature are the sum of the shap values of all activities. @@ -254,9 +254,6 @@ def shap_values_per_feature( ---------- shap_values: np.ndarray The shap values for each sample in the test dataset - activities: List[int] - The list of activities - Returns ------- From dcd8bd69ec0e0cfce623f0a3252325fe52a51ba1 Mon Sep 17 00:00:00 2001 From: Patrick Alves Date: Fri, 6 Sep 2024 16:33:02 -0300 Subject: [PATCH 28/28] Update xai file --- src/librep/xai/xai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/librep/xai/xai.py b/src/librep/xai/xai.py index ea42783..9d5721d 100644 --- a/src/librep/xai/xai.py +++ b/src/librep/xai/xai.py @@ -569,7 +569,7 @@ def calc_oracle_values( accuracies = [] for dim in range(latent_dim): train, test = load_dataset( - dst_train, dst_test, reduce, normalization=None, path=Path(data_path) + dst_train, dst_test, reduce, normalization=normalization, path=Path(data_path) ) if columns_to_remove != []: train.X = np.delete(train.X, columns_to_remove, axis=1)