def recall(
    true_positives_df: pd.DataFrame,
    similarity_df: pd.DataFrame,
    similarity_column_true_positives: str = "value",
    similarity_column: str = "value",
    quantiles: Sequence[float] = (0.01, 0.05),
    two_sided: bool = True,
    n_true_positives: Callable[[pd.DataFrame], int] = len,
) -> pd.DataFrame:
    """Compute recall at the specified quantiles.

    Thresholds are the requested quantiles of `similarity_df[similarity_column]`.
    A true positive counts as retrieved when its similarity lies in the selected
    tail(s) of that distribution:

    * two-sided: similarity <= quantile ``q`` or similarity >= quantile ``1 - q``;
    * one-sided: similarity <= quantile ``q`` when ``q <= 0.5``, otherwise
      similarity >= quantile ``q``.

    :param true_positives_df: Dataframe containing true positive pairwise similarities
        (e.g. relationships from CORUM)
    :param similarity_df: Dataframe containing all pairwise similarities from which
        quantiles are computed.
    :param similarity_column_true_positives: Column in `true_positives_df` containing
        similarity scores.
    :param similarity_column: Column in `similarity_df` containing similarity scores.
    :param quantiles: Quantiles (each in [0, 1]) used to extract relevant gene pairs.
        A single float is also accepted. Repeated values are allowed.
    :param two_sided: If two-sided, recall is computed at specific quantiles
        and 1-quantiles.
    :param n_true_positives: Function that accepts a dataframe and returns the
        number of true positives. Default is the length of the dataframe
    :return: Dataframe with one row per requested quantile and the columns
        ``quantile`` and ``recall``. ``recall`` is NaN when `n_true_positives`
        reports zero true positives for `true_positives_df`.
    """
    requested = np.atleast_1d(np.asarray(quantiles, dtype=float))
    n_requested = len(requested)
    all_scores = similarity_df[similarity_column]
    tp_scores = true_positives_df[similarity_column_true_positives]
    n_relevant = n_true_positives(true_positives_df)

    # A single quantile() call covers both tails. The result is read by position
    # (not by float label) so repeated labels, e.g. q == 1 - q == 0.5, still
    # yield scalar thresholds.
    if two_sided:
        thresholds = all_scores.quantile(
            np.concatenate((requested, 1 - requested))
        ).to_numpy()
        lower = thresholds[:n_requested]
        upper = thresholds[n_requested:]
    else:
        lower = upper = all_scores.quantile(requested).to_numpy()

    results = []
    for q, low, high in zip(requested, lower, upper):
        if two_sided:
            retrieved_mask = (tp_scores >= high) | (tp_scores <= low)
        elif q <= 0.5:
            retrieved_mask = tp_scores <= low
        else:
            retrieved_mask = tp_scores >= high
        n_relevant_retrieved = n_true_positives(true_positives_df[retrieved_mask])
        # Recall is undefined without any true positives; report NaN instead of
        # failing with a division by zero.
        recall_value = (
            n_relevant_retrieved / n_relevant if n_relevant != 0 else float("nan")
        )
        results.append([float(q), recall_value])
    return pd.DataFrame(results, columns=["quantile", "recall"])