From 7ce788617a3fdfcb216d055db02f2438bd627015 Mon Sep 17 00:00:00 2001 From: Joshua Gould Date: Thu, 19 Feb 2026 10:30:29 -0500 Subject: [PATCH 1/2] keep track what complexes pairs belong to --- scallops/features/map_eval.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/scallops/features/map_eval.py b/scallops/features/map_eval.py index 28b47b2..806c298 100644 --- a/scallops/features/map_eval.py +++ b/scallops/features/map_eval.py @@ -1,3 +1,4 @@ +from collections import defaultdict from typing import Literal import anndata @@ -30,22 +31,31 @@ def read_corum(path: str) -> pd.DataFrame: :param path: Path to CORUM CSV (e.g. corum_humanComplexes.txt). Available from https://mips.helmholtz-muenchen.de/corum/download - :return: Dataframe containing pairs of genes found in CORUM + :return: Dataframe containing pairs of genes found and complexes they belong to """ - corum_gene_names = pd.read_csv(path, usecols=["subunits_gene_name"], sep="\t")[ - "subunits_gene_name" - ].values + df = pd.read_csv(path, usecols=["complex_name", "subunits_gene_name"], sep="\t") + corum_gene_names = df["subunits_gene_name"].values + complex_names = df["complex_name"].values pairs = set() + pair_to_complex_names = defaultdict(set) + for i in range(len(corum_gene_names)): cluster = corum_gene_names[i].split(";") + complex_name = complex_names[i] for j in range(len(cluster)): for k in range(j): - pairs.add((cluster[j], cluster[k])) - pairs.add((cluster[k], cluster[j])) + p1 = (cluster[j], cluster[k]) + p2 = (cluster[k], cluster[j]) + pairs.add(p1) + pairs.add(p2) + pair_to_complex_names[p1].add(complex_name) + pair_to_complex_names[p2].add(complex_name) a = [] b = [] + c = [] for p in pairs: a.append(p[0]) b.append(p[1]) - return pd.DataFrame(data=dict(a=a, b=b)) + c.append(pair_to_complex_names[p]) + return pd.DataFrame(data=dict(a=a, b=b, complex_name=c)) From cb0cacbeed5c3d2df6a4d53a28469be12a58574f Mon Sep 17 00:00:00 2001 From: Joshua Gould Date: Fri, 20 Feb 2026 11:26:31 -0500 Subject: [PATCH 2/2] Fixed conflicts with main --- scallops/features/map_eval.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/scallops/features/map_eval.py b/scallops/features/map_eval.py index 91c1e5a..c341a51 100644 --- a/scallops/features/map_eval.py +++ b/scallops/features/map_eval.py @@ -1,3 +1,4 @@ +from collections import defaultdict from collections.abc import Callable, Sequence from typing import Literal @@ -96,22 +97,31 @@ def read_corum(path: str) -> pd.DataFrame: :param path: Path to CORUM CSV (e.g. corum_humanComplexes.txt). Available from https://mips.helmholtz-muenchen.de/corum/download - :return: Dataframe containing pairs of genes found in CORUM + :return: Dataframe containing pairs of genes found and complexes they belong to """ - corum_gene_names = pd.read_csv(path, usecols=["subunits_gene_name"], sep="\t")[ - "subunits_gene_name" - ].values + df = pd.read_csv(path, usecols=["complex_name", "subunits_gene_name"], sep="\t") + corum_gene_names = df["subunits_gene_name"].values + complex_names = df["complex_name"].values pairs = set() + pair_to_complex_names = defaultdict(set) + for i in range(len(corum_gene_names)): cluster = corum_gene_names[i].split(";") + complex_name = complex_names[i] for j in range(len(cluster)): for k in range(j): - pairs.add((cluster[j], cluster[k])) - pairs.add((cluster[k], cluster[j])) + p1 = (cluster[j], cluster[k]) + p2 = (cluster[k], cluster[j]) + pairs.add(p1) + pairs.add(p2) + pair_to_complex_names[p1].add(complex_name) + pair_to_complex_names[p2].add(complex_name) a = [] b = [] + c = [] for p in pairs: a.append(p[0]) b.append(p[1]) - return pd.DataFrame(data=dict(a=a, b=b)) + c.append(pair_to_complex_names[p]) + return pd.DataFrame(data=dict(a=a, b=b, complex_name=c))