-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconstruct_benchmark.py
More file actions
84 lines (69 loc) · 3.46 KB
/
construct_benchmark.py
File metadata and controls
84 lines (69 loc) · 3.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import h5py
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
def _unit_rows(embs):
    """Return a copy of ``embs`` with every row scaled to unit L2 norm.

    Args:
        embs: 2D array-like with one embedding per row.

    Returns:
        A new array of the same shape. Rows whose norm is zero are returned
        unchanged (all zeros) rather than being divided by zero.
    """
    embs = np.asarray(embs)
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    # Avoid 0/0 -> NaN for all-zero rows; dividing by 1 keeps them at zero, so
    # their dot product with any other row is 0.
    norms[norms == 0] = 1
    return embs / norms


def _top_k_neighbours(query_embs, ref_embs, top_k, desc=None):
    """Find the ``top_k`` most cosine-similar reference rows for each query row.

    Args:
        query_embs: 2D array with one query embedding per row.
        ref_embs: 2D array with one reference embedding per row.
        top_k: Number of neighbours to keep per query. It is clamped to the
            number of reference rows when fewer references are available.
        desc: Optional label shown on the progress bar.

    Returns:
        A ``(top_indices, top_similarities)`` tuple of arrays, both of shape
        ``(n_queries, k)`` with ``k = min(top_k, n_refs)``. Each row is ordered
        from most to least similar.

    Raises:
        ValueError: If ``top_k`` is smaller than 1 or ``ref_embs`` has no rows.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")
    n_refs = len(ref_embs)
    if n_refs == 0:
        raise ValueError("Reference embeddings are empty; nothing to search")
    # np.argpartition rejects a kth outside the array, so never ask for more
    # neighbours than there are reference rows.
    k = min(top_k, n_refs)

    # Cosine similarity is the dot product of L2-normalised vectors. Normalising
    # both sides once up front avoids re-normalising the whole reference matrix
    # for every single query.
    ref_unit = _unit_rows(ref_embs)
    query_unit = _unit_rows(query_embs)

    n = len(query_unit)
    top_similarities = np.zeros((n, k))
    top_indices = np.zeros((n, k), dtype=int)

    # Process one query vector at a time so that only a single vector of
    # n_refs similarities is held in memory at once.
    for i in tqdm(range(n), desc=desc):
        similarities = ref_unit @ query_unit[i]
        # argpartition selects the k largest values without a full sort; only
        # those k candidates are then sorted in descending order.
        idx = np.argpartition(similarities, -k)[-k:]
        idx = idx[np.argsort(-similarities[idx])]
        top_indices[i] = idx
        top_similarities[i] = similarities[idx]

    return top_indices, top_similarities


def main():
    """Build the nearest-neighbour benchmark files for the DreaMS embeddings.

    The script works in two stages:

    1. Every reference HDF5 file and the query HDF5 file is converted to a
       ``.npy`` array (read from the ``"DreaMS_embedding"`` dataset) next to
       the original file. Files whose ``.npy`` already exists are skipped.
    2. For each reference set, the top 50 reference embeddings by cosine
       similarity are computed for every query embedding and saved to
       ``<reference>.benchmark.npy`` as a single array of shape
       ``(n_queries, 2, k)``: ``[:, 0, :]`` holds the reference indices and
       ``[:, 1, :]`` the matching similarities.
    """
    print("Starting benchmark construction...")
    print("Sampling 1k random DreaMS embeddings from MassSpecGym dataset")

    # NOTE: the sampling step itself is disabled; the query file below is
    # expected to exist already. It was produced with:
    # from dreams.utils.io import sample_hdf
    # sample_hdf(
    #     hdf_pth=Path("data/MassSpecGym_DreaMS.hdf5"),
    #     out_pth=Path("data/MassSpecGym_DreaMS_rand1k.hdf5"),
    #     n_samples=1_000,
    # )

    # Prepare paths to reference and query DreaMS embeddings
    ref_pths = [
        Path("data/GeMS_A1_DreaMS_rand50k.hdf5"),
        Path("data/GeMS_A1_DreaMS_rand500k.hdf5"),
        Path("data/GeMS_A1_DreaMS_rand5M.hdf5"),
    ]
    query_pth = Path("data/MassSpecGym_DreaMS_rand1k.hdf5")

    print("\nConverting HDF5 files to numpy arrays...")
    # Store all embeddings in numpy arrays compatible with matchms
    # EmbeddingBaseSimilarity codebase
    for pth in ref_pths + [query_pth]:
        npy_pth = pth.with_suffix(".npy")
        if npy_pth.exists():
            # Built outside the f-string: reusing the enclosing quote character
            # inside a replacement field is a SyntaxError before Python 3.12.
            print(f"Skipping {npy_pth} because it already exists")
            continue

        print(f"Processing {pth}")
        with h5py.File(pth, "r") as f:
            # [:] reads the whole dataset into memory as a numpy array
            embs = f["DreaMS_embedding"][:]
        np.save(npy_pth, embs)
        print(f"Saved numpy array with shape {embs.shape}")

    print("\nPre-computing similarities between query and reference embeddings...")
    top_k = 50
    # The query set is the same for every reference set, so load it once.
    query_embs = np.load(query_pth.with_suffix(".npy"))

    for ref_pth in ref_pths:
        print(f"\nProcessing reference dataset: {ref_pth}")

        ref_embs = np.load(ref_pth.with_suffix(".npy"))
        print(f"Loaded query embeddings: {query_embs.shape}")
        print(f"Loaded reference embeddings: {ref_embs.shape}")

        n = len(query_embs)  # Use all query embeddings
        top_indices, top_similarities = _top_k_neighbours(
            query_embs,
            ref_embs,
            top_k,
            desc=f"Computing top {top_k} similarities for {n} queries...",
        )

        # Save results as a single matrix of shape (n, 2, k). Stacking the
        # integer indices with the float similarities stores both as floats.
        results = np.stack([top_indices, top_similarities], axis=1)
        benchmark_pth = ref_pth.with_suffix(".benchmark.npy")
        np.save(benchmark_pth, results)
        print(f"Saved benchmark results to {benchmark_pth}")

        print(f"\nResults for {ref_pth}:")
        print("Shape of query embeddings:", query_embs.shape)
        print("Shape of reference embeddings:", ref_embs.shape)
        print("Shape of top similarities per vector:", top_similarities.shape)
        print("Shape of top indices per vector:", top_indices.shape)
# Script entry point: run the benchmark construction only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()