-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
129 lines (107 loc) · 4.1 KB
/
config.py
File metadata and controls
129 lines (107 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import sys
from pathlib import Path

from dotenv import load_dotenv
# ------------------------------------------------------------
# Run configuration
# ------------------------------------------------------------
# Name of this run; it is also the directory name used under "runs/".
RUN_NAME = "wiki_mpnet_10m"
# Filename prefix shared by the per-run files declared below.
FILE_PREFIX = "wiki_mpnet_embeddings"
# Will delete intermediate files produced by zero removal, etc.
CLEANUP_INTERMEDIATE_FVECS = True
RUN_DIR = Path("runs", RUN_NAME)

# ------------------------------------------------------------
# Working files. Do not update.
# ------------------------------------------------------------
# Input of the zero-removal stage.
RAW_BASE_FVECS = RUN_DIR.joinpath(f"{FILE_PREFIX}_raw_base.fvecs")
# Output of zero removal, input of normalization.
NONZERO_BASE_FVECS = RUN_DIR.joinpath(f"{FILE_PREFIX}_nonzero_base.fvecs")
# Output of normalization, input of deduplication.
NORMALIZED_BASE_FVECS = RUN_DIR.joinpath(f"{FILE_PREFIX}_normalized_base.fvecs")
# Output of deduplication, input of the split stage.
DEDUP_BASE_FVECS = RUN_DIR.joinpath(f"{FILE_PREFIX}_base.fvecs")
# Query/base files handed to the ground-truth stage.
# NOTE(review): presumably written by fvecs_split.py next to its input - confirm.
SPLIT_QUERY_FVECS = RUN_DIR.joinpath(f"{FILE_PREFIX}_base_query.fvecs")
SPLIT_BASE_FVECS = RUN_DIR.joinpath(f"{FILE_PREFIX}_base_base.fvecs")
# Sibling directories of the deduplicated file, named "<stem>_qparts" and
# "<stem>_bparts".
SPLIT_QPARTS_DIR = DEDUP_BASE_FVECS.with_name(DEDUP_BASE_FVECS.stem + "_qparts")
SPLIT_BPARTS_DIR = DEDUP_BASE_FVECS.with_name(DEDUP_BASE_FVECS.stem + "_bparts")
# Passed to knn_utils.py as --processed_base_out / --processed_query_out.
GT_PROCESSED_BASE_FVECS = RUN_DIR.joinpath(f"{FILE_PREFIX}_gt_processed_base.fvecs")
GT_PROCESSED_QUERY_FVECS = RUN_DIR.joinpath(f"{FILE_PREFIX}_gt_processed_query.fvecs")
# Passed to knn_utils.py as --output.
GROUND_TRUTH_FILE = RUN_DIR.joinpath("ground_truth.ivecs")
# ------------------------------------------------------------
# Passed to fvecs_deduplicator.py as --report_file and --temp_dir.
DEDUP_REPORT = RUN_DIR.joinpath(f"{FILE_PREFIX}_dedup_report.txt")
DEDUP_TEMP_DIR = RUN_DIR.joinpath(f"{FILE_PREFIX}_dedup_temp")

# Set an integer target number. The final count may be less due to zero
# removal and dedup.
NUM_QUERY = 10_000
# Set an integer for truncation (otherwise all listed input files will be
# processed). The final count may be less due to zero removal and dedup.
NUM_BASE = 10_000_000

# Value passed to knn_utils.py as --k.
GT_K = 100
# "ip" or "l2"
GT_METRIC = "ip"
# When truthy, --shuffle is added to the ground-truth command.
GT_SHUFFLE = True
# "-1" for CPU, e.g. "0" for one GPU, "0,1" for multi-GPU
GT_GPUS = "-1"
# The file name records the metric and k in use.
FINAL_GROUND_TRUTH = RUN_DIR.joinpath(f"{FILE_PREFIX}_gt_{GT_METRIC}_{GT_K}.ivecs")

LOG_FILE = RUN_DIR.joinpath("pipeline.log")
SUMMARY_FILE = RUN_DIR.joinpath("summary.json")
OVERWRITE = False
# ------------------------------------------------------------
# Input data
# ------------------------------------------------------------
SOURCE_TYPE = "npy"

# Pull variables from a .env file (when python-dotenv locates one) into the
# process environment, so DATASET_ROOT can be supplied there instead of being
# exported in the shell.
load_dotenv()

# DATASET_ROOT is mandatory: fail at import time with an actionable message
# rather than building input paths from a missing or empty value.
dataset_root = os.environ.get("DATASET_ROOT")
if not dataset_root:
    raise RuntimeError(
        "DATASET_ROOT is not set. "
        "Example: export DATASET_ROOT=/path/to/your/datasets"
    )

DATASET_ROOT = Path(dataset_root)
DATASET_NAME = "mpnet-43m"  # Just an example. Put whatever you'd like
EMBED_SUBDIR = "data/en/embs"
# Directory holding the embedding shards: <root>/<dataset name>/<embed subdir>.
DATASET_DIR = DATASET_ROOT / DATASET_NAME / EMBED_SUBDIR
# Twelve shards named emb_000.npy ... emb_011.npy.
# Match the file naming conventions from your download.
INPUT_FILES = [DATASET_DIR / f"emb_{i:03d}.npy" for i in range(12)]
# ------------------------------------------------------------
# External stage commands
# ------------------------------------------------------------
# Every command is an argv-style list: the current interpreter, "-u"
# (unbuffered stdout/stderr), the stage script, then that script's arguments.

# Zero removal: RAW_BASE_FVECS -> NONZERO_BASE_FVECS.
REMOVE_ZEROS_CMD = (
    [sys.executable, "-u", "fvecs_remove_zeros.py"]
    + ["--input", str(RAW_BASE_FVECS)]
    + ["--output", str(NONZERO_BASE_FVECS)]
)

# Normalization: NONZERO_BASE_FVECS -> NORMALIZED_BASE_FVECS.
NORMALIZE_CMD = (
    [sys.executable, "-u", "fvecs_normalize.py"]
    + ["--input", str(NONZERO_BASE_FVECS)]
    + ["--output", str(NORMALIZED_BASE_FVECS)]
)

# Deduplication: the input file is positional, everything else is an option.
DEDUP_CMD = (
    [sys.executable, "-u", "fvecs_deduplicator.py", str(NORMALIZED_BASE_FVECS)]
    + ["--output", str(DEDUP_BASE_FVECS)]
    + ["--report_file", str(DEDUP_REPORT)]
    + ["--reporting_threshold", "1"]
    + ["--chunk_size", "200000"]
    + ["--temp_dir", str(DEDUP_TEMP_DIR)]
)

# Split: the deduplicated file is positional; the seed is fixed at 47.
SPLIT_CMD = (
    [sys.executable, "-u", "fvecs_split.py", str(DEDUP_BASE_FVECS)]
    + ["--num_query", str(NUM_QUERY)]
    + ["--seed", "47"]
)
# Ground-truth stage: runs knn_utils.py on the split base/query files with the
# current interpreter ("-u" keeps its stdout/stderr unbuffered).
GROUND_TRUTH_CMD = (
    [sys.executable, "-u", "knn_utils.py"]
    + ["--base", str(SPLIT_BASE_FVECS)]
    + ["--query", str(SPLIT_QUERY_FVECS)]
    + ["--output", str(GROUND_TRUTH_FILE)]
    + ["--processed_base_out", str(GT_PROCESSED_BASE_FVECS)]
    + ["--processed_query_out", str(GT_PROCESSED_QUERY_FVECS)]
    + ["--k", str(GT_K)]
    + ["--metric", GT_METRIC]
    + ["--gpus", GT_GPUS]
    # The bare --shuffle flag is included only when GT_SHUFFLE is truthy.
    + (["--shuffle"] if GT_SHUFFLE else [])
)