From dbd89ec97a0ba45825f70b0550d9d06b6c310738 Mon Sep 17 00:00:00 2001 From: Neng Li Date: Tue, 19 May 2026 09:34:29 -0400 Subject: [PATCH 1/5] add ensure_parent_dir --- src/tevatron/utils/io.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 src/tevatron/utils/io.py diff --git a/src/tevatron/utils/io.py b/src/tevatron/utils/io.py new file mode 100644 index 00000000..e4d21011 --- /dev/null +++ b/src/tevatron/utils/io.py @@ -0,0 +1,18 @@ +import os + + +def ensure_parent_dir(path): + """Create parent directories for a filesystem path (``mkdir -p`` semantics). + + Intended for output *file* paths: ensures the containing directory exists + before long-running encode/search/rerank work so runs fail fast on + permission errors instead of after compute. + + No-op when *path* is falsy or resolves to having no parent (e.g. + ``output.pkl`` in the current directory). + """ + if not path: + return + parent = os.path.dirname(os.path.normpath(path)) + if parent: + os.makedirs(parent, exist_ok=True) From ae133140846c54d2a19005f66b9f276207194f11 Mon Sep 17 00:00:00 2001 From: Neng Li Date: Tue, 19 May 2026 09:35:28 -0400 Subject: [PATCH 2/5] call ensure_parent_dir --- scripts/dataset_transform_scripts/MultiVENT2.0.py | 4 ++-- scripts/hn_mining.py | 4 ++++ scripts/reduce_results.py | 3 +++ src/tevatron/reranker/driver/rerank.py | 3 +++ src/tevatron/retriever/driver/encode.py | 3 +++ src/tevatron/retriever/driver/encode_mm.py | 3 +++ src/tevatron/retriever/driver/jax_encode.py | 4 ++++ src/tevatron/retriever/driver/search.py | 5 +++++ src/tevatron/retriever/driver/vllm_encode.py | 3 +++ src/tevatron/retriever/driver/vllm_encode_mm.py | 3 +++ src/tevatron/utils/format/convert_result_to_marco.py | 4 ++++ src/tevatron/utils/format/convert_result_to_trec.py | 4 ++++ src/tevatron/utils/format/prepare_rerank_input.py | 3 +++ 13 files changed, 44 insertions(+), 2 deletions(-) diff --git a/scripts/dataset_transform_scripts/MultiVENT2.0.py b/scripts/dataset_transform_scripts/MultiVENT2.0.py index bc761287..68ad66a6 100644 --- a/scripts/dataset_transform_scripts/MultiVENT2.0.py +++ b/scripts/dataset_transform_scripts/MultiVENT2.0.py @@ -19,6 +19,7 @@ import math import datasets +from tevatron.utils.io import ensure_parent_dir TRAIN_QUERY_FN = "multivent_2_train_queries.csv" TRAIN_JUDGEMENT_FN = "multivent_2_train_judgments.jsonl" @@ -56,8 +57,7 @@ def _form_training_entry(qid, query, positive_doc_ids, negative_doc_ids): "source": "MultiVENT2.0", } - output_dir = os.path.dirname(output_fn) - os.makedirs(output_dir, exist_ok=True) + ensure_parent_dir(output_fn) with open(output_fn, "w") as f: for qid, query in tqdm(qid2query.items(), desc="Forming training data"): positive_doc_ids = query2positive_doc[qid] diff --git a/scripts/hn_mining.py b/scripts/hn_mining.py index b4460585..ae49ccd2 100644 --- a/scripts/hn_mining.py +++ b/scripts/hn_mining.py @@ -1,5 +1,7 @@ import json from argparse import ArgumentParser + +from tevatron.utils.io import ensure_parent_dir from datasets import load_dataset, concatenate_datasets from multiprocessing import Manager from tqdm import tqdm @@ -88,6 +90,8 @@ def __call__(self, example): parser.add_argument('--regex', action='store_true', required=False) args = parser.parse_args() + ensure_parent_dir(args.output) + train_data = load_dataset(args.train_data_name, cache_dir=args.cache_dir)['train'] corpus_data = load_dataset(args.corpus_data_name, cache_dir=args.cache_dir)['train'] if args.em: diff --git a/scripts/reduce_results.py b/scripts/reduce_results.py index d0c1a26c..07c75281 100644 --- a/scripts/reduce_results.py +++ b/scripts/reduce_results.py @@ -1,12 +1,15 @@ import argparse import os +from tevatron.utils.io import ensure_parent_dir + parser = argparse.ArgumentParser(description='Reduce retrieval results from multiple shards.') parser.add_argument('--results_dir', type=str, help='Directory that contains results from all shards', required=True) parser.add_argument('--output', help='Path to final results file', required=True) parser.add_argument('--depth', type=int, help='Number of retrieved doc for each query', required=False, default=100) args = parser.parse_args() +ensure_parent_dir(args.output) all_results = {} print(f'Merging results from {len(os.listdir(args.results_dir))} result files.') diff --git a/src/tevatron/reranker/driver/rerank.py b/src/tevatron/reranker/driver/rerank.py index 2a63b1de..2e98afc7 100644 --- a/src/tevatron/reranker/driver/rerank.py +++ b/src/tevatron/reranker/driver/rerank.py @@ -18,6 +18,7 @@ from tevatron.reranker.dataset import RerankerInferenceDataset from tevatron.reranker.modeling import RerankerModel from tevatron.reranker.collator import RerankerInferenceCollator +from tevatron.utils.io import ensure_parent_dir logger = logging.getLogger(__name__) @@ -35,6 +36,8 @@ def main(): if training_args.local_rank > 0 or training_args.n_gpu > 1: raise NotImplementedError('Multi-GPU encoding is not supported.') + ensure_parent_dir(data_args.rerank_output_path) + # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/src/tevatron/retriever/driver/encode.py b/src/tevatron/retriever/driver/encode.py index 8749dfda..0e64d373 100644 --- a/src/tevatron/retriever/driver/encode.py +++ b/src/tevatron/retriever/driver/encode.py @@ -18,6 +18,7 @@ from tevatron.retriever.arguments import ModelArguments, DataArguments, \ TevatronTrainingArguments as TrainingArguments from tevatron.retriever.dataset import EncodeDataset +from tevatron.utils.io import ensure_parent_dir from tevatron.retriever.collator import EncodeCollator from tevatron.retriever.modeling import EncoderOutput, DenseModel @@ -37,6 +38,8 @@ def main(): if training_args.local_rank > 0 or training_args.n_gpu > 1: raise NotImplementedError('Multi-GPU encoding is not supported.') + ensure_parent_dir(data_args.encode_output_path) + # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/src/tevatron/retriever/driver/encode_mm.py b/src/tevatron/retriever/driver/encode_mm.py index c3c7746f..ff393ffe 100644 --- a/src/tevatron/retriever/driver/encode_mm.py +++ b/src/tevatron/retriever/driver/encode_mm.py @@ -18,6 +18,7 @@ from tevatron.retriever.arguments import ModelArguments, DataArguments, \ TevatronTrainingArguments as TrainingArguments from tevatron.retriever.dataset import EncodeDataset +from tevatron.utils.io import ensure_parent_dir from tevatron.retriever.collator import MultiModalEncodeCollator from tevatron.retriever.modeling import EncoderOutput, MultiModalDenseModel @@ -37,6 +38,8 @@ def main(): if training_args.local_rank > 0 or training_args.n_gpu > 1: raise NotImplementedError('Multi-GPU encoding is not supported.') + ensure_parent_dir(data_args.encode_output_path) + # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/src/tevatron/retriever/driver/jax_encode.py b/src/tevatron/retriever/driver/jax_encode.py index a08a7ccf..2a0bfc1b 100644 --- a/src/tevatron/retriever/driver/jax_encode.py +++ b/src/tevatron/retriever/driver/jax_encode.py @@ -21,6 +21,8 @@ from transformers import (AutoConfig, AutoTokenizer, FlaxAutoModel, HfArgumentParser, TensorType) +from tevatron.utils.io import ensure_parent_dir + logger = logging.getLogger(__name__) @@ -34,6 +36,8 @@ def main(): data_args: DataArguments training_args: TrainingArguments + ensure_parent_dir(data_args.encode_output_path) + # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/src/tevatron/retriever/driver/search.py b/src/tevatron/retriever/driver/search.py index 1f374eac..e5ed8255 100644 --- a/src/tevatron/retriever/driver/search.py +++ b/src/tevatron/retriever/driver/search.py @@ -8,6 +8,7 @@ import faiss from tevatron.retriever.searcher import FaissFlatSearcher +from tevatron.utils.io import ensure_parent_dir import logging logger = logging.getLogger(__name__) @@ -30,6 +31,7 @@ def search_queries(retriever, q_reps, p_lookup, args): def write_ranking(corpus_indices, corpus_scores, q_lookup, ranking_save_file): + ensure_parent_dir(ranking_save_file) with open(ranking_save_file, 'w') as f: for qid, q_doc_scores, q_doc_indices in zip(q_lookup, corpus_scores, corpus_indices): score_list = [(s, idx) for s, idx in zip(q_doc_scores, q_doc_indices)] @@ -45,6 +47,7 @@ def pickle_load(path): def pickle_save(obj, path): + ensure_parent_dir(path) with open(path, 'wb') as f: pickle.dump(obj, f) @@ -61,6 +64,8 @@ def main(): args = parser.parse_args() + ensure_parent_dir(args.save_ranking_to) + index_files = glob.glob(args.passage_reps) logger.info(f'Pattern match found {len(index_files)} files; loading them into index.') diff --git a/src/tevatron/retriever/driver/vllm_encode.py b/src/tevatron/retriever/driver/vllm_encode.py index 665fb56c..7c2a25a3 100644 --- a/src/tevatron/retriever/driver/vllm_encode.py +++ b/src/tevatron/retriever/driver/vllm_encode.py @@ -15,6 +15,7 @@ from tevatron.retriever.arguments import ModelArguments, DataArguments, \ TevatronTrainingArguments as TrainingArguments from tevatron.retriever.dataset import EncodeDataset +from tevatron.utils.io import ensure_parent_dir from tevatron.retriever.collator import VllmEncodeCollator from vllm import LLM from vllm.config import PoolerConfig @@ -37,6 +38,8 @@ def main(): if training_args.local_rank > 0 or training_args.n_gpu > 1: raise NotImplementedError('Multi-GPU encoding is not supported.') + ensure_parent_dir(data_args.encode_output_path) + # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/src/tevatron/retriever/driver/vllm_encode_mm.py b/src/tevatron/retriever/driver/vllm_encode_mm.py index 5498e55c..84194c67 100644 --- a/src/tevatron/retriever/driver/vllm_encode_mm.py +++ b/src/tevatron/retriever/driver/vllm_encode_mm.py @@ -15,6 +15,7 @@ from tevatron.retriever.arguments import ModelArguments, DataArguments, \ TevatronTrainingArguments as TrainingArguments from tevatron.retriever.dataset import EncodeDataset +from tevatron.utils.io import ensure_parent_dir from tevatron.retriever.collator import VllmMultiModalEncodeCollator from vllm import LLM from vllm.config import PoolerConfig @@ -37,6 +38,8 @@ def main(): if training_args.local_rank > 0 or training_args.n_gpu > 1: raise NotImplementedError('Multi-GPU encoding is not supported.') + ensure_parent_dir(data_args.encode_output_path) + # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", diff --git a/src/tevatron/utils/format/convert_result_to_marco.py b/src/tevatron/utils/format/convert_result_to_marco.py index 9bf93ce3..528c40d9 100644 --- a/src/tevatron/utils/format/convert_result_to_marco.py +++ b/src/tevatron/utils/format/convert_result_to_marco.py @@ -1,10 +1,14 @@ from argparse import ArgumentParser +from tevatron.utils.io import ensure_parent_dir + parser = ArgumentParser() parser.add_argument('--input', type=str, required=True) parser.add_argument('--output', type=str, required=True) args = parser.parse_args() +ensure_parent_dir(args.output) + with open(args.input) as f_in, open(args.output, 'w') as f_out: cur_qid = None rank = 0 diff --git a/src/tevatron/utils/format/convert_result_to_trec.py b/src/tevatron/utils/format/convert_result_to_trec.py index c1664418..6a2a4316 100644 --- a/src/tevatron/utils/format/convert_result_to_trec.py +++ b/src/tevatron/utils/format/convert_result_to_trec.py @@ -1,11 +1,15 @@ from argparse import ArgumentParser +from tevatron.utils.io import ensure_parent_dir + parser = ArgumentParser() parser.add_argument('--input', type=str, required=True) parser.add_argument('--output', type=str, required=True) parser.add_argument('--remove_query', action='store_true') args = parser.parse_args() +ensure_parent_dir(args.output) + with open(args.input) as f_in, open(args.output, 'w') as f_out: cur_qid = None rank = 0 diff --git a/src/tevatron/utils/format/prepare_rerank_input.py b/src/tevatron/utils/format/prepare_rerank_input.py index 0054cb6b..066da0b5 100644 --- a/src/tevatron/utils/format/prepare_rerank_input.py +++ b/src/tevatron/utils/format/prepare_rerank_input.py @@ -3,6 +3,8 @@ from datasets import load_dataset from tqdm import tqdm +from tevatron.utils.io import ensure_parent_dir + def read_result(path): retrieval_results = {} with open(path) as f: @@ -29,6 +31,7 @@ def read_result(path): parser.add_argument('--cache_dir', type=str, required=False) args = parser.parse_args() +ensure_parent_dir(args.output_path) query_data = load_dataset(args.query_data_name, cache_dir=args.cache_dir)[args.query_data_split] corpus_data = load_dataset(args.corpus_data_name, cache_dir=args.cache_dir)['train'] query_id_map = {} From b4c6889166861cb250aeb80335b42fde303d5199 Mon Sep 17 00:00:00 2001 From: Neng Li Date: Tue, 26 May 2026 14:44:54 -0400 Subject: [PATCH 3/5] model_revision argument --- src/tevatron/retriever/arguments.py | 5 +++++ src/tevatron/retriever/dataset.py | 1 + src/tevatron/retriever/driver/train.py | 20 +++++++++++++++----- src/tevatron/retriever/modeling/encoder.py | 5 +++++ 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/tevatron/retriever/arguments.py b/src/tevatron/retriever/arguments.py index 95724d38..9f54d785 100644 --- a/src/tevatron/retriever/arguments.py +++ b/src/tevatron/retriever/arguments.py @@ -19,6 +19,11 @@ class ModelArguments: default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} ) + model_revision: Optional[str] = field( + default=None, + metadata={"help": "Revision of the model to use (branch name, tag, or commit id)."}, + ) + pooling: str = field( default='cls', metadata={"help": "pooling method for query and passage encoder"} diff --git a/src/tevatron/retriever/dataset.py b/src/tevatron/retriever/dataset.py index ae3fdb57..049d7dac 100644 --- a/src/tevatron/retriever/dataset.py +++ b/src/tevatron/retriever/dataset.py @@ -271,6 +271,7 @@ def __init__(self, data_args: DataArguments): split=self.data_args.dataset_split, cache_dir=self.data_args.dataset_cache_dir, num_proc=self.data_args.num_proc, + verification_mode="no_checks", ) if self.data_args.dataset_number_of_shards > 1: self.encode_data = self.encode_data.shard( diff --git a/src/tevatron/retriever/driver/train.py b/src/tevatron/retriever/driver/train.py index 39abab45..d9a70e68 100644 --- a/src/tevatron/retriever/driver/train.py +++ b/src/tevatron/retriever/driver/train.py @@ -60,9 +60,13 @@ def main(): set_seed(training_args.seed) + tokenizer_kwargs = {'cache_dir': model_args.cache_dir} + if model_args.model_revision is not None: + tokenizer_kwargs['revision'] = model_args.model_revision + tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, + **tokenizer_kwargs, ) if tokenizer.pad_token_id is None: @@ -79,13 +83,19 @@ def main(): torch_dtype = torch.float16 else: torch_dtype = torch.float32 - + + model_load_kwargs = { + 'cache_dir': model_args.cache_dir, + 'torch_dtype': torch_dtype, + 'attn_implementation': model_args.attn_implementation, + } + if model_args.model_revision is not None: + model_load_kwargs['revision'] = model_args.model_revision + model = DenseModel.build( model_args, training_args, - cache_dir=model_args.cache_dir, - torch_dtype=torch_dtype, - attn_implementation=model_args.attn_implementation, + **model_load_kwargs, ) train_dataset = TrainDataset(data_args) diff --git a/src/tevatron/retriever/modeling/encoder.py b/src/tevatron/retriever/modeling/encoder.py index c3eedc35..e5f7a052 100644 --- a/src/tevatron/retriever/modeling/encoder.py +++ b/src/tevatron/retriever/modeling/encoder.py @@ -26,6 +26,11 @@ class EncoderOutput(ModelOutput): class EncoderModel(nn.Module): TRANSFORMER_CLS = AutoModel + # HuggingFace Trainer.checkpoint resume calls _issue_warnings_after_load(), which expects + # PreTrainedModel-style attributes on `self.model`. EncoderModel wraps a PreTrainedModel in + # `.encoder` but is itself an nn.Module, so omitting this triggers AttributeError on resume. + _keys_to_ignore_on_save = None + def __init__(self, encoder: PreTrainedModel, pooling: str = 'cls', From dc3c0ebd1e62f2031c0a82453586faabd073a2ac Mon Sep 17 00:00:00 2001 From: Neng Date: Sun, 21 Jun 2026 19:24:30 +0000 Subject: [PATCH 4/5] Add BEIR JSONL encoder utility --- scripts/encode_beir_jsonl.py | 199 +++++++++++++++++++++++++++++ src/tevatron/retriever/collator.py | 20 ++- 2 files changed, 214 insertions(+), 5 deletions(-) create mode 100755 scripts/encode_beir_jsonl.py diff --git a/scripts/encode_beir_jsonl.py b/scripts/encode_beir_jsonl.py new file mode 100755 index 00000000..a239b483 --- /dev/null +++ b/scripts/encode_beir_jsonl.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +"""Encode raw BEIR JSONL/JSONL.GZ files without materializing HF Arrow caches.""" + +import argparse +import gzip +import json +import logging +import os +import pickle +from contextlib import nullcontext + +import numpy as np +import torch +from tqdm import tqdm +from transformers import AutoTokenizer + +from tevatron.retriever.modeling import DenseModel +from tevatron.utils.io import ensure_parent_dir + + +logger = logging.getLogger(__name__) + + +def iter_jsonl(path): + opener = gzip.open if path.endswith(".gz") else open + with opener(path, "rt", encoding="utf-8") as f: + for line in f: + if line.strip(): + yield json.loads(line) + + +def iter_examples(path, encode_is_query, prefix, num_shards, shard_index): + for row_index, row in enumerate(iter_jsonl(path)): + if row_index % num_shards != shard_index: + continue + + content_id = row.get("query_id") or row.get("docid") or row["_id"] + if encode_is_query: + text = row.get("query_text", row.get("query", row.get("text", ""))) or "" + else: + text = row.get("text", "") or "" + title = row.get("title", "") or "" + if title: + text = f"{title} {text}" + text = text.strip() + yield str(content_id), prefix + text + + +def batched(iterator, batch_size): + ids = [] + texts = [] + for content_id, text in iterator: + ids.append(content_id) + texts.append(text) + if len(ids) == batch_size: + yield ids, texts + ids = [] + texts = [] + if ids: + yield ids, texts + + +def encode_batch(tokenizer, texts, max_length, append_eos_token, pad_to_multiple_of): + tokenized = tokenizer( + texts, + padding=False, + truncation=True, + max_length=max_length - 1 if append_eos_token else max_length, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=True, + ) + if append_eos_token: + tokenized["input_ids"] = [ + input_ids + [tokenizer.eos_token_id] for input_ids in tokenized["input_ids"] + ] + return tokenizer.pad( + tokenized, + padding=True, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=True, + return_tensors="pt", + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True) + parser.add_argument("--output_dir", required=True) + parser.add_argument("--model_name_or_path", required=True) + parser.add_argument("--lora_name_or_path", default=None) + parser.add_argument("--cache_dir", default=None) + parser.add_argument("--pooling", default="cls") + parser.add_argument("--normalize", action="store_true") + parser.add_argument("--attn_implementation", default=None) + parser.add_argument("--bf16", action="store_true") + parser.add_argument("--fp16", action="store_true") + parser.add_argument("--append_eos_token", action="store_true") + parser.add_argument("--padding_side", choices=["left", "right"], default="right") + parser.add_argument("--encode_is_query", action="store_true") + parser.add_argument("--query_prefix", default="") + parser.add_argument("--passage_prefix", default="") + parser.add_argument("--query_max_len", type=int, default=32) + parser.add_argument("--passage_max_len", type=int, default=128) + parser.add_argument("--pad_to_multiple_of", type=int, default=16) + parser.add_argument("--per_device_eval_batch_size", type=int, default=64) + parser.add_argument("--dataset_number_of_shards", type=int, default=1) + parser.add_argument("--dataset_shard_index", type=int, default=0) + parser.add_argument("--encode_output_path", required=True) + args = parser.parse_args() + + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + + if args.bf16: + torch_dtype = torch.bfloat16 + elif args.fp16: + torch_dtype = torch.float16 + else: + torch_dtype = torch.float32 + + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir) + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer.padding_side = args.padding_side + + model_kwargs = { + "cache_dir": args.cache_dir, + "torch_dtype": torch_dtype, + } + if args.attn_implementation: + model_kwargs["attn_implementation"] = args.attn_implementation + + model = DenseModel.load( + args.model_name_or_path, + pooling=args.pooling, + normalize=args.normalize, + lora_name_or_path=args.lora_name_or_path, + **model_kwargs, + ) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = model.to(device) + model.eval() + + prefix = args.query_prefix if args.encode_is_query else args.passage_prefix + max_length = args.query_max_len if args.encode_is_query else args.passage_max_len + examples = iter_examples( + args.input, + args.encode_is_query, + prefix, + args.dataset_number_of_shards, + args.dataset_shard_index, + ) + + encoded = [] + lookup_indices = [] + amp_context = ( + torch.amp.autocast("cuda") + if device.type == "cuda" and (args.fp16 or args.bf16) + else nullcontext() + ) + + for batch_ids, batch_texts in tqdm( + batched(examples, args.per_device_eval_batch_size), + desc=os.path.basename(args.input), + ): + batch = encode_batch( + tokenizer, + batch_texts, + max_length, + args.append_eos_token, + args.pad_to_multiple_of, + ) + lookup_indices.extend(batch_ids) + with amp_context: + with torch.no_grad(): + batch = {k: v.to(device) for k, v in batch.items()} + if args.encode_is_query: + model_output = model(query=batch) + reps = model_output.q_reps + else: + model_output = model(passage=batch) + reps = model_output.p_reps + encoded.append(reps.cpu().detach().numpy()) + + if not encoded: + raise ValueError(f"No rows were encoded from {args.input}") + + ensure_parent_dir(args.encode_output_path) + with open(args.encode_output_path, "wb") as f: + pickle.dump((np.concatenate(encoded), lookup_indices), f) + + +if __name__ == "__main__": + main() diff --git a/src/tevatron/retriever/collator.py b/src/tevatron/retriever/collator.py index 20e02ef5..ffff47d8 100644 --- a/src/tevatron/retriever/collator.py +++ b/src/tevatron/retriever/collator.py @@ -3,7 +3,6 @@ from typing import List, Tuple from dataclasses import dataclass from transformers import PreTrainedTokenizer, ProcessorMixin -from qwen_omni_utils import process_mm_info from PIL import Image from tevatron.retriever.arguments import DataArguments @@ -12,6 +11,17 @@ logger = logging.getLogger(__name__) +def _process_mm_info(*args, **kwargs): + try: + from qwen_omni_utils import process_mm_info + except ImportError as exc: + raise ImportError( + "qwen_omni_utils is required for multimodal collators. " + "Install qwen-omni-utils to use multimodal training or encoding." + ) from exc + return process_mm_info(*args, **kwargs) + + @dataclass class TrainCollator: """ @@ -161,9 +171,9 @@ def __call__(self, features): # audios, images, videos = process_mm_info(conversation, use_audio_in_video=False) - query_audio_inputs, query_image_inputs, query_video_inputs = process_mm_info(query_messages, use_audio_in_video=False) + query_audio_inputs, query_image_inputs, query_video_inputs = _process_mm_info(query_messages, use_audio_in_video=False) - passage_audio_inputs, passage_image_inputs, passage_video_inputs = process_mm_info(passage_messages, use_audio_in_video=False) + passage_audio_inputs, passage_image_inputs, passage_video_inputs = _process_mm_info(passage_messages, use_audio_in_video=False) query_inputs = self.processor( text=query_texts, @@ -277,7 +287,7 @@ def __call__(self, features): if self.data_args.append_eos_token: texts = [x[0] + '<|endoftext|>' for x in texts] - audio_inputs, image_inputs, video_inputs = process_mm_info(messages, use_audio_in_video=False) + audio_inputs, image_inputs, video_inputs = _process_mm_info(messages, use_audio_in_video=False) collated_inputs = self.processor( text=texts, @@ -352,7 +362,7 @@ def __call__(self, features): texts = [x[0] + '<|endoftext|>' for x in texts] - audio_inputs, image_inputs, video_inputs = process_mm_info(messages, use_audio_in_video=False) + audio_inputs, image_inputs, video_inputs = _process_mm_info(messages, use_audio_in_video=False) return content_ids, texts, image_inputs From 78bfa3ff7427c87906e568b594641682a03de985 Mon Sep 17 00:00:00 2001 From: Neng Date: Sun, 21 Jun 2026 19:24:50 +0000 Subject: [PATCH 5/5] Update retriever documentation examples --- docs/encoding.md | 8 +- docs/training.md | 17 ++--- examples/coCondenser-marco/README.md | 110 ++++++++++++++------------- examples/coCondenser-nq/README.md | 51 +++++++------ examples/example_dpr.md | 11 +-- examples/repllama/README.md | 34 +++++---- examples/reranker/README.md | 22 +++--- examples/unicoil/README.md | 28 ++++--- 8 files changed, 143 insertions(+), 138 deletions(-) diff --git a/docs/encoding.md b/docs/encoding.md index 711165c0..b6d48071 100644 --- a/docs/encoding.md +++ b/docs/encoding.md @@ -53,8 +53,8 @@ python -m tevatron.retriever.driver.encode \ ``` > Here we are using our self-contained datasets to train. -> To use custom dataset, replace `--dataset_name Tevatron/wikipedia-nq-corpus` by -> `--encode_in_path `. (see here for details) +> To use custom dataset, set `--dataset_name json` and pass +> `--dataset_path `. (see here for details) ## Encoding on TPU (JAX / Flax) @@ -62,11 +62,11 @@ python -m tevatron.retriever.driver.encode \ I.e. the following command will do same thing as above but with Jax/Flax: ``` -python -m tevatron.driver.jax_encode \ +python -m tevatron.retriever.driver.jax_encode \ --output_dir=temp \ --model_name_or_path model_nq \ --per_device_eval_batch_size 156 \ --passage_max_len 128 \ --dataset_name Tevatron/wikipedia-nq-corpus \ --encode_output_path corpus_emb.pkl -``` \ No newline at end of file +``` diff --git a/docs/training.md b/docs/training.md index 383fa7da..f8756714 100644 --- a/docs/training.md +++ b/docs/training.md @@ -65,12 +65,12 @@ GradCache also works with multi-GPU `torchrun` setups. ## Training with TPU Tevatron implements TPU training via Jax/Flax. -We provide a separate module `tevatron.driver.jax_train` to train on TPU. +We provide a separate module `tevatron.retriever.driver.jax_train` to train on TPU. The arguments managements aligns with above Pytorch training driver. By running the following commands on a V3-8 TPU VM is equivalent to the commands above. ```bash -python -m tevatron.driver.jax_train \ +python -m tevatron.retriever.driver.jax_train \ --output_dir model_nq \ --dataset_name Tevatron/wikipedia-nq \ --model_name_or_path bert-base-uncased \ @@ -78,8 +78,8 @@ python -m tevatron.driver.jax_train \ --per_device_train_batch_size 16 \ --train_group_size 2 \ --learning_rate 1e-5 \ - --q_max_len 32 \ - --p_max_len 156 \ + --query_max_len 32 \ + --passage_max_len 156 \ --num_train_epochs 40 ``` > Note that our Jax training driver also support gradient cache by adding `--grad_cache` option. @@ -98,16 +98,13 @@ Here we describe the details of the arguments additionally defined for Tevatron' | `tokenizer_name` | Tokenizer name or path if not the same as `model_name_or_path` | `str` | same as `model_name_or_path` | pytorch, jax | | `cache_dir` | Path to the directory to save the cache of models and datasets | `str` | `~/.cache/` | pytorch, jax | | `untie_encoder` | Whether query encoder and passage encoder share same parameter | `bool` | `False` | pytorch, jax | -| `add_pooler` | Whether add pooler on top of last layer output | `bool` | `False` | pytorch | -| `projection_in_dim` | The input dim of pooler | `int` | `768` | | -| `projection_out_dim` | The output dim of pooler | `int` | `768` | pytorch | | `dataset_name` | Dataset name that avaliable on HuggingFace | `str` | `json` | pytorch, jax | -| `train_dir` | Directory that stores custom training data | `str` | `None` | pytorch, jax | -| `dataset_proc_num` | Number of threads to use to preprocess/tokenize data | `int` | `12` | pytorch, jax | +| `dataset_path` | Path to local data files or directory | `str` | `None` | pytorch, jax | +| `num_proc` | Number of threads to use to preprocess/tokenize data | `int` | `1` | pytorch, jax | | `train_group_size` | Number of passages for each anchor query during training. It will load 1 positive passage + (`train_group_size`-1) negative passages for each example during training | `int` | `8` | pytorch, jax | | `passage_field_separator` | The token to seperate `title` and `text` field for passages | `str` | `" "` | pytorch | | `query_max_len` | Maximum query length | `int` | `32` | pytorch, jax | | `passage_max_len` | Maximum passage length | `int` | `128` | pytorch, jax | | `grad_cache` | Whether use gradient cache feature. This can be used to support large batch size while GPU/TPU memory are limited. | `bool` | `False` | pytorch, jax | | `gc_q_chunk_size` | Sub-batch size for queries with `grad_cache` | `int` | `4` | pytorch, jax | -| `gc_p_chunk_size` | Sub-batch size for passages with `grad_cache` | `int` | `32` | pytorch, jax | \ No newline at end of file +| `gc_p_chunk_size` | Sub-batch size for passages with `grad_cache` | `int` | `32` | pytorch, jax | diff --git a/examples/coCondenser-marco/README.md b/examples/coCondenser-marco/README.md index 6a3f802e..ddffebc4 100644 --- a/examples/coCondenser-marco/README.md +++ b/examples/coCondenser-marco/README.md @@ -27,31 +27,33 @@ mkdir -p encoding/corpus mkdir -p encoding/query for i in $(seq -f "%02g" 0 9) do -python -m tevatron.driver.encode \ +python -m tevatron.retriever.driver.encode \ --output_dir ./retriever_model \ --model_name_or_path Luyu/co-condenser-marco-retriever \ --fp16 \ --per_device_eval_batch_size 128 \ - --encode_in_path marco/bert/corpus/split${i}.json \ - --encoded_save_path encoding/corpus/split${i}.pt + --dataset_name json \ + --dataset_path marco/bert/corpus/split${i}.json \ + --encode_output_path encoding/corpus/split${i}.pt done -python -m tevatron.driver.encode \ +python -m tevatron.retriever.driver.encode \ --output_dir ./retriever_model \ --model_name_or_path Luyu/co-condenser-marco-retriever \ --fp16 \ - --q_max_len 32 \ - --encode_is_qry \ + --query_max_len 32 \ + --encode_is_query \ --per_device_eval_batch_size 128 \ - --encode_in_path marco/bert/query/dev.query.json \ - --encoded_save_path encoding/query/qry.pt + --dataset_name json \ + --dataset_path marco/bert/query/dev.query.json \ + --encode_output_path encoding/query/qry.pt ``` ### Index Search ``` -python -m tevatron.faiss_retriever \ ---query_reps encoding/query/qry.pt \ ---passage_reps encoding/corpus/'*.pt' \ +python -m tevatron.retriever.driver.search \ +--query_reps encoding/query/qry.pt \ +--passage_reps encoding/corpus/'*.pt' \ --depth 10 \ --batch_size -1 \ --save_text \ @@ -65,15 +67,16 @@ python ../msmarco-passage-ranking/score_to_marco.py rank.txt Pick a pre-trained condenser that is most suitable for the experiment from [Condenser Repo](https://github.com/luyug/Condenser#pre-trained-models). Train ``` -python -m tevatron.driver.train \ - --output_dir ./retriever_model_s1 \ - --model_name_or_path CONDENSER_MODEL_NAME \ - --save_steps 20000 \ - --train_dir ./marco/bert/train \ - --fp16 \ - --per_device_train_batch_size 8 \ - --learning_rate 5e-6 \ - --num_train_epochs 3 \ +python -m tevatron.retriever.driver.train \ + --output_dir ./retriever_model_s1 \ + --model_name_or_path CONDENSER_MODEL_NAME \ + --save_steps 20000 \ + --dataset_name json \ + --dataset_path "./marco/bert/train/*.json" \ + --fp16 \ + --per_device_train_batch_size 8 \ + --learning_rate 5e-6 \ + --num_train_epochs 3 \ --dataloader_num_workers 2 ``` ## Mining Hard Negatives @@ -84,31 +87,33 @@ mkdir -p encoding/corpus mkdir -p encoding/query for i in $(seq -f "%02g" 0 9) do -python -m tevatron.driver.encode \ +python -m tevatron.retriever.driver.encode \ --output_dir ./retriever_model \ --model_name_or_path ./retriever_model_s1 \ --fp16 \ --per_device_eval_batch_size 128 \ - --encode_in_path marco/bert/corpus/split${i}.json \ - --encoded_save_path encoding/corpus/split${i}.pt + --dataset_name json \ + --dataset_path marco/bert/corpus/split${i}.json \ + --encode_output_path encoding/corpus/split${i}.pt done -python -m tevatron.driver.encode \ +python -m tevatron.retriever.driver.encode \ --output_dir ./retriever_model \ --model_name_or_path ./retriever_model_s1 \ --fp16 \ - --q_max_len 32 \ - --encode_is_qry \ + --query_max_len 32 \ + --encode_is_query \ --per_device_eval_batch_size 128 \ - --encode_in_path marco/bert/query/train.query.json \ - --encoded_save_path encoding/query/train.pt + --dataset_name json \ + --dataset_path marco/bert/query/train.query.json \ + --encode_output_path encoding/query/train.pt ``` ### Search ``` -python -m tevatron.faiss_retriever \ ---query_reps encoding/query/train.pt \ ---passage_reps encoding/corpus/'*.pt' \ +python -m tevatron.retriever.driver.search \ +--query_reps encoding/query/train.pt \ +--passage_reps encoding/corpus/'*.pt' \ --batch_size 5000 \ --save_text \ --save_ranking_to train.rank.tsv @@ -121,15 +126,16 @@ bash create_hn.sh ## Fine-tuning Stage 2 ``` -python -m tevatron.driver.train \ - --output_dir ./retriever_model_s2 \ - --model_name_or_path CONDENSER_MODEL_NAME \ - --save_steps 20000 \ - --train_dir ./marco/bert/train-hn \ - --fp16 \ - --per_device_train_batch_size 8 \ - --learning_rate 5e-6 \ - --num_train_epochs 2 \ +python -m tevatron.retriever.driver.train \ + --output_dir ./retriever_model_s2 \ + --model_name_or_path CONDENSER_MODEL_NAME \ + --save_steps 20000 \ + --dataset_name json \ + --dataset_path "./marco/bert/train-hn/*.json" \ + --fp16 \ + --per_device_train_batch_size 8 \ + --learning_rate 5e-6 \ + --num_train_epochs 2 \ --dataloader_num_workers 2 ``` @@ -140,30 +146,32 @@ mkdir -p encoding/corpus-s2 mkdir -p encoding/query-s2 for i in $(seq -f "%02g" 0 9) do -python -m tevatron.driver.encode \ +python -m tevatron.retriever.driver.encode \ --output_dir ./retriever_model_s2 \ --model_name_or_path ./retriever_model_s2 \ --fp16 \ --per_device_eval_batch_size 128 \ - --encode_in_path marco/bert/corpus/split${i}.json \ - --encoded_save_path encoding/corpus-s2/split${i}.pt + --dataset_name json \ + --dataset_path marco/bert/corpus/split${i}.json \ + --encode_output_path encoding/corpus-s2/split${i}.pt done -python -m tevatron.driver.encode \ +python -m tevatron.retriever.driver.encode \ --output_dir ./retriever_model_s2 \ --model_name_or_path ./retriever_model_s2 \ --fp16 \ - --q_max_len 32 \ - --encode_is_qry \ + --query_max_len 32 \ + --encode_is_query \ --per_device_eval_batch_size 128 \ - --encode_in_path marco/bert/query/dev.query.json \ - --encoded_save_path encoding/query-s2/qry.pt + --dataset_name json \ + --dataset_path marco/bert/query/dev.query.json \ + --encode_output_path encoding/query-s2/qry.pt ``` Run the retriever, ``` -python -m tevatron.faiss_retriever \ ---query_reps encoding/query-s2/qry.pt \ ---passage_reps encoding/corpus-s2/'*.pt' \ +python -m tevatron.retriever.driver.search \ +--query_reps encoding/query-s2/qry.pt \ +--passage_reps encoding/corpus-s2/'*.pt' \ --depth 10 \ --batch_size -1 \ --save_text \ diff --git a/examples/coCondenser-nq/README.md b/examples/coCondenser-nq/README.md index 6d1a6d92..f1dabafc 100644 --- a/examples/coCondenser-nq/README.md +++ b/examples/coCondenser-nq/README.md @@ -45,20 +45,20 @@ python prepare_wiki_train.py --input hn.json --output nq-train/hn.bert.json --to Pick a pre-trained condenser that is most suitable for the experiment from [Condenser Repo](https://github.com/luyug/Condenser#pre-trained-models). Run training, ``` -python -m torch.distributed.launch --nproc_per_node=4 -m tevatron.driver.train \ +python -m torch.distributed.launch --nproc_per_node=4 -m tevatron.retriever.driver.train \ --output_dir model-nq \ --model_name_or_path CONDENSER_MODEL_NAME \ --do_train \ --save_steps 20000 \ - --train_dir nq-train \ + --dataset_name json \ + --dataset_path "nq-train/*.json" \ --fp16 \ --per_device_train_batch_size 32 \ - --train_n_passages 2 \ + --train_group_size 2 \ --learning_rate 5e-6 \ - --q_max_len 32 \ - --p_max_len 256 \ + --query_max_len 32 \ + --passage_max_len 256 \ --num_train_epochs 40 \ - --negatives_x_device \ --untie_encoder \ --positive_passage_no_shuffle ``` @@ -84,20 +84,20 @@ python prepare_wiki_train.py --input hn.json --output nq-train/hn.bert.json --to Pick a pre-trained condenser that is most suitable for the experiment from [Condenser Repo](https://github.com/luyug/Condenser#pre-trained-models). Run training, ``` -python -m torch.distributed.launch --nproc_per_node=4 -m tevatron.driver.train \ +python -m torch.distributed.launch --nproc_per_node=4 -m tevatron.retriever.driver.train \ --output_dir model-nq \ --model_name_or_path CONDENSER_MODEL_NAME \ --do_train \ --save_steps 20000 \ - --train_dir nq-train \ + --dataset_name json \ + --dataset_path "nq-train/*.json" \ --fp16 \ --per_device_train_batch_size 32 \ - --train_n_passages 2 \ + --train_group_size 2 \ --learning_rate 5e-6 \ - --q_max_len 32 \ - --p_max_len 256 \ + --query_max_len 32 \ + --passage_max_len 256 \ --num_train_epochs 20 \ - --negatives_x_device \ --untie_encoder \ --positive_passage_no_shuffle ``` @@ -111,32 +111,33 @@ MODEL_DIR=nq-model for s in $(seq -f "%02g" 0 19) do -python -m tevatron.driver.encode \ +python -m tevatron.retriever.driver.encode \ --config_name CONDENSER_MODEL_NAME \ --output_dir=$OUTDIR \ --model_name_or_path $MODEL_DIR \ --fp16 \ --per_device_eval_batch_size 64 \ - --p_max_len 256 \ - --dataset_proc_num 8 \ + --passage_max_len 256 \ + --num_proc 8 \ --dataset_name Tevatron/wikipedia-nq-corpus \ - --encoded_save_path embeddings-nq/$s.pt \ - --encode_num_shard 20 \ + --encode_output_path embeddings-nq/$s.pt \ + --dataset_number_of_shards 20 \ --passage_field_separator sep_token \ - --encode_shard_index $s + --dataset_shard_index $s done -python -m tevatron.driver.encode \ +python -m tevatron.retriever.driver.encode \ --output_dir=$OUTDIR \ --model_name_or_path $MODEL_DIR \ --config_name CONDENSER_MODEL_NAME \ --fp16 \ --per_device_eval_batch_size 64 \ - --q_max_len 32 \ - --dataset_proc_num 2 \ - --dataset_name Tevatron/wikipedia-nq/test \ - --encoded_save_path embeddings-nq-queries/query.pt \ - --encode_is_qry + --query_max_len 32 \ + --num_proc 2 \ + --dataset_name Tevatron/wikipedia-nq \ + --dataset_split test \ + --encode_output_path embeddings-nq-queries/query.pt \ + --encode_is_query ``` ## Search and Evaluation @@ -146,7 +147,7 @@ ENCODE_QRY_DIR=embeddings-nq-queries ENCODE_DIR=embeddings-nq DEPTH=200 RUN=run.nq.test.txt -python -m tevatron.faiss_retriever \ +python -m tevatron.retriever.driver.search \ --query_reps $ENCODE_QRY_DIR/query.pt \ --passage_reps $ENCODE_DIR/'*.pt' \ --depth $DEPTH \ diff --git a/examples/example_dpr.md b/examples/example_dpr.md index 58d2a1ea..0060875b 100644 --- a/examples/example_dpr.md +++ b/examples/example_dpr.md @@ -59,9 +59,9 @@ python -m tevatron.retriever.driver.encode \ --fp16 \ --per_device_eval_batch_size 156 \ --dataset_name Tevatron/wikipedia-nq-corpus \ - --encoded_save_path corpus_emb.$s.pkl \ - --encode_num_shard 20 \ - --encode_shard_index $s + --encode_output_path corpus_emb.$s.pkl \ + --dataset_number_of_shards 20 \ + --dataset_shard_index $s done ``` @@ -72,8 +72,9 @@ python -m tevatron.retriever.driver.encode \ --model_name_or_path model_nq \ --fp16 \ --per_device_eval_batch_size 156 \ - --dataset_name Tevatron/wikipedia-nq/test \ - --encoded_save_path query_emb.pkl \ + --dataset_name Tevatron/wikipedia-nq \ + --dataset_split test \ + --encode_output_path query_emb.pkl \ --encode_is_query ``` diff --git a/examples/repllama/README.md b/examples/repllama/README.md index ce7c74d6..8fac7b4c 100644 --- a/examples/repllama/README.md +++ b/examples/repllama/README.md @@ -19,11 +19,12 @@ CUDA_VISIBLE_DEVICES=1 python encode.py \ --tokenizer_name meta-llama/Llama-2-7b-hf \ --fp16 \ --per_device_eval_batch_size 16 \ - --p_max_len 512 \ - --dataset_name Tevatron/beir-corpus:scifact \ - --encoded_save_path beir_embedding_scifact/corpus_scifact.${s}.pkl \ - --encode_num_shard 4 \ - --encode_shard_index ${s} + --passage_max_len 512 \ + --dataset_name Tevatron/beir-corpus \ + --dataset_config scifact \ + --encode_output_path beir_embedding_scifact/corpus_scifact.${s}.pkl \ + --dataset_number_of_shards 4 \ + --dataset_shard_index ${s} done ``` > We shard the encoding, so that it can be parallelized on multiple GPUs, when its available. @@ -36,15 +37,17 @@ CUDA_VISIBLE_DEVICES=6 python encode.py \ --tokenizer_name meta-llama/Llama-2-7b-hf \ --fp16 \ --per_device_eval_batch_size 16 \ - --q_max_len 512 \ - --dataset_name Tevatron/beir:scifact/test \ - --encoded_save_path beir_embedding_scifact/queries_scifact.pkl \ - --encode_is_qry + --query_max_len 512 \ + --dataset_name Tevatron/beir \ + --dataset_config scifact \ + --dataset_split test \ + --encode_output_path beir_embedding_scifact/queries_scifact.pkl \ + --encode_is_query ``` ### Search -python -m tevatron.faiss_retriever \ +python -m tevatron.retriever.driver.search \ --query_reps beir_embedding_scifact/queries_scifact.pkl \ --passage_reps 'beir_embedding_scifact/corpus_scifact.*.pkl' \ --depth 100 \ @@ -78,15 +81,14 @@ deepspeed --include localhost:0,1,2,3 train.py \ --per_device_train_batch_size 8 \ --gradient_accumulation_steps 4 \ --gradient_checkpointing \ - --train_n_passages 16 \ + --train_group_size 16 \ --learning_rate 1e-4 \ - --q_max_len 32 \ - --p_max_len 196 \ + --query_max_len 32 \ + --passage_max_len 196 \ --num_train_epochs 1 \ --logging_steps 10 \ --overwrite_output_dir \ - --dataset_proc_num 32 \ - --negatives_x_device \ + --num_proc 32 \ --warmup_steps 100 ``` @@ -99,4 +101,4 @@ deepspeed --include localhost:0,1,2,3 train.py \ year={2023}, journal={arXiv:2310.08319}, } -``` \ No newline at end of file +``` diff --git a/examples/reranker/README.md b/examples/reranker/README.md index 0e5b8741..3aec35c8 100644 --- a/examples/reranker/README.md +++ b/examples/reranker/README.md @@ -3,17 +3,16 @@ In this example, we take the retrieval results from the first stage retriever (e ## Train Reranker ``` -CUDA_VISIBLE_DEVICES=0 python train_reranker.py \ +CUDA_VISIBLE_DEVICES=0 python reranker_train.py \ --output_dir reranker_msmarco \ --model_name_or_path bert-base-uncased \ --save_steps 20000 \ --dataset_name Tevatron/msmarco-passage \ --fp16 \ --per_device_train_batch_size 8 \ - --train_n_passages 8 \ + --train_group_size 8 \ --learning_rate 5e-6 \ - --q_max_len 16 \ - --p_max_len 128 \ + --rerank_max_len 144 \ --num_train_epochs 3 \ --logging_steps 500 \ --overwrite_output_dir @@ -40,25 +39,24 @@ The BM25 result has `topk=1000` passages for each query. It gives MRR@10=0.1840 ``` python prepare_rerank_file.py \ --query_data_name Tevatron/msmarco-passage \ - --corpus Tevatron/msmarco-passage-corpus \ - --retrieval run.msmarco-v1-passage.bm25-default.dev.txt \ + --corpus_data_name Tevatron/msmarco-passage-corpus \ + --retrieval_results run.msmarco-v1-passage.bm25-default.dev.txt \ --output_path rerank_input_file.bm25.jsonl ``` ### Reranking ``` -CUDA_VISIBLE_DEVICES=6 python reranker_inference.py \ +CUDA_VISIBLE_DEVICES=6 python -m tevatron.reranker.driver.rerank \ --output_dir=temp \ --model_name_or_path reranker_msmarco \ --tokenizer_name bert-base-uncased \ - --encode_in_path rerank_input_file.bm25.jsonl \ + --dataset_path rerank_input_file.bm25.jsonl \ --fp16 \ --per_device_eval_batch_size 156 \ - --q_max_len 16 \ - --p_max_len 128 \ - --dataset_name data_script.py \ - --encoded_save_path rerank_out_file.bm25.monobert.txt + --rerank_max_len 144 \ + --dataset_name json \ + --rerank_output_path rerank_out_file.bm25.monobert.txt ``` ### Evaluation diff --git a/examples/unicoil/README.md b/examples/unicoil/README.md index 5fe12e4b..b4803c14 100644 --- a/examples/unicoil/README.md +++ b/examples/unicoil/README.md @@ -7,14 +7,11 @@ CUDA_VISIBLE_DEVICES=0 python examples/unicoil/train_unicoil.py \ --dataset_name Tevatron/msmarco-passage \ --fp16 \ --per_device_train_batch_size 8 \ - --train_n_passages 8 \ + --train_group_size 8 \ --learning_rate 5e-6 \ - --q_max_len 16 \ - --p_max_len 128 \ + --query_max_len 16 \ + --passage_max_len 128 \ --num_train_epochs 3 \ - --add_pooler \ - --projection_in_dim 768 \ - --projection_out_dim 1 \ --logging_steps 500 \ --overwrite_output_dir ``` @@ -28,11 +25,11 @@ CUDA_VISIBLE_DEVICES=0 python examples/unicoil/encode_unicoil.py \ --model_name_or_path unicoil_distilbert \ --fp16 \ --per_device_eval_batch_size 156 \ - --p_max_len 128 \ + --passage_max_len 128 \ --dataset_name Tevatron/msmarco-passage-corpus \ - --encoded_save_path corpus_emb.${s}.jsonl \ - --encode_num_shard 20 \ - --encode_shard_index ${s} + --encode_output_path corpus_emb.${s}.jsonl \ + --dataset_number_of_shards 20 \ + --dataset_shard_index ${s} done ``` @@ -43,10 +40,11 @@ CUDA_VISIBLE_DEVICES=0 python examples/unicoil/encode_unicoil.py \ --model_name_or_path unicoil_distilbert \ --fp16 \ --per_device_eval_batch_size 156 \ - --encode_is_qry \ - --q_max_len 16 \ - --dataset_name Tevatron/msmarco-passage/dev \ - --encoded_save_path queries_emb.tsv + --encode_is_query \ + --query_max_len 16 \ + --dataset_name Tevatron/msmarco-passage \ + --dataset_split dev \ + --encode_output_path queries_emb.tsv ``` ## Indexing @@ -79,4 +77,4 @@ python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset run.msma MRR @10: 0.32835067312502864 QueriesRanked: 6980 ##################### -``` \ No newline at end of file +```