@@ -379,6 +379,187 @@ def _build_subset(selected: list[dict]) -> tuple[dict, dict, dict]:
379379 f"qrels={ sum (len (v ) for v in qrels_24 .values ())} )" )
380380
381381
+def _load_multilingual_beir_dataset(
+    hf_path: str,
+    name: str,
+    out_file: str,
+    lang_prefix: str,
+    *,
+    max_corpus: int = 0,
+) -> None:
+    """Load a multilingual BeIR-format dataset (ko-corpus, ko-queries, ko-qrels, etc.)."""
+    from datasets import get_dataset_split_names, load_dataset
+
+    print(f"Downloading {name}...")
+
+    corpus_config = f"{lang_prefix}-corpus"
+    queries_config = f"{lang_prefix}-queries"
+    qrels_config = f"{lang_prefix}-qrels"
+
+    # corpus
+    try:
+        corpus_split = get_dataset_split_names(hf_path, corpus_config)[0]
+    except Exception:
+        print(f" SKIP: {name} — config '{corpus_config}' not available")
+        return
+    corpus_ds = load_dataset(hf_path, corpus_config, split=corpus_split)
+    corpus = {}
+    id_key = "_id" if "_id" in corpus_ds.column_names else "id"
+    for row in corpus_ds:
+        corpus[str(row[id_key])] = {"title": row.get("title", ""), "text": row.get("text", "")}
+
+    # queries
+    queries_split = get_dataset_split_names(hf_path, queries_config)[0]
+    queries_ds = load_dataset(hf_path, queries_config, split=queries_split)
+    queries = {}
+    q_id_key = "_id" if "_id" in queries_ds.column_names else "id"
+    for row in queries_ds:
+        queries[str(row[q_id_key])] = row.get("text", "")
+
+    # qrels
+    qrels_split = get_dataset_split_names(hf_path, qrels_config)[0]
+    qrels_ds = load_dataset(hf_path, qrels_config, split=qrels_split)
+    qrels: dict[str, dict[str, int]] = {}
+    for row in qrels_ds:
+        qid = str(row.get("query-id", ""))
+        cid = str(row.get("corpus-id", ""))
+        score = row.get("score", 1)
+        if qid and cid:
+            qrels.setdefault(qid, {})[cid] = int(score)
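+    # qrels maps query-id to {corpus-id: graded relevance}, e.g.
+    # {"q1": {"doc3": 1, "doc9": 2}} (illustrative ids, not from the data).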
+
+    # Large-corpus sampling: keep all qrels-relevant docs, pad with random negatives
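+    # Worked example (hypothetical numbers): with max_corpus=10000 and ~3,000
+    # distinct relevant ids, all 3,000 relevant docs are kept and up to 7,000
+    # negatives are drawn at random from the remaining corpus.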
+    if max_corpus > 0 and len(corpus) > max_corpus:
+        import random
+        random.seed(42)  # fixed seed so repeated runs sample the same negatives
+        relevant_ids = set()
+        for rels in qrels.values():
+            relevant_ids.update(rels.keys())
+
+        sampled_corpus = {cid: corpus[cid] for cid in relevant_ids if cid in corpus}
+
+        remaining = [cid for cid in corpus if cid not in relevant_ids]
+        n_neg = max_corpus - len(sampled_corpus)
+        if n_neg > 0 and remaining:
+            neg_sample = random.sample(remaining, min(n_neg, len(remaining)))
+            for cid in neg_sample:
+                sampled_corpus[cid] = corpus[cid]
+
+        print(f" Sampled corpus: {len(corpus)} → {len(sampled_corpus)} "
+              f"(relevant={len(relevant_ids & set(corpus.keys()))}, negative={len(sampled_corpus) - len(relevant_ids & set(sampled_corpus.keys()))})")
+        corpus = sampled_corpus
+
+    out = {
+        "name": name,
+        "source": hf_path,
+        "corpus_size": len(corpus),
+        "query_size": len(queries),
+        "qrels_size": sum(len(v) for v in qrels.values()),
+        "corpus": corpus,
+        "queries": queries,
+        "qrels": qrels,
+    }
+    path = DATA_DIR / out_file
+    with open(path, "w") as f:
+        json.dump(out, f, ensure_ascii=False)
+    print(f" Saved: {path} (corpus={len(corpus)}, queries={len(queries)}, qrels={sum(len(v) for v in qrels.values())})")
+
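+# A minimal sketch of how a consumer might read one of these dumps back
+# (illustrative only; load_benchmark is a hypothetical helper, not part of
+# this module):
+#
+#     def load_benchmark(path) -> tuple[dict, dict, dict]:
+#         with open(path) as f:
+#             data = json.load(f)
+#         return data["corpus"], data["queries"], data["qrels"]
+#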
+
+# ── BeIR English datasets ──
+
+
+def _load_mteb_beir_dataset(hf_path: str, name: str, out_file: str, *, qrels_split: str = "test") -> None:
+    """MTEB BeIR-format dataset — corpus/queries configs plus the default config (= qrels)."""
+    from datasets import load_dataset
+
+    print(f"Downloading {name}...")
+
+    # corpus
+    corpus_ds = load_dataset(hf_path, "corpus", split="corpus")
+    corpus = {}
+    for row in corpus_ds:
+        corpus[str(row["_id"])] = {"title": row.get("title", ""), "text": row.get("text", "")}
+
+    # queries
+    queries_ds = load_dataset(hf_path, "queries", split="queries")
+    queries = {}
+    for row in queries_ds:
+        queries[str(row["_id"])] = row.get("text", "")
+
+    # qrels (default config, test split)
+    qrels_ds = load_dataset(hf_path, "default", split=qrels_split)
+    qrels: dict[str, dict[str, int]] = {}
+    for row in qrels_ds:
+        qid = str(row.get("query-id", ""))
+        cid = str(row.get("corpus-id", ""))
+        score = row.get("score", 1)
+        if qid and cid:
+            qrels.setdefault(qid, {})[cid] = int(score)
+
+    out = {
+        "name": name,
+        "source": hf_path,
+        "corpus_size": len(corpus),
+        "query_size": len(queries),
+        "qrels_size": sum(len(v) for v in qrels.values()),
+        "corpus": corpus,
+        "queries": queries,
+        "qrels": qrels,
+    }
+    path = DATA_DIR / out_file
+    with open(path, "w") as f:
+        json.dump(out, f, ensure_ascii=False)
+    print(f" Saved: {path} (corpus={len(corpus)}, queries={len(queries)}, qrels={sum(len(v) for v in qrels.values())})")
+
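+# Note: the mteb/* English datasets (NFCorpus/SciFact/FiQA below) expose qrels
+# under the "default" config with a test split, while the multilingual sets use
+# "{lang}-qrels" configs; that difference is why two loaders exist.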
+
+def download_nfcorpus() -> None:
+    """NFCorpus — medical/nutrition domain (MTEB BeIR)."""
+    _load_mteb_beir_dataset("mteb/NFCorpus", "NFCorpus", "nfcorpus.json")
+
+
+def download_scifact() -> None:
+    """SciFact — scientific claim verification (MTEB BeIR)."""
+    _load_mteb_beir_dataset("mteb/SciFact", "SciFact", "scifact.json")
+
+
+def download_fiqa() -> None:
+    """FiQA — financial QA (MTEB BeIR, 57K corpus)."""
+    _load_mteb_beir_dataset("mteb/FiQA", "FiQA", "fiqa.json")
+
+
+# ── MTEB Korean datasets ──
+
+
+def download_miracl_retrieval_ko() -> None:
+    """MIRACLRetrieval Korean — core MTEB Korean retrieval benchmark (1.49M corpus → sampled)."""
+    _load_multilingual_beir_dataset(
+        "mteb/MIRACLRetrieval",
+        "MIRACLRetrieval-ko",
+        "miracl_retrieval_ko.json",
+        "ko",
+        max_corpus=10000,
+    )
+
+
+def download_multilongdoc_ko() -> None:
+    """MultiLongDocRetrieval Korean — long-document retrieval benchmark."""
+    _load_multilingual_beir_dataset(
+        "mteb/MultiLongDocRetrieval",
+        "MultiLongDocRetrieval-ko",
+        "multilongdoc_ko.json",
+        "ko",
+    )
+
+
+def download_xpqa_ko() -> None:
+    """XPQARetrieval Korean — multi-domain Korean retrieval."""
+    _load_multilingual_beir_dataset(
+        "mteb/XPQARetrieval",
+        "XPQARetrieval-ko",
+        "xpqa_ko.json",
+        "kor-kor",
+    )
+
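+# Note: XPQARetrieval names its Korean config "kor-kor" (presumably the
+# query-language/document-language pair), unlike the bare "ko" prefix used by
+# the other MTEB multilingual sets above.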
+
 def download_publichealthqa_ko() -> None:
     """PublicHealthQA Korean — medical/public health domain (BeIR format, korean- prefix)."""
     from datasets import load_dataset
@@ -426,6 +607,7 @@ def main() -> None:
     print("Downloading benchmark datasets from HuggingFace")
     print("=" * 60)
 
+    # existing datasets
     download_ko_strategyqa()
     download_autorag_retrieval()
     download_miracl_ko()
@@ -436,6 +618,16 @@ def main() -> None:
     download_publichealthqa_ko()
     download_hotpotqa()
 
+    # new: 3 English BeIR datasets
+    download_nfcorpus()
+    download_scifact()
+    download_fiqa()
+
+    # new: 3 MTEB Korean datasets
+    download_miracl_retrieval_ko()
+    download_multilongdoc_ko()
+    download_xpqa_ko()
+
     print("\n" + "=" * 60)
     print("All datasets downloaded!")
     print("=" * 60)