diff --git a/ir_datasets/formats/trec.py b/ir_datasets/formats/trec.py index 3fd44ce4..66d649c0 100644 --- a/ir_datasets/formats/trec.py +++ b/ir_datasets/formats/trec.py @@ -126,20 +126,20 @@ def docs_iter(self): def _docs_iter(self, path): if Path(path).is_file(): - path_suffix = Path(path).suffix.lower() - if path_suffix == '.gz': + path_suffix = Path(path).suffix + if path_suffix.lower() == '.gz' or path_suffix == '.z': with gzip.open(path, 'rb') as f: yield from self._parser(f) - elif path_suffix in ['.z', '.0z', '.1z', '.2z']: + elif path_suffix in ['.Z', '.0Z', '.1Z', '.2Z']: # unix "compress" command encoding unlzw3 = ir_datasets.lazy_libs.unlzw3() - with io.BytesIO(unlzw3.unlzw(path)) as f: + with io.BytesIO(unlzw3.unlzw(Path(path))) as f: yield from self._parser(f) else: with open(path, 'rb') as f: yield from self._parser(f) elif Path(path).is_dir(): - for child in path.iterdir(): + for child in sorted(Path(path).iterdir()): yield from self._docs_iter(child) def _parser_bs(self, stream): diff --git a/test/dummy/trecdocs/compress_uc_0z.tar.gz b/test/dummy/trecdocs/compress_uc_0z.tar.gz new file mode 100644 index 00000000..0b977aa3 Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_0z.tar.gz differ diff --git a/test/dummy/trecdocs/compress_uc_0z/F00.0Z b/test/dummy/trecdocs/compress_uc_0z/F00.0Z new file mode 100644 index 00000000..253d806f Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_0z/F00.0Z differ diff --git a/test/dummy/trecdocs/compress_uc_0z/F01.0Z b/test/dummy/trecdocs/compress_uc_0z/F01.0Z new file mode 100644 index 00000000..b2b16c6c Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_0z/F01.0Z differ diff --git a/test/dummy/trecdocs/compress_uc_z.tar.gz b/test/dummy/trecdocs/compress_uc_z.tar.gz new file mode 100644 index 00000000..0782e957 Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_z.tar.gz differ diff --git a/test/dummy/trecdocs/compress_uc_z/F00.Z b/test/dummy/trecdocs/compress_uc_z/F00.Z new file mode 100644 index 00000000..253d806f Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_z/F00.Z differ diff --git a/test/dummy/trecdocs/compress_uc_z/F01.Z b/test/dummy/trecdocs/compress_uc_z/F01.Z new file mode 100644 index 00000000..b2b16c6c Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_z/F01.Z differ diff --git a/test/dummy/trecdocs/gzip_gz.tar.gz b/test/dummy/trecdocs/gzip_gz.tar.gz new file mode 100644 index 00000000..e07be60f Binary files /dev/null and b/test/dummy/trecdocs/gzip_gz.tar.gz differ diff --git a/test/dummy/trecdocs/gzip_gz/F00.gz b/test/dummy/trecdocs/gzip_gz/F00.gz new file mode 100644 index 00000000..fe7a9145 Binary files /dev/null and b/test/dummy/trecdocs/gzip_gz/F00.gz differ diff --git a/test/dummy/trecdocs/gzip_gz/F01.gz b/test/dummy/trecdocs/gzip_gz/F01.gz new file mode 100644 index 00000000..cbf7e8fd Binary files /dev/null and b/test/dummy/trecdocs/gzip_gz/F01.gz differ diff --git a/test/dummy/trecdocs/gzip_uc_gz.tar.gz b/test/dummy/trecdocs/gzip_uc_gz.tar.gz new file mode 100644 index 00000000..1bd8c750 Binary files /dev/null and b/test/dummy/trecdocs/gzip_uc_gz.tar.gz differ diff --git a/test/dummy/trecdocs/gzip_uc_gz/F00.GZ b/test/dummy/trecdocs/gzip_uc_gz/F00.GZ new file mode 100644 index 00000000..fe7a9145 Binary files /dev/null and b/test/dummy/trecdocs/gzip_uc_gz/F00.GZ differ diff --git a/test/dummy/trecdocs/gzip_uc_gz/F01.GZ b/test/dummy/trecdocs/gzip_uc_gz/F01.GZ new file mode 100644 index 00000000..cbf7e8fd Binary files /dev/null and b/test/dummy/trecdocs/gzip_uc_gz/F01.GZ differ diff --git a/test/dummy/trecdocs/gzip_z.tar.gz b/test/dummy/trecdocs/gzip_z.tar.gz new file mode 100644 index 00000000..4fe6ae58 Binary files /dev/null and b/test/dummy/trecdocs/gzip_z.tar.gz differ diff --git a/test/dummy/trecdocs/gzip_z/F00.z b/test/dummy/trecdocs/gzip_z/F00.z new file mode 100644 index 00000000..fe7a9145 Binary files /dev/null and b/test/dummy/trecdocs/gzip_z/F00.z differ diff --git a/test/dummy/trecdocs/gzip_z/F01.z b/test/dummy/trecdocs/gzip_z/F01.z new file mode 100644 index 00000000..cbf7e8fd Binary files /dev/null and b/test/dummy/trecdocs/gzip_z/F01.z differ diff --git a/test/dummy/trecdocs/plaintext_noext.tar.gz b/test/dummy/trecdocs/plaintext_noext.tar.gz new file mode 100644 index 00000000..6bc0f5b6 Binary files /dev/null and b/test/dummy/trecdocs/plaintext_noext.tar.gz differ diff --git a/test/dummy/trecdocs/plaintext_noext/F00 b/test/dummy/trecdocs/plaintext_noext/F00 new file mode 100644 index 00000000..01dbba1b --- /dev/null +++ b/test/dummy/trecdocs/plaintext_noext/F00 @@ -0,0 +1,29 @@ + + D100A + Something + Some text + + + Header Text +Daily Report + + + + +Main body text +on multiple lines + +with some markup + here. Also, some invalid markup &. + + + + + + 101 + + +More body text + + + diff --git a/test/dummy/trecdocs/plaintext_noext/F01 b/test/dummy/trecdocs/plaintext_noext/F01 new file mode 100644 index 00000000..69a3dc38 --- /dev/null +++ b/test/dummy/trecdocs/plaintext_noext/F01 @@ -0,0 +1,11 @@ + + D102 + more text + + +some very fun text + markup & + + + + diff --git a/test/dummy/trecdocs/plaintext_txt.tar.gz b/test/dummy/trecdocs/plaintext_txt.tar.gz new file mode 100644 index 00000000..beaa4c04 Binary files /dev/null and b/test/dummy/trecdocs/plaintext_txt.tar.gz differ diff --git a/test/dummy/trecdocs/plaintext_txt/F00.txt b/test/dummy/trecdocs/plaintext_txt/F00.txt new file mode 100644 index 00000000..01dbba1b --- /dev/null +++ b/test/dummy/trecdocs/plaintext_txt/F00.txt @@ -0,0 +1,29 @@ + + D100A + Something + Some text + + + Header Text +Daily Report + + + + +Main body text +on multiple lines + +with some markup + here. Also, some invalid markup &. + + + + + + 101 + + +More body text + + + diff --git a/test/dummy/trecdocs/plaintext_txt/F01.txt b/test/dummy/trecdocs/plaintext_txt/F01.txt new file mode 100644 index 00000000..69a3dc38 --- /dev/null +++ b/test/dummy/trecdocs/plaintext_txt/F01.txt @@ -0,0 +1,11 @@ + + D102 + more text + + +some very fun text + markup & + + + + diff --git a/test/dummy/trecdocs/plaintext_uc_txt.tar.gz b/test/dummy/trecdocs/plaintext_uc_txt.tar.gz new file mode 100644 index 00000000..ea0289f3 Binary files /dev/null and b/test/dummy/trecdocs/plaintext_uc_txt.tar.gz differ diff --git a/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT b/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT new file mode 100644 index 00000000..01dbba1b --- /dev/null +++ b/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT @@ -0,0 +1,29 @@ + + D100A + Something + Some text + + + Header Text +Daily Report + + + + +Main body text +on multiple lines + +with some markup + here. Also, some invalid markup &. + + + + + + 101 + + +More body text + + + diff --git a/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT b/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT new file mode 100644 index 00000000..69a3dc38 --- /dev/null +++ b/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT @@ -0,0 +1,11 @@ + + D102 + more text + + +some very fun text + markup & + + + + diff --git a/test/formats/test_trec.py b/test/formats/test_trec.py index b007984f..b445bbe7 100644 --- a/test/formats/test_trec.py +++ b/test/formats/test_trec.py @@ -1,8 +1,22 @@ import os import shutil import unittest +import contextlib from ir_datasets.formats import TrecQrel, TrecQrels, TrecQuery, TrecQueries, TrecDoc, TrecDocs -from ir_datasets.util import StringFile +from ir_datasets.util import StringFile, RelativePath + + +class File: + def __init__(self, path): + self._path = path + + def path(self, force=True): + return self._path + + @contextlib.contextmanager + def stream(self): + yield open(self._path, 'rb') + class TestTrec(unittest.TestCase): @@ -127,6 +141,29 @@ def test_docs(self): self.assertEqual(docs.docs_path(), 'MOCK') self.assertEqual(list(docs.docs_iter()), expected_results) + + def test_docs_formats(self): + expected_results = [ + TrecDoc(doc_id='D100A', text='\n\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n', marked_up_doc='\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n'), + TrecDoc(doc_id='101', text='\n\nMore body text\n\n', marked_up_doc='\nMore body text\n\n'), + TrecDoc(doc_id='D102', text='\n\nsome very fun text\n markup &\n\n\n', marked_up_doc='\nsome very fun text\n markup &\n\n\n'), + ] + + for source in ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz', 'gzip_z', 'gzip_uc_gz', 'compress_uc_z', 'compress_uc_0z']: + with self.subTest(source): + print(source, "no paths") + docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}'))) + self.assertEqual(list(docs.docs_iter()), expected_results) + + print(source, "paths") + docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}')), path_globs=['F*']) + self.assertEqual(list(docs.docs_iter()), expected_results) + + if source in ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz']: + print(source, "tarfile") + docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}.tar.gz')), path_globs=['*/F*']) + self.assertEqual(list(docs.docs_iter()), expected_results) + def tearDown(self): if os.path.exists('MOCK.pklz4'): shutil.rmtree('MOCK.pklz4')