diff --git a/ir_datasets/formats/trec.py b/ir_datasets/formats/trec.py
index 3fd44ce4..66d649c0 100644
--- a/ir_datasets/formats/trec.py
+++ b/ir_datasets/formats/trec.py
@@ -126,20 +126,20 @@ def docs_iter(self):
def _docs_iter(self, path):
if Path(path).is_file():
- path_suffix = Path(path).suffix.lower()
- if path_suffix == '.gz':
+ path_suffix = Path(path).suffix
+ if path_suffix.lower() == '.gz' or path_suffix == '.z':
with gzip.open(path, 'rb') as f:
yield from self._parser(f)
- elif path_suffix in ['.z', '.0z', '.1z', '.2z']:
+ elif path_suffix in ['.Z', '.0Z', '.1Z', '.2Z']:
# unix "compress" command encoding
unlzw3 = ir_datasets.lazy_libs.unlzw3()
- with io.BytesIO(unlzw3.unlzw(path)) as f:
+ with io.BytesIO(unlzw3.unlzw(Path(path))) as f:
yield from self._parser(f)
else:
with open(path, 'rb') as f:
yield from self._parser(f)
elif Path(path).is_dir():
- for child in path.iterdir():
+ for child in sorted(Path(path).iterdir()):
yield from self._docs_iter(child)
def _parser_bs(self, stream):
diff --git a/test/dummy/trecdocs/compress_uc_0z.tar.gz b/test/dummy/trecdocs/compress_uc_0z.tar.gz
new file mode 100644
index 00000000..0b977aa3
Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_0z.tar.gz differ
diff --git a/test/dummy/trecdocs/compress_uc_0z/F00.0Z b/test/dummy/trecdocs/compress_uc_0z/F00.0Z
new file mode 100644
index 00000000..253d806f
Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_0z/F00.0Z differ
diff --git a/test/dummy/trecdocs/compress_uc_0z/F01.0Z b/test/dummy/trecdocs/compress_uc_0z/F01.0Z
new file mode 100644
index 00000000..b2b16c6c
Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_0z/F01.0Z differ
diff --git a/test/dummy/trecdocs/compress_uc_z.tar.gz b/test/dummy/trecdocs/compress_uc_z.tar.gz
new file mode 100644
index 00000000..0782e957
Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_z.tar.gz differ
diff --git a/test/dummy/trecdocs/compress_uc_z/F00.Z b/test/dummy/trecdocs/compress_uc_z/F00.Z
new file mode 100644
index 00000000..253d806f
Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_z/F00.Z differ
diff --git a/test/dummy/trecdocs/compress_uc_z/F01.Z b/test/dummy/trecdocs/compress_uc_z/F01.Z
new file mode 100644
index 00000000..b2b16c6c
Binary files /dev/null and b/test/dummy/trecdocs/compress_uc_z/F01.Z differ
diff --git a/test/dummy/trecdocs/gzip_gz.tar.gz b/test/dummy/trecdocs/gzip_gz.tar.gz
new file mode 100644
index 00000000..e07be60f
Binary files /dev/null and b/test/dummy/trecdocs/gzip_gz.tar.gz differ
diff --git a/test/dummy/trecdocs/gzip_gz/F00.gz b/test/dummy/trecdocs/gzip_gz/F00.gz
new file mode 100644
index 00000000..fe7a9145
Binary files /dev/null and b/test/dummy/trecdocs/gzip_gz/F00.gz differ
diff --git a/test/dummy/trecdocs/gzip_gz/F01.gz b/test/dummy/trecdocs/gzip_gz/F01.gz
new file mode 100644
index 00000000..cbf7e8fd
Binary files /dev/null and b/test/dummy/trecdocs/gzip_gz/F01.gz differ
diff --git a/test/dummy/trecdocs/gzip_uc_gz.tar.gz b/test/dummy/trecdocs/gzip_uc_gz.tar.gz
new file mode 100644
index 00000000..1bd8c750
Binary files /dev/null and b/test/dummy/trecdocs/gzip_uc_gz.tar.gz differ
diff --git a/test/dummy/trecdocs/gzip_uc_gz/F00.GZ b/test/dummy/trecdocs/gzip_uc_gz/F00.GZ
new file mode 100644
index 00000000..fe7a9145
Binary files /dev/null and b/test/dummy/trecdocs/gzip_uc_gz/F00.GZ differ
diff --git a/test/dummy/trecdocs/gzip_uc_gz/F01.GZ b/test/dummy/trecdocs/gzip_uc_gz/F01.GZ
new file mode 100644
index 00000000..cbf7e8fd
Binary files /dev/null and b/test/dummy/trecdocs/gzip_uc_gz/F01.GZ differ
diff --git a/test/dummy/trecdocs/gzip_z.tar.gz b/test/dummy/trecdocs/gzip_z.tar.gz
new file mode 100644
index 00000000..4fe6ae58
Binary files /dev/null and b/test/dummy/trecdocs/gzip_z.tar.gz differ
diff --git a/test/dummy/trecdocs/gzip_z/F00.z b/test/dummy/trecdocs/gzip_z/F00.z
new file mode 100644
index 00000000..fe7a9145
Binary files /dev/null and b/test/dummy/trecdocs/gzip_z/F00.z differ
diff --git a/test/dummy/trecdocs/gzip_z/F01.z b/test/dummy/trecdocs/gzip_z/F01.z
new file mode 100644
index 00000000..cbf7e8fd
Binary files /dev/null and b/test/dummy/trecdocs/gzip_z/F01.z differ
diff --git a/test/dummy/trecdocs/plaintext_noext.tar.gz b/test/dummy/trecdocs/plaintext_noext.tar.gz
new file mode 100644
index 00000000..6bc0f5b6
Binary files /dev/null and b/test/dummy/trecdocs/plaintext_noext.tar.gz differ
diff --git a/test/dummy/trecdocs/plaintext_noext/F00 b/test/dummy/trecdocs/plaintext_noext/F00
new file mode 100644
index 00000000..01dbba1b
--- /dev/null
+++ b/test/dummy/trecdocs/plaintext_noext/F00
@@ -0,0 +1,29 @@
+
+ D100A
+ Something
+ Some text
+
+
+ Header Text
+Daily Report
+
+
+
+
+Main body text
+on multiple lines
+
+with some markup
+ here. Also, some invalid markup &.
+
+
+
+
+
+ 101
+
+
+More body text
+
+
+
diff --git a/test/dummy/trecdocs/plaintext_noext/F01 b/test/dummy/trecdocs/plaintext_noext/F01
new file mode 100644
index 00000000..69a3dc38
--- /dev/null
+++ b/test/dummy/trecdocs/plaintext_noext/F01
@@ -0,0 +1,11 @@
+
+ D102
+ more text
+
+
+some very fun text
+ markup &
+
+
+
+
diff --git a/test/dummy/trecdocs/plaintext_txt.tar.gz b/test/dummy/trecdocs/plaintext_txt.tar.gz
new file mode 100644
index 00000000..beaa4c04
Binary files /dev/null and b/test/dummy/trecdocs/plaintext_txt.tar.gz differ
diff --git a/test/dummy/trecdocs/plaintext_txt/F00.txt b/test/dummy/trecdocs/plaintext_txt/F00.txt
new file mode 100644
index 00000000..01dbba1b
--- /dev/null
+++ b/test/dummy/trecdocs/plaintext_txt/F00.txt
@@ -0,0 +1,29 @@
+
+ D100A
+ Something
+ Some text
+
+
+ Header Text
+Daily Report
+
+
+
+
+Main body text
+on multiple lines
+
+with some markup
+ here. Also, some invalid markup &.
+
+
+
+
+
+ 101
+
+
+More body text
+
+
+
diff --git a/test/dummy/trecdocs/plaintext_txt/F01.txt b/test/dummy/trecdocs/plaintext_txt/F01.txt
new file mode 100644
index 00000000..69a3dc38
--- /dev/null
+++ b/test/dummy/trecdocs/plaintext_txt/F01.txt
@@ -0,0 +1,11 @@
+
+ D102
+ more text
+
+
+some very fun text
+ markup &
+
+
+
+
diff --git a/test/dummy/trecdocs/plaintext_uc_txt.tar.gz b/test/dummy/trecdocs/plaintext_uc_txt.tar.gz
new file mode 100644
index 00000000..ea0289f3
Binary files /dev/null and b/test/dummy/trecdocs/plaintext_uc_txt.tar.gz differ
diff --git a/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT b/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT
new file mode 100644
index 00000000..01dbba1b
--- /dev/null
+++ b/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT
@@ -0,0 +1,29 @@
+
+ D100A
+ Something
+ Some text
+
+
+ Header Text
+Daily Report
+
+
+
+
+Main body text
+on multiple lines
+
+with some markup
+ here. Also, some invalid markup &.
+
+
+
+
+
+ 101
+
+
+More body text
+
+
+
diff --git a/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT b/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT
new file mode 100644
index 00000000..69a3dc38
--- /dev/null
+++ b/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT
@@ -0,0 +1,11 @@
+
+ D102
+ more text
+
+
+some very fun text
+ markup &
+
+
+
+
diff --git a/test/formats/test_trec.py b/test/formats/test_trec.py
index b007984f..b445bbe7 100644
--- a/test/formats/test_trec.py
+++ b/test/formats/test_trec.py
@@ -1,8 +1,22 @@
import os
import shutil
import unittest
+import contextlib
from ir_datasets.formats import TrecQrel, TrecQrels, TrecQuery, TrecQueries, TrecDoc, TrecDocs
-from ir_datasets.util import StringFile
+from ir_datasets.util import StringFile, RelativePath
+
+
+class File:
+ def __init__(self, path):
+ self._path = path
+
+ def path(self, force=True):
+ return self._path
+
+ @contextlib.contextmanager
+ def stream(self):
+ yield open(self._path, 'rb')
+
class TestTrec(unittest.TestCase):
@@ -127,6 +141,29 @@ def test_docs(self):
self.assertEqual(docs.docs_path(), 'MOCK')
self.assertEqual(list(docs.docs_iter()), expected_results)
+
+ def test_docs_formats(self):
+ expected_results = [
+ TrecDoc(doc_id='D100A', text='\n\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n', marked_up_doc='\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n'),
+ TrecDoc(doc_id='101', text='\n\nMore body text\n\n', marked_up_doc='\nMore body text\n\n'),
+ TrecDoc(doc_id='D102', text='\n\nsome very fun text\n markup &\n\n\n', marked_up_doc='\nsome very fun text\n markup &\n\n\n'),
+ ]
+
+ for source in ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz', 'gzip_z', 'gzip_uc_gz', 'compress_uc_z', 'compress_uc_0z']:
+ with self.subTest(source):
+ print(source, "no paths")
+ docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}')))
+ self.assertEqual(list(docs.docs_iter()), expected_results)
+
+ print(source, "paths")
+ docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}')), path_globs=['F*'])
+ self.assertEqual(list(docs.docs_iter()), expected_results)
+
+ if source in ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz']:
+ print(source, "tarfile")
+ docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}.tar.gz')), path_globs=['*/F*'])
+ self.assertEqual(list(docs.docs_iter()), expected_results)
+
def tearDown(self):
if os.path.exists('MOCK.pklz4'):
shutil.rmtree('MOCK.pklz4')