From 9632f32eb4814880b942e2d4b764be4a3e9e1869 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 5 May 2022 21:59:17 +0100 Subject: [PATCH 1/2] handling .z files as gzip improved tests regarding #189 --- ir_datasets/formats/trec.py | 10 +++--- test/dummy/trecdocs/compress_uc_0z.tar.gz | Bin 0 -> 573 bytes test/dummy/trecdocs/compress_uc_0z/F00.0Z | Bin 0 -> 261 bytes test/dummy/trecdocs/compress_uc_0z/F01.0Z | Bin 0 -> 124 bytes test/dummy/trecdocs/compress_uc_z.tar.gz | Bin 0 -> 571 bytes test/dummy/trecdocs/compress_uc_z/F00.Z | Bin 0 -> 261 bytes test/dummy/trecdocs/compress_uc_z/F01.Z | Bin 0 -> 124 bytes test/dummy/trecdocs/gzip_gz.tar.gz | Bin 0 -> 577 bytes test/dummy/trecdocs/gzip_gz/F00.gz | Bin 0 -> 241 bytes test/dummy/trecdocs/gzip_gz/F01.gz | Bin 0 -> 143 bytes test/dummy/trecdocs/gzip_uc_gz.tar.gz | Bin 0 -> 581 bytes test/dummy/trecdocs/gzip_uc_gz/F00.GZ | Bin 0 -> 241 bytes test/dummy/trecdocs/gzip_uc_gz/F01.GZ | Bin 0 -> 143 bytes test/dummy/trecdocs/gzip_z.tar.gz | Bin 0 -> 578 bytes test/dummy/trecdocs/gzip_z/F00.z | Bin 0 -> 241 bytes test/dummy/trecdocs/gzip_z/F01.z | Bin 0 -> 143 bytes test/dummy/trecdocs/plaintext_noext.tar.gz | Bin 0 -> 442 bytes test/dummy/trecdocs/plaintext_noext/F00 | 29 +++++++++++++++ test/dummy/trecdocs/plaintext_noext/F01 | 11 ++++++ test/dummy/trecdocs/plaintext_txt.tar.gz | Bin 0 -> 442 bytes test/dummy/trecdocs/plaintext_txt/F00.txt | 29 +++++++++++++++ test/dummy/trecdocs/plaintext_txt/F01.txt | 11 ++++++ test/dummy/trecdocs/plaintext_uc_txt.tar.gz | Bin 0 -> 446 bytes test/dummy/trecdocs/plaintext_uc_txt/F00.TXT | 29 +++++++++++++++ test/dummy/trecdocs/plaintext_uc_txt/F01.TXT | 11 ++++++ test/formats/test_trec.py | 36 ++++++++++++++++++- 26 files changed, 160 insertions(+), 6 deletions(-) create mode 100644 test/dummy/trecdocs/compress_uc_0z.tar.gz create mode 100644 test/dummy/trecdocs/compress_uc_0z/F00.0Z create mode 100644 test/dummy/trecdocs/compress_uc_0z/F01.0Z create mode 100644 test/dummy/trecdocs/compress_uc_z.tar.gz create mode 100644 test/dummy/trecdocs/compress_uc_z/F00.Z create mode 100644 test/dummy/trecdocs/compress_uc_z/F01.Z create mode 100644 test/dummy/trecdocs/gzip_gz.tar.gz create mode 100644 test/dummy/trecdocs/gzip_gz/F00.gz create mode 100644 test/dummy/trecdocs/gzip_gz/F01.gz create mode 100644 test/dummy/trecdocs/gzip_uc_gz.tar.gz create mode 100644 test/dummy/trecdocs/gzip_uc_gz/F00.GZ create mode 100644 test/dummy/trecdocs/gzip_uc_gz/F01.GZ create mode 100644 test/dummy/trecdocs/gzip_z.tar.gz create mode 100644 test/dummy/trecdocs/gzip_z/F00.z create mode 100644 test/dummy/trecdocs/gzip_z/F01.z create mode 100644 test/dummy/trecdocs/plaintext_noext.tar.gz create mode 100644 test/dummy/trecdocs/plaintext_noext/F00 create mode 100644 test/dummy/trecdocs/plaintext_noext/F01 create mode 100644 test/dummy/trecdocs/plaintext_txt.tar.gz create mode 100644 test/dummy/trecdocs/plaintext_txt/F00.txt create mode 100644 test/dummy/trecdocs/plaintext_txt/F01.txt create mode 100644 test/dummy/trecdocs/plaintext_uc_txt.tar.gz create mode 100644 test/dummy/trecdocs/plaintext_uc_txt/F00.TXT create mode 100644 test/dummy/trecdocs/plaintext_uc_txt/F01.TXT diff --git a/ir_datasets/formats/trec.py b/ir_datasets/formats/trec.py index 3fd44ce4..66d649c0 100644 --- a/ir_datasets/formats/trec.py +++ b/ir_datasets/formats/trec.py @@ -126,20 +126,20 @@ def docs_iter(self): def _docs_iter(self, path): if Path(path).is_file(): - path_suffix = Path(path).suffix.lower() - if path_suffix == '.gz': + path_suffix = Path(path).suffix + if path_suffix.lower() == '.gz' or path_suffix == '.z': with gzip.open(path, 'rb') as f: yield from self._parser(f) - elif path_suffix in ['.z', '.0z', '.1z', '.2z']: + elif path_suffix in ['.Z', '.0Z', '.1Z', '.2Z']: # unix "compress" command encoding unlzw3 = ir_datasets.lazy_libs.unlzw3() - with io.BytesIO(unlzw3.unlzw(path)) as f: + with io.BytesIO(unlzw3.unlzw(Path(path))) as f: yield from self._parser(f) else: with open(path, 'rb') as f: yield from self._parser(f) elif Path(path).is_dir(): - for child in path.iterdir(): + for child in sorted(Path(path).iterdir()): yield from self._docs_iter(child) def _parser_bs(self, stream): diff --git a/test/dummy/trecdocs/compress_uc_0z.tar.gz b/test/dummy/trecdocs/compress_uc_0z.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..0b977aa3d5bd0a3a6393c3e32f3ecaa7188b998f GIT binary patch literal 573 zcmV-D0>b?tiwFQ>IdozG153`&EhtJYE{-ovjyI^%XP^ZzFfcGTHB|u9W)RxI$OJ+| zfr6olk+G?Pv9Xbnse*x_iJ^fZgMukdT!RKmi%SxVfKDk+P0Yim4yY_G4Uat3A8>kz zfMnG8cQY{5Gl-(H9l-D~Gcmy(|Ayu!(D*kpH#Rc?#lM-UF@u5umED0AjPUrEpF6>( z!$$Ivv*UqAzj;3x7@ItJB#b1Q9(;6qFrkIDQeoky4QHh-Qyng7Hn;ORcS&2BV9oyq zO$ICnAGaP*VRW*ilWrY)R+d2Lutm`0<&|MZ4pj0l`FS+?=8Kf}=-}8j*wqF|s5GI%V+CVOUXSL4!z&1R=sCiIXB4gH&0_5J{OZWXi|^ zK_%zTib@L-36q9R9uQK>z4IDOQK(I+!X3p6=X--l!0s=~phcXtJQA78EjSDI;- literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/compress_uc_0z/F01.0Z b/test/dummy/trecdocs/compress_uc_0z/F01.0Z new file mode 100644 index 0000000000000000000000000000000000000000..b2b16c6ca08a58e41bb60bcf3b151ff25cd6f995 GIT binary patch literal 124 zcmV-?0E7P@osc|;JQ?CcLBN4dp7;O(f=CP)G8ls3@j~E`gavsZp|Y^e8#!e_&?$q5 z4nv9?3mT+S1Pez6bKbNGX{U^wJQIz8fYRfkjD=?ExDiQ`j=>NuT7VcMbEcb-WI(*B eV=`&SFml$^fdj%skS0)eC@}bfrAw6*Efxfi(lfpQ literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/compress_uc_z.tar.gz b/test/dummy/trecdocs/compress_uc_z.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..0782e95732523dfac627f64d659db2f01070d134 GIT binary patch literal 571 zcmV-B0>u3viwFQ>IdozG153`&EhtJYE{-ovj<3>Zpb0QAFfcbYRRGgw5Zb`V1VTfB zf}x3#v8jQvv7x!Sf`Orlfsr|bf+=mBg9=KEOA?EKPAN`J%)_S+s4OiFk37^LaC(S< zWR&=KGceSPqPiWx@Gvtm!5#mG<|feiH!?RiGXcfFsi850f&tZCf(4B5_?MqM!KTAT z@{zOSfkwZ1KNuLBJa{CGB$^(4bb2tMg|$*);ie5|r7cq(E@(El^Er1(b4x7=V&QgY zExxyfGjn;*92?=@hK5Uz7xuKKz1kt@IjQfUfNoTSgmUq+*%MP7_N+`c$~$20xmXdQTP66=NA_zmEdU{BJM-`QOCA6rTT$j3D{nz-To8(*t15{{~G4 zEC(OA9#G`u{?s7k;w7jaBIZ|lc^QwaN#>n73x6~%cAp?Mv4zEOlM07b>W2%75rJur z%^qDWiicD>Co4%cheVzb@SJIoa_R)5qw4c>U0&QmoRiypnf>a3V|I$7)XLPD2985%#@qt4bC+!qPBWD3GPZ3N<2~du!>pieg&TMH zy6A4^yKz@443!VJ8>)rnto!ih_KxD&6MW~CzxQ2zWTSlix`B>^Q7{Td!6+CA008I& J(l7uH008wl5w`#U literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/compress_uc_z/F00.Z b/test/dummy/trecdocs/compress_uc_z/F00.Z new file mode 100644 index 0000000000000000000000000000000000000000..253d806fd0873fe4be79d8eb84d494e8f46e4f4c GIT binary patch literal 261 zcmV+g0s8(Qosc|;JQ?CcLBN4dp7;O(f=CP)FoG}y!Q+L%AqflefD%MX5hqj@PI=R& z3>`FS+?=8Kf}=-}8j*wqF|s5GI%V+CVOUXSL4!z&1R=sCiIXB4gH&0_5J{OZWXi|^ zK_%zTib@L-36q9R9uQK>z4IDOQK(I+!X3p6=X--l!0s=~phcXtJQA78EjSDI;- literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/compress_uc_z/F01.Z b/test/dummy/trecdocs/compress_uc_z/F01.Z new file mode 100644 index 0000000000000000000000000000000000000000..b2b16c6ca08a58e41bb60bcf3b151ff25cd6f995 GIT binary patch literal 124 zcmV-?0E7P@osc|;JQ?CcLBN4dp7;O(f=CP)G8ls3@j~E`gavsZp|Y^e8#!e_&?$q5 z4nv9?3mT+S1Pez6bKbNGX{U^wJQIz8fYRfkjD=?ExDiQ`j=>NuT7VcMbEcb-WI(*B eV=`&SFml$^fdj%skS0)eC@}bfrAw6*Efxfi(lfpQ literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_gz.tar.gz b/test/dummy/trecdocs/gzip_gz.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..e07be60f8c354f4c2e8e7f87cafeb8d94105c02a GIT binary patch literal 577 zcmV-H0>1qpiwFQ>IdozG152;UEQn99(r2I{FfcGMH#JoN(`FFbz{mtbLxF;!iIK6X zfw7U9nTdjdp^=%9DT9J3Egb~|rNt$QML?$%rzYm%QwLO*mWD?j>JK0U3Fxvi((ekpI^3eLBY+b&z^of zRaCvcr+@ZY_6S)`psMonN^8O631zcUXmsiWMik(Dv11V>vukPhS32 zm*}t7;EjrxgItVPrW~80(OatiCX>UkQTOJ0_Y=#ip3YHEw&h!BJNL5dhZp^NHxCNV z`D()TF*BHb=dpL|PK0wkzOuyp<6nz{2T==J3u~v(pQLyx^yGid`@Q|97vmWRI=BZNK;(bJ0n7hJhUW16Z)^t5 z|7N54pB?~||Gz-RB! PU?c$mK0=;h01f~ENM{}i literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_gz/F00.gz b/test/dummy/trecdocs/gzip_gz/F00.gz new file mode 100644 index 0000000000000000000000000000000000000000..fe7a9145e8341e3349011366cf8778004a9116cd GIT binary patch literal 241 zcmV4)1T*f75;&R?JQ-^&zvaebGck|EMn$^R+ejgK|aUjYCB7TR;! literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_gz/F01.gz b/test/dummy/trecdocs/gzip_gz/F01.gz new file mode 100644 index 0000000000000000000000000000000000000000..cbf7e8fdf024ad388839cad0b7011092fd306d4a GIT binary patch literal 143 zcmV;A0C4{wiwFqxE_7l514b}0040sF4#F@DM0bD1v9Lu2R*WS?K`hV?9l9bg3rbUk z1hs#U+b*{B?)=PUbrNEm_ey|ryV(KYVgbb0M&LtKBpuQe7$D;2+}i#IulQ*k!57pA xyA`~1H+P9w&fyun*DmP`AtyLzvPb;n9_!<+-kYdcOf-KK`U5<@e$|fv0067rKV<*_ literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_uc_gz.tar.gz b/test/dummy/trecdocs/gzip_uc_gz.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..1bd8c7502996f86ee3e37ada29a6ebb114650401 GIT binary patch literal 581 zcmV-L0=oSliwFQ>IdozG152;UEQl{nj!&=BXP^}@FfcGTHB|u9W)RxI$OJ+|fr6ol zk+G?Pv9W=%fr5b{kTzgYFr}G`kU(j1Nn#PuDaEOYdHB=;m8GTOk%#&NP7e`~j);FZ z0|PzxC>q!S3=cCC6WsA{Y-R|Je;iGjH}gMtAK+=2l{c>K$Eb8t-2D@kHt z21Y+aY`|&1!v+F-errEFvNCH@oZ&Q`Nvk|2gsFPO#HPCH$XFM}ET})fY~_N2n^T`X z{dlUVdVR~2$b8WNvC4IfCvx0c?!2rGHRn3NaQT(U|HtmI3KtbCPI#g1tH;K2c&eVf z{Hrd}U#-C#6)y+57_Uq@HbbMgRQ*jRhhd}c&GqglmQ_8Sqn>Qbx6pR(W!Dcc`t@!e z6rA(bgzIBwF#FD9@7A3N=X!i)iTTIB76lKY7B2Si&MJ#-paZaw{&hI)$dSABR zR=r`cU+Y}_{BQS9i&PfYPM<$X@lxo?|C;xE`%N##GY)ic4?2L7{|yHw{~H;a!}Gtf z894tN8kmmee|i8={{I5a|Aq{zy&L%s8SuE2*B{-u!Id?vN7cnq=K`PXE+qqQui#de zt@Zu4^|pO^_vcJ%sZi4F?_Lc>yKgc+NMby4-R*&Mh>Gkb5ebDy&+gp%$F}}Qrp!U{ zE9|>P_Y|9#cZQ#PD7ojl-b-#pb&K>(?th-hy?(s*-s}|-J|^`dznE?I)?Mu%=&%?C TqhJ(_f{_FOrtP{u01f~E01+cz literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_uc_gz/F00.GZ b/test/dummy/trecdocs/gzip_uc_gz/F00.GZ new file mode 100644 index 0000000000000000000000000000000000000000..fe7a9145e8341e3349011366cf8778004a9116cd GIT binary patch literal 241 zcmV4)1T*f75;&R?JQ-^&zvaebGck|EMn$^R+ejgK|aUjYCB7TR;! literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_uc_gz/F01.GZ b/test/dummy/trecdocs/gzip_uc_gz/F01.GZ new file mode 100644 index 0000000000000000000000000000000000000000..cbf7e8fdf024ad388839cad0b7011092fd306d4a GIT binary patch literal 143 zcmV;A0C4{wiwFqxE_7l514b}0040sF4#F@DM0bD1v9Lu2R*WS?K`hV?9l9bg3rbUk z1hs#U+b*{B?)=PUbrNEm_ey|ryV(KYVgbb0M&LtKBpuQe7$D;2+}i#IulQ*k!57pA xyA`~1H+P9w&fyun*DmP`AtyLzvPb;n9_!<+-kYdcOf-KK`U5<@e$|fv0067rKV<*_ literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_z.tar.gz b/test/dummy/trecdocs/gzip_z.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..4fe6ae587e974a477b18b68b1e7ba467184c8b15 GIT binary patch literal 578 zcmV-I0=@koiwFQ>IdozG152;UEQqhtXP_l8FfcGTHB|u9W)RxI$OJ+|fr6olk+G?P zv9W=vk%EDtk%@^JgMukdT?GZD#U+VFK&KR^Cg$N&2UM1phDRRi4>&zUKsqG;-3$!% zs%T{gFg(mmOmN4)k)b&>{*BCy%}jvtZ){{>z@T72D+eKi5gz~Y-5eZW^h%N#n1M-v zLA7@y-ys7Ym-70f8#lPJhV`hrIO<&Blij6c!0i>>%Cfb-|F+(?FYo@GNi7vhn*H6Y zp=kF_#s^7^N3Od)a1K$Cy(A)`@aWl{TmRVB|Hza%D1L=~x9FZ?^YYH{a}Oo=T-STa zt*CC1zRCU16S>!q*WR1GBErX{UgQ_E&EC4J{R15qBN4#!zrn!ee`7O4c>Xss0_T52 zbEDDxPY(df|5Kp(-+&=D;I!Xi1A#riwVxeXnKdcSaGK7fRUQ+xUQpdN&UW&iQJ>^)WM;edn=v>rRAoJ-)KU{NrDXf(KCx7khYTn71AEiTa;7 zCs%Rjcby}>FWYac-Z0p&buNDXxBI6>Dhq3;&!41tDfHxj&HKImrWfNG2RgV%!6+C7 QqhQbg0Limj5&#YW0H9JP`2YX_ literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_z/F00.z b/test/dummy/trecdocs/gzip_z/F00.z new file mode 100644 index 0000000000000000000000000000000000000000..fe7a9145e8341e3349011366cf8778004a9116cd GIT binary patch literal 241 zcmV4)1T*f75;&R?JQ-^&zvaebGck|EMn$^R+ejgK|aUjYCB7TR;! literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/gzip_z/F01.z b/test/dummy/trecdocs/gzip_z/F01.z new file mode 100644 index 0000000000000000000000000000000000000000..cbf7e8fdf024ad388839cad0b7011092fd306d4a GIT binary patch literal 143 zcmV;A0C4{wiwFqxE_7l514b}0040sF4#F@DM0bD1v9Lu2R*WS?K`hV?9l9bg3rbUk z1hs#U+b*{B?)=PUbrNEm_ey|ryV(KYVgbb0M&LtKBpuQe7$D;2+}i#IulQ*k!57pA xyA`~1H+P9w&fyun*DmP`AtyLzvPb;n9_!<+-kYdcOf-KK`U5<@e$|fv0067rKV<*_ literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/plaintext_noext.tar.gz b/test/dummy/trecdocs/plaintext_noext.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..6bc0f5b6336e7fdeddaa9bca8fc5dafd058cc0d6 GIT binary patch literal 442 zcmV;r0Y&~FiwFQ>IdozG1MO2wPr^VD=DfdRym%7WZeIs#lg2_Ykw*{{Z>HKs8ryEu z2g<*9N*kUE7to;n4m-)b_S>ELBH!`Clz*g8p}@0QI{_%Ax~js$YK5gtDQpRl6eg=w zraF^>N{Xz@HBe7MW*b?OIx$j7xD)Qn!?ej{PrQ7`(w+&Tt@(E-J^CH^ADW_kKYvx$ z%lR{%VF2f^sjLRaXYvL9`TPxgaAT7E><>%;TcXqg0HaxWh%vNA?f%$=dlB$->V>Zu zD}BW1mI25&D6c{XL3i7-?{52T%&;D?D|ETz@)*W>meE8Xu^rE!!-z*BP611rs~TdA z+mG0%hnoq{!ksU=6A?m?`KcHA9DFb22_f%ZIt8Nx!)u8$6Ox=s;KXlPgbh0;OnJ;3 zVEKu-DuP}(b9~PQ19=yTaOnh5s{vS0+)5Tt&+X>VNK{(?mnBIN^EHy&U4DDK2eGSv z>F9SH(m#_{`j@p*|8k}OpTMsE*@pacg$MbLZz{al@`a{mJf5$KaFT_k6fWv@co9Ls kalc#;Sqk;~N_JLn*dphrmkJdsRQN0S1a;REy8sRV07OpKcK`qY literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/plaintext_noext/F00 b/test/dummy/trecdocs/plaintext_noext/F00 new file mode 100644 index 00000000..01dbba1b --- /dev/null +++ b/test/dummy/trecdocs/plaintext_noext/F00 @@ -0,0 +1,29 @@ + + D100A + Something + Some text + + + Header Text +Daily Report + + + + +Main body text +on multiple lines + +with some markup + here. Also, some invalid markup &. + + + + + + 101 + + +More body text + + + diff --git a/test/dummy/trecdocs/plaintext_noext/F01 b/test/dummy/trecdocs/plaintext_noext/F01 new file mode 100644 index 00000000..69a3dc38 --- /dev/null +++ b/test/dummy/trecdocs/plaintext_noext/F01 @@ -0,0 +1,11 @@ + + D102 + more text + + +some very fun text + markup & + + + + diff --git a/test/dummy/trecdocs/plaintext_txt.tar.gz b/test/dummy/trecdocs/plaintext_txt.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..beaa4c046286c008ac5931a29a5ae1c0f2716108 GIT binary patch literal 442 zcmV;r0Y&~FiwFQ>IdozG1MO2wPr^VD=DfdRym%7W-99)_n=}H!L>@s*yqPMCH1?Ht zi}3HA(gr9hUO|J(%;o^5az&#G zZKxb5Q>ms`KsyedB9d6z5mJeT?eELOwCQwDJb%a9o(aOj{5zD^&>r{>{107Kzn{OR zH1hfL2FCzp8q;_M=z)2BiGMzSqcylONqY7NCP0f(Y5{;z&pgB!TBCM}~8 zuKw8x^v~Hw|B9aLUn%wf8~Ch$UXXvPa4+5QqQbK+UukMCqUDwdr-`3S;i6iFm%#G` k?$-o@L_)Q?k)72WHp%(vr9_DmCH@LN0O4Uf(*O + D100A + Something + Some text + + + Header Text +Daily Report + + + + +Main body text +on multiple lines + +with some markup + here. Also, some invalid markup &. + + + + + + 101 + + +More body text + + + diff --git a/test/dummy/trecdocs/plaintext_txt/F01.txt b/test/dummy/trecdocs/plaintext_txt/F01.txt new file mode 100644 index 00000000..69a3dc38 --- /dev/null +++ b/test/dummy/trecdocs/plaintext_txt/F01.txt @@ -0,0 +1,11 @@ + + D102 + more text + + +some very fun text + markup & + + + + diff --git a/test/dummy/trecdocs/plaintext_uc_txt.tar.gz b/test/dummy/trecdocs/plaintext_uc_txt.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..ea0289f3ad06eb3b54e57ba7a01bfe1646eb355d GIT binary patch literal 446 zcmV;v0YUyBiwFQ>IdozG1MO5vPr^VH<~+aRdGRDL(*+LHCXGNaktJ9YZluB>jos2t z5&pfeOIQUjh(Y@u`jWTJH}l@tz_Dyk@TC~Xld)KeYU%u-l-5-hQd`R`m6Xip0H#Q? zN@YeR29znPtdv0g0WJrKBVmO|CE}KMDi70UvlH?B9&39g2=?dSrnF*?%yXWB?@?2f zl6-r89iYi5H2^@XW*(vqjA5&1>hSEl zT+D6n4P&{FXq`L&$p$4==pg8B8O^81UJEmf7wifhZcTXzW|E~<(MOt=?W|zP13wgi zCDlz0(ahEe`*d+L;nkn6((c6fz>OVY2M!0v_IO0dhb`uywPA3N{|z0Ygo$f~?{R<) z+d9m7$SYttk$;y3ZEs;Y_7pVaoh8DptA9U{mZpn|5BmEj`9+{RmjVR}6!;F8~ez02RU7K>z>% literal 0 HcmV?d00001 diff --git a/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT b/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT new file mode 100644 index 00000000..01dbba1b --- /dev/null +++ b/test/dummy/trecdocs/plaintext_uc_txt/F00.TXT @@ -0,0 +1,29 @@ + + D100A + Something + Some text + + + Header Text +Daily Report + + + + +Main body text +on multiple lines + +with some markup + here. Also, some invalid markup &. + + + + + + 101 + + +More body text + + + diff --git a/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT b/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT new file mode 100644 index 00000000..69a3dc38 --- /dev/null +++ b/test/dummy/trecdocs/plaintext_uc_txt/F01.TXT @@ -0,0 +1,11 @@ + + D102 + more text + + +some very fun text + markup & + + + + diff --git a/test/formats/test_trec.py b/test/formats/test_trec.py index b007984f..694c4dc6 100644 --- a/test/formats/test_trec.py +++ b/test/formats/test_trec.py @@ -1,8 +1,22 @@ import os import shutil import unittest +import contextlib from ir_datasets.formats import TrecQrel, TrecQrels, TrecQuery, TrecQueries, TrecDoc, TrecDocs -from ir_datasets.util import StringFile +from ir_datasets.util import StringFile, RelativePath + + +class File: + def __init__(self, path): + self._path = path + + def path(self, force=True): + return self._path + + @contextlib.contextmanager + def stream(self): + yield open(self._path, 'rb') + class TestTrec(unittest.TestCase): @@ -127,6 +141,26 @@ def test_docs(self): self.assertEqual(docs.docs_path(), 'MOCK') self.assertEqual(list(docs.docs_iter()), expected_results) + + def test_docs_formats(self): + expected_results = [ + TrecDoc(doc_id='D100A', text='\n\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n', marked_up_doc='\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n'), + TrecDoc(doc_id='101', text='\n\nMore body text\n\n', marked_up_doc='\nMore body text\n\n'), + TrecDoc(doc_id='D102', text='\n\nsome very fun text\n markup &\n\n\n', marked_up_doc='\nsome very fun text\n markup &\n\n\n'), + ] + + for source in ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz', 'gzip_z', 'gzip_uc_gz', 'compress_uc_z', 'compress_uc_0z']: + with self.subTest(source): + docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}'))) + self.assertEqual(list(docs.docs_iter()), expected_results) + + docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}')), path_globs=['F*']) + self.assertEqual(list(docs.docs_iter()), expected_results) + + if source in ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz']: + docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}.tar.gz')), path_globs=['*/F*']) + self.assertEqual(list(docs.docs_iter()), expected_results) + def tearDown(self): if os.path.exists('MOCK.pklz4'): shutil.rmtree('MOCK.pklz4') From 1377ec1cf1902a9b82723264dd4f1bc32afb7121 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 5 May 2022 22:10:54 +0100 Subject: [PATCH 2/2] diagnosing Windows build --- test/formats/test_trec.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/formats/test_trec.py b/test/formats/test_trec.py index 694c4dc6..b445bbe7 100644 --- a/test/formats/test_trec.py +++ b/test/formats/test_trec.py @@ -151,13 +151,16 @@ def test_docs_formats(self): for source in ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz', 'gzip_z', 'gzip_uc_gz', 'compress_uc_z', 'compress_uc_0z']: with self.subTest(source): + print(source, "no paths") docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}'))) self.assertEqual(list(docs.docs_iter()), expected_results) + print(source, "paths") docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}')), path_globs=['F*']) self.assertEqual(list(docs.docs_iter()), expected_results) if source in ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz']: + print(source, "tarfile") docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}.tar.gz')), path_globs=['*/F*']) self.assertEqual(list(docs.docs_iter()), expected_results)