diff --git a/Cargo.lock b/Cargo.lock index f0899e3f..6a10cab4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,24 +19,24 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.1" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" @@ -58,12 +58,13 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "once_cell_polyfill", + "windows-sys 0.61.2", ] [[package]] @@ -155,6 +156,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + [[package]] name = "byteorder" version = "1.4.3" @@ -190,20 +197,19 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.3.19" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fd304a20bff958a57f04c4e96a2e7594cc4490a0e809cbd48bb6437edaa452d" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.19" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01c6a3f08f1fe5662a35cfe393aec09c4df95f60ee93b7556505260f75eee9e1" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ "anstream", "anstyle", @@ -213,11 +219,11 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.3.12" +version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", "syn 2.0.89", @@ -225,9 +231,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "colorchoice" @@ -278,33 +284,19 @@ checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" [[package]] name = "errno" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" -dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ - "cc", "libc", + "windows-sys 0.61.2", ] [[package]] name = "fastrand" -version = "1.9.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" [[package]] name = "fnv" @@ -428,12 +420,6 @@ version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -449,12 +435,6 @@ dependencies = [ "libc", ] -[[package]] -name = "hermit-abi" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" - [[package]] name = "http" version = "0.2.9" @@ -489,36 +469,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" [[package]] -name = "instant" -version = "0.1.12" +name = "is_terminal_polyfill" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "io-lifetimes" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" -dependencies = [ - "hermit-abi 0.3.1", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "is-terminal" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" -dependencies = [ - "hermit-abi 0.3.1", - "io-lifetimes", - "rustix", - "windows-sys 0.48.0", -] +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -537,9 +491,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "libc" -version = "0.2.142" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libdeflate-sys" @@ -561,9 +515,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.3.7" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] name = "log" @@ -655,7 +609,7 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi", "libc", ] @@ -689,6 +643,12 @@ version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "openssl-probe" version = "0.1.5" @@ -802,7 +762,7 @@ version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "pyo3-build-config", "quote", @@ -854,15 +814,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags", -] - [[package]] name = "ring" version = "0.17.3" @@ -891,16 +842,15 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.37.19" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", - "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] @@ -977,7 +927,7 @@ version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254" dependencies = [ - "bitflags", + "bitflags 1.3.2", "core-foundation", "core-foundation-sys", "libc", @@ -1037,9 +987,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" @@ -1071,15 +1021,14 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.5.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", "rustix", - "windows-sys 0.45.0", + "windows-sys 0.52.0", ] [[package]] @@ -1200,9 +1149,9 @@ dependencies = [ [[package]] name = "utf8parse" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "wasi" @@ -1220,6 +1169,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-sys" version = "0.42.0" @@ -1237,35 +1192,38 @@ dependencies = [ [[package]] name = "windows-sys" -version = "0.45.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.42.2", + "windows-targets 0.48.0", ] [[package]] name = "windows-sys" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.48.0", + "windows-targets 0.52.6", ] [[package]] -name = "windows-targets" -version = "0.42.2" +name = "windows-sys" +version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", ] [[package]] @@ -1283,6 +1241,22 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -1295,6 +1269,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -1307,6 +1287,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -1319,6 +1305,18 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -1331,6 +1329,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -1343,6 +1347,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -1355,6 +1365,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -1366,3 +1382,9 @@ name = "windows_x86_64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/pybigtools/pybigtools/__init__.py b/pybigtools/pybigtools/__init__.py index e3b625e0..d2915503 100644 --- a/pybigtools/pybigtools/__init__.py +++ b/pybigtools/pybigtools/__init__.py @@ -1,10 +1,95 @@ # ruff: noqa: F403 # ruff: noqa: F405 # This is here to make sure that source installs with pip work correctly +import warnings as _warnings + from pybigtools.pybigtools import * from pybigtools import pybigtools as bt __doc__ = bt.__doc__ if hasattr(bt, "__all__"): __all__ = bt.__all__ + +_RustBBIReader = bt.BBIReader +_rust_open = bt.open + + +# Sentinel distinguishing "user did not pass" from "user passed None" in `values`. +_UNSET = object() + + +class BBIReader: + """Python-side wrapper around the Rust ``BBIReader`` that stages a + deprecation cycle on the ``values()`` parameters. All other attributes + are delegated to the underlying Rust object. + """ + + __slots__ = ("_rust",) + + def __init__(self, rust_reader): + object.__setattr__(self, "_rust", rust_reader) + + def __getattr__(self, name): + return getattr(self._rust, name) + + def __enter__(self): + self._rust.__enter__() + return self + + def __exit__(self, exc_type, exc_value, traceback): + return self._rust.__exit__(exc_type, exc_value, traceback) + + def values( + self, + chrom, + start=None, + end=None, + bins=None, + summary="mean", + exact=False, + uncovered=None, + oob=float("nan"), + fillna=_UNSET, + missing=_UNSET, + arr=None, + ): + if missing is not _UNSET: + _warnings.warn( + "`missing` is deprecated and will be removed in a future " + "release; use `fillna` instead.", + DeprecationWarning, + stacklevel=2, + ) + if fillna is _UNSET: + fillna = missing + if fillna is _UNSET: + _warnings.warn( + "The default behavior of `values()` has changed: empty bins / " + "uncovered positions are now returned as NaN instead of " + "filled with 0. Pass `fillna=0` to keep the previous " + "behavior, or `fillna=None` to silence this warning. See " + "also `uncovered` to include uncovered bases in summary " + "statistic calculations.", + DeprecationWarning, + stacklevel=2, + ) + fillna = None + return self._rust.values( + chrom, + start, + end, + bins=bins, + summary=summary, + exact=exact, + uncovered=uncovered, + oob=oob, + fillna=fillna, + arr=arr, + ) + + +def open(path, mode="r"): # noqa: A001 - intentional shadow of builtin + return BBIReader(_rust_open(path, mode)) + + del bt diff --git a/pybigtools/pyproject.toml b/pybigtools/pyproject.toml index b9c6c180..8e7d9662 100644 --- a/pybigtools/pyproject.toml +++ b/pybigtools/pyproject.toml @@ -36,6 +36,7 @@ dev = [ {include-group = "test"} ] test = [ + "pybbi>=0.4.2", "pytest", "smart_open[http]", ] @@ -48,6 +49,15 @@ repository = "https://github.com/jackh726/bigtools" [tool.maturin] python-source = "." +[tool.pytest.ini_options] +filterwarnings = [ + # Deprecation warnings from pybigtools' `values()` are expected during + # the deprecation cycle and would otherwise flood the test output. + # Tests that exercise the deprecation path itself use + # `pytest.warns(DeprecationWarning)` locally. + "ignore::DeprecationWarning:pybigtools", +] + [tool.uv] default-groups = ["dev"] cache-keys = [{file = "pyproject.toml"}, {file = "Cargo.toml"}, {file = "**/*.rs"}] diff --git a/pybigtools/src/coverage.rs b/pybigtools/src/coverage.rs new file mode 100644 index 00000000..794d5f57 --- /dev/null +++ b/pybigtools/src/coverage.rs @@ -0,0 +1,1165 @@ +use bigtools::{BBIReadError, BedEntry, Value}; +use std::cmp::Reverse; +use std::collections::BinaryHeap; + +/// Converts sorted [`BedEntry`] intervals into non-overlapping coverage depth [`Value`] intervals. +/// +/// This iterator consumes a stream of sorted, potentially overlapping BED entries and produces a +/// stream of non-overlapping intervals whose values represent the coverage depth (number of +/// overlapping entries) over each range. Adjacent intervals with the same depth are merged, so +/// the output is a run-length-encoded coverage track. +/// +/// The input BED entries **must** be sorted by start position, then end position. +/// +/// # Notes +/// +/// Implements a 1-D sweep-line over endpoint events: each entry contributes a `+1` at `start` and +/// a `-1` at `end`; the running sum at any scan position is the coverage depth at that base. +/// +/// Memory scales with the number of intervals overlapping the current scan position (not with the +/// total number of input entries). The internal heap holds one pending `end` event for each such +/// interval until the scan moves past it. +/// +/// # Example +/// +/// ```rust +/// use bigtools::{BedEntry, Value}; +/// use pybigtools::coverage::CoverageIterator; +/// +/// let entries = vec![ +/// Ok(BedEntry { start: 10, end: 30, rest: "".to_string() }), +/// Ok(BedEntry { start: 20, end: 40, rest: "".to_string() }), +/// ]; +/// +/// let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); +/// let intervals: Result, _> = coverage_iter.collect(); +/// +/// // Result: [10-20): coverage=1, [20-30): coverage=2, [30-40): coverage=1 +/// ``` +pub struct CoverageIterator { + iter: std::iter::Peekable, + ends: BinaryHeap>, + coverage: u32, + run_start: Option, + finished: bool, +} + +impl CoverageIterator +where + I: Iterator>, +{ + pub fn new(iter: I) -> Result { + Ok(Self { + iter: iter.peekable(), + ends: BinaryHeap::new(), + coverage: 0, + run_start: None, + finished: false, + }) + } +} + +impl Iterator for CoverageIterator +where + I: Iterator>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + if self.finished { + return None; + } + + loop { + // Determine the next event position: the minimum of the next entry's start + // (peeked from the iterator) and the smallest pending end in the heap. + let next_start = match self.iter.peek() { + Some(Ok(e)) => Some(e.start), + Some(Err(_)) => { + // Surface the iterator error + let err = match self.iter.next() { + Some(Err(e)) => e, + _ => unreachable!(), + }; + self.finished = true; + return Some(Err(err)); + } + None => None, + }; + let next_end = self.ends.peek().map(|&Reverse(p)| p); + let pos = match (next_start, next_end) { + (Some(s), Some(e)) => s.min(e), + (Some(s), None) => s, + (None, Some(e)) => e, + (None, None) => { + self.finished = true; + return None; + } + }; + + // Process all events at `pos`. Start events come from the iterator (in sort order); + // end events come from the heap. Zero-length entries (start == end == pos) push an + // end into the heap during the start drain that is then consumed by the end drain + // in the same iteration -- so they contribute net 0 to delta, as expected. + let mut delta: i32 = 0; + + // Drain start events at `pos` + while matches!(self.iter.peek(), Some(Ok(e)) if e.start == pos) { + let entry = match self.iter.next() { + Some(Ok(e)) => e, + _ => unreachable!(), + }; + delta += 1; + self.ends.push(Reverse(entry.end)); + } + + // Drain end events at `pos` (including any just pushed by zero-length entries) + while matches!(self.ends.peek(), Some(&Reverse(p)) if p == pos) { + self.ends.pop(); + delta -= 1; + } + + // Update coverage. `checked_add_signed` surfaces malformed input (e.g. an entry + // with start > end, which would queue a -1 event before its matching +1) as a + // panic rather than silent wrong output. + let old_cov = self.coverage; + let new_cov = old_cov.checked_add_signed(delta).expect( + "CoverageIterator: coverage underflow (likely malformed input: start > end)", + ); + + // If coverage didn't change at this event position, the current run continues; + // no emit, no state update. This collapses adjacent same-depth intervals + // (run-length encoding), matching the conventional "coverage track" semantics + if old_cov == new_cov { + continue; + } + + // Close the previous run if there was one. Skip emit if `run_start >= pos`, + // which only happens under malformed or out-of-sort-order input. + let interval_to_emit = match self.run_start { + Some(start) if old_cov > 0 && start < pos => Some(Value { + start, + end: pos, + value: old_cov as f32, + }), + _ => None, + }; + + self.coverage = new_cov; + self.run_start = if new_cov > 0 { Some(pos) } else { None }; + + if let Some(interval) = interval_to_emit { + return Some(Ok(interval)); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_streaming_simple() { + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 30, + end: 40, + rest: "entry2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 2); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 30, + end: 40, + value: 1.0 + } + ); + } + + #[test] + fn test_streaming_overlapping() { + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 30, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 20, + end: 40, + rest: "entry2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 20, + end: 30, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 30, + end: 40, + value: 1.0 + } + ); + } + + #[test] + fn test_streaming_complex() { + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 25, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 35, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 30, + end: 45, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 5); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 25, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 25, + end: 30, + value: 1.0 + } + ); + assert_eq!( + intervals[3], + Value { + start: 30, + end: 35, + value: 2.0 + } + ); + assert_eq!( + intervals[4], + Value { + start: 35, + end: 45, + value: 1.0 + } + ); + } + + #[test] + fn test_same_start_positions() { + // Test multiple entries with the same start position (sorted by start, then by end) + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 10, + end: 25, + rest: "entry3".to_string(), + }), + Ok(BedEntry { + start: 10, + end: 30, + rest: "entry2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-20): coverage=3, [20-25): coverage=2, [25-30): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 3.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 20, + end: 25, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 25, + end: 30, + value: 1.0 + } + ); + } + + #[test] + fn test_same_end_positions() { + // Test multiple entries with the same end position (already sorted by start) + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 30, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 30, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 20, + end: 30, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-15): coverage=1, [15-20): coverage=2, [20-30): coverage=3 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 20, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 20, + end: 30, + value: 3.0 + } + ); + } + + #[test] + fn test_identical_intervals() { + // Test multiple identical intervals + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-20): coverage=3 + assert_eq!(intervals.len(), 1); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 3.0 + } + ); + } + + #[test] + fn test_adjacent_intervals() { + // Test intervals that are adjacent (end of one = start of next) + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 20, + end: 30, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 30, + end: 40, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Adjacent intervals with the same coverage are merged: [10-40): coverage=1 + assert_eq!(intervals.len(), 1); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 40, + value: 1.0 + } + ); + } + + #[test] + fn test_empty_input() { + let entries: Vec> = vec![]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 0); + } + + #[test] + fn test_single_entry() { + let entries = vec![Ok(BedEntry { + start: 5, + end: 15, + rest: "single".to_string(), + })]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 1); + assert_eq!( + intervals[0], + Value { + start: 5, + end: 15, + value: 1.0 + } + ); + } + + #[test] + fn test_nested_intervals() { + // One interval completely inside another + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 30, + rest: "outer".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 25, + rest: "inner".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-15): coverage=1, [15-25): coverage=2, [25-30): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 25, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 25, + end: 30, + value: 1.0 + } + ); + } + + #[test] + fn test_mixed_same_positions() { + // Some entries start where others end at the same position + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 20, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 20, + end: 30, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-15): coverage=1, [15-20): coverage=2, [20-30): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 20, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 20, + end: 30, + value: 1.0 + } + ); + } + + #[test] + fn test_many_overlapping() { + // Five intervals all overlapping each other + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "1".to_string(), + }), + Ok(BedEntry { + start: 12, + end: 22, + rest: "2".to_string(), + }), + Ok(BedEntry { + start: 14, + end: 24, + rest: "3".to_string(), + }), + Ok(BedEntry { + start: 16, + end: 26, + rest: "4".to_string(), + }), + Ok(BedEntry { + start: 18, + end: 28, + rest: "5".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Should produce multiple intervals with increasing then decreasing coverage + assert!(intervals.len() > 5); // More intervals than input entries + assert!(intervals.iter().any(|v| v.value >= 4.0)); // Should reach high coverage + + // Verify no gaps and proper ordering + for i in 1..intervals.len() { + assert_eq!( + intervals[i - 1].end, + intervals[i].start, + "Gap detected between intervals" + ); + } + } + + #[test] + fn test_single_base_intervals() { + // Test intervals that are exactly 1 base pair long + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 11, + rest: "1bp".to_string(), + }), + Ok(BedEntry { + start: 12, + end: 13, + rest: "1bp".to_string(), + }), + Ok(BedEntry { + start: 14, + end: 15, + rest: "1bp".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 11, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 12, + end: 13, + value: 1.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 14, + end: 15, + value: 1.0 + } + ); + } + + #[test] + fn test_large_gaps() { + // Test intervals with large gaps between them + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "first".to_string(), + }), + Ok(BedEntry { + start: 1000, + end: 1010, + rest: "second".to_string(), + }), + Ok(BedEntry { + start: 100000, + end: 100010, + rest: "third".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 1000, + end: 1010, + value: 1.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 100000, + end: 100010, + value: 1.0 + } + ); + } + + #[test] + fn test_start_at_zero() { + // Test intervals starting at position 0 + let entries = vec![ + Ok(BedEntry { + start: 0, + end: 10, + rest: "zero_start".to_string(), + }), + Ok(BedEntry { + start: 5, + end: 15, + rest: "overlapping".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 0, + end: 5, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 5, + end: 10, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + } + + #[test] + fn test_boundary_positions() { + // Test intervals with position 0 and various boundary conditions + let entries = vec![ + Ok(BedEntry { + start: 0, + end: 1, + rest: "zero".to_string(), + }), + Ok(BedEntry { + start: 1, + end: 2, + rest: "one".to_string(), + }), + Ok(BedEntry { + start: 4294967290, + end: 4294967295, + rest: "high".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Adjacent intervals [0,1) and [1,2) with the same coverage merge into [0,2) + assert_eq!(intervals.len(), 2); + assert_eq!( + intervals[0], + Value { + start: 0, + end: 2, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 4294967290, + end: 4294967295, + value: 1.0 + } + ); + } + + #[test] + fn test_very_long_interval() { + // Test an interval spanning a very large genomic region + let entries = vec![ + Ok(BedEntry { + start: 1000, + end: 1000000, + rest: "long".to_string(), + }), + Ok(BedEntry { + start: 500000, + end: 500010, + rest: "short_inside".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Should handle large intervals correctly + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 1000, + end: 500000, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 500000, + end: 500010, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 500010, + end: 1000000, + value: 1.0 + } + ); + } + + #[test] + fn test_zero_length_intervals() { + // Test intervals with zero length (start == end) + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 10, + rest: "zero_length1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 20, + rest: "normal".to_string(), + }), + Ok(BedEntry { + start: 25, + end: 25, + rest: "zero_length2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Zero-length intervals should be ignored (produce no coverage) + // Only the normal interval [15, 20) should produce coverage + assert_eq!(intervals.len(), 1); + assert_eq!( + intervals[0], + Value { + start: 15, + end: 20, + value: 1.0 + } + ); + } + + #[test] + fn test_zero_length_with_overlaps() { + // Test zero-length intervals mixed with overlapping normal intervals + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "normal1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 15, + rest: "zero_length".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 25, + rest: "normal2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Zero-length interval should not affect coverage calculation + // Should get: [10-15): coverage=1, [15-20): coverage=2, [20-25): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 20, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 20, + end: 25, + value: 1.0 + } + ); + } + + #[test] + fn test_malformed_start_greater_than_end() { + // An entry with start > end is malformed. The current algorithm doesn't crash + // (the end is only pushed to the heap after the start is consumed, so the +1 always + // precedes the -1 and coverage stays non-negative). It emits no intervals because + // the end position is less than prev_pos, failing the emit check. This test pins + // that behavior so a future change that errors out is visible. + let entries = vec![Ok(BedEntry { + start: 20, + end: 10, + rest: "malformed".to_string(), + })]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals, vec![]); + } + + #[test] + fn test_sort_violation_produces_incorrect_output() { + // Out-of-order input is a documented precondition violation. The algorithm + // doesn't detect it and produces incorrect (but non-panicking) output. This test + // pins the current behavior so any future fix (e.g. error or debug assertion) is + // visible as a test change. + // + // Correct output for sorted [(5,15), (10,20)] would be: + // [5-10): cov=1, [10-15): cov=2, [15-20): cov=1 + // With the entries presented out of order, we instead get a collapsed range at + // cov=2 covering the entire [5, 15) span. + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "first".to_string(), + }), + Ok(BedEntry { + start: 5, + end: 15, + rest: "out_of_order".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!( + intervals, + vec![ + Value { + start: 5, + end: 15, + value: 2.0, + }, + Value { + start: 15, + end: 20, + value: 1.0, + }, + ] + ); + } + + #[test] + fn test_many_same_position() { + // 1000 entries all starting at the same position. Stresses the start-drain loop + // which has to consume all of them in a single outer iteration. Output should be + // a single interval at the full depth. + let n = 1000u32; + let entries: Vec> = (0..n) + .map(|_| { + Ok(BedEntry { + start: 100, + end: 200, + rest: String::new(), + }) + }) + .collect(); + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!( + intervals, + vec![Value { + start: 100, + end: 200, + value: n as f32, + }] + ); + } + + #[test] + fn test_stress_many_overlapping() { + // 10k entries with high pairwise overlap. Stresses the heap (peak depth ~ `width`) + // and verifies correctness via algorithmic invariants: + // 1. Total "base-coverage units" emitted equals the sum of input entry lengths. + // 2. Intervals are non-overlapping, sorted, and contiguous (no zero-coverage gaps + // in a region covered by at least one entry). + // 3. The full plateau region (positions `width-1 .. n`) is at coverage == `width`. + // + // We deliberately don't compare to a brute-force RLE reference because the iterator + // emits at every event position even when coverage doesn't change between events + // (which a coverage-based RLE would merge). + let n: u32 = 10_000; + let width: u32 = 100; + + let entries: Vec> = (0..n) + .map(|i| { + Ok(BedEntry { + start: i, + end: i + width, + rest: String::new(), + }) + }) + .collect(); + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Invariant 1: total base-coverage units == sum of input entry lengths + let total_base_cov: u64 = intervals + .iter() + .map(|v| (v.end - v.start) as u64 * v.value as u64) + .sum(); + let expected_total: u64 = n as u64 * width as u64; + assert_eq!(total_base_cov, expected_total); + + // Invariant 2: intervals are sorted, contiguous, all coverage > 0 + for v in &intervals { + assert!(v.value > 0.0, "zero-coverage interval emitted"); + assert!(v.start < v.end, "degenerate interval emitted"); + } + for w in intervals.windows(2) { + assert_eq!( + w[0].end, w[1].start, + "non-contiguous intervals (no entry has cov=0 in this fixture)" + ); + } + + // Invariant 3: plateau coverage equals `width` + let mid_pos = n / 2; + let at_mid = intervals + .iter() + .find(|v| v.start <= mid_pos && v.end > mid_pos) + .expect("no interval covers mid_pos"); + assert_eq!(at_mid.value, width as f32); + } +} diff --git a/pybigtools/src/errors.rs b/pybigtools/src/errors.rs index 1e70e118..4dbb5c8e 100644 --- a/pybigtools/src/errors.rs +++ b/pybigtools/src/errors.rs @@ -27,9 +27,9 @@ impl ToPyErr for bigtools::ZoomIntervalError { fn to_py_err(self) -> PyErr { match self { bigtools::ZoomIntervalError::ReductionLevelNotFound => { - PyErr::new::(format!( - "The passed reduction level was not found" - )) + PyErr::new::( + "The passed reduction level was not found".to_string(), + ) } _ => PyErr::new::(format!("{}", self)), } diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index a9bf3006..3238d2d9 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -11,17 +11,17 @@ use pyo3::types::{PyAny, PyString}; use pyo3::wrap_pyfunction; use url::Url; +mod coverage; mod errors; mod file_like; +mod raster; mod reader; -mod utils; mod writer; use errors::{BBIFileClosed, BBIReadError}; use file_like::PyFileLikeObject; -use reader::{BBIRead, BigBedEntriesIterator, BigWigIntervalIterator}; -use utils::BBIReadRaw; -use writer::{BigBedWrite, BigWigWrite}; +use reader::{BBIReadRaw, BBIReader, BigBedEntriesIterator, BigWigIntervalIterator}; +use writer::{BigBedWriter, BigWigWriter}; impl Reopen for PyFileLikeObject { fn reopen(&self) -> io::Result { @@ -84,22 +84,20 @@ fn open( } if iswrite { - return Err(PyErr::new::(format!( - "Writing only supports file names", - ))); + return Err(PyErr::new::( + "Writing only supports file names".to_string(), + )); } let file_like = match PyFileLikeObject::new(path_url_or_file_like, true, false, true) { Ok(file_like) => file_like, - Err(_) => return Err(PyErr::new::(format!( - "Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object.", - ))), + Err(_) => return Err(PyErr::new::("Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object.".to_string())), }; let read = match GenericBBIRead::open(file_like) { - Ok(GenericBBIRead::BigWig(bigwig)) => BBIRead { + Ok(GenericBBIRead::BigWig(bigwig)) => BBIReader { bbi: BBIReadRaw::BigWigFileLike(bigwig.cached()), } .into_py(py), - Ok(GenericBBIRead::BigBed(bigbed)) => BBIRead { + Ok(GenericBBIRead::BigBed(bigbed)) => BBIReader { bbi: BBIReadRaw::BigBedFileLike(bigbed.cached()), } .into_py(py), @@ -123,29 +121,26 @@ fn open_path_or_url( { Some(e) => e.to_string(), None => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb).".to_string())); } }; let res = if iswrite { - match Url::parse(&path_url_or_file_like) { - Ok(_) => { - return Err(PyErr::new::(format!( - "Invalid file path. Writing does not support urls." - ))) - } - Err(_) => {} + if Url::parse(&path_url_or_file_like).is_ok() { + return Err(PyErr::new::( + "Invalid file path. Writing does not support urls.".to_string(), + )); } match extension.as_ref() { - "bw" | "bigWig" | "bigwig" => BigWigWrite { + "bw" | "bigWig" | "bigwig" => BigWigWriter { bigwig: Some(path_url_or_file_like), } .into_py(py), - "bb" | "bigBed" | "bigbed" => BigBedWrite { + "bb" | "bigBed" | "bigbed" => BigBedWriter { bigbed: Some(path_url_or_file_like), } .into_py(py), _ => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb).".to_string())); } } } else { @@ -154,9 +149,10 @@ fn open_path_or_url( match Url::parse(&path_url_or_file_like) { Ok(_) => {} Err(_) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::( "Invalid file path. The file does not exist and it is not a url." - ))) + .to_string(), + )) } } } @@ -164,27 +160,27 @@ fn open_path_or_url( "bw" | "bigWig" | "bigwig" => { if isfile { match BigWigReadRaw::open_file(&path_url_or_file_like) { - Ok(bwr) => BBIRead { + Ok(bwr) => BBIReader { bbi: BBIReadRaw::BigWigFile(bwr.cached()), } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( - "Error opening bigWig." - ))) + return Err(PyErr::new::( + "Error opening bigWig.".to_string(), + )) } } } else { #[cfg(feature = "remote")] match BigWigReadRaw::open(RemoteFile::new(&path_url_or_file_like)) { - Ok(bwr) => BBIRead { + Ok(bwr) => BBIReader { bbi: BBIReadRaw::BigWigRemote(bwr.cached()), } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( - "Error opening bigWig." - ))) + return Err(PyErr::new::( + "Error opening bigWig.".to_string(), + )) } } @@ -199,27 +195,27 @@ fn open_path_or_url( "bb" | "bigBed" | "bigbed" => { if isfile { match BigBedReadRaw::open_file(&path_url_or_file_like) { - Ok(bwr) => BBIRead { + Ok(bwr) => BBIReader { bbi: BBIReadRaw::BigBedFile(bwr.cached()), } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( - "Error opening bigBed." - ))) + return Err(PyErr::new::( + "Error opening bigBed.".to_string(), + )) } } } else { #[cfg(feature = "remote")] match BigBedReadRaw::open(RemoteFile::new(&path_url_or_file_like)) { - Ok(bwr) => BBIRead { + Ok(bwr) => BBIReader { bbi: BBIReadRaw::BigBedRemote(bwr.cached()), } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( - "Error opening bigBed." - ))) + return Err(PyErr::new::( + "Error opening bigBed.".to_string(), + )) } } @@ -232,7 +228,7 @@ fn open_path_or_url( } } _ => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb).".to_string())); } } }; @@ -246,9 +242,9 @@ fn pybigtools(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(open))?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -257,7 +253,6 @@ fn pybigtools(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { let collections = PyModule::import_bound(py, "collections")?; let namedtuple = collections.getattr("namedtuple")?; - let summary_statistics = namedtuple.call( ( "SummaryStatistics".to_object(py), diff --git a/pybigtools/src/raster.rs b/pybigtools/src/raster.rs new file mode 100644 index 00000000..31dd6e5d --- /dev/null +++ b/pybigtools/src/raster.rs @@ -0,0 +1,1135 @@ +use bigtools::{BBIFileRead, BBIReadError, BedEntry, BigBedRead, BigWigRead, Value, ZoomRecord}; +use numpy::ndarray::ArrayViewMut; +use numpy::{PyArray1, PyArrayMethods}; +use pyo3::exceptions; +use pyo3::prelude::*; +use std::ops::IndexMut; + +use crate::coverage::CoverageIterator; +use crate::errors::ConvertResult; + +#[derive(Copy, Clone, Debug)] +pub enum Summary { + Mean, + Std, + Min, + Max, + Sum, + SumSquares, + BasesCovered, + BinCovered, +} + +#[derive(Debug)] +enum BBIRecord { + Value(Value), + BedEntry(BedEntry), + ZoomRecord(ZoomRecord), +} + +#[derive(Copy, Clone, Debug)] +struct SummaryStats { + sum: f64, + sum_squares: f64, + min: f64, + max: f64, + bases_covered: f64, +} + +enum BBIRead<'a, R: BBIFileRead> { + BigWig(&'a mut BigWigRead), + BigBed(&'a mut BigBedRead), +} + +impl BBIRead<'_, R> { + pub fn chroms(&self) -> &[bigtools::ChromInfo] { + match self { + BBIRead::BigWig(bw) => bw.chroms(), + BBIRead::BigBed(bb) => bb.chroms(), + } + } + + /// Resolve a genomic range with potentially unspecified start and end positions. + /// Returns a tuple of (start, end, chrom_length). + pub fn resolve_range( + &self, + chrom_name: &str, + start: Option, + end: Option, + ) -> PyResult<(i32, i32, i32)> { + let chroms = self.chroms(); + let chrom = chroms.iter().find(|x| x.name == chrom_name); + let length = match chrom { + None => { + return Err(PyErr::new::(format!( + "No chromomsome with name `{}` found.", + chrom_name + ))) + } + Some(c) => c.length as i32, + }; + Ok((start.unwrap_or(0), end.unwrap_or(length), length)) + } + + pub fn zoom_headers(&self) -> &[bigtools::ZoomHeader] { + match self { + BBIRead::BigWig(bw) => bw.info().zoom_headers.as_slice(), + BBIRead::BigBed(bb) => bb.info().zoom_headers.as_slice(), + } + } + + /// Find the closest zoom resolution that does not exceed the desired bin size. + /// If no such zoom level exists, returns None. + pub fn best_zoom(&self, start: i32, end: i32, bins: usize) -> Option { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + self.zoom_headers() + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level) + .map(|z| z.reduction_level) + } +} + +#[allow(clippy::too_many_arguments)] +pub fn intervals_to_array( + py: Python<'_>, + bw: &mut BigWigRead, + chrom: &str, + start: Option, + end: Option, + bins: Option, + summary: Summary, + exact: bool, + uncovered: Option, + oob: f64, + fillna: Option, + arr: Option, +) -> PyResult { + let mut bbi = BBIRead::BigWig(bw); + records_to_array( + py, &mut bbi, chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, + ) +} + +#[allow(clippy::too_many_arguments)] +pub fn entries_to_array( + py: Python<'_>, + bb: &mut BigBedRead, + chrom: &str, + start: Option, + end: Option, + bins: Option, + summary: Summary, + exact: bool, + uncovered: Option, + oob: f64, + fillna: Option, + arr: Option, +) -> PyResult { + let mut bbi = BBIRead::BigBed(bb); + records_to_array( + py, &mut bbi, chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, + ) +} + +#[allow(clippy::too_many_arguments)] +fn records_to_array( + py: Python<'_>, + bbi: &mut BBIRead, + chrom: &str, + start: Option, + end: Option, + bins: Option, + summary: Summary, + exact: bool, + uncovered: Option, + oob: f64, + fillna: Option, + arr: Option, +) -> PyResult { + let (start, end, chrom_length) = bbi.resolve_range(chrom, start, end)?; + let (valid_start, valid_end) = (start.max(0) as u32, end.min(chrom_length) as u32); + let arr = arr.unwrap_or_else(|| { + let size = bins.unwrap_or((end - start) as usize); + let init = uncovered.unwrap_or(f64::NAN); + PyArray1::from_vec_bound(py, vec![init; size]).to_object(py) + }); + let array: &Bound<'_, PyArray1> = + arr.downcast_bound::>(py).map_err(|_| { + PyErr::new::( + "`arr` option must be a one-dimensional numpy array, if passed.", + ) + })?; + + // Fill the array + match bins { + // Base-level raster + None => { + if array.len()? != (end - start) as usize { + return Err(PyErr::new::(format!( + "Passed `arr` does not have the expected length (expected `{}`, found `{}`).", + (end - start) as usize, + array.len()?, + ))); + } + let mut array = array.readwrite(); + let view = array.as_array_mut(); + match bbi { + BBIRead::BigWig(bw) => { + let iter = bw + .get_interval(chrom, valid_start, valid_end) + .convert_err()? + .map(|item| item.map(BBIRecord::Value)); + fill_values(start, end, iter, uncovered, view).convert_err()?; + } + BBIRead::BigBed(bb) => { + let iter = bb + .get_interval(chrom, valid_start, valid_end) + .convert_err()? + .map(|item| item.map(BBIRecord::BedEntry)); + fill_values(start, end, iter, uncovered, view).convert_err()?; + } + } + } + // Reduction + Some(bins) => { + if array.len()? != bins { + return Err(PyErr::new::(format!( + "Passed `arr` does not have the expected length (expected `{}`, found `{}`).", + bins, + array.len()?, + ))); + } + let reduction_level = if !exact { + bbi.best_zoom(start, end, bins) + } else { + None + }; + let mut array = array.readwrite(); + let view = array.as_array_mut(); + match reduction_level { + // Interpolate from zoom-level summaries + Some(reduction_level) => match bbi { + BBIRead::BigWig(bw) => { + let iter = bw + .get_zoom_interval(chrom, valid_start, valid_end, reduction_level) + .convert_err()? + .map(|item| item.map(BBIRecord::ZoomRecord)); + fill_binned(start, end, iter, summary, bins, uncovered, view) + .convert_err()?; + } + BBIRead::BigBed(bb) => { + let iter = bb + .get_zoom_interval(chrom, valid_start, valid_end, reduction_level) + .convert_err()? + .map(|item| item.map(BBIRecord::ZoomRecord)); + fill_binned(start, end, iter, summary, bins, uncovered, view) + .convert_err()?; + } + }, + // Aggregate from base-level data + None => match bbi { + BBIRead::BigWig(bw) => { + let iter = bw + .get_interval(chrom, valid_start, valid_end) + .convert_err()? + .map(|item| item.map(BBIRecord::Value)); + fill_binned(start, end, iter, summary, bins, uncovered, view) + .convert_err()?; + } + BBIRead::BigBed(bb) => { + let iter = bb + .get_interval(chrom, valid_start, valid_end) + .convert_err()?; + let coverage_iter = CoverageIterator::new(iter) + .convert_err()? + .map(|result| result.map(BBIRecord::Value)); + fill_binned(start, end, coverage_iter, summary, bins, uncovered, view) + .convert_err()?; + } + }, + } + } + }; + + // Post-process fill for NaN positions. Applied before filling out-of-bounds. + let mut array = array.readwrite(); + let mut view = array.as_array_mut(); + if let Some(fill) = fillna { + for v in view.iter_mut() { + if v.is_nan() { + *v = fill; + } + } + } + + // Handle out-of-bounds (bins/positions that overlap out-of-chromosome coordinates). + fill_out_of_bounds(start, end, chrom_length, bins, oob, view.view_mut()); + + Ok(arr) +} + +/// Rasterize intervals at nucleotide resolution. +/// +/// For [`Value`] intervals, the values are assigned over the covered bases. +/// For [`BedEntry`] intervals, the total coverage is accumulated over each base. +/// For [`ZoomRecord`] intervals, the bin mean statistic is assigned over the covered bases. +/// +/// # Arguments +/// * `start` - Start coordinate of the target range +/// * `end` - End coordinate of the target range +/// * `iter` - Iterator of data intervals +/// * `uncovered` - Value to use for bases with no data; `None` leaves them as NaN +/// * `view` - Mutable numpy array view to fill (must have length == end - start) +fn fill_values>>( + start: i32, + end: i32, + iter: I, + uncovered: Option, + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(view.len(), (end - start) as usize); + view.fill(f64::NAN); + + for interval in iter { + let interval = interval?; + + // Clamp interval to our target range + let interval_start = match interval { + BBIRecord::Value(v) => (v.start as i32).max(start), + BBIRecord::BedEntry(ref e) => (e.start as i32).max(start), + BBIRecord::ZoomRecord(ref z) => (z.start as i32).max(start), + }; + let interval_end = match interval { + BBIRecord::Value(v) => (v.end as i32).min(end), + BBIRecord::BedEntry(ref e) => (e.end as i32).min(end), + BBIRecord::ZoomRecord(ref z) => (z.end as i32).min(end), + }; + if interval_start >= interval_end { + continue; + } + + // Fill values + let rel_start = (interval_start - start) as usize; + let rel_end = (interval_end - start) as usize; + let value = match interval { + BBIRecord::Value(v) => v.value as f64, + BBIRecord::BedEntry(_) => 1.0, + BBIRecord::ZoomRecord(z) => z.summary.sum / (z.end - z.start) as f64, + }; + for i in rel_start..rel_end { + let val = *view.index_mut(i); + *view.index_mut(i) = if val.is_nan() { value } else { val + value }; + } + } + + // Finalize: replace remaining NaN with `uncovered` if a fill was requested. + if let Some(fill) = uncovered { + for val in view.iter_mut() { + if val.is_nan() { + *val = fill; + } + } + } + + Ok(()) +} + +/// Rasterize intervals over a uniform grid of bins. +/// +/// This function iterates over each data interval and accumulates its contribution to the +/// overlapping bins, weighted by the overlap amount. +/// +/// For [`Value`] intervals, the values are aggregated based on bin overlap. +/// For [`ZoomRecord`] intervals, the summary statistics are aggregated based on bin overlap. +/// +/// # Notes +/// [`BedEntry`] intervals are not supported. They must be converted to coverage [`Value`] +/// intervals. See [`CoverageIterator`] for details. +/// +/// Per-bin `bases_covered` is normalized up to the next integer (matching UCSC's convention), +/// with `sum` and `sum_squares` rescaled proportionally so that `mean = sum / bases_covered` +/// is invariant. This only has an effect on the [`ZoomRecord`] path, where partial overlap of +/// a zoom record can produce a fractional share of its `validCount`. +/// +/// `uncovered` is the per-base value assigned to uncovered bases within each bin. `None` +/// means "exclude from the summary" and yields the classical statistics over only the +/// covered bases. `Some(v)` (commonly `Some(0.0)`) means "treat uncovered bases as having +/// value `v`"; passing `Some(0.0)` reproduces UCSC's `*0` summary statistics. The choice +/// applies uniformly: there are no separate `*0` variants of `Mean`, `Std`, `Min`, `Max`. +/// +/// # Arguments +/// * `start` - Start coordinate of the target range +/// * `end` - End coordinate of the target range +/// * `iter` - Iterator of data intervals +/// * `summary` - How to aggregate values +/// * `bins` - Number of bins to divide the range into +/// * `uncovered` - Per-base value for uncovered bases; `None` excludes them +/// * `view` - Mutable numpy array view to fill (must have length == bins) +fn fill_binned>>( + start: i32, + end: i32, + iter: I, + summary: Summary, + bins: usize, + uncovered: Option, + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(view.len(), bins); + let bin_size = (end - start) as f64 / bins as f64; + + // Initialize statistics for each bin + let mut stats = vec![ + SummaryStats { + sum: 0.0, + sum_squares: 0.0, + min: f64::NAN, + max: f64::NAN, + bases_covered: 0.0, + }; + bins + ]; + + // Process each data interval + for interval_result in iter { + let interval = interval_result?; + + // Clamp interval to our target range + let interval_start = match interval { + BBIRecord::Value(v) => (v.start as i32).max(start), + BBIRecord::BedEntry(ref e) => (e.start as i32).max(start), + BBIRecord::ZoomRecord(ref z) => (z.start as i32).max(start), + }; + let interval_end = match interval { + BBIRecord::Value(v) => (v.end as i32).min(end), + BBIRecord::BedEntry(ref e) => (e.end as i32).min(end), + BBIRecord::ZoomRecord(ref z) => (z.end as i32).min(end), + }; + if interval_start >= interval_end { + continue; + } + + // Determine which bins this interval overlaps. + // For the last bin, we need to handle the case where `interval_end` (open-ended) exactly + // aligns with a bin boundary (in which case, we exclude the bin to the right). + let rel_start = interval_start - start; + let rel_end = interval_end - start; + let first_bin = ((rel_start as f64) / bin_size).floor() as usize; + let last_bin = if (rel_end as f64) % bin_size == 0.0 { + (((rel_end as f64) / bin_size) - 1.0).floor() as usize + } else { + ((rel_end as f64) / bin_size).floor() as usize + }; + // Clamp bins to `[0, bins - 1]` in case of floating point inaccuracies + let first_bin = first_bin.min(bins - 1); + let last_bin = last_bin.min(bins - 1); + + // Accumulate contribution of interval to each overlapping bin + for (i, stat) in stats + .iter_mut() + .enumerate() + .take(last_bin + 1) + .skip(first_bin) + { + let (bin_start, bin_end) = integer_bin_bounds(start, i, bin_size); + + // Calculate integer overlap between interval and bin + let overlap_start = interval_start.max(bin_start); + let overlap_end = interval_end.min(bin_end); + let overlap = (overlap_end - overlap_start) as f64; + + if overlap > 0.0 { + match interval { + BBIRecord::Value(v) => { + let value = v.value as f64; + stat.sum += value * overlap; + stat.sum_squares += value * value * overlap; + stat.min = stat.min.min(value); + stat.max = stat.max.max(value); + stat.bases_covered += overlap; + } + BBIRecord::ZoomRecord(z) => { + let summary = z.summary; + let overlap_factor = overlap / (z.end - z.start) as f64; + stat.sum += summary.sum * overlap_factor; + stat.sum_squares += summary.sum_squares * overlap_factor; + stat.min = stat.min.min(summary.min_val); + stat.max = stat.max.max(summary.max_val); + stat.bases_covered += summary.bases_covered as f64 * overlap_factor; + } + BBIRecord::BedEntry(_) => { + panic!("BED records must be converted to non-overlapping coverage intervals prior to binning."); + } + } + } + } + } + + // Normalize bases_covered to an integer count, matching UCSC's convention. + // + // For the Value path, accumulated `bases_covered` is already integer-valued (overlaps are + // computed in integer coordinates), so `ceil` is a no-op. + // + // For the ZoomRecord path, a partial overlap of a zoom record contributes a fractional + // share of its `validCount` (e.g. a zoom summarizing 10 bases with validCount=7, half + // overlapped, contributes 3.5). UCSC rounds the per-bin total up to the next integer and + // rescales `sum` and `sum_squares` by `ceil(bc) / bc` to keep `mean = sum / bases_covered` + // invariant. The `ceil` direction preserves the invariant that `bases_covered >= 1` + // whenever any data overlaps the bin. Note that `std` is *not* preserved under this + // rescaling, but matching UCSC requires it. + for stat in stats.iter_mut() { + if stat.bases_covered > 0.0 { + let vc = stat.bases_covered.ceil(); + let norm = vc / stat.bases_covered; + stat.sum *= norm; + stat.sum_squares *= norm; + stat.bases_covered = vc; + } + } + + // Fill the output array. + // + // `uncovered` is the per-base value assigned to uncovered bases within each bin: + // - `None`: uncovered bases are excluded from the summary computation + // ("regular" statistics over just the covered bases). For a fully-uncovered + // bin the result is NaN for stats over an empty set; for `BasesCovered` and + // `BinCovered` the result is the factual 0. + // - `Some(v)`: uncovered bases are folded into the summary as if they had + // value `v`. `Some(0.0)` reproduces UCSC's `*0` statistics (e.g. + // `mean0 = sum / bin_size`). Other values are well-defined generalizations. + // + // `BasesCovered` and `BinCovered` always report counts of *actual* data bases, + // independent of `uncovered`. + for i in 0..bins { + let (bin_start, bin_end) = integer_bin_bounds(start, i, bin_size); + let bin_width = (bin_end - bin_start) as f64; + let n_covered = stats[i].bases_covered; + let n_uncovered = bin_width - n_covered; + + // Effective sum/sum_squares and denominator after optionally folding in + // `uncovered`. When `uncovered` is `None`, only covered bases contribute. + let (s, ss, n) = match uncovered { + None => (stats[i].sum, stats[i].sum_squares, n_covered), + Some(u) => ( + stats[i].sum + u * n_uncovered, + stats[i].sum_squares + u * u * n_uncovered, + bin_width, + ), + }; + + *view.index_mut(i) = match summary { + Summary::Mean => { + if n > 0.0 { + s / n + } else { + f64::NAN + } + } + Summary::Std => { + if n < 1.0 { + f64::NAN + } else { + let mut variance = ss - s * s / n; + if n > 1.0 { + variance /= n - 1.0; + } + variance.sqrt() + } + } + Summary::Sum => { + if n > 0.0 { + s + } else { + f64::NAN + } + } + Summary::SumSquares => { + if n > 0.0 { + ss + } else { + f64::NAN + } + } + Summary::Min => match uncovered { + None => { + if n_covered > 0.0 { + stats[i].min + } else { + f64::NAN + } + } + Some(u) if n_uncovered > 0.0 => { + if n_covered > 0.0 { + stats[i].min.min(u) + } else { + u + } + } + Some(_) => stats[i].min, + }, + Summary::Max => match uncovered { + None => { + if n_covered > 0.0 { + stats[i].max + } else { + f64::NAN + } + } + Some(u) if n_uncovered > 0.0 => { + if n_covered > 0.0 { + stats[i].max.max(u) + } else { + u + } + } + Some(_) => stats[i].max, + }, + Summary::BasesCovered => n_covered, + Summary::BinCovered => n_covered / bin_size, + }; + } + + Ok(()) +} + +/// Compute the integer-aligned base coordinates of the bin at index `i`. +/// +/// Float-valued `bin_size` does not generally fall on integer base boundaries, but the +/// underlying data is indexed by integer positions. We resolve this by truncating each +/// bin edge: bin `i` nominally spans +/// `[start + floor(i * bin_size), start + floor((i + 1) * bin_size))`. +/// +/// Consequence: for non-integer `bin_size`, adjacent bins can have integer widths that +/// differ by 1 (e.g. for `bin_size = 2.4`, widths alternate 2, 2, 3, 2, 3) even though +/// each bin "intends" to span `bin_size` bases. The integer width returned here is the +/// actual number of base positions assigned to the bin. +/// +/// For upsampling (`bin_size < 1`), the truncation can collapse a bin to zero integer +/// width. We expand such bins to span a single base so that overlap and division +/// operations stay well-defined; consecutive upsampled bins may then share the same +/// underlying base. +fn integer_bin_bounds(start: i32, i: usize, bin_size: f64) -> (i32, i32) { + let bin_start = start + (i as f64 * bin_size) as i32; + let bin_end = start + ((i + 1) as f64 * bin_size) as i32; + if bin_start == bin_end { + (bin_start, bin_start + 1) + } else { + (bin_start, bin_end) + } +} + +/// Fill out-of-bounds elements in the array with a specified value. +/// +/// If a bin overlaps any base coordinates outside the valid range of the chromosome, it is filled +/// with the `oob` value. +fn fill_out_of_bounds( + start: i32, + end: i32, + chrom_length: i32, + bins: Option, + oob: f64, + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) { + let bin_size = match bins { + Some(bins) => (end - start) as f64 / bins as f64, + _ => 1.0, + }; + + if start < 0 { + for i in 0..view.len() { + let bin_start = start + (i as f64 * bin_size) as i32; + if bin_start < 0 { + view[i] = oob; + } else { + break; + } + } + } + if end > chrom_length { + for i in (0..view.len()).rev() { + let bin_end = start + ((i + 1) as f64 * bin_size) as i32; + if bin_end > chrom_length { + view[i] = oob; + } else { + break; + } + } + } +} + +#[cfg(test)] +mod tests { + use std::vec; + + use super::*; + use bigtools::Value; + use numpy::ndarray::Array; + + #[test] + fn test_fill_binned_single_interval() { + let start = 0; + let end = 20; + let n_bins = 2; + let intervals = vec![Value { + start: 5, + end: 15, + value: 2.0, + }]; + let mut arr = Array::from(vec![0.0; n_bins]); + + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Mean, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + + // First bin (0-10): overlaps 5-10 = 5 bp with value 2.0 + // Second bin (10-20): overlaps 10-15 = 5 bp with value 2.0 + // uncovered=None excludes uncovered bases → mean of just the covered bp. + assert_eq!(arr.to_vec(), vec![2.0, 2.0]); + } + + #[test] + fn test_fill_binned_partial_overlap() { + // Bin 0 [0,10): overlaps [5,8) — 3 bp at value 4.0 + // Bin 1 [10,20): no overlap + let start = 0; + let end = 20; + let n_bins = 2; + + // uncovered = None ("exclude uncovered" / classical statistics). + // Bin 0: mean over the 3 covered bases = 4.0. + // Bin 1: fully uncovered → NaN. + let intervals = vec![Value { + start: 5, + end: 8, + value: 4.0, + }]; + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Mean, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert!((arr[0] - 4.0).abs() < 1e-12); + assert!(arr[1].is_nan()); + + // uncovered = Some(0.0) (UCSC mean0 semantics: uncovered bases count as 0). + // Bin 0: (4.0 * 3 + 0.0 * 7) / 10 = 1.2. + // Bin 1: (0.0 * 10) / 10 = 0.0. + let intervals = vec![Value { + start: 5, + end: 8, + value: 4.0, + }]; + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Mean, + n_bins, + Some(0.0), + arr.view_mut(), + ) + .unwrap(); + assert!((arr[0] - 1.2).abs() < 1e-12); + assert!((arr[1] - 0.0).abs() < 1e-12); + } + + #[test] + fn test_fill_binned_min_max() { + let start = 0; + let end = 20; + let n_bins = 2; + let intervals = vec![ + Value { + start: 0, + end: 15, + value: 1.0, + }, + Value { + start: 5, + end: 25, + value: 3.0, + }, + ]; + + // Test Min + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.clone().into_iter().map(BBIRecord::Value).map(Ok), + Summary::Min, + n_bins, + Some(-1.0), + arr.view_mut(), + ) + .unwrap(); + + // First bin (0-10): sees values 1.0 and 3.0, min = 1.0 + // Second bin (10-20): sees values 1.0 and 3.0, min = 1.0 + assert_eq!(arr.to_vec(), vec![1.0, 1.0]); + + // Test Max + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Max, + n_bins, + Some(-1.0), + arr.view_mut(), + ) + .unwrap(); + + // First bin (0-10): sees values 1.0 and 3.0, max = 3.0 + // Second bin (10-20): sees values 1.0 and 3.0, max = 3.0 + assert_eq!(arr.to_vec(), vec![3.0, 3.0]); + } + + #[test] + fn test_fill_binned_noninteger_binsize() { + // 5 bins over range 0..12 + // bin_size = 12 / 5 = 2.4 + // Bin edges are truncated to integers for overlap determination. + // + // Bin 0: [0, 2.4) → [0, 2) + // overlaps [1,3) by [1, 2) = 1 bp with value 1.0 + // → 1.0 + // Bin 1: [2.4, 4.8) → [2, 4) + // overlaps [1,3) by [2, 3) = 1 bp with value 1.0 + // → 1.0 + // Bin 2: [4.8, 7.2) → [4, 7) + // overlaps [4,8) by [4, 7) = 3 bp with value 2.0 + // → 2.0 + // Bin 3: [7.2, 9.6) → [7, 9) + // overlaps [4,8) by [7, 8) = 1 bp with value 2.0 + // overlaps [8,10) by [8, 9) = 1 bp with value 3.0 + // → (2.0*1 + 3.0*1) / (1+1) = 2.5 + // Bin 4: [9.6, 12) → [9, 12) + // overlaps [8,10) by [9, 10) = 1 bp with value 3.0 + // → 3.0 + let start = 0; + let end = 12; + let n_bins = 5; + let intervals = vec![ + Value { + start: 1, + end: 3, + value: 1.0, + }, + Value { + start: 4, + end: 8, + value: 2.0, + }, + Value { + start: 8, + end: 10, + value: 3.0, + }, + ]; + let answer = vec![1.0, 1.0, 2.0, 2.5, 3.0]; + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Mean, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + + for i in 0..n_bins { + assert!( + (arr[i] - answer[i]).abs() < 0.01, + "Bin {}: expected {}, got {}, diff {}", + i, + answer[i], + arr[i], + (arr[i] - answer[i]).abs() + ); + } + } + + #[test] + fn test_fill_binned_min_max_with_uncovered_zero_noninteger_binsize() { + // 5 bins over range 0..12, bin_size = 2.4 + // Integer-aligned bin spans: [0,2), [2,4), [4,7), [7,9), [9,12) + // Widths: 2 2 3 2 3 + // + // Single Value covering [0, 5) with value=5 covers bins 0 and 1 fully and + // overlaps bin 2 partially (1 of 3 bases, at position 4). Bins 3 and 4 have + // no coverage. + // + // With `uncovered = 0.0`, uncovered bases are folded into min/max as zeros. + // The integer-adjusted bin width is load-bearing: comparing `bases_covered` + // (=2 for bin 0) against the float `bin_size` (=2.4) would falsely report + // bin 0 as partially covered, contaminating min/max with 0. Using the actual + // integer bin width (=2) correctly identifies bin 0 as fully covered. + let start = 0; + let end = 12; + let n_bins = 5; + let uncovered = Some(0.0); + + // Positive value: min should reveal the "contamination" bug if present + // (would pull min from 5 down to 0 in the fully-covered bins). + let intervals = vec![Value { + start: 0, + end: 5, + value: 5.0, + }]; + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.clone().into_iter().map(BBIRecord::Value).map(Ok), + Summary::Min, + n_bins, + uncovered, + arr.view_mut(), + ) + .unwrap(); + // Bin 0, 1 fully covered → min = 5. Bin 2 partial → min(5, 0) = 0. + // Bins 3, 4 fully uncovered → uncovered (0). + assert_eq!(arr.to_vec(), vec![5.0, 5.0, 0.0, 0.0, 0.0]); + + // Max with a negative value: partial coverage should pull max *up* to 0. + let intervals_neg = vec![Value { + start: 0, + end: 5, + value: -5.0, + }]; + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals_neg.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Max, + n_bins, + uncovered, + arr.view_mut(), + ) + .unwrap(); + // Bins 0, 1 fully covered → max = -5. Bin 2 partial → max(-5, 0) = 0. + // Bins 3, 4 fully uncovered → uncovered (0). + assert_eq!(arr.to_vec(), vec![-5.0, -5.0, 0.0, 0.0, 0.0]); + + // Max with positive value: partial coverage doesn't change max since + // max(5, 0) == 5; the test pins that the integer-aligned check doesn't + // accidentally pull max *down* in a fully-covered bin. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Max, + n_bins, + uncovered, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![5.0, 5.0, 5.0, 0.0, 0.0]); + } + + /// Build an iterator of `Ok(BBIRecord::Value(...))` from a list of triples. + fn vals(items: Vec<(u32, u32, f32)>) -> impl Iterator> { + items + .into_iter() + .map(|(s, e, v)| Value { + start: s, + end: e, + value: v, + }) + .map(BBIRecord::Value) + .map(Ok) + } + + #[test] + fn test_fill_binned_sum_sum_squares() { + // 2 bins over range 0..20. Values: + // [0, 5) value=2 → bin 0 contributes 2*5 = 10 + // [5, 15) value=3 → bin 0 contributes 3*5 = 15; bin 1 contributes 3*5 = 15 + // Bin 0: sum=25 over 10 covered bp; bin 1: sum=15 over 5 covered bp. + let start = 0; + let end = 20; + let n_bins = 2; + let intervals = vec![(0u32, 5u32, 2.0f32), (5, 15, 3.0)]; + + // Sum, uncovered=None: only covered contributes. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::Sum, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![25.0, 15.0]); + + // Sum, uncovered=Some(1.0): bin 1 also gets 1.0 * 5 uncovered bases. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::Sum, + n_bins, + Some(1.0), + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![25.0, 15.0 + 1.0 * 5.0]); + + // SumSquares, uncovered=None. + // Bin 0: 2*2*5 + 3*3*5 = 20 + 45 = 65. Bin 1: 3*3*5 = 45. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::SumSquares, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![65.0, 45.0]); + + // SumSquares, uncovered=Some(2.0): adds 2*2 = 4 per uncovered base. + // Bin 1 has 5 uncovered bp → +20. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals), + Summary::SumSquares, + n_bins, + Some(2.0), + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![65.0, 45.0 + 4.0 * 5.0]); + } + + #[test] + fn test_fill_binned_bases_and_bin_covered() { + // 4 bins of width 5 over [0, 20). One value covering [0, 12). + // Bin 0 [0, 5): fully covered (5). + // Bin 1 [5, 10): fully covered (5). + // Bin 2 [10, 15): partially covered (2 of 5). + // Bin 3 [15, 20): no coverage (0). + let start = 0; + let end = 20; + let n_bins = 4; + let intervals = vec![(0u32, 12u32, 1.0f32)]; + + // BasesCovered: actual integer count of covered bases per bin. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::BasesCovered, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![5.0, 5.0, 2.0, 0.0]); + + // `uncovered` must not affect BasesCovered. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::BasesCovered, + n_bins, + Some(99.0), + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![5.0, 5.0, 2.0, 0.0]); + + // BinCovered: bases_covered / bin_size. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals), + Summary::BinCovered, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![1.0, 1.0, 0.4, 0.0]); + } + + #[test] + fn test_fill_binned_std() { + // 2 bins over [0, 10). Bin 0 sees two distinct values; bin 1 is empty. + // [0, 3) v=1 → contributes value 1 over 3 bases in bin 0 + // [3, 5) v=4 → contributes value 4 over 2 bases in bin 0 + // Bin 0 sum = 1*3 + 4*2 = 11; sum_squares = 1*3 + 16*2 = 35; n_covered = 5. + // Sample variance = (35 - 11*11/5) / (5-1) = (35 - 24.2) / 4 = 2.7 + // std = sqrt(2.7). + let start = 0; + let end = 10; + let n_bins = 2; + let intervals = vec![(0u32, 3u32, 1.0f32), (3, 5, 4.0)]; + + // uncovered=None: classical sample std over only the covered bases. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::Std, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert!((arr[0] - 2.7_f64.sqrt()).abs() < 1e-12); + assert!(arr[1].is_nan()); // empty bin → NaN + + // uncovered=Some(0.0): the 0-padded computation. + // Bin 0: s = 11 + 0*0 = 11; ss = 35 + 0 = 35; n = 5 (bin_width). + // variance = (35 - 121/5) / 4 = (35 - 24.2)/4 = 2.7 + // std = sqrt(2.7). Same as None because bin 0 is fully covered. + // Bin 1: s = 0; ss = 0; n = 5. variance = 0/4 = 0. std = 0. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals), + Summary::Std, + n_bins, + Some(0.0), + arr.view_mut(), + ) + .unwrap(); + assert!((arr[0] - 2.7_f64.sqrt()).abs() < 1e-12); + assert!((arr[1] - 0.0).abs() < 1e-12); + } +} diff --git a/pybigtools/src/reader.rs b/pybigtools/src/reader.rs index de1777fe..c7e3fdf1 100644 --- a/pybigtools/src/reader.rs +++ b/pybigtools/src/reader.rs @@ -3,128 +3,47 @@ use std::io::BufReader; use std::ops::Deref; use bigtools::bed::autosql::parse::parse_autosql; +#[cfg(feature = "remote")] +use bigtools::utils::file::remote_file::RemoteFile; +use bigtools::utils::file::reopen::ReopenableFile; use bigtools::utils::misc::{ bigwig_average_over_bed, BigWigAverageOverBedEntry, BigWigAverageOverBedError, Name, }; use bigtools::utils::reopen::Reopen; -use bigtools::{BBIReadError as _BBIReadError, BedEntry, Value, ZoomRecord}; +use bigtools::{ + BBIReadError as _BBIReadError, BedEntry, BigBedRead, BigWigRead, CachedBBIFileRead, Value, + ZoomRecord, +}; use pyo3::exceptions::{self, PyKeyError}; use pyo3::types::{IntoPyDict, PyAny, PyDict, PyList, PyString, PyTuple}; use pyo3::{prelude::*, PyTraverseError, PyVisit}; use crate::errors::{BBIFileClosed, BBIReadError, ConvertResult}; -use crate::utils::{entries_to_array, intervals_to_array, start_end, BBIReadRaw, Summary}; +use crate::file_like::PyFileLikeObject; +use crate::raster::{entries_to_array, intervals_to_array, Summary}; type ValueTuple = (u32, u32, f32); -enum BigWigAverageOverBedStatistics { - Size, - Bases, - Sum, - Mean0, - Mean, - Min, - Max, -} - -impl BigWigAverageOverBedStatistics { - fn from_str(val: &str) -> Option { - Some(match val { - "size" => BigWigAverageOverBedStatistics::Size, - "bases" => BigWigAverageOverBedStatistics::Bases, - "sum" => BigWigAverageOverBedStatistics::Sum, - "mean0" => BigWigAverageOverBedStatistics::Mean0, - "mean" => BigWigAverageOverBedStatistics::Mean, - "min" => BigWigAverageOverBedStatistics::Min, - "max" => BigWigAverageOverBedStatistics::Max, - _ => { - return None; - } - }) - } -} - -/// This class is an interator for the entries of bigWigAverageOverBed -#[pyclass(module = "pybigtools")] -struct BigWigAverageOverBedEntriesIterator { - iter: Box< - dyn Iterator> - + Send, - >, - usename: bool, - stats: Option>, - summary_statistics: PyObject, -} - -#[pymethods] -impl BigWigAverageOverBedEntriesIterator { - fn __iter__(slf: PyRefMut) -> PyResult> { - Ok(slf.into()) - } - - fn __next__(mut slf: PyRefMut) -> PyResult> { - let v = slf - .iter - .next() - .transpose() - .map_err(|e| PyErr::new::(format!("{}", e)))?; - - let Some((name, v)) = v else { - return Ok(None); - }; - let stats = match &slf.stats { - Some(stats) => { - let mut ret = Vec::with_capacity(stats.len()); - for stat in stats.iter() { - match stat { - BigWigAverageOverBedStatistics::Size => { - ret.push(v.size.to_object(slf.py())) - } - BigWigAverageOverBedStatistics::Bases => { - ret.push(v.bases.to_object(slf.py())) - } - BigWigAverageOverBedStatistics::Sum => ret.push(v.sum.to_object(slf.py())), - BigWigAverageOverBedStatistics::Mean0 => { - ret.push(v.mean0.to_object(slf.py())) - } - BigWigAverageOverBedStatistics::Mean => { - ret.push(v.mean.to_object(slf.py())) - } - BigWigAverageOverBedStatistics::Min => ret.push(v.min.to_object(slf.py())), - BigWigAverageOverBedStatistics::Max => ret.push(v.max.to_object(slf.py())), - } - } - if ret.len() == 1 { - ret[0].to_object(slf.py()) - } else { - PyTuple::new_bound(slf.py(), ret).to_object(slf.py()) - } - } - None => { - let val = slf.summary_statistics.call_bound( - slf.py(), - (v.size, v.bases, v.sum, v.mean0, v.mean, v.min, v.max), - None, - )?; - val.to_object(slf.py()) - } - }; - - match slf.usename { - true => Ok(Some((name, stats).to_object(slf.py()))), - false => Ok(Some(stats)), - } - } +pub enum BBIReadRaw { + Closed, + BigWigFile(BigWigRead>), + #[cfg(feature = "remote")] + BigWigRemote(BigWigRead>), + BigWigFileLike(BigWigRead>), + BigBedFile(BigBedRead>), + #[cfg(feature = "remote")] + BigBedRemote(BigBedRead>), + BigBedFileLike(BigBedRead>), } /// Interface for reading a BigWig or BigBed file. #[pyclass(module = "pybigtools")] -pub struct BBIRead { +pub struct BBIReader { pub bbi: BBIReadRaw, } #[pymethods] -impl BBIRead { +impl BBIReader { #[getter] fn is_bigwig(&self) -> bool { #[cfg(feature = "remote")] @@ -204,7 +123,7 @@ impl BBIRead { ("sum", summary.sum.to_object(py)), ( "mean", - (summary.sum as f64 / summary.bases_covered as f64).to_object(py), + (summary.sum / summary.bases_covered as f64).to_object(py), ), ("min", summary.min_val.to_object(py)), ("max", summary.max_val.to_object(py)), @@ -228,8 +147,68 @@ impl BBIRead { Ok(info) } + /// Return the names of chromosomes in a BBI file and their lengths. + /// + /// Parameters + /// ---------- + /// chrom : str or None + /// The name of the chromosome to get the length of. If None, then a + /// dictionary of all chromosome sizes will be returned. If the + /// chromosome doesn't exist, returns None. + /// + /// Returns + /// ------- + /// int or Dict[str, int] or None: + /// Chromosome length or a dictionary of chromosome lengths. + #[pyo3(signature = (chrom=None))] + fn chroms(&mut self, py: Python, chrom: Option) -> PyResult { + fn get_chrom_obj( + b: &B, + py: Python, + chrom: Option, + ) -> PyResult { + match chrom { + Some(chrom) => { + let chrom_length = b + .chroms() + .iter() + .find(|c| c.name == chrom) + .ok_or_else(|| { + PyErr::new::( + "No chromosome found with the specified name", + ) + }) + .map(|c| c.length.to_object(py))?; + Ok(chrom_length) + } + None => { + let chrom_dict: PyObject = b + .chroms() + .iter() + .map(|c| (c.name.clone(), c.length)) + .into_py_dict_bound(py) + .into(); + Ok(chrom_dict) + } + } + } + + match &self.bbi { + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::BigWigFile(b) => get_chrom_obj(b, py, chrom), + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigWigFileLike(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigBedFile(b) => get_chrom_obj(b, py, chrom), + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigBedFileLike(b) => get_chrom_obj(b, py, chrom), + } + } + /// Return a list of sizes in bases of the summary intervals used in each /// of the zoom levels (i.e. reduction levels) of the BBI file. + #[allow(clippy::too_many_arguments)] fn zooms(&self) -> PyResult> { let zooms = match &self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), @@ -368,9 +347,9 @@ impl BBIRead { start: Option, end: Option, ) -> PyResult { - let (start, end) = start_end(&self.bbi, &chrom, start, end)?; + let (start, end) = start_end_clamped(&self.bbi, &chrom, start, end)?; match &self.bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { let b = b.reopen()?; Ok(BigWigIntervalIterator { @@ -472,9 +451,9 @@ impl BBIRead { start: Option, end: Option, ) -> PyResult { - let (start, end) = start_end(&self.bbi, &chrom, start, end)?; + let (start, end) = start_end_clamped(&self.bbi, &chrom, start, end)?; match &self.bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { let b = b.reopen()?; let iter = b @@ -554,17 +533,31 @@ impl BBIRead { /// If provided, the query interval will be divided into equally spaced /// bins and the values in each bin will be interpolated or summarized. /// If not provided, the values will be returned for each base. - /// summary : Literal["mean", "min", "max"], optional [default: "mean"] - /// The summary statistic to use. Currently supported statistics are - /// ``mean``, ``min``, and ``max``. + /// summary : str, optional [default: "mean"] + /// The summary statistic to use. One of ``mean``, ``std``, ``min``, + /// ``max``, ``sum``, ``sum_squares``, ``bases_covered``, + /// ``bin_covered``. /// exact : bool, optional [default: False] /// If True and ``bins`` is specified, return exact summary statistic /// values instead of interpolating from the optimal zoom level. /// Default is False. - /// missing : float, optional [default: 0.0] - /// Fill-in value for unreported data in valid regions. Default is 0. + /// uncovered : float or None, optional [default: None] + /// The value assigned to all uncovered bases. If ``None``, uncovered + /// bases are excluded from summary statistic calculations, and empty + /// positions or bins will be returned as NaN (subject to ``fillna``). + /// To treat uncovered bases as having a value of zero in summary + /// statistics (like UCSC's ``mean0``) set this parameter to ``0.0``. + /// Empty positions or bins will also be returned as ``0.0``. Other + /// finite values are also valid and will be used in the same way. + /// This parameter is ignored in the cases of ``bases_covered`` and + /// ``bin_covered`` summaries since they exclude uncovered bases by + /// definition. /// oob : float, optional [default: NaN] /// Fill-in value for out-of-bounds regions. Default is NaN. + /// fillna : float or None, optional [default: None] + /// Post-rasterization fill applied to in-bounds positions or bins that + /// are returned as NaN due to being empty. Default ``None`` leaves + /// NaN values untouched. /// arr : numpy.ndarray, optional /// If provided, the values will be written to this array or array /// view. The array must be of the correct size and type. @@ -595,9 +588,10 @@ impl BBIRead { /// records : Get the records of a given range on a chromosome. /// zoom_records : Get the zoom records of a given range on a chromosome. #[pyo3( - signature = (chrom, start=None, end=None, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None), - text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, missing=0.0, oob=..., arr=None)"#, + signature = (chrom, start=None, end=None, bins=None, summary="mean".to_string(), exact=false, uncovered=None, oob=f64::NAN, fillna=None, arr=None), + text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, uncovered=None, oob=float('nan'), fillna=None, arr=None)"#, )] + #[allow(clippy::too_many_arguments)] fn values( &mut self, py: Python<'_>, @@ -607,104 +601,54 @@ impl BBIRead { bins: Option, summary: String, exact: bool, - missing: f64, + uncovered: Option, oob: f64, + fillna: Option, arr: Option, ) -> PyResult { let summary = match summary.as_ref() { "mean" => Summary::Mean, + "std" => Summary::Std, "min" => Summary::Min, "max" => Summary::Max, + "sum" => Summary::Sum, + "sum_squares" => Summary::SumSquares, + "bases_covered" => Summary::BasesCovered, + "bin_covered" => Summary::BinCovered, _ => { return Err(PyErr::new::(format!( - "Unrecognized summary. Only `mean`, `min`, and `max` are allowed." + "Unrecognized summary statistic: {}", + summary ))); } }; + // Passing `uncovered=NaN` is equivalent to `uncovered=None` (exclude uncovered bases). + let uncovered = uncovered.and_then(|v| if v.is_nan() { None } else { Some(v) }); match &mut self.bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => intervals_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(b) => intervals_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), BBIReadRaw::BigWigFileLike(b) => intervals_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), BBIReadRaw::BigBedFile(b) => entries_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), #[cfg(feature = "remote")] BBIReadRaw::BigBedRemote(b) => entries_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), BBIReadRaw::BigBedFileLike(b) => entries_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), } } - /// Return the names of chromosomes in a BBI file and their lengths. - /// - /// Parameters - /// ---------- - /// chrom : str or None - /// The name of the chromosome to get the length of. If None, then a - /// dictionary of all chromosome sizes will be returned. If the - /// chromosome doesn't exist, returns None. - /// - /// Returns - /// ------- - /// int or Dict[str, int] or None: - /// Chromosome length or a dictionary of chromosome lengths. - #[pyo3(signature = (chrom=None))] - fn chroms(&mut self, py: Python, chrom: Option) -> PyResult { - fn get_chrom_obj( - b: &B, - py: Python, - chrom: Option, - ) -> PyResult { - match chrom { - Some(chrom) => { - let chrom_length = b - .chroms() - .into_iter() - .find(|c| c.name == chrom) - .ok_or_else(|| { - PyErr::new::( - "No chromosome found with the specified name", - ) - }) - .map(|c| c.length.to_object(py))?; - Ok(chrom_length) - } - None => { - let chrom_dict: PyObject = b - .chroms() - .into_iter() - .map(|c| (c.name.clone(), c.length)) - .into_py_dict_bound(py) - .into(); - Ok(chrom_dict) - } - } - } - - match &self.bbi { - BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), - BBIReadRaw::BigWigFile(b) => get_chrom_obj(b, py, chrom), - #[cfg(feature = "remote")] - BBIReadRaw::BigWigRemote(b) => get_chrom_obj(b, py, chrom), - BBIReadRaw::BigWigFileLike(b) => get_chrom_obj(b, py, chrom), - BBIReadRaw::BigBedFile(b) => get_chrom_obj(b, py, chrom), - #[cfg(feature = "remote")] - BBIReadRaw::BigBedRemote(b) => get_chrom_obj(b, py, chrom), - BBIReadRaw::BigBedFileLike(b) => get_chrom_obj(b, py, chrom), - } - } - /// Gets the average values from a bigWig over the entries of a bed file. /// /// Parameters @@ -847,9 +791,9 @@ impl BBIRead { if bed.is_instance(&path_class)? { bed.str()? } else { - return Err(PyErr::new::(format!( - "Unknown argument for `path`. Not a string or Path object.", - ))); + return Err(PyErr::new::( + "Unknown argument for `path`. Not a string or Path object.".to_string(), + )); } }; let bedin = BufReader::new(File::open(bed.to_str()?)?); @@ -942,40 +886,6 @@ impl BBIRead { } } -#[pyclass(module = "pybigtools")] -struct ZoomIntervalIterator { - iter: Box> + Send>, -} - -#[pymethods] -impl ZoomIntervalIterator { - fn __iter__(slf: PyRefMut) -> PyResult> { - Ok(slf.into()) - } - - fn __next__(mut slf: PyRefMut) -> PyResult> { - slf.iter - .next() - .transpose() - .map(|o| { - o.map(|v| { - let summary = [ - ("total_items", v.summary.total_items.to_object(slf.py())), - ("bases_covered", v.summary.bases_covered.to_object(slf.py())), - ("min_val", v.summary.min_val.to_object(slf.py())), - ("max_val", v.summary.max_val.to_object(slf.py())), - ("sum", v.summary.sum.to_object(slf.py())), - ("sum_squares", v.summary.sum_squares.to_object(slf.py())), - ] - .into_py_dict_bound(slf.py()) - .to_object(slf.py()); - (v.start, v.end, summary) - }) - }) - .convert_err() - } -} - /// An iterator for intervals in a bigWig. /// /// It returns only values that exist in the bigWig, skipping any missing @@ -1023,7 +933,174 @@ impl BigBedEntriesIterator { .chain(next.rest.split_whitespace().map(|o| o.to_object(py))) .collect(); Ok(Some( - PyTuple::new_bound::(py, elements.into_iter()).to_object(py), + PyTuple::new_bound::(py, elements).to_object(py), )) } } + +#[pyclass(module = "pybigtools")] +struct ZoomIntervalIterator { + iter: Box> + Send>, +} + +#[pymethods] +impl ZoomIntervalIterator { + fn __iter__(slf: PyRefMut) -> PyResult> { + Ok(slf.into()) + } + + fn __next__(mut slf: PyRefMut) -> PyResult> { + slf.iter + .next() + .transpose() + .map(|o| { + o.map(|v| { + let summary = [ + ("total_items", v.summary.total_items.to_object(slf.py())), + ("bases_covered", v.summary.bases_covered.to_object(slf.py())), + ("min_val", v.summary.min_val.to_object(slf.py())), + ("max_val", v.summary.max_val.to_object(slf.py())), + ("sum", v.summary.sum.to_object(slf.py())), + ("sum_squares", v.summary.sum_squares.to_object(slf.py())), + ] + .into_py_dict_bound(slf.py()) + .to_object(slf.py()); + (v.start, v.end, summary) + }) + }) + .convert_err() + } +} + +enum BigWigAverageOverBedStatistics { + Size, + Bases, + Sum, + Mean0, + Mean, + Min, + Max, +} + +impl BigWigAverageOverBedStatistics { + fn from_str(val: &str) -> Option { + Some(match val { + "size" => BigWigAverageOverBedStatistics::Size, + "bases" => BigWigAverageOverBedStatistics::Bases, + "sum" => BigWigAverageOverBedStatistics::Sum, + "mean0" => BigWigAverageOverBedStatistics::Mean0, + "mean" => BigWigAverageOverBedStatistics::Mean, + "min" => BigWigAverageOverBedStatistics::Min, + "max" => BigWigAverageOverBedStatistics::Max, + _ => { + return None; + } + }) + } +} + +/// This class is an interator for the entries of bigWigAverageOverBed +#[pyclass(module = "pybigtools")] +struct BigWigAverageOverBedEntriesIterator { + iter: Box< + dyn Iterator> + + Send, + >, + usename: bool, + stats: Option>, + summary_statistics: PyObject, +} + +#[pymethods] +impl BigWigAverageOverBedEntriesIterator { + fn __iter__(slf: PyRefMut) -> PyResult> { + Ok(slf.into()) + } + + fn __next__(mut slf: PyRefMut) -> PyResult> { + let v = slf + .iter + .next() + .transpose() + .map_err(|e| PyErr::new::(format!("{}", e)))?; + + let Some((name, v)) = v else { + return Ok(None); + }; + let stats = match &slf.stats { + Some(stats) => { + let mut ret = Vec::with_capacity(stats.len()); + for stat in stats.iter() { + match stat { + BigWigAverageOverBedStatistics::Size => { + ret.push(v.size.to_object(slf.py())) + } + BigWigAverageOverBedStatistics::Bases => { + ret.push(v.bases.to_object(slf.py())) + } + BigWigAverageOverBedStatistics::Sum => ret.push(v.sum.to_object(slf.py())), + BigWigAverageOverBedStatistics::Mean0 => { + ret.push(v.mean0.to_object(slf.py())) + } + BigWigAverageOverBedStatistics::Mean => { + ret.push(v.mean.to_object(slf.py())) + } + BigWigAverageOverBedStatistics::Min => ret.push(v.min.to_object(slf.py())), + BigWigAverageOverBedStatistics::Max => ret.push(v.max.to_object(slf.py())), + } + } + if ret.len() == 1 { + ret[0].to_object(slf.py()) + } else { + PyTuple::new_bound(slf.py(), ret).to_object(slf.py()) + } + } + None => { + let val = slf.summary_statistics.call_bound( + slf.py(), + (v.size, v.bases, v.sum, v.mean0, v.mean, v.min, v.max), + None, + )?; + val.to_object(slf.py()) + } + }; + + match slf.usename { + true => Ok(Some((name, stats).to_object(slf.py()))), + false => Ok(Some(stats)), + } + } +} + +fn start_end_clamped( + bbi: &BBIReadRaw, + chrom_name: &str, + start: Option, + end: Option, +) -> PyResult<(u32, u32)> { + let chroms = match &bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::BigWigFile(b) => b.chroms(), + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(b) => b.chroms(), + BBIReadRaw::BigWigFileLike(b) => b.chroms(), + BBIReadRaw::BigBedFile(b) => b.chroms(), + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => b.chroms(), + BBIReadRaw::BigBedFileLike(b) => b.chroms(), + }; + let chrom = chroms.iter().find(|x| x.name == chrom_name); + let length = match chrom { + None => { + return Err(PyErr::new::(format!( + "No chromomsome with name `{}` found.", + chrom_name + ))) + } + Some(c) => c.length, + }; + Ok(( + start.map(|v| v.max(0) as u32).unwrap_or(0), + end.map(|v| (v.max(0) as u32).min(length)).unwrap_or(length), + )) +} diff --git a/pybigtools/src/utils.rs b/pybigtools/src/utils.rs deleted file mode 100644 index 3bb4ca96..00000000 --- a/pybigtools/src/utils.rs +++ /dev/null @@ -1,1151 +0,0 @@ -use std::collections::VecDeque; -use std::ops::IndexMut; - -#[cfg(feature = "remote")] -use bigtools::utils::file::remote_file::RemoteFile; -use bigtools::utils::file::reopen::ReopenableFile; -use bigtools::{ - BBIFileRead, BBIReadError as _BBIReadError, BedEntry, BigBedRead as BigBedReadRaw, - BigWigRead as BigWigReadRaw, CachedBBIFileRead, Value, ZoomRecord, -}; -use numpy::ndarray::ArrayViewMut; -use numpy::{PyArray1, PyArrayMethods}; -use pyo3::exceptions; -use pyo3::prelude::*; - -use crate::errors::{BBIFileClosed, ConvertResult}; -use crate::file_like::PyFileLikeObject; - -pub enum BBIReadRaw { - Closed, - BigWigFile(BigWigReadRaw>), - #[cfg(feature = "remote")] - BigWigRemote(BigWigReadRaw>), - BigWigFileLike(BigWigReadRaw>), - BigBedFile(BigBedReadRaw>), - #[cfg(feature = "remote")] - BigBedRemote(BigBedReadRaw>), - BigBedFileLike(BigBedReadRaw>), -} - -#[derive(Copy, Clone, Debug)] -pub enum Summary { - Mean, - Min, - Max, -} - -pub fn intervals_to_array( - py: Python<'_>, - b: &mut BigWigReadRaw, - chrom: &str, - start: Option, - end: Option, - bins: Option, - summary: Summary, - exact: bool, - missing: f64, - oob: f64, - arr: Option, -) -> PyResult { - let (start, end, length) = bigwig_start_end_length(b, chrom, start, end)?; - let zoom = if let (Some(bins), false) = (bins, exact) { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - let zoom = b - .info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level); - zoom - } else { - None - }; - let arr = match (bins, arr) { - (_, Some(arr)) => arr, - (Some(bins), None) => PyArray1::from_vec_bound(py, vec![missing; bins]).to_object(py), - (None, None) => { - PyArray1::from_vec_bound(py, vec![missing; (end - start) as usize]).to_object(py) - } - }; - let v: &Bound<'_, PyArray1> = arr.downcast_bound::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; - let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); - let bin_size = match bins { - Some(bins) => { - if v.len()? != bins { - return Err(PyErr::new::(format!( - "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", - bins, - v.len()?, - ))); - } - - let mut array = v.readwrite(); - - match zoom { - Some(zoom) => { - let iter = b - .get_zoom_interval( - &chrom, - intervals_start, - intervals_end, - zoom.reduction_level, - ) - .convert_err()?; - to_array_zoom( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), - ) - .convert_err()?; - } - None => { - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_array_bins( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), - ) - .convert_err()?; - } - }; - - (end - start) as f64 / bins as f64 - } - _ => { - if v.len()? != (end - start) as usize { - return Err(PyErr::new::(format!( - "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", - (end - start) as usize, - v.len()?, - ))); - } - - let mut array = v.readwrite(); - - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_array(start, end, iter, missing, array.as_array_mut()).convert_err()?; - - 1.0 - } - }; - - { - let mut array = v.readwrite(); - let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); - - if start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i in bin_start..bin_end { - array[i] = oob; - } - } - if end > length { - let interval_start = (length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, array.len()); - for i in bin_start..bin_end { - array[i] = oob; - } - } - } - - Ok(arr) -} - -pub fn entries_to_array( - py: Python<'_>, - b: &mut BigBedReadRaw, - chrom: &str, - start: Option, - end: Option, - bins: Option, - summary: Summary, - exact: bool, - missing: f64, - oob: f64, - arr: Option, -) -> PyResult { - let (start, end, length) = bigbed_start_end_length(b, chrom, start, end)?; - let zoom = if let (Some(bins), false) = (bins, exact) { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - let zoom = b - .info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level); - zoom - } else { - None - }; - let arr = match (bins, arr) { - (_, Some(arr)) => arr, - (Some(bins), None) => PyArray1::from_vec_bound(py, vec![missing; bins]).to_object(py), - (None, None) => { - PyArray1::from_vec_bound(py, vec![missing; (end - start) as usize]).to_object(py) - } - }; - let v: &Bound<'_, PyArray1> = arr.downcast_bound::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; - let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); - let bin_size = match bins { - Some(bins) => { - if v.len()? != bins { - return Err(PyErr::new::(format!( - "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", - bins, - v.len()?, - ))); - } - - let mut array = v.readwrite(); - - match zoom { - Some(zoom) => { - let iter = b - .get_zoom_interval( - &chrom, - intervals_start, - intervals_end, - zoom.reduction_level, - ) - .convert_err()?; - to_entry_array_zoom( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), - ) - .convert_err()?; - } - None => { - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_entry_array_bins( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), - ) - .convert_err()?; - } - }; - - (end - start) as f64 / bins as f64 - } - _ => { - if v.len()? != (end - start) as usize { - return Err(PyErr::new::(format!( - "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", - (end - start) as usize, - v.len()?, - ))); - } - - let mut array = v.readwrite(); - - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_entry_array(start, end, iter, missing, array.as_array_mut()).convert_err()?; - - 1.0 - } - }; - - { - let mut array = v.readwrite(); - let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); - - if start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i in bin_start..bin_end { - array[i] = oob; - } - } - if end > length { - let interval_start = (length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, array.len()); - for i in bin_start..bin_end { - array[i] = oob; - } - } - } - - Ok(arr) -} - -fn to_array>>( - start: i32, - end: i32, - iter: I, - missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), (end - start) as usize); - v.fill(f64::NAN); - for interval in iter { - let interval = interval?; - let interval_start = ((interval.start as i32) - start) as usize; - let interval_end = ((interval.end as i32) - start) as usize; - for i in interval_start..interval_end { - let val = *v.index_mut(i); - *v.index_mut(i) = if val.is_nan() { - interval.value as f64 - } else { - val + interval.value as f64 - }; - } - } - for val in v.iter_mut() { - *val = if val.is_nan() { missing } else { *val }; - } - Ok(()) -} - -fn to_array_bins>>( - start: i32, - end: i32, - iter: I, - summary: Summary, - bins: usize, - missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), bins); - v.fill(missing); - - let mut bin_data: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as i32).max(start) - start; - let interval_end = (interval.end as i32).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - - while let Some(front) = bin_data.front_mut() { - if front.0 < bin_start { - let front = bin_data.pop_front().unwrap(); - let bin = front.0; - - match summary { - Summary::Min => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Max => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Mean => { - v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); - } - } - } else { - break; - } - } - while let Some(bin) = bin_data - .back() - .map(|last_bin| { - if last_bin.0 < bin_end { - Some(last_bin.0 + 1) - } else { - None - } - }) - .unwrap_or(Some(bin_start)) - { - let bin_start = ((bin as f64) * bin_size) as i32; - let bin_end = (((bin + 1) as f64) * bin_size) as i32; - - bin_data.push_back((bin, bin_start, bin_end, None)); - } - for bin in bin_start..bin_end { - assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); - } - for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { - // Since bed files can have overlapping entries, a previous entry could have extended - // past the last bin for the current interval - // Not relevant here, but keeping for parity - if interval_end <= *bin_start { - break; - } - let (c, v) = data.get_or_insert_with(|| { - match summary { - // min & max are defined for NAN and we are about to set it - // can't use 0.0 because it may be either below or above the real value - Summary::Min | Summary::Max => (0, f64::NAN), - // addition is not defined for NAN - Summary::Mean => (0, 0.0), - } - }); - match summary { - Summary::Min => { - *v = v.min(interval.value as f64); - } - Summary::Max => { - *v = v.max(interval.value as f64); - } - Summary::Mean => { - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - let overlap_size: i32 = overlap_end - overlap_start; - *v += (overlap_size as f64) * interval.value as f64; - *c += overlap_size; - } - } - } - } - while let Some(front) = bin_data.pop_front() { - let bin = front.0; - - match summary { - Summary::Min => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Max => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Mean => { - v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); - } - } - } - Ok(()) -} - -fn to_array_zoom>>( - start: i32, - end: i32, - iter: I, - summary: Summary, - bins: usize, - missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), bins); - v.fill(missing); - - // (bin, bin_start, bin_end, Option<(covered_bases, value)>) - let mut bin_data: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as i32).max(start) - start; - let interval_end = (interval.end as i32).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - - while let Some(front) = bin_data.front_mut() { - if front.0 < bin_start { - let front = bin_data.pop_front().unwrap(); - let bin = front.0; - - match summary { - Summary::Min => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Max => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Mean => { - v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); - } - } - } else { - break; - } - } - while let Some(bin) = bin_data - .back() - .map(|last_bin| { - if last_bin.0 < bin_end { - Some(last_bin.0 + 1) - } else { - None - } - }) - .unwrap_or(Some(bin_start)) - { - let bin_start = ((bin as f64) * bin_size) as i32; - let bin_end = (((bin + 1) as f64) * bin_size) as i32; - - bin_data.push_back((bin, bin_start, bin_end, None)); - } - for bin in bin_start..bin_end { - assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); - } - for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { - // Since bed files can have overlapping entries, a previous entry could have extended - // past the last bin for the current interval - // Not relevant here, but keeping for parity - if interval_end <= *bin_start { - break; - } - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - let overlap_size: i32 = overlap_end - overlap_start; - match data.as_mut() { - Some((c, v)) => match summary { - Summary::Min => { - *v = v.min(interval.summary.min_val as f64); - *c += overlap_size; - } - Summary::Max => { - *v = v.max(interval.summary.max_val as f64); - *c += overlap_size; - } - Summary::Mean => { - let zoom_mean = - (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); - *v += (overlap_size as f64) * zoom_mean; - *c += overlap_size; - } - }, - None => match summary { - Summary::Min => { - *data = Some((overlap_size, interval.summary.min_val as f64)); - } - Summary::Max => { - *data = Some((overlap_size, interval.summary.max_val as f64)); - } - Summary::Mean => { - let zoom_mean = - (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); - *data = Some((overlap_size, (overlap_size as f64) * zoom_mean)); - } - }, - } - } - } - while let Some(front) = bin_data.pop_front() { - let bin = front.0; - - match summary { - Summary::Min => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Max => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Mean => { - v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); - } - } - } - Ok(()) -} - -fn to_entry_array>>( - start: i32, - end: i32, - iter: I, - missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), (end - start) as usize); - v.fill(f64::NAN); - for interval in iter { - let interval = interval?; - let interval_start = ((interval.start as i32) - start) as usize; - let interval_end = ((interval.end as i32) - start) as usize; - for i in interval_start..interval_end { - let val = *v.index_mut(i); - *v.index_mut(i) = if val.is_nan() { 1.0 } else { val + 1.0 }; - } - } - for val in v.iter_mut() { - *val = if val.is_nan() { missing } else { *val }; - } - Ok(()) -} - -fn to_entry_array_bins>>( - start: i32, - end: i32, - iter: I, - summary: Summary, - bins: usize, - missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), bins); - v.fill(missing); - - // (, , , , ) - // covered_bases = 0 if uncovered, 1 if covered - let mut bin_data: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as i32).max(start) - start; - let interval_end = (interval.end as i32).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - - while let Some(front) = bin_data.front_mut() { - if front.0 < bin_start { - let front = bin_data.pop_front().unwrap(); - let bin = front.0; - - match summary { - Summary::Min => { - v[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); - } - Summary::Max => { - v[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); - } - Summary::Mean => { - v[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); - } - } - } else { - break; - } - } - while let Some(bin) = bin_data - .back() - .map(|last_bin| { - if last_bin.0 < bin_end { - Some(last_bin.0 + 1) - } else { - None - } - }) - .unwrap_or(Some(bin_start)) - { - let bin_start = ((bin as f64) * bin_size) as i32; - let bin_end = (((bin + 1) as f64) * bin_size) as i32; - - bin_data.push_back(( - bin, - bin_start, - bin_end, - vec![0; (bin_end - bin_start) as usize], - vec![missing; (bin_end - bin_start) as usize], - )); - } - for bin in bin_start..bin_end { - assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); - } - for (_bin, bin_start, bin_end, covered, data) in bin_data.iter_mut() { - // Since bed files can have overlapping entries, a previous entry could have extended - // past the last bin for the current interval - if interval_end <= *bin_start { - break; - } - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - - let range = - ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); - for i in &mut data[range.clone()] { - // If NAN, then 0.0 + 1.0, else i + 1.0 - *i = (*i).max(0.0) + 1.0; - } - for i in &mut covered[range] { - *i = (*i).max(1); - } - } - } - while let Some(front) = bin_data.pop_front() { - let bin = front.0; - - match summary { - Summary::Min => { - v[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); - } - Summary::Max => { - v[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); - } - Summary::Mean => { - v[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); - } - } - } - Ok(()) -} - -fn to_entry_array_zoom>>( - start: i32, - end: i32, - iter: I, - summary: Summary, - bins: usize, - missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), bins); - v.fill(missing); - - // (, , , , ) - // covered_bases = 0 if uncovered, 1 if covered - let mut bin_data: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as i32).max(start) - start; - let interval_end = (interval.end as i32).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - - while let Some(front) = bin_data.front_mut() { - if front.0 < bin_start { - let front = bin_data.pop_front().unwrap(); - let bin = front.0; - - match summary { - Summary::Min => { - v[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); - } - Summary::Max => { - v[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); - } - Summary::Mean => { - v[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); - } - } - } else { - break; - } - } - while let Some(bin) = bin_data - .back() - .map(|last_bin| { - if last_bin.0 < bin_end { - Some(last_bin.0 + 1) - } else { - None - } - }) - .unwrap_or(Some(bin_start)) - { - let bin_start = ((bin as f64) * bin_size) as i32; - let bin_end = (((bin + 1) as f64) * bin_size) as i32; - - bin_data.push_back(( - bin, - bin_start, - bin_end, - vec![0; (bin_end - bin_start) as usize], - vec![missing; (bin_end - bin_start) as usize], - )); - } - for bin in bin_start..bin_end { - assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); - } - for (_bin, bin_start, bin_end, covered, data) in bin_data.iter_mut() { - // Since bed files can have overlapping entries, a previous entry could have extended - // past the last bin for the current interval - if interval_end <= *bin_start { - break; - } - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - - let mean = interval.summary.sum / (interval.summary.bases_covered) as f64; - let range = - ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); - for i in &mut data[range.clone()] { - *i = i.max(0.0); - match summary { - Summary::Mean => *i += mean, - Summary::Min => *i = i.min(interval.summary.min_val), - Summary::Max => *i = i.max(interval.summary.max_val), - } - } - - for i in &mut covered[range] { - *i = (*i).max(1); - } - } - } - while let Some(front) = bin_data.pop_front() { - let bin = front.0; - - match summary { - Summary::Min => { - v[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); - } - Summary::Max => { - v[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); - } - Summary::Mean => { - v[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); - } - } - } - Ok(()) -} - -pub fn start_end( - bbi: &BBIReadRaw, - chrom_name: &str, - start: Option, - end: Option, -) -> PyResult<(u32, u32)> { - let chroms = match &bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), - BBIReadRaw::BigWigFile(b) => b.chroms(), - #[cfg(feature = "remote")] - BBIReadRaw::BigWigRemote(b) => b.chroms(), - BBIReadRaw::BigWigFileLike(b) => b.chroms(), - BBIReadRaw::BigBedFile(b) => b.chroms(), - #[cfg(feature = "remote")] - BBIReadRaw::BigBedRemote(b) => b.chroms(), - BBIReadRaw::BigBedFileLike(b) => b.chroms(), - }; - let chrom = chroms.into_iter().find(|x| x.name == chrom_name); - let length = match chrom { - None => { - return Err(PyErr::new::(format!( - "No chromomsome with name `{}` found.", - chrom_name - ))) - } - Some(c) => c.length, - }; - return Ok(( - start.map(|v| v.max(0) as u32).unwrap_or(0), - end.map(|v| (v.max(0) as u32).min(length)).unwrap_or(length), - )); -} - -fn bigwig_start_end_length( - bbi: &BigWigReadRaw, - chrom_name: &str, - start: Option, - end: Option, -) -> PyResult<(i32, i32, i32)> { - let chroms = bbi.chroms(); - start_end_length_inner(chrom_name, chroms, start, end) -} - -fn bigbed_start_end_length( - bbi: &BigBedReadRaw, - chrom_name: &str, - start: Option, - end: Option, -) -> PyResult<(i32, i32, i32)> { - let chroms = bbi.chroms(); - start_end_length_inner(chrom_name, chroms, start, end) -} - -fn start_end_length_inner( - chrom_name: &str, - chroms: &[bigtools::ChromInfo], - start: Option, - end: Option, -) -> PyResult<(i32, i32, i32)> { - let chrom = chroms.into_iter().find(|x| x.name == chrom_name); - let length = match chrom { - None => { - return Err(PyErr::new::(format!( - "No chromomsome with name `{}` found.", - chrom_name - ))) - } - Some(c) => c.length as i32, - }; - return Ok((start.unwrap_or(0), end.unwrap_or(length), length)); -} - -#[cfg(test)] -mod test { - use bigtools::BedEntry; - use numpy::ndarray::Array; - - use crate::utils::to_entry_array_bins; - use crate::utils::Summary; - - #[test] - fn test_to_entry_array_bins() { - fn eq_with_nan_eq(a: f64, b: f64) -> bool { - (a.is_nan() && b.is_nan()) || (a == b) - } - - #[track_caller] - fn assert_equal(found: Vec, expected: Vec) { - let equal = (found.len() == expected.len()) && // zip stops at the shortest - found.iter() - .zip(expected.iter()) - .all(|(a,b)| eq_with_nan_eq(*a, *b)); - if !equal { - panic!("Vecs not equal: expected {:?}, found {:?}", expected, found); - } - } - - let entries = []; - let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.into_iter().map(|v| Ok(v)), - Summary::Mean, - 1, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [f64::NAN].into_iter().collect::>()); - - let entries = []; - let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.into_iter().map(|v| Ok(v)), - Summary::Mean, - 1, - 0.0, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [0.0].into_iter().collect::>()); - - let entries = [BedEntry { - start: 10, - end: 20, - rest: "".to_string(), - }]; - let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.into_iter().map(|v| Ok(v)), - Summary::Mean, - 1, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0].into_iter().collect::>()); - - let entries = [BedEntry { - start: 10, - end: 20, - rest: "".to_string(), - }]; - let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.into_iter().map(|v| Ok(v)), - Summary::Mean, - 2, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0, 1.0].into_iter().collect::>()); - - let entries = [ - BedEntry { - start: 10, - end: 20, - rest: "".to_string(), - }, - BedEntry { - start: 15, - end: 20, - rest: "".to_string(), - }, - ]; - let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Mean, - 2, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0, 2.0].into_iter().collect::>()); - let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Min, - 1, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0].into_iter().collect::>()); - let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Max, - 1, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [2.0].into_iter().collect::>()); - - let entries = [ - BedEntry { - start: 10, - end: 20, - rest: "".to_string(), - }, - BedEntry { - start: 15, - end: 20, - rest: "".to_string(), - }, - BedEntry { - start: 15, - end: 25, - rest: "".to_string(), - }, - ]; - let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Mean, - 2, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0, 3.0].into_iter().collect::>()); - let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Min, - 1, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0].into_iter().collect::>()); - let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Max, - 1, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [3.0].into_iter().collect::>()); - } -} diff --git a/pybigtools/src/writer.rs b/pybigtools/src/writer.rs index dc6dfd91..fcd061bb 100644 --- a/pybigtools/src/writer.rs +++ b/pybigtools/src/writer.rs @@ -11,12 +11,12 @@ use crate::errors::BBIFileClosed; /// Interface for writing to a BigWig file. #[pyclass(module = "pybigtools")] -pub struct BigWigWrite { +pub struct BigWigWriter { pub bigwig: Option, } #[pymethods] -impl BigWigWrite { +impl BigWigWriter { /// Write values to the BigWig file. /// /// The underlying file will be closed automatically when the function @@ -77,17 +77,17 @@ impl BigWigWrite { let mut iter: Bound<'_, PyIterator> = match self.inner.downcast_bound(py) { Ok(o) => o.clone(), Err(_) => { - return Some(Err(IterError(format!( - "Passed value for `val` is not iterable." - )))) + return Some(Err(IterError( + "Passed value for `val` is not iterable.".to_string(), + ))) } }; let next: Result<(String, Value), pyo3::PyErr> = match iter.next()? { Err(e) => { e.print(py); - return Some(Err(IterError(format!( - "An error occurred while iterating." - )))); + return Some(Err(IterError( + "An error occurred while iterating.".to_string(), + ))); } Ok(n) => { // TODO: try block or separate function @@ -145,13 +145,12 @@ impl BigWigWrite { } })?; let vals_iter_raw = Iter { inner: iter }.map(|v| match v { - Err(e) => Err(io::Error::new(io::ErrorKind::Other, format!("{}", e.0))), + Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.0.to_string())), Ok(v) => Ok(v), }); let data = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); - match bigwig.write(data, runtime) { - Err(e) => println!("{}", e), - Ok(_) => {} + if let Err(e) = bigwig.write(data, runtime) { + println!("{}", e) } Ok(()) }) @@ -169,12 +168,12 @@ impl BigWigWrite { /// Interface for writing to a BigBed file. #[pyclass(module = "pybigtools")] -pub struct BigBedWrite { +pub struct BigBedWriter { pub bigbed: Option, } #[pymethods] -impl BigBedWrite { +impl BigBedWriter { /// Write values to the BigBed file. /// /// The underlying file will be closed automatically when the function @@ -246,17 +245,17 @@ impl BigBedWrite { let mut iter: Bound<'_, PyIterator> = match self.inner.downcast_bound(py) { Ok(o) => o.clone(), Err(_) => { - return Some(Err(IterError(format!( - "Passed value for `val` is not iterable." - )))) + return Some(Err(IterError( + "Passed value for `val` is not iterable.".to_string(), + ))) } }; let next: Result<(String, BedEntry), pyo3::PyErr> = match iter.next()? { Err(e) => { e.print(py); - return Some(Err(IterError(format!( - "An error occurred while iterating." - )))); + return Some(Err(IterError( + "An error occurred while iterating.".to_string(), + ))); } Ok(n) => { // TODO: try block or separate function @@ -314,15 +313,12 @@ impl BigBedWrite { } })?; let vals_iter_raw = Iter { inner: iter }.map(|v| match v { - Err(e) => Err(io::Error::new(io::ErrorKind::Other, format!("{}", e.0))), + Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.0.to_string())), Ok(v) => Ok(v), }); let data = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); - match bigbed.write(data, runtime) { - Err(e) => { - println!("{}", e) - } - Ok(_) => {} + if let Err(e) = bigbed.write(data, runtime) { + println!("{}", e) } Ok(()) }) diff --git a/pybigtools/tests/data/mini.bb b/pybigtools/tests/data/mini.bb new file mode 100644 index 00000000..c9252708 Binary files /dev/null and b/pybigtools/tests/data/mini.bb differ diff --git a/pybigtools/tests/data/mini.bw b/pybigtools/tests/data/mini.bw new file mode 100644 index 00000000..0df16fad Binary files /dev/null and b/pybigtools/tests/data/mini.bw differ diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index d0a40fdc..0d7c5e02 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -258,112 +258,205 @@ def test_zoom_records_oob(bw, bb): def test_values_bp(bw, bb): # (chrom, None, None) => all values on chrom - assert len(bw.values("chr17")) == 83_257_441 - assert len(bb.values("chr21")) == 48_129_895 + assert len(bw.values("chr17", fillna=None)) == 83_257_441 + assert len(bb.values("chr21", fillna=None)) == 48_129_895 # (chrom, start, None) => all values from (start, ) - assert len(bw.values("chr17", 0)) == 83_257_441 - assert len(bw.values("chr17", 10)) == 83_257_441 - 10 - assert len(bb.values("chr21", 0)) == 48_129_895 - assert len(bb.values("chr21", 10)) == 48_129_895 - 10 + assert len(bw.values("chr17", 0, fillna=None)) == 83_257_441 + assert len(bw.values("chr17", 10, fillna=None)) == 83_257_441 - 10 + assert len(bb.values("chr21", 0, fillna=None)) == 48_129_895 + assert len(bb.values("chr21", 10, fillna=None)) == 48_129_895 - 10 # (chrom, start, end) - assert len(bw.values("chr17", 100_000, 110_000)) == 10_000 - assert len(bb.values("chr21", 10_148_000, 10_158_000)) == 10_000 + assert len(bw.values("chr17", 100_000, 110_000, fillna=None)) == 10_000 + assert len(bb.values("chr21", 10_148_000, 10_158_000, fillna=None)) == 10_000 def test_values_binned(bw, bb): - assert len(bw.values("chr17", 100000, 110000, 10)) == 10 - assert len(bb.values("chr21", 10_148_000, 10_158_000, 10)) == 10 + assert len(bw.values("chr17", 100000, 110000, 10, fillna=None)) == 10 + assert len(bb.values("chr21", 10_148_000, 10_158_000, 10, fillna=None)) == 10 - assert bw.values("chr17", 100000, 110000, 10)[0] == 0.44886381671868925 - assert bb.values("chr21", 10_148_000, 10_158_000, 10)[0] == 1.0 + # Statistics over only the covered bases (`uncovered=None`). + assert ( + bw.values("chr17", 100000, 110000, 10, uncovered=None, fillna=None)[0] + == 0.44886381671868925 + ) + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, uncovered=None, fillna=None)[0] + == 1.0 + ) - assert bw.values("chr17", 100000, 110000, 10, "max")[0] == 1.1978399753570557 - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "max")[0] == 1.0 + assert ( + bw.values("chr17", 100000, 110000, 10, "max", uncovered=None, fillna=None)[0] + == 1.1978399753570557 + ) + assert ( + bb.values( + "chr21", 10_148_000, 10_158_000, 10, "max", uncovered=None, fillna=None + )[0] + == 1.0 + ) + + assert ( + bw.values("chr17", 100000, 110000, 10, "min", uncovered=None, fillna=None)[0] + == 0.05403999984264374 + ) + assert ( + bb.values( + "chr21", 10_148_000, 10_158_000, 10, "min", uncovered=None, fillna=None + )[0] + == 1.0 + ) - assert bw.values("chr17", 100000, 110000, 10, "min")[0] == 0.05403999984264374 - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min")[0] == 0.0 + # Uncovered bases counted as 0 in min (min0 semantics). + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=0, fillna=None)[ + 0 + ] + == 0.0 + ) def test_values_binned_exact(bw, bb): + # Statistics over only the covered bases (`uncovered=None`). + assert np.isclose( + bw.values( + "chr17", 100000, 110000, 10, "mean", exact=True, uncovered=None, fillna=None + )[0], + 0.4542629980980206, + ) assert ( - bw.values("chr17", 100000, 110000, 10, "mean", exact=True)[0] - == 0.4542629980980206 + bb.values( + "chr21", + 10_148_000, + 10_158_000, + 10, + "mean", + exact=True, + uncovered=None, + fillna=None, + )[0] + == 1.0 ) - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "mean", exact=True)[0] == 1.0 - - assert list(bw.values("chr17", 59890, 59900, 10, "mean", exact=True)) == [ - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.06791999936103821, - 0.06791999936103821, - ] + assert np.allclose( + bw.values( + "chr17", 59890, 59900, 10, "mean", exact=True, uncovered=0.0, fillna=None + ), + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.06791999936103821, + 0.06791999936103821, + ], + ) -def test_values_binned_missing_oob(bw, bb): - assert list( - bw.values("chr17", 59890, 59900, 10, "mean", exact=True, missing=-1.0) - ) == [ - -1.0, - -1.0, - -1.0, - -1.0, - -1.0, - -1.0, - -1.0, - -1.0, - 0.06791999936103821, - 0.06791999936103821, - ] - x = bw.values("chr17", -10, 10, 20, "mean", exact=True, missing=0.0) +def test_values_binned_uncovered_oob(bw, bb): + assert np.allclose( + bw.values( + "chr17", 59890, 59900, 10, "mean", exact=True, uncovered=-1.0, fillna=None + ), + [ + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + 0.06791999936103821, + 0.06791999936103821, + ], + ) + + x = bw.values("chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0, fillna=None) assert math.isnan(x[0]) assert not math.isnan(x[19]) - x = bb.values("chr21", -10, 10, 20, "mean", exact=True, missing=0.0) + + x = bb.values("chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0, fillna=None) assert math.isnan(x[0]) assert not math.isnan(x[19]) - x = bw.values("chr17", -10, 10, 20, "mean", exact=True, missing=0.0, oob=0.0) + x = bw.values( + "chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=0.0, fillna=None + ) assert x[0] == 0.0 - x = bb.values("chr21", -10, 10, 20, "mean", exact=True, missing=0.0, oob=0.0) + x = bb.values( + "chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=0.0, fillna=None + ) assert x[0] == 0.0 - assert pytest.approx(bw.values("chr17", 59890, 59900)[9], 0.0001) == 0.06792 - assert pytest.approx(bw.values("chr17", 59890, 59900, missing=np.nan)[9], 0.0001) == 0.06792 - -def test_values_binned_estimate_differences(bw, bb): - # Some differences in estimates between pybigtools to other libs - # Namely, bigtools calculates estimates by taking the - # sum of nanmeans over covered bases (summary.sum/summary.covered_bases) - # and dividing by covered bases (overlap between zoom and bin) - # So, including these as cases where the calculated value is different - vals = bw.values("chr17", 85525, 85730, bins=2, exact=False) - assert list(vals) == [0.15392776003070907, 2.728891665264241] - vals = bw.values("chr17", 85525, 85730, bins=2, exact=True) - assert list(vals) == [0.06770934917680595, 2.4864424403431347] - vals = bw.values("chr17", 59900, 60105, bins=2, exact=False) - assert list(vals) == [0.5358060553962108, 0.5513471488813751] - vals = bw.values("chr17", 59900, 60105, bins=2, exact=True) - assert list(vals) == [0.5362001863472602, 0.5527710799959679] - - vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=False) - assert list(vals) == [1.2572170068028603] - vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=True) - assert list(vals) == [1.3408662900188324] + assert np.isclose(bw.values("chr17", 59890, 59900, fillna=None)[9], 0.06792) + assert np.isclose( + bw.values("chr17", 59890, 59900, uncovered=None, fillna=None)[9], 0.06792 + ) + + +def test_values_binned_exact_vs_from_zoom(bw, bb): + # Statistics over only the covered bases (`uncovered=None`). + vals = bw.values( + "chr17", 85525, 85730, bins=2, exact=False, uncovered=None, fillna=None + ) + assert np.allclose(vals, [0.10737355, 2.72889167]) + vals = bw.values( + "chr17", 85525, 85730, bins=2, exact=True, uncovered=None, fillna=None + ) + assert np.allclose(vals, [0.06770935, 2.48644244]) + + vals = bw.values( + "chr17", 59900, 60105, bins=2, exact=False, uncovered=None, fillna=None + ) + assert np.allclose(vals, [0.53580606, 0.55134715]) + vals = bw.values( + "chr17", 59900, 60105, bins=2, exact=True, uncovered=None, fillna=None + ) + assert np.allclose(vals, [0.53620019, 0.55277108]) + + vals = bb.values( + "chr21", + 14_760_000, + 14_800_000, + bins=1, + exact=False, + uncovered=None, + fillna=None, + ) + assert np.allclose(vals, [1.23191877]) + vals = bb.values( + "chr21", + 14_760_000, + 14_800_000, + bins=1, + exact=True, + uncovered=None, + fillna=None, + ) + assert np.allclose(vals, [1.34086629]) def test_values_assign_to_array(bw, bb): # The returned array is the same as the one passed, so both show the same values arr = np.zeros(20) + ret_arr = bw.values( - "chr17", -10, 10, 20, "mean", exact=True, missing=0.0, oob=np.nan, arr=arr + "chr17", + -10, + 10, + 20, + "mean", + exact=True, + uncovered=0.0, + oob=np.nan, + arr=arr, + fillna=None, ) assert math.isnan(arr[0]) assert arr[19] == 0.0 @@ -372,7 +465,16 @@ def test_values_assign_to_array(bw, bb): assert np.array_equal(arr, ret_arr, equal_nan=True) ret_arr = bb.values( - "chr21", -10, 10, 20, "mean", exact=True, missing=0.0, oob=np.nan, arr=arr + "chr21", + -10, + 10, + 20, + "mean", + exact=True, + uncovered=0.0, + oob=np.nan, + arr=arr, + fillna=None, ) assert math.isnan(arr[0]) assert arr[19] == 0.0 @@ -380,30 +482,31 @@ def test_values_assign_to_array(bw, bb): assert ret_arr[19] == 0.0 assert np.array_equal(arr, ret_arr, equal_nan=True) + def test_values_oob(bw): chroms = bw.chroms() # Region: arbitrary chromosome, 1 bin outside overlap chrom = list(chroms.keys())[0] bin_size = 8 - end = chroms[chrom]+bin_size - start = end - 2*bin_size + end = chroms[chrom] + bin_size + start = end - 2 * bin_size # Try to get values - v=bw.values(chrom, start, end, bins = 3) + v = bw.values(chrom, start, end, bins=3, fillna=None) assert len(v) == 3 assert np.isnan(v[2]) # Note: also happens if explicitly setting oob: - v=bw.values(chrom, start, end, bins = 3, oob=0) + v = bw.values(chrom, start, end, bins=3, oob=0, fillna=None) assert len(v) == 3 assert v[2] == 0 + def test_big_gene_pred(): bb = pybigtools.open(REPO_ROOT / "bigtools/resources/test/bigGenePred.bb") - bb.values("chr21", 14_000_000, 18_000_000, 100) + bb.values("chr21", 14_000_000, 18_000_000, 100, fillna=None) def test_average_over_bed(bw, bb): assert pytest.raises(ValueError, bb.average_over_bed, "ignored") - assert pytest.raises(ValueError, bw.average_over_bed, 0) path = str(REPO_ROOT / "bigtools/resources/test/bwaob_intervals.bed") diff --git a/pybigtools/tests/test_io.py b/pybigtools/tests/test_io.py index 935a5a7b..32c94d1a 100644 --- a/pybigtools/tests/test_io.py +++ b/pybigtools/tests/test_io.py @@ -72,14 +72,15 @@ def genintervals(): assert a[1] == b[2] assert math.isclose(a[2], b[3], abs_tol=0.00001) + def test_bigbed_write(tmpdir): f = pybigtools.open(os.path.join(tmpdir, "test.bigBed"), "w") f.write( - {"chr1": 1000, "chr2": 1000}, + {"chr1": 1000, "chr2": 1000}, [ - ("chr1", 0, 100, "foo\tbar\t3"), + ("chr1", 0, 100, "foo\tbar\t3"), ("chr2", 100, 200, "xxx\tyyy\t1.0"), - ("chr2", 200, 300, "zzz\twww\t1.0") + ("chr2", 200, 300, "zzz\twww\t1.0"), ], autosql="""\ table bed3 @@ -88,16 +89,20 @@ def test_bigbed_write(tmpdir): string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" + string name; "Name of item" + string name2; "Another name" + float value; "Value" ) - """ +""", ) f.close() f = pybigtools.open(os.path.join(tmpdir, "test.bigBed")) records = list(f.records("chr2")) - assert records[0][2] == 'xxx' + assert records[0][2] == "xxx" sql = f.sql() assert sql.startswith("table") + # TODO: bigWigAverageOverBed # TODO: bigWigMerge diff --git a/pybigtools/tests/test_raster.py b/pybigtools/tests/test_raster.py new file mode 100644 index 00000000..906bec97 --- /dev/null +++ b/pybigtools/tests/test_raster.py @@ -0,0 +1,921 @@ +import pathlib + +import numpy as np +import pytest + +import pybigtools as pbt +import bbi + +TEST_DIR = pathlib.Path(__file__).parent +REPO_ROOT = TEST_DIR.parent.parent + + +@pytest.fixture +def bw_path(): + """ + The values here are identical to the coverage depth of ``bb_path``. + The coverage breadth is 8 out of 12 bases (66.67%). + + chroms = {"chr1": 12} + entries = [ + ("chr1", 1, 3, 1.0), + ("chr1", 4, 8, 2.0), + ("chr1", 8, 10, 3.0) + ] + """ + return str(TEST_DIR / "data/mini.bw") + + +@pytest.fixture +def bb_path(): + """ + The coverage depth of these intervals is identical to ``bw_path``. + The coverage breadth is 8 out of 12 bases (66.67%). + + chroms = {"chr1": 12} + entries = [ + ("chr1", 1, 3, ""), + ("chr1", 4, 6, ""), + ("chr1", 4, 10, ""), + ("chr1", 6, 10, ""), + ("chr1", 8, 10, ""), + ] + """ + return str(TEST_DIR / "data/mini.bb") + + +@pytest.fixture +def big_bw_path(): + """Real bigWig with 10 zoom levels (chr17, 83M bases); used to exercise + the zoom-interpolation code path (`exact=False` with non-trivial bin sizes).""" + return str(REPO_ROOT / "bigtools/resources/test/valid.bigWig") + + +@pytest.fixture +def big_bb_path(): + """Real bigBed with 5 zoom levels (chr21, 48M bases).""" + return str(TEST_DIR / "data/bigBedExample.bb") + + +def test_values(bw_path, bb_path): + # Default `uncovered=None` (NaN): uncovered bases are NaN. + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", 0, 12, fillna=None) + assert np.allclose( + x1, + [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", -2, 14, fillna=None) + assert np.allclose( + x1, + [ + np.nan, + np.nan, + np.nan, + 1, + 1, + np.nan, + 2, + 2, + 2, + 2, + 3, + 3, + np.nan, + np.nan, + np.nan, + np.nan, + ], + equal_nan=True, + ) + + # `uncovered=0` reproduces the previous fill-with-zero behavior. + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", 0, 12, uncovered=0, fillna=None) + assert np.allclose(x1, [0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0], equal_nan=True) + + +def test_values_missing_alias_deprecated(bw_path): + # `missing` is a deprecated alias for `fillna`. Using it raises a + # DeprecationWarning but otherwise behaves identically to `fillna`. + with pytest.warns(DeprecationWarning, match=r"`missing` is deprecated"): + x = pbt.open(bw_path).values("chr1", 0, 12, missing=0) + # Behavior matches `fillna=0`. + y = pbt.open(bw_path).values("chr1", 0, 12, fillna=0) + assert np.array_equal(x, y) + + +def test_values_default_fillna_deprecated(bw_path): + # Not passing `fillna` triggers a DeprecationWarning about the default + # change. The new default is `fillna=None` (leave NaN), matching the + # current Rust-level default. + with pytest.warns(DeprecationWarning, match=r"default behavior of `values"): + x = pbt.open(bw_path).values("chr1", 0, 12) + # Equivalent to explicit fillna=None. + y = pbt.open(bw_path).values("chr1", 0, 12, fillna=None) + assert np.array_equal(x, y, equal_nan=True) + + +def test_fillna(bw_path, bb_path): + # `fillna` is a post-rasterization fill that replaces NaN at in-bounds + # positions. It is INDEPENDENT of `oob`: out-of-chromosome positions are + # filled by `oob` and not touched by `fillna`. + + # Base-level: default `uncovered=None` leaves uncovered bases as NaN; + # `fillna=-99` replaces them after the fact. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", 0, 12, fillna=-99) + assert np.allclose( + x, [-99, 1, 1, -99, 2, 2, 2, 2, 3, 3, -99, -99], equal_nan=True + ) + + # `oob` and `fillna` control different concerns: OOB positions take the + # `oob` value; in-bounds uncovered positions take `fillna`. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", -2, 14, oob=-1, fillna=0) + assert np.allclose( + x, + [-1, -1, 0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0, -1, -1], + equal_nan=True, + ) + + # `fillna` does not "leak" into OOB positions: with `oob=NaN` (default), + # OOB positions stay NaN even when `fillna` is set. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", -2, 14, fillna=0) + assert np.allclose( + x, + [ + np.nan, + np.nan, + 0, + 1, + 1, + 0, + 2, + 2, + 2, + 2, + 3, + 3, + 0, + 0, + np.nan, + np.nan, + ], + equal_nan=True, + ) + + # Binned: empty bin under `uncovered=None` (default) returns NaN; `fillna=0` + # replaces that. Partial bins are unaffected because they returned a + # real number. + for path in [bw_path, bb_path]: + x = pbt.open(path).values( + "chr1", 0, 12, bins=6, exact=True, summary="mean", fillna=0 + ) + # Bins 0..4 partial or full → real means; bin 5 empty → fillna=0. + assert np.allclose(x, [1, 1, 2, 2, 3, 0], equal_nan=True) + + +def test_binned_mean(bw_path, bb_path): + # uncovered=None: classical mean over only covered bases; empty bins → NaN. + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=None, + summary="mean", + fillna=None, + ) + assert np.allclose( + x1, + [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=None, + summary="mean", + fillna=None, + ) + assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=None, + summary="mean", + fillna=None, + ) + assert np.allclose(x1, [1, 1, 2, 2.5, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=None, + summary="mean", + fillna=None, + ) + assert np.allclose(x1, [1.5, 2.5], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=None, + summary="mean", + fillna=None, + ) + assert np.allclose(x1, [2], equal_nan=True) + + +def test_binned_mean_uncovered_zero(bw_path, bb_path): + # uncovered=0: treat uncovered bases as 0 (UCSC `mean0` semantics). + # Equivalent to (sum + 0*n_uncovered) / bin_size = sum / bin_size. + uncovered = 0 + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, + ) + assert np.allclose( + x1, + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, + ) + assert np.allclose(x1, [0.5, 0.5, 2, 2, 3, uncovered], equal_nan=True) + + # bins=5 has non-integer `bin_size = 2.4`. Integer-aligned bin widths are + # [2, 2, 3, 2, 3] — see `integer_bin_bounds` in raster.rs. Mean0 uses the + # integer bin width (not the float `bin_size`) as the denominator. + # Bin widths: 2 2 3 2 3 + # Coverage: 1/2 1/2 3/3 2/2 1/3 + # Sums: 1 1 6 5 3 + # mean0: 0.5 0.5 2.0 2.5 1.0 + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, + ) + assert np.allclose(x1, [0.5, 0.5, 2.0, 2.5, 1.0], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, + ) + assert np.allclose(x1, [1, 10 / 6], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, + ) + assert np.allclose(x1, [16 / 12], equal_nan=True) + + +def test_binned_minmax(bw_path, bb_path): + # uncovered=None: classical min/max over only covered bases; empty bins → NaN. + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=None, + summary="min", + fillna=None, + ) + assert np.allclose( + x1, + [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], + equal_nan=True, + ) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=None, + summary="max", + fillna=None, + ) + assert np.allclose( + x1, + [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=None, + summary="min", + fillna=None, + ) + assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=None, + summary="max", + fillna=None, + ) + assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=None, + summary="min", + fillna=None, + ) + assert np.allclose(x1, [1, 1, 2, 2, 3], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=None, + summary="max", + fillna=None, + ) + assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=None, + summary="min", + fillna=None, + ) + assert np.allclose(x1, [1, 2], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=None, + summary="max", + fillna=None, + ) + assert np.allclose(x1, [2, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=None, + summary="min", + fillna=None, + ) + assert np.allclose(x1, [1], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=None, + summary="max", + fillna=None, + ) + assert np.allclose(x1, [3], equal_nan=True) + + +def test_binned_minmax_uncovered_zero(bw_path, bb_path): + # uncovered=0: uncovered bases counted as 0 in min/max (UCSC `min0`/`max0` semantics). + uncovered = 0 + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, + ) + assert np.allclose( + x1, + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], + equal_nan=True, + ) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, + ) + assert np.allclose( + x1, + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, + ) + assert np.allclose(x1, [0, 0, 2, 2, 3, uncovered], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, + ) + assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, + ) + assert np.allclose(x1, [0, 0, 2, 2, 0], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, + ) + assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, + ) + assert np.allclose(x1, [0, 0], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, + ) + assert np.allclose(x1, [2, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, + ) + assert np.allclose(x1, [0], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, + ) + assert np.allclose(x1, [3], equal_nan=True) + + +@pytest.mark.parametrize( + "n_bins,exact", [(n_bins, exact) for n_bins in range(1, 13) for exact in [True]] +) +def test_binned_coverage_breadth(bw_path, bb_path, n_bins, exact): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + 0, + 12, + bins=n_bins, + exact=exact, + summary="bases_covered", + fillna=None, + ) + assert np.nansum(x1) == 8 + + +def test_values_arr_param(bw_path, bb_path): + # Pass a pre-allocated array. The function writes into it and returns the + # same object. + for path in [bw_path, bb_path]: + arr = np.full(12, -7.0) + ret = pbt.open(path).values("chr1", 0, 12, uncovered=0, arr=arr, fillna=None) + assert ret is arr # same object + assert np.allclose(arr, [0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0]) + + # Binned: pre-allocated array of the correct length. + for path in [bw_path, bb_path]: + arr = np.full(6, -7.0) + ret = pbt.open(path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + summary="mean", + uncovered=None, + arr=arr, + fillna=None, + ) + assert ret is arr + assert np.allclose(arr, [1, 1, 2, 2, 3, np.nan], equal_nan=True) + + +def test_values_arr_wrong_size(bw_path): + # Wrong-length `arr` raises ValueError. + with pytest.raises(ValueError): + pbt.open(bw_path).values( + "chr1", 0, 12, arr=np.zeros(11), fillna=None + ) # need 12 + + with pytest.raises(ValueError): + pbt.open(bw_path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + arr=np.zeros(5), + fillna=None, + ) # need 6 + + +def test_values_bad_args(bw_path): + # Unknown chromosome. + with pytest.raises(KeyError): + pbt.open(bw_path).values("nope", 0, 12, fillna=None) + + # Unknown summary string. + with pytest.raises(ValueError): + pbt.open(bw_path).values("chr1", 0, 12, bins=4, summary="median", fillna=None) + + +# -- These tests check Parity with UCSC via pybbi -- + + +@pytest.mark.parametrize( + "start,end", [(start, end) for start in [0, 2] for end in [10, 12]] +) +def test_bbi_consistent_values(bw_path, bb_path, start, end): + # pybigtools defaults to `uncovered=None` (NaN), pybbi to `missing=0.0`. + # Pass matching values explicitly for the comparison. + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", start, end, uncovered=0.0, fillna=None) + x2 = bbi.open(path).fetch("chr1", start, end, missing=0.0) + assert np.allclose(x1, x2, equal_nan=True) + + +def test_bbi_consistent_values_match_coverage(bb_path, bw_path): + bb1 = pbt.open(bb_path).values("chr1", uncovered=0.0, fillna=None) + bw1 = pbt.open(bw_path).values("chr1", uncovered=0.0, fillna=None) + assert np.allclose(bb1, bw1, equal_nan=True) + + bb2 = bbi.open(bb_path).fetch("chr1", 0, -1) + bw2 = bbi.open(bw_path).fetch("chr1", 0, -1) + assert np.allclose(bb2, bw2, equal_nan=True) + + +@pytest.mark.parametrize( + "uncovered,oob", + [(uncovered, oob) for uncovered in [None, 0, -1] for oob in [np.nan, 0, -1]], +) +@pytest.mark.parametrize( + "start,end", + [(start, end) for start in [-2, 0, 2] for end in [10, 12, 14]], +) +def test_bbi_consistent_values_uncovered_oob( + bw_path, bb_path, start, end, uncovered, oob +): + # `uncovered=None` (pybigtools) ≡ `missing=NaN` (pybbi); both mean "leave + # uncovered bases as NaN." + pbbi_missing = np.nan if uncovered is None else uncovered + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", start, end, uncovered=uncovered, oob=oob, fillna=None + ) + x2 = bbi.open(path).fetch("chr1", start, end, missing=pbbi_missing, oob=oob) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,exact,summary", + [ + (start, end, n_bins, exact, summary) + for start in [0, 2] + for end in [10, 12] + for n_bins in range(1, 13) + for exact in [True] + # `bin_covered` (= `cov` in pybbi) is excluded: pybigtools always returns + # the factual coverage fraction for empty bins (0.0), while pybbi fills + # empty bins with `missing`. Intentional divergence. + for summary in ["sum", "mean", "std", "min", "max"] + ], +) +def test_bbi_consistent_binned(bw_path, bb_path, start, end, n_bins, exact, summary): + # pybbi computes "regular" statistics (exclude uncovered) and fills empty bins + # with `missing`. Under pybigtools' per-base `uncovered` semantics, the + # equivalent comparison passes `uncovered=None` to pybigtools and + # `missing=NaN` to pybbi: classical stats over partial bins, NaN for empty. + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + start, + end, + bins=n_bins, + exact=exact, + summary=summary, + uncovered=None, + fillna=None, + ) + x2 = bbi.open(path).fetch( + "chr1", + start, + end, + bins=n_bins, + exact=exact, + summary=summary, + missing=np.nan, + ) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,exact,oob", + [ + (start, end, n_bins, exact, oob) + for start in [-2, 0, 2] + for end in [10, 12, 14] + for n_bins in range(1, 13) + for exact in [True] + for oob in [np.nan, 0, -1] + ], +) +def test_bbi_consistent_binned_uncovered_oob( + bw_path, bb_path, start, end, n_bins, exact, oob +): + # `uncovered` is restricted to None for parity: pybigtools' per-base `uncovered` + # semantics diverge from pybbi's empty-bin-fill semantics whenever the bin is + # partially covered. With pybigtools' `uncovered=None` and pybbi's + # `missing=NaN`, both compute classical statistics over the covered bases. + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", + start, + end, + bins=n_bins, + exact=exact, + uncovered=None, + oob=oob, + fillna=None, + ) + x2 = bbi.open(path).fetch( + "chr1", start, end, bins=n_bins, exact=exact, missing=np.nan, oob=oob + ) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,exact", + [(0, 12, n_bins, exact) for n_bins in [12, 24, 48] for exact in [True]], +) +def test_bbi_consistent_binned_upsample(bw_path, bb_path, start, end, n_bins, exact): + # Upsampling produces only fully-covered or fully-empty bins (no partial bins), + # so per-base vs empty-bin `uncovered` semantics agree. + for path in [bb_path, bw_path]: + x1 = pbt.open(path).values( + "chr1", + start, + end, + bins=n_bins, + exact=exact, + uncovered=None, + fillna=None, + ) + x2 = bbi.open(path).fetch( + "chr1", start, end, bins=n_bins, exact=exact, missing=np.nan + ) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,summary", + [ + # Large query ranges + few bins → bin size big enough that `best_zoom` + # selects a real zoom level, exercising the zoom-interpolation path. + # `valid.bigWig` has zoom resolutions [10, 40, 160, 640, 2560, ...]. + # For a query of width W and N bins, max_zoom_size = W / (2*N). + # E.g. 100kb / (2*10 bins) = 5000 ≥ resolution 2560 → uses zoom 2560. + (100_000, 200_000, 10, "mean"), + (100_000, 200_000, 10, "std"), + (100_000, 200_000, 10, "min"), + (100_000, 200_000, 10, "max"), + (100_000, 200_000, 10, "sum"), + (100_000, 200_000, 50, "mean"), + (0, 1_000_000, 100, "mean"), + (0, 1_000_000, 100, "std"), + ], +) +def test_bbi_consistent_binned_zoom_bw(big_bw_path, start, end, n_bins, summary): + # Parity vs pybbi when traversing the zoom-interpolation path. This is the + # path that the UCSC ceil+rescale normalization affects, so a passing + # parity test here validates that normalization is correctly applied. + x1 = pbt.open(big_bw_path).values( + "chr17", + start, + end, + bins=n_bins, + exact=False, + summary=summary, + uncovered=None, + fillna=None, + ) + x2 = bbi.open(big_bw_path).fetch( + "chr17", + start, + end, + bins=n_bins, + exact=False, + summary=summary, + missing=np.nan, + ) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,summary", + [ + (10_000_000, 20_000_000, 50, "mean"), + (10_000_000, 20_000_000, 50, "min"), + (10_000_000, 20_000_000, 50, "max"), + (10_000_000, 20_000_000, 100, "sum"), + ], +) +def test_bbi_consistent_binned_zoom_bb(big_bb_path, start, end, n_bins, summary): + # Same parity check on the bigBed zoom path. + x1 = pbt.open(big_bb_path).values( + "chr21", + start, + end, + bins=n_bins, + exact=False, + summary=summary, + uncovered=None, + fillna=None, + ) + x2 = bbi.open(big_bb_path).fetch( + "chr21", + start, + end, + bins=n_bins, + exact=False, + summary=summary, + missing=np.nan, + ) + assert np.allclose(x1, x2, equal_nan=True) + + +def test_values_fully_out_of_bounds(bw_path, bb_path): + # Query entirely past the end of the chromosome. + # chr1 length is 12. Query [20, 30) is fully OOB. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", 20, 30, fillna=None) + # Default oob=NaN. + assert len(x) == 10 + assert all(np.isnan(v) for v in x) + + # With oob=-1, the whole array is -1. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", 20, 30, oob=-1, fillna=None) + assert np.array_equal(x, np.full(10, -1.0)) + + # Query entirely before the start (negative range). + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", -10, -2, fillna=None) + assert len(x) == 8 + assert all(np.isnan(v) for v in x) + + # Binned version of the same — all bins OOB, all bins get oob. + for path in [bw_path, bb_path]: + x = pbt.open(path).values( + "chr1", 20, 30, bins=5, exact=True, oob=0, fillna=None + ) + assert np.array_equal(x, np.zeros(5)) diff --git a/pybigtools/uv.lock b/pybigtools/uv.lock index c16796d7..261a64c4 100644 --- a/pybigtools/uv.lock +++ b/pybigtools/uv.lock @@ -454,6 +454,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "pybbi" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f5/83/33a52ad666d93564e58986a78226dfb9dc7f52a75b36dcf8271f671f8eb1/pybbi-0.4.2.tar.gz", hash = "sha256:b9daa5f04cf5641c3addd729737d649c439f440082fe783c28be3353d2aa444b", size = 34725029, upload-time = "2025-10-30T17:09:51.165Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/a9/c2d4dbcc80c5406ca0a532fe632e16b2eda194b1da1754b82bc7d7ca7248/pybbi-0.4.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:4893409bb7b51c81759d43f6b15e2b4dc6d3de20e5e39b9ea4b8ff1bbfcc17a0", size = 2936500, upload-time = "2025-10-30T17:09:13.405Z" }, + { url = "https://files.pythonhosted.org/packages/33/aa/b6ae0e4fc80d5306a4180efb73d03413a5104953089d6e46b04cf6945b1c/pybbi-0.4.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:684c76ef134ee2815f02ef8ba0fb5f270fe45a127a092c24204311cd6e45a2bb", size = 3225149, upload-time = "2025-10-30T17:09:15.736Z" }, + { url = "https://files.pythonhosted.org/packages/40/c1/f7c95688e21d9c65717dc088945e31848b66bf904870b8fda1316b4e5490/pybbi-0.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20914802a4b3a9d46b2eed811ff77f8fb9d26d1bc4af9168ce3f33ffbd564518", size = 3081199, upload-time = "2025-10-30T17:09:17.272Z" }, + { url = "https://files.pythonhosted.org/packages/6d/3a/9b3fd6dcf58ead95c267fc819df9647700cc69200e992014b80e43f69b65/pybbi-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1965c3620f18546d65e85b1b99d5306303ffbc2fdbbeae93ab3a86dc96ac8669", size = 3399481, upload-time = "2025-10-30T17:09:18.944Z" }, + { url = "https://files.pythonhosted.org/packages/8f/50/bfdfde6d757c3d986f23467131511350a9de71a766c0063930ff70bc46bf/pybbi-0.4.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:145869f0c8e3ebca352df3477a43e15407b8dc1bf409081ea89d6498ace634de", size = 2933866, upload-time = "2025-10-30T17:09:21.063Z" }, + { url = "https://files.pythonhosted.org/packages/3f/1e/9c439b06647a84f769d8cf1ae35a54bfe0ad81c40bc26aea7c5130a1674c/pybbi-0.4.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ad8546cd8871a6b112980b00bbcd31ad409e4a24e979a6cd3114cc645b433b91", size = 3223265, upload-time = "2025-10-30T17:09:23.059Z" }, + { url = "https://files.pythonhosted.org/packages/8b/c8/589d044a3a1e63c772da4446d1d6f452225e45d9fe360eb5aa3f8efdd140/pybbi-0.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a7aeeb5a3d81a6331d19901e1a2c6cdcee9edb273b3aaa1b3e954e92d165030", size = 3109825, upload-time = "2025-10-30T17:09:24.62Z" }, + { url = "https://files.pythonhosted.org/packages/86/ce/b296d5e51276d88a2e51070ed97f76eedba639767204987cc6a465ae564f/pybbi-0.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a91937f82e3f6f0ae011711fc175d2b2ec1decce0af62435ae0bd297c5d3633", size = 3428525, upload-time = "2025-10-30T17:09:26.293Z" }, + { url = "https://files.pythonhosted.org/packages/3f/74/af3bfdecdacb64e35bffea2513541d71f8ea7358f5004880e37a2414b5f2/pybbi-0.4.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5d1fad3d46787ea5905e5f1080b16dfb83d67c8d83225efe3169d4dcf27eb612", size = 2943529, upload-time = "2025-10-30T17:09:28.849Z" }, + { url = "https://files.pythonhosted.org/packages/02/66/38380935e7d3856fe5daf0694880f8330631d09bf73b2b30d8e576eeb12b/pybbi-0.4.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:37ed8946551a9a5efc9e426d7b86248e845c51b803d48016027a7c8a1a9fce33", size = 3232894, upload-time = "2025-10-30T17:09:31.132Z" }, + { url = "https://files.pythonhosted.org/packages/36/1f/abbe9be6b0f501a7ff182f1c6534b8f83b69c97630776d8e482a2d4d9bf7/pybbi-0.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36dc0add6f54d1763bf247b2cff87eaf2c81a26da18d49ac33de164d6baec2df", size = 3099983, upload-time = "2025-10-30T17:09:32.928Z" }, + { url = "https://files.pythonhosted.org/packages/b0/e7/35195b5292b6ccdcaaa655a2dad615b9217f52ac18b9dfdf8afec82521a6/pybbi-0.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d587a05d7eed4d1fb92e3f62ad52e8389c7a6b315bc430a6eb678760106740e", size = 3420434, upload-time = "2025-10-30T17:09:34.693Z" }, + { url = "https://files.pythonhosted.org/packages/53/72/ac475ad1726f132fc272882b43defddfe09a831815a741850d2c510d7f0e/pybbi-0.4.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:54b9e541bb455aeb30f88f09c70b7a4b72f6a0ae0e2e7f2bc9798f2e647b3d16", size = 2943356, upload-time = "2025-10-30T17:09:36.18Z" }, + { url = "https://files.pythonhosted.org/packages/1b/d3/4e27f36ae7f0fc933330239d7b76debf6ef67dbd7fef722f3ba1e7cfb9cc/pybbi-0.4.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:2c6472900a6688d604ccfd406138337e5d5b9db442a13715ce7abbf2cb98b841", size = 3232770, upload-time = "2025-10-30T17:09:37.803Z" }, + { url = "https://files.pythonhosted.org/packages/8e/25/2cbcd423c16ceb6eaedc39d797ff8b31119fcb5d71910b5336dbefb942d5/pybbi-0.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ee599e318238b177b2c31dbe7789034ea0bfa9716e0aa06d387d7cc9f78938", size = 3095788, upload-time = "2025-10-30T17:09:39.523Z" }, + { url = "https://files.pythonhosted.org/packages/15/b2/dbaaaa0901785caad9317c0eedf97eaa3d841e1c05589a9388ce67c9d53f/pybbi-0.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce9afaf149c89b01dfa2926febe53b73b174be0cc5febfd4cfee9f1e72083f50", size = 3414507, upload-time = "2025-10-30T17:09:41.605Z" }, + { url = "https://files.pythonhosted.org/packages/0e/14/183e404d9015b880fde46e452190b6efe6c95dd6bf1cef28265abeba92b8/pybbi-0.4.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:b685fc3c72a64672e87cc590146ee91bb97b1b5cb061a85fbe5ade679ef7b5fb", size = 2936711, upload-time = "2025-10-30T17:09:43.377Z" }, + { url = "https://files.pythonhosted.org/packages/f6/cc/27c610695e6e746021d3ae311f9572aa7cd57796806fd54328ae67a8d5ec/pybbi-0.4.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:093c3efb6a6454397be6ffbcd37f8719107300ea02f4ec3066583a0bf87f16b3", size = 3225304, upload-time = "2025-10-30T17:09:45.117Z" }, + { url = "https://files.pythonhosted.org/packages/a6/c2/d73f53bcf6605e8c0740fb486838d1750da2e2091dae4e7b8733aa004110/pybbi-0.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31ac2493b598a4568c5a16c0a575e41633afbe5da3084d0ef2cdcbed1b26031b", size = 3077589, upload-time = "2025-10-30T17:09:46.863Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ca/c3af682d723805a06868f5d3cb00fe5b4d7f7452db4a672b7526f87285f7/pybbi-0.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be8bf199a520b75c4708b90d3245d183138728895fad0a145eb36bc038d39b3b", size = 3396498, upload-time = "2025-10-30T17:09:48.549Z" }, +] + [[package]] name = "pybigtools" source = { editable = "." } @@ -466,12 +500,14 @@ dependencies = [ [package.dev-dependencies] dev = [ + { name = "pybbi" }, { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, { name = "ruff" }, { name = "smart-open", extra = ["http"] }, ] test = [ + { name = "pybbi" }, { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, { name = "smart-open", extra = ["http"] }, @@ -482,11 +518,13 @@ requires-dist = [{ name = "numpy" }] [package.metadata.requires-dev] dev = [ + { name = "pybbi", specifier = ">=0.4.2" }, { name = "pytest" }, { name = "ruff" }, { name = "smart-open", extras = ["http"] }, ] test = [ + { name = "pybbi", specifier = ">=0.4.2" }, { name = "pytest" }, { name = "smart-open", extras = ["http"] }, ]