From bf60a20727294ade9ae59bdb9fbba6758175516c Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Mon, 27 Oct 2025 10:23:27 -0400 Subject: [PATCH 01/17] refactor: Reorganize reader and utils --- pybigtools/src/reader.rs | 239 +++++++----- pybigtools/src/utils.rs | 816 ++++++++++++++++++--------------------- 2 files changed, 518 insertions(+), 537 deletions(-) diff --git a/pybigtools/src/reader.rs b/pybigtools/src/reader.rs index de1777f..da8cd04 100644 --- a/pybigtools/src/reader.rs +++ b/pybigtools/src/reader.rs @@ -13,110 +13,10 @@ use pyo3::types::{IntoPyDict, PyAny, PyDict, PyList, PyString, PyTuple}; use pyo3::{prelude::*, PyTraverseError, PyVisit}; use crate::errors::{BBIFileClosed, BBIReadError, ConvertResult}; -use crate::utils::{entries_to_array, intervals_to_array, start_end, BBIReadRaw, Summary}; +use crate::utils::{entries_to_array, intervals_to_array, BBIReadRaw, Summary}; type ValueTuple = (u32, u32, f32); -enum BigWigAverageOverBedStatistics { - Size, - Bases, - Sum, - Mean0, - Mean, - Min, - Max, -} - -impl BigWigAverageOverBedStatistics { - fn from_str(val: &str) -> Option { - Some(match val { - "size" => BigWigAverageOverBedStatistics::Size, - "bases" => BigWigAverageOverBedStatistics::Bases, - "sum" => BigWigAverageOverBedStatistics::Sum, - "mean0" => BigWigAverageOverBedStatistics::Mean0, - "mean" => BigWigAverageOverBedStatistics::Mean, - "min" => BigWigAverageOverBedStatistics::Min, - "max" => BigWigAverageOverBedStatistics::Max, - _ => { - return None; - } - }) - } -} - -/// This class is an interator for the entries of bigWigAverageOverBed -#[pyclass(module = "pybigtools")] -struct BigWigAverageOverBedEntriesIterator { - iter: Box< - dyn Iterator> - + Send, - >, - usename: bool, - stats: Option>, - summary_statistics: PyObject, -} - -#[pymethods] -impl BigWigAverageOverBedEntriesIterator { - fn __iter__(slf: PyRefMut) -> PyResult> { - Ok(slf.into()) - } - - fn __next__(mut slf: PyRefMut) -> PyResult> { - let v = slf - .iter - .next() - .transpose() - .map_err(|e| PyErr::new::(format!("{}", e)))?; - - let Some((name, v)) = v else { - return Ok(None); - }; - let stats = match &slf.stats { - Some(stats) => { - let mut ret = Vec::with_capacity(stats.len()); - for stat in stats.iter() { - match stat { - BigWigAverageOverBedStatistics::Size => { - ret.push(v.size.to_object(slf.py())) - } - BigWigAverageOverBedStatistics::Bases => { - ret.push(v.bases.to_object(slf.py())) - } - BigWigAverageOverBedStatistics::Sum => ret.push(v.sum.to_object(slf.py())), - BigWigAverageOverBedStatistics::Mean0 => { - ret.push(v.mean0.to_object(slf.py())) - } - BigWigAverageOverBedStatistics::Mean => { - ret.push(v.mean.to_object(slf.py())) - } - BigWigAverageOverBedStatistics::Min => ret.push(v.min.to_object(slf.py())), - BigWigAverageOverBedStatistics::Max => ret.push(v.max.to_object(slf.py())), - } - } - if ret.len() == 1 { - ret[0].to_object(slf.py()) - } else { - PyTuple::new_bound(slf.py(), ret).to_object(slf.py()) - } - } - None => { - let val = slf.summary_statistics.call_bound( - slf.py(), - (v.size, v.bases, v.sum, v.mean0, v.mean, v.min, v.max), - None, - )?; - val.to_object(slf.py()) - } - }; - - match slf.usename { - true => Ok(Some((name, stats).to_object(slf.py()))), - false => Ok(Some(stats)), - } - } -} - /// Interface for reading a BigWig or BigBed file. #[pyclass(module = "pybigtools")] pub struct BBIRead { @@ -368,7 +268,7 @@ impl BBIRead { start: Option, end: Option, ) -> PyResult { - let (start, end) = start_end(&self.bbi, &chrom, start, end)?; + let (start, end) = start_end_clipped(&self.bbi, &chrom, start, end)?; match &self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { @@ -472,7 +372,7 @@ impl BBIRead { start: Option, end: Option, ) -> PyResult { - let (start, end) = start_end(&self.bbi, &chrom, start, end)?; + let (start, end) = start_end_clipped(&self.bbi, &chrom, start, end)?; match &self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { @@ -1027,3 +927,136 @@ impl BigBedEntriesIterator { )) } } + +enum BigWigAverageOverBedStatistics { + Size, + Bases, + Sum, + Mean0, + Mean, + Min, + Max, +} + +impl BigWigAverageOverBedStatistics { + fn from_str(val: &str) -> Option { + Some(match val { + "size" => BigWigAverageOverBedStatistics::Size, + "bases" => BigWigAverageOverBedStatistics::Bases, + "sum" => BigWigAverageOverBedStatistics::Sum, + "mean0" => BigWigAverageOverBedStatistics::Mean0, + "mean" => BigWigAverageOverBedStatistics::Mean, + "min" => BigWigAverageOverBedStatistics::Min, + "max" => BigWigAverageOverBedStatistics::Max, + _ => { + return None; + } + }) + } +} + +/// This class is an interator for the entries of bigWigAverageOverBed +#[pyclass(module = "pybigtools")] +struct BigWigAverageOverBedEntriesIterator { + iter: Box< + dyn Iterator> + + Send, + >, + usename: bool, + stats: Option>, + summary_statistics: PyObject, +} + +#[pymethods] +impl BigWigAverageOverBedEntriesIterator { + fn __iter__(slf: PyRefMut) -> PyResult> { + Ok(slf.into()) + } + + fn __next__(mut slf: PyRefMut) -> PyResult> { + let v = slf + .iter + .next() + .transpose() + .map_err(|e| PyErr::new::(format!("{}", e)))?; + + let Some((name, v)) = v else { + return Ok(None); + }; + let stats = match &slf.stats { + Some(stats) => { + let mut ret = Vec::with_capacity(stats.len()); + for stat in stats.iter() { + match stat { + BigWigAverageOverBedStatistics::Size => { + ret.push(v.size.to_object(slf.py())) + } + BigWigAverageOverBedStatistics::Bases => { + ret.push(v.bases.to_object(slf.py())) + } + BigWigAverageOverBedStatistics::Sum => ret.push(v.sum.to_object(slf.py())), + BigWigAverageOverBedStatistics::Mean0 => { + ret.push(v.mean0.to_object(slf.py())) + } + BigWigAverageOverBedStatistics::Mean => { + ret.push(v.mean.to_object(slf.py())) + } + BigWigAverageOverBedStatistics::Min => ret.push(v.min.to_object(slf.py())), + BigWigAverageOverBedStatistics::Max => ret.push(v.max.to_object(slf.py())), + } + } + if ret.len() == 1 { + ret[0].to_object(slf.py()) + } else { + PyTuple::new_bound(slf.py(), ret).to_object(slf.py()) + } + } + None => { + let val = slf.summary_statistics.call_bound( + slf.py(), + (v.size, v.bases, v.sum, v.mean0, v.mean, v.min, v.max), + None, + )?; + val.to_object(slf.py()) + } + }; + + match slf.usename { + true => Ok(Some((name, stats).to_object(slf.py()))), + false => Ok(Some(stats)), + } + } +} + +fn start_end_clipped( + bbi: &BBIReadRaw, + chrom_name: &str, + start: Option, + end: Option, +) -> PyResult<(u32, u32)> { + let chroms = match &bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::BigWigFile(b) => b.chroms(), + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(b) => b.chroms(), + BBIReadRaw::BigWigFileLike(b) => b.chroms(), + BBIReadRaw::BigBedFile(b) => b.chroms(), + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => b.chroms(), + BBIReadRaw::BigBedFileLike(b) => b.chroms(), + }; + let chrom = chroms.into_iter().find(|x| x.name == chrom_name); + let length = match chrom { + None => { + return Err(PyErr::new::(format!( + "No chromomsome with name `{}` found.", + chrom_name + ))) + } + Some(c) => c.length, + }; + return Ok(( + start.map(|v| v.max(0) as u32).unwrap_or(0), + end.map(|v| (v.max(0) as u32).min(length)).unwrap_or(length), + )); +} diff --git a/pybigtools/src/utils.rs b/pybigtools/src/utils.rs index 3bb4ca9..baf1241 100644 --- a/pybigtools/src/utils.rs +++ b/pybigtools/src/utils.rs @@ -5,27 +5,27 @@ use std::ops::IndexMut; use bigtools::utils::file::remote_file::RemoteFile; use bigtools::utils::file::reopen::ReopenableFile; use bigtools::{ - BBIFileRead, BBIReadError as _BBIReadError, BedEntry, BigBedRead as BigBedReadRaw, - BigWigRead as BigWigReadRaw, CachedBBIFileRead, Value, ZoomRecord, + BBIFileRead, BBIReadError, BedEntry, BigBedRead, BigWigRead, CachedBBIFileRead, Value, + ZoomRecord, }; use numpy::ndarray::ArrayViewMut; use numpy::{PyArray1, PyArrayMethods}; use pyo3::exceptions; use pyo3::prelude::*; -use crate::errors::{BBIFileClosed, ConvertResult}; +use crate::errors::ConvertResult; use crate::file_like::PyFileLikeObject; pub enum BBIReadRaw { Closed, - BigWigFile(BigWigReadRaw>), + BigWigFile(BigWigRead>), #[cfg(feature = "remote")] - BigWigRemote(BigWigReadRaw>), - BigWigFileLike(BigWigReadRaw>), - BigBedFile(BigBedReadRaw>), + BigWigRemote(BigWigRead>), + BigWigFileLike(BigWigRead>), + BigBedFile(BigBedRead>), #[cfg(feature = "remote")] - BigBedRemote(BigBedReadRaw>), - BigBedFileLike(BigBedReadRaw>), + BigBedRemote(BigBedRead>), + BigBedFileLike(BigBedRead>), } #[derive(Copy, Clone, Debug)] @@ -37,7 +37,7 @@ pub enum Summary { pub fn intervals_to_array( py: Python<'_>, - b: &mut BigWigReadRaw, + bw: &mut BigWigRead, chrom: &str, start: Option, end: Option, @@ -48,47 +48,57 @@ pub fn intervals_to_array( oob: f64, arr: Option, ) -> PyResult { - let (start, end, length) = bigwig_start_end_length(b, chrom, start, end)?; - let zoom = if let (Some(bins), false) = (bins, exact) { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - let zoom = b - .info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level); - zoom - } else { - None - }; - let arr = match (bins, arr) { - (_, Some(arr)) => arr, - (Some(bins), None) => PyArray1::from_vec_bound(py, vec![missing; bins]).to_object(py), - (None, None) => { - PyArray1::from_vec_bound(py, vec![missing; (end - start) as usize]).to_object(py) + let (start, end, chrom_length) = bigwig_start_end_length(bw, chrom, start, end)?; + let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(chrom_length) as u32); + let arr = arr.unwrap_or_else(|| { + let size = bins.unwrap_or((end - start) as usize); + PyArray1::from_vec_bound(py, vec![missing; size]).to_object(py) + }); + + // Fill array + let array: &Bound<'_, PyArray1> = + arr.downcast_bound::>(py).map_err(|_| { + PyErr::new::( + "`arr` option must be a one-dimensional numpy array, if passed.", + ) + })?; + match bins { + // Base-level data + None => { + if array.len()? != (end - start) as usize { + return Err(PyErr::new::(format!( + "Passed `arr` does not have the expected length (expected `{}`, found `{}`).", + (end - start) as usize, + array.len()?, + ))); + } + let mut array = array.readwrite(); + let view = array.as_array_mut(); + let iter = bw + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + fill_values(start, end, iter, missing, view).convert_err()?; } - }; - let v: &Bound<'_, PyArray1> = arr.downcast_bound::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; - let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); - let bin_size = match bins { + // Binned data Some(bins) => { - if v.len()? != bins { + if array.len()? != bins { return Err(PyErr::new::(format!( - "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", + "Passed `arr` does not have the expected length (expected `{}`, found `{}`).", bins, - v.len()?, + array.len()?, ))); } - - let mut array = v.readwrite(); - + let zoom = if !exact { + bigwig_closest_zoom(bw, start, end, bins) + } else { + None + }; + let mut array = array.readwrite(); + let view = array.as_array_mut(); match zoom { + // Interpolate from zoom-level summaries Some(zoom) => { - let iter = b + let iter = bw .get_zoom_interval( &chrom, intervals_start, @@ -96,86 +106,34 @@ pub fn intervals_to_array( zoom.reduction_level, ) .convert_err()?; - to_array_zoom( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), - ) - .convert_err()?; + fill_binned_values_from_zoom(start, end, iter, summary, bins, missing, view) + .convert_err()?; } + // Aggregate from base-level data None => { - let iter = b + let iter = bw .get_interval(&chrom, intervals_start, intervals_end) .convert_err()?; - to_array_bins( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), + fill_binned_values_from_intervals( + start, end, iter, summary, bins, missing, view, ) .convert_err()?; } - }; - - (end - start) as f64 / bins as f64 - } - _ => { - if v.len()? != (end - start) as usize { - return Err(PyErr::new::(format!( - "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", - (end - start) as usize, - v.len()?, - ))); } - - let mut array = v.readwrite(); - - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_array(start, end, iter, missing, array.as_array_mut()).convert_err()?; - - 1.0 } }; - { - let mut array = v.readwrite(); - let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); - - if start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i in bin_start..bin_end { - array[i] = oob; - } - } - if end > length { - let interval_start = (length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, array.len()); - for i in bin_start..bin_end { - array[i] = oob; - } - } - } + // Handle out-of-bounds + let mut array = array.readwrite(); + let view = array.as_array_mut(); + fill_out_of_bounds(start, end, chrom_length, bins, oob, view); Ok(arr) } pub fn entries_to_array( py: Python<'_>, - b: &mut BigBedReadRaw, + bb: &mut BigBedRead, chrom: &str, start: Option, end: Option, @@ -186,47 +144,57 @@ pub fn entries_to_array( oob: f64, arr: Option, ) -> PyResult { - let (start, end, length) = bigbed_start_end_length(b, chrom, start, end)?; - let zoom = if let (Some(bins), false) = (bins, exact) { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - let zoom = b - .info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level); - zoom - } else { - None - }; - let arr = match (bins, arr) { - (_, Some(arr)) => arr, - (Some(bins), None) => PyArray1::from_vec_bound(py, vec![missing; bins]).to_object(py), - (None, None) => { - PyArray1::from_vec_bound(py, vec![missing; (end - start) as usize]).to_object(py) - } - }; - let v: &Bound<'_, PyArray1> = arr.downcast_bound::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; + let (start, end, length) = bigbed_start_end_length(bb, chrom, start, end)?; let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); - let bin_size = match bins { + let arr = arr.unwrap_or_else(|| { + let size = bins.unwrap_or((end - start) as usize); + PyArray1::from_vec_bound(py, vec![missing; size]).to_object(py) + }); + + // Fill array + let array: &Bound<'_, PyArray1> = + arr.downcast_bound::>(py).map_err(|_| { + PyErr::new::( + "Passed `arr` option must be a one-dimensional numpy array.", + ) + })?; + match bins { + // Base-level data + None => { + if array.len()? != (end - start) as usize { + return Err(PyErr::new::(format!( + "Passed `arr` does not have the expected length (expected `{}`, found `{}`).", + (end - start) as usize, + array.len()?, + ))); + } + let mut array = array.readwrite(); + let view = array.as_array_mut(); + let iter = bb + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + fill_coverage(start, end, iter, missing, view).convert_err()?; + } + // Binned data Some(bins) => { - if v.len()? != bins { + if array.len()? != bins { return Err(PyErr::new::(format!( - "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", + "Passed `arr` does not have the expected length (expected `{}`, found `{}`).", bins, - v.len()?, + array.len()?, ))); } - - let mut array = v.readwrite(); - + let zoom = if !exact { + bigbed_closest_zoom(bb, start, end, bins) + } else { + None + }; + let mut array = array.readwrite(); + let view = array.as_array_mut(); match zoom { + // Interpolate from zoom-level summaries Some(zoom) => { - let iter = b + let iter = bb .get_zoom_interval( &chrom, intervals_start, @@ -234,124 +202,100 @@ pub fn entries_to_array( zoom.reduction_level, ) .convert_err()?; - to_entry_array_zoom( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), - ) - .convert_err()?; + fill_binned_coverage_from_zoom(start, end, iter, summary, bins, missing, view) + .convert_err()?; } + // Aggregate from base-level data None => { - let iter = b + let iter = bb .get_interval(&chrom, intervals_start, intervals_end) .convert_err()?; - to_entry_array_bins( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), + fill_binned_coverage_from_entries( + start, end, iter, summary, bins, missing, view, ) .convert_err()?; } - }; - - (end - start) as f64 / bins as f64 - } - _ => { - if v.len()? != (end - start) as usize { - return Err(PyErr::new::(format!( - "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", - (end - start) as usize, - v.len()?, - ))); } - - let mut array = v.readwrite(); - - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_entry_array(start, end, iter, missing, array.as_array_mut()).convert_err()?; - - 1.0 } }; - { - let mut array = v.readwrite(); - let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); - - if start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i in bin_start..bin_end { - array[i] = oob; - } - } - if end > length { - let interval_start = (length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, array.len()); - for i in bin_start..bin_end { - array[i] = oob; - } - } - } + // Handle out-of-bounds + let mut array = array.readwrite(); + let view = array.as_array_mut(); + fill_out_of_bounds(start, end, length, bins, oob, view); Ok(arr) } -fn to_array>>( +fn fill_values>>( start: i32, end: i32, iter: I, missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), (end - start) as usize); - v.fill(f64::NAN); + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(view.len(), (end - start) as usize); + view.fill(f64::NAN); + for interval in iter { let interval = interval?; let interval_start = ((interval.start as i32) - start) as usize; let interval_end = ((interval.end as i32) - start) as usize; for i in interval_start..interval_end { - let val = *v.index_mut(i); - *v.index_mut(i) = if val.is_nan() { + let val = *view.index_mut(i); + *view.index_mut(i) = if val.is_nan() { interval.value as f64 } else { val + interval.value as f64 }; } } - for val in v.iter_mut() { + for val in view.iter_mut() { + *val = if val.is_nan() { missing } else { *val }; + } + Ok(()) +} + +fn fill_coverage>>( + start: i32, + end: i32, + iter: I, + missing: f64, + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(view.len(), (end - start) as usize); + view.fill(f64::NAN); + + for interval in iter { + let interval = interval?; + let interval_start = ((interval.start as i32) - start) as usize; + let interval_end = ((interval.end as i32) - start) as usize; + for i in interval_start..interval_end { + let val = *view.index_mut(i); + *view.index_mut(i) = if val.is_nan() { 1.0 } else { val + 1.0 }; + } + } + for val in view.iter_mut() { *val = if val.is_nan() { missing } else { *val }; } + Ok(()) } -fn to_array_bins>>( +fn fill_binned_values_from_intervals>>( start: i32, end: i32, iter: I, summary: Summary, bins: usize, missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), bins); - v.fill(missing); + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(view.len(), bins); + view.fill(missing); - let mut bin_data: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); + // (bin, bin_start, bin_end, Option<(covered_bases, value)>) + let mut deque: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -360,27 +304,27 @@ fn to_array_bins>>( let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - while let Some(front) = bin_data.front_mut() { + while let Some(front) = deque.front_mut() { if front.0 < bin_start { - let front = bin_data.pop_front().unwrap(); + let front = deque.pop_front().unwrap(); let bin = front.0; match summary { Summary::Min => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); + view[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Max => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); + view[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); + view[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); } } } else { break; } } - while let Some(bin) = bin_data + while let Some(bin) = deque .back() .map(|last_bin| { if last_bin.0 < bin_end { @@ -394,12 +338,12 @@ fn to_array_bins>>( let bin_start = ((bin as f64) * bin_size) as i32; let bin_end = (((bin + 1) as f64) * bin_size) as i32; - bin_data.push_back((bin, bin_start, bin_end, None)); + deque.push_back((bin, bin_start, bin_end, None)); } for bin in bin_start..bin_end { - assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); + assert!(deque.iter().find(|b| b.0 == bin).is_some()); } - for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + for (_bin, bin_start, bin_end, data) in deque.iter_mut() { // Since bed files can have overlapping entries, a previous entry could have extended // past the last bin for the current interval // Not relevant here, but keeping for parity @@ -432,38 +376,39 @@ fn to_array_bins>>( } } } - while let Some(front) = bin_data.pop_front() { + while let Some(front) = deque.pop_front() { let bin = front.0; match summary { Summary::Min => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); + view[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Max => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); + view[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); + view[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); } } } Ok(()) } -fn to_array_zoom>>( +fn fill_binned_coverage_from_entries>>( start: i32, end: i32, iter: I, summary: Summary, bins: usize, missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), bins); - v.fill(missing); + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(view.len(), bins); + view.fill(missing); - // (bin, bin_start, bin_end, Option<(covered_bases, value)>) - let mut bin_data: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); + // (, , , , ) + // covered_bases = 0 if uncovered, 1 if covered + let mut deque: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -472,27 +417,42 @@ fn to_array_zoom>>( let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - while let Some(front) = bin_data.front_mut() { + while let Some(front) = deque.front_mut() { if front.0 < bin_start { - let front = bin_data.pop_front().unwrap(); + let front = deque.pop_front().unwrap(); let bin = front.0; match summary { Summary::Min => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); + view[bin] = front + .4 + .into_iter() + .reduce(|min, v| min.min(v)) + .unwrap_or(missing); } Summary::Max => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); + view[bin] = front + .4 + .into_iter() + .reduce(|max, v| max.max(v)) + .unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); + view[bin] = front + .3 + .iter() + .any(|v| *v > 0) + .then(|| front.3.into_iter().sum::() as f64) + // Map nan to 0 so the sum is non-nan + .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) + .unwrap_or(missing); } } } else { break; } } - while let Some(bin) = bin_data + while let Some(bin) = deque .back() .map(|last_bin| { if last_bin.0 < bin_end { @@ -506,111 +466,84 @@ fn to_array_zoom>>( let bin_start = ((bin as f64) * bin_size) as i32; let bin_end = (((bin + 1) as f64) * bin_size) as i32; - bin_data.push_back((bin, bin_start, bin_end, None)); + deque.push_back(( + bin, + bin_start, + bin_end, + vec![0; (bin_end - bin_start) as usize], + vec![missing; (bin_end - bin_start) as usize], + )); } for bin in bin_start..bin_end { - assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); + assert!(deque.iter().find(|b| b.0 == bin).is_some()); } - for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + for (_bin, bin_start, bin_end, covered, data) in deque.iter_mut() { // Since bed files can have overlapping entries, a previous entry could have extended // past the last bin for the current interval - // Not relevant here, but keeping for parity if interval_end <= *bin_start { break; } let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); - let overlap_size: i32 = overlap_end - overlap_start; - match data.as_mut() { - Some((c, v)) => match summary { - Summary::Min => { - *v = v.min(interval.summary.min_val as f64); - *c += overlap_size; - } - Summary::Max => { - *v = v.max(interval.summary.max_val as f64); - *c += overlap_size; - } - Summary::Mean => { - let zoom_mean = - (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); - *v += (overlap_size as f64) * zoom_mean; - *c += overlap_size; - } - }, - None => match summary { - Summary::Min => { - *data = Some((overlap_size, interval.summary.min_val as f64)); - } - Summary::Max => { - *data = Some((overlap_size, interval.summary.max_val as f64)); - } - Summary::Mean => { - let zoom_mean = - (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); - *data = Some((overlap_size, (overlap_size as f64) * zoom_mean)); - } - }, + + let range = + ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); + for i in &mut data[range.clone()] { + // If NAN, then 0.0 + 1.0, else i + 1.0 + *i = (*i).max(0.0) + 1.0; + } + for i in &mut covered[range] { + *i = (*i).max(1); } } } - while let Some(front) = bin_data.pop_front() { + while let Some(front) = deque.pop_front() { let bin = front.0; match summary { Summary::Min => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); + view[bin] = front + .4 + .into_iter() + .reduce(|min, v| min.min(v)) + .unwrap_or(missing); } Summary::Max => { - v[bin] = front.3.map(|v| v.1).unwrap_or(missing); + view[bin] = front + .4 + .into_iter() + .reduce(|max, v| max.max(v)) + .unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); + view[bin] = front + .3 + .iter() + .any(|v| *v > 0) + .then(|| front.3.into_iter().sum::() as f64) + // Map nan to 0 so the sum is non-nan + .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) + .unwrap_or(missing); } } } Ok(()) } -fn to_entry_array>>( - start: i32, - end: i32, - iter: I, - missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), (end - start) as usize); - v.fill(f64::NAN); - for interval in iter { - let interval = interval?; - let interval_start = ((interval.start as i32) - start) as usize; - let interval_end = ((interval.end as i32) - start) as usize; - for i in interval_start..interval_end { - let val = *v.index_mut(i); - *v.index_mut(i) = if val.is_nan() { 1.0 } else { val + 1.0 }; - } - } - for val in v.iter_mut() { - *val = if val.is_nan() { missing } else { *val }; - } - Ok(()) -} - -fn to_entry_array_bins>>( +fn fill_binned_values_from_zoom>>( start: i32, end: i32, iter: I, summary: Summary, bins: usize, missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), bins); - v.fill(missing); + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(view.len(), bins); + view.fill(missing); - // (, , , , ) - // covered_bases = 0 if uncovered, 1 if covered - let mut bin_data: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); + // (bin, bin_start, bin_end, Option<(covered_bases, value)>) + let mut deque: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -619,42 +552,27 @@ fn to_entry_array_bins>>( let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - while let Some(front) = bin_data.front_mut() { + while let Some(front) = deque.front_mut() { if front.0 < bin_start { - let front = bin_data.pop_front().unwrap(); + let front = deque.pop_front().unwrap(); let bin = front.0; match summary { Summary::Min => { - v[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); + view[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Max => { - v[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); + view[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Mean => { - v[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); + view[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); } } } else { break; } } - while let Some(bin) = bin_data + while let Some(bin) = deque .back() .map(|last_bin| { if last_bin.0 < bin_end { @@ -668,85 +586,87 @@ fn to_entry_array_bins>>( let bin_start = ((bin as f64) * bin_size) as i32; let bin_end = (((bin + 1) as f64) * bin_size) as i32; - bin_data.push_back(( - bin, - bin_start, - bin_end, - vec![0; (bin_end - bin_start) as usize], - vec![missing; (bin_end - bin_start) as usize], - )); + deque.push_back((bin, bin_start, bin_end, None)); } for bin in bin_start..bin_end { - assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); + assert!(deque.iter().find(|b| b.0 == bin).is_some()); } - for (_bin, bin_start, bin_end, covered, data) in bin_data.iter_mut() { + for (_bin, bin_start, bin_end, data) in deque.iter_mut() { // Since bed files can have overlapping entries, a previous entry could have extended // past the last bin for the current interval + // Not relevant here, but keeping for parity if interval_end <= *bin_start { break; } let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); - - let range = - ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); - for i in &mut data[range.clone()] { - // If NAN, then 0.0 + 1.0, else i + 1.0 - *i = (*i).max(0.0) + 1.0; - } - for i in &mut covered[range] { - *i = (*i).max(1); + let overlap_size: i32 = overlap_end - overlap_start; + match data.as_mut() { + Some((c, v)) => match summary { + Summary::Min => { + *v = v.min(interval.summary.min_val as f64); + *c += overlap_size; + } + Summary::Max => { + *v = v.max(interval.summary.max_val as f64); + *c += overlap_size; + } + Summary::Mean => { + let zoom_mean = + (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); + *v += (overlap_size as f64) * zoom_mean; + *c += overlap_size; + } + }, + None => match summary { + Summary::Min => { + *data = Some((overlap_size, interval.summary.min_val as f64)); + } + Summary::Max => { + *data = Some((overlap_size, interval.summary.max_val as f64)); + } + Summary::Mean => { + let zoom_mean = + (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); + *data = Some((overlap_size, (overlap_size as f64) * zoom_mean)); + } + }, } } } - while let Some(front) = bin_data.pop_front() { + while let Some(front) = deque.pop_front() { let bin = front.0; match summary { Summary::Min => { - v[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); + view[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Max => { - v[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); + view[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Mean => { - v[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); + view[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); } } } Ok(()) } -fn to_entry_array_zoom>>( +fn fill_binned_coverage_from_zoom>>( start: i32, end: i32, iter: I, summary: Summary, bins: usize, missing: f64, - mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), _BBIReadError> { - assert_eq!(v.len(), bins); - v.fill(missing); + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(view.len(), bins); + view.fill(missing); // (, , , , ) // covered_bases = 0 if uncovered, 1 if covered - let mut bin_data: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); + let mut deque: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -755,28 +675,28 @@ fn to_entry_array_zoom>>( let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - while let Some(front) = bin_data.front_mut() { + while let Some(front) = deque.front_mut() { if front.0 < bin_start { - let front = bin_data.pop_front().unwrap(); + let front = deque.pop_front().unwrap(); let bin = front.0; match summary { Summary::Min => { - v[bin] = front + view[bin] = front .4 .into_iter() .reduce(|min, v| min.min(v)) .unwrap_or(missing); } Summary::Max => { - v[bin] = front + view[bin] = front .4 .into_iter() .reduce(|max, v| max.max(v)) .unwrap_or(missing); } Summary::Mean => { - v[bin] = front + view[bin] = front .3 .iter() .any(|v| *v > 0) @@ -790,7 +710,7 @@ fn to_entry_array_zoom>>( break; } } - while let Some(bin) = bin_data + while let Some(bin) = deque .back() .map(|last_bin| { if last_bin.0 < bin_end { @@ -804,7 +724,7 @@ fn to_entry_array_zoom>>( let bin_start = ((bin as f64) * bin_size) as i32; let bin_end = (((bin + 1) as f64) * bin_size) as i32; - bin_data.push_back(( + deque.push_back(( bin, bin_start, bin_end, @@ -813,9 +733,9 @@ fn to_entry_array_zoom>>( )); } for bin in bin_start..bin_end { - assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); + assert!(deque.iter().find(|b| b.0 == bin).is_some()); } - for (_bin, bin_start, bin_end, covered, data) in bin_data.iter_mut() { + for (_bin, bin_start, bin_end, covered, data) in deque.iter_mut() { // Since bed files can have overlapping entries, a previous entry could have extended // past the last bin for the current interval if interval_end <= *bin_start { @@ -841,26 +761,26 @@ fn to_entry_array_zoom>>( } } } - while let Some(front) = bin_data.pop_front() { + while let Some(front) = deque.pop_front() { let bin = front.0; match summary { Summary::Min => { - v[bin] = front + view[bin] = front .4 .into_iter() .reduce(|min, v| min.min(v)) .unwrap_or(missing); } Summary::Max => { - v[bin] = front + view[bin] = front .4 .into_iter() .reduce(|max, v| max.max(v)) .unwrap_or(missing); } Summary::Mean => { - v[bin] = front + view[bin] = front .3 .iter() .any(|v| *v > 0) @@ -874,56 +794,56 @@ fn to_entry_array_zoom>>( Ok(()) } -pub fn start_end( - bbi: &BBIReadRaw, - chrom_name: &str, - start: Option, - end: Option, -) -> PyResult<(u32, u32)> { - let chroms = match &bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), - BBIReadRaw::BigWigFile(b) => b.chroms(), - #[cfg(feature = "remote")] - BBIReadRaw::BigWigRemote(b) => b.chroms(), - BBIReadRaw::BigWigFileLike(b) => b.chroms(), - BBIReadRaw::BigBedFile(b) => b.chroms(), - #[cfg(feature = "remote")] - BBIReadRaw::BigBedRemote(b) => b.chroms(), - BBIReadRaw::BigBedFileLike(b) => b.chroms(), +fn fill_out_of_bounds( + start: i32, + end: i32, + chrom_length: i32, + bins: Option, + oob: f64, + mut view: ArrayViewMut<'_, f64, numpy::Ix1>, +) { + let bin_size = match bins { + Some(bins) => (end - start) as f64 / bins as f64, + _ => 1.0, }; - let chrom = chroms.into_iter().find(|x| x.name == chrom_name); - let length = match chrom { - None => { - return Err(PyErr::new::(format!( - "No chromomsome with name `{}` found.", - chrom_name - ))) + + if start < 0 { + let bin_start = 0; + let interval_end = 0 - start; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for i in bin_start..bin_end { + view[i] = oob; } - Some(c) => c.length, - }; - return Ok(( - start.map(|v| v.max(0) as u32).unwrap_or(0), - end.map(|v| (v.max(0) as u32).min(length)).unwrap_or(length), - )); + } + if end > chrom_length { + let interval_start = (chrom_length as i32) - start; + let interval_end = end - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + assert_eq!(bin_end, view.len()); + for i in bin_start..bin_end { + view[i] = oob; + } + } } fn bigwig_start_end_length( - bbi: &BigWigReadRaw, + bw: &BigWigRead, chrom_name: &str, start: Option, end: Option, ) -> PyResult<(i32, i32, i32)> { - let chroms = bbi.chroms(); + let chroms = bw.chroms(); start_end_length_inner(chrom_name, chroms, start, end) } fn bigbed_start_end_length( - bbi: &BigBedReadRaw, + bb: &BigBedRead, chrom_name: &str, start: Option, end: Option, ) -> PyResult<(i32, i32, i32)> { - let chroms = bbi.chroms(); + let chroms = bb.chroms(); start_end_length_inner(chrom_name, chroms, start, end) } @@ -946,16 +866,44 @@ fn start_end_length_inner( return Ok((start.unwrap_or(0), end.unwrap_or(length), length)); } +fn bigwig_closest_zoom<'a, R: BBIFileRead>( + bw: &'a BigWigRead, + start: i32, + end: i32, + bins: usize, +) -> Option<&'a bigtools::ZoomHeader> { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + bw.info() + .zoom_headers + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level) +} + +fn bigbed_closest_zoom<'a, R: BBIFileRead>( + bb: &'a BigBedRead, + start: i32, + end: i32, + bins: usize, +) -> Option<&'a bigtools::ZoomHeader> { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + bb.info() + .zoom_headers + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level) +} + #[cfg(test)] mod test { use bigtools::BedEntry; use numpy::ndarray::Array; - use crate::utils::to_entry_array_bins; + use crate::utils::fill_binned_coverage_from_entries; use crate::utils::Summary; #[test] - fn test_to_entry_array_bins() { + fn test_fill_binned_coverage_from_entries() { fn eq_with_nan_eq(a: f64, b: f64) -> bool { (a.is_nan() && b.is_nan()) || (a == b) } @@ -973,7 +921,7 @@ mod test { let entries = []; let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.into_iter().map(|v| Ok(v)), @@ -988,7 +936,7 @@ mod test { let entries = []; let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.into_iter().map(|v| Ok(v)), @@ -1007,7 +955,7 @@ mod test { rest: "".to_string(), }]; let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.into_iter().map(|v| Ok(v)), @@ -1026,7 +974,7 @@ mod test { rest: "".to_string(), }]; let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.into_iter().map(|v| Ok(v)), @@ -1052,7 +1000,7 @@ mod test { }, ]; let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1065,7 +1013,7 @@ mod test { let res = arr.to_vec(); assert_equal(res, [1.0, 2.0].into_iter().collect::>()); let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1078,7 +1026,7 @@ mod test { let res = arr.to_vec(); assert_equal(res, [1.0].into_iter().collect::>()); let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1109,7 +1057,7 @@ mod test { }, ]; let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1122,7 +1070,7 @@ mod test { let res = arr.to_vec(); assert_equal(res, [1.0, 3.0].into_iter().collect::>()); let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1135,7 +1083,7 @@ mod test { let res = arr.to_vec(); assert_equal(res, [1.0].into_iter().collect::>()); let mut arr = Array::from(vec![f64::NAN]); - to_entry_array_bins( + fill_binned_coverage_from_entries( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), From 44686787f15fcf3eb9bfe729eb2339b6c98b50a4 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 28 Oct 2025 13:48:14 -0400 Subject: [PATCH 02/17] refactor: Reimplementation of value and summary rasterization --- pybigtools/src/coverage.rs | 1034 ++++++++++++++++++++++++++ pybigtools/src/lib.rs | 29 +- pybigtools/src/reader.rs | 231 +++--- pybigtools/src/utils.rs | 1419 ++++++++++++++---------------------- pybigtools/src/writer.rs | 8 +- 5 files changed, 1720 insertions(+), 1001 deletions(-) create mode 100644 pybigtools/src/coverage.rs diff --git a/pybigtools/src/coverage.rs b/pybigtools/src/coverage.rs new file mode 100644 index 0000000..0b8472a --- /dev/null +++ b/pybigtools/src/coverage.rs @@ -0,0 +1,1034 @@ +use bigtools::{BBIReadError, BedEntry, Value}; +use std::cmp::Reverse; +use std::collections::BinaryHeap; + +/// Converts sorted [`BedEntry`] intervals into non-overlapping coverage depth [`Value`] intervals. +/// +/// This iterator consumes a stream of sorted, potentially overlapping BED entries and produces a +/// stream of non-overlapping intervals whose values represent the coverage depth (number of +/// overlapping entries) over its genomic positions. +/// +/// The input BED entries **must** be sorted by start position, then end position. +/// +/// # Notes +/// +/// The iterator uses roughly constant memory by maintaining a small heap that only grows if many +/// entries share the same start or end position. +/// +/// # Example +/// +/// ```rust +/// use bigtools::{BedEntry, Value}; +/// use pybigtools::coverage::CoverageIterator; +/// +/// let entries = vec![ +/// Ok(BedEntry { start: 10, end: 30, rest: "".to_string() }), +/// Ok(BedEntry { start: 20, end: 40, rest: "".to_string() }), +/// ]; +/// +/// let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); +/// let intervals: Result, _> = coverage_iter.collect(); +/// +/// // Result: [10-20): coverage=1, [20-30): coverage=2, [30-40): coverage=1 +/// ``` +pub struct CoverageIterator { + iter: I, + heap: BinaryHeap>, + coverage: i32, + prev_pos: Option, + finished: bool, +} + +impl CoverageIterator +where + I: Iterator>, +{ + pub fn new(mut iter: I) -> Result { + let mut heap = BinaryHeap::new(); + + // Initialize the heap with two entries (four positions) + match iter.next() { + Some(Ok(entry)) => { + heap.push(Reverse((entry.start, 1))); + heap.push(Reverse((entry.end, -1))); + } + Some(Err(e)) => { + return Err(e); + } + None => {} + } + + Ok(Self { + iter, + heap, + coverage: 0, + prev_pos: None, + finished: false, + }) + } +} + +impl Iterator for CoverageIterator +where + I: Iterator>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + if self.finished { + return None; + } + + while !self.heap.is_empty() { + let Reverse((pos, inc)) = self.heap.pop().unwrap(); + + // Batch all events at the same position + // First, collect all same-position events already in the heap + let mut delta = inc; + while let Some(&Reverse((next_pos, inc))) = self.heap.peek() { + if next_pos == pos { + delta += inc; + self.heap.pop(); + } else { + break; + } + } + + // If we have more entries with the current position, load them. + while let Some(result) = self.iter.next() { + match result { + Ok(entry) => { + self.heap.push(Reverse((entry.start, 1))); + self.heap.push(Reverse((entry.end, -1))); + + // If this entry contributes to the current position, process those events + let has_current_pos = entry.start == pos || entry.end == pos; + if has_current_pos { + while let Some(&Reverse((next_pos, inc))) = self.heap.peek() { + if next_pos == pos { + delta += inc; + self.heap.pop(); + } else { + break; + } + } + } else { + // No more events at current position, stop loading + break; + } + } + Err(e) => { + self.finished = true; + return Some(Err(e)); + } + } + } + + // Check if we need to emit an interval + let interval_to_emit = if let Some(start) = self.prev_pos { + if self.coverage > 0 && start < pos { + Some(Value { + start, + end: pos, + value: self.coverage as f32, + }) + } else { + None + } + } else { + None + }; + + // Update coverage and current position with all batched events + self.coverage += delta; + self.prev_pos = Some(pos); + + // Return the interval if we have one + if let Some(interval) = interval_to_emit { + return Some(Ok(interval)); + } + } + + self.finished = true; + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_streaming_simple() { + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 30, + end: 40, + rest: "entry2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 2); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 30, + end: 40, + value: 1.0 + } + ); + } + + #[test] + fn test_streaming_overlapping() { + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 30, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 20, + end: 40, + rest: "entry2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 20, + end: 30, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 30, + end: 40, + value: 1.0 + } + ); + } + + #[test] + fn test_streaming_complex() { + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 25, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 35, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 30, + end: 45, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 5); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 25, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 25, + end: 30, + value: 1.0 + } + ); + assert_eq!( + intervals[3], + Value { + start: 30, + end: 35, + value: 2.0 + } + ); + assert_eq!( + intervals[4], + Value { + start: 35, + end: 45, + value: 1.0 + } + ); + } + + #[test] + fn test_same_start_positions() { + // Test multiple entries with the same start position (sorted by start, then by end) + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 10, + end: 25, + rest: "entry3".to_string(), + }), + Ok(BedEntry { + start: 10, + end: 30, + rest: "entry2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-20): coverage=3, [20-25): coverage=2, [25-30): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 3.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 20, + end: 25, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 25, + end: 30, + value: 1.0 + } + ); + } + + #[test] + fn test_same_end_positions() { + // Test multiple entries with the same end position (already sorted by start) + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 30, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 30, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 20, + end: 30, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-15): coverage=1, [15-20): coverage=2, [20-30): coverage=3 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 20, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 20, + end: 30, + value: 3.0 + } + ); + } + + #[test] + fn test_identical_intervals() { + // Test multiple identical intervals + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-20): coverage=3 + assert_eq!(intervals.len(), 1); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 3.0 + } + ); + } + + #[test] + fn test_adjacent_intervals() { + // Test intervals that are adjacent (end of one = start of next) + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 20, + end: 30, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 30, + end: 40, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-20): coverage=1, [20-30): coverage=1, [30-40): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 20, + end: 30, + value: 1.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 30, + end: 40, + value: 1.0 + } + ); + } + + #[test] + fn test_empty_input() { + let entries: Vec> = vec![]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 0); + } + + #[test] + fn test_single_entry() { + let entries = vec![Ok(BedEntry { + start: 5, + end: 15, + rest: "single".to_string(), + })]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 1); + assert_eq!( + intervals[0], + Value { + start: 5, + end: 15, + value: 1.0 + } + ); + } + + #[test] + fn test_nested_intervals() { + // One interval completely inside another + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 30, + rest: "outer".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 25, + rest: "inner".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-15): coverage=1, [15-25): coverage=2, [25-30): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 25, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 25, + end: 30, + value: 1.0 + } + ); + } + + #[test] + fn test_mixed_same_positions() { + // Some entries start where others end at the same position + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "entry1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 20, + rest: "entry2".to_string(), + }), + Ok(BedEntry { + start: 20, + end: 30, + rest: "entry3".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Expected: [10-15): coverage=1, [15-20): coverage=2, [20-30): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 20, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 20, + end: 30, + value: 1.0 + } + ); + } + + #[test] + fn test_many_overlapping() { + // Five intervals all overlapping each other + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "1".to_string(), + }), + Ok(BedEntry { + start: 12, + end: 22, + rest: "2".to_string(), + }), + Ok(BedEntry { + start: 14, + end: 24, + rest: "3".to_string(), + }), + Ok(BedEntry { + start: 16, + end: 26, + rest: "4".to_string(), + }), + Ok(BedEntry { + start: 18, + end: 28, + rest: "5".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Should produce multiple intervals with increasing then decreasing coverage + assert!(intervals.len() > 5); // More intervals than input entries + assert!(intervals.iter().any(|v| v.value >= 4.0)); // Should reach high coverage + + // Verify no gaps and proper ordering + for i in 1..intervals.len() { + assert_eq!( + intervals[i - 1].end, + intervals[i].start, + "Gap detected between intervals" + ); + } + } + + #[test] + fn test_single_base_intervals() { + // Test intervals that are exactly 1 base pair long + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 11, + rest: "1bp".to_string(), + }), + Ok(BedEntry { + start: 12, + end: 13, + rest: "1bp".to_string(), + }), + Ok(BedEntry { + start: 14, + end: 15, + rest: "1bp".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 11, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 12, + end: 13, + value: 1.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 14, + end: 15, + value: 1.0 + } + ); + } + + #[test] + fn test_large_gaps() { + // Test intervals with large gaps between them + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "first".to_string(), + }), + Ok(BedEntry { + start: 1000, + end: 1010, + rest: "second".to_string(), + }), + Ok(BedEntry { + start: 100000, + end: 100010, + rest: "third".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 20, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 1000, + end: 1010, + value: 1.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 100000, + end: 100010, + value: 1.0 + } + ); + } + + #[test] + fn test_start_at_zero() { + // Test intervals starting at position 0 + let entries = vec![ + Ok(BedEntry { + start: 0, + end: 10, + rest: "zero_start".to_string(), + }), + Ok(BedEntry { + start: 5, + end: 15, + rest: "overlapping".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 0, + end: 5, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 5, + end: 10, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + } + + #[test] + fn test_boundary_positions() { + // Test intervals with position 0 and various boundary conditions + let entries = vec![ + Ok(BedEntry { + start: 0, + end: 1, + rest: "zero".to_string(), + }), + Ok(BedEntry { + start: 1, + end: 2, + rest: "one".to_string(), + }), + Ok(BedEntry { + start: 4294967290, + end: 4294967295, + rest: "high".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 0, + end: 1, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 1, + end: 2, + value: 1.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 4294967290, + end: 4294967295, + value: 1.0 + } + ); + } + + #[test] + fn test_very_long_interval() { + // Test an interval spanning a very large genomic region + let entries = vec![ + Ok(BedEntry { + start: 1000, + end: 1000000, + rest: "long".to_string(), + }), + Ok(BedEntry { + start: 500000, + end: 500010, + rest: "short_inside".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Should handle large intervals correctly + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 1000, + end: 500000, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 500000, + end: 500010, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 500010, + end: 1000000, + value: 1.0 + } + ); + } + + #[test] + fn test_zero_length_intervals() { + // Test intervals with zero length (start == end) + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 10, + rest: "zero_length1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 20, + rest: "normal".to_string(), + }), + Ok(BedEntry { + start: 25, + end: 25, + rest: "zero_length2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Zero-length intervals should be ignored (produce no coverage) + // Only the normal interval [15, 20) should produce coverage + assert_eq!(intervals.len(), 1); + assert_eq!( + intervals[0], + Value { + start: 15, + end: 20, + value: 1.0 + } + ); + } + + #[test] + fn test_zero_length_with_overlaps() { + // Test zero-length intervals mixed with overlapping normal intervals + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "normal1".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 15, + rest: "zero_length".to_string(), + }), + Ok(BedEntry { + start: 15, + end: 25, + rest: "normal2".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Zero-length interval should not affect coverage calculation + // Should get: [10-15): coverage=1, [15-20): coverage=2, [20-25): coverage=1 + assert_eq!(intervals.len(), 3); + assert_eq!( + intervals[0], + Value { + start: 10, + end: 15, + value: 1.0 + } + ); + assert_eq!( + intervals[1], + Value { + start: 15, + end: 20, + value: 2.0 + } + ); + assert_eq!( + intervals[2], + Value { + start: 20, + end: 25, + value: 1.0 + } + ); + } +} diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index a9bf300..a907d8a 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -11,6 +11,7 @@ use pyo3::types::{PyAny, PyString}; use pyo3::wrap_pyfunction; use url::Url; +mod coverage; mod errors; mod file_like; mod reader; @@ -19,9 +20,8 @@ mod writer; use errors::{BBIFileClosed, BBIReadError}; use file_like::PyFileLikeObject; -use reader::{BBIRead, BigBedEntriesIterator, BigWigIntervalIterator}; -use utils::BBIReadRaw; -use writer::{BigBedWrite, BigWigWrite}; +use reader::{BBIReadRaw, BBIReader, BigBedEntriesIterator, BigWigIntervalIterator}; +use writer::{BigBedWriter, BigWigWriter}; impl Reopen for PyFileLikeObject { fn reopen(&self) -> io::Result { @@ -95,11 +95,11 @@ fn open( ))), }; let read = match GenericBBIRead::open(file_like) { - Ok(GenericBBIRead::BigWig(bigwig)) => BBIRead { + Ok(GenericBBIRead::BigWig(bigwig)) => BBIReader { bbi: BBIReadRaw::BigWigFileLike(bigwig.cached()), } .into_py(py), - Ok(GenericBBIRead::BigBed(bigbed)) => BBIRead { + Ok(GenericBBIRead::BigBed(bigbed)) => BBIReader { bbi: BBIReadRaw::BigBedFileLike(bigbed.cached()), } .into_py(py), @@ -136,11 +136,11 @@ fn open_path_or_url( Err(_) => {} } match extension.as_ref() { - "bw" | "bigWig" | "bigwig" => BigWigWrite { + "bw" | "bigWig" | "bigwig" => BigWigWriter { bigwig: Some(path_url_or_file_like), } .into_py(py), - "bb" | "bigBed" | "bigbed" => BigBedWrite { + "bb" | "bigBed" | "bigbed" => BigBedWriter { bigbed: Some(path_url_or_file_like), } .into_py(py), @@ -164,7 +164,7 @@ fn open_path_or_url( "bw" | "bigWig" | "bigwig" => { if isfile { match BigWigReadRaw::open_file(&path_url_or_file_like) { - Ok(bwr) => BBIRead { + Ok(bwr) => BBIReader { bbi: BBIReadRaw::BigWigFile(bwr.cached()), } .into_py(py), @@ -177,7 +177,7 @@ fn open_path_or_url( } else { #[cfg(feature = "remote")] match BigWigReadRaw::open(RemoteFile::new(&path_url_or_file_like)) { - Ok(bwr) => BBIRead { + Ok(bwr) => BBIReader { bbi: BBIReadRaw::BigWigRemote(bwr.cached()), } .into_py(py), @@ -199,7 +199,7 @@ fn open_path_or_url( "bb" | "bigBed" | "bigbed" => { if isfile { match BigBedReadRaw::open_file(&path_url_or_file_like) { - Ok(bwr) => BBIRead { + Ok(bwr) => BBIReader { bbi: BBIReadRaw::BigBedFile(bwr.cached()), } .into_py(py), @@ -212,7 +212,7 @@ fn open_path_or_url( } else { #[cfg(feature = "remote")] match BigBedReadRaw::open(RemoteFile::new(&path_url_or_file_like)) { - Ok(bwr) => BBIRead { + Ok(bwr) => BBIReader { bbi: BBIReadRaw::BigBedRemote(bwr.cached()), } .into_py(py), @@ -246,9 +246,9 @@ fn pybigtools(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(open))?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -257,7 +257,6 @@ fn pybigtools(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { let collections = PyModule::import_bound(py, "collections")?; let namedtuple = collections.getattr("namedtuple")?; - let summary_statistics = namedtuple.call( ( "SummaryStatistics".to_object(py), diff --git a/pybigtools/src/reader.rs b/pybigtools/src/reader.rs index da8cd04..605ec0e 100644 --- a/pybigtools/src/reader.rs +++ b/pybigtools/src/reader.rs @@ -3,28 +3,47 @@ use std::io::BufReader; use std::ops::Deref; use bigtools::bed::autosql::parse::parse_autosql; +#[cfg(feature = "remote")] +use bigtools::utils::file::remote_file::RemoteFile; +use bigtools::utils::file::reopen::ReopenableFile; use bigtools::utils::misc::{ bigwig_average_over_bed, BigWigAverageOverBedEntry, BigWigAverageOverBedError, Name, }; use bigtools::utils::reopen::Reopen; -use bigtools::{BBIReadError as _BBIReadError, BedEntry, Value, ZoomRecord}; +use bigtools::{ + BBIReadError as _BBIReadError, BedEntry, BigBedRead, BigWigRead, CachedBBIFileRead, Value, + ZoomRecord, +}; use pyo3::exceptions::{self, PyKeyError}; use pyo3::types::{IntoPyDict, PyAny, PyDict, PyList, PyString, PyTuple}; use pyo3::{prelude::*, PyTraverseError, PyVisit}; use crate::errors::{BBIFileClosed, BBIReadError, ConvertResult}; -use crate::utils::{entries_to_array, intervals_to_array, BBIReadRaw, Summary}; +use crate::file_like::PyFileLikeObject; +use crate::utils::{entries_to_array, intervals_to_array, Summary}; type ValueTuple = (u32, u32, f32); +pub enum BBIReadRaw { + Closed, + BigWigFile(BigWigRead>), + #[cfg(feature = "remote")] + BigWigRemote(BigWigRead>), + BigWigFileLike(BigWigRead>), + BigBedFile(BigBedRead>), + #[cfg(feature = "remote")] + BigBedRemote(BigBedRead>), + BigBedFileLike(BigBedRead>), +} + /// Interface for reading a BigWig or BigBed file. #[pyclass(module = "pybigtools")] -pub struct BBIRead { +pub struct BBIReader { pub bbi: BBIReadRaw, } #[pymethods] -impl BBIRead { +impl BBIReader { #[getter] fn is_bigwig(&self) -> bool { #[cfg(feature = "remote")] @@ -128,6 +147,65 @@ impl BBIRead { Ok(info) } + /// Return the names of chromosomes in a BBI file and their lengths. + /// + /// Parameters + /// ---------- + /// chrom : str or None + /// The name of the chromosome to get the length of. If None, then a + /// dictionary of all chromosome sizes will be returned. If the + /// chromosome doesn't exist, returns None. + /// + /// Returns + /// ------- + /// int or Dict[str, int] or None: + /// Chromosome length or a dictionary of chromosome lengths. + #[pyo3(signature = (chrom=None))] + fn chroms(&mut self, py: Python, chrom: Option) -> PyResult { + fn get_chrom_obj( + b: &B, + py: Python, + chrom: Option, + ) -> PyResult { + match chrom { + Some(chrom) => { + let chrom_length = b + .chroms() + .into_iter() + .find(|c| c.name == chrom) + .ok_or_else(|| { + PyErr::new::( + "No chromosome found with the specified name", + ) + }) + .map(|c| c.length.to_object(py))?; + Ok(chrom_length) + } + None => { + let chrom_dict: PyObject = b + .chroms() + .into_iter() + .map(|c| (c.name.clone(), c.length)) + .into_py_dict_bound(py) + .into(); + Ok(chrom_dict) + } + } + } + + match &self.bbi { + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::BigWigFile(b) => get_chrom_obj(b, py, chrom), + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigWigFileLike(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigBedFile(b) => get_chrom_obj(b, py, chrom), + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigBedFileLike(b) => get_chrom_obj(b, py, chrom), + } + } + /// Return a list of sizes in bases of the summary intervals used in each /// of the zoom levels (i.e. reduction levels) of the BBI file. fn zooms(&self) -> PyResult> { @@ -268,7 +346,7 @@ impl BBIRead { start: Option, end: Option, ) -> PyResult { - let (start, end) = start_end_clipped(&self.bbi, &chrom, start, end)?; + let (start, end) = start_end_clamped(&self.bbi, &chrom, start, end)?; match &self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { @@ -372,7 +450,7 @@ impl BBIRead { start: Option, end: Option, ) -> PyResult { - let (start, end) = start_end_clipped(&self.bbi, &chrom, start, end)?; + let (start, end) = start_end_clamped(&self.bbi, &chrom, start, end)?; match &self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { @@ -513,11 +591,21 @@ impl BBIRead { ) -> PyResult { let summary = match summary.as_ref() { "mean" => Summary::Mean, + "std" => Summary::Std, "min" => Summary::Min, "max" => Summary::Max, + "mean0" => Summary::Mean0, + "std0" => Summary::Std0, + "min0" => Summary::Min0, + "max0" => Summary::Max0, + "sum" => Summary::Sum, + "sum_squares" => Summary::SumSquares, + "bases_covered" => Summary::BasesCovered, + "bin_covered" => Summary::BinCovered, _ => { return Err(PyErr::new::(format!( - "Unrecognized summary. Only `mean`, `min`, and `max` are allowed." + "Unrecognized summary statistic: {}", + summary ))); } }; @@ -546,65 +634,6 @@ impl BBIRead { } } - /// Return the names of chromosomes in a BBI file and their lengths. - /// - /// Parameters - /// ---------- - /// chrom : str or None - /// The name of the chromosome to get the length of. If None, then a - /// dictionary of all chromosome sizes will be returned. If the - /// chromosome doesn't exist, returns None. - /// - /// Returns - /// ------- - /// int or Dict[str, int] or None: - /// Chromosome length or a dictionary of chromosome lengths. - #[pyo3(signature = (chrom=None))] - fn chroms(&mut self, py: Python, chrom: Option) -> PyResult { - fn get_chrom_obj( - b: &B, - py: Python, - chrom: Option, - ) -> PyResult { - match chrom { - Some(chrom) => { - let chrom_length = b - .chroms() - .into_iter() - .find(|c| c.name == chrom) - .ok_or_else(|| { - PyErr::new::( - "No chromosome found with the specified name", - ) - }) - .map(|c| c.length.to_object(py))?; - Ok(chrom_length) - } - None => { - let chrom_dict: PyObject = b - .chroms() - .into_iter() - .map(|c| (c.name.clone(), c.length)) - .into_py_dict_bound(py) - .into(); - Ok(chrom_dict) - } - } - } - - match &self.bbi { - BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), - BBIReadRaw::BigWigFile(b) => get_chrom_obj(b, py, chrom), - #[cfg(feature = "remote")] - BBIReadRaw::BigWigRemote(b) => get_chrom_obj(b, py, chrom), - BBIReadRaw::BigWigFileLike(b) => get_chrom_obj(b, py, chrom), - BBIReadRaw::BigBedFile(b) => get_chrom_obj(b, py, chrom), - #[cfg(feature = "remote")] - BBIReadRaw::BigBedRemote(b) => get_chrom_obj(b, py, chrom), - BBIReadRaw::BigBedFileLike(b) => get_chrom_obj(b, py, chrom), - } - } - /// Gets the average values from a bigWig over the entries of a bed file. /// /// Parameters @@ -842,40 +871,6 @@ impl BBIRead { } } -#[pyclass(module = "pybigtools")] -struct ZoomIntervalIterator { - iter: Box> + Send>, -} - -#[pymethods] -impl ZoomIntervalIterator { - fn __iter__(slf: PyRefMut) -> PyResult> { - Ok(slf.into()) - } - - fn __next__(mut slf: PyRefMut) -> PyResult> { - slf.iter - .next() - .transpose() - .map(|o| { - o.map(|v| { - let summary = [ - ("total_items", v.summary.total_items.to_object(slf.py())), - ("bases_covered", v.summary.bases_covered.to_object(slf.py())), - ("min_val", v.summary.min_val.to_object(slf.py())), - ("max_val", v.summary.max_val.to_object(slf.py())), - ("sum", v.summary.sum.to_object(slf.py())), - ("sum_squares", v.summary.sum_squares.to_object(slf.py())), - ] - .into_py_dict_bound(slf.py()) - .to_object(slf.py()); - (v.start, v.end, summary) - }) - }) - .convert_err() - } -} - /// An iterator for intervals in a bigWig. /// /// It returns only values that exist in the bigWig, skipping any missing @@ -928,6 +923,40 @@ impl BigBedEntriesIterator { } } +#[pyclass(module = "pybigtools")] +struct ZoomIntervalIterator { + iter: Box> + Send>, +} + +#[pymethods] +impl ZoomIntervalIterator { + fn __iter__(slf: PyRefMut) -> PyResult> { + Ok(slf.into()) + } + + fn __next__(mut slf: PyRefMut) -> PyResult> { + slf.iter + .next() + .transpose() + .map(|o| { + o.map(|v| { + let summary = [ + ("total_items", v.summary.total_items.to_object(slf.py())), + ("bases_covered", v.summary.bases_covered.to_object(slf.py())), + ("min_val", v.summary.min_val.to_object(slf.py())), + ("max_val", v.summary.max_val.to_object(slf.py())), + ("sum", v.summary.sum.to_object(slf.py())), + ("sum_squares", v.summary.sum_squares.to_object(slf.py())), + ] + .into_py_dict_bound(slf.py()) + .to_object(slf.py()); + (v.start, v.end, summary) + }) + }) + .convert_err() + } +} + enum BigWigAverageOverBedStatistics { Size, Bases, @@ -1028,7 +1057,7 @@ impl BigWigAverageOverBedEntriesIterator { } } -fn start_end_clipped( +fn start_end_clamped( bbi: &BBIReadRaw, chrom_name: &str, start: Option, diff --git a/pybigtools/src/utils.rs b/pybigtools/src/utils.rs index baf1241..f3d2635 100644 --- a/pybigtools/src/utils.rs +++ b/pybigtools/src/utils.rs @@ -1,38 +1,97 @@ -use std::collections::VecDeque; -use std::ops::IndexMut; - -#[cfg(feature = "remote")] -use bigtools::utils::file::remote_file::RemoteFile; -use bigtools::utils::file::reopen::ReopenableFile; -use bigtools::{ - BBIFileRead, BBIReadError, BedEntry, BigBedRead, BigWigRead, CachedBBIFileRead, Value, - ZoomRecord, -}; +use bigtools::{BBIFileRead, BBIReadError, BedEntry, BigBedRead, BigWigRead, Value, ZoomRecord}; use numpy::ndarray::ArrayViewMut; use numpy::{PyArray1, PyArrayMethods}; use pyo3::exceptions; use pyo3::prelude::*; +use std::ops::IndexMut; +use crate::coverage::CoverageIterator; use crate::errors::ConvertResult; -use crate::file_like::PyFileLikeObject; - -pub enum BBIReadRaw { - Closed, - BigWigFile(BigWigRead>), - #[cfg(feature = "remote")] - BigWigRemote(BigWigRead>), - BigWigFileLike(BigWigRead>), - BigBedFile(BigBedRead>), - #[cfg(feature = "remote")] - BigBedRemote(BigBedRead>), - BigBedFileLike(BigBedRead>), -} #[derive(Copy, Clone, Debug)] pub enum Summary { Mean, + Std, Min, Max, + Mean0, + Std0, + Min0, + Max0, + Sum, + SumSquares, + BasesCovered, + BinCovered, +} + +#[derive(Debug)] +enum BBIRecord { + Value(Value), + BedEntry(BedEntry), + ZoomRecord(ZoomRecord), +} + +#[derive(Copy, Clone, Debug)] +struct SummaryStats { + sum: f64, + sum_squares: f64, + min: f64, + max: f64, + bases_covered: f64, +} + +enum BBIRead<'a, R: BBIFileRead> { + BigWig(&'a mut BigWigRead), + BigBed(&'a mut BigBedRead), +} + +impl<'a, R: BBIFileRead> BBIRead<'a, R> { + pub fn chroms(&self) -> &[bigtools::ChromInfo] { + match self { + BBIRead::BigWig(bw) => bw.chroms(), + BBIRead::BigBed(bb) => bb.chroms(), + } + } + + /// Resolve a genomic range with potentially unspecified start and end positions. + /// Returns a tuple of (start, end, chrom_length). + pub fn resolve_range( + &self, + chrom_name: &str, + start: Option, + end: Option, + ) -> PyResult<(i32, i32, i32)> { + let chroms = self.chroms(); + let chrom = chroms.into_iter().find(|x| x.name == chrom_name); + let length = match chrom { + None => { + return Err(PyErr::new::(format!( + "No chromomsome with name `{}` found.", + chrom_name + ))) + } + Some(c) => c.length as i32, + }; + return Ok((start.unwrap_or(0), end.unwrap_or(length), length)); + } + + pub fn zoom_headers(&self) -> &[bigtools::ZoomHeader] { + match self { + BBIRead::BigWig(bw) => bw.info().zoom_headers.as_slice(), + BBIRead::BigBed(bb) => bb.info().zoom_headers.as_slice(), + } + } + + /// Find the closest zoom resolution that does not exceed the desired bin size. + /// If no such zoom level exists, returns None. + pub fn best_zoom(&self, start: i32, end: i32, bins: usize) -> Option { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + self.zoom_headers() + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level) + .map(|z| z.reduction_level) + } } pub fn intervals_to_array( @@ -48,87 +107,10 @@ pub fn intervals_to_array( oob: f64, arr: Option, ) -> PyResult { - let (start, end, chrom_length) = bigwig_start_end_length(bw, chrom, start, end)?; - let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(chrom_length) as u32); - let arr = arr.unwrap_or_else(|| { - let size = bins.unwrap_or((end - start) as usize); - PyArray1::from_vec_bound(py, vec![missing; size]).to_object(py) - }); - - // Fill array - let array: &Bound<'_, PyArray1> = - arr.downcast_bound::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; - match bins { - // Base-level data - None => { - if array.len()? != (end - start) as usize { - return Err(PyErr::new::(format!( - "Passed `arr` does not have the expected length (expected `{}`, found `{}`).", - (end - start) as usize, - array.len()?, - ))); - } - let mut array = array.readwrite(); - let view = array.as_array_mut(); - let iter = bw - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - fill_values(start, end, iter, missing, view).convert_err()?; - } - // Binned data - Some(bins) => { - if array.len()? != bins { - return Err(PyErr::new::(format!( - "Passed `arr` does not have the expected length (expected `{}`, found `{}`).", - bins, - array.len()?, - ))); - } - let zoom = if !exact { - bigwig_closest_zoom(bw, start, end, bins) - } else { - None - }; - let mut array = array.readwrite(); - let view = array.as_array_mut(); - match zoom { - // Interpolate from zoom-level summaries - Some(zoom) => { - let iter = bw - .get_zoom_interval( - &chrom, - intervals_start, - intervals_end, - zoom.reduction_level, - ) - .convert_err()?; - fill_binned_values_from_zoom(start, end, iter, summary, bins, missing, view) - .convert_err()?; - } - // Aggregate from base-level data - None => { - let iter = bw - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - fill_binned_values_from_intervals( - start, end, iter, summary, bins, missing, view, - ) - .convert_err()?; - } - } - } - }; - - // Handle out-of-bounds - let mut array = array.readwrite(); - let view = array.as_array_mut(); - fill_out_of_bounds(start, end, chrom_length, bins, oob, view); - - Ok(arr) + let mut bbi = BBIRead::BigWig(bw); + records_to_array( + py, &mut bbi, chrom, start, end, bins, summary, exact, missing, oob, arr, + ) } pub fn entries_to_array( @@ -144,22 +126,41 @@ pub fn entries_to_array( oob: f64, arr: Option, ) -> PyResult { - let (start, end, length) = bigbed_start_end_length(bb, chrom, start, end)?; - let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); + let mut bbi = BBIRead::BigBed(bb); + records_to_array( + py, &mut bbi, chrom, start, end, bins, summary, exact, missing, oob, arr, + ) +} + +fn records_to_array( + py: Python<'_>, + bbi: &mut BBIRead, + chrom: &str, + start: Option, + end: Option, + bins: Option, + summary: Summary, + exact: bool, + missing: f64, + oob: f64, + arr: Option, +) -> PyResult { + let (start, end, chrom_length) = bbi.resolve_range(chrom, start, end)?; + let (valid_start, valid_end) = (start.max(0) as u32, end.min(chrom_length) as u32); let arr = arr.unwrap_or_else(|| { let size = bins.unwrap_or((end - start) as usize); PyArray1::from_vec_bound(py, vec![missing; size]).to_object(py) }); - - // Fill array let array: &Bound<'_, PyArray1> = arr.downcast_bound::>(py).map_err(|_| { PyErr::new::( - "Passed `arr` option must be a one-dimensional numpy array.", + "`arr` option must be a one-dimensional numpy array, if passed.", ) })?; + + // Fill the array match bins { - // Base-level data + // Base-level raster None => { if array.len()? != (end - start) as usize { return Err(PyErr::new::(format!( @@ -170,12 +171,24 @@ pub fn entries_to_array( } let mut array = array.readwrite(); let view = array.as_array_mut(); - let iter = bb - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - fill_coverage(start, end, iter, missing, view).convert_err()?; + match bbi { + BBIRead::BigWig(bw) => { + let iter = bw + .get_interval(&chrom, valid_start, valid_end) + .convert_err()? + .map(|item| item.map(BBIRecord::Value)); + fill_values(start, end, iter, missing, view).convert_err()?; + } + BBIRead::BigBed(bb) => { + let iter = bb + .get_interval(&chrom, valid_start, valid_end) + .convert_err()? + .map(|item| item.map(BBIRecord::BedEntry)); + fill_values(start, end, iter, missing, view).convert_err()?; + } + } } - // Binned data + // Reduction Some(bins) => { if array.len()? != bins { return Err(PyErr::new::(format!( @@ -184,37 +197,54 @@ pub fn entries_to_array( array.len()?, ))); } - let zoom = if !exact { - bigbed_closest_zoom(bb, start, end, bins) + let reduction_level = if !exact { + bbi.best_zoom(start, end, bins) } else { None }; let mut array = array.readwrite(); let view = array.as_array_mut(); - match zoom { + match reduction_level { // Interpolate from zoom-level summaries - Some(zoom) => { - let iter = bb - .get_zoom_interval( - &chrom, - intervals_start, - intervals_end, - zoom.reduction_level, - ) - .convert_err()?; - fill_binned_coverage_from_zoom(start, end, iter, summary, bins, missing, view) - .convert_err()?; - } + Some(reduction_level) => match bbi { + BBIRead::BigWig(bw) => { + let iter = bw + .get_zoom_interval(&chrom, valid_start, valid_end, reduction_level) + .convert_err()? + .map(|item| item.map(BBIRecord::ZoomRecord)); + fill_binned(start, end, iter, summary, bins, missing, view) + .convert_err()?; + } + BBIRead::BigBed(bb) => { + let iter = bb + .get_zoom_interval(&chrom, valid_start, valid_end, reduction_level) + .convert_err()? + .map(|item| item.map(BBIRecord::ZoomRecord)); + fill_binned(start, end, iter, summary, bins, missing, view) + .convert_err()?; + } + }, // Aggregate from base-level data - None => { - let iter = bb - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - fill_binned_coverage_from_entries( - start, end, iter, summary, bins, missing, view, - ) - .convert_err()?; - } + None => match bbi { + BBIRead::BigWig(bw) => { + let iter = bw + .get_interval(&chrom, valid_start, valid_end) + .convert_err()? + .map(|item| item.map(BBIRecord::Value)); + fill_binned(start, end, iter, summary, bins, missing, view) + .convert_err()?; + } + BBIRead::BigBed(bb) => { + let iter = bb + .get_interval(&chrom, valid_start, valid_end) + .convert_err()?; + let coverage_iter = CoverageIterator::new(iter) + .convert_err()? + .map(|result| result.map(BBIRecord::Value)); + fill_binned(start, end, coverage_iter, summary, bins, missing, view) + .convert_err()?; + } + }, } } }; @@ -222,12 +252,24 @@ pub fn entries_to_array( // Handle out-of-bounds let mut array = array.readwrite(); let view = array.as_array_mut(); - fill_out_of_bounds(start, end, length, bins, oob, view); + fill_out_of_bounds(start, end, chrom_length, bins, oob, view); Ok(arr) } -fn fill_values>>( +/// Rasterize intervals at nucleotide resolution. +/// +/// For [`Value`] intervals, the values are assigned over the covered bases. +/// For [`BedEntry`] intervals, the total coverage is accumulated over each base. +/// For [`ZoomRecord`] intervals, the mean statistic is assigned over the covered bases. +/// +/// # Arguments +/// * `start` - Start coordinate of the target range +/// * `end` - End coordinate of the target range +/// * `iter` - Iterator of data intervals +/// * `missing` - Value to use for bases with no data +/// * `view` - Mutable numpy array view to fill (must have length == end - start) +fn fill_values>>( start: i32, end: i32, iter: I, @@ -239,42 +281,37 @@ fn fill_values>>( for interval in iter { let interval = interval?; - let interval_start = ((interval.start as i32) - start) as usize; - let interval_end = ((interval.end as i32) - start) as usize; - for i in interval_start..interval_end { - let val = *view.index_mut(i); - *view.index_mut(i) = if val.is_nan() { - interval.value as f64 - } else { - val + interval.value as f64 - }; - } - } - for val in view.iter_mut() { - *val = if val.is_nan() { missing } else { *val }; - } - Ok(()) -} -fn fill_coverage>>( - start: i32, - end: i32, - iter: I, - missing: f64, - mut view: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { - assert_eq!(view.len(), (end - start) as usize); - view.fill(f64::NAN); + // Clamp interval to our target range + let interval_start = match interval { + BBIRecord::Value(v) => (v.start as i32).max(start), + BBIRecord::BedEntry(ref e) => (e.start as i32).max(start), + BBIRecord::ZoomRecord(ref z) => (z.start as i32).max(start), + }; + let interval_end = match interval { + BBIRecord::Value(v) => (v.end as i32).min(end), + BBIRecord::BedEntry(ref e) => (e.end as i32).min(end), + BBIRecord::ZoomRecord(ref z) => (z.end as i32).min(end), + }; + if interval_start >= interval_end { + continue; + } - for interval in iter { - let interval = interval?; - let interval_start = ((interval.start as i32) - start) as usize; - let interval_end = ((interval.end as i32) - start) as usize; - for i in interval_start..interval_end { + // Fill values + let rel_start = (interval_start - start) as usize; + let rel_end = (interval_end - start) as usize; + let value = match interval { + BBIRecord::Value(v) => v.value as f64, + BBIRecord::BedEntry(_) => 1.0, + BBIRecord::ZoomRecord(z) => z.summary.sum / (z.end - z.start) as f64, + }; + for i in rel_start..rel_end { let val = *view.index_mut(i); - *view.index_mut(i) = if val.is_nan() { 1.0 } else { val + 1.0 }; + *view.index_mut(i) = if val.is_nan() { value } else { val + value }; } } + + // Finalize for val in view.iter_mut() { *val = if val.is_nan() { missing } else { *val }; } @@ -282,7 +319,27 @@ fn fill_coverage>>( Ok(()) } -fn fill_binned_values_from_intervals>>( +/// Rasterize intervals over a uniform grid of bins. +/// +/// This function iterates over each data interval and accumulates its contribution to the +/// overlapping bins, weighted by the overlap amount. +/// +/// For [`Value`] intervals, the values are aggregated based on bin overlap. +/// For [`ZoomRecord`] intervals, the summary statistics are aggregated based on bin overlap. +/// +/// # Notes +/// [`BedEntry`] intervals are not supported. They must be converted to coverage [`Value`] +/// intervals. See [`CoverageIterator`] for details. +/// +/// # Arguments +/// * `start` - Start coordinate of the target range +/// * `end` - End coordinate of the target range +/// * `iter` - Iterator of data intervals +/// * `summary` - How to aggregate values +/// * `bins` - Number of bins to divide the range into +/// * `missing` - Value to use for bins with no intersecting data +/// * `view` - Mutable numpy array view to fill (must have length == bins) +fn fill_binned>>( start: i32, end: i32, iter: I, @@ -292,508 +349,170 @@ fn fill_binned_values_from_intervals, ) -> Result<(), BBIReadError> { assert_eq!(view.len(), bins); - view.fill(missing); - - // (bin, bin_start, bin_end, Option<(covered_bases, value)>) - let mut deque: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as i32).max(start) - start; - let interval_end = (interval.end as i32).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - while let Some(front) = deque.front_mut() { - if front.0 < bin_start { - let front = deque.pop_front().unwrap(); - let bin = front.0; + // Initialize statistics for each bin + let mut stats = vec![ + SummaryStats { + sum: 0.0, + sum_squares: 0.0, + min: f64::NAN, + max: f64::NAN, + bases_covered: 0.0, + }; + bins + ]; + + // Process each data interval + for interval_result in iter { + let interval = interval_result?; + + // Clamp interval to our target range + let interval_start = match interval { + BBIRecord::Value(v) => (v.start as i32).max(start), + BBIRecord::BedEntry(ref e) => (e.start as i32).max(start), + BBIRecord::ZoomRecord(ref z) => (z.start as i32).max(start), + }; + let interval_end = match interval { + BBIRecord::Value(v) => (v.end as i32).min(end), + BBIRecord::BedEntry(ref e) => (e.end as i32).min(end), + BBIRecord::ZoomRecord(ref z) => (z.end as i32).min(end), + }; + if interval_start >= interval_end { + continue; + } - match summary { - Summary::Min => { - view[bin] = front.3.map(|v| v.1).unwrap_or(missing); + // Determine which bins this interval overlaps. + // For the last bin, we need to handle the case where `interval_end` (open-ended) exactly + // aligns with a bin boundary (in which case, we exclude the bin to the right). + let rel_start = interval_start - start; + let rel_end = interval_end - start; + let first_bin = ((rel_start as f64) / bin_size).floor() as usize; + let last_bin = if (rel_end as f64) % bin_size == 0.0 { + (((rel_end as f64) / bin_size) - 1.0).floor() as usize + } else { + ((rel_end as f64) / bin_size).floor() as usize + }; + // Clamp bins to `[0, bins - 1]` in case of floating point inaccuracies + let first_bin = first_bin.min(bins - 1); + let last_bin = last_bin.min(bins - 1); + + // Accumulate contribution of interval to each overlapping bin + for i in first_bin..=last_bin { + // Round bin edges down to the nearest integer (base) for overlap calculation + let bin_start = start + (i as f64 * bin_size) as i32; + let bin_end = start + ((i + 1) as f64 * bin_size) as i32; + + // For upsampling (case where bin_size < 1), ensure at least one base of overlap is counted + let bin_end = if bin_start == bin_end { + bin_start + 1 + } else { + bin_end + }; + + // Calculate overlap between interval and bin + let overlap_start = interval_start.max(bin_start); + let overlap_end = interval_end.min(bin_end); + let overlap = (overlap_end - overlap_start) as f64; + + if overlap > 0.0 { + match interval { + BBIRecord::Value(v) => { + let value = v.value as f64; + stats[i].sum += value * overlap; + stats[i].sum_squares += value * value * overlap; + stats[i].min = stats[i].min.min(value); + stats[i].max = stats[i].max.max(value); + stats[i].bases_covered += overlap; } - Summary::Max => { - view[bin] = front.3.map(|v| v.1).unwrap_or(missing); + BBIRecord::ZoomRecord(z) => { + let summary = z.summary; + let overlap_factor = overlap / (z.end - z.start) as f64; + stats[i].sum += summary.sum * overlap_factor; + stats[i].sum_squares += summary.sum_squares * overlap_factor; + stats[i].min = stats[i].min.min(summary.min_val); + stats[i].max = stats[i].max.max(summary.max_val); + stats[i].bases_covered += summary.bases_covered as f64 * overlap_factor; } - Summary::Mean => { - view[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); + BBIRecord::BedEntry(_) => { + panic!("BED records must be converted to non-overlapping coverage intervals prior to binning."); } } - } else { - break; } } - while let Some(bin) = deque - .back() - .map(|last_bin| { - if last_bin.0 < bin_end { - Some(last_bin.0 + 1) - } else { - None - } - }) - .unwrap_or(Some(bin_start)) - { - let bin_start = ((bin as f64) * bin_size) as i32; - let bin_end = (((bin + 1) as f64) * bin_size) as i32; + } - deque.push_back((bin, bin_start, bin_end, None)); - } - for bin in bin_start..bin_end { - assert!(deque.iter().find(|b| b.0 == bin).is_some()); - } - for (_bin, bin_start, bin_end, data) in deque.iter_mut() { - // Since bed files can have overlapping entries, a previous entry could have extended - // past the last bin for the current interval - // Not relevant here, but keeping for parity - if interval_end <= *bin_start { - break; - } - let (c, v) = data.get_or_insert_with(|| { - match summary { - // min & max are defined for NAN and we are about to set it - // can't use 0.0 because it may be either below or above the real value - Summary::Min | Summary::Max => (0, f64::NAN), - // addition is not defined for NAN - Summary::Mean => (0, 0.0), - } - }); + // Fill the output array + for i in 0..bins { + if stats[i].bases_covered == 0.0 { + *view.index_mut(i) = missing; + } else { match summary { + Summary::Mean => { + *view.index_mut(i) = stats[i].sum / stats[i].bases_covered; + } + Summary::Std => { + let n = stats[i].bases_covered; + let mut variance = stats[i].sum_squares - (stats[i].sum * stats[i].sum) / n; + if n > 1.0 { + variance /= n - 1.0; + } + *view.index_mut(i) = variance.sqrt(); + } Summary::Min => { - *v = v.min(interval.value as f64); + *view.index_mut(i) = stats[i].min; } Summary::Max => { - *v = v.max(interval.value as f64); + *view.index_mut(i) = stats[i].max; } - Summary::Mean => { - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - let overlap_size: i32 = overlap_end - overlap_start; - *v += (overlap_size as f64) * interval.value as f64; - *c += overlap_size; + Summary::Mean0 => { + *view.index_mut(i) = stats[i].sum / bin_size; } - } - } - } - while let Some(front) = deque.pop_front() { - let bin = front.0; - - match summary { - Summary::Min => { - view[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Max => { - view[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Mean => { - view[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); - } - } - } - Ok(()) -} - -fn fill_binned_coverage_from_entries>>( - start: i32, - end: i32, - iter: I, - summary: Summary, - bins: usize, - missing: f64, - mut view: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { - assert_eq!(view.len(), bins); - view.fill(missing); - - // (, , , , ) - // covered_bases = 0 if uncovered, 1 if covered - let mut deque: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as i32).max(start) - start; - let interval_end = (interval.end as i32).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - - while let Some(front) = deque.front_mut() { - if front.0 < bin_start { - let front = deque.pop_front().unwrap(); - let bin = front.0; - - match summary { - Summary::Min => { - view[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); - } - Summary::Max => { - view[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); - } - Summary::Mean => { - view[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); + Summary::Std0 => { + let n = bin_size; + let mut variance = stats[i].sum_squares - (stats[i].sum * stats[i].sum) / n; + if n > 1.0 { + variance /= n - 1.0; } + *view.index_mut(i) = variance.sqrt(); } - } else { - break; - } - } - while let Some(bin) = deque - .back() - .map(|last_bin| { - if last_bin.0 < bin_end { - Some(last_bin.0 + 1) - } else { - None - } - }) - .unwrap_or(Some(bin_start)) - { - let bin_start = ((bin as f64) * bin_size) as i32; - let bin_end = (((bin + 1) as f64) * bin_size) as i32; - - deque.push_back(( - bin, - bin_start, - bin_end, - vec![0; (bin_end - bin_start) as usize], - vec![missing; (bin_end - bin_start) as usize], - )); - } - for bin in bin_start..bin_end { - assert!(deque.iter().find(|b| b.0 == bin).is_some()); - } - for (_bin, bin_start, bin_end, covered, data) in deque.iter_mut() { - // Since bed files can have overlapping entries, a previous entry could have extended - // past the last bin for the current interval - if interval_end <= *bin_start { - break; - } - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - - let range = - ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); - for i in &mut data[range.clone()] { - // If NAN, then 0.0 + 1.0, else i + 1.0 - *i = (*i).max(0.0) + 1.0; - } - for i in &mut covered[range] { - *i = (*i).max(1); - } - } - } - while let Some(front) = deque.pop_front() { - let bin = front.0; - - match summary { - Summary::Min => { - view[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); - } - Summary::Max => { - view[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); - } - Summary::Mean => { - view[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); - } - } - } - Ok(()) -} - -fn fill_binned_values_from_zoom>>( - start: i32, - end: i32, - iter: I, - summary: Summary, - bins: usize, - missing: f64, - mut view: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { - assert_eq!(view.len(), bins); - view.fill(missing); - - // (bin, bin_start, bin_end, Option<(covered_bases, value)>) - let mut deque: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as i32).max(start) - start; - let interval_end = (interval.end as i32).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - - while let Some(front) = deque.front_mut() { - if front.0 < bin_start { - let front = deque.pop_front().unwrap(); - let bin = front.0; - - match summary { - Summary::Min => { - view[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Max => { - view[bin] = front.3.map(|v| v.1).unwrap_or(missing); + Summary::Min0 => { + *view.index_mut(i) = if stats[i].bases_covered < bin_size.floor() { + stats[i].min.min(0.0) + } else { + stats[i].min } - Summary::Mean => { - view[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); + } + Summary::Max0 => { + *view.index_mut(i) = if stats[i].bases_covered < bin_size.floor() { + stats[i].max.max(0.0) + } else { + stats[i].max } } - } else { - break; - } - } - while let Some(bin) = deque - .back() - .map(|last_bin| { - if last_bin.0 < bin_end { - Some(last_bin.0 + 1) - } else { - None + Summary::Sum => { + *view.index_mut(i) = stats[i].sum; } - }) - .unwrap_or(Some(bin_start)) - { - let bin_start = ((bin as f64) * bin_size) as i32; - let bin_end = (((bin + 1) as f64) * bin_size) as i32; - - deque.push_back((bin, bin_start, bin_end, None)); - } - for bin in bin_start..bin_end { - assert!(deque.iter().find(|b| b.0 == bin).is_some()); - } - for (_bin, bin_start, bin_end, data) in deque.iter_mut() { - // Since bed files can have overlapping entries, a previous entry could have extended - // past the last bin for the current interval - // Not relevant here, but keeping for parity - if interval_end <= *bin_start { - break; - } - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - let overlap_size: i32 = overlap_end - overlap_start; - match data.as_mut() { - Some((c, v)) => match summary { - Summary::Min => { - *v = v.min(interval.summary.min_val as f64); - *c += overlap_size; - } - Summary::Max => { - *v = v.max(interval.summary.max_val as f64); - *c += overlap_size; - } - Summary::Mean => { - let zoom_mean = - (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); - *v += (overlap_size as f64) * zoom_mean; - *c += overlap_size; - } - }, - None => match summary { - Summary::Min => { - *data = Some((overlap_size, interval.summary.min_val as f64)); - } - Summary::Max => { - *data = Some((overlap_size, interval.summary.max_val as f64)); - } - Summary::Mean => { - let zoom_mean = - (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); - *data = Some((overlap_size, (overlap_size as f64) * zoom_mean)); - } - }, - } - } - } - while let Some(front) = deque.pop_front() { - let bin = front.0; - - match summary { - Summary::Min => { - view[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Max => { - view[bin] = front.3.map(|v| v.1).unwrap_or(missing); - } - Summary::Mean => { - view[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); - } - } - } - Ok(()) -} - -fn fill_binned_coverage_from_zoom>>( - start: i32, - end: i32, - iter: I, - summary: Summary, - bins: usize, - missing: f64, - mut view: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { - assert_eq!(view.len(), bins); - view.fill(missing); - - // (, , , , ) - // covered_bases = 0 if uncovered, 1 if covered - let mut deque: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as i32).max(start) - start; - let interval_end = (interval.end as i32).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = (((interval_end - 1) as f64) / bin_size) as usize; - - while let Some(front) = deque.front_mut() { - if front.0 < bin_start { - let front = deque.pop_front().unwrap(); - let bin = front.0; - - match summary { - Summary::Min => { - view[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); - } - Summary::Max => { - view[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); - } - Summary::Mean => { - view[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); - } + Summary::SumSquares => { + *view.index_mut(i) = stats[i].sum_squares; } - } else { - break; - } - } - while let Some(bin) = deque - .back() - .map(|last_bin| { - if last_bin.0 < bin_end { - Some(last_bin.0 + 1) - } else { - None + Summary::BasesCovered => { + *view.index_mut(i) = stats[i].bases_covered; } - }) - .unwrap_or(Some(bin_start)) - { - let bin_start = ((bin as f64) * bin_size) as i32; - let bin_end = (((bin + 1) as f64) * bin_size) as i32; - - deque.push_back(( - bin, - bin_start, - bin_end, - vec![0; (bin_end - bin_start) as usize], - vec![missing; (bin_end - bin_start) as usize], - )); - } - for bin in bin_start..bin_end { - assert!(deque.iter().find(|b| b.0 == bin).is_some()); - } - for (_bin, bin_start, bin_end, covered, data) in deque.iter_mut() { - // Since bed files can have overlapping entries, a previous entry could have extended - // past the last bin for the current interval - if interval_end <= *bin_start { - break; - } - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - - let mean = interval.summary.sum / (interval.summary.bases_covered) as f64; - let range = - ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); - for i in &mut data[range.clone()] { - *i = i.max(0.0); - match summary { - Summary::Mean => *i += mean, - Summary::Min => *i = i.min(interval.summary.min_val), - Summary::Max => *i = i.max(interval.summary.max_val), + Summary::BinCovered => { + *view.index_mut(i) = stats[i].bases_covered / bin_size; } } - - for i in &mut covered[range] { - *i = (*i).max(1); - } } } - while let Some(front) = deque.pop_front() { - let bin = front.0; - match summary { - Summary::Min => { - view[bin] = front - .4 - .into_iter() - .reduce(|min, v| min.min(v)) - .unwrap_or(missing); - } - Summary::Max => { - view[bin] = front - .4 - .into_iter() - .reduce(|max, v| max.max(v)) - .unwrap_or(missing); - } - Summary::Mean => { - view[bin] = front - .3 - .iter() - .any(|v| *v > 0) - .then(|| front.3.into_iter().sum::() as f64) - // Map nan to 0 so the sum is non-nan - .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) - .unwrap_or(missing); - } - } - } Ok(()) } +/// Fill out-of-bounds elements in the array with a specified value. +/// +/// If any bin overlaps base coordinates outside the valid range of the chromosome, it is filled +/// with the `oob` value. fn fill_out_of_bounds( start: i32, end: i32, @@ -808,292 +527,230 @@ fn fill_out_of_bounds( }; if start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i in bin_start..bin_end { - view[i] = oob; + for i in 0..view.len() { + let bin_start = start + (i as f64 * bin_size) as i32; + if bin_start < 0 { + view[i] = oob; + } else { + break; + } } } if end > chrom_length { - let interval_start = (chrom_length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, view.len()); - for i in bin_start..bin_end { - view[i] = oob; + for i in (0..view.len()).rev() { + let bin_end = start + ((i + 1) as f64 * bin_size) as i32; + if bin_end > chrom_length { + view[i] = oob; + } else { + break; + } } } } -fn bigwig_start_end_length( - bw: &BigWigRead, - chrom_name: &str, - start: Option, - end: Option, -) -> PyResult<(i32, i32, i32)> { - let chroms = bw.chroms(); - start_end_length_inner(chrom_name, chroms, start, end) -} - -fn bigbed_start_end_length( - bb: &BigBedRead, - chrom_name: &str, - start: Option, - end: Option, -) -> PyResult<(i32, i32, i32)> { - let chroms = bb.chroms(); - start_end_length_inner(chrom_name, chroms, start, end) -} - -fn start_end_length_inner( - chrom_name: &str, - chroms: &[bigtools::ChromInfo], - start: Option, - end: Option, -) -> PyResult<(i32, i32, i32)> { - let chrom = chroms.into_iter().find(|x| x.name == chrom_name); - let length = match chrom { - None => { - return Err(PyErr::new::(format!( - "No chromomsome with name `{}` found.", - chrom_name - ))) - } - Some(c) => c.length as i32, - }; - return Ok((start.unwrap_or(0), end.unwrap_or(length), length)); -} - -fn bigwig_closest_zoom<'a, R: BBIFileRead>( - bw: &'a BigWigRead, - start: i32, - end: i32, - bins: usize, -) -> Option<&'a bigtools::ZoomHeader> { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - bw.info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level) -} - -fn bigbed_closest_zoom<'a, R: BBIFileRead>( - bb: &'a BigBedRead, - start: i32, - end: i32, - bins: usize, -) -> Option<&'a bigtools::ZoomHeader> { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - bb.info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level) -} - #[cfg(test)] -mod test { - use bigtools::BedEntry; - use numpy::ndarray::Array; +mod tests { + use std::vec; - use crate::utils::fill_binned_coverage_from_entries; - use crate::utils::Summary; + use super::*; + use bigtools::Value; + use numpy::ndarray::Array; #[test] - fn test_fill_binned_coverage_from_entries() { - fn eq_with_nan_eq(a: f64, b: f64) -> bool { - (a.is_nan() && b.is_nan()) || (a == b) - } - - #[track_caller] - fn assert_equal(found: Vec, expected: Vec) { - let equal = (found.len() == expected.len()) && // zip stops at the shortest - found.iter() - .zip(expected.iter()) - .all(|(a,b)| eq_with_nan_eq(*a, *b)); - if !equal { - panic!("Vecs not equal: expected {:?}, found {:?}", expected, found); - } - } - - let entries = []; - let mut arr = Array::from(vec![f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.into_iter().map(|v| Ok(v)), - Summary::Mean, - 1, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [f64::NAN].into_iter().collect::>()); + fn test_fill_binned_single_interval() { + let start = 0; + let end = 20; + let n_bins = 2; + let intervals = vec![Value { + start: 5, + end: 15, + value: 2.0, + }]; + let mut arr = Array::from(vec![0.0; n_bins]); - let entries = []; - let mut arr = Array::from(vec![f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.into_iter().map(|v| Ok(v)), + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), Summary::Mean, - 1, + n_bins, 0.0, arr.view_mut(), ) .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [0.0].into_iter().collect::>()); - let entries = [BedEntry { - start: 10, - end: 20, - rest: "".to_string(), + // First bin (0-10): overlaps 5-10 = 5 bp with value 2.0 + // Second bin (10-20): overlaps 10-15 = 5 bp with value 2.0 + // Both should have value 2.0 + assert_eq!(arr.to_vec(), vec![2.0, 2.0]); + } + + #[test] + fn test_fill_binned_partial_overlap() { + let start = 0; + let end = 20; + let n_bins = 2; + let mut arr = Array::from(vec![0.0; n_bins]); + + let intervals = vec![Value { + start: 5, + end: 8, + value: 4.0, }]; - let mut arr = Array::from(vec![f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.into_iter().map(|v| Ok(v)), + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), Summary::Mean, - 1, - f64::NAN, + n_bins, + -1.0, arr.view_mut(), ) .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0].into_iter().collect::>()); - let entries = [BedEntry { - start: 10, - end: 20, - rest: "".to_string(), + // First bin (0-10): overlaps 5-8 = 3 bp with value 4.0 + // Second bin (10-20): no overlap + assert_eq!(arr.to_vec(), vec![4.0, -1.0]); + + let intervals = vec![Value { + start: 5, + end: 8, + value: 4.0, }]; - let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.into_iter().map(|v| Ok(v)), - Summary::Mean, - 2, - f64::NAN, + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Mean0, + n_bins, + -1.0, arr.view_mut(), ) .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0, 1.0].into_iter().collect::>()); - let entries = [ - BedEntry { - start: 10, - end: 20, - rest: "".to_string(), + // First bin (0-10): overlaps 5-8 = 3 bp with value 4.0 + // Second bin (10-20): no overlap + assert!((arr[0] - (4.0 * 3.0 / 10.0)).abs() < 0.01); + assert!((arr[1] - (-1.0)).abs() < 0.01); + } + + #[test] + fn test_fill_binned_min_max() { + let start = 0; + let end = 20; + let n_bins = 2; + let intervals = vec![ + Value { + start: 0, + end: 15, + value: 1.0, }, - BedEntry { - start: 15, - end: 20, - rest: "".to_string(), + Value { + start: 5, + end: 25, + value: 3.0, }, ]; - let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Mean, - 2, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0, 2.0].into_iter().collect::>()); - let mut arr = Array::from(vec![f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), + + // Test Min + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.clone().into_iter().map(BBIRecord::Value).map(Ok), Summary::Min, - 1, - f64::NAN, + n_bins, + -1.0, arr.view_mut(), ) .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0].into_iter().collect::>()); - let mut arr = Array::from(vec![f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), + + // First bin (0-10): sees values 1.0 and 3.0, min = 1.0 + // Second bin (10-20): sees values 1.0 and 3.0, min = 1.0 + assert_eq!(arr.to_vec(), vec![1.0, 1.0]); + + // Test Max + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), Summary::Max, - 1, - f64::NAN, + n_bins, + -1.0, arr.view_mut(), ) .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [2.0].into_iter().collect::>()); - let entries = [ - BedEntry { - start: 10, - end: 20, - rest: "".to_string(), + // First bin (0-10): sees values 1.0 and 3.0, max = 3.0 + // Second bin (10-20): sees values 1.0 and 3.0, max = 3.0 + assert_eq!(arr.to_vec(), vec![3.0, 3.0]); + } + + #[test] + fn test_fill_binned_noninteger_binsize() { + // 5 bins over range 0..12 + // bin_size = 12 / 5 = 2.4 + // Bin edges are truncated to integers for overlap determination. + // + // Bin 0: [0, 2.4) → [0, 2) + // overlaps [1,3) by [1, 2) = 1 bp with value 1.0 + // → 1.0 + // Bin 1: [2.4, 4.8) → [2, 4) + // overlaps [1,3) by [2, 3) = 1 bp with value 1.0 + // → 1.0 + // Bin 2: [4.8, 7.2) → [4, 7) + // overlaps [4,8) by [4, 7) = 3 bp with value 2.0 + // → 2.0 + // Bin 3: [7.2, 9.6) → [7, 9) + // overlaps [4,8) by [7, 8) = 1 bp with value 2.0 + // overlaps [8,10) by [8, 9) = 1 bp with value 3.0 + // → (2.0*1 + 3.0*1) / (1+1) = 2.5 + // Bin 4: [9.6, 12) → [9, 12) + // overlaps [8,10) by [9, 10) = 1 bp with value 3.0 + // → 3.0 + let start = 0; + let end = 12; + let n_bins = 5; + let intervals = vec![ + Value { + start: 1, + end: 3, + value: 1.0, }, - BedEntry { - start: 15, - end: 20, - rest: "".to_string(), + Value { + start: 4, + end: 8, + value: 2.0, }, - BedEntry { - start: 15, - end: 25, - rest: "".to_string(), + Value { + start: 8, + end: 10, + value: 3.0, }, ]; - let mut arr = Array::from(vec![f64::NAN, f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), + let answer = vec![1.0, 1.0, 2.0, 2.5, 3.0]; + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), Summary::Mean, - 2, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0, 3.0].into_iter().collect::>()); - let mut arr = Array::from(vec![f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Min, - 1, - f64::NAN, - arr.view_mut(), - ) - .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [1.0].into_iter().collect::>()); - let mut arr = Array::from(vec![f64::NAN]); - fill_binned_coverage_from_entries( - 10, - 20, - entries.clone().into_iter().map(|v| Ok(v)), - Summary::Max, - 1, - f64::NAN, + n_bins, + 0.0, arr.view_mut(), ) .unwrap(); - let res = arr.to_vec(); - assert_equal(res, [3.0].into_iter().collect::>()); + + for i in 0..n_bins { + assert!( + (arr[i] - answer[i]).abs() < 0.01, + "Bin {}: expected {}, got {}, diff {}", + i, + answer[i], + arr[i], + (arr[i] - answer[i]).abs() + ); + } } } diff --git a/pybigtools/src/writer.rs b/pybigtools/src/writer.rs index dc6dfd9..7f63c53 100644 --- a/pybigtools/src/writer.rs +++ b/pybigtools/src/writer.rs @@ -11,12 +11,12 @@ use crate::errors::BBIFileClosed; /// Interface for writing to a BigWig file. #[pyclass(module = "pybigtools")] -pub struct BigWigWrite { +pub struct BigWigWriter { pub bigwig: Option, } #[pymethods] -impl BigWigWrite { +impl BigWigWriter { /// Write values to the BigWig file. /// /// The underlying file will be closed automatically when the function @@ -169,12 +169,12 @@ impl BigWigWrite { /// Interface for writing to a BigBed file. #[pyclass(module = "pybigtools")] -pub struct BigBedWrite { +pub struct BigBedWriter { pub bigbed: Option, } #[pymethods] -impl BigBedWrite { +impl BigBedWriter { /// Write values to the BigBed file. /// /// The underlying file will be closed automatically when the function From 04428657db6705c0bb371ddad2bf571fe1b2359e Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 29 Oct 2025 20:11:47 -0400 Subject: [PATCH 03/17] test: Comprehensive testing of raster methods and comparison with pybbi --- pybigtools/pyproject.toml | 1 + pybigtools/tests/test_api.py | 104 ++++----- pybigtools/tests/test_io.py | 15 +- pybigtools/tests/test_raster.py | 367 ++++++++++++++++++++++++++++++++ pybigtools/uv.lock | 38 ++++ 5 files changed, 470 insertions(+), 55 deletions(-) create mode 100644 pybigtools/tests/test_raster.py diff --git a/pybigtools/pyproject.toml b/pybigtools/pyproject.toml index b9c6c18..4450be8 100644 --- a/pybigtools/pyproject.toml +++ b/pybigtools/pyproject.toml @@ -36,6 +36,7 @@ dev = [ {include-group = "test"} ] test = [ + "pybbi", "pytest", "smart_open[http]", ] diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index d0a40fd..7f767ef 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -283,49 +283,54 @@ def test_values_binned(bw, bb): assert bb.values("chr21", 10_148_000, 10_158_000, 10, "max")[0] == 1.0 assert bw.values("chr17", 100000, 110000, 10, "min")[0] == 0.05403999984264374 - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min")[0] == 0.0 + assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min")[0] == 1.0 + assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min0")[0] == 0.0 def test_values_binned_exact(bw, bb): - assert ( - bw.values("chr17", 100000, 110000, 10, "mean", exact=True)[0] - == 0.4542629980980206 + assert np.isclose( + bw.values("chr17", 100000, 110000, 10, "mean", exact=True)[0], + 0.4542629980980206, ) assert bb.values("chr21", 10_148_000, 10_158_000, 10, "mean", exact=True)[0] == 1.0 - - assert list(bw.values("chr17", 59890, 59900, 10, "mean", exact=True)) == [ - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.06791999936103821, - 0.06791999936103821, - ] + assert np.allclose( + bw.values("chr17", 59890, 59900, 10, "mean", exact=True), + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.06791999936103821, + 0.06791999936103821, + ], + ) def test_values_binned_missing_oob(bw, bb): - assert list( - bw.values("chr17", 59890, 59900, 10, "mean", exact=True, missing=-1.0) - ) == [ - -1.0, - -1.0, - -1.0, - -1.0, - -1.0, - -1.0, - -1.0, - -1.0, - 0.06791999936103821, - 0.06791999936103821, - ] + assert np.allclose( + bw.values("chr17", 59890, 59900, 10, "mean", exact=True, missing=-1.0), + [ + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + 0.06791999936103821, + 0.06791999936103821, + ], + ) x = bw.values("chr17", -10, 10, 20, "mean", exact=True, missing=0.0) assert math.isnan(x[0]) assert not math.isnan(x[19]) + x = bb.values("chr21", -10, 10, 20, "mean", exact=True, missing=0.0) assert math.isnan(x[0]) assert not math.isnan(x[19]) @@ -335,33 +340,31 @@ def test_values_binned_missing_oob(bw, bb): x = bb.values("chr21", -10, 10, 20, "mean", exact=True, missing=0.0, oob=0.0) assert x[0] == 0.0 - assert pytest.approx(bw.values("chr17", 59890, 59900)[9], 0.0001) == 0.06792 - assert pytest.approx(bw.values("chr17", 59890, 59900, missing=np.nan)[9], 0.0001) == 0.06792 + assert np.isclose(bw.values("chr17", 59890, 59900)[9], 0.06792) + assert np.isclose(bw.values("chr17", 59890, 59900, missing=np.nan)[9], 0.06792) -def test_values_binned_estimate_differences(bw, bb): - # Some differences in estimates between pybigtools to other libs - # Namely, bigtools calculates estimates by taking the - # sum of nanmeans over covered bases (summary.sum/summary.covered_bases) - # and dividing by covered bases (overlap between zoom and bin) - # So, including these as cases where the calculated value is different + +def test_values_binned_exact_vs_from_zoom(bw, bb): vals = bw.values("chr17", 85525, 85730, bins=2, exact=False) - assert list(vals) == [0.15392776003070907, 2.728891665264241] + assert np.allclose(vals, [0.10737355, 2.72889167]) vals = bw.values("chr17", 85525, 85730, bins=2, exact=True) - assert list(vals) == [0.06770934917680595, 2.4864424403431347] + assert np.allclose(vals, [0.06770935, 2.48644244]) + vals = bw.values("chr17", 59900, 60105, bins=2, exact=False) - assert list(vals) == [0.5358060553962108, 0.5513471488813751] + assert np.allclose(vals, [0.53580606, 0.55134715]) vals = bw.values("chr17", 59900, 60105, bins=2, exact=True) - assert list(vals) == [0.5362001863472602, 0.5527710799959679] + assert np.allclose(vals, [0.53620019, 0.55277108]) vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=False) - assert list(vals) == [1.2572170068028603] + assert np.allclose(vals, [1.23191877]) vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=True) - assert list(vals) == [1.3408662900188324] + assert np.allclose(vals, [1.34086629]) def test_values_assign_to_array(bw, bb): # The returned array is the same as the one passed, so both show the same values arr = np.zeros(20) + ret_arr = bw.values( "chr17", -10, 10, 20, "mean", exact=True, missing=0.0, oob=np.nan, arr=arr ) @@ -380,22 +383,24 @@ def test_values_assign_to_array(bw, bb): assert ret_arr[19] == 0.0 assert np.array_equal(arr, ret_arr, equal_nan=True) + def test_values_oob(bw): chroms = bw.chroms() # Region: arbitrary chromosome, 1 bin outside overlap chrom = list(chroms.keys())[0] bin_size = 8 - end = chroms[chrom]+bin_size - start = end - 2*bin_size + end = chroms[chrom] + bin_size + start = end - 2 * bin_size # Try to get values - v=bw.values(chrom, start, end, bins = 3) + v = bw.values(chrom, start, end, bins=3) assert len(v) == 3 assert np.isnan(v[2]) # Note: also happens if explicitly setting oob: - v=bw.values(chrom, start, end, bins = 3, oob=0) + v = bw.values(chrom, start, end, bins=3, oob=0) assert len(v) == 3 assert v[2] == 0 + def test_big_gene_pred(): bb = pybigtools.open(REPO_ROOT / "bigtools/resources/test/bigGenePred.bb") bb.values("chr21", 14_000_000, 18_000_000, 100) @@ -403,7 +408,6 @@ def test_big_gene_pred(): def test_average_over_bed(bw, bb): assert pytest.raises(ValueError, bb.average_over_bed, "ignored") - assert pytest.raises(ValueError, bw.average_over_bed, 0) path = str(REPO_ROOT / "bigtools/resources/test/bwaob_intervals.bed") diff --git a/pybigtools/tests/test_io.py b/pybigtools/tests/test_io.py index 935a5a7..32c94d1 100644 --- a/pybigtools/tests/test_io.py +++ b/pybigtools/tests/test_io.py @@ -72,14 +72,15 @@ def genintervals(): assert a[1] == b[2] assert math.isclose(a[2], b[3], abs_tol=0.00001) + def test_bigbed_write(tmpdir): f = pybigtools.open(os.path.join(tmpdir, "test.bigBed"), "w") f.write( - {"chr1": 1000, "chr2": 1000}, + {"chr1": 1000, "chr2": 1000}, [ - ("chr1", 0, 100, "foo\tbar\t3"), + ("chr1", 0, 100, "foo\tbar\t3"), ("chr2", 100, 200, "xxx\tyyy\t1.0"), - ("chr2", 200, 300, "zzz\twww\t1.0") + ("chr2", 200, 300, "zzz\twww\t1.0"), ], autosql="""\ table bed3 @@ -88,16 +89,20 @@ def test_bigbed_write(tmpdir): string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" + string name; "Name of item" + string name2; "Another name" + float value; "Value" ) - """ +""", ) f.close() f = pybigtools.open(os.path.join(tmpdir, "test.bigBed")) records = list(f.records("chr2")) - assert records[0][2] == 'xxx' + assert records[0][2] == "xxx" sql = f.sql() assert sql.startswith("table") + # TODO: bigWigAverageOverBed # TODO: bigWigMerge diff --git a/pybigtools/tests/test_raster.py b/pybigtools/tests/test_raster.py new file mode 100644 index 0000000..76b48ed --- /dev/null +++ b/pybigtools/tests/test_raster.py @@ -0,0 +1,367 @@ +import pathlib + +import numpy as np +import pytest + +import pybigtools as pbt +import bbi + +TEST_DIR = pathlib.Path(__file__).parent +REPO_ROOT = TEST_DIR.parent.parent + + +@pytest.fixture +def bw_path(): + """ + The values here are identical to the coverage depth of ``bb_path``. + The coverage breadth is 8 out of 12 bases (66.67%). + + chroms = {"chr1": 12} + entries = [ + ("chr1", 1, 3, 1.0), + ("chr1", 4, 8, 2.0), + ("chr1", 8, 10, 3.0) + ] + """ + return str(TEST_DIR / "data/mini.bw") + + +@pytest.fixture +def bb_path(): + """ + The coverage depth of these intervals is identical to ``bw_path``. + The coverage breadth is 8 out of 12 bases (66.67%). + + chroms = {"chr1": 12} + entries = [ + ("chr1", 1, 3, ""), + ("chr1", 4, 6, ""), + ("chr1", 4, 10, ""), + ("chr1", 6, 10, ""), + ("chr1", 8, 10, ""), + ] + """ + return str(TEST_DIR / "data/mini.bb") + + +def test_values(bw_path, bb_path): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", 0, 12) + assert np.allclose(x1, [0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", -2, 14) + assert np.allclose( + x1, + [np.nan, np.nan, 0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0, np.nan, np.nan], + equal_nan=True, + ) + + +@pytest.mark.parametrize("missing", [np.nan, 0, -1]) +def test_binned_mean(bw_path, bb_path, missing): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="mean" + ) + assert np.allclose( + x1, + [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="mean" + ) + assert np.allclose(x1, [1, 1, 2, 2, 3, missing], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="mean" + ) + assert np.allclose(x1, [1, 1, 2, 2.5, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="mean" + ) + assert np.allclose(x1, [1.5, 2.5], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="mean" + ) + assert np.allclose(x1, [2], equal_nan=True) + + +@pytest.mark.parametrize("missing", [np.nan, 0, -1]) +def test_binned_mean0(bw_path, bb_path, missing): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="mean0" + ) + assert np.allclose( + x1, + [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="mean0" + ) + assert np.allclose(x1, [0.5, 0.5, 2, 2, 3, missing], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="mean0" + ) + assert np.allclose( + x1, [1 / 2.4, 1 / 2.4, 6 / 2.4, 5 / 2.4, 3 / 2.4], equal_nan=True + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="mean0" + ) + assert np.allclose(x1, [1, 10 / 6], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="mean0" + ) + assert np.allclose(x1, [16 / 12], equal_nan=True) + + +@pytest.mark.parametrize("missing", [np.nan, 0, -1]) +def test_binned_minmax(bw_path, bb_path, missing): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="min" + ) + assert np.allclose( + x1, + [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + equal_nan=True, + ) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="max" + ) + assert np.allclose( + x1, + [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="min" + ) + assert np.allclose(x1, [1, 1, 2, 2, 3, missing], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="max" + ) + assert np.allclose(x1, [1, 1, 2, 2, 3, missing], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="min" + ) + assert np.allclose(x1, [1, 1, 2, 2, 3], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="max" + ) + assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="min" + ) + assert np.allclose(x1, [1, 2], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="max" + ) + assert np.allclose(x1, [2, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="min" + ) + assert np.allclose(x1, [1], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="max" + ) + assert np.allclose(x1, [3], equal_nan=True) + + +@pytest.mark.parametrize("missing", [np.nan, 0, -1]) +def test_binned_minmax0(bw_path, bb_path, missing): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="min0" + ) + assert np.allclose( + x1, + [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + equal_nan=True, + ) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="max0" + ) + assert np.allclose( + x1, + [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + equal_nan=True, + ) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="min0" + ) + assert np.allclose(x1, [0, 0, 2, 2, 3, missing], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="max0" + ) + assert np.allclose(x1, [1, 1, 2, 2, 3, missing], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="min0" + ) + assert np.allclose(x1, [1, 1, 2, 2, 3], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="max0" + ) + assert np.allclose(x1, [0, 0, 2, 2, 0], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="min0" + ) + assert np.allclose(x1, [0, 0], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="max0" + ) + assert np.allclose(x1, [2, 3], equal_nan=True) + + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="min0" + ) + assert np.allclose(x1, [0], equal_nan=True) + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="max0" + ) + assert np.allclose(x1, [3], equal_nan=True) + + +@pytest.mark.parametrize( + "n_bins,exact", [(n_bins, exact) for n_bins in range(1, 13) for exact in [True]] +) +def test_binned_coverage_breadth(bw_path, bb_path, n_bins, exact): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", 0, 12, bins=n_bins, exact=exact, summary="bases_covered" + ) + assert np.nansum(x1) == 8 + + +@pytest.mark.parametrize( + "start,end", [(start, end) for start in [0, 2] for end in [10, 12]] +) +def test_bbi_consistent_values(bw_path, bb_path, start, end): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", start, end) + x2 = bbi.open(path).fetch("chr1", start, end) + assert np.allclose(x1, x2, equal_nan=True) + + +def test_bbi_consistent_values_match_coverage(bb_path, bw_path): + bb1 = pbt.open(bb_path).values("chr1") + bw1 = pbt.open(bw_path).values("chr1") + assert np.allclose(bb1, bw1, equal_nan=True) + + bb2 = bbi.open(bb_path).fetch("chr1", 0, -1) + bw2 = bbi.open(bw_path).fetch("chr1", 0, -1) + assert np.allclose(bb2, bw2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,missing,oob", + [ + (start, end, missing, oob) + for start in [-2, 0, 2] + for end in [10, 12, 14] + for missing in [np.nan, 0, -1] + for oob in [np.nan, 0, -1] + ], +) +def test_bbi_consistent_values_missing_oob(bw_path, bb_path, start, end, missing, oob): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", start, end, missing=missing, oob=oob) + x2 = bbi.open(path).fetch("chr1", start, end, missing=missing, oob=oob) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,exact,summary", + [ + (start, end, n_bins, exact, summary) + for start in [0, 2] + for end in [10, 12] + for n_bins in range(1, 13) + for exact in [True] + for summary in ["sum", "mean", "std", "min", "max", "bin_covered"] + ], +) +def test_bbi_consistent_binned(bw_path, bb_path, start, end, n_bins, exact, summary): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", start, end, bins=n_bins, exact=exact, summary=summary + ) + x2 = bbi.open(path).fetch( + "chr1", + start, + end, + bins=n_bins, + exact=exact, + summary=summary if summary != "bin_covered" else "cov", + ) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,exact,missing,oob", + [ + (start, end, n_bins, exact, missing, oob) + for start in [-2, 0, 2] + for end in [10, 12, 14] + for n_bins in range(1, 13) + for exact in [True] + for missing in [np.nan, 0, -1] + for oob in [np.nan, 0, -1] + ], +) +def test_bbi_consistent_binned_missing_oob( + bw_path, bb_path, start, end, n_bins, exact, missing, oob +): + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values( + "chr1", start, end, bins=n_bins, exact=exact, missing=missing, oob=oob + ) + x2 = bbi.open(path).fetch( + "chr1", start, end, bins=n_bins, exact=exact, missing=missing, oob=oob + ) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,exact", + [(0, 12, n_bins, exact) for n_bins in [12, 24, 48] for exact in [True]], +) +def test_bbi_consistent_binned_upsample(bw_path, bb_path, start, end, n_bins, exact): + for path in [bb_path, bw_path]: + x1 = pbt.open(path).values("chr1", start, end, bins=n_bins, exact=exact) + x2 = bbi.open(path).fetch("chr1", start, end, bins=n_bins, exact=exact) + assert np.allclose(x1, x2, equal_nan=True) diff --git a/pybigtools/uv.lock b/pybigtools/uv.lock index c16796d..f142b45 100644 --- a/pybigtools/uv.lock +++ b/pybigtools/uv.lock @@ -454,6 +454,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "pybbi" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6e/aa/fa75fd38bb1f2c3e22c2bc0ce28122c9c2b586c5688f0b8b27078a560b0d/pybbi-0.4.1.tar.gz", hash = "sha256:6a193dcd0d4d4575c258d6cd5d2d7c5368a5cf58e939d8faae9f8fe192373a5c", size = 34725107, upload-time = "2024-10-18T15:58:26.803Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/ee/d6166c0b7d1c8727526871303c52df38c4d02f8d136faa0151fa02908490/pybbi-0.4.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:6add8b3d461aabce5a43f8c2d97e7b69b358ff25f035842ccfbf14098788d973", size = 2740827, upload-time = "2024-10-18T15:57:42.728Z" }, + { url = "https://files.pythonhosted.org/packages/12/b0/a36607acac77d38261469befe423c37412e4298c0a707a29cb31344654e0/pybbi-0.4.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6b4931a389a9dd686305fb1a9a9fb24ccb4aaf2f97ffbef7ab8c5e6366d4328f", size = 3008186, upload-time = "2024-10-18T15:57:44.438Z" }, + { url = "https://files.pythonhosted.org/packages/a9/9b/c74cc68eaa9abfdebd5fd6e00c91605a10c56254c2023d23717c45f41b13/pybbi-0.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0065c5f52f165df962028299a12016ea4b3af51e3c230d9d02d8df70084976f2", size = 3062137, upload-time = "2024-10-18T15:57:47.92Z" }, + { url = "https://files.pythonhosted.org/packages/c7/49/4be3740f7d5586dfdc6b66d649a74f043ec61ff292081631cef2646a8a4f/pybbi-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f31c966598af3109085c098ea33f9ba5e48108feb94c0c73692fe125f5daadd", size = 3386466, upload-time = "2024-10-18T15:57:50.063Z" }, + { url = "https://files.pythonhosted.org/packages/cd/b2/f274c9afdd5e8261dce265deea7c1f098d7a2e16589e15dce00887d6661e/pybbi-0.4.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:da07fd359d49a7aa3d8facc0adc9cd24e10109c7d2bf35e900b7151b45e6b644", size = 2741185, upload-time = "2024-10-18T15:57:52.298Z" }, + { url = "https://files.pythonhosted.org/packages/5a/75/76592a8f2b1a8e1320866ac17d9efae79a83e33864b09a5a7e23edd65268/pybbi-0.4.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a8a2351bfdec49954e9b12df37422ce6815b9d22e219fdc03787b82099c9b005", size = 3008323, upload-time = "2024-10-18T15:57:54.849Z" }, + { url = "https://files.pythonhosted.org/packages/06/1f/530d3fd1420fc9b31ccc5659dbde14744f1c58fff47ed03a6a42abb9b319/pybbi-0.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf2dcf60e712b409c2199568159b23ccd39a1273d02e96ceea27a668527e1f2e", size = 3096868, upload-time = "2024-10-18T15:57:56.397Z" }, + { url = "https://files.pythonhosted.org/packages/67/96/42d795bcd89926d4e24fbe90c7eb875dac0e64c57fe2d15c74df5ab208e4/pybbi-0.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb94da99b7edbf0eb2111b273f7345ae78fd0ed6ccd098a649a0c4c546a4fc7d", size = 3417886, upload-time = "2024-10-18T15:57:59.877Z" }, + { url = "https://files.pythonhosted.org/packages/3b/ad/0517feaeef73c493604f85de73adb66b1d9fd8d42da2bcbac0702d029dd4/pybbi-0.4.1-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:3d6c33ded8cf8b5b85f7093cb7ced67059540e80203df31c33c12c2167aa28ad", size = 2741753, upload-time = "2024-10-18T15:58:02.676Z" }, + { url = "https://files.pythonhosted.org/packages/8d/c7/ce5b7ad871e9f580fd2cb4c8bf02e8c13708b45657dd17d1dbc33ecf828c/pybbi-0.4.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:68d351ab964d6189a5d3beec7105e679177bc43141a4ec272e89246a6e8d7f72", size = 3009601, upload-time = "2024-10-18T15:58:04.7Z" }, + { url = "https://files.pythonhosted.org/packages/37/f3/5ed2a7e188ac9063d3ddf97e0d6fff4f876753222ed7093ea0ff8324cf9c/pybbi-0.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:607084a293d1c18915d3a4811c7c81b640601ba18b240f67338b2d97fd793481", size = 3086782, upload-time = "2024-10-18T15:58:06.636Z" }, + { url = "https://files.pythonhosted.org/packages/cf/be/dc05048eb85965bea938caacd3068b62e3fbc00284a8d5b52205f3212685/pybbi-0.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7d9f22cd18e7b8affc66c42a2c95e542afd8a1652430c0d9bfbabf7d4c11799", size = 3411157, upload-time = "2024-10-18T15:58:08.731Z" }, + { url = "https://files.pythonhosted.org/packages/c3/df/26a59938de8f092536dcd7287154bd21346f6721b1a6f38c5fa141e35b38/pybbi-0.4.1-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:b276deb438fa990d9db0947ec8dfa3f6d75728ba43e1d71650a0c24ffc6482af", size = 2740591, upload-time = "2024-10-18T15:58:10.632Z" }, + { url = "https://files.pythonhosted.org/packages/fa/ff/bd13e585560406b1624f68a36b98e74b705cf202ab465f1cd8231efb0522/pybbi-0.4.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:fe1c68e0d13489d75b374d4f82bdeeff192241ff3dfffefc59652491ab3d8b42", size = 3008555, upload-time = "2024-10-18T15:58:12.298Z" }, + { url = "https://files.pythonhosted.org/packages/da/bb/ab151ecbe26f1421a8a3ae12bd357471d130387b22b0820456b8749acc69/pybbi-0.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8dc2ebd0e1d670dbcec02f52978dad910293eb705383318090a7608c142f911", size = 3087731, upload-time = "2024-10-18T15:58:13.839Z" }, + { url = "https://files.pythonhosted.org/packages/bd/70/1942a663e32085eba7ec255f1ee873047df1901b2798dd4e57a1db0795b9/pybbi-0.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2197144d071e17fbbb136508046315b319bbe2fbf88085b356fb02f962b7b704", size = 3411890, upload-time = "2024-10-18T15:58:15.626Z" }, + { url = "https://files.pythonhosted.org/packages/51/78/fe1a9aafe2c73c0e848e8bc315a9ee443e6eb4ff675d0cf11f38d6d87b4c/pybbi-0.4.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:bdcabcd727ea90ec334589a9fb1ad6a3403b551c1fa8dd497beba93c09911d0b", size = 2740939, upload-time = "2024-10-18T15:58:17.399Z" }, + { url = "https://files.pythonhosted.org/packages/26/5e/a2aa1d3a36fe8209075bf7c029eb88b03d3326665453166fdfcfca2c4871/pybbi-0.4.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:0e622817ba90fa39efea407deac364201f2ffdfa274311422450fac329078c7b", size = 3008362, upload-time = "2024-10-18T15:58:19.94Z" }, + { url = "https://files.pythonhosted.org/packages/c6/c6/e74b133766270ca4bc83f611cf080ad8d890d0c9be4eed5d3096cc75754e/pybbi-0.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eaf44f1cc8bbbdcbf6ecbdada9a00f85a87416a299a7de72462b03f075287a7a", size = 3062047, upload-time = "2024-10-18T15:58:22.184Z" }, + { url = "https://files.pythonhosted.org/packages/be/fe/12f528f09015f8d80f8c6307b439cb8f77524509540b765325a4d4f1ae7a/pybbi-0.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6726ea3decad71cb17541e6d0d440588ab6a555d2e280cedf4517f2ea011a07c", size = 3386409, upload-time = "2024-10-18T15:58:24.136Z" }, +] + [[package]] name = "pybigtools" source = { editable = "." } @@ -466,12 +500,14 @@ dependencies = [ [package.dev-dependencies] dev = [ + { name = "pybbi" }, { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, { name = "ruff" }, { name = "smart-open", extra = ["http"] }, ] test = [ + { name = "pybbi" }, { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, { name = "smart-open", extra = ["http"] }, @@ -482,11 +518,13 @@ requires-dist = [{ name = "numpy" }] [package.metadata.requires-dev] dev = [ + { name = "pybbi" }, { name = "pytest" }, { name = "ruff" }, { name = "smart-open", extras = ["http"] }, ] test = [ + { name = "pybbi" }, { name = "pytest" }, { name = "smart-open", extras = ["http"] }, ] From 5ec1966cb59d6ce4a599ff50eac47ae69ce5be26 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 29 Oct 2025 20:30:48 -0400 Subject: [PATCH 04/17] style: Run cargo clippy and update some docstrings --- pybigtools/src/coverage.rs | 7 +++-- pybigtools/src/errors.rs | 6 ++-- pybigtools/src/lib.rs | 56 ++++++++++++++++++-------------------- pybigtools/src/reader.rs | 28 ++++++++++--------- pybigtools/src/utils.rs | 18 ++++++------ pybigtools/src/writer.rs | 40 ++++++++++++--------------- 6 files changed, 75 insertions(+), 80 deletions(-) diff --git a/pybigtools/src/coverage.rs b/pybigtools/src/coverage.rs index 0b8472a..2989086 100644 --- a/pybigtools/src/coverage.rs +++ b/pybigtools/src/coverage.rs @@ -6,7 +6,7 @@ use std::collections::BinaryHeap; /// /// This iterator consumes a stream of sorted, potentially overlapping BED entries and produces a /// stream of non-overlapping intervals whose values represent the coverage depth (number of -/// overlapping entries) over its genomic positions. +/// overlapping entries) over its range. /// /// The input BED entries **must** be sorted by start position, then end position. /// @@ -46,7 +46,7 @@ where pub fn new(mut iter: I) -> Result { let mut heap = BinaryHeap::new(); - // Initialize the heap with two entries (four positions) + // Initialize the heap match iter.next() { Some(Ok(entry)) => { heap.push(Reverse((entry.start, 1))); @@ -95,13 +95,14 @@ where } // If we have more entries with the current position, load them. - while let Some(result) = self.iter.next() { + for result in self.iter.by_ref() { match result { Ok(entry) => { self.heap.push(Reverse((entry.start, 1))); self.heap.push(Reverse((entry.end, -1))); // If this entry contributes to the current position, process those events + // and remove them from the heap let has_current_pos = entry.start == pos || entry.end == pos; if has_current_pos { while let Some(&Reverse((next_pos, inc))) = self.heap.peek() { diff --git a/pybigtools/src/errors.rs b/pybigtools/src/errors.rs index 1e70e11..4dbb5c8 100644 --- a/pybigtools/src/errors.rs +++ b/pybigtools/src/errors.rs @@ -27,9 +27,9 @@ impl ToPyErr for bigtools::ZoomIntervalError { fn to_py_err(self) -> PyErr { match self { bigtools::ZoomIntervalError::ReductionLevelNotFound => { - PyErr::new::(format!( - "The passed reduction level was not found" - )) + PyErr::new::( + "The passed reduction level was not found".to_string(), + ) } _ => PyErr::new::(format!("{}", self)), } diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index a907d8a..19e5141 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -84,15 +84,13 @@ fn open( } if iswrite { - return Err(PyErr::new::(format!( - "Writing only supports file names", - ))); + return Err(PyErr::new::( + "Writing only supports file names".to_string(), + )); } let file_like = match PyFileLikeObject::new(path_url_or_file_like, true, false, true) { Ok(file_like) => file_like, - Err(_) => return Err(PyErr::new::(format!( - "Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object.", - ))), + Err(_) => return Err(PyErr::new::("Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object.".to_string())), }; let read = match GenericBBIRead::open(file_like) { Ok(GenericBBIRead::BigWig(bigwig)) => BBIReader { @@ -123,17 +121,14 @@ fn open_path_or_url( { Some(e) => e.to_string(), None => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb).".to_string())); } }; let res = if iswrite { - match Url::parse(&path_url_or_file_like) { - Ok(_) => { - return Err(PyErr::new::(format!( - "Invalid file path. Writing does not support urls." - ))) - } - Err(_) => {} + if Url::parse(&path_url_or_file_like).is_ok() { + return Err(PyErr::new::( + "Invalid file path. Writing does not support urls.".to_string(), + )); } match extension.as_ref() { "bw" | "bigWig" | "bigwig" => BigWigWriter { @@ -145,7 +140,7 @@ fn open_path_or_url( } .into_py(py), _ => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb).".to_string())); } } } else { @@ -154,9 +149,10 @@ fn open_path_or_url( match Url::parse(&path_url_or_file_like) { Ok(_) => {} Err(_) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::( "Invalid file path. The file does not exist and it is not a url." - ))) + .to_string(), + )) } } } @@ -169,9 +165,9 @@ fn open_path_or_url( } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( - "Error opening bigWig." - ))) + return Err(PyErr::new::( + "Error opening bigWig.".to_string(), + )) } } } else { @@ -182,9 +178,9 @@ fn open_path_or_url( } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( - "Error opening bigWig." - ))) + return Err(PyErr::new::( + "Error opening bigWig.".to_string(), + )) } } @@ -204,9 +200,9 @@ fn open_path_or_url( } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( - "Error opening bigBed." - ))) + return Err(PyErr::new::( + "Error opening bigBed.".to_string(), + )) } } } else { @@ -217,9 +213,9 @@ fn open_path_or_url( } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( - "Error opening bigBed." - ))) + return Err(PyErr::new::( + "Error opening bigBed.".to_string(), + )) } } @@ -232,7 +228,7 @@ fn open_path_or_url( } } _ => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb).".to_string())); } } }; diff --git a/pybigtools/src/reader.rs b/pybigtools/src/reader.rs index 605ec0e..33c41dc 100644 --- a/pybigtools/src/reader.rs +++ b/pybigtools/src/reader.rs @@ -123,7 +123,7 @@ impl BBIReader { ("sum", summary.sum.to_object(py)), ( "mean", - (summary.sum as f64 / summary.bases_covered as f64).to_object(py), + (summary.sum / summary.bases_covered as f64).to_object(py), ), ("min", summary.min_val.to_object(py)), ("max", summary.max_val.to_object(py)), @@ -171,7 +171,7 @@ impl BBIReader { Some(chrom) => { let chrom_length = b .chroms() - .into_iter() + .iter() .find(|c| c.name == chrom) .ok_or_else(|| { PyErr::new::( @@ -184,7 +184,7 @@ impl BBIReader { None => { let chrom_dict: PyObject = b .chroms() - .into_iter() + .iter() .map(|c| (c.name.clone(), c.length)) .into_py_dict_bound(py) .into(); @@ -208,6 +208,7 @@ impl BBIReader { /// Return a list of sizes in bases of the summary intervals used in each /// of the zoom levels (i.e. reduction levels) of the BBI file. + #[allow(clippy::too_many_arguments)] fn zooms(&self) -> PyResult> { let zooms = match &self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), @@ -348,7 +349,7 @@ impl BBIReader { ) -> PyResult { let (start, end) = start_end_clamped(&self.bbi, &chrom, start, end)?; match &self.bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { let b = b.reopen()?; Ok(BigWigIntervalIterator { @@ -452,7 +453,7 @@ impl BBIReader { ) -> PyResult { let (start, end) = start_end_clamped(&self.bbi, &chrom, start, end)?; match &self.bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { let b = b.reopen()?; let iter = b @@ -576,6 +577,7 @@ impl BBIReader { signature = (chrom, start=None, end=None, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None), text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, missing=0.0, oob=..., arr=None)"#, )] + #[allow(clippy::too_many_arguments)] fn values( &mut self, py: Python<'_>, @@ -610,7 +612,7 @@ impl BBIReader { } }; match &mut self.bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => intervals_to_array( py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, ), @@ -776,9 +778,9 @@ impl BBIReader { if bed.is_instance(&path_class)? { bed.str()? } else { - return Err(PyErr::new::(format!( - "Unknown argument for `path`. Not a string or Path object.", - ))); + return Err(PyErr::new::( + "Unknown argument for `path`. Not a string or Path object.".to_string(), + )); } }; let bedin = BufReader::new(File::open(bed.to_str()?)?); @@ -918,7 +920,7 @@ impl BigBedEntriesIterator { .chain(next.rest.split_whitespace().map(|o| o.to_object(py))) .collect(); Ok(Some( - PyTuple::new_bound::(py, elements.into_iter()).to_object(py), + PyTuple::new_bound::(py, elements).to_object(py), )) } } @@ -1074,7 +1076,7 @@ fn start_end_clamped( BBIReadRaw::BigBedRemote(b) => b.chroms(), BBIReadRaw::BigBedFileLike(b) => b.chroms(), }; - let chrom = chroms.into_iter().find(|x| x.name == chrom_name); + let chrom = chroms.iter().find(|x| x.name == chrom_name); let length = match chrom { None => { return Err(PyErr::new::(format!( @@ -1084,8 +1086,8 @@ fn start_end_clamped( } Some(c) => c.length, }; - return Ok(( + Ok(( start.map(|v| v.max(0) as u32).unwrap_or(0), end.map(|v| (v.max(0) as u32).min(length)).unwrap_or(length), - )); + )) } diff --git a/pybigtools/src/utils.rs b/pybigtools/src/utils.rs index f3d2635..aafcd09 100644 --- a/pybigtools/src/utils.rs +++ b/pybigtools/src/utils.rs @@ -45,7 +45,7 @@ enum BBIRead<'a, R: BBIFileRead> { BigBed(&'a mut BigBedRead), } -impl<'a, R: BBIFileRead> BBIRead<'a, R> { +impl BBIRead<'_, R> { pub fn chroms(&self) -> &[bigtools::ChromInfo] { match self { BBIRead::BigWig(bw) => bw.chroms(), @@ -62,7 +62,7 @@ impl<'a, R: BBIFileRead> BBIRead<'a, R> { end: Option, ) -> PyResult<(i32, i32, i32)> { let chroms = self.chroms(); - let chrom = chroms.into_iter().find(|x| x.name == chrom_name); + let chrom = chroms.iter().find(|x| x.name == chrom_name); let length = match chrom { None => { return Err(PyErr::new::(format!( @@ -72,7 +72,7 @@ impl<'a, R: BBIFileRead> BBIRead<'a, R> { } Some(c) => c.length as i32, }; - return Ok((start.unwrap_or(0), end.unwrap_or(length), length)); + Ok((start.unwrap_or(0), end.unwrap_or(length), length)) } pub fn zoom_headers(&self) -> &[bigtools::ZoomHeader] { @@ -174,14 +174,14 @@ fn records_to_array( match bbi { BBIRead::BigWig(bw) => { let iter = bw - .get_interval(&chrom, valid_start, valid_end) + .get_interval(chrom, valid_start, valid_end) .convert_err()? .map(|item| item.map(BBIRecord::Value)); fill_values(start, end, iter, missing, view).convert_err()?; } BBIRead::BigBed(bb) => { let iter = bb - .get_interval(&chrom, valid_start, valid_end) + .get_interval(chrom, valid_start, valid_end) .convert_err()? .map(|item| item.map(BBIRecord::BedEntry)); fill_values(start, end, iter, missing, view).convert_err()?; @@ -209,7 +209,7 @@ fn records_to_array( Some(reduction_level) => match bbi { BBIRead::BigWig(bw) => { let iter = bw - .get_zoom_interval(&chrom, valid_start, valid_end, reduction_level) + .get_zoom_interval(chrom, valid_start, valid_end, reduction_level) .convert_err()? .map(|item| item.map(BBIRecord::ZoomRecord)); fill_binned(start, end, iter, summary, bins, missing, view) @@ -217,7 +217,7 @@ fn records_to_array( } BBIRead::BigBed(bb) => { let iter = bb - .get_zoom_interval(&chrom, valid_start, valid_end, reduction_level) + .get_zoom_interval(chrom, valid_start, valid_end, reduction_level) .convert_err()? .map(|item| item.map(BBIRecord::ZoomRecord)); fill_binned(start, end, iter, summary, bins, missing, view) @@ -228,7 +228,7 @@ fn records_to_array( None => match bbi { BBIRead::BigWig(bw) => { let iter = bw - .get_interval(&chrom, valid_start, valid_end) + .get_interval(chrom, valid_start, valid_end) .convert_err()? .map(|item| item.map(BBIRecord::Value)); fill_binned(start, end, iter, summary, bins, missing, view) @@ -236,7 +236,7 @@ fn records_to_array( } BBIRead::BigBed(bb) => { let iter = bb - .get_interval(&chrom, valid_start, valid_end) + .get_interval(chrom, valid_start, valid_end) .convert_err()?; let coverage_iter = CoverageIterator::new(iter) .convert_err()? diff --git a/pybigtools/src/writer.rs b/pybigtools/src/writer.rs index 7f63c53..fcd061b 100644 --- a/pybigtools/src/writer.rs +++ b/pybigtools/src/writer.rs @@ -77,17 +77,17 @@ impl BigWigWriter { let mut iter: Bound<'_, PyIterator> = match self.inner.downcast_bound(py) { Ok(o) => o.clone(), Err(_) => { - return Some(Err(IterError(format!( - "Passed value for `val` is not iterable." - )))) + return Some(Err(IterError( + "Passed value for `val` is not iterable.".to_string(), + ))) } }; let next: Result<(String, Value), pyo3::PyErr> = match iter.next()? { Err(e) => { e.print(py); - return Some(Err(IterError(format!( - "An error occurred while iterating." - )))); + return Some(Err(IterError( + "An error occurred while iterating.".to_string(), + ))); } Ok(n) => { // TODO: try block or separate function @@ -145,13 +145,12 @@ impl BigWigWriter { } })?; let vals_iter_raw = Iter { inner: iter }.map(|v| match v { - Err(e) => Err(io::Error::new(io::ErrorKind::Other, format!("{}", e.0))), + Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.0.to_string())), Ok(v) => Ok(v), }); let data = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); - match bigwig.write(data, runtime) { - Err(e) => println!("{}", e), - Ok(_) => {} + if let Err(e) = bigwig.write(data, runtime) { + println!("{}", e) } Ok(()) }) @@ -246,17 +245,17 @@ impl BigBedWriter { let mut iter: Bound<'_, PyIterator> = match self.inner.downcast_bound(py) { Ok(o) => o.clone(), Err(_) => { - return Some(Err(IterError(format!( - "Passed value for `val` is not iterable." - )))) + return Some(Err(IterError( + "Passed value for `val` is not iterable.".to_string(), + ))) } }; let next: Result<(String, BedEntry), pyo3::PyErr> = match iter.next()? { Err(e) => { e.print(py); - return Some(Err(IterError(format!( - "An error occurred while iterating." - )))); + return Some(Err(IterError( + "An error occurred while iterating.".to_string(), + ))); } Ok(n) => { // TODO: try block or separate function @@ -314,15 +313,12 @@ impl BigBedWriter { } })?; let vals_iter_raw = Iter { inner: iter }.map(|v| match v { - Err(e) => Err(io::Error::new(io::ErrorKind::Other, format!("{}", e.0))), + Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.0.to_string())), Ok(v) => Ok(v), }); let data = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); - match bigbed.write(data, runtime) { - Err(e) => { - println!("{}", e) - } - Ok(_) => {} + if let Err(e) = bigbed.write(data, runtime) { + println!("{}", e) } Ok(()) }) From c3a3c7afd6a886b1830efa95f73c2f986842bc59 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 29 Oct 2025 20:44:35 -0400 Subject: [PATCH 05/17] Rename utils.rs -> raster.rs --- pybigtools/src/lib.rs | 2 +- pybigtools/src/{utils.rs => raster.rs} | 0 pybigtools/src/reader.rs | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename pybigtools/src/{utils.rs => raster.rs} (100%) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 19e5141..abb438c 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -15,7 +15,7 @@ mod coverage; mod errors; mod file_like; mod reader; -mod utils; +mod raster; mod writer; use errors::{BBIFileClosed, BBIReadError}; diff --git a/pybigtools/src/utils.rs b/pybigtools/src/raster.rs similarity index 100% rename from pybigtools/src/utils.rs rename to pybigtools/src/raster.rs diff --git a/pybigtools/src/reader.rs b/pybigtools/src/reader.rs index 33c41dc..718da36 100644 --- a/pybigtools/src/reader.rs +++ b/pybigtools/src/reader.rs @@ -20,7 +20,7 @@ use pyo3::{prelude::*, PyTraverseError, PyVisit}; use crate::errors::{BBIFileClosed, BBIReadError, ConvertResult}; use crate::file_like::PyFileLikeObject; -use crate::utils::{entries_to_array, intervals_to_array, Summary}; +use crate::raster::{entries_to_array, intervals_to_array, Summary}; type ValueTuple = (u32, u32, f32); From a37abd0465945a7e2ef4c4504ec52479d694f201 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 30 Oct 2025 15:17:50 -0400 Subject: [PATCH 06/17] test: Upgrade pybbi to make comparison tests pass --- pybigtools/pyproject.toml | 2 +- pybigtools/tests/test_raster.py | 4 +-- pybigtools/uv.lock | 48 ++++++++++++++++----------------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/pybigtools/pyproject.toml b/pybigtools/pyproject.toml index 4450be8..76f494e 100644 --- a/pybigtools/pyproject.toml +++ b/pybigtools/pyproject.toml @@ -36,7 +36,7 @@ dev = [ {include-group = "test"} ] test = [ - "pybbi", + "pybbi>=0.4.2", "pytest", "smart_open[http]", ] diff --git a/pybigtools/tests/test_raster.py b/pybigtools/tests/test_raster.py index 76b48ed..7120c53 100644 --- a/pybigtools/tests/test_raster.py +++ b/pybigtools/tests/test_raster.py @@ -229,11 +229,11 @@ def test_binned_minmax0(bw_path, bb_path, missing): x1 = pbt.open(path).values( "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="min0" ) - assert np.allclose(x1, [1, 1, 2, 2, 3], equal_nan=True) + assert np.allclose(x1, [0, 0, 2, 2, 0], equal_nan=True) x1 = pbt.open(path).values( "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="max0" ) - assert np.allclose(x1, [0, 0, 2, 2, 0], equal_nan=True) + assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( diff --git a/pybigtools/uv.lock b/pybigtools/uv.lock index f142b45..261a64c 100644 --- a/pybigtools/uv.lock +++ b/pybigtools/uv.lock @@ -456,7 +456,7 @@ wheels = [ [[package]] name = "pybbi" -version = "0.4.1" +version = "0.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, @@ -464,28 +464,28 @@ dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6e/aa/fa75fd38bb1f2c3e22c2bc0ce28122c9c2b586c5688f0b8b27078a560b0d/pybbi-0.4.1.tar.gz", hash = "sha256:6a193dcd0d4d4575c258d6cd5d2d7c5368a5cf58e939d8faae9f8fe192373a5c", size = 34725107, upload-time = "2024-10-18T15:58:26.803Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/83/33a52ad666d93564e58986a78226dfb9dc7f52a75b36dcf8271f671f8eb1/pybbi-0.4.2.tar.gz", hash = "sha256:b9daa5f04cf5641c3addd729737d649c439f440082fe783c28be3353d2aa444b", size = 34725029, upload-time = "2025-10-30T17:09:51.165Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/ee/d6166c0b7d1c8727526871303c52df38c4d02f8d136faa0151fa02908490/pybbi-0.4.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:6add8b3d461aabce5a43f8c2d97e7b69b358ff25f035842ccfbf14098788d973", size = 2740827, upload-time = "2024-10-18T15:57:42.728Z" }, - { url = "https://files.pythonhosted.org/packages/12/b0/a36607acac77d38261469befe423c37412e4298c0a707a29cb31344654e0/pybbi-0.4.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6b4931a389a9dd686305fb1a9a9fb24ccb4aaf2f97ffbef7ab8c5e6366d4328f", size = 3008186, upload-time = "2024-10-18T15:57:44.438Z" }, - { url = "https://files.pythonhosted.org/packages/a9/9b/c74cc68eaa9abfdebd5fd6e00c91605a10c56254c2023d23717c45f41b13/pybbi-0.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0065c5f52f165df962028299a12016ea4b3af51e3c230d9d02d8df70084976f2", size = 3062137, upload-time = "2024-10-18T15:57:47.92Z" }, - { url = "https://files.pythonhosted.org/packages/c7/49/4be3740f7d5586dfdc6b66d649a74f043ec61ff292081631cef2646a8a4f/pybbi-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f31c966598af3109085c098ea33f9ba5e48108feb94c0c73692fe125f5daadd", size = 3386466, upload-time = "2024-10-18T15:57:50.063Z" }, - { url = "https://files.pythonhosted.org/packages/cd/b2/f274c9afdd5e8261dce265deea7c1f098d7a2e16589e15dce00887d6661e/pybbi-0.4.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:da07fd359d49a7aa3d8facc0adc9cd24e10109c7d2bf35e900b7151b45e6b644", size = 2741185, upload-time = "2024-10-18T15:57:52.298Z" }, - { url = "https://files.pythonhosted.org/packages/5a/75/76592a8f2b1a8e1320866ac17d9efae79a83e33864b09a5a7e23edd65268/pybbi-0.4.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a8a2351bfdec49954e9b12df37422ce6815b9d22e219fdc03787b82099c9b005", size = 3008323, upload-time = "2024-10-18T15:57:54.849Z" }, - { url = "https://files.pythonhosted.org/packages/06/1f/530d3fd1420fc9b31ccc5659dbde14744f1c58fff47ed03a6a42abb9b319/pybbi-0.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf2dcf60e712b409c2199568159b23ccd39a1273d02e96ceea27a668527e1f2e", size = 3096868, upload-time = "2024-10-18T15:57:56.397Z" }, - { url = "https://files.pythonhosted.org/packages/67/96/42d795bcd89926d4e24fbe90c7eb875dac0e64c57fe2d15c74df5ab208e4/pybbi-0.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb94da99b7edbf0eb2111b273f7345ae78fd0ed6ccd098a649a0c4c546a4fc7d", size = 3417886, upload-time = "2024-10-18T15:57:59.877Z" }, - { url = "https://files.pythonhosted.org/packages/3b/ad/0517feaeef73c493604f85de73adb66b1d9fd8d42da2bcbac0702d029dd4/pybbi-0.4.1-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:3d6c33ded8cf8b5b85f7093cb7ced67059540e80203df31c33c12c2167aa28ad", size = 2741753, upload-time = "2024-10-18T15:58:02.676Z" }, - { url = "https://files.pythonhosted.org/packages/8d/c7/ce5b7ad871e9f580fd2cb4c8bf02e8c13708b45657dd17d1dbc33ecf828c/pybbi-0.4.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:68d351ab964d6189a5d3beec7105e679177bc43141a4ec272e89246a6e8d7f72", size = 3009601, upload-time = "2024-10-18T15:58:04.7Z" }, - { url = "https://files.pythonhosted.org/packages/37/f3/5ed2a7e188ac9063d3ddf97e0d6fff4f876753222ed7093ea0ff8324cf9c/pybbi-0.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:607084a293d1c18915d3a4811c7c81b640601ba18b240f67338b2d97fd793481", size = 3086782, upload-time = "2024-10-18T15:58:06.636Z" }, - { url = "https://files.pythonhosted.org/packages/cf/be/dc05048eb85965bea938caacd3068b62e3fbc00284a8d5b52205f3212685/pybbi-0.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7d9f22cd18e7b8affc66c42a2c95e542afd8a1652430c0d9bfbabf7d4c11799", size = 3411157, upload-time = "2024-10-18T15:58:08.731Z" }, - { url = "https://files.pythonhosted.org/packages/c3/df/26a59938de8f092536dcd7287154bd21346f6721b1a6f38c5fa141e35b38/pybbi-0.4.1-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:b276deb438fa990d9db0947ec8dfa3f6d75728ba43e1d71650a0c24ffc6482af", size = 2740591, upload-time = "2024-10-18T15:58:10.632Z" }, - { url = "https://files.pythonhosted.org/packages/fa/ff/bd13e585560406b1624f68a36b98e74b705cf202ab465f1cd8231efb0522/pybbi-0.4.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:fe1c68e0d13489d75b374d4f82bdeeff192241ff3dfffefc59652491ab3d8b42", size = 3008555, upload-time = "2024-10-18T15:58:12.298Z" }, - { url = "https://files.pythonhosted.org/packages/da/bb/ab151ecbe26f1421a8a3ae12bd357471d130387b22b0820456b8749acc69/pybbi-0.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8dc2ebd0e1d670dbcec02f52978dad910293eb705383318090a7608c142f911", size = 3087731, upload-time = "2024-10-18T15:58:13.839Z" }, - { url = "https://files.pythonhosted.org/packages/bd/70/1942a663e32085eba7ec255f1ee873047df1901b2798dd4e57a1db0795b9/pybbi-0.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2197144d071e17fbbb136508046315b319bbe2fbf88085b356fb02f962b7b704", size = 3411890, upload-time = "2024-10-18T15:58:15.626Z" }, - { url = "https://files.pythonhosted.org/packages/51/78/fe1a9aafe2c73c0e848e8bc315a9ee443e6eb4ff675d0cf11f38d6d87b4c/pybbi-0.4.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:bdcabcd727ea90ec334589a9fb1ad6a3403b551c1fa8dd497beba93c09911d0b", size = 2740939, upload-time = "2024-10-18T15:58:17.399Z" }, - { url = "https://files.pythonhosted.org/packages/26/5e/a2aa1d3a36fe8209075bf7c029eb88b03d3326665453166fdfcfca2c4871/pybbi-0.4.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:0e622817ba90fa39efea407deac364201f2ffdfa274311422450fac329078c7b", size = 3008362, upload-time = "2024-10-18T15:58:19.94Z" }, - { url = "https://files.pythonhosted.org/packages/c6/c6/e74b133766270ca4bc83f611cf080ad8d890d0c9be4eed5d3096cc75754e/pybbi-0.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eaf44f1cc8bbbdcbf6ecbdada9a00f85a87416a299a7de72462b03f075287a7a", size = 3062047, upload-time = "2024-10-18T15:58:22.184Z" }, - { url = "https://files.pythonhosted.org/packages/be/fe/12f528f09015f8d80f8c6307b439cb8f77524509540b765325a4d4f1ae7a/pybbi-0.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6726ea3decad71cb17541e6d0d440588ab6a555d2e280cedf4517f2ea011a07c", size = 3386409, upload-time = "2024-10-18T15:58:24.136Z" }, + { url = "https://files.pythonhosted.org/packages/e0/a9/c2d4dbcc80c5406ca0a532fe632e16b2eda194b1da1754b82bc7d7ca7248/pybbi-0.4.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:4893409bb7b51c81759d43f6b15e2b4dc6d3de20e5e39b9ea4b8ff1bbfcc17a0", size = 2936500, upload-time = "2025-10-30T17:09:13.405Z" }, + { url = "https://files.pythonhosted.org/packages/33/aa/b6ae0e4fc80d5306a4180efb73d03413a5104953089d6e46b04cf6945b1c/pybbi-0.4.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:684c76ef134ee2815f02ef8ba0fb5f270fe45a127a092c24204311cd6e45a2bb", size = 3225149, upload-time = "2025-10-30T17:09:15.736Z" }, + { url = "https://files.pythonhosted.org/packages/40/c1/f7c95688e21d9c65717dc088945e31848b66bf904870b8fda1316b4e5490/pybbi-0.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20914802a4b3a9d46b2eed811ff77f8fb9d26d1bc4af9168ce3f33ffbd564518", size = 3081199, upload-time = "2025-10-30T17:09:17.272Z" }, + { url = "https://files.pythonhosted.org/packages/6d/3a/9b3fd6dcf58ead95c267fc819df9647700cc69200e992014b80e43f69b65/pybbi-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1965c3620f18546d65e85b1b99d5306303ffbc2fdbbeae93ab3a86dc96ac8669", size = 3399481, upload-time = "2025-10-30T17:09:18.944Z" }, + { url = "https://files.pythonhosted.org/packages/8f/50/bfdfde6d757c3d986f23467131511350a9de71a766c0063930ff70bc46bf/pybbi-0.4.2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:145869f0c8e3ebca352df3477a43e15407b8dc1bf409081ea89d6498ace634de", size = 2933866, upload-time = "2025-10-30T17:09:21.063Z" }, + { url = "https://files.pythonhosted.org/packages/3f/1e/9c439b06647a84f769d8cf1ae35a54bfe0ad81c40bc26aea7c5130a1674c/pybbi-0.4.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ad8546cd8871a6b112980b00bbcd31ad409e4a24e979a6cd3114cc645b433b91", size = 3223265, upload-time = "2025-10-30T17:09:23.059Z" }, + { url = "https://files.pythonhosted.org/packages/8b/c8/589d044a3a1e63c772da4446d1d6f452225e45d9fe360eb5aa3f8efdd140/pybbi-0.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a7aeeb5a3d81a6331d19901e1a2c6cdcee9edb273b3aaa1b3e954e92d165030", size = 3109825, upload-time = "2025-10-30T17:09:24.62Z" }, + { url = "https://files.pythonhosted.org/packages/86/ce/b296d5e51276d88a2e51070ed97f76eedba639767204987cc6a465ae564f/pybbi-0.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a91937f82e3f6f0ae011711fc175d2b2ec1decce0af62435ae0bd297c5d3633", size = 3428525, upload-time = "2025-10-30T17:09:26.293Z" }, + { url = "https://files.pythonhosted.org/packages/3f/74/af3bfdecdacb64e35bffea2513541d71f8ea7358f5004880e37a2414b5f2/pybbi-0.4.2-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5d1fad3d46787ea5905e5f1080b16dfb83d67c8d83225efe3169d4dcf27eb612", size = 2943529, upload-time = "2025-10-30T17:09:28.849Z" }, + { url = "https://files.pythonhosted.org/packages/02/66/38380935e7d3856fe5daf0694880f8330631d09bf73b2b30d8e576eeb12b/pybbi-0.4.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:37ed8946551a9a5efc9e426d7b86248e845c51b803d48016027a7c8a1a9fce33", size = 3232894, upload-time = "2025-10-30T17:09:31.132Z" }, + { url = "https://files.pythonhosted.org/packages/36/1f/abbe9be6b0f501a7ff182f1c6534b8f83b69c97630776d8e482a2d4d9bf7/pybbi-0.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36dc0add6f54d1763bf247b2cff87eaf2c81a26da18d49ac33de164d6baec2df", size = 3099983, upload-time = "2025-10-30T17:09:32.928Z" }, + { url = "https://files.pythonhosted.org/packages/b0/e7/35195b5292b6ccdcaaa655a2dad615b9217f52ac18b9dfdf8afec82521a6/pybbi-0.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d587a05d7eed4d1fb92e3f62ad52e8389c7a6b315bc430a6eb678760106740e", size = 3420434, upload-time = "2025-10-30T17:09:34.693Z" }, + { url = "https://files.pythonhosted.org/packages/53/72/ac475ad1726f132fc272882b43defddfe09a831815a741850d2c510d7f0e/pybbi-0.4.2-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:54b9e541bb455aeb30f88f09c70b7a4b72f6a0ae0e2e7f2bc9798f2e647b3d16", size = 2943356, upload-time = "2025-10-30T17:09:36.18Z" }, + { url = "https://files.pythonhosted.org/packages/1b/d3/4e27f36ae7f0fc933330239d7b76debf6ef67dbd7fef722f3ba1e7cfb9cc/pybbi-0.4.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:2c6472900a6688d604ccfd406138337e5d5b9db442a13715ce7abbf2cb98b841", size = 3232770, upload-time = "2025-10-30T17:09:37.803Z" }, + { url = "https://files.pythonhosted.org/packages/8e/25/2cbcd423c16ceb6eaedc39d797ff8b31119fcb5d71910b5336dbefb942d5/pybbi-0.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ee599e318238b177b2c31dbe7789034ea0bfa9716e0aa06d387d7cc9f78938", size = 3095788, upload-time = "2025-10-30T17:09:39.523Z" }, + { url = "https://files.pythonhosted.org/packages/15/b2/dbaaaa0901785caad9317c0eedf97eaa3d841e1c05589a9388ce67c9d53f/pybbi-0.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce9afaf149c89b01dfa2926febe53b73b174be0cc5febfd4cfee9f1e72083f50", size = 3414507, upload-time = "2025-10-30T17:09:41.605Z" }, + { url = "https://files.pythonhosted.org/packages/0e/14/183e404d9015b880fde46e452190b6efe6c95dd6bf1cef28265abeba92b8/pybbi-0.4.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:b685fc3c72a64672e87cc590146ee91bb97b1b5cb061a85fbe5ade679ef7b5fb", size = 2936711, upload-time = "2025-10-30T17:09:43.377Z" }, + { url = "https://files.pythonhosted.org/packages/f6/cc/27c610695e6e746021d3ae311f9572aa7cd57796806fd54328ae67a8d5ec/pybbi-0.4.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:093c3efb6a6454397be6ffbcd37f8719107300ea02f4ec3066583a0bf87f16b3", size = 3225304, upload-time = "2025-10-30T17:09:45.117Z" }, + { url = "https://files.pythonhosted.org/packages/a6/c2/d73f53bcf6605e8c0740fb486838d1750da2e2091dae4e7b8733aa004110/pybbi-0.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31ac2493b598a4568c5a16c0a575e41633afbe5da3084d0ef2cdcbed1b26031b", size = 3077589, upload-time = "2025-10-30T17:09:46.863Z" }, + { url = "https://files.pythonhosted.org/packages/a6/ca/c3af682d723805a06868f5d3cb00fe5b4d7f7452db4a672b7526f87285f7/pybbi-0.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be8bf199a520b75c4708b90d3245d183138728895fad0a145eb36bc038d39b3b", size = 3396498, upload-time = "2025-10-30T17:09:48.549Z" }, ] [[package]] @@ -518,13 +518,13 @@ requires-dist = [{ name = "numpy" }] [package.metadata.requires-dev] dev = [ - { name = "pybbi" }, + { name = "pybbi", specifier = ">=0.4.2" }, { name = "pytest" }, { name = "ruff" }, { name = "smart-open", extras = ["http"] }, ] test = [ - { name = "pybbi" }, + { name = "pybbi", specifier = ">=0.4.2" }, { name = "pytest" }, { name = "smart-open", extras = ["http"] }, ] From 08484bf59216cb97998b0ac71ea5ffa960eacab0 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 12 May 2026 12:50:01 -0400 Subject: [PATCH 07/17] fix: Normalize bases_covered in fill_binned to match UCSC UCSC's bbiIntervalSlice/bbiSummarySlice round per-bin bases_covered up to the next integer and rescale sum and sum_squares by ceil(bc)/bc so that mean = sum / bases_covered is preserved. This matters for the zoom path, where a partial overlap of a zoom record contributes a fractional share of its validCount. The Value path is unaffected since overlaps are integer-valued. The ceil direction (vs. floor) preserves the invariant that bases_covered >= 1 whenever any data overlaps the bin -- a tiny boundary clip of a zoom record won't spuriously erase known-existing data. std is not algebraically preserved under the rescale, but matching UCSC's reported std requires applying the same step. --- pybigtools/src/lib.rs | 2 +- pybigtools/src/raster.rs | 55 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index abb438c..3238d2d 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -14,8 +14,8 @@ use url::Url; mod coverage; mod errors; mod file_like; -mod reader; mod raster; +mod reader; mod writer; use errors::{BBIFileClosed, BBIReadError}; diff --git a/pybigtools/src/raster.rs b/pybigtools/src/raster.rs index aafcd09..3b6215c 100644 --- a/pybigtools/src/raster.rs +++ b/pybigtools/src/raster.rs @@ -261,7 +261,7 @@ fn records_to_array( /// /// For [`Value`] intervals, the values are assigned over the covered bases. /// For [`BedEntry`] intervals, the total coverage is accumulated over each base. -/// For [`ZoomRecord`] intervals, the mean statistic is assigned over the covered bases. +/// For [`ZoomRecord`] intervals, the bin mean statistic is assigned over the covered bases. /// /// # Arguments /// * `start` - Start coordinate of the target range @@ -331,6 +331,11 @@ fn fill_values>>( /// [`BedEntry`] intervals are not supported. They must be converted to coverage [`Value`] /// intervals. See [`CoverageIterator`] for details. /// +/// Per-bin `bases_covered` is normalized up to the next integer (matching UCSC's convention), +/// with `sum` and `sum_squares` rescaled proportionally so that `mean = sum / bases_covered` +/// is invariant. This only has an effect on the [`ZoomRecord`] path, where partial overlap of +/// a zoom record can produce a fractional share of its `validCount`. +/// /// # Arguments /// * `start` - Start coordinate of the target range /// * `end` - End coordinate of the target range @@ -410,7 +415,7 @@ fn fill_binned>>( bin_end }; - // Calculate overlap between interval and bin + // Calculate integer overlap between interval and adjusted bin let overlap_start = interval_start.max(bin_start); let overlap_end = interval_end.min(bin_end); let overlap = (overlap_end - overlap_start) as f64; @@ -442,6 +447,28 @@ fn fill_binned>>( } } + // Normalize bases_covered to an integer count, matching UCSC's convention. + // + // For the Value path, accumulated `bases_covered` is already integer-valued (overlaps are + // computed in integer coordinates), so `ceil` is a no-op. + // + // For the ZoomRecord path, a partial overlap of a zoom record contributes a fractional + // share of its `validCount` (e.g. a zoom summarizing 10 bases with validCount=7, half + // overlapped, contributes 3.5). UCSC rounds the per-bin total up to the next integer and + // rescales `sum` and `sum_squares` by `ceil(bc) / bc` to keep `mean = sum / bases_covered` + // invariant. The `ceil` direction preserves the invariant that `bases_covered >= 1` + // whenever any data overlaps the bin. Note that `std` is *not* preserved under this + // rescaling, but matching UCSC requires it. + for stat in stats.iter_mut() { + if stat.bases_covered > 0.0 { + let vc = stat.bases_covered.ceil(); + let norm = vc / stat.bases_covered; + stat.sum *= norm; + stat.sum_squares *= norm; + stat.bases_covered = vc; + } + } + // Fill the output array for i in 0..bins { if stats[i].bases_covered == 0.0 { @@ -477,14 +504,32 @@ fn fill_binned>>( *view.index_mut(i) = variance.sqrt(); } Summary::Min0 => { - *view.index_mut(i) = if stats[i].bases_covered < bin_size.floor() { + // Use integer-adjusted bin size when checking for partial coverage + let bin_start = start + (i as f64 * bin_size) as i32; + let bin_end = start + ((i + 1) as f64 * bin_size) as i32; + let adj_bin_size = (bin_end - bin_start) as f64; + let adj_bin_size = if adj_bin_size == 0.0 { + 1.0 + } else { + adj_bin_size + }; + *view.index_mut(i) = if stats[i].bases_covered < adj_bin_size { stats[i].min.min(0.0) } else { stats[i].min } } Summary::Max0 => { - *view.index_mut(i) = if stats[i].bases_covered < bin_size.floor() { + // Use integer-adjusted bin size when checking for partial coverage + let bin_start = start + (i as f64 * bin_size) as i32; + let bin_end = start + ((i + 1) as f64 * bin_size) as i32; + let adj_bin_size = (bin_end - bin_start) as f64; + let adj_bin_size = if adj_bin_size == 0.0 { + 1.0 + } else { + adj_bin_size + }; + *view.index_mut(i) = if stats[i].bases_covered < adj_bin_size { stats[i].max.max(0.0) } else { stats[i].max @@ -511,7 +556,7 @@ fn fill_binned>>( /// Fill out-of-bounds elements in the array with a specified value. /// -/// If any bin overlaps base coordinates outside the valid range of the chromosome, it is filled +/// If a bin overlaps any base coordinates outside the valid range of the chromosome, it is filled /// with the `oob` value. fn fill_out_of_bounds( start: i32, From 87926e7fdac9674ecc68cd8a5de1c19f4811fd54 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 12 May 2026 14:14:25 -0400 Subject: [PATCH 08/17] Update docstring --- pybigtools/src/coverage.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pybigtools/src/coverage.rs b/pybigtools/src/coverage.rs index 2989086..2ac3e89 100644 --- a/pybigtools/src/coverage.rs +++ b/pybigtools/src/coverage.rs @@ -12,8 +12,12 @@ use std::collections::BinaryHeap; /// /// # Notes /// -/// The iterator uses roughly constant memory by maintaining a small heap that only grows if many -/// entries share the same start or end position. +/// Implements a 1-D sweep-line over endpoint events: each entry contributes a `+1` at `start` and +/// a `-1` at `end`; the running sum at any scan position is the coverage depth at that base. +/// +/// Memory scales with the number of intervals overlapping the current scan position (not with the +/// total number of input entries). The internal heap holds one pending `end` event for each such +/// interval until the scan moves past it. /// /// # Example /// From 000094478caa9d5e9973a7a1362fcf48f8b57b5b Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 12 May 2026 15:14:07 -0400 Subject: [PATCH 09/17] Add check for malformed intervals --- pybigtools/src/coverage.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pybigtools/src/coverage.rs b/pybigtools/src/coverage.rs index 2ac3e89..5e0a2e0 100644 --- a/pybigtools/src/coverage.rs +++ b/pybigtools/src/coverage.rs @@ -38,7 +38,7 @@ use std::collections::BinaryHeap; pub struct CoverageIterator { iter: I, heap: BinaryHeap>, - coverage: i32, + coverage: u32, prev_pos: Option, finished: bool, } @@ -144,8 +144,12 @@ where None }; - // Update coverage and current position with all batched events - self.coverage += delta; + // Update coverage and current position with all batched events. + // `checked_add_signed` surfaces malformed input (e.g. an entry with start > end, + // which queues a -1 event before its matching +1) as a panic. + self.coverage = self.coverage.checked_add_signed(delta).expect( + "CoverageIterator: coverage underflow (likely malformed input: start > end)", + ); self.prev_pos = Some(pos); // Return the interval if we have one From e88a33aff78aaa69a9f6c74b40f36a53d104ac70 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Tue, 12 May 2026 16:26:36 -0400 Subject: [PATCH 10/17] refactor: Simplify and RLE-encode CoverageIterator The output is now a run-length-encoded coverage track: adjacent intervals with the same depth are merged, matching the conventional `bedtools genomecov -bg` semantics. Previously the iterator emitted at every event position, producing many same-depth intervals in plateau regions. The internal algorithm now uses a Peekable iterator for start events and an ends-only heap (one entry per active interval, half the size of the previous start+end heap). The next event position is the min of the next entry's start and the smallest pending end. The two-phase batching collapses to a single drain-starts + drain-ends pass per event position. `new()` no longer eagerly pulls the first entry. Renames: `heap` -> `ends`, `prev_pos` -> `run_start`. Behavior change for downstream consumers: `fill_binned` is unaffected (same total contribution per bin) but advances the iterator fewer times in dense regions. Two tests were updated to match the new output (adjacent same-coverage intervals merge); four new tests cover malformed input, sort violations, the many-same-position batching case, and a 10k-entry stress check. --- pybigtools/src/coverage.rs | 354 +++++++++++++++++++++++++------------ 1 file changed, 238 insertions(+), 116 deletions(-) diff --git a/pybigtools/src/coverage.rs b/pybigtools/src/coverage.rs index 5e0a2e0..794d5f5 100644 --- a/pybigtools/src/coverage.rs +++ b/pybigtools/src/coverage.rs @@ -6,7 +6,8 @@ use std::collections::BinaryHeap; /// /// This iterator consumes a stream of sorted, potentially overlapping BED entries and produces a /// stream of non-overlapping intervals whose values represent the coverage depth (number of -/// overlapping entries) over its range. +/// overlapping entries) over each range. Adjacent intervals with the same depth are merged, so +/// the output is a run-length-encoded coverage track. /// /// The input BED entries **must** be sorted by start position, then end position. /// @@ -35,11 +36,11 @@ use std::collections::BinaryHeap; /// /// // Result: [10-20): coverage=1, [20-30): coverage=2, [30-40): coverage=1 /// ``` -pub struct CoverageIterator { - iter: I, - heap: BinaryHeap>, +pub struct CoverageIterator { + iter: std::iter::Peekable, + ends: BinaryHeap>, coverage: u32, - prev_pos: Option, + run_start: Option, finished: bool, } @@ -47,26 +48,12 @@ impl CoverageIterator where I: Iterator>, { - pub fn new(mut iter: I) -> Result { - let mut heap = BinaryHeap::new(); - - // Initialize the heap - match iter.next() { - Some(Ok(entry)) => { - heap.push(Reverse((entry.start, 1))); - heap.push(Reverse((entry.end, -1))); - } - Some(Err(e)) => { - return Err(e); - } - None => {} - } - + pub fn new(iter: I) -> Result { Ok(Self { - iter, - heap, + iter: iter.peekable(), + ends: BinaryHeap::new(), coverage: 0, - prev_pos: None, + run_start: None, finished: false, }) } @@ -83,83 +70,88 @@ where return None; } - while !self.heap.is_empty() { - let Reverse((pos, inc)) = self.heap.pop().unwrap(); - - // Batch all events at the same position - // First, collect all same-position events already in the heap - let mut delta = inc; - while let Some(&Reverse((next_pos, inc))) = self.heap.peek() { - if next_pos == pos { - delta += inc; - self.heap.pop(); - } else { - break; + loop { + // Determine the next event position: the minimum of the next entry's start + // (peeked from the iterator) and the smallest pending end in the heap. + let next_start = match self.iter.peek() { + Some(Ok(e)) => Some(e.start), + Some(Err(_)) => { + // Surface the iterator error + let err = match self.iter.next() { + Some(Err(e)) => e, + _ => unreachable!(), + }; + self.finished = true; + return Some(Err(err)); } - } - - // If we have more entries with the current position, load them. - for result in self.iter.by_ref() { - match result { - Ok(entry) => { - self.heap.push(Reverse((entry.start, 1))); - self.heap.push(Reverse((entry.end, -1))); - - // If this entry contributes to the current position, process those events - // and remove them from the heap - let has_current_pos = entry.start == pos || entry.end == pos; - if has_current_pos { - while let Some(&Reverse((next_pos, inc))) = self.heap.peek() { - if next_pos == pos { - delta += inc; - self.heap.pop(); - } else { - break; - } - } - } else { - // No more events at current position, stop loading - break; - } - } - Err(e) => { - self.finished = true; - return Some(Err(e)); - } + None => None, + }; + let next_end = self.ends.peek().map(|&Reverse(p)| p); + let pos = match (next_start, next_end) { + (Some(s), Some(e)) => s.min(e), + (Some(s), None) => s, + (None, Some(e)) => e, + (None, None) => { + self.finished = true; + return None; } + }; + + // Process all events at `pos`. Start events come from the iterator (in sort order); + // end events come from the heap. Zero-length entries (start == end == pos) push an + // end into the heap during the start drain that is then consumed by the end drain + // in the same iteration -- so they contribute net 0 to delta, as expected. + let mut delta: i32 = 0; + + // Drain start events at `pos` + while matches!(self.iter.peek(), Some(Ok(e)) if e.start == pos) { + let entry = match self.iter.next() { + Some(Ok(e)) => e, + _ => unreachable!(), + }; + delta += 1; + self.ends.push(Reverse(entry.end)); } - // Check if we need to emit an interval - let interval_to_emit = if let Some(start) = self.prev_pos { - if self.coverage > 0 && start < pos { - Some(Value { - start, - end: pos, - value: self.coverage as f32, - }) - } else { - None - } - } else { - None - }; + // Drain end events at `pos` (including any just pushed by zero-length entries) + while matches!(self.ends.peek(), Some(&Reverse(p)) if p == pos) { + self.ends.pop(); + delta -= 1; + } - // Update coverage and current position with all batched events. - // `checked_add_signed` surfaces malformed input (e.g. an entry with start > end, - // which queues a -1 event before its matching +1) as a panic. - self.coverage = self.coverage.checked_add_signed(delta).expect( + // Update coverage. `checked_add_signed` surfaces malformed input (e.g. an entry + // with start > end, which would queue a -1 event before its matching +1) as a + // panic rather than silent wrong output. + let old_cov = self.coverage; + let new_cov = old_cov.checked_add_signed(delta).expect( "CoverageIterator: coverage underflow (likely malformed input: start > end)", ); - self.prev_pos = Some(pos); - // Return the interval if we have one + // If coverage didn't change at this event position, the current run continues; + // no emit, no state update. This collapses adjacent same-depth intervals + // (run-length encoding), matching the conventional "coverage track" semantics + if old_cov == new_cov { + continue; + } + + // Close the previous run if there was one. Skip emit if `run_start >= pos`, + // which only happens under malformed or out-of-sort-order input. + let interval_to_emit = match self.run_start { + Some(start) if old_cov > 0 && start < pos => Some(Value { + start, + end: pos, + value: old_cov as f32, + }), + _ => None, + }; + + self.coverage = new_cov; + self.run_start = if new_cov > 0 { Some(pos) } else { None }; + if let Some(interval) = interval_to_emit { return Some(Ok(interval)); } } - - self.finished = true; - None } } @@ -486,28 +478,12 @@ mod tests { let result: Result, _> = coverage_iter.collect(); let intervals = result.unwrap(); - // Expected: [10-20): coverage=1, [20-30): coverage=1, [30-40): coverage=1 - assert_eq!(intervals.len(), 3); + // Adjacent intervals with the same coverage are merged: [10-40): coverage=1 + assert_eq!(intervals.len(), 1); assert_eq!( intervals[0], Value { start: 10, - end: 20, - value: 1.0 - } - ); - assert_eq!( - intervals[1], - Value { - start: 20, - end: 30, - value: 1.0 - } - ); - assert_eq!( - intervals[2], - Value { - start: 30, end: 40, value: 1.0 } @@ -874,25 +850,18 @@ mod tests { let result: Result, _> = coverage_iter.collect(); let intervals = result.unwrap(); - assert_eq!(intervals.len(), 3); + // Adjacent intervals [0,1) and [1,2) with the same coverage merge into [0,2) + assert_eq!(intervals.len(), 2); assert_eq!( intervals[0], Value { start: 0, - end: 1, - value: 1.0 - } - ); - assert_eq!( - intervals[1], - Value { - start: 1, end: 2, value: 1.0 } ); assert_eq!( - intervals[2], + intervals[1], Value { start: 4294967290, end: 4294967295, @@ -1040,4 +1009,157 @@ mod tests { } ); } + + #[test] + fn test_malformed_start_greater_than_end() { + // An entry with start > end is malformed. The current algorithm doesn't crash + // (the end is only pushed to the heap after the start is consumed, so the +1 always + // precedes the -1 and coverage stays non-negative). It emits no intervals because + // the end position is less than prev_pos, failing the emit check. This test pins + // that behavior so a future change that errors out is visible. + let entries = vec![Ok(BedEntry { + start: 20, + end: 10, + rest: "malformed".to_string(), + })]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!(intervals, vec![]); + } + + #[test] + fn test_sort_violation_produces_incorrect_output() { + // Out-of-order input is a documented precondition violation. The algorithm + // doesn't detect it and produces incorrect (but non-panicking) output. This test + // pins the current behavior so any future fix (e.g. error or debug assertion) is + // visible as a test change. + // + // Correct output for sorted [(5,15), (10,20)] would be: + // [5-10): cov=1, [10-15): cov=2, [15-20): cov=1 + // With the entries presented out of order, we instead get a collapsed range at + // cov=2 covering the entire [5, 15) span. + let entries = vec![ + Ok(BedEntry { + start: 10, + end: 20, + rest: "first".to_string(), + }), + Ok(BedEntry { + start: 5, + end: 15, + rest: "out_of_order".to_string(), + }), + ]; + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!( + intervals, + vec![ + Value { + start: 5, + end: 15, + value: 2.0, + }, + Value { + start: 15, + end: 20, + value: 1.0, + }, + ] + ); + } + + #[test] + fn test_many_same_position() { + // 1000 entries all starting at the same position. Stresses the start-drain loop + // which has to consume all of them in a single outer iteration. Output should be + // a single interval at the full depth. + let n = 1000u32; + let entries: Vec> = (0..n) + .map(|_| { + Ok(BedEntry { + start: 100, + end: 200, + rest: String::new(), + }) + }) + .collect(); + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + assert_eq!( + intervals, + vec![Value { + start: 100, + end: 200, + value: n as f32, + }] + ); + } + + #[test] + fn test_stress_many_overlapping() { + // 10k entries with high pairwise overlap. Stresses the heap (peak depth ~ `width`) + // and verifies correctness via algorithmic invariants: + // 1. Total "base-coverage units" emitted equals the sum of input entry lengths. + // 2. Intervals are non-overlapping, sorted, and contiguous (no zero-coverage gaps + // in a region covered by at least one entry). + // 3. The full plateau region (positions `width-1 .. n`) is at coverage == `width`. + // + // We deliberately don't compare to a brute-force RLE reference because the iterator + // emits at every event position even when coverage doesn't change between events + // (which a coverage-based RLE would merge). + let n: u32 = 10_000; + let width: u32 = 100; + + let entries: Vec> = (0..n) + .map(|i| { + Ok(BedEntry { + start: i, + end: i + width, + rest: String::new(), + }) + }) + .collect(); + + let coverage_iter = CoverageIterator::new(entries.into_iter()).unwrap(); + let result: Result, _> = coverage_iter.collect(); + let intervals = result.unwrap(); + + // Invariant 1: total base-coverage units == sum of input entry lengths + let total_base_cov: u64 = intervals + .iter() + .map(|v| (v.end - v.start) as u64 * v.value as u64) + .sum(); + let expected_total: u64 = n as u64 * width as u64; + assert_eq!(total_base_cov, expected_total); + + // Invariant 2: intervals are sorted, contiguous, all coverage > 0 + for v in &intervals { + assert!(v.value > 0.0, "zero-coverage interval emitted"); + assert!(v.start < v.end, "degenerate interval emitted"); + } + for w in intervals.windows(2) { + assert_eq!( + w[0].end, w[1].start, + "non-contiguous intervals (no entry has cov=0 in this fixture)" + ); + } + + // Invariant 3: plateau coverage equals `width` + let mid_pos = n / 2; + let at_mid = intervals + .iter() + .find(|v| v.start <= mid_pos && v.end > mid_pos) + .expect("no interval covers mid_pos"); + assert_eq!(at_mid.value, width as f32); + } } From 41eab86cfb431171eedde560180173ef1be8eee2 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 13 May 2026 00:39:43 -0400 Subject: [PATCH 11/17] refactor: Separate uncovered base treatment from post-fill semantics in values() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reshape the `values()` API around two orthogonal concerns: - `uncovered: Option[float]`: per-base value assigned to uncovered bases during stat computation. `None` (default) excludes them from the summary (classical statistics over only the covered bases); `0.0` reproduces UCSC's `*0` statistics; any other finite value is a well-defined generalization. For `bases_covered` and `bin_covered` this is ignored — those always report counts of actual data bases. - `fillna: Option[float]`: post-rasterization fill for any NaN positions or bins that remain after stat computation. Independent of `oob`: out-of-chromosome positions take the `oob` value and are not touched by `fillna`, so the two parameters control different concerns. Drops the `Mean0`, `Std0`, `Min0`, `Max0` enum variants and string mappings — they're now expressible as the regular summaries with `uncovered=0.0`. Drops the previous `missing` parameter, which was overloaded between "per-base fill" (base-level) and "empty-bin fill" (binned) semantics. --- pybigtools/src/raster.rs | 378 ++++++++++++++++++++++---------- pybigtools/src/reader.rs | 49 +++-- pybigtools/tests/test_api.py | 73 +++--- pybigtools/tests/test_raster.py | 252 ++++++++++++++------- 4 files changed, 513 insertions(+), 239 deletions(-) diff --git a/pybigtools/src/raster.rs b/pybigtools/src/raster.rs index 3b6215c..8f57d49 100644 --- a/pybigtools/src/raster.rs +++ b/pybigtools/src/raster.rs @@ -14,10 +14,6 @@ pub enum Summary { Std, Min, Max, - Mean0, - Std0, - Min0, - Max0, Sum, SumSquares, BasesCovered, @@ -103,13 +99,14 @@ pub fn intervals_to_array( bins: Option, summary: Summary, exact: bool, - missing: f64, + uncovered: Option, oob: f64, + fillna: Option, arr: Option, ) -> PyResult { let mut bbi = BBIRead::BigWig(bw); records_to_array( - py, &mut bbi, chrom, start, end, bins, summary, exact, missing, oob, arr, + py, &mut bbi, chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ) } @@ -122,13 +119,14 @@ pub fn entries_to_array( bins: Option, summary: Summary, exact: bool, - missing: f64, + uncovered: Option, oob: f64, + fillna: Option, arr: Option, ) -> PyResult { let mut bbi = BBIRead::BigBed(bb); records_to_array( - py, &mut bbi, chrom, start, end, bins, summary, exact, missing, oob, arr, + py, &mut bbi, chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ) } @@ -141,15 +139,17 @@ fn records_to_array( bins: Option, summary: Summary, exact: bool, - missing: f64, + uncovered: Option, oob: f64, + fillna: Option, arr: Option, ) -> PyResult { let (start, end, chrom_length) = bbi.resolve_range(chrom, start, end)?; let (valid_start, valid_end) = (start.max(0) as u32, end.min(chrom_length) as u32); let arr = arr.unwrap_or_else(|| { let size = bins.unwrap_or((end - start) as usize); - PyArray1::from_vec_bound(py, vec![missing; size]).to_object(py) + let init = uncovered.unwrap_or(f64::NAN); + PyArray1::from_vec_bound(py, vec![init; size]).to_object(py) }); let array: &Bound<'_, PyArray1> = arr.downcast_bound::>(py).map_err(|_| { @@ -177,14 +177,14 @@ fn records_to_array( .get_interval(chrom, valid_start, valid_end) .convert_err()? .map(|item| item.map(BBIRecord::Value)); - fill_values(start, end, iter, missing, view).convert_err()?; + fill_values(start, end, iter, uncovered, view).convert_err()?; } BBIRead::BigBed(bb) => { let iter = bb .get_interval(chrom, valid_start, valid_end) .convert_err()? .map(|item| item.map(BBIRecord::BedEntry)); - fill_values(start, end, iter, missing, view).convert_err()?; + fill_values(start, end, iter, uncovered, view).convert_err()?; } } } @@ -212,7 +212,7 @@ fn records_to_array( .get_zoom_interval(chrom, valid_start, valid_end, reduction_level) .convert_err()? .map(|item| item.map(BBIRecord::ZoomRecord)); - fill_binned(start, end, iter, summary, bins, missing, view) + fill_binned(start, end, iter, summary, bins, uncovered, view) .convert_err()?; } BBIRead::BigBed(bb) => { @@ -220,7 +220,7 @@ fn records_to_array( .get_zoom_interval(chrom, valid_start, valid_end, reduction_level) .convert_err()? .map(|item| item.map(BBIRecord::ZoomRecord)); - fill_binned(start, end, iter, summary, bins, missing, view) + fill_binned(start, end, iter, summary, bins, uncovered, view) .convert_err()?; } }, @@ -231,7 +231,7 @@ fn records_to_array( .get_interval(chrom, valid_start, valid_end) .convert_err()? .map(|item| item.map(BBIRecord::Value)); - fill_binned(start, end, iter, summary, bins, missing, view) + fill_binned(start, end, iter, summary, bins, uncovered, view) .convert_err()?; } BBIRead::BigBed(bb) => { @@ -241,7 +241,7 @@ fn records_to_array( let coverage_iter = CoverageIterator::new(iter) .convert_err()? .map(|result| result.map(BBIRecord::Value)); - fill_binned(start, end, coverage_iter, summary, bins, missing, view) + fill_binned(start, end, coverage_iter, summary, bins, uncovered, view) .convert_err()?; } }, @@ -249,10 +249,19 @@ fn records_to_array( } }; - // Handle out-of-bounds + // Post-process fill for NaN positions. Applied before filling out-of-bounds. let mut array = array.readwrite(); - let view = array.as_array_mut(); - fill_out_of_bounds(start, end, chrom_length, bins, oob, view); + let mut view = array.as_array_mut(); + if let Some(fill) = fillna { + for v in view.iter_mut() { + if v.is_nan() { + *v = fill; + } + } + } + + // Handle out-of-bounds (bins/positions that overlap out-of-chromosome coordinates). + fill_out_of_bounds(start, end, chrom_length, bins, oob, view.view_mut()); Ok(arr) } @@ -267,13 +276,13 @@ fn records_to_array( /// * `start` - Start coordinate of the target range /// * `end` - End coordinate of the target range /// * `iter` - Iterator of data intervals -/// * `missing` - Value to use for bases with no data +/// * `uncovered` - Value to use for bases with no data; `None` leaves them as NaN /// * `view` - Mutable numpy array view to fill (must have length == end - start) fn fill_values>>( start: i32, end: i32, iter: I, - missing: f64, + uncovered: Option, mut view: ArrayViewMut<'_, f64, numpy::Ix1>, ) -> Result<(), BBIReadError> { assert_eq!(view.len(), (end - start) as usize); @@ -311,9 +320,13 @@ fn fill_values>>( } } - // Finalize - for val in view.iter_mut() { - *val = if val.is_nan() { missing } else { *val }; + // Finalize: replace remaining NaN with `uncovered` if a fill was requested. + if let Some(fill) = uncovered { + for val in view.iter_mut() { + if val.is_nan() { + *val = fill; + } + } } Ok(()) @@ -336,13 +349,19 @@ fn fill_values>>( /// is invariant. This only has an effect on the [`ZoomRecord`] path, where partial overlap of /// a zoom record can produce a fractional share of its `validCount`. /// +/// `uncovered` is the per-base value assigned to uncovered bases within each bin. `None` +/// means "exclude from the summary" and yields the classical statistics over only the +/// covered bases. `Some(v)` (commonly `Some(0.0)`) means "treat uncovered bases as having +/// value `v`"; passing `Some(0.0)` reproduces UCSC's `*0` summary statistics. The choice +/// applies uniformly: there are no separate `*0` variants of `Mean`, `Std`, `Min`, `Max`. +/// /// # Arguments /// * `start` - Start coordinate of the target range -/// * `end` - End coordinate of the target range +/// * `end` - End coordinate of the target range /// * `iter` - Iterator of data intervals /// * `summary` - How to aggregate values /// * `bins` - Number of bins to divide the range into -/// * `missing` - Value to use for bins with no intersecting data +/// * `uncovered` - Per-base value for uncovered bases; `None` excludes them /// * `view` - Mutable numpy array view to fill (must have length == bins) fn fill_binned>>( start: i32, @@ -350,7 +369,7 @@ fn fill_binned>>( iter: I, summary: Summary, bins: usize, - missing: f64, + uncovered: Option, mut view: ArrayViewMut<'_, f64, numpy::Ix1>, ) -> Result<(), BBIReadError> { assert_eq!(view.len(), bins); @@ -404,18 +423,9 @@ fn fill_binned>>( // Accumulate contribution of interval to each overlapping bin for i in first_bin..=last_bin { - // Round bin edges down to the nearest integer (base) for overlap calculation - let bin_start = start + (i as f64 * bin_size) as i32; - let bin_end = start + ((i + 1) as f64 * bin_size) as i32; - - // For upsampling (case where bin_size < 1), ensure at least one base of overlap is counted - let bin_end = if bin_start == bin_end { - bin_start + 1 - } else { - bin_end - }; + let (bin_start, bin_end) = integer_bin_bounds(start, i, bin_size); - // Calculate integer overlap between interval and adjusted bin + // Calculate integer overlap between interval and bin let overlap_start = interval_start.max(bin_start); let overlap_end = interval_end.min(bin_end); let overlap = (overlap_end - overlap_start) as f64; @@ -469,91 +479,137 @@ fn fill_binned>>( } } - // Fill the output array + // Fill the output array. + // + // `uncovered` is the per-base value assigned to uncovered bases within each bin: + // - `None`: uncovered bases are excluded from the summary computation + // ("regular" statistics over just the covered bases). For a fully-uncovered + // bin the result is NaN for stats over an empty set; for `BasesCovered` and + // `BinCovered` the result is the factual 0. + // - `Some(v)`: uncovered bases are folded into the summary as if they had + // value `v`. `Some(0.0)` reproduces UCSC's `*0` statistics (e.g. + // `mean0 = sum / bin_size`). Other values are well-defined generalizations. + // + // `BasesCovered` and `BinCovered` always report counts of *actual* data bases, + // independent of `uncovered`. for i in 0..bins { - if stats[i].bases_covered == 0.0 { - *view.index_mut(i) = missing; - } else { - match summary { - Summary::Mean => { - *view.index_mut(i) = stats[i].sum / stats[i].bases_covered; + let (bin_start, bin_end) = integer_bin_bounds(start, i, bin_size); + let bin_width = (bin_end - bin_start) as f64; + let n_covered = stats[i].bases_covered; + let n_uncovered = bin_width - n_covered; + + // Effective sum/sum_squares and denominator after optionally folding in + // `uncovered`. When `uncovered` is `None`, only covered bases contribute. + let (s, ss, n) = match uncovered { + None => (stats[i].sum, stats[i].sum_squares, n_covered), + Some(u) => ( + stats[i].sum + u * n_uncovered, + stats[i].sum_squares + u * u * n_uncovered, + bin_width, + ), + }; + + *view.index_mut(i) = match summary { + Summary::Mean => { + if n > 0.0 { + s / n + } else { + f64::NAN } - Summary::Std => { - let n = stats[i].bases_covered; - let mut variance = stats[i].sum_squares - (stats[i].sum * stats[i].sum) / n; + } + Summary::Std => { + if n < 1.0 { + f64::NAN + } else { + let mut variance = ss - s * s / n; if n > 1.0 { variance /= n - 1.0; } - *view.index_mut(i) = variance.sqrt(); - } - Summary::Min => { - *view.index_mut(i) = stats[i].min; + variance.sqrt() } - Summary::Max => { - *view.index_mut(i) = stats[i].max; + } + Summary::Sum => { + if n > 0.0 { + s + } else { + f64::NAN } - Summary::Mean0 => { - *view.index_mut(i) = stats[i].sum / bin_size; + } + Summary::SumSquares => { + if n > 0.0 { + ss + } else { + f64::NAN } - Summary::Std0 => { - let n = bin_size; - let mut variance = stats[i].sum_squares - (stats[i].sum * stats[i].sum) / n; - if n > 1.0 { - variance /= n - 1.0; + } + Summary::Min => match uncovered { + None => { + if n_covered > 0.0 { + stats[i].min + } else { + f64::NAN } - *view.index_mut(i) = variance.sqrt(); } - Summary::Min0 => { - // Use integer-adjusted bin size when checking for partial coverage - let bin_start = start + (i as f64 * bin_size) as i32; - let bin_end = start + ((i + 1) as f64 * bin_size) as i32; - let adj_bin_size = (bin_end - bin_start) as f64; - let adj_bin_size = if adj_bin_size == 0.0 { - 1.0 + Some(u) if n_uncovered > 0.0 => { + if n_covered > 0.0 { + stats[i].min.min(u) } else { - adj_bin_size - }; - *view.index_mut(i) = if stats[i].bases_covered < adj_bin_size { - stats[i].min.min(0.0) - } else { - stats[i].min + u } } - Summary::Max0 => { - // Use integer-adjusted bin size when checking for partial coverage - let bin_start = start + (i as f64 * bin_size) as i32; - let bin_end = start + ((i + 1) as f64 * bin_size) as i32; - let adj_bin_size = (bin_end - bin_start) as f64; - let adj_bin_size = if adj_bin_size == 0.0 { - 1.0 - } else { - adj_bin_size - }; - *view.index_mut(i) = if stats[i].bases_covered < adj_bin_size { - stats[i].max.max(0.0) - } else { + Some(_) => stats[i].min, + }, + Summary::Max => match uncovered { + None => { + if n_covered > 0.0 { stats[i].max + } else { + f64::NAN } } - Summary::Sum => { - *view.index_mut(i) = stats[i].sum; - } - Summary::SumSquares => { - *view.index_mut(i) = stats[i].sum_squares; - } - Summary::BasesCovered => { - *view.index_mut(i) = stats[i].bases_covered; - } - Summary::BinCovered => { - *view.index_mut(i) = stats[i].bases_covered / bin_size; + Some(u) if n_uncovered > 0.0 => { + if n_covered > 0.0 { + stats[i].max.max(u) + } else { + u + } } - } - } + Some(_) => stats[i].max, + }, + Summary::BasesCovered => n_covered, + Summary::BinCovered => n_covered / bin_size, + }; } Ok(()) } +/// Compute the integer-aligned base coordinates of the bin at index `i`. +/// +/// Float-valued `bin_size` does not generally fall on integer base boundaries, but the +/// underlying data is indexed by integer positions. We resolve this by truncating each +/// bin edge: bin `i` nominally spans +/// `[start + floor(i * bin_size), start + floor((i + 1) * bin_size))`. +/// +/// Consequence: for non-integer `bin_size`, adjacent bins can have integer widths that +/// differ by 1 (e.g. for `bin_size = 2.4`, widths alternate 2, 2, 3, 2, 3) even though +/// each bin "intends" to span `bin_size` bases. The integer width returned here is the +/// actual number of base positions assigned to the bin. +/// +/// For upsampling (`bin_size < 1`), the truncation can collapse a bin to zero integer +/// width. We expand such bins to span a single base so that overlap and division +/// operations stay well-defined; consecutive upsampled bins may then share the same +/// underlying base. +fn integer_bin_bounds(start: i32, i: usize, bin_size: f64) -> (i32, i32) { + let bin_start = start + (i as f64 * bin_size) as i32; + let bin_end = start + ((i + 1) as f64 * bin_size) as i32; + if bin_start == bin_end { + (bin_start, bin_start + 1) + } else { + (bin_start, bin_end) + } +} + /// Fill out-of-bounds elements in the array with a specified value. /// /// If a bin overlaps any base coordinates outside the valid range of the chromosome, it is filled @@ -619,64 +675,68 @@ mod tests { intervals.into_iter().map(BBIRecord::Value).map(Ok), Summary::Mean, n_bins, - 0.0, + None, arr.view_mut(), ) .unwrap(); // First bin (0-10): overlaps 5-10 = 5 bp with value 2.0 // Second bin (10-20): overlaps 10-15 = 5 bp with value 2.0 - // Both should have value 2.0 + // uncovered=None excludes uncovered bases → mean of just the covered bp. assert_eq!(arr.to_vec(), vec![2.0, 2.0]); } #[test] fn test_fill_binned_partial_overlap() { + // Bin 0 [0,10): overlaps [5,8) — 3 bp at value 4.0 + // Bin 1 [10,20): no overlap let start = 0; let end = 20; let n_bins = 2; - let mut arr = Array::from(vec![0.0; n_bins]); + // uncovered = None ("exclude uncovered" / classical statistics). + // Bin 0: mean over the 3 covered bases = 4.0. + // Bin 1: fully uncovered → NaN. let intervals = vec![Value { start: 5, end: 8, value: 4.0, }]; + let mut arr = Array::from(vec![0.0; n_bins]); fill_binned( start, end, intervals.into_iter().map(BBIRecord::Value).map(Ok), Summary::Mean, n_bins, - -1.0, + None, arr.view_mut(), ) .unwrap(); + assert!((arr[0] - 4.0).abs() < 1e-12); + assert!(arr[1].is_nan()); - // First bin (0-10): overlaps 5-8 = 3 bp with value 4.0 - // Second bin (10-20): no overlap - assert_eq!(arr.to_vec(), vec![4.0, -1.0]); - + // uncovered = Some(0.0) (UCSC mean0 semantics: uncovered bases count as 0). + // Bin 0: (4.0 * 3 + 0.0 * 7) / 10 = 1.2. + // Bin 1: (0.0 * 10) / 10 = 0.0. let intervals = vec![Value { start: 5, end: 8, value: 4.0, }]; + let mut arr = Array::from(vec![0.0; n_bins]); fill_binned( start, end, intervals.into_iter().map(BBIRecord::Value).map(Ok), - Summary::Mean0, + Summary::Mean, n_bins, - -1.0, + Some(0.0), arr.view_mut(), ) .unwrap(); - - // First bin (0-10): overlaps 5-8 = 3 bp with value 4.0 - // Second bin (10-20): no overlap - assert!((arr[0] - (4.0 * 3.0 / 10.0)).abs() < 0.01); - assert!((arr[1] - (-1.0)).abs() < 0.01); + assert!((arr[0] - 1.2).abs() < 1e-12); + assert!((arr[1] - 0.0).abs() < 1e-12); } #[test] @@ -705,7 +765,7 @@ mod tests { intervals.clone().into_iter().map(BBIRecord::Value).map(Ok), Summary::Min, n_bins, - -1.0, + Some(-1.0), arr.view_mut(), ) .unwrap(); @@ -722,7 +782,7 @@ mod tests { intervals.into_iter().map(BBIRecord::Value).map(Ok), Summary::Max, n_bins, - -1.0, + Some(-1.0), arr.view_mut(), ) .unwrap(); @@ -782,7 +842,7 @@ mod tests { intervals.into_iter().map(BBIRecord::Value).map(Ok), Summary::Mean, n_bins, - 0.0, + None, arr.view_mut(), ) .unwrap(); @@ -798,4 +858,84 @@ mod tests { ); } } + + #[test] + fn test_fill_binned_min_max_with_uncovered_zero_noninteger_binsize() { + // 5 bins over range 0..12, bin_size = 2.4 + // Integer-aligned bin spans: [0,2), [2,4), [4,7), [7,9), [9,12) + // Widths: 2 2 3 2 3 + // + // Single Value covering [0, 5) with value=5 covers bins 0 and 1 fully and + // overlaps bin 2 partially (1 of 3 bases, at position 4). Bins 3 and 4 have + // no coverage. + // + // With `uncovered = 0.0`, uncovered bases are folded into min/max as zeros. + // The integer-adjusted bin width is load-bearing: comparing `bases_covered` + // (=2 for bin 0) against the float `bin_size` (=2.4) would falsely report + // bin 0 as partially covered, contaminating min/max with 0. Using the actual + // integer bin width (=2) correctly identifies bin 0 as fully covered. + let start = 0; + let end = 12; + let n_bins = 5; + let uncovered = Some(0.0); + + // Positive value: min should reveal the "contamination" bug if present + // (would pull min from 5 down to 0 in the fully-covered bins). + let intervals = vec![Value { + start: 0, + end: 5, + value: 5.0, + }]; + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.clone().into_iter().map(BBIRecord::Value).map(Ok), + Summary::Min, + n_bins, + uncovered, + arr.view_mut(), + ) + .unwrap(); + // Bin 0, 1 fully covered → min = 5. Bin 2 partial → min(5, 0) = 0. + // Bins 3, 4 fully uncovered → uncovered (0). + assert_eq!(arr.to_vec(), vec![5.0, 5.0, 0.0, 0.0, 0.0]); + + // Max with a negative value: partial coverage should pull max *up* to 0. + let intervals_neg = vec![Value { + start: 0, + end: 5, + value: -5.0, + }]; + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals_neg.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Max, + n_bins, + uncovered, + arr.view_mut(), + ) + .unwrap(); + // Bins 0, 1 fully covered → max = -5. Bin 2 partial → max(-5, 0) = 0. + // Bins 3, 4 fully uncovered → uncovered (0). + assert_eq!(arr.to_vec(), vec![-5.0, -5.0, 0.0, 0.0, 0.0]); + + // Max with positive value: partial coverage doesn't change max since + // max(5, 0) == 5; the test pins that the integer-aligned check doesn't + // accidentally pull max *down* in a fully-covered bin. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + intervals.into_iter().map(BBIRecord::Value).map(Ok), + Summary::Max, + n_bins, + uncovered, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![5.0, 5.0, 5.0, 0.0, 0.0]); + } } diff --git a/pybigtools/src/reader.rs b/pybigtools/src/reader.rs index 718da36..67adf3d 100644 --- a/pybigtools/src/reader.rs +++ b/pybigtools/src/reader.rs @@ -533,17 +533,31 @@ impl BBIReader { /// If provided, the query interval will be divided into equally spaced /// bins and the values in each bin will be interpolated or summarized. /// If not provided, the values will be returned for each base. - /// summary : Literal["mean", "min", "max"], optional [default: "mean"] - /// The summary statistic to use. Currently supported statistics are - /// ``mean``, ``min``, and ``max``. + /// summary : str, optional [default: "mean"] + /// The summary statistic to use. One of ``mean``, ``std``, ``min``, + /// ``max``, ``sum``, ``sum_squares``, ``bases_covered``, + /// ``bin_covered``. /// exact : bool, optional [default: False] /// If True and ``bins`` is specified, return exact summary statistic /// values instead of interpolating from the optimal zoom level. /// Default is False. - /// missing : float, optional [default: 0.0] - /// Fill-in value for unreported data in valid regions. Default is 0. + /// uncovered : float or None, optional [default: None] + /// The value assigned to all uncovered bases. If ``None``, uncovered + /// bases are excluded from summary statistic calculations, and empty + /// positions or bins will be returned as NaN (subject to ``fillna``). + /// To treat uncovered bases as having a value of zero in summary + /// statistics (like UCSC's ``mean0``) set this parameter to ``0.0``. + /// Empty positions or bins will also be returned as ``0.0``. Other + /// finite values are also valid and will be used in the same way. + /// This parameter is ignored in the cases of ``bases_covered`` and + /// ``bin_covered`` summaries since they exclude uncovered bases by + /// definition. /// oob : float, optional [default: NaN] /// Fill-in value for out-of-bounds regions. Default is NaN. + /// fillna : float or None, optional [default: None] + /// Post-rasterization fill applied to in-bounds positions or bins that + /// are returned as NaN due to being empty. Default ``None`` leaves + /// NaN values untouched. /// arr : numpy.ndarray, optional /// If provided, the values will be written to this array or array /// view. The array must be of the correct size and type. @@ -574,8 +588,8 @@ impl BBIReader { /// records : Get the records of a given range on a chromosome. /// zoom_records : Get the zoom records of a given range on a chromosome. #[pyo3( - signature = (chrom, start=None, end=None, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None), - text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, missing=0.0, oob=..., arr=None)"#, + signature = (chrom, start=None, end=None, bins=None, summary="mean".to_string(), exact=false, uncovered=None, oob=f64::NAN, fillna=None, arr=None), + text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, uncovered=None, oob=..., fillna=None, arr=None)"#, )] #[allow(clippy::too_many_arguments)] fn values( @@ -587,8 +601,9 @@ impl BBIReader { bins: Option, summary: String, exact: bool, - missing: f64, + uncovered: Option, oob: f64, + fillna: Option, arr: Option, ) -> PyResult { let summary = match summary.as_ref() { @@ -596,10 +611,6 @@ impl BBIReader { "std" => Summary::Std, "min" => Summary::Min, "max" => Summary::Max, - "mean0" => Summary::Mean0, - "std0" => Summary::Std0, - "min0" => Summary::Min0, - "max0" => Summary::Max0, "sum" => Summary::Sum, "sum_squares" => Summary::SumSquares, "bases_covered" => Summary::BasesCovered, @@ -611,27 +622,29 @@ impl BBIReader { ))); } }; + // Passing `uncovered=NaN` is equivalent to `uncovered=None` (exclude uncovered bases). + let uncovered = uncovered.and_then(|v| if v.is_nan() { None } else { Some(v) }); match &mut self.bbi { BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => intervals_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(b) => intervals_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), BBIReadRaw::BigWigFileLike(b) => intervals_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), BBIReadRaw::BigBedFile(b) => entries_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), #[cfg(feature = "remote")] BBIReadRaw::BigBedRemote(b) => entries_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), BBIReadRaw::BigBedFileLike(b) => entries_to_array( - py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, uncovered, oob, fillna, arr, ), } } diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 7f767ef..7fabac9 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -276,25 +276,47 @@ def test_values_binned(bw, bb): assert len(bw.values("chr17", 100000, 110000, 10)) == 10 assert len(bb.values("chr21", 10_148_000, 10_158_000, 10)) == 10 - assert bw.values("chr17", 100000, 110000, 10)[0] == 0.44886381671868925 - assert bb.values("chr21", 10_148_000, 10_158_000, 10)[0] == 1.0 + nan = float("nan") + assert ( + bw.values("chr17", 100000, 110000, 10, uncovered=nan)[0] + == 0.44886381671868925 + ) + assert bb.values("chr21", 10_148_000, 10_158_000, 10, uncovered=nan)[0] == 1.0 - assert bw.values("chr17", 100000, 110000, 10, "max")[0] == 1.1978399753570557 - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "max")[0] == 1.0 + assert ( + bw.values("chr17", 100000, 110000, 10, "max", uncovered=nan)[0] + == 1.1978399753570557 + ) + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, "max", uncovered=nan)[0] == 1.0 + ) - assert bw.values("chr17", 100000, 110000, 10, "min")[0] == 0.05403999984264374 - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min")[0] == 1.0 - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min0")[0] == 0.0 + assert ( + bw.values("chr17", 100000, 110000, 10, "min", uncovered=nan)[0] + == 0.05403999984264374 + ) + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=nan)[0] == 1.0 + ) + # uncovered=0: uncovered bases counted as 0 in min (UCSC min0 semantics). + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=0)[0] == 0.0 + ) def test_values_binned_exact(bw, bb): + nan = float("nan") assert np.isclose( - bw.values("chr17", 100000, 110000, 10, "mean", exact=True)[0], + bw.values("chr17", 100000, 110000, 10, "mean", exact=True, uncovered=nan)[0], 0.4542629980980206, ) - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "mean", exact=True)[0] == 1.0 + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, "mean", exact=True, uncovered=nan)[0] + == 1.0 + ) + assert np.allclose( - bw.values("chr17", 59890, 59900, 10, "mean", exact=True), + bw.values("chr17", 59890, 59900, 10, "mean", exact=True, uncovered=0.0), [ 0.0, 0.0, @@ -310,9 +332,9 @@ def test_values_binned_exact(bw, bb): ) -def test_values_binned_missing_oob(bw, bb): +def test_values_binned_uncovered_oob(bw, bb): assert np.allclose( - bw.values("chr17", 59890, 59900, 10, "mean", exact=True, missing=-1.0), + bw.values("chr17", 59890, 59900, 10, "mean", exact=True, uncovered=-1.0), [ -1.0, -1.0, @@ -327,37 +349,38 @@ def test_values_binned_missing_oob(bw, bb): ], ) - x = bw.values("chr17", -10, 10, 20, "mean", exact=True, missing=0.0) + x = bw.values("chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0) assert math.isnan(x[0]) assert not math.isnan(x[19]) - x = bb.values("chr21", -10, 10, 20, "mean", exact=True, missing=0.0) + x = bb.values("chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0) assert math.isnan(x[0]) assert not math.isnan(x[19]) - x = bw.values("chr17", -10, 10, 20, "mean", exact=True, missing=0.0, oob=0.0) + x = bw.values("chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=0.0) assert x[0] == 0.0 - x = bb.values("chr21", -10, 10, 20, "mean", exact=True, missing=0.0, oob=0.0) + x = bb.values("chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=0.0) assert x[0] == 0.0 assert np.isclose(bw.values("chr17", 59890, 59900)[9], 0.06792) - assert np.isclose(bw.values("chr17", 59890, 59900, missing=np.nan)[9], 0.06792) + assert np.isclose(bw.values("chr17", 59890, 59900, uncovered=np.nan)[9], 0.06792) def test_values_binned_exact_vs_from_zoom(bw, bb): - vals = bw.values("chr17", 85525, 85730, bins=2, exact=False) + nan = float("nan") + vals = bw.values("chr17", 85525, 85730, bins=2, exact=False, uncovered=nan) assert np.allclose(vals, [0.10737355, 2.72889167]) - vals = bw.values("chr17", 85525, 85730, bins=2, exact=True) + vals = bw.values("chr17", 85525, 85730, bins=2, exact=True, uncovered=nan) assert np.allclose(vals, [0.06770935, 2.48644244]) - vals = bw.values("chr17", 59900, 60105, bins=2, exact=False) + vals = bw.values("chr17", 59900, 60105, bins=2, exact=False, uncovered=nan) assert np.allclose(vals, [0.53580606, 0.55134715]) - vals = bw.values("chr17", 59900, 60105, bins=2, exact=True) + vals = bw.values("chr17", 59900, 60105, bins=2, exact=True, uncovered=nan) assert np.allclose(vals, [0.53620019, 0.55277108]) - vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=False) + vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=False, uncovered=nan) assert np.allclose(vals, [1.23191877]) - vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=True) + vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=True, uncovered=nan) assert np.allclose(vals, [1.34086629]) @@ -366,7 +389,7 @@ def test_values_assign_to_array(bw, bb): arr = np.zeros(20) ret_arr = bw.values( - "chr17", -10, 10, 20, "mean", exact=True, missing=0.0, oob=np.nan, arr=arr + "chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=np.nan, arr=arr ) assert math.isnan(arr[0]) assert arr[19] == 0.0 @@ -375,7 +398,7 @@ def test_values_assign_to_array(bw, bb): assert np.array_equal(arr, ret_arr, equal_nan=True) ret_arr = bb.values( - "chr21", -10, 10, 20, "mean", exact=True, missing=0.0, oob=np.nan, arr=arr + "chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=np.nan, arr=arr ) assert math.isnan(arr[0]) assert arr[19] == 0.0 diff --git a/pybigtools/tests/test_raster.py b/pybigtools/tests/test_raster.py index 7120c53..cf22b9c 100644 --- a/pybigtools/tests/test_raster.py +++ b/pybigtools/tests/test_raster.py @@ -45,213 +45,285 @@ def bb_path(): def test_values(bw_path, bb_path): + # Default `uncovered=None` (NaN): uncovered bases are NaN. for path in [bw_path, bb_path]: x1 = pbt.open(path).values("chr1", 0, 12) - assert np.allclose(x1, [0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0], equal_nan=True) + assert np.allclose( + x1, + [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], + equal_nan=True, + ) for path in [bw_path, bb_path]: x1 = pbt.open(path).values("chr1", -2, 14) assert np.allclose( x1, - [np.nan, np.nan, 0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0, np.nan, np.nan], + [ + np.nan, np.nan, + np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan, + np.nan, np.nan, + ], + equal_nan=True, + ) + + # `uncovered=0` reproduces the previous fill-with-zero behavior. + for path in [bw_path, bb_path]: + x1 = pbt.open(path).values("chr1", 0, 12, uncovered=0) + assert np.allclose(x1, [0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0], equal_nan=True) + + +def test_fillna(bw_path, bb_path): + # `fillna` is a post-rasterization fill that replaces NaN at in-bounds + # positions. It is INDEPENDENT of `oob`: out-of-chromosome positions are + # filled by `oob` and not touched by `fillna`. + + # Base-level: default `uncovered=NaN` leaves uncovered bases as NaN; + # `fillna=-99` replaces them after the fact. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", 0, 12, fillna=-99) + assert np.allclose( + x, [-99, 1, 1, -99, 2, 2, 2, 2, 3, 3, -99, -99], equal_nan=True + ) + + # `oob` and `fillna` control different concerns: OOB positions take the + # `oob` value; in-bounds uncovered positions take `fillna`. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", -2, 14, oob=-1, fillna=0) + assert np.allclose( + x, + [-1, -1, 0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0, -1, -1], equal_nan=True, ) + # `fillna` does not "leak" into OOB positions: with `oob=NaN` (default), + # OOB positions stay NaN even when `fillna` is set. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", -2, 14, fillna=0) + assert np.allclose( + x, + [ + np.nan, np.nan, + 0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0, + np.nan, np.nan, + ], + equal_nan=True, + ) + + # Binned: empty bin under `uncovered=NaN` (default) returns NaN; `fillna=0` + # replaces that. Partial bins are unaffected because they returned a + # real number. + for path in [bw_path, bb_path]: + x = pbt.open(path).values( + "chr1", 0, 12, bins=6, exact=True, summary="mean", fillna=0 + ) + # Bins 0..4 partial or full → real means; bin 5 empty → fillna=0. + assert np.allclose(x, [1, 1, 2, 2, 3, 0], equal_nan=True) + -@pytest.mark.parametrize("missing", [np.nan, 0, -1]) -def test_binned_mean(bw_path, bb_path, missing): +def test_binned_mean(bw_path, bb_path): + # uncovered=NaN: classical mean over only covered bases; empty bins → NaN. + uncovered = np.nan for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="mean" + "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="mean" ) assert np.allclose( x1, - [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], equal_nan=True, ) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="mean" + "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="mean" ) - assert np.allclose(x1, [1, 1, 2, 2, 3, missing], equal_nan=True) + assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="mean" + "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="mean" ) assert np.allclose(x1, [1, 1, 2, 2.5, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="mean" + "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="mean" ) assert np.allclose(x1, [1.5, 2.5], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="mean" + "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="mean" ) assert np.allclose(x1, [2], equal_nan=True) -@pytest.mark.parametrize("missing", [np.nan, 0, -1]) -def test_binned_mean0(bw_path, bb_path, missing): +def test_binned_mean_uncovered_zero(bw_path, bb_path): + # uncovered=0: treat uncovered bases as 0 (UCSC `mean0` semantics). + # Equivalent to (sum + 0*n_uncovered) / bin_size = sum / bin_size. + uncovered = 0 for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="mean0" + "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="mean" ) assert np.allclose( x1, - [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], equal_nan=True, ) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="mean0" + "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="mean" ) - assert np.allclose(x1, [0.5, 0.5, 2, 2, 3, missing], equal_nan=True) + assert np.allclose(x1, [0.5, 0.5, 2, 2, 3, uncovered], equal_nan=True) + # bins=5 has non-integer `bin_size = 2.4`. Integer-aligned bin widths are + # [2, 2, 3, 2, 3] — see `integer_bin_bounds` in raster.rs. Mean0 uses the + # integer bin width (not the float `bin_size`) as the denominator. + # Bin widths: 2 2 3 2 3 + # Coverage: 1/2 1/2 3/3 2/2 1/3 + # Sums: 1 1 6 5 3 + # mean0: 0.5 0.5 2.0 2.5 1.0 for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="mean0" - ) - assert np.allclose( - x1, [1 / 2.4, 1 / 2.4, 6 / 2.4, 5 / 2.4, 3 / 2.4], equal_nan=True + "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="mean" ) + assert np.allclose(x1, [0.5, 0.5, 2.0, 2.5, 1.0], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="mean0" + "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="mean" ) assert np.allclose(x1, [1, 10 / 6], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="mean0" + "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="mean" ) assert np.allclose(x1, [16 / 12], equal_nan=True) -@pytest.mark.parametrize("missing", [np.nan, 0, -1]) -def test_binned_minmax(bw_path, bb_path, missing): +def test_binned_minmax(bw_path, bb_path): + # uncovered=NaN: classical min/max over only covered bases; empty bins → NaN. + uncovered = np.nan for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="min" + "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="min" ) assert np.allclose( x1, - [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], equal_nan=True, ) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="max" + "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="max" ) assert np.allclose( x1, - [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], equal_nan=True, ) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="min" + "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="min" ) - assert np.allclose(x1, [1, 1, 2, 2, 3, missing], equal_nan=True) + assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="max" + "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="max" ) - assert np.allclose(x1, [1, 1, 2, 2, 3, missing], equal_nan=True) + assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="min" + "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="min" ) assert np.allclose(x1, [1, 1, 2, 2, 3], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="max" + "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="max" ) assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="min" + "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="min" ) assert np.allclose(x1, [1, 2], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="max" + "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="max" ) assert np.allclose(x1, [2, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="min" + "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="min" ) assert np.allclose(x1, [1], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="max" + "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="max" ) assert np.allclose(x1, [3], equal_nan=True) -@pytest.mark.parametrize("missing", [np.nan, 0, -1]) -def test_binned_minmax0(bw_path, bb_path, missing): +def test_binned_minmax_uncovered_zero(bw_path, bb_path): + # uncovered=0: uncovered bases counted as 0 in min/max (UCSC `min0`/`max0` semantics). + uncovered = 0 for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="min0" + "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="min" ) assert np.allclose( x1, - [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], equal_nan=True, ) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, missing=missing, summary="max0" + "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="max" ) assert np.allclose( x1, - [missing, 1, 1, missing, 2, 2, 2, 2, 3, 3, missing, missing], + [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], equal_nan=True, ) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="min0" + "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="min" ) - assert np.allclose(x1, [0, 0, 2, 2, 3, missing], equal_nan=True) + assert np.allclose(x1, [0, 0, 2, 2, 3, uncovered], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, missing=missing, summary="max0" + "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="max" ) - assert np.allclose(x1, [1, 1, 2, 2, 3, missing], equal_nan=True) + assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="min0" + "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="min" ) assert np.allclose(x1, [0, 0, 2, 2, 0], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, missing=missing, summary="max0" + "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="max" ) assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="min0" + "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="min" ) assert np.allclose(x1, [0, 0], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, missing=missing, summary="max0" + "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="max" ) assert np.allclose(x1, [2, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="min0" + "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="min" ) assert np.allclose(x1, [0], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, missing=missing, summary="max0" + "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="max" ) assert np.allclose(x1, [3], equal_nan=True) @@ -271,15 +343,17 @@ def test_binned_coverage_breadth(bw_path, bb_path, n_bins, exact): "start,end", [(start, end) for start in [0, 2] for end in [10, 12]] ) def test_bbi_consistent_values(bw_path, bb_path, start, end): + # pybigtools now defaults to `uncovered=NaN`, pybbi to `missing=0.0`. + # Pass matching values explicitly for the comparison. for path in [bw_path, bb_path]: - x1 = pbt.open(path).values("chr1", start, end) - x2 = bbi.open(path).fetch("chr1", start, end) + x1 = pbt.open(path).values("chr1", start, end, uncovered=0.0) + x2 = bbi.open(path).fetch("chr1", start, end, missing=0.0) assert np.allclose(x1, x2, equal_nan=True) def test_bbi_consistent_values_match_coverage(bb_path, bw_path): - bb1 = pbt.open(bb_path).values("chr1") - bw1 = pbt.open(bw_path).values("chr1") + bb1 = pbt.open(bb_path).values("chr1", uncovered=0.0) + bw1 = pbt.open(bw_path).values("chr1", uncovered=0.0) assert np.allclose(bb1, bw1, equal_nan=True) bb2 = bbi.open(bb_path).fetch("chr1", 0, -1) @@ -288,19 +362,19 @@ def test_bbi_consistent_values_match_coverage(bb_path, bw_path): @pytest.mark.parametrize( - "start,end,missing,oob", + "start,end,uncovered,oob", [ - (start, end, missing, oob) + (start, end, uncovered, oob) for start in [-2, 0, 2] for end in [10, 12, 14] - for missing in [np.nan, 0, -1] + for uncovered in [np.nan, 0, -1] for oob in [np.nan, 0, -1] ], ) -def test_bbi_consistent_values_missing_oob(bw_path, bb_path, start, end, missing, oob): +def test_bbi_consistent_values_uncovered_oob(bw_path, bb_path, start, end, uncovered, oob): for path in [bw_path, bb_path]: - x1 = pbt.open(path).values("chr1", start, end, missing=missing, oob=oob) - x2 = bbi.open(path).fetch("chr1", start, end, missing=missing, oob=oob) + x1 = pbt.open(path).values("chr1", start, end, uncovered=uncovered, oob=oob) + x2 = bbi.open(path).fetch("chr1", start, end, missing=uncovered, oob=oob) assert np.allclose(x1, x2, equal_nan=True) @@ -312,13 +386,27 @@ def test_bbi_consistent_values_missing_oob(bw_path, bb_path, start, end, missing for end in [10, 12] for n_bins in range(1, 13) for exact in [True] - for summary in ["sum", "mean", "std", "min", "max", "bin_covered"] + # `bin_covered` (= `cov` in pybbi) is excluded: pybigtools always returns + # the factual coverage fraction for empty bins (0.0), while pybbi fills + # empty bins with `uncovered`. Intentional divergence. + for summary in ["sum", "mean", "std", "min", "max"] ], ) def test_bbi_consistent_binned(bw_path, bb_path, start, end, n_bins, exact, summary): + # pybbi computes "regular" statistics (exclude uncovered) and fills empty bins + # with `uncovered`. To compare apples-to-apples under pybigtools' new unified + # per-base `uncovered` semantics, pass `uncovered=NaN` to both — that means + # "exclude uncovered" in pybigtools, matching pybbi's behavior for partial + # bins, and NaN for empty bins in both. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", start, end, bins=n_bins, exact=exact, summary=summary + "chr1", + start, + end, + bins=n_bins, + exact=exact, + summary=summary, + uncovered=np.nan, ) x2 = bbi.open(path).fetch( "chr1", @@ -326,32 +414,36 @@ def test_bbi_consistent_binned(bw_path, bb_path, start, end, n_bins, exact, summ end, bins=n_bins, exact=exact, - summary=summary if summary != "bin_covered" else "cov", + summary=summary, + missing=np.nan, ) assert np.allclose(x1, x2, equal_nan=True) @pytest.mark.parametrize( - "start,end,n_bins,exact,missing,oob", + "start,end,n_bins,exact,oob", [ - (start, end, n_bins, exact, missing, oob) + (start, end, n_bins, exact, oob) for start in [-2, 0, 2] for end in [10, 12, 14] for n_bins in range(1, 13) for exact in [True] - for missing in [np.nan, 0, -1] for oob in [np.nan, 0, -1] ], ) -def test_bbi_consistent_binned_missing_oob( - bw_path, bb_path, start, end, n_bins, exact, missing, oob +def test_bbi_consistent_binned_uncovered_oob( + bw_path, bb_path, start, end, n_bins, exact, oob ): + # `uncovered` is restricted to NaN for parity: pybigtools' per-base `uncovered` + # semantics diverge from pybbi's empty-bin-fill semantics whenever the bin is + # partially covered. With `uncovered=NaN` both libraries compute classical + # statistics over the covered bases. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", start, end, bins=n_bins, exact=exact, missing=missing, oob=oob + "chr1", start, end, bins=n_bins, exact=exact, uncovered=np.nan, oob=oob ) x2 = bbi.open(path).fetch( - "chr1", start, end, bins=n_bins, exact=exact, missing=missing, oob=oob + "chr1", start, end, bins=n_bins, exact=exact, missing=np.nan, oob=oob ) assert np.allclose(x1, x2, equal_nan=True) @@ -361,7 +453,13 @@ def test_bbi_consistent_binned_missing_oob( [(0, 12, n_bins, exact) for n_bins in [12, 24, 48] for exact in [True]], ) def test_bbi_consistent_binned_upsample(bw_path, bb_path, start, end, n_bins, exact): + # Upsampling produces only fully-covered or fully-empty bins (no partial bins), + # so per-base vs empty-bin `uncovered` semantics agree. for path in [bb_path, bw_path]: - x1 = pbt.open(path).values("chr1", start, end, bins=n_bins, exact=exact) - x2 = bbi.open(path).fetch("chr1", start, end, bins=n_bins, exact=exact) + x1 = pbt.open(path).values( + "chr1", start, end, bins=n_bins, exact=exact, uncovered=np.nan + ) + x2 = bbi.open(path).fetch( + "chr1", start, end, bins=n_bins, exact=exact, missing=np.nan + ) assert np.allclose(x1, x2, equal_nan=True) From acc897b7c4becd69ebe96d0bf400e2d5bdc64702 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 13 May 2026 10:57:30 -0400 Subject: [PATCH 12/17] Clippy fixes --- pybigtools/src/raster.rs | 30 +++++++++++++++++++----------- pybigtools/src/reader.rs | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/pybigtools/src/raster.rs b/pybigtools/src/raster.rs index 8f57d49..6110899 100644 --- a/pybigtools/src/raster.rs +++ b/pybigtools/src/raster.rs @@ -90,6 +90,7 @@ impl BBIRead<'_, R> { } } +#[allow(clippy::too_many_arguments)] pub fn intervals_to_array( py: Python<'_>, bw: &mut BigWigRead, @@ -110,6 +111,7 @@ pub fn intervals_to_array( ) } +#[allow(clippy::too_many_arguments)] pub fn entries_to_array( py: Python<'_>, bb: &mut BigBedRead, @@ -130,6 +132,7 @@ pub fn entries_to_array( ) } +#[allow(clippy::too_many_arguments)] fn records_to_array( py: Python<'_>, bbi: &mut BBIRead, @@ -422,7 +425,12 @@ fn fill_binned>>( let last_bin = last_bin.min(bins - 1); // Accumulate contribution of interval to each overlapping bin - for i in first_bin..=last_bin { + for (i, stat) in stats + .iter_mut() + .enumerate() + .take(last_bin + 1) + .skip(first_bin) + { let (bin_start, bin_end) = integer_bin_bounds(start, i, bin_size); // Calculate integer overlap between interval and bin @@ -434,20 +442,20 @@ fn fill_binned>>( match interval { BBIRecord::Value(v) => { let value = v.value as f64; - stats[i].sum += value * overlap; - stats[i].sum_squares += value * value * overlap; - stats[i].min = stats[i].min.min(value); - stats[i].max = stats[i].max.max(value); - stats[i].bases_covered += overlap; + stat.sum += value * overlap; + stat.sum_squares += value * value * overlap; + stat.min = stat.min.min(value); + stat.max = stat.max.max(value); + stat.bases_covered += overlap; } BBIRecord::ZoomRecord(z) => { let summary = z.summary; let overlap_factor = overlap / (z.end - z.start) as f64; - stats[i].sum += summary.sum * overlap_factor; - stats[i].sum_squares += summary.sum_squares * overlap_factor; - stats[i].min = stats[i].min.min(summary.min_val); - stats[i].max = stats[i].max.max(summary.max_val); - stats[i].bases_covered += summary.bases_covered as f64 * overlap_factor; + stat.sum += summary.sum * overlap_factor; + stat.sum_squares += summary.sum_squares * overlap_factor; + stat.min = stat.min.min(summary.min_val); + stat.max = stat.max.max(summary.max_val); + stat.bases_covered += summary.bases_covered as f64 * overlap_factor; } BBIRecord::BedEntry(_) => { panic!("BED records must be converted to non-overlapping coverage intervals prior to binning."); diff --git a/pybigtools/src/reader.rs b/pybigtools/src/reader.rs index 67adf3d..c7e3fdf 100644 --- a/pybigtools/src/reader.rs +++ b/pybigtools/src/reader.rs @@ -589,7 +589,7 @@ impl BBIReader { /// zoom_records : Get the zoom records of a given range on a chromosome. #[pyo3( signature = (chrom, start=None, end=None, bins=None, summary="mean".to_string(), exact=false, uncovered=None, oob=f64::NAN, fillna=None, arr=None), - text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, uncovered=None, oob=..., fillna=None, arr=None)"#, + text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, uncovered=None, oob=float('nan'), fillna=None, arr=None)"#, )] #[allow(clippy::too_many_arguments)] fn values( From 296bc8f15af740133d9a0fc3033b245ef9a4cdaf Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 13 May 2026 11:07:31 -0400 Subject: [PATCH 13/17] Update python tests --- pybigtools/tests/test_api.py | 50 +++--- pybigtools/tests/test_raster.py | 300 ++++++++++++++++++++++++++------ 2 files changed, 272 insertions(+), 78 deletions(-) diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 7fabac9..c2d3cc1 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -276,42 +276,42 @@ def test_values_binned(bw, bb): assert len(bw.values("chr17", 100000, 110000, 10)) == 10 assert len(bb.values("chr21", 10_148_000, 10_158_000, 10)) == 10 - nan = float("nan") + # Statistics over only the covered bases (`uncovered=None`). assert ( - bw.values("chr17", 100000, 110000, 10, uncovered=nan)[0] - == 0.44886381671868925 + bw.values("chr17", 100000, 110000, 10, uncovered=None)[0] == 0.44886381671868925 ) - assert bb.values("chr21", 10_148_000, 10_158_000, 10, uncovered=nan)[0] == 1.0 + assert bb.values("chr21", 10_148_000, 10_158_000, 10, uncovered=None)[0] == 1.0 assert ( - bw.values("chr17", 100000, 110000, 10, "max", uncovered=nan)[0] + bw.values("chr17", 100000, 110000, 10, "max", uncovered=None)[0] == 1.1978399753570557 ) assert ( - bb.values("chr21", 10_148_000, 10_158_000, 10, "max", uncovered=nan)[0] == 1.0 + bb.values("chr21", 10_148_000, 10_158_000, 10, "max", uncovered=None)[0] == 1.0 ) assert ( - bw.values("chr17", 100000, 110000, 10, "min", uncovered=nan)[0] + bw.values("chr17", 100000, 110000, 10, "min", uncovered=None)[0] == 0.05403999984264374 ) assert ( - bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=nan)[0] == 1.0 - ) - # uncovered=0: uncovered bases counted as 0 in min (UCSC min0 semantics). - assert ( - bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=0)[0] == 0.0 + bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=None)[0] == 1.0 ) + # Uncovered bases counted as 0 in min (min0 semantics). + assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=0)[0] == 0.0 + def test_values_binned_exact(bw, bb): - nan = float("nan") + # Statistics over only the covered bases (`uncovered=None`). assert np.isclose( - bw.values("chr17", 100000, 110000, 10, "mean", exact=True, uncovered=nan)[0], + bw.values("chr17", 100000, 110000, 10, "mean", exact=True, uncovered=None)[0], 0.4542629980980206, ) assert ( - bb.values("chr21", 10_148_000, 10_158_000, 10, "mean", exact=True, uncovered=nan)[0] + bb.values( + "chr21", 10_148_000, 10_158_000, 10, "mean", exact=True, uncovered=None + )[0] == 1.0 ) @@ -363,24 +363,28 @@ def test_values_binned_uncovered_oob(bw, bb): assert x[0] == 0.0 assert np.isclose(bw.values("chr17", 59890, 59900)[9], 0.06792) - assert np.isclose(bw.values("chr17", 59890, 59900, uncovered=np.nan)[9], 0.06792) + assert np.isclose(bw.values("chr17", 59890, 59900, uncovered=None)[9], 0.06792) def test_values_binned_exact_vs_from_zoom(bw, bb): - nan = float("nan") - vals = bw.values("chr17", 85525, 85730, bins=2, exact=False, uncovered=nan) + # Statistics over only the covered bases (`uncovered=None`). + vals = bw.values("chr17", 85525, 85730, bins=2, exact=False, uncovered=None) assert np.allclose(vals, [0.10737355, 2.72889167]) - vals = bw.values("chr17", 85525, 85730, bins=2, exact=True, uncovered=nan) + vals = bw.values("chr17", 85525, 85730, bins=2, exact=True, uncovered=None) assert np.allclose(vals, [0.06770935, 2.48644244]) - vals = bw.values("chr17", 59900, 60105, bins=2, exact=False, uncovered=nan) + vals = bw.values("chr17", 59900, 60105, bins=2, exact=False, uncovered=None) assert np.allclose(vals, [0.53580606, 0.55134715]) - vals = bw.values("chr17", 59900, 60105, bins=2, exact=True, uncovered=nan) + vals = bw.values("chr17", 59900, 60105, bins=2, exact=True, uncovered=None) assert np.allclose(vals, [0.53620019, 0.55277108]) - vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=False, uncovered=nan) + vals = bb.values( + "chr21", 14_760_000, 14_800_000, bins=1, exact=False, uncovered=None + ) assert np.allclose(vals, [1.23191877]) - vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=True, uncovered=nan) + vals = bb.values( + "chr21", 14_760_000, 14_800_000, bins=1, exact=True, uncovered=None + ) assert np.allclose(vals, [1.34086629]) diff --git a/pybigtools/tests/test_raster.py b/pybigtools/tests/test_raster.py index cf22b9c..c81d775 100644 --- a/pybigtools/tests/test_raster.py +++ b/pybigtools/tests/test_raster.py @@ -44,6 +44,19 @@ def bb_path(): return str(TEST_DIR / "data/mini.bb") +@pytest.fixture +def big_bw_path(): + """Real bigWig with 10 zoom levels (chr17, 83M bases); used to exercise + the zoom-interpolation code path (`exact=False` with non-trivial bin sizes).""" + return str(REPO_ROOT / "bigtools/resources/test/valid.bigWig") + + +@pytest.fixture +def big_bb_path(): + """Real bigBed with 5 zoom levels (chr21, 48M bases).""" + return str(TEST_DIR / "data/bigBedExample.bb") + + def test_values(bw_path, bb_path): # Default `uncovered=None` (NaN): uncovered bases are NaN. for path in [bw_path, bb_path]: @@ -59,9 +72,22 @@ def test_values(bw_path, bb_path): assert np.allclose( x1, [ - np.nan, np.nan, - np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan, - np.nan, np.nan, + np.nan, + np.nan, + np.nan, + 1, + 1, + np.nan, + 2, + 2, + 2, + 2, + 3, + 3, + np.nan, + np.nan, + np.nan, + np.nan, ], equal_nan=True, ) @@ -77,7 +103,7 @@ def test_fillna(bw_path, bb_path): # positions. It is INDEPENDENT of `oob`: out-of-chromosome positions are # filled by `oob` and not touched by `fillna`. - # Base-level: default `uncovered=NaN` leaves uncovered bases as NaN; + # Base-level: default `uncovered=None` leaves uncovered bases as NaN; # `fillna=-99` replaces them after the fact. for path in [bw_path, bb_path]: x = pbt.open(path).values("chr1", 0, 12, fillna=-99) @@ -102,14 +128,27 @@ def test_fillna(bw_path, bb_path): assert np.allclose( x, [ - np.nan, np.nan, - 0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0, - np.nan, np.nan, + np.nan, + np.nan, + 0, + 1, + 1, + 0, + 2, + 2, + 2, + 2, + 3, + 3, + 0, + 0, + np.nan, + np.nan, ], equal_nan=True, ) - # Binned: empty bin under `uncovered=NaN` (default) returns NaN; `fillna=0` + # Binned: empty bin under `uncovered=None` (default) returns NaN; `fillna=0` # replaces that. Partial bins are unaffected because they returned a # real number. for path in [bw_path, bb_path]: @@ -121,39 +160,38 @@ def test_fillna(bw_path, bb_path): def test_binned_mean(bw_path, bb_path): - # uncovered=NaN: classical mean over only covered bases; empty bins → NaN. - uncovered = np.nan + # uncovered=None: classical mean over only covered bases; empty bins → NaN. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="mean" + "chr1", 0, 12, bins=12, exact=True, uncovered=None, summary="mean" ) assert np.allclose( x1, - [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], + [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], equal_nan=True, ) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="mean" + "chr1", 0, 12, bins=6, exact=True, uncovered=None, summary="mean" ) - assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) + assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="mean" + "chr1", 0, 12, bins=5, exact=True, uncovered=None, summary="mean" ) assert np.allclose(x1, [1, 1, 2, 2.5, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="mean" + "chr1", 0, 12, bins=2, exact=True, uncovered=None, summary="mean" ) assert np.allclose(x1, [1.5, 2.5], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="mean" + "chr1", 0, 12, bins=1, exact=True, uncovered=None, summary="mean" ) assert np.allclose(x1, [2], equal_nan=True) @@ -205,63 +243,62 @@ def test_binned_mean_uncovered_zero(bw_path, bb_path): def test_binned_minmax(bw_path, bb_path): - # uncovered=NaN: classical min/max over only covered bases; empty bins → NaN. - uncovered = np.nan + # uncovered=None: classical min/max over only covered bases; empty bins → NaN. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="min" + "chr1", 0, 12, bins=12, exact=True, uncovered=None, summary="min" ) assert np.allclose( x1, - [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], + [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], equal_nan=True, ) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="max" + "chr1", 0, 12, bins=12, exact=True, uncovered=None, summary="max" ) assert np.allclose( x1, - [uncovered, 1, 1, uncovered, 2, 2, 2, 2, 3, 3, uncovered, uncovered], + [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], equal_nan=True, ) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="min" + "chr1", 0, 12, bins=6, exact=True, uncovered=None, summary="min" ) - assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) + assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="max" + "chr1", 0, 12, bins=6, exact=True, uncovered=None, summary="max" ) - assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) + assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="min" + "chr1", 0, 12, bins=5, exact=True, uncovered=None, summary="min" ) assert np.allclose(x1, [1, 1, 2, 2, 3], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="max" + "chr1", 0, 12, bins=5, exact=True, uncovered=None, summary="max" ) assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="min" + "chr1", 0, 12, bins=2, exact=True, uncovered=None, summary="min" ) assert np.allclose(x1, [1, 2], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="max" + "chr1", 0, 12, bins=2, exact=True, uncovered=None, summary="max" ) assert np.allclose(x1, [2, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="min" + "chr1", 0, 12, bins=1, exact=True, uncovered=None, summary="min" ) assert np.allclose(x1, [1], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="max" + "chr1", 0, 12, bins=1, exact=True, uncovered=None, summary="max" ) assert np.allclose(x1, [3], equal_nan=True) @@ -339,11 +376,61 @@ def test_binned_coverage_breadth(bw_path, bb_path, n_bins, exact): assert np.nansum(x1) == 8 +def test_values_arr_param(bw_path, bb_path): + # Pass a pre-allocated array. The function writes into it and returns the + # same object. + for path in [bw_path, bb_path]: + arr = np.full(12, -7.0) + ret = pbt.open(path).values("chr1", 0, 12, uncovered=0, arr=arr) + assert ret is arr # same object + assert np.allclose(arr, [0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0]) + + # Binned: pre-allocated array of the correct length. + for path in [bw_path, bb_path]: + arr = np.full(6, -7.0) + ret = pbt.open(path).values( + "chr1", + 0, + 12, + bins=6, + exact=True, + summary="mean", + uncovered=None, + arr=arr, + ) + assert ret is arr + assert np.allclose(arr, [1, 1, 2, 2, 3, np.nan], equal_nan=True) + + +def test_values_arr_wrong_size(bw_path): + # Wrong-length `arr` raises ValueError. + with pytest.raises(ValueError): + pbt.open(bw_path).values("chr1", 0, 12, arr=np.zeros(11)) # need 12 + + with pytest.raises(ValueError): + pbt.open(bw_path).values( + "chr1", 0, 12, bins=6, exact=True, arr=np.zeros(5) + ) # need 6 + + +def test_values_bad_args(bw_path): + # Unknown chromosome. + with pytest.raises(KeyError): + pbt.open(bw_path).values("nope", 0, 12) + + # Unknown summary string. + with pytest.raises(ValueError): + pbt.open(bw_path).values("chr1", 0, 12, bins=4, summary="median") + + +# -- These tests check Parity with UCSC via pybbi -- + + @pytest.mark.parametrize( "start,end", [(start, end) for start in [0, 2] for end in [10, 12]] ) def test_bbi_consistent_values(bw_path, bb_path, start, end): - # pybigtools now defaults to `uncovered=NaN`, pybbi to `missing=0.0`. + # pybigtools defaults to `uncovered=None` (NaN), pybbi to `missing=0.0`. # Pass matching values explicitly for the comparison. for path in [bw_path, bb_path]: x1 = pbt.open(path).values("chr1", start, end, uncovered=0.0) @@ -362,19 +449,22 @@ def test_bbi_consistent_values_match_coverage(bb_path, bw_path): @pytest.mark.parametrize( - "start,end,uncovered,oob", - [ - (start, end, uncovered, oob) - for start in [-2, 0, 2] - for end in [10, 12, 14] - for uncovered in [np.nan, 0, -1] - for oob in [np.nan, 0, -1] - ], + "uncovered,oob", + [(uncovered, oob) for uncovered in [None, 0, -1] for oob in [np.nan, 0, -1]], +) +@pytest.mark.parametrize( + "start,end", + [(start, end) for start in [-2, 0, 2] for end in [10, 12, 14]], ) -def test_bbi_consistent_values_uncovered_oob(bw_path, bb_path, start, end, uncovered, oob): +def test_bbi_consistent_values_uncovered_oob( + bw_path, bb_path, start, end, uncovered, oob +): + # `uncovered=None` (pybigtools) ≡ `missing=NaN` (pybbi); both mean "leave + # uncovered bases as NaN." + pbbi_missing = np.nan if uncovered is None else uncovered for path in [bw_path, bb_path]: x1 = pbt.open(path).values("chr1", start, end, uncovered=uncovered, oob=oob) - x2 = bbi.open(path).fetch("chr1", start, end, missing=uncovered, oob=oob) + x2 = bbi.open(path).fetch("chr1", start, end, missing=pbbi_missing, oob=oob) assert np.allclose(x1, x2, equal_nan=True) @@ -388,16 +478,15 @@ def test_bbi_consistent_values_uncovered_oob(bw_path, bb_path, start, end, uncov for exact in [True] # `bin_covered` (= `cov` in pybbi) is excluded: pybigtools always returns # the factual coverage fraction for empty bins (0.0), while pybbi fills - # empty bins with `uncovered`. Intentional divergence. + # empty bins with `missing`. Intentional divergence. for summary in ["sum", "mean", "std", "min", "max"] ], ) def test_bbi_consistent_binned(bw_path, bb_path, start, end, n_bins, exact, summary): # pybbi computes "regular" statistics (exclude uncovered) and fills empty bins - # with `uncovered`. To compare apples-to-apples under pybigtools' new unified - # per-base `uncovered` semantics, pass `uncovered=NaN` to both — that means - # "exclude uncovered" in pybigtools, matching pybbi's behavior for partial - # bins, and NaN for empty bins in both. + # with `missing`. Under pybigtools' per-base `uncovered` semantics, the + # equivalent comparison passes `uncovered=None` to pybigtools and + # `missing=NaN` to pybbi: classical stats over partial bins, NaN for empty. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( "chr1", @@ -406,7 +495,7 @@ def test_bbi_consistent_binned(bw_path, bb_path, start, end, n_bins, exact, summ bins=n_bins, exact=exact, summary=summary, - uncovered=np.nan, + uncovered=None, ) x2 = bbi.open(path).fetch( "chr1", @@ -434,13 +523,13 @@ def test_bbi_consistent_binned(bw_path, bb_path, start, end, n_bins, exact, summ def test_bbi_consistent_binned_uncovered_oob( bw_path, bb_path, start, end, n_bins, exact, oob ): - # `uncovered` is restricted to NaN for parity: pybigtools' per-base `uncovered` + # `uncovered` is restricted to None for parity: pybigtools' per-base `uncovered` # semantics diverge from pybbi's empty-bin-fill semantics whenever the bin is - # partially covered. With `uncovered=NaN` both libraries compute classical - # statistics over the covered bases. + # partially covered. With pybigtools' `uncovered=None` and pybbi's + # `missing=NaN`, both compute classical statistics over the covered bases. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", start, end, bins=n_bins, exact=exact, uncovered=np.nan, oob=oob + "chr1", start, end, bins=n_bins, exact=exact, uncovered=None, oob=oob ) x2 = bbi.open(path).fetch( "chr1", start, end, bins=n_bins, exact=exact, missing=np.nan, oob=oob @@ -457,9 +546,110 @@ def test_bbi_consistent_binned_upsample(bw_path, bb_path, start, end, n_bins, ex # so per-base vs empty-bin `uncovered` semantics agree. for path in [bb_path, bw_path]: x1 = pbt.open(path).values( - "chr1", start, end, bins=n_bins, exact=exact, uncovered=np.nan + "chr1", start, end, bins=n_bins, exact=exact, uncovered=None ) x2 = bbi.open(path).fetch( "chr1", start, end, bins=n_bins, exact=exact, missing=np.nan ) assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,summary", + [ + # Large query ranges + few bins → bin size big enough that `best_zoom` + # selects a real zoom level, exercising the zoom-interpolation path. + # `valid.bigWig` has zoom resolutions [10, 40, 160, 640, 2560, ...]. + # For a query of width W and N bins, max_zoom_size = W / (2*N). + # E.g. 100kb / (2*10 bins) = 5000 ≥ resolution 2560 → uses zoom 2560. + (100_000, 200_000, 10, "mean"), + (100_000, 200_000, 10, "std"), + (100_000, 200_000, 10, "min"), + (100_000, 200_000, 10, "max"), + (100_000, 200_000, 10, "sum"), + (100_000, 200_000, 50, "mean"), + (0, 1_000_000, 100, "mean"), + (0, 1_000_000, 100, "std"), + ], +) +def test_bbi_consistent_binned_zoom_bw(big_bw_path, start, end, n_bins, summary): + # Parity vs pybbi when traversing the zoom-interpolation path. This is the + # path that the UCSC ceil+rescale normalization affects, so a passing + # parity test here validates that normalization is correctly applied. + x1 = pbt.open(big_bw_path).values( + "chr17", + start, + end, + bins=n_bins, + exact=False, + summary=summary, + uncovered=None, + ) + x2 = bbi.open(big_bw_path).fetch( + "chr17", + start, + end, + bins=n_bins, + exact=False, + summary=summary, + missing=np.nan, + ) + assert np.allclose(x1, x2, equal_nan=True) + + +@pytest.mark.parametrize( + "start,end,n_bins,summary", + [ + (10_000_000, 20_000_000, 50, "mean"), + (10_000_000, 20_000_000, 50, "min"), + (10_000_000, 20_000_000, 50, "max"), + (10_000_000, 20_000_000, 100, "sum"), + ], +) +def test_bbi_consistent_binned_zoom_bb(big_bb_path, start, end, n_bins, summary): + # Same parity check on the bigBed zoom path. + x1 = pbt.open(big_bb_path).values( + "chr21", + start, + end, + bins=n_bins, + exact=False, + summary=summary, + uncovered=None, + ) + x2 = bbi.open(big_bb_path).fetch( + "chr21", + start, + end, + bins=n_bins, + exact=False, + summary=summary, + missing=np.nan, + ) + assert np.allclose(x1, x2, equal_nan=True) + + +def test_values_fully_out_of_bounds(bw_path, bb_path): + # Query entirely past the end of the chromosome. + # chr1 length is 12. Query [20, 30) is fully OOB. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", 20, 30) + # Default oob=NaN. + assert len(x) == 10 + assert all(np.isnan(v) for v in x) + + # With oob=-1, the whole array is -1. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", 20, 30, oob=-1) + assert np.array_equal(x, np.full(10, -1.0)) + + # Query entirely before the start (negative range). + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", -10, -2) + assert len(x) == 8 + assert all(np.isnan(v) for v in x) + + # Binned version of the same — all bins OOB, all bins get oob. + for path in [bw_path, bb_path]: + x = pbt.open(path).values("chr1", 20, 30, bins=5, exact=True, oob=0) + assert np.array_equal(x, np.zeros(5)) From ab354ab1c7ee47338e832d3ebf9e46e5cac191f2 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 13 May 2026 11:16:16 -0400 Subject: [PATCH 14/17] Add fill_binned tests --- pybigtools/src/raster.rs | 186 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) diff --git a/pybigtools/src/raster.rs b/pybigtools/src/raster.rs index 6110899..31dd6e5 100644 --- a/pybigtools/src/raster.rs +++ b/pybigtools/src/raster.rs @@ -946,4 +946,190 @@ mod tests { .unwrap(); assert_eq!(arr.to_vec(), vec![5.0, 5.0, 5.0, 0.0, 0.0]); } + + /// Build an iterator of `Ok(BBIRecord::Value(...))` from a list of triples. + fn vals(items: Vec<(u32, u32, f32)>) -> impl Iterator> { + items + .into_iter() + .map(|(s, e, v)| Value { + start: s, + end: e, + value: v, + }) + .map(BBIRecord::Value) + .map(Ok) + } + + #[test] + fn test_fill_binned_sum_sum_squares() { + // 2 bins over range 0..20. Values: + // [0, 5) value=2 → bin 0 contributes 2*5 = 10 + // [5, 15) value=3 → bin 0 contributes 3*5 = 15; bin 1 contributes 3*5 = 15 + // Bin 0: sum=25 over 10 covered bp; bin 1: sum=15 over 5 covered bp. + let start = 0; + let end = 20; + let n_bins = 2; + let intervals = vec![(0u32, 5u32, 2.0f32), (5, 15, 3.0)]; + + // Sum, uncovered=None: only covered contributes. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::Sum, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![25.0, 15.0]); + + // Sum, uncovered=Some(1.0): bin 1 also gets 1.0 * 5 uncovered bases. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::Sum, + n_bins, + Some(1.0), + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![25.0, 15.0 + 1.0 * 5.0]); + + // SumSquares, uncovered=None. + // Bin 0: 2*2*5 + 3*3*5 = 20 + 45 = 65. Bin 1: 3*3*5 = 45. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::SumSquares, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![65.0, 45.0]); + + // SumSquares, uncovered=Some(2.0): adds 2*2 = 4 per uncovered base. + // Bin 1 has 5 uncovered bp → +20. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals), + Summary::SumSquares, + n_bins, + Some(2.0), + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![65.0, 45.0 + 4.0 * 5.0]); + } + + #[test] + fn test_fill_binned_bases_and_bin_covered() { + // 4 bins of width 5 over [0, 20). One value covering [0, 12). + // Bin 0 [0, 5): fully covered (5). + // Bin 1 [5, 10): fully covered (5). + // Bin 2 [10, 15): partially covered (2 of 5). + // Bin 3 [15, 20): no coverage (0). + let start = 0; + let end = 20; + let n_bins = 4; + let intervals = vec![(0u32, 12u32, 1.0f32)]; + + // BasesCovered: actual integer count of covered bases per bin. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::BasesCovered, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![5.0, 5.0, 2.0, 0.0]); + + // `uncovered` must not affect BasesCovered. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::BasesCovered, + n_bins, + Some(99.0), + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![5.0, 5.0, 2.0, 0.0]); + + // BinCovered: bases_covered / bin_size. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals), + Summary::BinCovered, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert_eq!(arr.to_vec(), vec![1.0, 1.0, 0.4, 0.0]); + } + + #[test] + fn test_fill_binned_std() { + // 2 bins over [0, 10). Bin 0 sees two distinct values; bin 1 is empty. + // [0, 3) v=1 → contributes value 1 over 3 bases in bin 0 + // [3, 5) v=4 → contributes value 4 over 2 bases in bin 0 + // Bin 0 sum = 1*3 + 4*2 = 11; sum_squares = 1*3 + 16*2 = 35; n_covered = 5. + // Sample variance = (35 - 11*11/5) / (5-1) = (35 - 24.2) / 4 = 2.7 + // std = sqrt(2.7). + let start = 0; + let end = 10; + let n_bins = 2; + let intervals = vec![(0u32, 3u32, 1.0f32), (3, 5, 4.0)]; + + // uncovered=None: classical sample std over only the covered bases. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals.clone()), + Summary::Std, + n_bins, + None, + arr.view_mut(), + ) + .unwrap(); + assert!((arr[0] - 2.7_f64.sqrt()).abs() < 1e-12); + assert!(arr[1].is_nan()); // empty bin → NaN + + // uncovered=Some(0.0): the 0-padded computation. + // Bin 0: s = 11 + 0*0 = 11; ss = 35 + 0 = 35; n = 5 (bin_width). + // variance = (35 - 121/5) / 4 = (35 - 24.2)/4 = 2.7 + // std = sqrt(2.7). Same as None because bin 0 is fully covered. + // Bin 1: s = 0; ss = 0; n = 5. variance = 0/4 = 0. std = 0. + let mut arr = Array::from(vec![0.0; n_bins]); + fill_binned( + start, + end, + vals(intervals), + Summary::Std, + n_bins, + Some(0.0), + arr.view_mut(), + ) + .unwrap(); + assert!((arr[0] - 2.7_f64.sqrt()).abs() < 1e-12); + assert!((arr[1] - 0.0).abs() < 1e-12); + } } From 5698e893807954339c3fbb37530c176ea3ff4d98 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 13 May 2026 11:37:01 -0400 Subject: [PATCH 15/17] deprecate: `missing` kwarg and the implicit fill-with-zero default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap the Rust `BBIReader` in a thin Python class (in `__init__.py`) that stages a deprecation cycle for `values()`: - `missing` is accepted as a deprecated alias for `fillna`. Using it raises a `DeprecationWarning`. - Not passing `fillna` at all raises a `DeprecationWarning` noting that the default behavior has changed: empty bins / uncovered positions are now returned as NaN instead of filled with 0. The Rust-level default (`fillna=None`) is applied. Users who want the old behavior pass `fillna=0` explicitly; users who want to silence the warning pass `fillna=None`. The wrapper uses a sentinel to distinguish "user did not pass `fillna`" from "user passed `fillna=None`" — a distinction PyO3 can't make at the Rust boundary. Other attributes (`chroms`, `info`, `values`'s non-deprecated kwargs, context-manager protocol, etc.) are delegated to the underlying Rust object via `__getattr__` and explicit forwarders. Tests: - Two new tests pin the deprecation behavior: `test_values_missing_alias_deprecated` and `test_values_default_fillna_deprecated`. - Existing `.values(...)` calls in tests have `fillna=...` added explicitly (53 in test_raster.py, 38 in test_api.py) so tests don't rely on the default and don't trigger spurious warnings. - `pyproject.toml`: ignore `DeprecationWarning` from pybigtools during test runs so warnings don't flood output. Tests that exercise the deprecation path opt back in via `pytest.warns(...)`. --- pybigtools/pybigtools/__init__.py | 85 +++++++ pybigtools/pyproject.toml | 9 + pybigtools/tests/test_api.py | 148 ++++++++---- pybigtools/tests/test_raster.py | 364 ++++++++++++++++++++++++++---- 4 files changed, 519 insertions(+), 87 deletions(-) diff --git a/pybigtools/pybigtools/__init__.py b/pybigtools/pybigtools/__init__.py index e3b625e..d291550 100644 --- a/pybigtools/pybigtools/__init__.py +++ b/pybigtools/pybigtools/__init__.py @@ -1,10 +1,95 @@ # ruff: noqa: F403 # ruff: noqa: F405 # This is here to make sure that source installs with pip work correctly +import warnings as _warnings + from pybigtools.pybigtools import * from pybigtools import pybigtools as bt __doc__ = bt.__doc__ if hasattr(bt, "__all__"): __all__ = bt.__all__ + +_RustBBIReader = bt.BBIReader +_rust_open = bt.open + + +# Sentinel distinguishing "user did not pass" from "user passed None" in `values`. +_UNSET = object() + + +class BBIReader: + """Python-side wrapper around the Rust ``BBIReader`` that stages a + deprecation cycle on the ``values()`` parameters. All other attributes + are delegated to the underlying Rust object. + """ + + __slots__ = ("_rust",) + + def __init__(self, rust_reader): + object.__setattr__(self, "_rust", rust_reader) + + def __getattr__(self, name): + return getattr(self._rust, name) + + def __enter__(self): + self._rust.__enter__() + return self + + def __exit__(self, exc_type, exc_value, traceback): + return self._rust.__exit__(exc_type, exc_value, traceback) + + def values( + self, + chrom, + start=None, + end=None, + bins=None, + summary="mean", + exact=False, + uncovered=None, + oob=float("nan"), + fillna=_UNSET, + missing=_UNSET, + arr=None, + ): + if missing is not _UNSET: + _warnings.warn( + "`missing` is deprecated and will be removed in a future " + "release; use `fillna` instead.", + DeprecationWarning, + stacklevel=2, + ) + if fillna is _UNSET: + fillna = missing + if fillna is _UNSET: + _warnings.warn( + "The default behavior of `values()` has changed: empty bins / " + "uncovered positions are now returned as NaN instead of " + "filled with 0. Pass `fillna=0` to keep the previous " + "behavior, or `fillna=None` to silence this warning. See " + "also `uncovered` to include uncovered bases in summary " + "statistic calculations.", + DeprecationWarning, + stacklevel=2, + ) + fillna = None + return self._rust.values( + chrom, + start, + end, + bins=bins, + summary=summary, + exact=exact, + uncovered=uncovered, + oob=oob, + fillna=fillna, + arr=arr, + ) + + +def open(path, mode="r"): # noqa: A001 - intentional shadow of builtin + return BBIReader(_rust_open(path, mode)) + + del bt diff --git a/pybigtools/pyproject.toml b/pybigtools/pyproject.toml index 76f494e..8e7d966 100644 --- a/pybigtools/pyproject.toml +++ b/pybigtools/pyproject.toml @@ -49,6 +49,15 @@ repository = "https://github.com/jackh726/bigtools" [tool.maturin] python-source = "." +[tool.pytest.ini_options] +filterwarnings = [ + # Deprecation warnings from pybigtools' `values()` are expected during + # the deprecation cycle and would otherwise flood the test output. + # Tests that exercise the deprecation path itself use + # `pytest.warns(DeprecationWarning)` locally. + "ignore::DeprecationWarning:pybigtools", +] + [tool.uv] default-groups = ["dev"] cache-keys = [{file = "pyproject.toml"}, {file = "Cargo.toml"}, {file = "**/*.rs"}] diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index c2d3cc1..0d7c5e0 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -258,65 +258,91 @@ def test_zoom_records_oob(bw, bb): def test_values_bp(bw, bb): # (chrom, None, None) => all values on chrom - assert len(bw.values("chr17")) == 83_257_441 - assert len(bb.values("chr21")) == 48_129_895 + assert len(bw.values("chr17", fillna=None)) == 83_257_441 + assert len(bb.values("chr21", fillna=None)) == 48_129_895 # (chrom, start, None) => all values from (start, ) - assert len(bw.values("chr17", 0)) == 83_257_441 - assert len(bw.values("chr17", 10)) == 83_257_441 - 10 - assert len(bb.values("chr21", 0)) == 48_129_895 - assert len(bb.values("chr21", 10)) == 48_129_895 - 10 + assert len(bw.values("chr17", 0, fillna=None)) == 83_257_441 + assert len(bw.values("chr17", 10, fillna=None)) == 83_257_441 - 10 + assert len(bb.values("chr21", 0, fillna=None)) == 48_129_895 + assert len(bb.values("chr21", 10, fillna=None)) == 48_129_895 - 10 # (chrom, start, end) - assert len(bw.values("chr17", 100_000, 110_000)) == 10_000 - assert len(bb.values("chr21", 10_148_000, 10_158_000)) == 10_000 + assert len(bw.values("chr17", 100_000, 110_000, fillna=None)) == 10_000 + assert len(bb.values("chr21", 10_148_000, 10_158_000, fillna=None)) == 10_000 def test_values_binned(bw, bb): - assert len(bw.values("chr17", 100000, 110000, 10)) == 10 - assert len(bb.values("chr21", 10_148_000, 10_158_000, 10)) == 10 + assert len(bw.values("chr17", 100000, 110000, 10, fillna=None)) == 10 + assert len(bb.values("chr21", 10_148_000, 10_158_000, 10, fillna=None)) == 10 # Statistics over only the covered bases (`uncovered=None`). assert ( - bw.values("chr17", 100000, 110000, 10, uncovered=None)[0] == 0.44886381671868925 + bw.values("chr17", 100000, 110000, 10, uncovered=None, fillna=None)[0] + == 0.44886381671868925 + ) + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, uncovered=None, fillna=None)[0] + == 1.0 ) - assert bb.values("chr21", 10_148_000, 10_158_000, 10, uncovered=None)[0] == 1.0 assert ( - bw.values("chr17", 100000, 110000, 10, "max", uncovered=None)[0] + bw.values("chr17", 100000, 110000, 10, "max", uncovered=None, fillna=None)[0] == 1.1978399753570557 ) assert ( - bb.values("chr21", 10_148_000, 10_158_000, 10, "max", uncovered=None)[0] == 1.0 + bb.values( + "chr21", 10_148_000, 10_158_000, 10, "max", uncovered=None, fillna=None + )[0] + == 1.0 ) assert ( - bw.values("chr17", 100000, 110000, 10, "min", uncovered=None)[0] + bw.values("chr17", 100000, 110000, 10, "min", uncovered=None, fillna=None)[0] == 0.05403999984264374 ) assert ( - bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=None)[0] == 1.0 + bb.values( + "chr21", 10_148_000, 10_158_000, 10, "min", uncovered=None, fillna=None + )[0] + == 1.0 ) # Uncovered bases counted as 0 in min (min0 semantics). - assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=0)[0] == 0.0 + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, "min", uncovered=0, fillna=None)[ + 0 + ] + == 0.0 + ) def test_values_binned_exact(bw, bb): # Statistics over only the covered bases (`uncovered=None`). assert np.isclose( - bw.values("chr17", 100000, 110000, 10, "mean", exact=True, uncovered=None)[0], + bw.values( + "chr17", 100000, 110000, 10, "mean", exact=True, uncovered=None, fillna=None + )[0], 0.4542629980980206, ) assert ( bb.values( - "chr21", 10_148_000, 10_158_000, 10, "mean", exact=True, uncovered=None + "chr21", + 10_148_000, + 10_158_000, + 10, + "mean", + exact=True, + uncovered=None, + fillna=None, )[0] == 1.0 ) assert np.allclose( - bw.values("chr17", 59890, 59900, 10, "mean", exact=True, uncovered=0.0), + bw.values( + "chr17", 59890, 59900, 10, "mean", exact=True, uncovered=0.0, fillna=None + ), [ 0.0, 0.0, @@ -334,7 +360,9 @@ def test_values_binned_exact(bw, bb): def test_values_binned_uncovered_oob(bw, bb): assert np.allclose( - bw.values("chr17", 59890, 59900, 10, "mean", exact=True, uncovered=-1.0), + bw.values( + "chr17", 59890, 59900, 10, "mean", exact=True, uncovered=-1.0, fillna=None + ), [ -1.0, -1.0, @@ -349,41 +377,67 @@ def test_values_binned_uncovered_oob(bw, bb): ], ) - x = bw.values("chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0) + x = bw.values("chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0, fillna=None) assert math.isnan(x[0]) assert not math.isnan(x[19]) - x = bb.values("chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0) + x = bb.values("chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0, fillna=None) assert math.isnan(x[0]) assert not math.isnan(x[19]) - x = bw.values("chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=0.0) + x = bw.values( + "chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=0.0, fillna=None + ) assert x[0] == 0.0 - x = bb.values("chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=0.0) + x = bb.values( + "chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=0.0, fillna=None + ) assert x[0] == 0.0 - assert np.isclose(bw.values("chr17", 59890, 59900)[9], 0.06792) - assert np.isclose(bw.values("chr17", 59890, 59900, uncovered=None)[9], 0.06792) + assert np.isclose(bw.values("chr17", 59890, 59900, fillna=None)[9], 0.06792) + assert np.isclose( + bw.values("chr17", 59890, 59900, uncovered=None, fillna=None)[9], 0.06792 + ) def test_values_binned_exact_vs_from_zoom(bw, bb): # Statistics over only the covered bases (`uncovered=None`). - vals = bw.values("chr17", 85525, 85730, bins=2, exact=False, uncovered=None) + vals = bw.values( + "chr17", 85525, 85730, bins=2, exact=False, uncovered=None, fillna=None + ) assert np.allclose(vals, [0.10737355, 2.72889167]) - vals = bw.values("chr17", 85525, 85730, bins=2, exact=True, uncovered=None) + vals = bw.values( + "chr17", 85525, 85730, bins=2, exact=True, uncovered=None, fillna=None + ) assert np.allclose(vals, [0.06770935, 2.48644244]) - vals = bw.values("chr17", 59900, 60105, bins=2, exact=False, uncovered=None) + vals = bw.values( + "chr17", 59900, 60105, bins=2, exact=False, uncovered=None, fillna=None + ) assert np.allclose(vals, [0.53580606, 0.55134715]) - vals = bw.values("chr17", 59900, 60105, bins=2, exact=True, uncovered=None) + vals = bw.values( + "chr17", 59900, 60105, bins=2, exact=True, uncovered=None, fillna=None + ) assert np.allclose(vals, [0.53620019, 0.55277108]) vals = bb.values( - "chr21", 14_760_000, 14_800_000, bins=1, exact=False, uncovered=None + "chr21", + 14_760_000, + 14_800_000, + bins=1, + exact=False, + uncovered=None, + fillna=None, ) assert np.allclose(vals, [1.23191877]) vals = bb.values( - "chr21", 14_760_000, 14_800_000, bins=1, exact=True, uncovered=None + "chr21", + 14_760_000, + 14_800_000, + bins=1, + exact=True, + uncovered=None, + fillna=None, ) assert np.allclose(vals, [1.34086629]) @@ -393,7 +447,16 @@ def test_values_assign_to_array(bw, bb): arr = np.zeros(20) ret_arr = bw.values( - "chr17", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=np.nan, arr=arr + "chr17", + -10, + 10, + 20, + "mean", + exact=True, + uncovered=0.0, + oob=np.nan, + arr=arr, + fillna=None, ) assert math.isnan(arr[0]) assert arr[19] == 0.0 @@ -402,7 +465,16 @@ def test_values_assign_to_array(bw, bb): assert np.array_equal(arr, ret_arr, equal_nan=True) ret_arr = bb.values( - "chr21", -10, 10, 20, "mean", exact=True, uncovered=0.0, oob=np.nan, arr=arr + "chr21", + -10, + 10, + 20, + "mean", + exact=True, + uncovered=0.0, + oob=np.nan, + arr=arr, + fillna=None, ) assert math.isnan(arr[0]) assert arr[19] == 0.0 @@ -419,18 +491,18 @@ def test_values_oob(bw): end = chroms[chrom] + bin_size start = end - 2 * bin_size # Try to get values - v = bw.values(chrom, start, end, bins=3) + v = bw.values(chrom, start, end, bins=3, fillna=None) assert len(v) == 3 assert np.isnan(v[2]) # Note: also happens if explicitly setting oob: - v = bw.values(chrom, start, end, bins=3, oob=0) + v = bw.values(chrom, start, end, bins=3, oob=0, fillna=None) assert len(v) == 3 assert v[2] == 0 def test_big_gene_pred(): bb = pybigtools.open(REPO_ROOT / "bigtools/resources/test/bigGenePred.bb") - bb.values("chr21", 14_000_000, 18_000_000, 100) + bb.values("chr21", 14_000_000, 18_000_000, 100, fillna=None) def test_average_over_bed(bw, bb): diff --git a/pybigtools/tests/test_raster.py b/pybigtools/tests/test_raster.py index c81d775..906bec9 100644 --- a/pybigtools/tests/test_raster.py +++ b/pybigtools/tests/test_raster.py @@ -60,7 +60,7 @@ def big_bb_path(): def test_values(bw_path, bb_path): # Default `uncovered=None` (NaN): uncovered bases are NaN. for path in [bw_path, bb_path]: - x1 = pbt.open(path).values("chr1", 0, 12) + x1 = pbt.open(path).values("chr1", 0, 12, fillna=None) assert np.allclose( x1, [np.nan, 1, 1, np.nan, 2, 2, 2, 2, 3, 3, np.nan, np.nan], @@ -68,7 +68,7 @@ def test_values(bw_path, bb_path): ) for path in [bw_path, bb_path]: - x1 = pbt.open(path).values("chr1", -2, 14) + x1 = pbt.open(path).values("chr1", -2, 14, fillna=None) assert np.allclose( x1, [ @@ -94,10 +94,31 @@ def test_values(bw_path, bb_path): # `uncovered=0` reproduces the previous fill-with-zero behavior. for path in [bw_path, bb_path]: - x1 = pbt.open(path).values("chr1", 0, 12, uncovered=0) + x1 = pbt.open(path).values("chr1", 0, 12, uncovered=0, fillna=None) assert np.allclose(x1, [0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0], equal_nan=True) +def test_values_missing_alias_deprecated(bw_path): + # `missing` is a deprecated alias for `fillna`. Using it raises a + # DeprecationWarning but otherwise behaves identically to `fillna`. + with pytest.warns(DeprecationWarning, match=r"`missing` is deprecated"): + x = pbt.open(bw_path).values("chr1", 0, 12, missing=0) + # Behavior matches `fillna=0`. + y = pbt.open(bw_path).values("chr1", 0, 12, fillna=0) + assert np.array_equal(x, y) + + +def test_values_default_fillna_deprecated(bw_path): + # Not passing `fillna` triggers a DeprecationWarning about the default + # change. The new default is `fillna=None` (leave NaN), matching the + # current Rust-level default. + with pytest.warns(DeprecationWarning, match=r"default behavior of `values"): + x = pbt.open(bw_path).values("chr1", 0, 12) + # Equivalent to explicit fillna=None. + y = pbt.open(bw_path).values("chr1", 0, 12, fillna=None) + assert np.array_equal(x, y, equal_nan=True) + + def test_fillna(bw_path, bb_path): # `fillna` is a post-rasterization fill that replaces NaN at in-bounds # positions. It is INDEPENDENT of `oob`: out-of-chromosome positions are @@ -163,7 +184,14 @@ def test_binned_mean(bw_path, bb_path): # uncovered=None: classical mean over only covered bases; empty bins → NaN. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=None, summary="mean" + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=None, + summary="mean", + fillna=None, ) assert np.allclose( x1, @@ -173,25 +201,53 @@ def test_binned_mean(bw_path, bb_path): for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=None, summary="mean" + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=None, + summary="mean", + fillna=None, ) assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=None, summary="mean" + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=None, + summary="mean", + fillna=None, ) assert np.allclose(x1, [1, 1, 2, 2.5, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=None, summary="mean" + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=None, + summary="mean", + fillna=None, ) assert np.allclose(x1, [1.5, 2.5], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=None, summary="mean" + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=None, + summary="mean", + fillna=None, ) assert np.allclose(x1, [2], equal_nan=True) @@ -202,7 +258,14 @@ def test_binned_mean_uncovered_zero(bw_path, bb_path): uncovered = 0 for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="mean" + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, ) assert np.allclose( x1, @@ -212,7 +275,14 @@ def test_binned_mean_uncovered_zero(bw_path, bb_path): for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="mean" + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, ) assert np.allclose(x1, [0.5, 0.5, 2, 2, 3, uncovered], equal_nan=True) @@ -225,19 +295,40 @@ def test_binned_mean_uncovered_zero(bw_path, bb_path): # mean0: 0.5 0.5 2.0 2.5 1.0 for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="mean" + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, ) assert np.allclose(x1, [0.5, 0.5, 2.0, 2.5, 1.0], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="mean" + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, ) assert np.allclose(x1, [1, 10 / 6], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="mean" + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=uncovered, + summary="mean", + fillna=None, ) assert np.allclose(x1, [16 / 12], equal_nan=True) @@ -246,7 +337,14 @@ def test_binned_minmax(bw_path, bb_path): # uncovered=None: classical min/max over only covered bases; empty bins → NaN. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=None, summary="min" + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=None, + summary="min", + fillna=None, ) assert np.allclose( x1, @@ -254,7 +352,14 @@ def test_binned_minmax(bw_path, bb_path): equal_nan=True, ) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=None, summary="max" + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=None, + summary="max", + fillna=None, ) assert np.allclose( x1, @@ -264,41 +369,97 @@ def test_binned_minmax(bw_path, bb_path): for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=None, summary="min" + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=None, + summary="min", + fillna=None, ) assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=None, summary="max" + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=None, + summary="max", + fillna=None, ) assert np.allclose(x1, [1, 1, 2, 2, 3, np.nan], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=None, summary="min" + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=None, + summary="min", + fillna=None, ) assert np.allclose(x1, [1, 1, 2, 2, 3], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=None, summary="max" + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=None, + summary="max", + fillna=None, ) assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=None, summary="min" + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=None, + summary="min", + fillna=None, ) assert np.allclose(x1, [1, 2], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=None, summary="max" + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=None, + summary="max", + fillna=None, ) assert np.allclose(x1, [2, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=None, summary="min" + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=None, + summary="min", + fillna=None, ) assert np.allclose(x1, [1], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=None, summary="max" + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=None, + summary="max", + fillna=None, ) assert np.allclose(x1, [3], equal_nan=True) @@ -308,7 +469,14 @@ def test_binned_minmax_uncovered_zero(bw_path, bb_path): uncovered = 0 for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="min" + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, ) assert np.allclose( x1, @@ -316,7 +484,14 @@ def test_binned_minmax_uncovered_zero(bw_path, bb_path): equal_nan=True, ) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=12, exact=True, uncovered=uncovered, summary="max" + "chr1", + 0, + 12, + bins=12, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, ) assert np.allclose( x1, @@ -326,41 +501,97 @@ def test_binned_minmax_uncovered_zero(bw_path, bb_path): for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="min" + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, ) assert np.allclose(x1, [0, 0, 2, 2, 3, uncovered], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=6, exact=True, uncovered=uncovered, summary="max" + "chr1", + 0, + 12, + bins=6, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, ) assert np.allclose(x1, [1, 1, 2, 2, 3, uncovered], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="min" + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, ) assert np.allclose(x1, [0, 0, 2, 2, 0], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=5, exact=True, uncovered=uncovered, summary="max" + "chr1", + 0, + 12, + bins=5, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, ) assert np.allclose(x1, [1, 1, 2, 3, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="min" + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, ) assert np.allclose(x1, [0, 0], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=2, exact=True, uncovered=uncovered, summary="max" + "chr1", + 0, + 12, + bins=2, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, ) assert np.allclose(x1, [2, 3], equal_nan=True) for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="min" + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=uncovered, + summary="min", + fillna=None, ) assert np.allclose(x1, [0], equal_nan=True) x1 = pbt.open(path).values( - "chr1", 0, 12, bins=1, exact=True, uncovered=uncovered, summary="max" + "chr1", + 0, + 12, + bins=1, + exact=True, + uncovered=uncovered, + summary="max", + fillna=None, ) assert np.allclose(x1, [3], equal_nan=True) @@ -371,7 +602,13 @@ def test_binned_minmax_uncovered_zero(bw_path, bb_path): def test_binned_coverage_breadth(bw_path, bb_path, n_bins, exact): for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", 0, 12, bins=n_bins, exact=exact, summary="bases_covered" + "chr1", + 0, + 12, + bins=n_bins, + exact=exact, + summary="bases_covered", + fillna=None, ) assert np.nansum(x1) == 8 @@ -381,7 +618,7 @@ def test_values_arr_param(bw_path, bb_path): # same object. for path in [bw_path, bb_path]: arr = np.full(12, -7.0) - ret = pbt.open(path).values("chr1", 0, 12, uncovered=0, arr=arr) + ret = pbt.open(path).values("chr1", 0, 12, uncovered=0, arr=arr, fillna=None) assert ret is arr # same object assert np.allclose(arr, [0, 1, 1, 0, 2, 2, 2, 2, 3, 3, 0, 0]) @@ -397,6 +634,7 @@ def test_values_arr_param(bw_path, bb_path): summary="mean", uncovered=None, arr=arr, + fillna=None, ) assert ret is arr assert np.allclose(arr, [1, 1, 2, 2, 3, np.nan], equal_nan=True) @@ -405,22 +643,30 @@ def test_values_arr_param(bw_path, bb_path): def test_values_arr_wrong_size(bw_path): # Wrong-length `arr` raises ValueError. with pytest.raises(ValueError): - pbt.open(bw_path).values("chr1", 0, 12, arr=np.zeros(11)) # need 12 + pbt.open(bw_path).values( + "chr1", 0, 12, arr=np.zeros(11), fillna=None + ) # need 12 with pytest.raises(ValueError): pbt.open(bw_path).values( - "chr1", 0, 12, bins=6, exact=True, arr=np.zeros(5) + "chr1", + 0, + 12, + bins=6, + exact=True, + arr=np.zeros(5), + fillna=None, ) # need 6 def test_values_bad_args(bw_path): # Unknown chromosome. with pytest.raises(KeyError): - pbt.open(bw_path).values("nope", 0, 12) + pbt.open(bw_path).values("nope", 0, 12, fillna=None) # Unknown summary string. with pytest.raises(ValueError): - pbt.open(bw_path).values("chr1", 0, 12, bins=4, summary="median") + pbt.open(bw_path).values("chr1", 0, 12, bins=4, summary="median", fillna=None) # -- These tests check Parity with UCSC via pybbi -- @@ -433,14 +679,14 @@ def test_bbi_consistent_values(bw_path, bb_path, start, end): # pybigtools defaults to `uncovered=None` (NaN), pybbi to `missing=0.0`. # Pass matching values explicitly for the comparison. for path in [bw_path, bb_path]: - x1 = pbt.open(path).values("chr1", start, end, uncovered=0.0) + x1 = pbt.open(path).values("chr1", start, end, uncovered=0.0, fillna=None) x2 = bbi.open(path).fetch("chr1", start, end, missing=0.0) assert np.allclose(x1, x2, equal_nan=True) def test_bbi_consistent_values_match_coverage(bb_path, bw_path): - bb1 = pbt.open(bb_path).values("chr1", uncovered=0.0) - bw1 = pbt.open(bw_path).values("chr1", uncovered=0.0) + bb1 = pbt.open(bb_path).values("chr1", uncovered=0.0, fillna=None) + bw1 = pbt.open(bw_path).values("chr1", uncovered=0.0, fillna=None) assert np.allclose(bb1, bw1, equal_nan=True) bb2 = bbi.open(bb_path).fetch("chr1", 0, -1) @@ -463,7 +709,9 @@ def test_bbi_consistent_values_uncovered_oob( # uncovered bases as NaN." pbbi_missing = np.nan if uncovered is None else uncovered for path in [bw_path, bb_path]: - x1 = pbt.open(path).values("chr1", start, end, uncovered=uncovered, oob=oob) + x1 = pbt.open(path).values( + "chr1", start, end, uncovered=uncovered, oob=oob, fillna=None + ) x2 = bbi.open(path).fetch("chr1", start, end, missing=pbbi_missing, oob=oob) assert np.allclose(x1, x2, equal_nan=True) @@ -496,6 +744,7 @@ def test_bbi_consistent_binned(bw_path, bb_path, start, end, n_bins, exact, summ exact=exact, summary=summary, uncovered=None, + fillna=None, ) x2 = bbi.open(path).fetch( "chr1", @@ -529,7 +778,14 @@ def test_bbi_consistent_binned_uncovered_oob( # `missing=NaN`, both compute classical statistics over the covered bases. for path in [bw_path, bb_path]: x1 = pbt.open(path).values( - "chr1", start, end, bins=n_bins, exact=exact, uncovered=None, oob=oob + "chr1", + start, + end, + bins=n_bins, + exact=exact, + uncovered=None, + oob=oob, + fillna=None, ) x2 = bbi.open(path).fetch( "chr1", start, end, bins=n_bins, exact=exact, missing=np.nan, oob=oob @@ -546,7 +802,13 @@ def test_bbi_consistent_binned_upsample(bw_path, bb_path, start, end, n_bins, ex # so per-base vs empty-bin `uncovered` semantics agree. for path in [bb_path, bw_path]: x1 = pbt.open(path).values( - "chr1", start, end, bins=n_bins, exact=exact, uncovered=None + "chr1", + start, + end, + bins=n_bins, + exact=exact, + uncovered=None, + fillna=None, ) x2 = bbi.open(path).fetch( "chr1", start, end, bins=n_bins, exact=exact, missing=np.nan @@ -584,6 +846,7 @@ def test_bbi_consistent_binned_zoom_bw(big_bw_path, start, end, n_bins, summary) exact=False, summary=summary, uncovered=None, + fillna=None, ) x2 = bbi.open(big_bw_path).fetch( "chr17", @@ -616,6 +879,7 @@ def test_bbi_consistent_binned_zoom_bb(big_bb_path, start, end, n_bins, summary) exact=False, summary=summary, uncovered=None, + fillna=None, ) x2 = bbi.open(big_bb_path).fetch( "chr21", @@ -633,23 +897,25 @@ def test_values_fully_out_of_bounds(bw_path, bb_path): # Query entirely past the end of the chromosome. # chr1 length is 12. Query [20, 30) is fully OOB. for path in [bw_path, bb_path]: - x = pbt.open(path).values("chr1", 20, 30) + x = pbt.open(path).values("chr1", 20, 30, fillna=None) # Default oob=NaN. assert len(x) == 10 assert all(np.isnan(v) for v in x) # With oob=-1, the whole array is -1. for path in [bw_path, bb_path]: - x = pbt.open(path).values("chr1", 20, 30, oob=-1) + x = pbt.open(path).values("chr1", 20, 30, oob=-1, fillna=None) assert np.array_equal(x, np.full(10, -1.0)) # Query entirely before the start (negative range). for path in [bw_path, bb_path]: - x = pbt.open(path).values("chr1", -10, -2) + x = pbt.open(path).values("chr1", -10, -2, fillna=None) assert len(x) == 8 assert all(np.isnan(v) for v in x) # Binned version of the same — all bins OOB, all bins get oob. for path in [bw_path, bb_path]: - x = pbt.open(path).values("chr1", 20, 30, bins=5, exact=True, oob=0) + x = pbt.open(path).values( + "chr1", 20, 30, bins=5, exact=True, oob=0, fillna=None + ) assert np.array_equal(x, np.zeros(5)) From 7ab4a3d90b8afb8429c4d4ef47e812f4a0b0f47b Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 13 May 2026 12:35:27 -0400 Subject: [PATCH 16/17] Add mini.bw and mini.bb test fixtures --- pybigtools/tests/data/mini.bb | Bin 0 -> 732 bytes pybigtools/tests/data/mini.bw | Bin 0 -> 527 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 pybigtools/tests/data/mini.bb create mode 100644 pybigtools/tests/data/mini.bw diff --git a/pybigtools/tests/data/mini.bb b/pybigtools/tests/data/mini.bb new file mode 100644 index 0000000000000000000000000000000000000000..c9252708e634036222c94080fa7d7a8c7262af39 GIT binary patch literal 732 zcmaF;sk5Dhfq@}}2}l5eCX{Z0(##CZ32m~Pg5E@J|fYc;FHK;>`7}S7C F0RUWlA2 Date: Mon, 18 May 2026 13:35:30 -0400 Subject: [PATCH 17/17] Bump rustix in lockfile to enable nightly build --- Cargo.lock | 268 +++++++++++++++++++++++++++++------------------------ 1 file changed, 145 insertions(+), 123 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f0899e3..6a10cab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,24 +19,24 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.1" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" @@ -58,12 +58,13 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "once_cell_polyfill", + "windows-sys 0.61.2", ] [[package]] @@ -155,6 +156,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + [[package]] name = "byteorder" version = "1.4.3" @@ -190,20 +197,19 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.3.19" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fd304a20bff958a57f04c4e96a2e7594cc4490a0e809cbd48bb6437edaa452d" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.19" +version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01c6a3f08f1fe5662a35cfe393aec09c4df95f60ee93b7556505260f75eee9e1" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ "anstream", "anstyle", @@ -213,11 +219,11 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.3.12" +version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", "syn 2.0.89", @@ -225,9 +231,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "colorchoice" @@ -278,33 +284,19 @@ checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" [[package]] name = "errno" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" -dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ - "cc", "libc", + "windows-sys 0.61.2", ] [[package]] name = "fastrand" -version = "1.9.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" [[package]] name = "fnv" @@ -428,12 +420,6 @@ version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -449,12 +435,6 @@ dependencies = [ "libc", ] -[[package]] -name = "hermit-abi" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" - [[package]] name = "http" version = "0.2.9" @@ -489,36 +469,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" [[package]] -name = "instant" -version = "0.1.12" +name = "is_terminal_polyfill" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "io-lifetimes" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" -dependencies = [ - "hermit-abi 0.3.1", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "is-terminal" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" -dependencies = [ - "hermit-abi 0.3.1", - "io-lifetimes", - "rustix", - "windows-sys 0.48.0", -] +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -537,9 +491,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "libc" -version = "0.2.142" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libdeflate-sys" @@ -561,9 +515,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.3.7" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] name = "log" @@ -655,7 +609,7 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi", "libc", ] @@ -689,6 +643,12 @@ version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "openssl-probe" version = "0.1.5" @@ -802,7 +762,7 @@ version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "pyo3-build-config", "quote", @@ -854,15 +814,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags", -] - [[package]] name = "ring" version = "0.17.3" @@ -891,16 +842,15 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.37.19" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", - "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] @@ -977,7 +927,7 @@ version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254" dependencies = [ - "bitflags", + "bitflags 1.3.2", "core-foundation", "core-foundation-sys", "libc", @@ -1037,9 +987,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" @@ -1071,15 +1021,14 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.5.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", "rustix", - "windows-sys 0.45.0", + "windows-sys 0.52.0", ] [[package]] @@ -1200,9 +1149,9 @@ dependencies = [ [[package]] name = "utf8parse" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "wasi" @@ -1220,6 +1169,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-sys" version = "0.42.0" @@ -1237,35 +1192,38 @@ dependencies = [ [[package]] name = "windows-sys" -version = "0.45.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.42.2", + "windows-targets 0.48.0", ] [[package]] name = "windows-sys" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.48.0", + "windows-targets 0.52.6", ] [[package]] -name = "windows-targets" -version = "0.42.2" +name = "windows-sys" +version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", ] [[package]] @@ -1283,6 +1241,22 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -1295,6 +1269,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -1307,6 +1287,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -1319,6 +1305,18 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -1331,6 +1329,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -1343,6 +1347,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -1355,6 +1365,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -1366,3 +1382,9 @@ name = "windows_x86_64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"