From 9b0aab05f1fe16614070696a30fcfd0a75533a77 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Mon, 1 Dec 2025 22:33:06 -0500 Subject: [PATCH 1/2] chore: Add additional pre-commit hooks for better repo-review compliance --- .github/dependabot.yml | 6 +- .github/workflows/ci.yml | 7 +- .pre-commit-config.yaml | 22 +- CHANGES.md | 224 ++++++++++-------- CITATION.cff | 6 +- CONTRIBUTING.md | 12 +- README.md | 15 +- docs/api-resources.rst | 15 +- docs/guide-bedtools.md | 3 - docs/guide-intervalops.md | 74 +++++- docs/guide-quickstart.rst | 4 +- docs/guide-recipes.md | 18 +- docs/index.rst | 2 +- .../tutorial_assign_motifs_to_peaks.ipynb | 2 +- src/bioframe/core/arrops.py | 24 +- src/bioframe/core/checks.py | 4 +- src/bioframe/core/specs.py | 5 +- src/bioframe/extras.py | 33 ++- src/bioframe/io/bed.py | 6 +- src/bioframe/io/fileops.py | 6 +- src/bioframe/ops.py | 18 +- src/bioframe/vis.py | 4 +- tests/test_bed.py | 167 +++++++------ tests/test_extras.py | 119 ++++------ tests/test_fileops.py | 11 +- uv.lock | 12 +- 26 files changed, 453 insertions(+), 366 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 3bf5d590..59519b08 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,9 +5,9 @@ updates: schedule: interval: "weekly" groups: - actions: - patterns: - - "*" + actions: + patterns: + - "*" - package-ecosystem: "pip" directory: "/" schedule: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3fba5e6f..28234994 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,22 +2,21 @@ name: CI on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: - Test: runs-on: ubuntu-latest strategy: matrix: - python-version: [ "3.10", "3.11", "3.12", "3.13" ] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v6 - name: Set up 
Python ${{ matrix.python-version }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3171329b..1ebd4d87 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,7 @@ +ci: + autoupdate_schedule: monthly + autoupdate_commit_msg: "chore: Update pre-commit hooks" + autofix_commit_msg: "style: Pre-commit fixes" repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 @@ -11,6 +15,22 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.7 hooks: - - id: ruff + - id: ruff-check types_or: [python, pyi, jupyter] args: [--fix, --show-fixes, --exit-non-zero-on-fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.10.0 + hooks: + - id: python-no-log-warn + - id: rst-backticks + - id: rst-directive-colons + - id: rst-inline-touching-normal + - id: text-unicode-replacement-char + + - repo: https://github.com/rbubley/mirrors-prettier + rev: v3.7.3 + hooks: + - id: prettier + args: ["--cache-location=.prettier_cache/cache"] diff --git a/CHANGES.md b/CHANGES.md index 9c86ead3..5c466d6d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -7,21 +7,25 @@ Date: 2025-04-08 API changes: -* bigtools engine for bigwig and bigbed. -* run length functions `mark_runs` and `compress_runs`. + +- bigtools engine for bigwig and bigbed. +- run length functions `mark_runs` and `compress_runs`. Maintenance: -* Numpy 2.x support. + +- Numpy 2.x support. 
## v0.7.2 Date: 2024-06-19 API changes: -* `read_alignment` function introduced in v0.7.0 has been pluralized to `read_alignments` + +- `read_alignment` function introduced in v0.7.0 has been pluralized to `read_alignments` Maintenance: -* Skip `read_alignments` tests on big-endian architectures by @nvictus in https://github.com/open2c/bioframe/pull/216 + +- Skip `read_alignments` tests on big-endian architectures by @nvictus in https://github.com/open2c/bioframe/pull/216 **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.7.1...v0.7.2 @@ -30,8 +34,9 @@ Maintenance: Date: 2024-06-17 Maintenance: -* Refactor join arrayops and intidx internals by @nvictus in https://github.com/open2c/bioframe/pull/204 -* NumPy 2.0 was released. Pin `numpy < 2` until we migrate. + +- Refactor join arrayops and intidx internals by @nvictus in https://github.com/open2c/bioframe/pull/204 +- NumPy 2.0 was released. Pin `numpy < 2` until we migrate. **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.7.0...v0.7.1 @@ -40,17 +45,21 @@ Maintenance: Date: 2024-05-20 API changes: -* Add `to_bed` function to validate and write standard BED files @gamazeps in https://github.com/open2c/bioframe/pull/203 -* `read_bam` deprecated in favor of `read_alignments` @gamazeps in https://github.com/open2c/bioframe/pull/206 + +- Add `to_bed` function to validate and write standard BED files @gamazeps in https://github.com/open2c/bioframe/pull/203 +- `read_bam` deprecated in favor of `read_alignments` @gamazeps in https://github.com/open2c/bioframe/pull/206 Documentation: -* Add "bioframe for bedtools users" guide to docs by @gamazeps in https://github.com/open2c/bioframe/pull/198 + +- Add "bioframe for bedtools users" guide to docs by @gamazeps in https://github.com/open2c/bioframe/pull/198 Bug fixes: -* Fix contig name and JSON issues in read_bam implementation by @gamazeps in https://github.com/open2c/bioframe/pull/206 + +- Fix contig name and JSON issues in read_bam 
implementation by @gamazeps in https://github.com/open2c/bioframe/pull/206 New Contributors: -* @gamazeps made their first contribution in https://github.com/open2c/bioframe/pull/203 + +- @gamazeps made their first contribution in https://github.com/open2c/bioframe/pull/203 **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.4...v0.7.0 @@ -59,15 +68,18 @@ New Contributors: Date: 2024-04-06 Maintenance: -* Migrate from setuptools `pkg_resources` to `importlib.resources` by @nvictus in https://github.com/open2c/bioframe/pull/194 -* Use `importlib.metadata` for versioning by @nvictus in https://github.com/open2c/bioframe/pull/195 + +- Migrate from setuptools `pkg_resources` to `importlib.resources` by @nvictus in https://github.com/open2c/bioframe/pull/194 +- Use `importlib.metadata` for versioning by @nvictus in https://github.com/open2c/bioframe/pull/195 Bug fixes: -* Overlap point segment patch #183 by @smitkadvani in https://github.com/open2c/bioframe/pull/184 -* #167: Replaced np.int with int as the attribute is deprecated by numpy by @harshit148 in https://github.com/open2c/bioframe/pull/192 + +- Overlap point segment patch #183 by @smitkadvani in https://github.com/open2c/bioframe/pull/184 +- #167: Replaced np.int with int as the attribute is deprecated by numpy by @harshit148 in https://github.com/open2c/bioframe/pull/192 New Contributors: -* @harshit148 made a first contribution in https://github.com/open2c/bioframe/pull/192 + +- @harshit148 made a first contribution in https://github.com/open2c/bioframe/pull/192 **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.3...v0.6.4 @@ -76,8 +88,9 @@ New Contributors: Date: 2024-03-11 Fixes: -* Prevent dropout from `closest` in some cases of left intervals with no neighbors by @agalitsyna in https://github.com/open2c/bioframe/pull/185 -* Fix overlap returning float indexes causing failing tests (numpy v1.22.4, pandas v1.5.2) by @agalitsyna in https://github.com/open2c/bioframe/pull/185 
+ +- Prevent dropout from `closest` in some cases of left intervals with no neighbors by @agalitsyna in https://github.com/open2c/bioframe/pull/185 +- Fix overlap returning float indexes causing failing tests (numpy v1.22.4, pandas v1.5.2) by @agalitsyna in https://github.com/open2c/bioframe/pull/185 **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.2...v0.6.3 @@ -86,10 +99,12 @@ Fixes: Date: 2024-02-08 Changes: -* cols and df_view_col passed to downstream functions by @smitkadvani in https://github.com/open2c/bioframe/pull/182 + +- cols and df_view_col passed to downstream functions by @smitkadvani in https://github.com/open2c/bioframe/pull/182 Fixes: -* Update to new UCSC hgdownload url by @golobor and @nvictus in https://github.com/open2c/bioframe/pull/187 + +- Update to new UCSC hgdownload url by @golobor and @nvictus in https://github.com/open2c/bioframe/pull/187 **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.1...v0.6.2 @@ -101,9 +116,9 @@ API changes: Default behavior of `ensure_nullable` option in `overlap` was modified to minimize the possibility of regressions in libraries that depend on legacy behavior. -* The new option was renamed `ensure_int` and is `True` by default. It ensures that output coordinate columns are always returned with an integer dtype, as was the case in prior versions. This is achieved by converting columns having non-nullable NumPy dtypes to Pandas nullable ones in the specific case where the result of an **outer join** generates missing values; otherwise, column dtypes are preserved unchanged in the output. -* Unlike previous minor versions of bioframe, the nullable dtype chosen will have the **same underlying type** as the corresponding column from the input (i.e, an input dataframe using `np.uint32` start coordinates may yield a `pd.UInt32` start column in the output). 
-* This behavior can be turned off by setting `ensure_int` to `False`, in which case outer joins on dataframes using NumPy dtypes may produce floating point output columns when missing values are introduced (stored as `NaN`), following the native casting behavior of such columns. +- The new option was renamed `ensure_int` and is `True` by default. It ensures that output coordinate columns are always returned with an integer dtype, as was the case in prior versions. This is achieved by converting columns having non-nullable NumPy dtypes to Pandas nullable ones in the specific case where the result of an **outer join** generates missing values; otherwise, column dtypes are preserved unchanged in the output. +- Unlike previous minor versions of bioframe, the nullable dtype chosen will have the **same underlying type** as the corresponding column from the input (i.e, an input dataframe using `np.uint32` start coordinates may yield a `pd.UInt32` start column in the output). +- This behavior can be turned off by setting `ensure_int` to `False`, in which case outer joins on dataframes using NumPy dtypes may produce floating point output columns when missing values are introduced (stored as `NaN`), following the native casting behavior of such columns. **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.6.0...v0.6.1 @@ -112,15 +127,18 @@ Default behavior of `ensure_nullable` option in `overlap` was modified to minimi Date: 2024-01-04 API changes: -* `overlap`: In previous versions, output coordinate columns were always converted to Pandas "nullable" `Int64` dtype before returning outer join results. In the interest of flexibility, memory efficiency, and least surprise, the coordinate columns returned in the output dataframe now preserve dtype from the input dataframes, following native type casting rules if missing data are introduced. We introduce the `ensure_nullable` argument to force Pandas nullable dtypes in the output coordinates. 
See the [docs](https://bioframe.readthedocs.io/en/latest/api-intervalops.html#bioframe.ops.overlap) for more details. (#178) + +- `overlap`: In previous versions, output coordinate columns were always converted to Pandas "nullable" `Int64` dtype before returning outer join results. In the interest of flexibility, memory efficiency, and least surprise, the coordinate columns returned in the output dataframe now preserve dtype from the input dataframes, following native type casting rules if missing data are introduced. We introduce the `ensure_nullable` argument to force Pandas nullable dtypes in the output coordinates. See the [docs](https://bioframe.readthedocs.io/en/latest/api-intervalops.html#bioframe.ops.overlap) for more details. (#178) Bug fixes: -* Fixed `coverage` with custom `cols1` (#170) + +- Fixed `coverage` with custom `cols1` (#170) Documentation: -* Added contributing guidelines and NumFOCUS affiliation. -* Updated README and added CITATION.cff file. -* Updated performance benchmarks. + +- Added contributing guidelines and NumFOCUS affiliation. +- Updated README and added CITATION.cff file. +- Updated performance benchmarks. 
**Full Changelog**: https://github.com/open2c/bioframe/compare/v0.5.1...v0.6.0 @@ -129,7 +147,8 @@ Documentation: Date: 2023-11-08 Bug fixes: -* Series are treated like dict in `make_chromarms` + +- Series are treated like dict in `make_chromarms` **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.5.0...v0.5.1 @@ -138,19 +157,22 @@ Bug fixes: Date: 2023-10-05 API changes: -* New builtin curated genome assembly database (metadata, chromsizes, cytobands): - * `bioframe.list_assemblies()` - * `bioframe.assembly_info()` -* New UCSC RGB color converter utility #158 -* Options added to `pair_by_distance` + +- New builtin curated genome assembly database (metadata, chromsizes, cytobands): + - `bioframe.list_assemblies()` + - `bioframe.assembly_info()` +- New UCSC RGB color converter utility #158 +- Options added to `pair_by_distance` Bug fixes: -* Make expand throw an error if both pad and scale are passed (#148) -* Fixes to bioframe.select query interval semantics (#147) + +- Make expand throw an error if both pad and scale are passed (#148) +- Fixes to bioframe.select query interval semantics (#147) Maintenance: -* Migrate to hatch build system and pyproject.toml -* Various refactorings + +- Migrate to hatch build system and pyproject.toml +- Various refactorings **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.4.1...v0.5.0 @@ -159,7 +181,8 @@ Maintenance: Date: 2023-04-22 Bug fixes: -* Fix bug introduced in the last release in `select` and `select_*` query interval semantics. Results of select are now consistent with the query interval being interpreted as half-open, closed on the left. + +- Fix bug introduced in the last release in `select` and `select_*` query interval semantics. Results of select are now consistent with the query interval being interpreted as half-open, closed on the left. 
**Full Changelog**: https://github.com/open2c/bioframe/compare/v0.4.0...v0.4.1 @@ -168,15 +191,17 @@ Bug fixes: Date: 2023-03-23 API changes: -* New strand-aware directionality options for `closest()` via `direction_col` #129. -* New index-based range query selectors on single bioframes to complement `select()` #128: - * `select_mask()` returns boolean indices corresponding to intervals that overlap the query region - * `select_indices()` returns integer indices corresponding to intervals that overlap the query region - * `select_labels()` returns pandas label indices corresponding to intervals that overlap the query region + +- New strand-aware directionality options for `closest()` via `direction_col` #129. +- New index-based range query selectors on single bioframes to complement `select()` #128: + - `select_mask()` returns boolean indices corresponding to intervals that overlap the query region + - `select_indices()` returns integer indices corresponding to intervals that overlap the query region + - `select_labels()` returns pandas label indices corresponding to intervals that overlap the query region Bug fixes: -* Import fixes in sandbox -* Relax bioframe validator to permit using same column as start and end (e.g. point variants). + +- Import fixes in sandbox +- Relax bioframe validator to permit using same column as start and end (e.g. point variants). 
**Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.3...v0.4.0 @@ -185,8 +210,9 @@ Bug fixes: Date: 2022-02-28 Bug fixes: -* fixed a couple functions returning an error instance instead of raising -* fetch_mrna link fixed + +- fixed a couple functions returning an error instance instead of raising +- fetch_mrna link fixed **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.2...v0.3.3 @@ -195,8 +221,9 @@ Bug fixes: Date: 2022-02-01 Bug fixes: -* fixed error in is_contained -* tutorial updates + +- fixed error in is_contained +- tutorial updates **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.1...v0.3.2 @@ -206,7 +233,7 @@ Date: 2021-11-15 API changes: -* `bioframe.sort_bedframe` does not append columns or modify their dtypes. +- `bioframe.sort_bedframe` does not append columns or modify their dtypes. **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.3.0...v0.3.1 @@ -215,46 +242,51 @@ API changes: Date: 2021-08-31 Conceptual changes: -* we formulated strict definitions for genomic intervals, dataframes, and - their various properties. All bioframe functions are expected to follow - to these definitions tightly. + +- we formulated strict definitions for genomic intervals, dataframes, and + their various properties. All bioframe functions are expected to follow + to these definitions tightly. API changes: -* reorganize modules: - * ops - operations on genomic interval dataframes - * extras - miscellaneous operations, most involving - genomic sequences and gene annotations - * vis - visualizations of genomic interval dataframes - * core.arrops - operations on genomic interval arrays - * core.checks - tests for definitions of genomic interval dataframes - * core.construction - construction and sanitation of genomic interval dataframes - * core.specs - specifications for the implementation of genomic intervals in pandas.dataframes - (i.e. 
column names, datatypes, etc) - * core.stringops - operations on genomic interval strings - * io.fileops - I/O on common file formats for genomic data - * io.schemas - schemas for standard tabular formats for genomic data storage - * io.resources - interfaces to popular online genomic data resources - -* new functions: extras.pair_by_distance, ops.sort_bedframe, ops.assign_view, - dataframe constructors - -* existing functions: - * expand: take negative values and fractional values - * overlap: change default suffixes, keep_order=True - * subtract: add return_index and keep_order - -* enable pd.NA for missing values, typecasting + +- reorganize modules: + - ops - operations on genomic interval dataframes + - extras - miscellaneous operations, most involving + genomic sequences and gene annotations + - vis - visualizations of genomic interval dataframes + - core.arrops - operations on genomic interval arrays + - core.checks - tests for definitions of genomic interval dataframes + - core.construction - construction and sanitation of genomic interval dataframes + - core.specs - specifications for the implementation of genomic intervals in pandas.dataframes + (i.e. 
column names, datatypes, etc) + - core.stringops - operations on genomic interval strings + - io.fileops - I/O on common file formats for genomic data + - io.schemas - schemas for standard tabular formats for genomic data storage + - io.resources - interfaces to popular online genomic data resources + +- new functions: extras.pair_by_distance, ops.sort_bedframe, ops.assign_view, + dataframe constructors + +- existing functions: + - expand: take negative values and fractional values + - overlap: change default suffixes, keep_order=True + - subtract: add return_index and keep_order + +- enable pd.NA for missing values, typecasting New data: -* add schemas for bedpe, gap, UCSCmRNA, pgsnp -* add tables with curated detailed genome assembly information + +- add schemas for bedpe, gap, UCSCmRNA, pgsnp +- add tables with curated detailed genome assembly information Bugfixes: -* None?.. + +- None?.. Miscellaneous: -* speed up frac_gc is faster now -* drop support for Python 3.6, add support for 3.9 + +- speed up frac_gc is faster now +- drop support for Python 3.6, add support for 3.9 **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.2.0...v0.3.0 @@ -263,12 +295,14 @@ Miscellaneous: Date: 2020-12-02 API changes: -* `read_chromsizes` and `fetch_chromsizes`: add new `as_bed` parameter. -* `read_chromsizes` and `fetch_chromsizes`: revert to filtering chromosome names by default, but clearly expose `filter_chroms` kwarg. + +- `read_chromsizes` and `fetch_chromsizes`: add new `as_bed` parameter. +- `read_chromsizes` and `fetch_chromsizes`: revert to filtering chromosome names by default, but clearly expose `filter_chroms` kwarg. Bug fixes: -* Fixed `bioframe.split` -* Restored `frac_genome_coverage` + +- Fixed `bioframe.split` +- Restored `frac_genome_coverage` **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.1.0...v0.2.0 @@ -279,13 +313,15 @@ Date: 2020-09-23 First beta release. 
What's new: -* New extensive dataframe genomic interval arithmetic toolsuite. -* Improved region handling and region querying functions. -* [Documentation!](https://bioframe.readthedocs.io/) + +- New extensive dataframe genomic interval arithmetic toolsuite. +- Improved region handling and region querying functions. +- [Documentation!](https://bioframe.readthedocs.io/) Maintenance: -* Dropped Python 2 support -* Refactoring of various genome operations and resources. -* Improved testing and linting + +- Dropped Python 2 support +- Refactoring of various genome operations and resources. +- Improved testing and linting **Full Changelog**: https://github.com/open2c/bioframe/compare/v0.0.12...v0.1.0 diff --git a/CITATION.cff b/CITATION.cff index c7153f06..d26ca5d4 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,14 +2,14 @@ cff-version: 1.2.0 type: software title: bioframe license: MIT -repository-code: 'https://github.com/open2c/bioframe' +repository-code: "https://github.com/open2c/bioframe" message: >- If you use this software, please cite it using the metadata from this file. 
authors: - given-names: Nezar family-names: Abdennur - orcid: 'https://orcid.org/0000-0001-5814-0864' + orcid: "https://orcid.org/0000-0001-5814-0864" - given-names: Geoffrey family-names: Fudenberg orcid: "https://orcid.org/0000-0001-5905-6517" @@ -57,7 +57,7 @@ preferred-citation: - family-names: Open2C - given-names: Nezar family-names: Abdennur - orcid: 'https://orcid.org/0000-0001-5814-0864' + orcid: "https://orcid.org/0000-0001-5814-0864" - given-names: Geoffrey family-names: Fudenberg orcid: "https://orcid.org/0000-0001-5905-6517" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aaa645f0..9c620ea6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,22 +1,19 @@ # Contributing - ## General guidelines If you haven't contributed to open-source before, we recommend you read [this excellent guide by GitHub on how to contribute to open source](https://opensource.guide/how-to-contribute). The guide is long, so you can gloss over things you're familiar with. If you're not already familiar with it, we follow the [fork and pull model](https://help.github.com/articles/about-collaborative-development-models) on GitHub. Also, check out this recommended [git workflow](https://www.asmeurer.com/git-workflow/). - ## Contributing Code This project has a number of requirements for all code contributed. -* We follow the [PEP-8 style](https://www.python.org/dev/peps/pep-0008/) convention. -* We use [NumPy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html). -* It's ideal if user-facing API changes or new features have documentation added. -* It is best if all new functionality and/or bug fixes have unit tests added with each use-case. - +- We follow the [PEP-8 style](https://www.python.org/dev/peps/pep-0008/) convention. +- We use [NumPy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html). +- It's ideal if user-facing API changes or new features have documentation added. 
+- It is best if all new functionality and/or bug fixes have unit tests added with each use-case. ## Setting up Your Development Environment @@ -96,7 +93,6 @@ This will build the documentation and serve it on a local http server which list Documentation from the `main` branch and tagged releases is automatically built and hosted on [readthedocs](https://readthedocs.org/). - ## Acknowledgments This document is based off of the [guidelines from the sparse project](https://github.com/pydata/sparse/blob/master/docs/contributing.rst). diff --git a/README.md b/README.md index 99a75fd9..cd7f38b1 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,9 @@ Bioframe enables flexible and scalable operations on genomic interval dataframes Bioframe is built directly on top of [Pandas](https://pandas.pydata.org/). Bioframe provides: -* A variety of genomic interval operations that work directly on dataframes. -* Operations for special classes of genomic intervals, including chromosome arms and fixed-size bins. -* Conveniences for diverse tabular genomic data formats and loading genome assembly summary information. +- A variety of genomic interval operations that work directly on dataframes. +- Operations for special classes of genomic intervals, including chromosome arms and fixed-size bins. +- Conveniences for diverse tabular genomic data formats and loading genome assembly summary information. Read the [documentation](https://bioframe.readthedocs.io/en/latest/), including the [guide](https://bioframe.readthedocs.io/en/latest/guide-intervalops.html), as well as the [publication](https://doi.org/10.1093/bioinformatics/btae088) for more information. @@ -34,10 +34,10 @@ pip install bioframe Interested in contributing to bioframe? That's great! To get started, check out the [contributing guide](https://github.com/open2c/bioframe/blob/main/CONTRIBUTING.md). 
Discussions about the project roadmap take place on the [Open2C Discord](https://discord.com/invite/qVfSbDYHNG) server and regular developer meetings scheduled there. Anyone can join and participate! - ## Interval operations Key genomic interval operations in bioframe include: + - `overlap`: Find pairs of overlapping genomic intervals between two dataframes. - `closest`: For every interval in a dataframe, find the closest intervals in a second dataframe. - `cluster`: Group overlapping intervals in a dataframe into clusters. @@ -46,6 +46,7 @@ Key genomic interval operations in bioframe include: Bioframe additionally has functions that are frequently used for genomic interval operations and can be expressed as combinations of these core operations and dataframe operations, including: `coverage`, `expand`, `merge`, `select`, and `subtract`. To `overlap` two dataframes, call: + ```python import bioframe as bf @@ -62,8 +63,8 @@ For these two input dataframes, with intervals all on the same chromosome: - To `merge` all overlapping intervals in a dataframe, call: + ```python import bioframe as bf @@ -90,12 +91,12 @@ ctcf_motif_calls = bioframe.read_table(jaspar_url, schema='jaspar', skiprows=1) ``` ## Tutorials -See this [jupyter notebook](https://github.com/open2c/bioframe/tree/master/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb) for an example of how to assign TF motifs to ChIP-seq peaks using bioframe. +See this [jupyter notebook](https://github.com/open2c/bioframe/tree/master/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb) for an example of how to assign TF motifs to ChIP-seq peaks using bioframe. 
## Citing -If you use ***bioframe*** in your work, please cite: +If you use **_bioframe_** in your work, please cite: ```bibtex @article{bioframe_2024, diff --git a/docs/api-resources.rst b/docs/api-resources.rst index 252b134d..088ee4dd 100644 --- a/docs/api-resources.rst +++ b/docs/api-resources.rst @@ -8,7 +8,7 @@ Bioframe provides a collection of genome assembly metadata for commonly used genomes. These are accessible through a convenient dataclass interface via :func:`bioframe.assembly_info`. The assemblies are listed in a manifest YAML file, and each assembly -has a mandatory companion file called `seqinfo` that contains the sequence +has a mandatory companion file called *seqinfo* that contains the sequence names, lengths, and other information. The records in the manifest file contain the following fields: @@ -22,7 +22,7 @@ the following fields: - ``default_units``: default assembly units to include from the seqinfo file - ``url``: URL to where the corresponding sequence files can be downloaded -The `seqinfo` file is a TSV file with the following columns (with header): +The *seqinfo* file is a TSV file with the following columns (with header): - ``name``: canonical sequence name - ``length``: sequence length @@ -31,21 +31,20 @@ The `seqinfo` file is a TSV file with the following columns (with header): - ``unit``: assembly unit of the chromosome (e.g., "primary", "non-nuclear", "decoy") - ``aliases``: comma-separated list of aliases for the sequence name -We currently do not include sequences with "alt" or "patch" roles in `seqinfo` files, but we +We currently do not include sequences with "alt" or "patch" roles in *seqinfo* files, but we do support the inclusion of additional decoy sequences (as used by so-called NGS *analysis sets* for human genome assemblies) by marking them as members of a "decoy" assembly unit. 
-The `cytoband` file is an optional TSV file with the following columns (with header): - +The *cytoband* file is an optional TSV file with the following columns (with header): - ``chrom``: chromosome name - ``start``: start position - ``end``: end position - ``band``: cytogenetic coordinate (name of the band) - ``stain``: Giesma stain result -The order of the sequences in the `seqinfo` file is treated as canonical. -The ordering of the chromosomes in the `cytobands` file should match the order -of the chromosomes in the `seqinfo` file. +The order of the sequences in the *seqinfo* file is treated as canonical. +The ordering of the chromosomes in the *cytobands* file should match the order +of the chromosomes in the *seqinfo* file. The manifest and companion files are stored in the ``bioframe/io/data`` directory. New assemblies can be requested by opening an issue on GitHub or by submitting a pull request. diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 74a5845d..a99a57a0 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -14,7 +14,6 @@ kernelspec: # Bioframe for bedtools users - Bioframe is built around the analysis of genomic intervals as a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) in memory, rather than working with tab-delimited text files saved on disk. Bioframe supports reading a number of standard genomics text file formats via [`read_table`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.fileops.read_table), including BED files (see [schemas](https://github.com/open2c/bioframe/blob/main/bioframe/io/schemas.py)), which will load them as pandas DataFrames, a complete list of helper functions is [available here](API_fileops). 
@@ -25,7 +24,6 @@ For example, with gtf files, you do not need to turn them into bed files, you ca Finally, if needed, bioframe provides a convenience function to write dataframes to a standard BED file using [`to_bed`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.bed.to_bed). - ## `bedtools intersect` ### Select unique entries from the first bed overlapping the second bed `-u` @@ -107,7 +105,6 @@ out = bf.overlap(A, B, how='inner', suffixes=('_', ''))[B.columns] > **Note:** This gives one row per overlap and can contain duplicates. The output dataframe of the former method will use the same pandas index as the input dataframe `B`, while the latter result --- the join output --- will have an integer range index, like a pandas merge. - ### Intersect multiple beds against A ```sh diff --git a/docs/guide-intervalops.md b/docs/guide-intervalops.md index ed9ea57e..67bdcb24 100644 --- a/docs/guide-intervalops.md +++ b/docs/guide-intervalops.md @@ -19,6 +19,7 @@ This guide provides an introdution into how to use bioframe to perform genomic i The following modules are used in this guide: ``` + ```{code-cell} ipython3 import itertools @@ -36,6 +37,7 @@ import bioframe.vis ```{eval-rst} The core objects in bioframe are pandas DatFrames of genomic intervals, or BedFrames. These can either be defined directly with :py:class:`pandas.DataFrame`: ``` + ```{code-cell} ipython3 df1 = pd.DataFrame([ ['chr1', 1, 5], @@ -45,15 +47,18 @@ df1 = pd.DataFrame([ columns=['chrom', 'start', 'end'] ) ``` + ```{eval-rst} Or via functions in :mod:`bioframe.core.construction`, e.g.: ``` + ```{code-cell} ipython3 df2 = bioframe.from_any( [['chr1', 4, 8], ['chr1', 10, 11]], name_col='chrom') ``` + ```{eval-rst} Or ingested from datasets and databases with functions in :mod:`bioframe.io.fileops` and :mod:`bioframe.io.resources`. 
``` @@ -68,9 +73,11 @@ BedFrames satisfy the following properties: Whether a dataframe satisfies these properties can be checked with :func:`bioframe.core.checks.is_bedframe`: ``` + ```{code-cell} ipython3 bioframe.is_bedframe(df2) ``` + ```{eval-rst} See :mod:`bioframe.core.checks` for other functions that test properties of BedFrames and :ref:`Technical Notes` for detailed definitions. :func:`bioframe.core.construction.sanitize_bedframe` attempts to modfiy a DataFrame such that it satisfies bedFrame requirements. @@ -79,6 +86,7 @@ See :mod:`bioframe.core.checks` for other functions that test properties of BedF ```{eval-rst} :py:mod:`bioframe.vis` provides plotting utilities for intervals: ``` + ```{code-cell} ipython3 bf.vis.plot_intervals(df1, show_coords=True, xlim=(0,16)) plt.title('bedFrame1 intervals'); @@ -88,15 +96,18 @@ plt.title('bedFrame2 intervals'); ``` ## Overlap + ```{eval-rst} Calculating the overlap between two sets of genomic intervals is a crucial genomic interval operation. Using :func:`bioframe.overlap`, we can see the two dataframes defined above, ``df1`` and ``df2``, contain two pairs of overlapping intervals: ``` + ```{code-cell} ipython3 overlapping_intervals = bf.overlap(df1, df2, how='inner', suffixes=('_1','_2')) display(overlapping_intervals) ``` + ```{code-cell} ipython3 for i, reg_pair in overlapping_intervals.iterrows(): bf.vis.plot_intervals_arr( @@ -108,20 +119,23 @@ for i, reg_pair in overlapping_intervals.iterrows(): show_coords = True) plt.title(f'overlapping pair #{i}') ``` -Note that we passed custom suffixes for the outputs (defaults are ``suffixes=("","_")``), -as well as a custom overlap mode (``how='inner'``). The default overlap mode, ``how='left'`` returns each interval in ``df1`` whether or not it overlaps an interval in ``df2``. + +Note that we passed custom suffixes for the outputs (defaults are `suffixes=("","_")`), +as well as a custom overlap mode (`how='inner'`). 
The default overlap mode, `how='left'` returns each interval in `df1` whether or not it overlaps an interval in `df2`. + ```{code-cell} ipython3 overlapping_intervals = bf.overlap(df1, df2) display(overlapping_intervals) ``` - ## Cluster + ```{eval-rst} It is often useful to find overlapping intervals within a single set of genomic intervals. In `bioframe`, this is achieved with :func:`bioframe.cluster`. This function returns a DataFrame where subsets of overlapping intervals are assigned to the same group, reported in a new column. To demonstrate the usage of :func:`bioframe.cluster`, we use the same ``df1`` as above: ``` + ```{code-cell} ipython3 df1 = pd.DataFrame([ ['chr1', 1, 5], @@ -136,20 +150,23 @@ bf.vis.plot_intervals(df1, show_coords=True, xlim=(0,16)) ``` Cluster returns a DataFrame where each interval is assigned to a group: + ```{code-cell} ipython3 df_annotated = bf.cluster(df1, min_dist=0) display(df_annotated) bf.vis.plot_intervals(df_annotated, labels=df_annotated['cluster'], show_coords=True, xlim=(0,16)) ``` -Note that using ``min_dist=0`` and ``min_dist=None`` give different results, as the latter only clusters overlapping intervals and not adjacent intervals: +Note that using `min_dist=0` and `min_dist=None` give different results, as the latter only clusters overlapping intervals and not adjacent intervals: + ```{code-cell} ipython3 df_annotated = bf.cluster(df1, min_dist=None) display(df_annotated) bf.vis.plot_intervals(df_annotated, labels=df_annotated['cluster'], show_coords=True, xlim=(0,16)) ``` -Extending the minimum distance to two (``min_dist=2``) makes all intervals part of the same cluster "0": +Extending the minimum distance to two (`min_dist=2`) makes all intervals part of the same cluster "0": + ```{code-cell} ipython3 df_annotated = bf.cluster(df1, min_dist=2) display(df_annotated) @@ -157,11 +174,13 @@ bf.vis.plot_intervals(df_annotated, labels=df_annotated['cluster'], show_coords= ``` ## Merge + ```{eval-rst} Instead of 
returning cluster assignments, :func:`bioframe.merge` returns a new dataframe of merged genomic intervals. As with :func:`bioframe.cluster`, using ``min_dist=0`` and ``min_dist=None`` gives different results. If ``min_dist=0``, this returns a dataframe of two intervals: ``` + ```{code-cell} ipython3 df_merged = bf.merge(df1, min_dist=0) @@ -169,7 +188,8 @@ display(df_merged) bf.vis.plot_intervals(df_merged, show_coords=True, xlim=(0,16)) ``` -If ``min_dist=None``, this returns a dataframe of three intervals: +If `min_dist=None`, this returns a dataframe of three intervals: + ```{code-cell} ipython3 df_merged = bf.merge(df1, min_dist=None) display(df_merged) @@ -177,13 +197,16 @@ bf.vis.plot_intervals(df_merged, show_coords=True, xlim=(0,16)) ``` ## Closest + ```{eval-rst} In genomics, it is often useful not only to find features that overlap, but also features that are nearby along the genome. In bioframe, this is achieved using :func:`bioframe.closest`. ``` + ```{code-cell} ipython3 closest_intervals = bf.closest(df1, df2, suffixes=('_1','_2')) display(closest_intervals) ``` + ```{code-cell} ipython3 for i, reg_pair in closest_intervals.iterrows(): bf.vis.plot_intervals_arr( @@ -199,6 +222,7 @@ for i, reg_pair in closest_intervals.iterrows(): ```{eval-rst} By default, :func:`bioframe.closest` reports overlapping intervals. This can be modified by passing ``ignore_overlap=True``. Note the closest pair #2 and #3, which did not overlap, remain the same: ``` + ```{code-cell} ipython3 closest_intervals = bf.closest(df1, df2, ignore_overlaps=True, suffixes=('_1','_2')) for i, reg_pair in closest_intervals.iterrows(): @@ -215,6 +239,7 @@ for i, reg_pair in closest_intervals.iterrows(): ```{eval-rst} Closest intervals within a single DataFrame can be found simply by passing a single dataframe to :func:`bioframe.closest`. The number of closest intervals to report per query interval can be adjusted with ``k``. 
``` + ```{code-cell} ipython3 bf.closest(df1, k=2) ``` @@ -223,6 +248,7 @@ bf.closest(df1, k=2) Closest intervals upstream of the features in df1 can be found by ignoring downstream and overlaps. Upstream/downstream direction is defined by genomic coordinates by default (smaller coordinate is upstream). ``` + ```{code-cell} ipython3 bf.closest(df1, df2, ignore_overlaps=True, @@ -233,6 +259,7 @@ bf.closest(df1, df2, If the features in df1 have direction (e.g., genes have transcription direction), then the definition of upstream/downstream direction can be changed to the direction of the features by `direction_col`: ``` + ```{code-cell} ipython3 df1["strand"] = np.where(np.random.rand(len(df1)) > 0.5, "+", "-") bf.closest(df1, df2, @@ -241,8 +268,8 @@ bf.closest(df1, df2, direction_col='strand') ``` - ## Coverage & Count Overlaps + ```{eval-rst} For two sets of genomic features, it is often useful to calculate the number of basepairs covered and the number of overlapping intervals. While these are fairly straightforward to compute from the output of :func:`bioframe.overlap` with :func:`pandas.groupby` and column renaming, since these are very frequently used, they are provided as core bioframe functions. 
``` @@ -251,12 +278,14 @@ For two sets of genomic features, it is often useful to calculate the number of df1_coverage = bf.coverage(df1, df2) display(df1_coverage) ``` + ```{code-cell} ipython3 df1_coverage_and_count = bf.count_overlaps(df1_coverage, df2) display(df1_coverage_and_count) ``` -This plot shows the coverage and number of overlaps for intervals in ``df1`` by ``df2``: +This plot shows the coverage and number of overlaps for intervals in `df1` by `df2`: + ```{code-cell} ipython3 bf.vis.plot_intervals( df1_coverage_and_count, @@ -268,15 +297,18 @@ bf.vis.plot_intervals(df2, show_coords=True, xlim=(0,16), colors='lightpink') ``` ## Subtract & Set Difference + ```{eval-rst} Bioframe has two functions for computing differences between sets of intervals: at the level of basepairs and at the level of whole intervals. Basepair-level subtraction is performed with :func:`bioframe.subtract`: ``` + ```{code-cell} ipython3 subtracted_intervals = bf.subtract(df1, df2) display(subtracted_intervals) ``` + ```{code-cell} ipython3 bf.vis.plot_intervals(subtracted_intervals, show_coords=True, xlim=(0,16)) ``` @@ -284,15 +316,18 @@ bf.vis.plot_intervals(subtracted_intervals, show_coords=True, xlim=(0,16)) ```{eval-rst} Interval-level differences are calculated with :func:`bioframe.setdiff`: ``` + ```{code-cell} ipython3 setdiff_intervals = bf.setdiff(df1, df2) display(setdiff_intervals) ``` + ```{code-cell} ipython3 bf.vis.plot_intervals(setdiff_intervals, show_coords=True, xlim=(0,16)) ``` ## Expand + ```{eval-rst} :func:`bioframe.expand` enables quick resizing of intervals. @@ -301,10 +336,12 @@ Expand supports additive resizing, with ``pad``. 
Note that unless subsequently trimmed (with :func:`bioframe.trim`), expanded intervals can have negative values: ``` + ```{code-cell} ipython3 expanded_intervals = bf.expand(df1, pad=2) display(expanded_intervals) ``` + ```{code-cell} ipython3 bf.vis.plot_intervals(expanded_intervals, show_coords=True, xlim=(0,16)) ``` @@ -312,15 +349,18 @@ bf.vis.plot_intervals(expanded_intervals, show_coords=True, xlim=(0,16)) ```{eval-rst} Expand also supports multiplicative resizing, with ``scale``. Note that ``scale=0`` resizes all intervals to their midpoints: ``` + ```{code-cell} ipython3 expanded_intervals = bf.expand(df1, scale=0) display(expanded_intervals) ``` + ```{code-cell} ipython3 bf.vis.plot_intervals(expanded_intervals, show_coords=True, xlim=(0,16)) ``` ## Genomic Views + ```{eval-rst} Certain interval operations are often used relative to a set of reference intervals, whether those are chromosomes, scaffolds, or sub-intervals of either. Bioframe formalizes this with the concept of a `genomic view`, implemented as pandas dataframes, termed viewFrames, that satisfy the following: @@ -339,6 +379,7 @@ The following genomic interval operations make use of views, though also have us ``` ## Complement + ```{eval-rst} Equally important to finding which genomic features overlap is finding those that do not. :func:`bioframe.complement` returns a BedFrame of intervals not covered by any intervals in an input BedFrame. @@ -354,13 +395,15 @@ display(df_complemented) bf.vis.plot_intervals(df_complemented, show_coords=True, xlim=(0,16), colors='lightpink') ``` -If no view is provided, complement is calculated per unique chromosome in the input with right limits of ``np.iinfo(np.int64).max``. +If no view is provided, complement is calculated per unique chromosome in the input with right limits of `np.iinfo(np.int64).max`. 
+ ```{code-cell} ipython3 df_complemented = bf.complement(df1) display(df_complemented) ``` ## Trim + ```{eval-rst} Certain regions are often best avoided for genomic analyses. :func:`bioframe.trim` trims intervals to a specified view. Intervals falling outside of view regions have their filled with null values. ``` @@ -378,7 +421,9 @@ view_df = pd.DataFrame( trimmed_intervals = bf.trim(df1, view_df) display(trimmed_intervals) ``` -Note that the last interval of ``df1`` fell beyond 'chr1q' and is now null, and the last interval now ends at 9 instead of 10. + +Note that the last interval of `df1` fell beyond 'chr1q' and is now null, and the last interval now ends at 9 instead of 10. + ```{code-cell} ipython3 bf.vis.plot_intervals(trimmed_intervals, show_coords=True, xlim=(0,16)) ``` @@ -386,6 +431,7 @@ bf.vis.plot_intervals(trimmed_intervals, show_coords=True, xlim=(0,16)) If no view is provided, this function trims intervals at zero to avoid negative values. ## Sorting + ```{eval-rst} If no view is provided, :func:`bioframe.sort_bedframe` sorts by ("chrom", "start", "end") columns: ``` @@ -403,17 +449,19 @@ display( bf.sort_bedframe(df_unsorted) ) ``` Views enable a specifying a sort order on a set of intervals. This flexibility is useful when the desired sorting order is non-lexicographical, e.g. with chrM after autosomes and chrX: + ```{code-cell} ipython3 display( bf.sort_bedframe(df_unsorted, view_df) ) ``` ## Selecting & Slicing -Since bioFrame operates directly with [pandas](https://pandas.pydata.org/) *DataFrames*, all typical selection and slicing operations are directly relevant. +Since bioFrame operates directly with [pandas](https://pandas.pydata.org/) _DataFrames_, all typical selection and slicing operations are directly relevant. 
```{eval-rst} Bioframe also provides a function :func:`bioframe.select` that enables selecting interval subsets using UCSC string format: ``` + ```{code-cell} ipython3 display( bioframe.select(df_unsorted,'chrX:8-14') ) ``` @@ -424,13 +472,13 @@ display( bioframe.select(df_unsorted,'chrX:8-14') ) Genomic analyses often deal with dataframes with inhomogeneously named columns. Bioframe offers a way to set the default column names that are most convenient for your analyses. -Default bedframe column names are stored in ``bioframe.core.specs_rc``. +Default bedframe column names are stored in `bioframe.core.specs_rc`. ```{code-cell} ipython3 bf.core.specs._rc ``` -If the dataframes we wish to work with have `['CHROMOSOME', 'LEFT', 'RIGHT']`, we can either pass cols to operations in ``bioframe.ops``: +If the dataframes we wish to work with have `['CHROMOSOME', 'LEFT', 'RIGHT']`, we can either pass cols to operations in `bioframe.ops`: ```{code-cell} ipython3 df1_diff_colnames = pd.DataFrame([ diff --git a/docs/guide-quickstart.rst b/docs/guide-quickstart.rst index f3946b4c..b4d2a6c3 100644 --- a/docs/guide-quickstart.rst +++ b/docs/guide-quickstart.rst @@ -8,14 +8,14 @@ Installation $ pip install bioframe -To install the latest development version of `bioframe` from +To install the latest development version of ``bioframe`` from github, first make a local clone of the github repository: .. code-block:: bash $ git clone https://github.com/open2c/bioframe -Then, compile and install `bioframe` in +Then, compile and install ``bioframe`` in `development mode `_. This installs the package without moving it to a system folder, and thus allows for testing changes to the python code on the fly. .. code-block:: bash diff --git a/docs/guide-recipes.md b/docs/guide-recipes.md index d12955a3..0a1da002 100644 --- a/docs/guide-recipes.md +++ b/docs/guide-recipes.md @@ -15,51 +15,67 @@ kernelspec: # How do I ## Obtain overlapping intervals with matching strandedness? 
-Use overlap with the ``on`` argument: + +Use overlap with the `on` argument: + ``` df = bf.overlap(df1, df2, on=[‘strand’]) ``` ## Obtain overlapping intervals with opposite strandedness? + Overlap then filter pairs of opposite strandedness: + ``` df = bf.overlap(df1, df2) df = df.loc[df["strand"]!=df["strand_"]] ``` + ## Obtain intervals that exceed 50% coverage by another set of intervals? + Coverage, then filter pairs by fractional coverage: + ``` df = bf.coverage(df1, df2) df = df[ ( df["coverage"] / (df["end"]-df["start"]) ) >=0.50] ``` ## Shift all intervals on the positive strand by 10bp? + Use pandas indexing: + ``` df.loc[df.strand=="+",["start", "end"]] += 10 ``` ## Obtain intervals overlapped by at least 2 intervals from another set? + Count overlaps, then filter: + ``` df = bf.count_overlaps(df1, df2) df = df[ df["count"] >= 2] ``` ## Find strand-specific downstream genomic features? + Use closest after filtering by strand, and passing the `ignore_upsream=True` argument. + ``` bioframe.closest(df1.loc[df1['strand']=='+'], df2, ignore_upstream=True) ``` For gener, the upstream/downstream direction might be defined by the direction of transcription. Use `direction_col='strand'` to set up the direction: + ``` bioframe.closest(df1, df2, ignore_upstream=True, direction_col='strand') ``` ## Drop non-autosomes from a bedframe? + Use pandas DataFrame.isin(values): + ``` df[ ~df.chrom.isin(['chrX','chrY'])] ``` diff --git a/docs/index.rst b/docs/index.rst index ba473692..d8891e4b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,7 +1,7 @@ .. bioframe documentation master file, created by sphinx-quickstart on Sat Apr 11 11:44:26 2020. You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. + contain the root ``toctree`` directive. 
bioframe ======== diff --git a/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb b/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb index 7ca68e6b..ea7a98b5 100644 --- a/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb +++ b/docs/tutorials/tutorial_assign_motifs_to_peaks.ipynb @@ -349,7 +349,7 @@ "\n", "print(\n", " f\"fraction of peaks without motifs \"\n", - " f\"{np.round(np.sum(motifs_per_peak==0)/len(motifs_per_peak), 2)}\"\n", + " f\"{np.round(np.sum(motifs_per_peak == 0) / len(motifs_per_peak), 2)}\"\n", ")" ] }, diff --git a/src/bioframe/core/arrops.py b/src/bioframe/core/arrops.py index b79abcb9..ab8c7527 100644 --- a/src/bioframe/core/arrops.py +++ b/src/bioframe/core/arrops.py @@ -354,14 +354,18 @@ def overlap_intervals(starts1, ends1, starts2, ends2, closed=False, sort=False): ) # Generate IDs of pairs of overlapping intervals - ovids1 = np.concatenate([ - np.repeat(ids1[match_2in1_mask], match_2in1_ends - match_2in1_starts), - ids1[arange_multi(match_1in2_starts, match_1in2_ends)], - ]) - ovids2 = np.concatenate([ - ids2[arange_multi(match_2in1_starts, match_2in1_ends)], - np.repeat(ids2[match_1in2_mask], match_1in2_ends - match_1in2_starts), - ]) + ovids1 = np.concatenate( + [ + np.repeat(ids1[match_2in1_mask], match_2in1_ends - match_2in1_starts), + ids1[arange_multi(match_1in2_starts, match_1in2_ends)], + ] + ) + ovids2 = np.concatenate( + [ + ids2[arange_multi(match_2in1_starts, match_2in1_ends)], + np.repeat(ids2[match_1in2_mask], match_1in2_ends - match_1in2_starts), + ] + ) if sort: idx = np.lexsort([ovids2, ovids1]) @@ -723,9 +727,7 @@ def closest_intervals( # Combine the results events1 = np.concatenate([left_ids1, right_ids1, ovids1]) events2 = np.concatenate([left_ids2, right_ids2, ovids2]) - dists = np.concatenate( - [left_dists, right_dists, np.zeros(ovids1.shape[0])] - ) + dists = np.concatenate([left_dists, right_dists, np.zeros(ovids1.shape[0])]) if len(events1) == 0: return np.array([], dtype=int), np.array([], dtype=int) 
diff --git a/src/bioframe/core/checks.py b/src/bioframe/core/checks.py index 88a413b2..826ac231 100644 --- a/src/bioframe/core/checks.py +++ b/src/bioframe/core/checks.py @@ -126,9 +126,7 @@ def is_cataloged( if not _verify_columns(view_df, [view_name_col], return_as_bool=True): if raise_errors: - raise ValueError(f"Could not find \ - `{view_name_col}` \ - column in view_df") + raise ValueError(f"Could not find `{view_name_col}` column in view_df") return False if not set(df[df_view_col].copy().dropna().values).issubset( diff --git a/src/bioframe/core/specs.py b/src/bioframe/core/specs.py index 42fab2f3..b541d4bc 100644 --- a/src/bioframe/core/specs.py +++ b/src/bioframe/core/specs.py @@ -31,8 +31,7 @@ def __init__(self, new_colnames): if isinstance(new_colnames, collections.abc.Iterable): if len(new_colnames) != 3: raise ValueError( - "Please, specify new columns using a list of " - "3 strings or a dict!" + "Please, specify new columns using a list of 3 strings or a dict!" ) ( _rc["colnames"]["chrom"], @@ -49,7 +48,7 @@ def __init__(self, new_colnames): ) else: raise ValueError( - "Please, specify new columns using a list of " "3 strings or a dict!" + "Please, specify new columns using a list of 3 strings or a dict!" 
) def __enter__(self): diff --git a/src/bioframe/extras.py b/src/bioframe/extras.py index d8cbe3b4..2743f47e 100644 --- a/src/bioframe/extras.py +++ b/src/bioframe/extras.py @@ -16,7 +16,7 @@ "mark_runs", "merge_runs", "pair_by_distance", - "seq_gc" + "seq_gc", ] @@ -242,8 +242,7 @@ def frac_mapped(df, fasta_records, return_input=True): if not set(df["chrom"].values).issubset(set(fasta_records.keys())): raise ValueError( - "chrom from intervals not in fasta_records: " - "double-check genome agreement" + "chrom from intervals not in fasta_records: double-check genome agreement" ) if not isinstance(fasta_records, dict): raise ValueError( @@ -550,7 +549,7 @@ def mark_runs( *, allow_overlaps: bool = False, reset_counter: bool = True, - run_col: str = 'run', + run_col: str = "run", cols: tuple[str, str, str] | None = None, ) -> pd.DataFrame: """ @@ -628,11 +627,9 @@ def mark_runs( # Find borders of consecutive equal values values = group[col].to_numpy() - if values.dtype.kind == 'f': + if values.dtype.kind == "f": is_value_border = np.r_[ - True, - ~np.isclose(values[1:], values[:-1], equal_nan=True), - False + True, ~np.isclose(values[1:], values[:-1], equal_nan=True), False ] else: is_value_border = np.r_[True, values[1:] != values[:-1], False] @@ -724,17 +721,15 @@ def merge_runs( col, allow_overlaps=allow_overlaps, reset_counter=False, - run_col='_run', + run_col="_run", ) - df_merged = ( - df_runs - .groupby('_run') - .agg(**{ - ck: (ck, 'first'), - sk: (sk, 'min'), - ek: (ek, 'max'), - col: (col, 'first'), - **agg - }) + df_merged = df_runs.groupby("_run").agg( + **{ + ck: (ck, "first"), + sk: (sk, "min"), + ek: (ek, "max"), + col: (col, "first"), + **agg, + } ) return df_merged.reset_index(drop=True) diff --git a/src/bioframe/io/bed.py b/src/bioframe/io/bed.py index 06d9d1a8..bf1aa030 100644 --- a/src/bioframe/io/bed.py +++ b/src/bioframe/io/bed.py @@ -47,6 +47,7 @@ We also don't enforce limiting name fields to 7-bit printable ascii. 
""" + from __future__ import annotations import pathlib @@ -96,7 +97,7 @@ "strand": "OU", "thickStart": "iu", "thickEnd": "iu", - "itemRgb": "iOU", # can believe 0 is i + "itemRgb": "iOU", # can believe 0 is i "blockCount": "iu", "blockSizes": "OU", "blockStarts": "OU", @@ -326,7 +327,8 @@ def check_itemRgb(df: pd.DataFrame) -> dict[bool]: # Check that the itemRgb column contains only integers between 0 and 255, inclusive is_in_range = ( - df["itemRgb"].astype(str) + df["itemRgb"] + .astype(str) .str.split(",") .apply(lambda x: all([int(i) >= 0 and int(i) <= 255 for i in x])) ).all() diff --git a/src/bioframe/io/fileops.py b/src/bioframe/io/fileops.py index 7d6cfcda..92a43f2a 100644 --- a/src/bioframe/io/fileops.py +++ b/src/bioframe/io/fileops.py @@ -500,16 +500,14 @@ def _find_ucsc_binary(path, cmd): elif path.endswith(cmd): if not os.path.isfile(path) and os.access(path, os.X_OK): raise ValueError( - f"{cmd} is absent in the provided path or cannot be " - f"executed: {path}. " + f"{cmd} is absent in the provided path or cannot be executed: {path}. " ) cmd = path else: cmd = os.path.join(path, cmd) if not os.path.isfile(cmd) and os.access(cmd, os.X_OK): raise ValueError( - f"{cmd} is absent in the provided path or cannot be " - f"executed: {path}. " + f"{cmd} is absent in the provided path or cannot be executed: {path}. " ) return cmd diff --git a/src/bioframe/ops.py b/src/bioframe/ops.py index 83180755..934603d8 100644 --- a/src/bioframe/ops.py +++ b/src/bioframe/ops.py @@ -976,8 +976,7 @@ def _closest_intidxs( tie_func = lambda x: tie_breaking_col(x).values # noqa: E731 else: ValueError( - "tie_breaking_col must be either a column label or " - "f(DataFrame) -> Series" + "tie_breaking_col must be either a column label or f(DataFrame) -> Series" ) # Find overlapping intervals per chromosome. 
@@ -1187,12 +1186,8 @@ def closest( df_overlap = None if return_overlap: - overlap_start = np.maximum( - df1[sk1].values[events1], df2[sk2].values[events2] - ) - overlap_end = np.minimum( - df1[ek1].values[events1], df2[ek2].values[events2] - ) + overlap_start = np.maximum(df1[sk1].values[events1], df2[sk2].values[events2]) + overlap_end = np.minimum(df1[ek1].values[events1], df2[ek2].values[events2]) have_overlap = overlap_start < overlap_end df_overlap = pd.DataFrame( { @@ -1223,7 +1218,7 @@ def closest( { "distance": np.maximum(distance_left, distance_right), }, - dtype=pd.Int64Dtype() + dtype=pd.Int64Dtype(), ) df_distance[na_mask] = pd.NA @@ -1235,10 +1230,7 @@ def closest( df_input2 = df2.iloc[events2].reset_index(drop=True) df_input2.columns = [col + suffixes[1] for col in df_input2.columns] df_input2 = df_input2.astype( - { - sk2 + suffixes[1]: pd.Int64Dtype(), - ek2 + suffixes[1]: pd.Int64Dtype() - } + {sk2 + suffixes[1]: pd.Int64Dtype(), ek2 + suffixes[1]: pd.Int64Dtype()} ) df_input2[na_mask] = pd.NA diff --git a/src/bioframe/vis.py b/src/bioframe/vis.py index 2c875df4..e65ec189 100644 --- a/src/bioframe/vis.py +++ b/src/bioframe/vis.py @@ -136,9 +136,7 @@ def plot_intervals_arr( else: labels = itertools.cycle(labels) - for (start, end, level, color, label) in zip( - starts, ends, levels, colors, labels - ): + for start, end, level, color, label in zip(starts, ends, levels, colors, labels): _plot_interval(start, end, level, facecolor=color) if label is not None: plt.text( diff --git a/tests/test_bed.py b/tests/test_bed.py index a65c24b7..7e9af31c 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -9,10 +9,9 @@ def test_involution(): with tempfile.TemporaryDirectory() as directory: - for schema in ['narrowPeak', 'bed12']: - bf = bioframe.read_table(f'tests/test_data/{schema}.bed', - schema=schema) - fname = os.path.join(directory, f'{schema}.bed') + for schema in ["narrowPeak", "bed12"]: + bf = bioframe.read_table(f"tests/test_data/{schema}.bed", 
schema=schema) + fname = os.path.join(directory, f"{schema}.bed") bioframe.to_bed(bf, fname) involution = bioframe.read_table(fname, schema=schema) pd.testing.assert_frame_equal(bf, involution) @@ -20,163 +19,163 @@ def test_involution(): def test_chrom_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') - bf.loc[0, 'chrom'] = 'value with space' + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") + bf.loc[0, "chrom"] = "value with space" with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - bf.loc[0, 'chrom'] = '' # must be non empty + bf.loc[0, "chrom"] = "" # must be non empty with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - bf.loc[0, 'chrom'] = 'a'*300 # must be shorter than 256 + bf.loc[0, "chrom"] = "a" * 300 # must be shorter than 256 with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_end_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') - bf.loc[0, 'end'] = 10 # end must be after start - bf.loc[0, 'start'] = 11 + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") + bf.loc[0, "end"] = 10 # end must be after start + bf.loc[0, "start"] = 11 with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_name_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') - bf.loc[0, 'name'] = '' # must not be empty + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") + bf.loc[0, 
"name"] = "" # must not be empty with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - bf.loc[0, 'name'] = 'a'*300 # must be less than 255 char + bf.loc[0, "name"] = "a" * 300 # must be less than 255 char with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_score_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") # negative value is enforced by the normal types - bf.loc[0, 'score'] = 1001 + bf.loc[0, "score"] = 1001 with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed'), strict_score=True) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed"), strict_score=True) - bf['score'] = '.' # enforced to be a number by the types + bf["score"] = "." 
# enforced to be a number by the types with pytest.raises(TypeError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_strand_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') - bf.loc[0, 'strand'] = '*' + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") + bf.loc[0, "strand"] = "*" with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_thick_validators(): with tempfile.TemporaryDirectory() as directory: - for direction in ['Start', 'End']: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') - bf.loc[0, 'start'] = 100 - bf.loc[0, 'end'] = 1000 - bf.loc[0, f'thick{direction}'] = 1001 + for direction in ["Start", "End"]: + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") + bf.loc[0, "start"] = 100 + bf.loc[0, "end"] = 1000 + bf.loc[0, f"thick{direction}"] = 1001 with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - bf.loc[0, f'thick{direction}'] = 99 + bf.loc[0, f"thick{direction}"] = 99 with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_itemRgb_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") bf["itemRgb"] = bf["itemRgb"].astype(str) - bf.loc[0, 'itemRgb'] = 'a,12,13' # must be integers + bf.loc[0, "itemRgb"] = "a,12,13" # must be integers with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - 
bf.loc[0, 'itemRgb'] = '12,13' # must be 1 or 3 integers + bf.loc[0, "itemRgb"] = "12,13" # must be 1 or 3 integers with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - bf.loc[0, 'itemRgb'] = '12,13,14,15' # must be 1 or 3 integers + bf.loc[0, "itemRgb"] = "12,13,14,15" # must be 1 or 3 integers with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - bf.loc[0, 'itemRgb'] = '12,13,300' # must be between 0 and 255 + bf.loc[0, "itemRgb"] = "12,13,300" # must be between 0 and 255 with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - bf.loc[0, 'itemRgb'] = '300' # must be between 0 and 255 + bf.loc[0, "itemRgb"] = "300" # must be between 0 and 255 with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_blockCount_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') - bf.loc[0, 'blockCount'] = 0 + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") + bf.loc[0, "blockCount"] = 0 with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_blockSizes_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') - bf.loc[0, 'blockCount'] = 2 - bf.loc[0, 'blockSizes'] = '2,a,' + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") + bf.loc[0, "blockCount"] = 2 + bf.loc[0, "blockSizes"] = "2,a," with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, 
os.path.join(directory, "foo.bed")) - bf.loc[0, 'blockCount'] = 2 - bf.loc[0, 'blockSizes'] = '2,2,2,' + bf.loc[0, "blockCount"] = 2 + bf.loc[0, "blockSizes"] = "2,2,2," with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) def test_blockStarts_validators(): with tempfile.TemporaryDirectory() as directory: - bf = bioframe.read_table('tests/test_data/bed12.bed', schema='bed12') - bf.loc[0, 'blockCount'] = 2 - bf.loc[0, 'blockSizes'] = '2,4,' - bf.loc[0, 'blockStarts'] = '0,a,' + bf = bioframe.read_table("tests/test_data/bed12.bed", schema="bed12") + bf.loc[0, "blockCount"] = 2 + bf.loc[0, "blockSizes"] = "2,4," + bf.loc[0, "blockStarts"] = "0,a," with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) - bf.loc[0, 'blockCount'] = 2 - bf.loc[0, 'blockSizes'] = '1,1,' - bf.loc[0, 'blockStarts'] = '0,2,5,' + bf.loc[0, "blockCount"] = 2 + bf.loc[0, "blockSizes"] = "1,1," + bf.loc[0, "blockStarts"] = "0,2,5," with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) # ends after end - bf.loc[0, 'start'] = 1 - bf.loc[0, 'end'] = 10 - bf.loc[0, 'blockCount'] = 1 - bf.loc[0, 'blockSizes'] = '100,' - bf.loc[0, 'blockStarts'] = '0,' + bf.loc[0, "start"] = 1 + bf.loc[0, "end"] = 10 + bf.loc[0, "blockCount"] = 1 + bf.loc[0, "blockSizes"] = "100," + bf.loc[0, "blockStarts"] = "0," with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) # ends before end - bf.loc[0, 'start'] = 1 - bf.loc[0, 'end'] = 10 - bf.loc[0, 'blockCount'] = 1 - bf.loc[0, 'blockSizes'] = '1,' - bf.loc[0, 'blockStarts'] = '0,' + bf.loc[0, "start"] = 1 + bf.loc[0, "end"] = 10 + bf.loc[0, "blockCount"] = 1 + bf.loc[0, "blockSizes"] = "1," + bf.loc[0, 
"blockStarts"] = "0," with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) # overlap - bf.loc[0, 'start'] = 1 - bf.loc[0, 'end'] = 10 - bf.loc[0, 'blockCount'] = 2 - bf.loc[0, 'blockSizes'] = '5,5,' - bf.loc[0, 'blockStarts'] = '0,1,' + bf.loc[0, "start"] = 1 + bf.loc[0, "end"] = 10 + bf.loc[0, "blockCount"] = 2 + bf.loc[0, "blockSizes"] = "5,5," + bf.loc[0, "blockStarts"] = "0,1," with pytest.raises(ValueError): - bioframe.to_bed(bf, os.path.join(directory, 'foo.bed')) + bioframe.to_bed(bf, os.path.join(directory, "foo.bed")) diff --git a/tests/test_extras.py b/tests/test_extras.py index 388d76a6..3a11e3c1 100644 --- a/tests/test_extras.py +++ b/tests/test_extras.py @@ -307,58 +307,43 @@ def test_pair_by_distance(): def test_mark_merge_runs(): - df1 = pd.DataFrame([ - # chr1 - # consecutive run of "c" - ["chr1", 85563, 129897, "c", 0.2], - ["chr1", 129897, 508340, "c", 0.8], - ["chr1", 508340, 620903, "c", 0.5], - - # singleton run of "c" separated by 1bp from previous run - ["chr1", 620904, 688020, "c", 0.7], - - # consecutive with previous interval but different value of "name" - ["chr1", 688020, 858415, "b", 0.8], - - # chr2 - ["chr2", 548402, 639680, "a", 0.6], - ["chr2", 639680, 1026586, "b", 0.8], - - # chr3 - ["chr3", 260538, 272930, "c", 0.5], - ["chr3", 272930, 470969, "c", 0.5], - ["chr3", 470969, 502336, "c", 0.5], - ], columns=["chrom", "start", "end", "name", "score"]) + df1 = pd.DataFrame( + [ + # chr1 + # consecutive run of "c" + ["chr1", 85563, 129897, "c", 0.2], + ["chr1", 129897, 508340, "c", 0.8], + ["chr1", 508340, 620903, "c", 0.5], + # singleton run of "c" separated by 1bp from previous run + ["chr1", 620904, 688020, "c", 0.7], + # consecutive with previous interval but different value of "name" + ["chr1", 688020, 858415, "b", 0.8], + # chr2 + ["chr2", 548402, 639680, "a", 0.6], + ["chr2", 639680, 1026586, "b", 0.8], + # chr3 + ["chr3", 260538, 
272930, "c", 0.5], + ["chr3", 272930, 470969, "c", 0.5], + ["chr3", 470969, 502336, "c", 0.5], + ], + columns=["chrom", "start", "end", "name", "score"], + ) runs = bioframe.mark_runs(df1, "name") assert ( runs["name"].to_numpy() == np.array(["c", "c", "c", "c", "b", "a", "b", "c", "c", "c"]) ).all() - assert ( - runs["run"].to_numpy() - == np.array([0, 0, 0, 1, 2, 0, 1, 0, 0, 0]) - ).all() + assert (runs["run"].to_numpy() == np.array([0, 0, 0, 1, 2, 0, 1, 0, 0, 0])).all() runs = bioframe.mark_runs(df1, "name", reset_counter=False) - assert ( - runs["run"].to_numpy() - == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5]) - ).all() + assert (runs["run"].to_numpy() == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5])).all() runs = bioframe.mark_runs(df1, "name", run_col="foo", reset_counter=False) - assert ( - runs["foo"].to_numpy() - == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5]) - ).all() + assert (runs["foo"].to_numpy() == np.array([0, 0, 0, 1, 2, 3, 4, 5, 5, 5])).all() - merged = bioframe.merge_runs( - df1, "name", agg={"score_mean": ("score", "mean")} - ) - assert ( - merged["name"].to_numpy() - == np.array(["c", "c", "b", "a", "b", "c"]) - ).all() + merged = bioframe.merge_runs(df1, "name", agg={"score_mean": ("score", "mean")}) + assert (merged["name"].to_numpy() == np.array(["c", "c", "b", "a", "b", "c"])).all() assert np.allclose( merged["score_mean"].to_numpy(), np.array([0.5, 0.7, 0.8, 0.6, 0.8, 0.5]), @@ -366,30 +351,29 @@ def test_mark_merge_runs(): def test_mark_merge_runs__with_overlaps(): - df1 = pd.DataFrame([ - # chr1 - # consecutive run of "c" - ["chr1", 85563, 129897, "c", 0.2], - ["chr1", 129897, 508340, "c", 0.8], - ["chr1", 508340, 620903, "c", 0.5], - - # singleton run of "c" separated by 1bp from previous run - ["chr1", 620904, 688020, "c", 0.7], - - # consecutive with previous interval but different value of "name" - ["chr1", 688020, 858415, "b", 0.8], - # overlapping with previous interval - ["chr1", 700000, 900000, "b", 0.8], - - # chr2 - ["chr2", 548402, 
639680, "a", 0.6], - ["chr2", 639680, 1026586, "b", 0.8], - - # chr3 - ["chr3", 260538, 272930, "c", 0.5], - ["chr3", 272930, 470969, "c", 0.5], - ["chr3", 470969, 502336, "c", 0.5], - ], columns=["chrom", "start", "end", "name", "score"]) + df1 = pd.DataFrame( + [ + # chr1 + # consecutive run of "c" + ["chr1", 85563, 129897, "c", 0.2], + ["chr1", 129897, 508340, "c", 0.8], + ["chr1", 508340, 620903, "c", 0.5], + # singleton run of "c" separated by 1bp from previous run + ["chr1", 620904, 688020, "c", 0.7], + # consecutive with previous interval but different value of "name" + ["chr1", 688020, 858415, "b", 0.8], + # overlapping with previous interval + ["chr1", 700000, 900000, "b", 0.8], + # chr2 + ["chr2", 548402, 639680, "a", 0.6], + ["chr2", 639680, 1026586, "b", 0.8], + # chr3 + ["chr3", 260538, 272930, "c", 0.5], + ["chr3", 272930, 470969, "c", 0.5], + ["chr3", 470969, 502336, "c", 0.5], + ], + columns=["chrom", "start", "end", "name", "score"], + ) with pytest.raises(ValueError): bioframe.mark_runs(df1, "name") @@ -399,7 +383,4 @@ def test_mark_merge_runs__with_overlaps(): runs["name"].to_numpy() == np.array(["c", "c", "c", "c", "b", "b", "a", "b", "c", "c", "c"]) ).all() - assert ( - runs["run"].to_numpy() - == np.array([0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0]) - ).all() + assert (runs["run"].to_numpy() == np.array([0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 0])).all() diff --git a/tests/test_fileops.py b/tests/test_fileops.py index 2bfa7b2f..0cd6fde8 100644 --- a/tests/test_fileops.py +++ b/tests/test_fileops.py @@ -51,18 +51,19 @@ def test_read_chromsizes(): def test_read_beds(): # Checking that we properly read common bed schemas - schemas = ['narrowPeak', 'jaspar', 'bed9', 'bed12'] + schemas = ["narrowPeak", "jaspar", "bed9", "bed12"] for schema in schemas: - _ = bioframe.read_table(f'tests/test_data/{schema}.bed', schema=schema, - schema_is_strict=True) + _ = bioframe.read_table( + f"tests/test_data/{schema}.bed", schema=schema, schema_is_strict=True + ) 
@pytest.mark.skipif(is_big_endian, reason="Test skipped on big-endian systems") def test_read_sam(): pytest.importorskip("pysam") # SAM file taken from https://github.com/samtools/samtools/blob/develop/examples/toy.sam - _ = bioframe.read_alignments('tests/test_data/toy.sam') + _ = bioframe.read_alignments("tests/test_data/toy.sam") @pytest.mark.skipif(is_big_endian, reason="Test skipped on big-endian systems") @@ -70,4 +71,4 @@ def test_read_bam(): pytest.importorskip("pysam") # converted toy.sam via `samtools view -bS toy.sam > toy.bam; # index file created with `samtools index toy.bam` - _ = bioframe.read_alignments('tests/test_data/toy.bam') + _ = bioframe.read_alignments("tests/test_data/toy.bam") diff --git a/uv.lock b/uv.lock index e7b40928..6fffeabe 100644 --- a/uv.lock +++ b/uv.lock @@ -221,6 +221,11 @@ docs = [ { name = "sphinx-rtd-theme", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, ] test = [ + { name = "biopython", version = "1.83", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, + { name = "biopython", version = "1.85", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "biopython", version = "1.86", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "pybbi" }, + { name = "pysam" }, { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, { name = "pytest", version = "9.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, @@ -265,7 +270,12 @@ docs = [ { name = "sphinx-copybutton" }, { name = "sphinx-rtd-theme" }, ] -test = [{ name = "pytest", specifier = ">=6.0" }] +test = [ + { 
name = "biopython" }, + { name = "pybbi" }, + { name = "pysam" }, + { name = "pytest", specifier = ">=6.0" }, +] [[package]] name = "biopython" From ef7fd1c36a85884fc735afe1ee81ef2be167c555 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Mon, 1 Dec 2025 22:33:24 -0500 Subject: [PATCH 2/2] Restore version 2 identifier to rtd config --- .readthedocs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index ab8988dc..4ae06397 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -2,6 +2,8 @@ # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details +version: 2 + build: os: ubuntu-24.04 tools: