From e5cc95c5aa2e5afe9af9fb952b7e4eda19d4d4bb Mon Sep 17 00:00:00 2001 From: Bernt Popp Date: Tue, 7 Apr 2026 14:22:42 +0200 Subject: [PATCH 1/4] fix: ONT amplicon pipeline uses dedicated ont_amplicon_params config (#79) The ONT amplicon pipeline was reading model_file from pacbio_params, which defaults to ERRHMM-SEQUEL.model (a PacBio Sequel model). The v0.43.5 "fix" only added a runtime warning but still used the wrong model. Changes: - Add ont_amplicon_params config section with ERRHMM-ONT.model default - Add OntAmpliconConfig TypedDict for type-safe parameter access - ONT pipeline reads from ont_amplicon_params (not pacbio_params) - CLI routes --model-file to ont_amplicon_params for --platform ont - Add _apply_ont_amplicon_params() function in CLI - Config schema validation and path resolution for ont_amplicon_params - Update docs with ONT configuration section Co-Authored-By: Claude Opus 4.6 (1M context) --- config.json | 6 +++ docs/about/changelog.md | 11 ++++++ docs/guides/amplicon-simulation.md | 31 ++++++++++++++- muc_one_up/cli/commands/reads.py | 38 ++++++++++++++++++- muc_one_up/config.py | 16 ++++++++ .../read_simulator/ont_amplicon_pipeline.py | 29 +++++++------- muc_one_up/type_defs.py | 10 +++++ tests/cli/test_amplicon_platform.py | 8 ++++ .../test_ont_amplicon_pipeline.py | 6 ++- 9 files changed, 135 insertions(+), 20 deletions(-) diff --git a/config.json b/config.json index 94425a7..fb6e54b 100644 --- a/config.json +++ b/config.json @@ -530,6 +530,12 @@ "min_rq": 0.99, "threads": 8 }, + "ont_amplicon_params": { + "model_type": "errhmm", + "model_file": "reference/pbsim3/ERRHMM-ONT.model", + "threads": 8, + "accuracy_mean": 0.95 + }, "amplicon_params": { "forward_primer": "GGAGAAAAGGAGACTTCGGCTACCCAG", "reverse_primer": "GCCGTTGTGCACCAGAGTAGAAGCTGA", diff --git a/docs/about/changelog.md b/docs/about/changelog.md index 52e5a0f..158fb53 100644 --- a/docs/about/changelog.md +++ b/docs/about/changelog.md @@ -9,6 +9,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- ONT amplicon pipeline now reads from dedicated `ont_amplicon_params` config section instead of `pacbio_params` -- was using PacBio Sequel model (ERRHMM-SEQUEL) for ONT reads (#79) +- CLI routes `--model-file` overrides to `ont_amplicon_params` for ONT and `pacbio_params` for PacBio + +### Added +- `ont_amplicon_params` config section with ONT-specific defaults (ERRHMM-ONT.model) +- `OntAmpliconConfig` TypedDict for type-safe ONT parameter access +- Config schema validation and path resolution for `ont_amplicon_params` +- Test for ONT platform using `ont_amplicon_params` (not `pacbio_params`) +- Documentation: ONT configuration section in amplicon simulation guide + --- ## [0.43.4] - 2026-04-06 diff --git a/docs/guides/amplicon-simulation.md b/docs/guides/amplicon-simulation.md index 47d4bd7..e36546a 100644 --- a/docs/guides/amplicon-simulation.md +++ b/docs/guides/amplicon-simulation.md @@ -108,16 +108,45 @@ Stages 1-4 (extraction, PCR bias, template generation) are shared. The platforms | Stage | PacBio (`--platform pacbio`) | ONT (`--platform ont`) | |-------|------------------------------|------------------------| +| Config section | `pacbio_params` | `ont_amplicon_params` | | pbsim3 pass_num | 3+ (multi-pass CLR) | 1 (single-pass) | | Consensus | CCS (multi-pass -> HiFi) | None (skip) | | minimap2 preset | `map-hifi` | `map-ont` | -| Error model | `QSHMM-SEQUEL.model` etc. | `ERRHMM-ONT-HQ.model` etc. | +| Error model | `QSHMM-SEQUEL.model` etc. | `ERRHMM-ONT.model` etc. | | Output suffix | `*_amplicon_hifi.bam` | `*_amplicon_ont.bam` | | Tool dependencies | pbsim3, ccs, samtools, minimap2 | pbsim3, samtools, minimap2 | !!! note "ONT does not require CCS" The ONT pipeline skips CCS consensus entirely. Each pbsim3 template produces one single-pass read, which is converted directly to FASTQ and aligned. This means `--coverage` maps directly to the number of output reads (before alignment filtering). +### ONT Configuration + +The ONT amplicon pipeline reads from the `ont_amplicon_params` config section (separate from `pacbio_params`): + +```json +"ont_amplicon_params": { + "model_type": "errhmm", + "model_file": "reference/pbsim3/ERRHMM-ONT.model", + "threads": 8, + "accuracy_mean": 0.95 +} +``` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `model_type` | string | `"errhmm"` | pbsim3 model type (`errhmm` or `qshmm`) | +| `model_file` | string | `ERRHMM-ONT.model` | pbsim3 ONT error model file | +| `threads` | int | `4` | Number of threads | +| `accuracy_mean` | float | `0.95` | Mean read accuracy | + +Available ONT models in `reference/pbsim3/`: + +- `ERRHMM-ONT.model` -- standard ONT error profile +- `QSHMM-ONT.model` -- quality score ONT model +- `QSHMM-ONT-HQ.model` -- high-quality ONT model (e.g., Kit14/Dorado) + +The `--model-file` CLI flag overrides `ont_amplicon_params.model_file` for ONT, and `pacbio_params.model_file` for PacBio. + --- ## PCR Length Bias Model diff --git a/muc_one_up/cli/commands/reads.py b/muc_one_up/cli/commands/reads.py index 48dbf4e..6a3ce60 100644 --- a/muc_one_up/cli/commands/reads.py +++ b/muc_one_up/cli/commands/reads.py @@ -136,6 +136,39 @@ def _setup_read_config( logging.info(f"Using random seed: {seed} (results will be reproducible)") +def _apply_ont_amplicon_params( + config: dict[str, Any], + model_type: str | None, + model_file: str | None, + seed: int | None, +) -> None: + """Apply ONT amplicon CLI overrides to ont_amplicon_params. + + Mutates config in place. Uses ont_amplicon_params section (separate + from pacbio_params) so the ONT pipeline gets an ONT-specific model. + + Raises: + click.ClickException: If required params are missing after merge. + """ + if "ont_amplicon_params" not in config: + config["ont_amplicon_params"] = {} + + params = config["ont_amplicon_params"] + if model_type is not None: + params["model_type"] = model_type + if model_file is not None: + params["model_file"] = model_file + if seed is not None: + params["seed"] = seed + logging.info("Using random seed: %d (results will be reproducible)", seed) + + # Default to ERRHMM-ONT.model if no model configured + if "model_file" not in params: + params["model_file"] = "reference/pbsim3/ERRHMM-ONT.model" + if "model_type" not in params: + params["model_type"] = "errhmm" + + def _apply_pacbio_params( config: dict[str, Any], model_type: str | None, @@ -564,7 +597,10 @@ def amplicon( "Add forward_primer and reverse_primer to config.json." ) - _apply_pacbio_params(config, model_type, model_file, seed) + if platform == "ont": + _apply_ont_amplicon_params(config, model_type, model_file, seed) + else: + _apply_pacbio_params(config, model_type, model_file, seed) # Apply PCR bias overrides if "pcr_bias" not in config["amplicon_params"]: diff --git a/muc_one_up/config.py b/muc_one_up/config.py index 7ce06a2..80a2690 100644 --- a/muc_one_up/config.py +++ b/muc_one_up/config.py @@ -184,6 +184,21 @@ ], "additionalProperties": False, }, + "ont_amplicon_params": { + "type": "object", + "properties": { + "model_type": {"type": "string", "enum": ["qshmm", "errhmm"]}, + "model_file": {"type": "string"}, + "threads": {"type": "number", "minimum": 1}, + "seed": {"type": ["number", "null"]}, + "accuracy_mean": { + "type": ["number", "null"], + "minimum": 0.0, + "maximum": 1.0, + }, + }, + "additionalProperties": False, + }, "constants": { "type": "object", # Allow either flat format (for backward compatibility with tests) @@ -649,6 +664,7 @@ def load_config_raw(config_path: str) -> dict[str, Any]: "sample_target_bed", ], "pacbio_params": ["model_file"], + "ont_amplicon_params": ["model_file"], } diff --git a/muc_one_up/read_simulator/ont_amplicon_pipeline.py b/muc_one_up/read_simulator/ont_amplicon_pipeline.py index 6fda296..88ef45b 100644 --- a/muc_one_up/read_simulator/ont_amplicon_pipeline.py +++ b/muc_one_up/read_simulator/ont_amplicon_pipeline.py @@ -50,7 +50,7 @@ def simulate_ont_amplicon_pipeline( no CCS consensus). See module docstring for pipeline stages. Args: - config: Pipeline config with tools, amplicon_params, pacbio_params, + config: Pipeline config with tools, amplicon_params, ont_amplicon_params, read_simulation sections. input_fa: Input FASTA (haploid or diploid). human_reference: Optional reference for alignment. @@ -68,10 +68,10 @@ def simulate_ont_amplicon_pipeline( from typing import cast - from ..type_defs import AmpliconConfig, PacbioConfig, ReadSimulationConfig + from ..type_defs import AmpliconConfig, OntAmpliconConfig, ReadSimulationConfig amplicon_params = cast(AmpliconConfig, config.get("amplicon_params", {})) - pacbio_params = cast(PacbioConfig, config.get("pacbio_params", {})) + ont_params = cast(OntAmpliconConfig, config.get("ont_amplicon_params", {})) tools = config.get("tools", {}) rs_raw = config.get("read_simulation", {}) rs_config = cast(ReadSimulationConfig, rs_raw) @@ -81,20 +81,20 @@ def simulate_ont_amplicon_pipeline( samtools_cmd = tools.get("samtools", "samtools") minimap2_cmd = tools.get("minimap2", "minimap2") - # Params from config - model_type = pacbio_params["model_type"] - model_file = pacbio_params["model_file"] - threads = pacbio_params.get("threads", 4) - seed = pacbio_params.get("seed") - accuracy_mean = pacbio_params.get("accuracy_mean", 0.95) + # ONT-specific params — uses ont_amplicon_params (NOT pacbio_params) + model_type = ont_params.get("model_type", "errhmm") + model_file = ont_params.get("model_file", "reference/pbsim3/ERRHMM-ONT.model") + threads = ont_params.get("threads", 4) + seed = ont_params.get("seed") + accuracy_mean = ont_params.get("accuracy_mean", 0.95) - # Warn if model file doesn't look like an ONT model + # Validate model file looks like an ONT model model_name = Path(model_file).name.upper() if "ONT" not in model_name and "NANOPORE" not in model_name: logging.warning( "ONT amplicon mode is using model file '%s' which does not appear " - "to be an ONT model. Consider using --model-file with an ONT model " - "(e.g., ERRHMM-ONT.model or QSHMM-ONT-HQ.model).", + "to be an ONT model. The default is ERRHMM-ONT.model. " + "Check ont_amplicon_params.model_file in config or use --model-file.", model_file, ) @@ -111,10 +111,7 @@ def simulate_ont_amplicon_pipeline( expected_range_tuple = (int(expected_range[0]), int(expected_range[1])) _rs_cov = rs_config.get("coverage") - _pb_cov = pacbio_params.get("coverage") - total_coverage: int = int( - _rs_cov if _rs_cov is not None else (_pb_cov if _pb_cov is not None else 30) - ) + total_coverage: int = int(_rs_cov if _rs_cov is not None else 30) pcr_bias_config = amplicon_params.get("pcr_bias", {}) diff --git a/muc_one_up/type_defs.py b/muc_one_up/type_defs.py index 2ac50ed..e7b090f 100644 --- a/muc_one_up/type_defs.py +++ b/muc_one_up/type_defs.py @@ -220,6 +220,16 @@ class PacbioConfig(TypedDict, total=False): length_max: int +class OntAmpliconConfig(TypedDict, total=False): + """ONT amplicon simulation parameters (pbsim3 single-pass mode).""" + + model_type: str + model_file: str + threads: int + seed: int | None + accuracy_mean: float + + # --------------------------------------------------------------------------- # Legacy aliases (kept for backward compatibility) # --------------------------------------------------------------------------- diff --git a/tests/cli/test_amplicon_platform.py b/tests/cli/test_amplicon_platform.py index a0a528e..e001974 100644 --- a/tests/cli/test_amplicon_platform.py +++ b/tests/cli/test_amplicon_platform.py @@ -69,6 +69,14 @@ def test_platform_ont_sets_ont_amplicon(tmp_path): assert config["read_simulation"]["simulator"] == "ont-amplicon" +def test_ont_platform_uses_ont_amplicon_params(tmp_path): + """--platform ont should populate ont_amplicon_params, not pacbio_params.""" + config = _invoke_amplicon(tmp_path, platform="ont") + assert "ont_amplicon_params" in config + # Default ONT model should be set + assert "ONT" in config["ont_amplicon_params"]["model_file"].upper() + + def test_assay_type_set_for_both_platforms(tmp_path): """Both platforms should set assay_type='amplicon'.""" for platform in [None, "ont"]: diff --git a/tests/read_simulator/test_ont_amplicon_pipeline.py b/tests/read_simulator/test_ont_amplicon_pipeline.py index bbbb998..11b685e 100644 --- a/tests/read_simulator/test_ont_amplicon_pipeline.py +++ b/tests/read_simulator/test_ont_amplicon_pipeline.py @@ -32,7 +32,7 @@ def ont_amplicon_config(tmp_path, muc1_primers): "reverse_primer": muc1_primers["reverse"], "pcr_bias": {"preset": "default"}, }, - "pacbio_params": { + "ont_amplicon_params": { "model_type": "errhmm", "model_file": str(model_file), "threads": 4, @@ -192,7 +192,9 @@ def test_warns_on_non_ont_model( import logging # Use a PacBio model name to trigger warning - ont_amplicon_config["pacbio_params"]["model_file"] = str(tmp_path / "ERRHMM-SEQUEL.model") + ont_amplicon_config["ont_amplicon_params"]["model_file"] = str( + tmp_path / "ERRHMM-SEQUEL.model" + ) (tmp_path / "ERRHMM-SEQUEL.model").write_text("model") mock_prep.return_value = _make_prep(tmp_path) From d5bae3c24617bc72b16381f7da3e171a55807489 Mon Sep 17 00:00:00 2001 From: Bernt Popp Date: Tue, 7 Apr 2026 14:25:37 +0200 Subject: [PATCH 2/4] fix: address review findings on PR #94 - Log when default ONT model is applied (visibility for missing config) - Consistent threads default: 8 in code, config, and docs (was 4 in code/docs) Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/guides/amplicon-simulation.md | 2 +- muc_one_up/cli/commands/reads.py | 1 + muc_one_up/read_simulator/ont_amplicon_pipeline.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/guides/amplicon-simulation.md b/docs/guides/amplicon-simulation.md index e36546a..ac00193 100644 --- a/docs/guides/amplicon-simulation.md +++ b/docs/guides/amplicon-simulation.md @@ -136,7 +136,7 @@ The ONT amplicon pipeline reads from the `ont_amplicon_params` config section (s |-----------|------|---------|-------------| | `model_type` | string | `"errhmm"` | pbsim3 model type (`errhmm` or `qshmm`) | | `model_file` | string | `ERRHMM-ONT.model` | pbsim3 ONT error model file | -| `threads` | int | `4` | Number of threads | +| `threads` | int | `8` | Number of threads | | `accuracy_mean` | float | `0.95` | Mean read accuracy | Available ONT models in `reference/pbsim3/`: diff --git a/muc_one_up/cli/commands/reads.py b/muc_one_up/cli/commands/reads.py index 6a3ce60..2c8f7c8 100644 --- a/muc_one_up/cli/commands/reads.py +++ b/muc_one_up/cli/commands/reads.py @@ -165,6 +165,7 @@ def _apply_ont_amplicon_params( # Default to ERRHMM-ONT.model if no model configured if "model_file" not in params: params["model_file"] = "reference/pbsim3/ERRHMM-ONT.model" + logging.info("Using default ONT model: %s", params["model_file"]) if "model_type" not in params: params["model_type"] = "errhmm" diff --git a/muc_one_up/read_simulator/ont_amplicon_pipeline.py b/muc_one_up/read_simulator/ont_amplicon_pipeline.py index 88ef45b..5bd2dde 100644 --- a/muc_one_up/read_simulator/ont_amplicon_pipeline.py +++ b/muc_one_up/read_simulator/ont_amplicon_pipeline.py @@ -84,7 +84,7 @@ def simulate_ont_amplicon_pipeline( # ONT-specific params — uses ont_amplicon_params (NOT pacbio_params) model_type = ont_params.get("model_type", "errhmm") model_file = ont_params.get("model_file", "reference/pbsim3/ERRHMM-ONT.model") - threads = ont_params.get("threads", 4) + threads = ont_params.get("threads", 8) seed = ont_params.get("seed") accuracy_mean = ont_params.get("accuracy_mean", 0.95) From 1bab68763304c49ba61a2959abb5f676c593f49f Mon Sep 17 00:00:00 2001 From: Bernt Popp Date: Tue, 7 Apr 2026 14:36:23 +0200 Subject: [PATCH 3/4] docs: add ont_amplicon_params to configuration reference Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/reference/configuration.md | 43 ++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 684671c..8bdb674 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -33,6 +33,7 @@ MucOneUp requires a JSON configuration file specifying: "read_simulation": { ... }, "nanosim_params": { ... }, "pacbio_params": { ... }, + "ont_amplicon_params": { ... }, "amplicon_params": { ... } } ``` @@ -595,11 +596,44 @@ Parameters for PacBio HiFi read simulation. --- +## ONT Amplicon Parameters Section + +### Purpose + +Parameters for ONT amplicon read simulation (pbsim3 single-pass mode). Separate from `pacbio_params` so each platform uses its own error model. + +### Structure + +```json +{ + "ont_amplicon_params": { + "model_type": "errhmm", + "model_file": "reference/pbsim3/ERRHMM-ONT.model", + "threads": 8, + "accuracy_mean": 0.95 + } +} +``` + +### Fields + +| Field | Type | Description | Default | +|-------|------|-------------|---------| +| `model_type` | string | "qshmm" or "errhmm" | "errhmm" | +| `model_file` | string | pbsim3 ONT error model file | ERRHMM-ONT.model | +| `threads` | number | Parallel threads (minimum 1) | 8 | +| `seed` | integer | Random seed (null = random) | null | +| `accuracy_mean` | number | Mean read accuracy (0.0-1.0) | 0.95 | + +Available ONT models: `ERRHMM-ONT.model`, `QSHMM-ONT.model`, `QSHMM-ONT-HQ.model`. + +--- + ## Amplicon Parameters Section ### Purpose -Parameters for PacBio amplicon read simulation with PCR length bias modeling. +Parameters for amplicon read simulation (shared by PacBio and ONT) with PCR length bias modeling. ### Structure @@ -747,6 +781,13 @@ See the [Amplicon Simulation Guide](../guides/amplicon-simulation.md) for detail "min_rq": 0.99, "threads": 4, "seed": 42 + }, + + "ont_amplicon_params": { + "model_type": "errhmm", + "model_file": "/path/to/pbsim3/ERRHMM-ONT.model", + "threads": 8, + "accuracy_mean": 0.95 } } ``` From 7d476b5b5051b34f2cc7292a0c8bfb95cb61c5b4 Mon Sep 17 00:00:00 2001 From: Bernt Popp Date: Tue, 7 Apr 2026 14:39:02 +0200 Subject: [PATCH 4/4] fix: address Copilot null-safety review on PR #94 - Treat None as missing in ONT pipeline (use `or` instead of default arg) - Fix docstring: remove incorrect Raises section, document fallback behavior - CLI: use `params.get()` truthy check to handle explicit null values - Schema: disallow null for accuracy_mean (must be a number) Co-Authored-By: Claude Opus 4.6 (1M context) --- muc_one_up/cli/commands/reads.py | 11 +++++------ muc_one_up/config.py | 2 +- muc_one_up/read_simulator/ont_amplicon_pipeline.py | 11 ++++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/muc_one_up/cli/commands/reads.py b/muc_one_up/cli/commands/reads.py index 2c8f7c8..64e6698 100644 --- a/muc_one_up/cli/commands/reads.py +++ b/muc_one_up/cli/commands/reads.py @@ -146,9 +146,8 @@ def _apply_ont_amplicon_params( Mutates config in place. Uses ont_amplicon_params section (separate from pacbio_params) so the ONT pipeline gets an ONT-specific model. - - Raises: - click.ClickException: If required params are missing after merge. + Falls back to sensible defaults (ERRHMM-ONT.model) when values are + missing or explicitly null. """ if "ont_amplicon_params" not in config: config["ont_amplicon_params"] = {} @@ -162,11 +161,11 @@ def _apply_ont_amplicon_params( params["seed"] = seed logging.info("Using random seed: %d (results will be reproducible)", seed) - # Default to ERRHMM-ONT.model if no model configured - if "model_file" not in params: + # Default to ERRHMM-ONT.model if missing or explicitly null + if not params.get("model_file"): params["model_file"] = "reference/pbsim3/ERRHMM-ONT.model" logging.info("Using default ONT model: %s", params["model_file"]) - if "model_type" not in params: + if not params.get("model_type"): params["model_type"] = "errhmm" diff --git a/muc_one_up/config.py b/muc_one_up/config.py index 80a2690..ef525fd 100644 --- a/muc_one_up/config.py +++ b/muc_one_up/config.py @@ -192,7 +192,7 @@ "threads": {"type": "number", "minimum": 1}, "seed": {"type": ["number", "null"]}, "accuracy_mean": { - "type": ["number", "null"], + "type": "number", "minimum": 0.0, "maximum": 1.0, }, diff --git a/muc_one_up/read_simulator/ont_amplicon_pipeline.py b/muc_one_up/read_simulator/ont_amplicon_pipeline.py index 5bd2dde..8da8214 100644 --- a/muc_one_up/read_simulator/ont_amplicon_pipeline.py +++ b/muc_one_up/read_simulator/ont_amplicon_pipeline.py @@ -82,11 +82,12 @@ def simulate_ont_amplicon_pipeline( minimap2_cmd = tools.get("minimap2", "minimap2") # ONT-specific params — uses ont_amplicon_params (NOT pacbio_params) - model_type = ont_params.get("model_type", "errhmm") - model_file = ont_params.get("model_file", "reference/pbsim3/ERRHMM-ONT.model") - threads = ont_params.get("threads", 8) - seed = ont_params.get("seed") - accuracy_mean = ont_params.get("accuracy_mean", 0.95) + # Treat None as missing (possible from partial configs with explicit nulls) + model_type = ont_params.get("model_type") or "errhmm" + model_file = ont_params.get("model_file") or "reference/pbsim3/ERRHMM-ONT.model" + threads = ont_params.get("threads") or 8 + seed = ont_params.get("seed") # None is valid here (random seed) + accuracy_mean = ont_params.get("accuracy_mean") or 0.95 # Validate model file looks like an ONT model model_name = Path(model_file).name.upper()