diff --git a/config.json b/config.json index 94425a7..fb6e54b 100644 --- a/config.json +++ b/config.json @@ -530,6 +530,12 @@ "min_rq": 0.99, "threads": 8 }, + "ont_amplicon_params": { + "model_type": "errhmm", + "model_file": "reference/pbsim3/ERRHMM-ONT.model", + "threads": 8, + "accuracy_mean": 0.95 + }, "amplicon_params": { "forward_primer": "GGAGAAAAGGAGACTTCGGCTACCCAG", "reverse_primer": "GCCGTTGTGCACCAGAGTAGAAGCTGA", diff --git a/docs/about/changelog.md b/docs/about/changelog.md index 52e5a0f..158fb53 100644 --- a/docs/about/changelog.md +++ b/docs/about/changelog.md @@ -9,6 +9,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- ONT amplicon pipeline now reads from dedicated `ont_amplicon_params` config section instead of `pacbio_params` -- was using PacBio Sequel model (ERRHMM-SEQUEL) for ONT reads (#79) +- CLI routes `--model-file` overrides to `ont_amplicon_params` for ONT and `pacbio_params` for PacBio + +### Added +- `ont_amplicon_params` config section with ONT-specific defaults (ERRHMM-ONT.model) +- `OntAmpliconConfig` TypedDict for type-safe ONT parameter access +- Config schema validation and path resolution for `ont_amplicon_params` +- Test for ONT platform using `ont_amplicon_params` (not `pacbio_params`) +- Documentation: ONT configuration section in amplicon simulation guide + --- ## [0.43.4] - 2026-04-06 diff --git a/docs/guides/amplicon-simulation.md b/docs/guides/amplicon-simulation.md index 47d4bd7..ac00193 100644 --- a/docs/guides/amplicon-simulation.md +++ b/docs/guides/amplicon-simulation.md @@ -108,16 +108,45 @@ Stages 1-4 (extraction, PCR bias, template generation) are shared. The platforms | Stage | PacBio (`--platform pacbio`) | ONT (`--platform ont`) | |-------|------------------------------|------------------------| +| Config section | `pacbio_params` | `ont_amplicon_params` | | pbsim3 pass_num | 3+ (multi-pass CLR) | 1 (single-pass) | | Consensus | CCS (multi-pass -> HiFi) | None (skip) | | minimap2 preset | `map-hifi` | `map-ont` | -| Error model | `QSHMM-SEQUEL.model` etc. | `ERRHMM-ONT-HQ.model` etc. | +| Error model | `QSHMM-SEQUEL.model` etc. | `ERRHMM-ONT.model` etc. | | Output suffix | `*_amplicon_hifi.bam` | `*_amplicon_ont.bam` | | Tool dependencies | pbsim3, ccs, samtools, minimap2 | pbsim3, samtools, minimap2 | !!! note "ONT does not require CCS" The ONT pipeline skips CCS consensus entirely. Each pbsim3 template produces one single-pass read, which is converted directly to FASTQ and aligned. This means `--coverage` maps directly to the number of output reads (before alignment filtering). +### ONT Configuration + +The ONT amplicon pipeline reads from the `ont_amplicon_params` config section (separate from `pacbio_params`): + +```json +"ont_amplicon_params": { + "model_type": "errhmm", + "model_file": "reference/pbsim3/ERRHMM-ONT.model", + "threads": 8, + "accuracy_mean": 0.95 +} +``` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `model_type` | string | `"errhmm"` | pbsim3 model type (`errhmm` or `qshmm`) | +| `model_file` | string | `ERRHMM-ONT.model` | pbsim3 ONT error model file | +| `threads` | int | `8` | Number of threads | +| `accuracy_mean` | float | `0.95` | Mean read accuracy | + +Available ONT models in `reference/pbsim3/`: + +- `ERRHMM-ONT.model` -- standard ONT error profile +- `QSHMM-ONT.model` -- quality score ONT model +- `QSHMM-ONT-HQ.model` -- high-quality ONT model (e.g., Kit14/Dorado) + +The `--model-file` CLI flag overrides `ont_amplicon_params.model_file` for ONT, and `pacbio_params.model_file` for PacBio. + --- ## PCR Length Bias Model diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 684671c..8bdb674 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -33,6 +33,7 @@ MucOneUp requires a JSON configuration file specifying: "read_simulation": { ... }, "nanosim_params": { ... }, "pacbio_params": { ... }, + "ont_amplicon_params": { ... }, "amplicon_params": { ... } } ``` @@ -595,11 +596,44 @@ Parameters for PacBio HiFi read simulation. --- +## ONT Amplicon Parameters Section + +### Purpose + +Parameters for ONT amplicon read simulation (pbsim3 single-pass mode). Separate from `pacbio_params` so each platform uses its own error model. + +### Structure + +```json +{ + "ont_amplicon_params": { + "model_type": "errhmm", + "model_file": "reference/pbsim3/ERRHMM-ONT.model", + "threads": 8, + "accuracy_mean": 0.95 + } +} +``` + +### Fields + +| Field | Type | Description | Default | +|-------|------|-------------|---------| +| `model_type` | string | "qshmm" or "errhmm" | "errhmm" | +| `model_file` | string | pbsim3 ONT error model file | ERRHMM-ONT.model | +| `threads` | number | Parallel threads (minimum 1) | 8 | +| `seed` | integer | Random seed (null = random) | null | +| `accuracy_mean` | number | Mean read accuracy (0.0-1.0) | 0.95 | + +Available ONT models: `ERRHMM-ONT.model`, `QSHMM-ONT.model`, `QSHMM-ONT-HQ.model`. + +--- + ## Amplicon Parameters Section ### Purpose -Parameters for PacBio amplicon read simulation with PCR length bias modeling. +Parameters for amplicon read simulation (shared by PacBio and ONT) with PCR length bias modeling. ### Structure @@ -747,6 +781,13 @@ See the [Amplicon Simulation Guide](../guides/amplicon-simulation.md) for detail "min_rq": 0.99, "threads": 4, "seed": 42 + }, + + "ont_amplicon_params": { + "model_type": "errhmm", + "model_file": "/path/to/pbsim3/ERRHMM-ONT.model", + "threads": 8, + "accuracy_mean": 0.95 } } ``` diff --git a/muc_one_up/cli/commands/reads.py b/muc_one_up/cli/commands/reads.py index 48dbf4e..64e6698 100644 --- a/muc_one_up/cli/commands/reads.py +++ b/muc_one_up/cli/commands/reads.py @@ -136,6 +136,39 @@ def _setup_read_config( logging.info(f"Using random seed: {seed} (results will be reproducible)") +def _apply_ont_amplicon_params( + config: dict[str, Any], + model_type: str | None, + model_file: str | None, + seed: int | None, +) -> None: + """Apply ONT amplicon CLI overrides to ont_amplicon_params. + + Mutates config in place. Uses ont_amplicon_params section (separate + from pacbio_params) so the ONT pipeline gets an ONT-specific model. + Falls back to sensible defaults (ERRHMM-ONT.model) when values are + missing or explicitly null. + """ + if "ont_amplicon_params" not in config: + config["ont_amplicon_params"] = {} + + params = config["ont_amplicon_params"] + if model_type is not None: + params["model_type"] = model_type + if model_file is not None: + params["model_file"] = model_file + if seed is not None: + params["seed"] = seed + logging.info("Using random seed: %d (results will be reproducible)", seed) + + # Default to ERRHMM-ONT.model if missing or explicitly null + if not params.get("model_file"): + params["model_file"] = "reference/pbsim3/ERRHMM-ONT.model" + logging.info("Using default ONT model: %s", params["model_file"]) + if not params.get("model_type"): + params["model_type"] = "errhmm" + + def _apply_pacbio_params( config: dict[str, Any], model_type: str | None, @@ -564,7 +597,10 @@ def amplicon( "Add forward_primer and reverse_primer to config.json." ) - _apply_pacbio_params(config, model_type, model_file, seed) + if platform == "ont": + _apply_ont_amplicon_params(config, model_type, model_file, seed) + else: + _apply_pacbio_params(config, model_type, model_file, seed) # Apply PCR bias overrides if "pcr_bias" not in config["amplicon_params"]: diff --git a/muc_one_up/config.py b/muc_one_up/config.py index 7ce06a2..ef525fd 100644 --- a/muc_one_up/config.py +++ b/muc_one_up/config.py @@ -184,6 +184,21 @@ ], "additionalProperties": False, }, + "ont_amplicon_params": { + "type": "object", + "properties": { + "model_type": {"type": "string", "enum": ["qshmm", "errhmm"]}, + "model_file": {"type": "string"}, + "threads": {"type": "number", "minimum": 1}, + "seed": {"type": ["number", "null"]}, + "accuracy_mean": { + "type": "number", + "minimum": 0.0, + "maximum": 1.0, + }, + }, + "additionalProperties": False, + }, "constants": { "type": "object", # Allow either flat format (for backward compatibility with tests) @@ -649,6 +664,7 @@ def load_config_raw(config_path: str) -> dict[str, Any]: "sample_target_bed", ], "pacbio_params": ["model_file"], + "ont_amplicon_params": ["model_file"], } diff --git a/muc_one_up/read_simulator/ont_amplicon_pipeline.py b/muc_one_up/read_simulator/ont_amplicon_pipeline.py index 6fda296..8da8214 100644 --- a/muc_one_up/read_simulator/ont_amplicon_pipeline.py +++ b/muc_one_up/read_simulator/ont_amplicon_pipeline.py @@ -50,7 +50,7 @@ def simulate_ont_amplicon_pipeline( no CCS consensus). See module docstring for pipeline stages. Args: - config: Pipeline config with tools, amplicon_params, pacbio_params, + config: Pipeline config with tools, amplicon_params, ont_amplicon_params, read_simulation sections. input_fa: Input FASTA (haploid or diploid). human_reference: Optional reference for alignment. @@ -68,10 +68,10 @@ def simulate_ont_amplicon_pipeline( from typing import cast - from ..type_defs import AmpliconConfig, PacbioConfig, ReadSimulationConfig + from ..type_defs import AmpliconConfig, OntAmpliconConfig, ReadSimulationConfig amplicon_params = cast(AmpliconConfig, config.get("amplicon_params", {})) - pacbio_params = cast(PacbioConfig, config.get("pacbio_params", {})) + ont_params = cast(OntAmpliconConfig, config.get("ont_amplicon_params", {})) tools = config.get("tools", {}) rs_raw = config.get("read_simulation", {}) rs_config = cast(ReadSimulationConfig, rs_raw) @@ -81,20 +81,21 @@ def simulate_ont_amplicon_pipeline( samtools_cmd = tools.get("samtools", "samtools") minimap2_cmd = tools.get("minimap2", "minimap2") - # Params from config - model_type = pacbio_params["model_type"] - model_file = pacbio_params["model_file"] - threads = pacbio_params.get("threads", 4) - seed = pacbio_params.get("seed") - accuracy_mean = pacbio_params.get("accuracy_mean", 0.95) + # ONT-specific params — uses ont_amplicon_params (NOT pacbio_params) + # Treat None as missing (possible from partial configs with explicit nulls) + model_type = ont_params.get("model_type") or "errhmm" + model_file = ont_params.get("model_file") or "reference/pbsim3/ERRHMM-ONT.model" + threads = ont_params.get("threads") or 8 + seed = ont_params.get("seed") # None is valid here (random seed) + accuracy_mean = ont_params.get("accuracy_mean") or 0.95 - # Warn if model file doesn't look like an ONT model + # Validate model file looks like an ONT model model_name = Path(model_file).name.upper() if "ONT" not in model_name and "NANOPORE" not in model_name: logging.warning( "ONT amplicon mode is using model file '%s' which does not appear " - "to be an ONT model. Consider using --model-file with an ONT model " - "(e.g., ERRHMM-ONT.model or QSHMM-ONT-HQ.model).", + "to be an ONT model. The default is ERRHMM-ONT.model. " + "Check ont_amplicon_params.model_file in config or use --model-file.", model_file, ) @@ -111,10 +112,7 @@ def simulate_ont_amplicon_pipeline( expected_range_tuple = (int(expected_range[0]), int(expected_range[1])) _rs_cov = rs_config.get("coverage") - _pb_cov = pacbio_params.get("coverage") - total_coverage: int = int( - _rs_cov if _rs_cov is not None else (_pb_cov if _pb_cov is not None else 30) - ) + total_coverage: int = int(_rs_cov if _rs_cov is not None else 30) pcr_bias_config = amplicon_params.get("pcr_bias", {}) diff --git a/muc_one_up/type_defs.py b/muc_one_up/type_defs.py index 2ac50ed..e7b090f 100644 --- a/muc_one_up/type_defs.py +++ b/muc_one_up/type_defs.py @@ -220,6 +220,16 @@ class PacbioConfig(TypedDict, total=False): length_max: int +class OntAmpliconConfig(TypedDict, total=False): + """ONT amplicon simulation parameters (pbsim3 single-pass mode).""" + + model_type: str + model_file: str + threads: int + seed: int | None + accuracy_mean: float + + # --------------------------------------------------------------------------- # Legacy aliases (kept for backward compatibility) # --------------------------------------------------------------------------- diff --git a/tests/cli/test_amplicon_platform.py b/tests/cli/test_amplicon_platform.py index a0a528e..e001974 100644 --- a/tests/cli/test_amplicon_platform.py +++ b/tests/cli/test_amplicon_platform.py @@ -69,6 +69,14 @@ def test_platform_ont_sets_ont_amplicon(tmp_path): assert config["read_simulation"]["simulator"] == "ont-amplicon" +def test_ont_platform_uses_ont_amplicon_params(tmp_path): + """--platform ont should populate ont_amplicon_params, not pacbio_params.""" + config = _invoke_amplicon(tmp_path, platform="ont") + assert "ont_amplicon_params" in config + # Default ONT model should be set + assert "ONT" in config["ont_amplicon_params"]["model_file"].upper() + + def test_assay_type_set_for_both_platforms(tmp_path): """Both platforms should set assay_type='amplicon'.""" for platform in [None, "ont"]: diff --git a/tests/read_simulator/test_ont_amplicon_pipeline.py b/tests/read_simulator/test_ont_amplicon_pipeline.py index bbbb998..11b685e 100644 --- a/tests/read_simulator/test_ont_amplicon_pipeline.py +++ b/tests/read_simulator/test_ont_amplicon_pipeline.py @@ -32,7 +32,7 @@ def ont_amplicon_config(tmp_path, muc1_primers): "reverse_primer": muc1_primers["reverse"], "pcr_bias": {"preset": "default"}, }, - "pacbio_params": { + "ont_amplicon_params": { "model_type": "errhmm", "model_file": str(model_file), "threads": 4, @@ -192,7 +192,9 @@ def test_warns_on_non_ont_model( import logging # Use a PacBio model name to trigger warning - ont_amplicon_config["pacbio_params"]["model_file"] = str(tmp_path / "ERRHMM-SEQUEL.model") + ont_amplicon_config["ont_amplicon_params"]["model_file"] = str( + tmp_path / "ERRHMM-SEQUEL.model" + ) (tmp_path / "ERRHMM-SEQUEL.model").write_text("model") mock_prep.return_value = _make_prep(tmp_path)