Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ The code for the software itself is also available and citeable in Zenodo: https
* [Usage](#usage)
* [Further documentation on the STRkit caller, including output format](#further-documentation-on-the-strkit-caller-including-output-format)
* [`strkit visualize`: Call visualizer](#strkit-visualize-call-visualizer)
* [`strkit merge`: VCF merging tool](#strkit-merge-vcf-merging-tool)
* [`strkit mi`: Mendelian inheritance analysis](#strkit-mi-mendelian-inheritance-analysis)
* [Usage](#usage-1)
* [Further documentation](#further-documentation)
Expand Down Expand Up @@ -296,6 +297,12 @@ the start-up instructions.



### `strkit merge`: VCF merging tool

Merges multiple STRkit-generated VCF files (produced by `strkit call`) into a single VCF. Usage: `strkit merge <vcf_file> [<vcf_file> ...]`



### `strkit mi`: Mendelian inheritance analysis

Using trio data, candidate de novo STR mutations (or genotyping errors/dropout rates) can be discovered
Expand Down
1 change: 1 addition & 0 deletions docs/output_formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ VCF meta fields (non-exhaustive):
* `phasing`: present and set to `partial` if using SNV/HP phasing
* `reference`: absolute file URI to FASTA reference (`file://<...>`)
* `strkitVersion`: STRkit version used to generate the file
* `strkitCommand`: STRkit subcommand used to generate the VCF (either `call` or `merge`)
* `strkitCatalogNumLoci`: Number of total loci in the catalog used for genotyping
* `strkitCatalogHash`: SHA256 hash of the loci loaded from the catalog

Expand Down
8 changes: 5 additions & 3 deletions strkit/call/call_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,17 +264,18 @@ def call_sample(

from heapq import merge as heapq_merge
from numpy.random import default_rng as np_default_rng
from pathlib import Path
from pysam import VariantFile

from strkit.logger import get_main_logger
from strkit.vcf_utils.header import build_vcf_header

from .loci import load_loci
from .output import (
output_json_report_header,
output_json_report_results,
output_json_report_footer,
output_tsv as output_tsv_fn,
build_vcf_header,
output_contig_vcf_lines,
)
from .utils import get_new_seed
Expand Down Expand Up @@ -312,8 +313,9 @@ def call_sample(
vf: VariantFile | None = None
if vcf_path is not None:
vh = build_vcf_header(
sample_id_str,
params.reference_file,
"call",
(sample_id_str,),
Path(params.reference_file),
partial_phasing=params.snv_vcf is not None or params.use_hp,
num_loci=num_loci,
loci_hash=loci_hash,
Expand Down
3 changes: 1 addition & 2 deletions strkit/call/output/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from .json_report import output_json_report_header, output_json_report_results, output_json_report_footer
from .tsv import output_tsv
from .vcf import build_vcf_header, output_contig_vcf_lines
from .vcf import output_contig_vcf_lines

__all__ = [
"output_json_report_header",
"output_json_report_results",
"output_json_report_footer",
"output_tsv",
"build_vcf_header",
"output_contig_vcf_lines",
]
154 changes: 19 additions & 135 deletions strkit/call/output/vcf.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
from __future__ import annotations

from collections import Counter
from datetime import datetime
from functools import cache
from os.path import commonprefix
from pathlib import Path
from pysam import FastaFile, VariantHeader
from typing import Iterable, TYPE_CHECKING

from strkit import __version__
from strkit import vcf_utils as vu
from strkit.utils import is_none, idx_0_getter
from ..utils import cn_getter

Expand All @@ -19,131 +15,19 @@
from ..types import LocusResult

__all__ = [
"build_vcf_header",
"output_contig_vcf_lines",
]


# VCF_ALLELE_CNV_TR = "<CNV:TR>"

# VCF_TR_INFO_RECORDS: tuple[tuple[str, str, str, str], ...] = (
# ("SVLEN", "A", "Integer", "Length of the structural variant"),
# ("CN", "A", "Float", "Copy number of allele"),
# ("RN", "A", "Integer", "Total number of repeat sequences in this allele"),
# ("RUS", ".", "String", "Repeat unit sequence of the corresponding repeat sequence"),
# ("RUL", ".", "Integer", "Repeat unit length of the corresponding repeat sequence"),
# ("RB", ".", "Integer", "Total number of bases in the corresponding repeat sequence"),
# ("CIRUC", ".", "Float", "Confidence interval around RUC"),
# ("CIRB", ".", "Integer", "Confidence interval around RB"),
# )

VCF_INFO_VT = "VT"
VCF_INFO_MOTIF = "MOTIF"
VCF_INFO_REFMC = "REFMC"
VCF_INFO_BED_START = "BED_START"
VCF_INFO_BED_END = "BED_END"
VCF_INFO_ANCH = "ANCH"

VT_STR = "str"
VT_SNV = "snv"


def iter_to_upper(x: Iterable[str]) -> Iterable[str]:
    """Lazily upper-case every string yielded by the given iterable."""
    return (s.upper() for s in x)


def build_vcf_header(
    sample_id: str, reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str
) -> VariantHeader:
    """Build the pysam VariantHeader for STRkit VCF output.

    Adds provenance meta lines, contigs taken from the reference FASTA,
    STRkit catalog metadata, the FORMAT/INFO field definitions used by the
    caller, and the single sample column.

    :param sample_id: Sample identifier added as the VCF's only sample column.
    :param reference_file: Path to the reference genome FASTA.
    :param partial_phasing: True if SNV/HP phasing was used (marks output as partially phased).
    :param num_loci: Total number of loci in the genotyping catalog.
    :param loci_hash: Hash of the loaded catalog loci (for catalog-sameness checks).
    :return: A fully-populated header, ready for opening a VariantFile for writing.
    """
    header = VariantHeader()  # pysam sets the VCF version to 4.2 automatically

    # File-level provenance meta lines (kept in this order for stable output).
    today = datetime.now()
    header.add_meta("fileDate", f"{today.year}{today.month:02d}{today.day:02d}")
    header.add_meta("source", "strkit")
    if partial_phasing:
        # Only some loci get phased via HP tags / SNVs, hence "partial".
        header.add_meta("phasing", "partial")
    # Absolute file URI pointing at the reference genome used for calling.
    header.add_meta("reference", f"file://{str(Path(reference_file).resolve().absolute())}")

    # Mirror every contig (with its length) from the reference FASTA index.
    with FastaFile(reference_file) as rf:
        for contig_name in rf.references:
            header.contigs.add(contig_name, length=rf.get_reference_length(contig_name))

    # STRkit-specific meta: version + catalog size/hash for sameness checks.
    for meta_key, meta_value in (
        ("strkitVersion", str(__version__)),
        ("strkitCatalogNumLoci", str(num_loci)),
        ("strkitCatalogLociHash", loci_hash),
    ):
        header.add_meta(meta_key, meta_value)

    # FORMAT (per-sample) field definitions: (ID, Number, Type, Description).
    for fmt_field in (
        ("AD", ".", "Integer", "Read depth for each allele"),
        ("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence"),
        ("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)"),
        ("DP", 1, "Integer", "Read depth"),
        ("DPS", 1, "Integer", "Read depth (supporting reads only)"),
        ("GT", 1, "String", "Genotype"),
        ("MC", ".", "Integer", "Motif copy number for each allele"),
        ("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele"),
        ("MCRL", ".", "String", "Read-level motif copy numbers for each allele"),
        ("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads."),
        ("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call"),
        ("PS", 1, "Integer", "Phase set"),
        ("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)"),
    ):
        header.formats.add(*fmt_field)

    # INFO (per-record) field definitions: (ID, Number, Type, Description).
    for info_field in (
        (VCF_INFO_VT, 1, "String", "Variant record type (str/snv)"),
        (VCF_INFO_MOTIF, 1, "String", "Motif string"),
        (VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome"),
        (
            VCF_INFO_BED_START,
            1,
            "Integer",
            "Original start position of the locus as defined in the catalog (0-based inclusive)",
        ),
        (
            VCF_INFO_BED_END,
            1,
            "Integer",
            "Original end position of the locus as defined in the catalog (0-based exclusive, i.e., 1-based)",
        ),
        (VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size"),
    ):
        header.info.add(*info_field)

    # TODO: VCF 4.4 tandem-repeat records (<CNV:TR> ALT + SVLEN/CN/RN/... INFO)
    #  are not emitted yet; see the module-level commented-out definitions.

    # Register the single sample column.
    header.add_sample(sample_id)

    return header


def _vr_pos_key(vr: VariantRecord) -> int:
return vr.pos


@cache
def _blank_entry(n_alleles: int) -> tuple[None, ...]:
return tuple([None] * n_alleles)


class SkipWritingLocus(Exception):
    """Control-flow exception: skip VCF output for one locus (cause is logged by the raiser before raising)."""
    pass


def create_result_vcf_records(
params: CallParams,
variant_file: VariantFile,
Expand All @@ -160,7 +44,7 @@ def create_result_vcf_records(

if "ref_start_anchor" not in result:
logger.debug("No ref anchor for %s:%d; skipping VCF output for locus", contig, start)
raise SkipWritingLocus()
raise vu.record.SkipWritingLocus()

ref_start_anchor = result["ref_start_anchor"].upper()
ref_seq = result["ref_seq"].upper()
Expand All @@ -176,11 +60,11 @@ def create_result_vcf_records(

if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks
logger.error("Encountered None in results[%d].peaks.seqs: %s", result_idx, peak_seqs)
raise SkipWritingLocus()
raise vu.record.SkipWritingLocus()

if any(map(is_none, peak_start_anchor_seqs)): # Occurs when no consensus for one of the peaks
logger.error("Encountered None in results[%d].peaks.start_anchor_seqs: %s", result_idx, peak_start_anchor_seqs)
raise SkipWritingLocus()
raise vu.record.SkipWritingLocus()

peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs))
common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper])
Expand Down Expand Up @@ -235,24 +119,24 @@ def create_result_vcf_records(
alleles=seq_alleles,
)

vr.info[VCF_INFO_VT] = VT_STR
vr.info[VCF_INFO_MOTIF] = result["motif"]
vr.info[VCF_INFO_REFMC] = result["ref_cn"]
vr.info[VCF_INFO_BED_START] = result["start"]
vr.info[VCF_INFO_BED_END] = result["end"]
vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset
vr.info[vu.header.VCF_INFO_VT.key] = vu.header.VT_STR
vr.info[vu.header.VCF_INFO_MOTIF.key] = result["motif"]
vr.info[vu.header.VCF_INFO_REFMC.key] = result["ref_cn"]
vr.info[vu.header.VCF_INFO_BED_START.key] = result["start"]
vr.info[vu.header.VCF_INFO_BED_END.key] = result["end"]
vr.info[vu.header.VCF_INFO_ANCH.key] = params.vcf_anchor_size - anchor_offset

try:
vr.samples[sample_id]["GT"] = (
tuple(map(seq_alleles_raw.index, seqs_with_anchors))
if call is not None and peak_seqs
else _blank_entry(n_alleles)
vr.samples[sample_id]["GT"] = vu.record.genotype_indices(
alleles=seq_alleles_raw,
call=None if call is None or not peak_seqs else seqs_with_anchors,
n_alleles=n_alleles,
)
except ValueError:
logger.error(
"results[%d] (locus_id=%s): one of %s not in %s",
result_idx, locus_id, seqs_with_anchors, seq_alleles_raw)
raise SkipWritingLocus()
result_idx, result["locus_id"], seqs_with_anchors, seq_alleles_raw)
raise vu.record.SkipWritingLocus()

if am := result.get("assign_method"):
vr.samples[sample_id]["PM"] = am
Expand Down Expand Up @@ -329,9 +213,9 @@ def create_result_vcf_records(
alleles=snv_alleles,
)

snv_vr.info[VCF_INFO_VT] = VT_SNV
snv_vr.info[vu.header.VCF_INFO_VT.key] = vu.header.VT_SNV

snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"]))
snv_vr.samples[sample_id]["GT"] = vu.record.genotype_indices(snv_alleles, snv["call"], n_alleles)
snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"])
snv_vr.samples[sample_id]["AD"] = snv["rcs"]

Expand Down Expand Up @@ -369,7 +253,7 @@ def output_contig_vcf_lines(
logger,
)
)
except SkipWritingLocus:
except vu.record.SkipWritingLocus:
pass # just skipping the locus, nothing to do here as we've already logged
except Exception as e: # fallback if we didn't handle a case properly in create_result_vcf_records
logger.exception("Error while writing VCF: unhandled exception at results[%d]", result_idx, exc_info=e)
Expand Down
16 changes: 16 additions & 0 deletions strkit/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,10 @@ def add_call_parser_args(call_parser):
call_parser.add_argument("--profile", action="store_true", help="Profile function calls (for development.)")


def add_merge_parser_args(merge_parser):
    """Register CLI arguments for the `strkit merge` subcommand."""
    merge_parser.add_argument(
        "vcf_files",
        nargs="+",
        type=str,
        help="STRkit-generated VCF files to merge together.",
    )


def add_mi_parser_args(mi_parser):
mi_parser.add_argument(
"--debug",
Expand Down Expand Up @@ -479,6 +483,12 @@ def _exec_call(p_args) -> None:
)


def _exec_merge(p_args) -> None:
    """Entry point for `strkit merge`: merge the given STRkit VCFs."""
    # Local import so this subcommand's dependencies load only when invoked.
    from strkit.merge.merge_vcfs import merge_vcfs
    logger = _main_logger(p_args)
    merge_vcfs(tuple(map(Path, p_args.vcf_files)), logger)


def _exec_mi(p_args) -> None:
from strkit.mi.base import BaseCalculator
from strkit.mi.expansionhunter import ExpansionHunterCalculator
Expand Down Expand Up @@ -688,6 +698,12 @@ def _make_subparser(*names: str, help_text: str, exec_func: Callable, arg_func:
exec_func=_exec_call,
arg_func=add_call_parser_args)

_make_subparser(
"merge",
help_text="A merging tool for STRkit VCF output.",
exec_func=_exec_merge,
arg_func=add_merge_parser_args)

_make_subparser(
"mi",
help_text="A Mendelian inheritance calculator for different TR genotyping callers.",
Expand Down
Empty file added strkit/merge/__init__.py
Empty file.
Loading