From 60e8e0ec0ff2454db7839b443a822b264cfa4c85 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 17 Feb 2026 17:02:49 -0500 Subject: [PATCH 01/16] feat: add STRkit VCF merge subcommand work in progress --- strkit/merge/__init__.py | 0 strkit/merge/merge_vcfs.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 strkit/merge/__init__.py create mode 100644 strkit/merge/merge_vcfs.py diff --git a/strkit/merge/__init__.py b/strkit/merge/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py new file mode 100644 index 0000000..3418e3d --- /dev/null +++ b/strkit/merge/merge_vcfs.py @@ -0,0 +1,35 @@ +from pathlib import Path +from pysam import VariantFile, VariantHeader + +from .. import __version__ + +__all__ = ["merge_vcfs"] + + +def check_headers(hdrs: tuple[VariantHeader, ...]): + # check contigs + # check STRkit versions + # check references + # check info field overlaps + # TODO: return some info to help? + pass + + +def merge_vcfs(paths: tuple[Path, ...], out_file: Path): + fhs: tuple[VariantFile, ...] = tuple(VariantFile(str(p)) for p in paths) + + try: + hdrs = tuple(f.header for f in fhs) + + # Check that headers indicate these files can be merged (they are STRkit headers and are compatible) + check_headers(hdrs) # TODO + + # TODO: for each STR, need to: + # - ensure they're compatible (same ID with same # of loci perhaps?) + # - resolve ref (which can vary between) - probably selecting the longest one and + # pasting any needed prefix (diff. between sample's ref and current ref) onto it. + # TODO: for SNVs, we can only deliver heterozygous calls. need to note this for the user. 
+ + finally: + for fh in fhs: + fh.close() From 8623edc1b9d8b4306deed7d52ccf9b5c4f0563dc Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 18 Feb 2026 17:17:29 -0500 Subject: [PATCH 02/16] add merge subcommand --- strkit/entry.py | 15 +++++++++++++++ strkit/merge/merge_vcfs.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/strkit/entry.py b/strkit/entry.py index e018458..cbfa338 100644 --- a/strkit/entry.py +++ b/strkit/entry.py @@ -272,6 +272,10 @@ def add_call_parser_args(call_parser): call_parser.add_argument("--profile", action="store_true", help="Profile function calls (for development.)") +def add_merge_parser_args(merge_parser): + merge_parser.add_argument("vcf_files", type=str, nargs="+", help="STRkit-generated VCF files to merge together.") + + def add_mi_parser_args(mi_parser): mi_parser.add_argument( "--debug", @@ -479,6 +483,11 @@ def _exec_call(p_args) -> None: ) +def _exec_merge(p_args) -> None: + from strkit.merge.merge_vcfs import merge_vcfs + merge_vcfs(tuple(Path(p) for p in p_args.vcf_files)) + + def _exec_mi(p_args) -> None: from strkit.mi.base import BaseCalculator from strkit.mi.expansionhunter import ExpansionHunterCalculator @@ -688,6 +697,12 @@ def _make_subparser(*names: str, help_text: str, exec_func: Callable, arg_func: exec_func=_exec_call, arg_func=add_call_parser_args) + _make_subparser( + "merge", + help_text="A merging tool for STRkit VCF output.", + exec_func=_exec_merge, + arg_func=add_merge_parser_args) + _make_subparser( "mi", help_text="A Mendelian inheritance calculator for different TR genotyping callers.", diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index 3418e3d..110bd48 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -15,7 +15,7 @@ def check_headers(hdrs: tuple[VariantHeader, ...]): pass -def merge_vcfs(paths: tuple[Path, ...], out_file: Path): +def merge_vcfs(paths: tuple[Path, ...]): fhs: tuple[VariantFile, ...] 
= tuple(VariantFile(str(p)) for p in paths) try: From 0aafe1c416230436dab03dfeb195736fc4d9cdd5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 18 Feb 2026 17:17:41 -0500 Subject: [PATCH 03/16] work on first steps of vcf merging --- strkit/merge/merge_vcfs.py | 106 ++++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 6 deletions(-) diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index 110bd48..cdab2e5 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -1,28 +1,122 @@ +from __future__ import annotations + from pathlib import Path from pysam import VariantFile, VariantHeader +from typing import TYPE_CHECKING from .. import __version__ +from ..logger import get_main_logger + +if TYPE_CHECKING: + from logging import Logger __all__ = ["merge_vcfs"] -def check_headers(hdrs: tuple[VariantHeader, ...]): - # check contigs - # check STRkit versions +def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): + header_contigs: set[tuple[str, ...]] = set() + strkit_versions: set[str] = set() + strkit_catalog_hashes: set[str] = set() + + for idx, hdr in enumerate(headers, 1): + header_contigs.add(tuple(sorted(hdr.contigs))) + strkit_version_found = False + strkit_catalog_hash_found = False + for line in hdr.records: + if line.key == "strkitVersion": + strkit_versions.add(line.value) + strkit_version_found = True + elif line.key == "strkitCatalogLociHash": + strkit_catalog_hashes.add(line.value) + strkit_catalog_hash_found = True + if not strkit_version_found: + logger.critical("did not find strkitVersion in VCF #%d", idx) + exit(1) + if not strkit_catalog_hash_found: + logger.critical("did not find strkitCatalogLociHash in VCF #%d", idx) + exit(1) + # check references # check info field overlaps + + passed_validation = True + + # check contigs + if len(header_contigs) > 1: + logger.warning("more than one header contig set found - are all VCFs using the same reference genome?") + 
passed_validation = False + # check STRkit versions + if len(strkit_versions) > 1: + logger.warning( + "more than one STRkit version found (%s) - output may be different across versions", + ", ".join(sorted(strkit_versions)), + ) + passed_validation = False + # check catalog hashes + if len(strkit_catalog_hashes) > 1: + logger.warning( + "more than one catalog hash found (%s) - are all locus catalogs the same? merging may not work with " + "different catalogs" + ) + passed_validation = False + + if passed_validation: + logger.info("VCF headers passed validation") + else: + logger.warning("VCF headers failed validation") + # TODO: return some info to help? - pass def merge_vcfs(paths: tuple[Path, ...]): + logger = get_main_logger() # TODO: inject + fhs: tuple[VariantFile, ...] = tuple(VariantFile(str(p)) for p in paths) try: - hdrs = tuple(f.header for f in fhs) + headers = tuple(f.header for f in fhs) # Check that headers indicate these files can be merged (they are STRkit headers and are compatible) - check_headers(hdrs) # TODO + check_headers(headers, logger) # TODO + + # -------------------------------------------------------------------------------------------------------------- + + # Try merging headers + merged_header = headers[0] + for hdr in headers[1:]: + merged_header = merged_header.merge(hdr) + + print(merged_header, end="") + + # -------------------------------------------------------------------------------------------------------------- + + iters = tuple(f.fetch() for f in fhs) + ptrs = [next(i, None) for i in iters] + + while any(p is not None for p in ptrs): + # find compatible records for merging based on ID (requires STRkit to output variant IDs for everything) + if len(set(p.id for p in ptrs)) == 1: + contig = ptrs[0].contig + + refs = tuple(p.ref for p in ptrs) + + if len(set(refs)) > 1: + print("TODO ref consolidation!") + ref = refs[0] + # TODO: if this happens, we need to also + else: + ref = refs[0] + + print(refs) + + # all same locus, can 
merge + rec = merged_header.new_record(contig, alleles=(ref,)) + pass + ptrs = [next(i, None) for i in iters] + else: + # we have a record which doesn't apply to all samples + print("TODO some samples!") + # TODO: selectively advance # TODO: for each STR, need to: # - ensure they're compatible (same ID with same # of loci perhaps?) From 3091ce5f29cb70cf8c3b403d7da897dbd7073b63 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 18 Feb 2026 17:30:14 -0500 Subject: [PATCH 04/16] docs: wip section for vcf merge --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index dbecdd8..304c155 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ The code for the software itself is also available and citeable in Zenodo: https * [Usage](#usage) * [Further documentation on the STRkit caller, including output format](#further-documentation-on-the-strkit-caller-including-output-format) * [`strkit visualize`: Call visualizer](#strkit-visualize-call-visualizer) + * [`strkit merge`: VCF merging tool](#strkit-merge-vcf-merging-tool) * [`strkit mi`: Mendelian inheritance analysis](#strkit-mi-mendelian-inheritance-analysis) * [Usage](#usage-1) * [Further documentation](#further-documentation) @@ -296,6 +297,12 @@ the start-up instructions. 
+### `strkit merge`: VCF merging tool + +TODO + + + ### `strkit mi`: Mendelian inheritance analysis Using trio data, candidate de novo STR mutations (or genotyping errors/dropout rates) can be discovered From 3a188f02b954f17d6c7216f191f3499f892f53c5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 19 Feb 2026 10:22:05 -0500 Subject: [PATCH 05/16] merge work --- strkit/merge/merge_vcfs.py | 91 +++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 21 deletions(-) diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index cdab2e5..5ec9a29 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -1,14 +1,15 @@ from __future__ import annotations from pathlib import Path -from pysam import VariantFile, VariantHeader -from typing import TYPE_CHECKING +from pysam import VariantFile +from typing import TYPE_CHECKING, Callable -from .. import __version__ +from ..call.output import vcf from ..logger import get_main_logger if TYPE_CHECKING: from logging import Logger + from pysam import VariantHeader, VariantRecord __all__ = ["merge_vcfs"] @@ -68,6 +69,68 @@ def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): # TODO: return some info to help? 
+def merge_str_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: + contig = records[0].contig + + refs = tuple(p.ref for p in records) + + common_info_keys = ( + vcf.VCF_INFO_VT, + vcf.VCF_INFO_MOTIF, + vcf.VCF_INFO_REFMC, + vcf.VCF_INFO_BED_START, + vcf.VCF_INFO_BED_END, + ) + + infos = set(tuple(p.info[k] for k in common_info_keys) for p in records) + + if len(infos) > 1: + logger.warning("merge_str_records encountered more than one set of INFO field values") + # TODO: more reporting + + if len(set(refs)) > 1: + print("TODO ref consolidation!") + record_max_anchor = max(records, key=lambda p: p.info["ANCH"]) + max_anchor = record_max_anchor.info["ANCH"] + ref_max_anchor = record_max_anchor.ref + start = record_max_anchor.start + added_prefix_lengths = tuple(max_anchor - p.info["ANCH"] for p in records) + + # TODO: also consider if there's extra on the end or something? + ref = ref_max_anchor + # TODO: this misses SNVs that are in the VCF anchor. Maybe STRkit should genotype these. 
+ alts = sorted(set(ref_max_anchor[:added_prefix_lengths[i]] + a for i, p in enumerate(records) for a in p.alts)) + else: + ref = refs[0] + alts = sorted(set(a for p in records for a in (p.alts or ())), key=lambda a: (len(a), a)) + start = records[0].start + + # TODO: merge info/format fields + + # all same locus, can merge + alleles = (ref, *(alts or (".",))) + print(alleles) + rec = header.new_record(contig, start=start, alleles=alleles, id=records[0].id) + + common_info = next(iter(infos)) + for ki, k in enumerate(common_info_keys): + rec.info[k] = common_info[ki] + + return rec + + +def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: + rec = header.new_record() # TODO + print("TODO: SNV MERGE") + return rec + + +MERGE_FUNCTION: dict[str, Callable[[VariantHeader, list[VariantRecord], Logger], VariantRecord]] = { + vcf.VT_STR: merge_str_records, + vcf.VT_SNV: merge_snv_records, +} + + def merge_vcfs(paths: tuple[Path, ...]): logger = get_main_logger() # TODO: inject @@ -95,26 +158,12 @@ def merge_vcfs(paths: tuple[Path, ...]): while any(p is not None for p in ptrs): # find compatible records for merging based on ID (requires STRkit to output variant IDs for everything) - if len(set(p.id for p in ptrs)) == 1: - contig = ptrs[0].contig - - refs = tuple(p.ref for p in ptrs) - - if len(set(refs)) > 1: - print("TODO ref consolidation!") - ref = refs[0] - # TODO: if this happens, we need to also - else: - ref = refs[0] - - print(refs) - - # all same locus, can merge - rec = merged_header.new_record(contig, alleles=(ref,)) - pass + if all(p is not None for p in ptrs) and len(set(p.id for p in ptrs)) == 1: + print(MERGE_FUNCTION[ptrs[0].info["VT"]](merged_header, ptrs, logger), end="") ptrs = [next(i, None) for i in iters] else: - # we have a record which doesn't apply to all samples + # we have a record which doesn't apply to all samples - either some VCFs are finished, or the pointers + # are at different points 
in the file. print("TODO some samples!") # TODO: selectively advance From e6aad2e2892a3cadedcc1b44ba42a2d3b1a0e3f5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 19 Feb 2026 11:38:50 -0500 Subject: [PATCH 06/16] work on snv merging --- strkit/merge/merge_vcfs.py | 64 ++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index 5ec9a29..431d771 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -14,6 +14,18 @@ __all__ = ["merge_vcfs"] +COMMON_STR_INFO_KEYS = ( + vcf.VCF_INFO_VT, + vcf.VCF_INFO_MOTIF, + vcf.VCF_INFO_REFMC, + vcf.VCF_INFO_BED_START, + vcf.VCF_INFO_BED_END, +) +COMMON_SNV_INFO_KEYS = ( + vcf.VCF_INFO_VT, +) + + def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): header_contigs: set[tuple[str, ...]] = set() strkit_versions: set[str] = set() @@ -73,19 +85,13 @@ def merge_str_records(header: VariantHeader, records: list[VariantRecord], logge contig = records[0].contig refs = tuple(p.ref for p in records) + if len(set(refs)) > 1: + logger.debug("merge_str_records: encountered >1 STR reference sequence") - common_info_keys = ( - vcf.VCF_INFO_VT, - vcf.VCF_INFO_MOTIF, - vcf.VCF_INFO_REFMC, - vcf.VCF_INFO_BED_START, - vcf.VCF_INFO_BED_END, - ) - - infos = set(tuple(p.info[k] for k in common_info_keys) for p in records) + infos = set(tuple(p.info[k] for k in COMMON_STR_INFO_KEYS) for p in records) if len(infos) > 1: - logger.warning("merge_str_records encountered more than one set of INFO field values") + logger.warning("merge_str_records: encountered >1 set of INFO field values for STR") # TODO: more reporting if len(set(refs)) > 1: @@ -99,10 +105,11 @@ def merge_str_records(header: VariantHeader, records: list[VariantRecord], logge # TODO: also consider if there's extra on the end or something? ref = ref_max_anchor # TODO: this misses SNVs that are in the VCF anchor. Maybe STRkit should genotype these. 
+ # TODO: filter out ref allele alts = sorted(set(ref_max_anchor[:added_prefix_lengths[i]] + a for i, p in enumerate(records) for a in p.alts)) else: ref = refs[0] - alts = sorted(set(a for p in records for a in (p.alts or ())), key=lambda a: (len(a), a)) + alts = sorted(set(a for p in records for a in (p.alts or ()) if a != ref), key=lambda a: (len(a), a)) start = records[0].start # TODO: merge info/format fields @@ -112,16 +119,47 @@ def merge_str_records(header: VariantHeader, records: list[VariantRecord], logge print(alleles) rec = header.new_record(contig, start=start, alleles=alleles, id=records[0].id) + # -- add INFO (common across samples) to merged STR record --------------------------------------------------------- common_info = next(iter(infos)) - for ki, k in enumerate(common_info_keys): + for ki, k in enumerate(COMMON_STR_INFO_KEYS): rec.info[k] = common_info[ki] + # ------------------------------------------------------------------------------------------------------------------ + + # Done common stuff, now need to add/transform sample data + for sample in header.samples: + pass return rec def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: - rec = header.new_record() # TODO print("TODO: SNV MERGE") + print([str(r) for r in records]) + + # confirm we have a single base ref + refs = tuple(p.ref for p in records) + if len(ref_set := set(refs)) > 1: + logger.critical("merge_snv_records: encountered >1 SNP reference sequence (%s)", ref_set) + exit(1) + ref = refs[0] + + infos = set(tuple(p.info[k] for k in COMMON_SNV_INFO_KEYS) for p in records) + + if len(infos) > 1: + logger.warning("merge_str_records: encountered >1 set of INFO field values for STR") + + alts = sorted(set(a for p in records for a in (p.alts or ()) if a != ref)) + alleles = (ref, *alts) + rec = header.new_record(contig=records[0].contig, start=records[0].start, alleles=alleles) # TODO + + common_info = next(iter(infos)) + for ki, 
k in enumerate(COMMON_SNV_INFO_KEYS): + rec.info[k] = common_info[ki] + + # Done common stuff, now need to add/transform sample data + for sample in header.samples: + pass + return rec From e0bc4cbf536eff74fea4701ed87d649f19fb704b Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 08:24:01 -0500 Subject: [PATCH 07/16] fix(call): error within error handling in vcf writing --- strkit/call/output/vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 152a41c..adab8fa 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -251,7 +251,7 @@ def create_result_vcf_records( except ValueError: logger.error( "results[%d] (locus_id=%s): one of %s not in %s", - result_idx, locus_id, seqs_with_anchors, seq_alleles_raw) + result_idx, result["locus_id"], seqs_with_anchors, seq_alleles_raw) raise SkipWritingLocus() if am := result.get("assign_method"): From 399ad115bad340372576243e2eb13c01fbc785fb Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 08:33:24 -0500 Subject: [PATCH 08/16] refact: move some VCF writing functions to common utils file --- strkit/call/output/vcf.py | 21 ++++++--------------- strkit/vcf_utils.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 15 deletions(-) create mode 100644 strkit/vcf_utils.py diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index adab8fa..3ed4a6f 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -2,7 +2,6 @@ from collections import Counter from datetime import datetime -from functools import cache from os.path import commonprefix from pathlib import Path from pysam import FastaFile, VariantHeader @@ -10,6 +9,7 @@ from strkit import __version__ from strkit.utils import is_none, idx_0_getter +from strkit.vcf_utils import SkipWritingLocus, genotype_indices from ..utils import cn_getter if TYPE_CHECKING: @@ -135,15 +135,6 @@ def _vr_pos_key(vr: 
VariantRecord) -> int: return vr.pos -@cache -def _blank_entry(n_alleles: int) -> tuple[None, ...]: - return tuple([None] * n_alleles) - - -class SkipWritingLocus(Exception): - pass - - def create_result_vcf_records( params: CallParams, variant_file: VariantFile, @@ -243,10 +234,10 @@ def create_result_vcf_records( vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset try: - vr.samples[sample_id]["GT"] = ( - tuple(map(seq_alleles_raw.index, seqs_with_anchors)) - if call is not None and peak_seqs - else _blank_entry(n_alleles) + vr.samples[sample_id]["GT"] = genotype_indices( + alleles=seq_alleles_raw, + call=None if call is None or not peak_seqs else seqs_with_anchors, + n_alleles=n_alleles, ) except ValueError: logger.error( @@ -331,7 +322,7 @@ def create_result_vcf_records( snv_vr.info[VCF_INFO_VT] = VT_SNV - snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"])) + snv_vr.samples[sample_id]["GT"] = genotype_indices(snv_alleles, snv["call"], n_alleles) snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) snv_vr.samples[sample_id]["AD"] = snv["rcs"] diff --git a/strkit/vcf_utils.py b/strkit/vcf_utils.py new file mode 100644 index 0000000..90590a2 --- /dev/null +++ b/strkit/vcf_utils.py @@ -0,0 +1,30 @@ +__all__ = [ + "SkipWritingLocus", + "blank_vcf_genotype", + "genotype_indices", +] + + +class SkipWritingLocus(Exception): + pass + + +def blank_vcf_genotype(n_alleles: int) -> tuple[None, ...]: + """ + Generates a blank VCF genotype for a sample locus. + :param n_alleles: Number of alleles to use for the blank genotype. + :return: Tuple of [None] * + """ + return tuple([None] * n_alleles) + + +def genotype_indices(alleles: tuple[str, ...], call: tuple[str, ...] | None, n_alleles: int) -> tuple[int | None, ...]: + """ + Given possible variant alleles for a particular locus and a sample's call (the sample's alleles), plus the number of + possible alleles for the sample at the locus, return a VCF-compatible genotype index tuple. 
+ :param alleles: Possible variant alleles: (ref, alt1, alt2, alt3, ...) + :param call: Call for the sample: (alt3, alt1) or None if not called. + :param n_alleles: Number of alleles for the sample locus (used for computing blank genotypes if not called.) + :return: Genotype index tuple, e.g., (3, 1). + """ + return tuple(map(alleles.index, call)) if call is not None else blank_vcf_genotype(n_alleles) From 0e1a9a29c67faf9e0093551e541b6518661f193c Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 14:02:32 -0500 Subject: [PATCH 09/16] refact: move more VCF-writing utils/consts into vcf_utils --- strkit/call/output/vcf.py | 65 +++++++++++++-------------------------- strkit/vcf_utils.py | 57 ++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 43 deletions(-) diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 3ed4a6f..c101272 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -1,15 +1,13 @@ from __future__ import annotations from collections import Counter -from datetime import datetime from os.path import commonprefix from pathlib import Path from pysam import FastaFile, VariantHeader from typing import Iterable, TYPE_CHECKING -from strkit import __version__ +from strkit import __version__, vcf_utils as vu from strkit.utils import is_none, idx_0_getter -from strkit.vcf_utils import SkipWritingLocus, genotype_indices from ..utils import cn_getter if TYPE_CHECKING: @@ -37,13 +35,6 @@ # ("CIRB", ".", "Integer", "Confidence interval around RB"), # ) -VCF_INFO_VT = "VT" -VCF_INFO_MOTIF = "MOTIF" -VCF_INFO_REFMC = "REFMC" -VCF_INFO_BED_START = "BED_START" -VCF_INFO_BED_END = "BED_END" -VCF_INFO_ANCH = "ANCH" - VT_STR = "str" VT_SNV = "snv" @@ -59,8 +50,7 @@ def build_vcf_header( vh = VariantHeader() # automatically sets VCF version to 4.2 # Add file date - now = datetime.now() - vh.add_meta("fileDate", f"{now.year}{now.month:02d}{now.day:02d}") + vu.add_vcf_file_date(vh) # Add source 
vh.add_meta("source", "strkit") @@ -103,23 +93,12 @@ def build_vcf_header( vh.formats.add("PS", 1, "Integer", "Phase set") vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)") - # Set up VCF info fields - vh.info.add(VCF_INFO_VT, 1, "String", "Variant record type (str/snv)") - vh.info.add(VCF_INFO_MOTIF, 1, "String", "Motif string") - vh.info.add(VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome") - vh.info.add( - VCF_INFO_BED_START, - 1, - "Integer", - "Original start position of the locus as defined in the catalog (0-based inclusive)", - ) - vh.info.add( - VCF_INFO_BED_END, - 1, - "Integer", - "Original end position of the locus as defined in the catalog (0-based exclusive, i.e., 1-based)", - ) - vh.info.add(VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size") + # Set up VCF info fields (mutates vh.info) + vu.VCF_INFO_VT.add_to_info(vh.info) + vu.VCF_INFO_MOTIF.add_to_info(vh.info) + vu.VCF_INFO_REFMC.add_to_info(vh.info) + vu.VCF_INFO_BED_START.add_to_info(vh.info) + vu.VCF_INFO_BED_END.add_to_info(vh.info) # Add INFO records for tandem repeat copies - these are new to VCF4.4! 
TODO # for iv in VCF_TR_INFO_RECORDS: @@ -151,7 +130,7 @@ def create_result_vcf_records( if "ref_start_anchor" not in result: logger.debug("No ref anchor for %s:%d; skipping VCF output for locus", contig, start) - raise SkipWritingLocus() + raise vu.SkipWritingLocus() ref_start_anchor = result["ref_start_anchor"].upper() ref_seq = result["ref_seq"].upper() @@ -167,11 +146,11 @@ def create_result_vcf_records( if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks logger.error("Encountered None in results[%d].peaks.seqs: %s", result_idx, peak_seqs) - raise SkipWritingLocus() + raise vu.SkipWritingLocus() if any(map(is_none, peak_start_anchor_seqs)): # Occurs when no consensus for one of the peaks logger.error("Encountered None in results[%d].peaks.start_anchor_seqs: %s", result_idx, peak_start_anchor_seqs) - raise SkipWritingLocus() + raise vu.SkipWritingLocus() peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs)) common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper]) @@ -226,15 +205,15 @@ def create_result_vcf_records( alleles=seq_alleles, ) - vr.info[VCF_INFO_VT] = VT_STR - vr.info[VCF_INFO_MOTIF] = result["motif"] - vr.info[VCF_INFO_REFMC] = result["ref_cn"] - vr.info[VCF_INFO_BED_START] = result["start"] - vr.info[VCF_INFO_BED_END] = result["end"] - vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset + vr.info[vu.VCF_INFO_VT.key] = VT_STR + vr.info[vu.VCF_INFO_MOTIF.key] = result["motif"] + vr.info[vu.VCF_INFO_REFMC.key] = result["ref_cn"] + vr.info[vu.VCF_INFO_BED_START.key] = result["start"] + vr.info[vu.VCF_INFO_BED_END.key] = result["end"] + vr.info[vu.VCF_INFO_ANCH.key] = params.vcf_anchor_size - anchor_offset try: - vr.samples[sample_id]["GT"] = genotype_indices( + vr.samples[sample_id]["GT"] = vu.genotype_indices( alleles=seq_alleles_raw, call=None if call is None or not peak_seqs else seqs_with_anchors, n_alleles=n_alleles, @@ -243,7 +222,7 @@ def 
create_result_vcf_records( logger.error( "results[%d] (locus_id=%s): one of %s not in %s", result_idx, result["locus_id"], seqs_with_anchors, seq_alleles_raw) - raise SkipWritingLocus() + raise vu.SkipWritingLocus() if am := result.get("assign_method"): vr.samples[sample_id]["PM"] = am @@ -320,9 +299,9 @@ def create_result_vcf_records( alleles=snv_alleles, ) - snv_vr.info[VCF_INFO_VT] = VT_SNV + snv_vr.info[vu.VCF_INFO_VT.key] = VT_SNV - snv_vr.samples[sample_id]["GT"] = genotype_indices(snv_alleles, snv["call"], n_alleles) + snv_vr.samples[sample_id]["GT"] = vu.genotype_indices(snv_alleles, snv["call"], n_alleles) snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) snv_vr.samples[sample_id]["AD"] = snv["rcs"] @@ -360,7 +339,7 @@ def output_contig_vcf_lines( logger, ) ) - except SkipWritingLocus: + except vu.SkipWritingLocus: pass # just skipping the locus, nothing to do here as we've already logged except Exception as e: # fallback if we didn't handle a case properly in create_result_vcf_records logger.exception("Error while writing VCF: unhandled exception at results[%d]", result_idx, exc_info=e) diff --git a/strkit/vcf_utils.py b/strkit/vcf_utils.py index 90590a2..5df8a3f 100644 --- a/strkit/vcf_utils.py +++ b/strkit/vcf_utils.py @@ -1,14 +1,71 @@ +from dataclasses import dataclass +from datetime import datetime +from pysam import VariantHeader, VariantHeaderMetadata + __all__ = [ + "VCF_INFO_VT", + "VCF_INFO_MOTIF", + "VCF_INFO_REFMC", + "VCF_INFO_BED_START", + "VCF_INFO_BED_END", + "VCF_INFO_ANCH", + "VT_STR", + "VT_SNV", "SkipWritingLocus", "blank_vcf_genotype", "genotype_indices", ] +@dataclass +class VcfInfo: + key: str + n: int | str # number of values in record row + type: str # VCF type + description: str + + def add_to_info(self, info: VariantHeaderMetadata): + """ + Add info field to variant file header metadata via mutation. + :param info: VariantHeaderMetadata instance to add the record to. This object is mutated. 
+ """ + info.add(self.key, self.n, self.type, self.description) + + +VCF_INFO_VT = VcfInfo("VT", 1, "String", "Variant record type (str/snv)") +VCF_INFO_MOTIF = VcfInfo("MOTIF", 1, "String", "Motif string") +VCF_INFO_REFMC = VcfInfo("REFMC", 1, "Integer", "Motif copy number in the reference genome") +VCF_INFO_BED_START = VcfInfo( + "BED_START", + 1, + "Integer", + "Original start position of the locus as defined in the catalog (0-based inclusive)", +) +VCF_INFO_BED_END = VcfInfo( + "BED_END", + 1, + "Integer", + "Original end position of the locus as defined in the catalog (0-based exclusive, i.e., 1-based)", +) +VCF_INFO_ANCH = VcfInfo("ANCH", 1, "Integer", "Five-prime anchor size") + +VT_STR = "str" +VT_SNV = "snv" + + class SkipWritingLocus(Exception): pass +def add_vcf_file_date(vh: VariantHeader): + """ + Adds a fileDate header metadata line to a variant header via mutation. + :param vh: The variant header to add to. + """ + now = datetime.now() + vh.add_meta("fileDate", f"{now.year}{now.month:02d}{now.day:02d}") + + def blank_vcf_genotype(n_alleles: int) -> tuple[None, ...]: """ Generates a blank VCF genotype for a sample locus. 
From d24508b0334a386a7e32826c6ed4f2590a1d8dd3 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 14:12:33 -0500 Subject: [PATCH 10/16] refact: accept multiple samples for the build_vcf_header helper --- strkit/call/call_sample.py | 2 +- strkit/call/output/vcf.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py index d157c12..cdfa183 100644 --- a/strkit/call/call_sample.py +++ b/strkit/call/call_sample.py @@ -312,7 +312,7 @@ def call_sample( vf: VariantFile | None = None if vcf_path is not None: vh = build_vcf_header( - sample_id_str, + (sample_id_str,), params.reference_file, partial_phasing=params.snv_vcf is not None or params.use_hp, num_loci=num_loci, diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index c101272..6d93d54 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -45,7 +45,7 @@ def iter_to_upper(x: Iterable[str]) -> Iterable[str]: def build_vcf_header( - sample_id: str, reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str + sample_ids: tuple[str, ...], reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str ) -> VariantHeader: vh = VariantHeader() # automatically sets VCF version to 4.2 @@ -104,8 +104,9 @@ def build_vcf_header( # for iv in VCF_TR_INFO_RECORDS: # vh.info.add(*iv) - # Add the sample - vh.add_sample(sample_id) + # Add the sample(s) + for sample_id in sample_ids: + vh.add_sample(sample_id) return vh From f46981c3fe7211ac5762e695e063fae6c605d8c2 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 14:21:34 -0500 Subject: [PATCH 11/16] refact: multi-file vcf_utils module --- strkit/call/output/vcf.py | 40 ++++++++++---------- strkit/vcf_utils/__init__.py | 3 ++ strkit/{vcf_utils.py => vcf_utils/header.py} | 28 -------------- strkit/vcf_utils/record.py | 23 +++++++++++ 4 files changed, 46 insertions(+), 48 deletions(-) create mode 100644 
strkit/vcf_utils/__init__.py rename strkit/{vcf_utils.py => vcf_utils/header.py} (58%) create mode 100644 strkit/vcf_utils/record.py diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 6d93d54..45b3f90 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -50,7 +50,7 @@ def build_vcf_header( vh = VariantHeader() # automatically sets VCF version to 4.2 # Add file date - vu.add_vcf_file_date(vh) + vu.header.add_vcf_file_date(vh) # Add source vh.add_meta("source", "strkit") @@ -94,11 +94,11 @@ def build_vcf_header( vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)") # Set up VCF info fields (mutates vh.info) - vu.VCF_INFO_VT.add_to_info(vh.info) - vu.VCF_INFO_MOTIF.add_to_info(vh.info) - vu.VCF_INFO_REFMC.add_to_info(vh.info) - vu.VCF_INFO_BED_START.add_to_info(vh.info) - vu.VCF_INFO_BED_END.add_to_info(vh.info) + vu.header.VCF_INFO_VT.add_to_info(vh.info) + vu.header.VCF_INFO_MOTIF.add_to_info(vh.info) + vu.header.VCF_INFO_REFMC.add_to_info(vh.info) + vu.header.VCF_INFO_BED_START.add_to_info(vh.info) + vu.header.VCF_INFO_BED_END.add_to_info(vh.info) # Add INFO records for tandem repeat copies - these are new to VCF4.4! 
TODO # for iv in VCF_TR_INFO_RECORDS: @@ -131,7 +131,7 @@ def create_result_vcf_records( if "ref_start_anchor" not in result: logger.debug("No ref anchor for %s:%d; skipping VCF output for locus", contig, start) - raise vu.SkipWritingLocus() + raise vu.record.SkipWritingLocus() ref_start_anchor = result["ref_start_anchor"].upper() ref_seq = result["ref_seq"].upper() @@ -147,11 +147,11 @@ def create_result_vcf_records( if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks logger.error("Encountered None in results[%d].peaks.seqs: %s", result_idx, peak_seqs) - raise vu.SkipWritingLocus() + raise vu.record.SkipWritingLocus() if any(map(is_none, peak_start_anchor_seqs)): # Occurs when no consensus for one of the peaks logger.error("Encountered None in results[%d].peaks.start_anchor_seqs: %s", result_idx, peak_start_anchor_seqs) - raise vu.SkipWritingLocus() + raise vu.record.SkipWritingLocus() peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs)) common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper]) @@ -206,15 +206,15 @@ def create_result_vcf_records( alleles=seq_alleles, ) - vr.info[vu.VCF_INFO_VT.key] = VT_STR - vr.info[vu.VCF_INFO_MOTIF.key] = result["motif"] - vr.info[vu.VCF_INFO_REFMC.key] = result["ref_cn"] - vr.info[vu.VCF_INFO_BED_START.key] = result["start"] - vr.info[vu.VCF_INFO_BED_END.key] = result["end"] - vr.info[vu.VCF_INFO_ANCH.key] = params.vcf_anchor_size - anchor_offset + vr.info[vu.header.VCF_INFO_VT.key] = VT_STR + vr.info[vu.header.VCF_INFO_MOTIF.key] = result["motif"] + vr.info[vu.header.VCF_INFO_REFMC.key] = result["ref_cn"] + vr.info[vu.header.VCF_INFO_BED_START.key] = result["start"] + vr.info[vu.header.VCF_INFO_BED_END.key] = result["end"] + vr.info[vu.header.VCF_INFO_ANCH.key] = params.vcf_anchor_size - anchor_offset try: - vr.samples[sample_id]["GT"] = vu.genotype_indices( + vr.samples[sample_id]["GT"] = vu.record.genotype_indices( alleles=seq_alleles_raw, 
call=None if call is None or not peak_seqs else seqs_with_anchors, n_alleles=n_alleles, @@ -223,7 +223,7 @@ def create_result_vcf_records( logger.error( "results[%d] (locus_id=%s): one of %s not in %s", result_idx, result["locus_id"], seqs_with_anchors, seq_alleles_raw) - raise vu.SkipWritingLocus() + raise vu.record.SkipWritingLocus() if am := result.get("assign_method"): vr.samples[sample_id]["PM"] = am @@ -300,9 +300,9 @@ def create_result_vcf_records( alleles=snv_alleles, ) - snv_vr.info[vu.VCF_INFO_VT.key] = VT_SNV + snv_vr.info[vu.header.VCF_INFO_VT.key] = VT_SNV - snv_vr.samples[sample_id]["GT"] = vu.genotype_indices(snv_alleles, snv["call"], n_alleles) + snv_vr.samples[sample_id]["GT"] = vu.record.genotype_indices(snv_alleles, snv["call"], n_alleles) snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) snv_vr.samples[sample_id]["AD"] = snv["rcs"] @@ -340,7 +340,7 @@ def output_contig_vcf_lines( logger, ) ) - except vu.SkipWritingLocus: + except vu.record.SkipWritingLocus: pass # just skipping the locus, nothing to do here as we've already logged except Exception as e: # fallback if we didn't handle a case properly in create_result_vcf_records logger.exception("Error while writing VCF: unhandled exception at results[%d]", result_idx, exc_info=e) diff --git a/strkit/vcf_utils/__init__.py b/strkit/vcf_utils/__init__.py new file mode 100644 index 0000000..aaa6cc6 --- /dev/null +++ b/strkit/vcf_utils/__init__.py @@ -0,0 +1,3 @@ +__all__ = ["header", "record"] + +from . 
import header, record diff --git a/strkit/vcf_utils.py b/strkit/vcf_utils/header.py similarity index 58% rename from strkit/vcf_utils.py rename to strkit/vcf_utils/header.py index 5df8a3f..79d635c 100644 --- a/strkit/vcf_utils.py +++ b/strkit/vcf_utils/header.py @@ -11,9 +11,6 @@ "VCF_INFO_ANCH", "VT_STR", "VT_SNV", - "SkipWritingLocus", - "blank_vcf_genotype", - "genotype_indices", ] @@ -53,10 +50,6 @@ def add_to_info(self, info: VariantHeaderMetadata): VT_SNV = "snv" -class SkipWritingLocus(Exception): - pass - - def add_vcf_file_date(vh: VariantHeader): """ Adds a fileDate header metadata line to a variant header via mutation. @@ -64,24 +57,3 @@ def add_vcf_file_date(vh: VariantHeader): """ now = datetime.now() vh.add_meta("fileDate", f"{now.year}{now.month:02d}{now.day:02d}") - - -def blank_vcf_genotype(n_alleles: int) -> tuple[None, ...]: - """ - Generates a blank VCF genotype for a sample locus. - :param n_alleles: Number of alleles to use for the blank genotype. - :return: Tuple of [None] * - """ - return tuple([None] * n_alleles) - - -def genotype_indices(alleles: tuple[str, ...], call: tuple[str, ...] | None, n_alleles: int) -> tuple[int | None, ...]: - """ - Given possible variant alleles for a particular locus and a sample's call (the sample's alleles), plus the number of - possible alleles for the sample at the locus, return a VCF-compatible genotype index tuple. - :param alleles: Possible variant alleles: (ref, alt1, alt2, alt3, ...) - :param call: Call for the sample: (alt3, alt1) or None if not called. - :param n_alleles: Number of alleles for the sample locus (used for computing blank genotypes if not called.) - :return: Genotype index tuple, e.g., (3, 1). 
- """ - return tuple(map(alleles.index, call)) if call is not None else blank_vcf_genotype(n_alleles) diff --git a/strkit/vcf_utils/record.py b/strkit/vcf_utils/record.py new file mode 100644 index 0000000..39e3df5 --- /dev/null +++ b/strkit/vcf_utils/record.py @@ -0,0 +1,23 @@ +class SkipWritingLocus(Exception): + pass + + +def blank_vcf_genotype(n_alleles: int) -> tuple[None, ...]: + """ + Generates a blank VCF genotype for a sample locus. + :param n_alleles: Number of alleles to use for the blank genotype. + :return: Tuple of [None] * + """ + return tuple([None] * n_alleles) + + +def genotype_indices(alleles: tuple[str, ...], call: tuple[str, ...] | None, n_alleles: int) -> tuple[int | None, ...]: + """ + Given possible variant alleles for a particular locus and a sample's call (the sample's alleles), plus the number of + possible alleles for the sample at the locus, return a VCF-compatible genotype index tuple. + :param alleles: Possible variant alleles: (ref, alt1, alt2, alt3, ...) + :param call: Call for the sample: (alt3, alt1) or None if not called. + :param n_alleles: Number of alleles for the sample locus (used for computing blank genotypes if not called.) + :return: Genotype index tuple, e.g., (3, 1). 
+ """ + return tuple(map(alleles.index, call)) if call is not None else blank_vcf_genotype(n_alleles) From 3b8cc486de0bf1f48e6353d74bb7391e0229cc78 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 19:39:04 -0500 Subject: [PATCH 12/16] feat: add strkitCommand metadata to STRkit VCFs (call or merge) --- docs/output_formats.md | 1 + strkit/call/call_sample.py | 1 + strkit/call/output/vcf.py | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/output_formats.md b/docs/output_formats.md index c17230a..4f5a24b 100644 --- a/docs/output_formats.md +++ b/docs/output_formats.md @@ -168,6 +168,7 @@ VCF meta fields (non-exhaustive): * `phasing`: present and set to `partial` if using SNV/HP phasing * `reference`: absolute file URI to FASTA reference (`file://<...>`) * `strkitVersion`: STRkit version used to generate the file +* `strkitCommand`: STRkit subcommand used to generate the VCF (either `call` or `merge`) * `strkitCatalogNumLoci`: Number of total loci in the catalog used for genotyping * `strkitCatalogHash`: SHA256 hash of the loci loaded from the catalog diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py index cdfa183..5568333 100644 --- a/strkit/call/call_sample.py +++ b/strkit/call/call_sample.py @@ -312,6 +312,7 @@ def call_sample( vf: VariantFile | None = None if vcf_path is not None: vh = build_vcf_header( + "call", (sample_id_str,), params.reference_file, partial_phasing=params.snv_vcf is not None or params.use_hp, diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 45b3f90..c1ee213 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -45,7 +45,7 @@ def iter_to_upper(x: Iterable[str]) -> Iterable[str]: def build_vcf_header( - sample_ids: tuple[str, ...], reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str + command: str, sample_ids: tuple[str, ...], reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str ) -> VariantHeader: 
vh = VariantHeader() # automatically sets VCF version to 4.2 @@ -70,6 +70,8 @@ def build_vcf_header( # Add STRkit-specific fields: # - marking version vh.add_meta("strkitVersion", str(__version__)) + # - the subcommand being used to generate this VCF (call|merge) + vh.add_meta("strkitCommand", command) # - indicating number of loci provided (i.e., catalogue size) vh.add_meta("strkitCatalogNumLoci", str(num_loci)) # - indicating hash of STRkitLocus objects (for checking catalogue sameness) From 0f99102a4631f44379ac1b632ed2b1d38ceb54d3 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 20:22:26 -0500 Subject: [PATCH 13/16] chore: allow contigs to be passed directly to VCF header builder --- strkit/call/call_sample.py | 3 ++- strkit/call/output/vcf.py | 29 ++++++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py index 5568333..7d6e16b 100644 --- a/strkit/call/call_sample.py +++ b/strkit/call/call_sample.py @@ -264,6 +264,7 @@ def call_sample( from heapq import merge as heapq_merge from numpy.random import default_rng as np_default_rng + from pathlib import Path from pysam import VariantFile from strkit.logger import get_main_logger @@ -314,7 +315,7 @@ def call_sample( vh = build_vcf_header( "call", (sample_id_str,), - params.reference_file, + Path(params.reference_file), partial_phasing=params.snv_vcf is not None or params.use_hp, num_loci=num_loci, loci_hash=loci_hash, diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index c1ee213..7d271ea 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -45,7 +45,13 @@ def iter_to_upper(x: Iterable[str]) -> Iterable[str]: def build_vcf_header( - command: str, sample_ids: tuple[str, ...], reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str + command: str, + sample_ids: tuple[str, ...], + reference_file: Path | str, # if Path, turn into file URI. 
Otherwise, treat "as-is". + partial_phasing: bool, + num_loci: int, + loci_hash: str, + contigs: tuple[tuple[str, int], ...] | None = None, ) -> VariantHeader: vh = VariantHeader() # automatically sets VCF version to 4.2 @@ -59,13 +65,22 @@ def build_vcf_header( if partial_phasing: vh.add_meta("phasing", "partial") - # Add an absolute path to the reference genome - vh.add_meta("reference", f"file://{str(Path(reference_file).resolve().absolute())}") + # Add an absolute path to the reference genome. If we're passed a string (from merge), assume this is already a file + # URI. + vh.add_meta( + "reference", + f"file://{str(reference_file.resolve().absolute())}" if isinstance(reference_file, Path) else reference_file + ) - # Add all contigs from the reference genome file + lengths - with FastaFile(reference_file) as rf: - for contig in rf.references: - vh.contigs.add(contig, length=rf.get_reference_length(contig)) + if contigs is not None: + # If we're passed contig records directly (from merge), add them rather than pulling them from the reference. 
+ for contig, contig_length in contigs: + vh.contigs.add(contig, length=contig_length) + else: + # Add all contigs from the reference genome file + lengths + with FastaFile(str(reference_file)) as rf: + for contig in rf.references: + vh.contigs.add(contig, length=rf.get_reference_length(contig)) # Add STRkit-specific fields: # - marking version From e082f98aad375111baa9a79fc8df1c103427da6d Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 20:22:32 -0500 Subject: [PATCH 14/16] missing info add --- strkit/call/output/vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 7d271ea..b9eb2f0 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -116,6 +116,7 @@ def build_vcf_header( vu.header.VCF_INFO_REFMC.add_to_info(vh.info) vu.header.VCF_INFO_BED_START.add_to_info(vh.info) vu.header.VCF_INFO_BED_END.add_to_info(vh.info) + vu.header.VCF_INFO_ANCH.add_to_info(vh.info) # Add INFO records for tandem repeat copies - these are new to VCF4.4! 
TODO # for iv in VCF_TR_INFO_RECORDS: From d6ee8c8016dde01eb19d5b40197c46c659c6ccb1 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 20:22:45 -0500 Subject: [PATCH 15/16] merge work (doesn't run) --- strkit/entry.py | 3 +- strkit/merge/merge_vcfs.py | 135 ++++++++++++++++++++++++------------- 2 files changed, 92 insertions(+), 46 deletions(-) diff --git a/strkit/entry.py b/strkit/entry.py index cbfa338..475211f 100644 --- a/strkit/entry.py +++ b/strkit/entry.py @@ -485,7 +485,8 @@ def _exec_call(p_args) -> None: def _exec_merge(p_args) -> None: from strkit.merge.merge_vcfs import merge_vcfs - merge_vcfs(tuple(Path(p) for p in p_args.vcf_files)) + logger = _main_logger(p_args) + merge_vcfs(tuple(Path(p) for p in p_args.vcf_files), logger) def _exec_mi(p_args) -> None: diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index 431d771..b1398ca 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -1,41 +1,62 @@ from __future__ import annotations -from pathlib import Path from pysam import VariantFile from typing import TYPE_CHECKING, Callable +from .. 
import vcf_utils as vu from ..call.output import vcf -from ..logger import get_main_logger if TYPE_CHECKING: from logging import Logger + from pathlib import Path from pysam import VariantHeader, VariantRecord __all__ = ["merge_vcfs"] -COMMON_STR_INFO_KEYS = ( - vcf.VCF_INFO_VT, - vcf.VCF_INFO_MOTIF, - vcf.VCF_INFO_REFMC, - vcf.VCF_INFO_BED_START, - vcf.VCF_INFO_BED_END, +COMMON_STR_INFO_ITEMS = ( + vu.header.VCF_INFO_VT, + vu.header.VCF_INFO_MOTIF, + vu.header.VCF_INFO_REFMC, + vu.header.VCF_INFO_BED_START, + vu.header.VCF_INFO_BED_END, ) -COMMON_SNV_INFO_KEYS = ( - vcf.VCF_INFO_VT, +COMMON_SNV_INFO_ITEMS = ( + vu.header.VCF_INFO_VT, ) -def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): +class VariantFileForMerge: + def __init__(self, path: Path): + self.path = path + self.f = VariantFile(str(path)) + self.h = self.f.header + self.samples = tuple(s for s in self.h.samples) + + def close(self): + self.f.close() + + +def merge_headers( + files: tuple[VariantFileForMerge, ...], logger: Logger +) -> tuple[VariantHeader, dict[str, VariantFileForMerge]]: + partial_phasing: bool = False header_contigs: set[tuple[str, ...]] = set() strkit_versions: set[str] = set() strkit_catalog_hashes: set[str] = set() + sample_id_file_map: dict[str, VariantFileForMerge] = {} - for idx, hdr in enumerate(headers, 1): + # -- Loop through headers to collect information for header validation and merging --------------------------------- + + for idx, f in enumerate(files, 1): + hdr = f.h header_contigs.add(tuple(sorted(hdr.contigs))) strkit_version_found = False strkit_catalog_hash_found = False for line in hdr.records: + if line.key == "phasing" and line.value == "partial": + # make phasing partial for the whole file if at least one file has it + partial_phasing = True if line.key == "strkitVersion": strkit_versions.add(line.value) strkit_version_found = True @@ -48,9 +69,13 @@ def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): if not 
strkit_catalog_hash_found: logger.critical("did not find strkitCatalogLociHash in VCF #%d", idx) exit(1) + if intersect := set(hdr.samples).intersection(set(sample_id_file_map.keys())): + logger.critical("found overlap between sample IDs: %s", intersect) + exit(1) + for s in hdr.samples: + sample_id_file_map[s] = f # add samples from file to map - # check references - # check info field overlaps + # -- Perform pre-merge validation on the header data --------------------------------------------------------------- passed_validation = True @@ -72,23 +97,36 @@ def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): "different catalogs" ) passed_validation = False + # check references + # TODO + # check info field overlaps + # TODO if passed_validation: logger.info("VCF headers passed validation") else: logger.warning("VCF headers failed validation") - # TODO: return some info to help? + # -- Create header for merged VCF ---------------------------------------------------------------------------------- + # TODO: maybe just create a header from scratch... 
+ merged_header = files[0].h.copy() + for f in files[1:]: + for s in f.samples: + merged_header.add_sample(s) + + print(merged_header, end="") + + return merged_header, sample_id_file_map -def merge_str_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: +def merge_str_records(merged_header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: contig = records[0].contig refs = tuple(p.ref for p in records) if len(set(refs)) > 1: logger.debug("merge_str_records: encountered >1 STR reference sequence") - infos = set(tuple(p.info[k] for k in COMMON_STR_INFO_KEYS) for p in records) + infos = set(tuple(p.info[k.key] for k in COMMON_STR_INFO_ITEMS) for p in records) if len(infos) > 1: logger.warning("merge_str_records: encountered >1 set of INFO field values for STR") @@ -116,23 +154,23 @@ def merge_str_records(header: VariantHeader, records: list[VariantRecord], logge # all same locus, can merge alleles = (ref, *(alts or (".",))) - print(alleles) - rec = header.new_record(contig, start=start, alleles=alleles, id=records[0].id) + print("new str alleles", alleles) + rec = merged_header.new_record(contig, start=start, alleles=alleles, id=records[0].id) # -- add INFO (common across samples) to merged STR record --------------------------------------------------------- common_info = next(iter(infos)) - for ki, k in enumerate(COMMON_STR_INFO_KEYS): + for ki, k in enumerate(COMMON_STR_INFO_ITEMS): rec.info[k] = common_info[ki] # ------------------------------------------------------------------------------------------------------------------ # Done common stuff, now need to add/transform sample data - for sample in header.samples: + for sample in merged_header.samples: pass return rec -def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: +def merge_snv_records(merged_header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: 
print("TODO: SNV MERGE") print([str(r) for r in records]) @@ -143,21 +181,39 @@ def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logge exit(1) ref = refs[0] - infos = set(tuple(p.info[k] for k in COMMON_SNV_INFO_KEYS) for p in records) + infos = set(tuple(p.info[k] for k in COMMON_SNV_INFO_ITEMS) for p in records) if len(infos) > 1: logger.warning("merge_str_records: encountered >1 set of INFO field values for STR") alts = sorted(set(a for p in records for a in (p.alts or ()) if a != ref)) alleles = (ref, *alts) - rec = header.new_record(contig=records[0].contig, start=records[0].start, alleles=alleles) # TODO + print(f"new snv alleles ({len(alleles)})", alleles) + rec = merged_header.new_record(contig=records[0].contig, start=records[0].start, alleles=alleles) # TODO common_info = next(iter(infos)) - for ki, k in enumerate(COMMON_SNV_INFO_KEYS): - rec.info[k] = common_info[ki] + for ki, k in enumerate(COMMON_SNV_INFO_ITEMS): + rec.info[k.key] = common_info[ki] + # ------------------------------------------------------------------------------------------------------------------ # Done common stuff, now need to add/transform sample data - for sample in header.samples: + + calls_by_sample_id = {} + for r in records: + for s_id in merged_header.samples: + calls_by_sample_id[s_id] = TODO + + for sample_id in merged_header.samples: + # TODO: preserve phasing + rec.samples[sample_id]["GT"] = vu.record.genotype_indices( + alleles, calls_by_sample_id[sample_id], TODO_N_ALLELES + ) + # TODO + # rec.samples[sample_id]["GT"] = ( + # tuple(map(alleles.index, seqs_with_anchors)) + # if rec[sample_id] is not None + # else _blank_entry(n_alleles) + # ) pass return rec @@ -169,29 +225,18 @@ def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logge } -def merge_vcfs(paths: tuple[Path, ...]): - logger = get_main_logger() # TODO: inject - - fhs: tuple[VariantFile, ...] 
= tuple(VariantFile(str(p)) for p in paths) +def merge_vcfs(paths: tuple[Path, ...], logger: Logger): + fhs: tuple[VariantFileForMerge, ...] = tuple(VariantFileForMerge(p) for p in paths) try: - headers = tuple(f.header for f in fhs) - - # Check that headers indicate these files can be merged (they are STRkit headers and are compatible) - check_headers(headers, logger) # TODO - - # -------------------------------------------------------------------------------------------------------------- - - # Try merging headers - merged_header = headers[0] - for hdr in headers[1:]: - merged_header = merged_header.merge(hdr) - + # Check that headers indicate these files can be merged (they are STRkit headers and are compatible), + # then merge them. Also computes a map from sample ID --> file. + merged_header, sample_id_file_map = merge_headers(fhs, logger) # TODO print(merged_header, end="") # -------------------------------------------------------------------------------------------------------------- - iters = tuple(f.fetch() for f in fhs) + iters = tuple(f.f.fetch() for f in fhs) ptrs = [next(i, None) for i in iters] while any(p is not None for p in ptrs): @@ -202,7 +247,7 @@ def merge_vcfs(paths: tuple[Path, ...]): else: # we have a record which doesn't apply to all samples - either some VCFs are finished, or the pointers # are at different points in the file. 
- print("TODO some samples!") + print("TODO some vcfs!") # TODO: selectively advance # TODO: for each STR, need to: From 702d3fe93a712f0c82f662fe277022253e6067de Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 20:47:39 -0500 Subject: [PATCH 16/16] refact: move vcf header build fn to vcf_utils --- strkit/call/call_sample.py | 2 +- strkit/call/output/__init__.py | 3 +- strkit/call/output/vcf.py | 111 +-------------------------------- strkit/vcf_utils/header.py | 107 +++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 111 deletions(-) diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py index 7d6e16b..392f78a 100644 --- a/strkit/call/call_sample.py +++ b/strkit/call/call_sample.py @@ -268,6 +268,7 @@ def call_sample( from pysam import VariantFile from strkit.logger import get_main_logger + from strkit.vcf_utils.header import build_vcf_header from .loci import load_loci from .output import ( @@ -275,7 +276,6 @@ def call_sample( output_json_report_results, output_json_report_footer, output_tsv as output_tsv_fn, - build_vcf_header, output_contig_vcf_lines, ) from .utils import get_new_seed diff --git a/strkit/call/output/__init__.py b/strkit/call/output/__init__.py index 23a1322..5707b6a 100644 --- a/strkit/call/output/__init__.py +++ b/strkit/call/output/__init__.py @@ -1,12 +1,11 @@ from .json_report import output_json_report_header, output_json_report_results, output_json_report_footer from .tsv import output_tsv -from .vcf import build_vcf_header, output_contig_vcf_lines +from .vcf import output_contig_vcf_lines __all__ = [ "output_json_report_header", "output_json_report_results", "output_json_report_footer", "output_tsv", - "build_vcf_header", "output_contig_vcf_lines", ] diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index b9eb2f0..94731da 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -2,11 +2,9 @@ from collections import Counter from os.path import commonprefix 
-from pathlib import Path -from pysam import FastaFile, VariantHeader from typing import Iterable, TYPE_CHECKING -from strkit import __version__, vcf_utils as vu +from strkit import vcf_utils as vu from strkit.utils import is_none, idx_0_getter from ..utils import cn_getter @@ -17,118 +15,15 @@ from ..types import LocusResult __all__ = [ - "build_vcf_header", "output_contig_vcf_lines", ] -# VCF_ALLELE_CNV_TR = "" - -# VCF_TR_INFO_RECORDS: tuple[tuple[str, str, str, str], ...] = ( -# ("SVLEN", "A", "Integer", "Length of the structural variant"), -# ("CN", "A", "Float", "Copy number of allele"), -# ("RN", "A", "Integer", "Total number of repeat sequences in this allele"), -# ("RUS", ".", "String", "Repeat unit sequence of the corresponding repeat sequence"), -# ("RUL", ".", "Integer", "Repeat unit length of the corresponding repeat sequence"), -# ("RB", ".", "Integer", "Total number of bases in the corresponding repeat sequence"), -# ("CIRUC", ".", "Float", "Confidence interval around RUC"), -# ("CIRB", ".", "Integer", "Confidence interval around RB"), -# ) - -VT_STR = "str" -VT_SNV = "snv" - - def iter_to_upper(x: Iterable[str]) -> Iterable[str]: # noinspection PyTypeChecker return map(str.upper, x) -def build_vcf_header( - command: str, - sample_ids: tuple[str, ...], - reference_file: Path | str, # if Path, turn into file URI. Otherwise, treat "as-is". - partial_phasing: bool, - num_loci: int, - loci_hash: str, - contigs: tuple[tuple[str, int], ...] | None = None, -) -> VariantHeader: - vh = VariantHeader() # automatically sets VCF version to 4.2 - - # Add file date - vu.header.add_vcf_file_date(vh) - - # Add source - vh.add_meta("source", "strkit") - - # Mark that we have partial phasing if we're using HP/SNVs - if partial_phasing: - vh.add_meta("phasing", "partial") - - # Add an absolute path to the reference genome. If we're passed a string (from merge), assume this is already a file - # URI. 
- vh.add_meta( - "reference", - f"file://{str(reference_file.resolve().absolute())}" if isinstance(reference_file, Path) else reference_file - ) - - if contigs is not None: - # If we're passed contig records directly (from merge), add them rather than pulling them from the reference. - for contig, contig_length in contigs: - vh.contigs.add(contig, length=contig_length) - else: - # Add all contigs from the reference genome file + lengths - with FastaFile(str(reference_file)) as rf: - for contig in rf.references: - vh.contigs.add(contig, length=rf.get_reference_length(contig)) - - # Add STRkit-specific fields: - # - marking version - vh.add_meta("strkitVersion", str(__version__)) - # - the subcommand being used to generate this VCF (call|merge) - vh.add_meta("strkitCommand", command) - # - indicating number of loci provided (i.e., catalogue size) - vh.add_meta("strkitCatalogNumLoci", str(num_loci)) - # - indicating hash of STRkitLocus objects (for checking catalogue sameness) - vh.add_meta("strkitCatalogLociHash", loci_hash) - - # Add CNV:TR alt type (symbolic allele: tandem repeat) - # vh.add_meta("ALT", "") - - # Set up basic VCF formats - vh.formats.add("AD", ".", "Integer", "Read depth for each allele") - vh.formats.add("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence") - vh.formats.add("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)") - vh.formats.add("DP", 1, "Integer", "Read depth") - vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)") - vh.formats.add("GT", 1, "String", "Genotype") - vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele") - vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele") - vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele") - vh.formats.add("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads.") - 
vh.formats.add("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call") - vh.formats.add("PS", 1, "Integer", "Phase set") - vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)") - - # Set up VCF info fields (mutates vh.info) - vu.header.VCF_INFO_VT.add_to_info(vh.info) - vu.header.VCF_INFO_MOTIF.add_to_info(vh.info) - vu.header.VCF_INFO_REFMC.add_to_info(vh.info) - vu.header.VCF_INFO_BED_START.add_to_info(vh.info) - vu.header.VCF_INFO_BED_END.add_to_info(vh.info) - vu.header.VCF_INFO_ANCH.add_to_info(vh.info) - - # Add INFO records for tandem repeat copies - these are new to VCF4.4! TODO - # for iv in VCF_TR_INFO_RECORDS: - # vh.info.add(*iv) - - # Add the sample(s) - for sample_id in sample_ids: - vh.add_sample(sample_id) - - return vh - - def _vr_pos_key(vr: VariantRecord) -> int: return vr.pos @@ -224,7 +119,7 @@ def create_result_vcf_records( alleles=seq_alleles, ) - vr.info[vu.header.VCF_INFO_VT.key] = VT_STR + vr.info[vu.header.VCF_INFO_VT.key] = vu.header.VT_STR vr.info[vu.header.VCF_INFO_MOTIF.key] = result["motif"] vr.info[vu.header.VCF_INFO_REFMC.key] = result["ref_cn"] vr.info[vu.header.VCF_INFO_BED_START.key] = result["start"] @@ -318,7 +213,7 @@ def create_result_vcf_records( alleles=snv_alleles, ) - snv_vr.info[vu.header.VCF_INFO_VT.key] = VT_SNV + snv_vr.info[vu.header.VCF_INFO_VT.key] = vu.header.VT_SNV snv_vr.samples[sample_id]["GT"] = vu.record.genotype_indices(snv_alleles, snv["call"], n_alleles) snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) diff --git a/strkit/vcf_utils/header.py b/strkit/vcf_utils/header.py index 79d635c..62ea574 100644 --- a/strkit/vcf_utils/header.py +++ b/strkit/vcf_utils/header.py @@ -1,6 +1,13 @@ +from __future__ import annotations + from dataclasses import dataclass from datetime import datetime from pysam import VariantHeader, VariantHeaderMetadata +from typing import TYPE_CHECKING +from .. 
def build_vcf_header(
    command: str,
    sample_ids: tuple[str, ...],
    reference_file: Path | str,  # if Path, turn into file URI. Otherwise, treat "as-is".
    partial_phasing: bool,
    num_loci: int,
    loci_hash: str,
    contigs: tuple[tuple[str, int], ...] | None = None,
) -> VariantHeader:
    """
    Build a pysam VariantHeader for STRkit VCF output (used by both the ``call`` and ``merge`` subcommands).

    :param command: The STRkit subcommand generating this VCF (call|merge); written as ``strkitCommand``.
    :param sample_ids: Sample IDs to register in the header, in output column order.
    :param reference_file: Reference genome. If a ``Path``, it is converted to an absolute ``file://`` URI;
        if a ``str`` (from merge), it is assumed to already be a file URI and is written as-is.
    :param partial_phasing: If True, mark the VCF as partially phased (HP/SNV-based phasing).
    :param num_loci: Catalog size (number of loci); written as ``strkitCatalogNumLoci``.
    :param loci_hash: Hash of the STRkitLocus objects, for checking catalog sameness
        (``strkitCatalogLociHash``).
    :param contigs: Optional (name, length) contig records (from merge). If None, contigs are read from the
        reference FASTA instead — in that case, ``reference_file`` must be a readable FASTA path, NOT a URI
        string, since it is opened with ``FastaFile``.
    :return: The constructed header, ready to be passed to a ``VariantFile`` for writing.
    """
    vh = VariantHeader()  # automatically sets VCF version to 4.2

    # Add file date
    add_vcf_file_date(vh)

    # Add source
    vh.add_meta("source", "strkit")

    # Mark that we have partial phasing if we're using HP/SNVs
    if partial_phasing:
        vh.add_meta("phasing", "partial")

    # Add an absolute path to the reference genome. If we're passed a string (from merge), assume this is already a
    # file URI. Path.resolve() already yields an absolute path, so no further normalization is needed, and the
    # f-string stringifies the Path implicitly.
    vh.add_meta(
        "reference",
        f"file://{reference_file.resolve()}" if isinstance(reference_file, Path) else reference_file
    )

    if contigs is not None:
        # If we're passed contig records directly (from merge), add them rather than pulling them from the reference.
        for contig, contig_length in contigs:
            vh.contigs.add(contig, length=contig_length)
    else:
        # Add all contigs from the reference genome file + lengths
        from pysam import FastaFile
        with FastaFile(str(reference_file)) as rf:
            for contig in rf.references:
                vh.contigs.add(contig, length=rf.get_reference_length(contig))

    # Add STRkit-specific fields:
    # - marking version
    vh.add_meta("strkitVersion", str(__version__))
    # - the subcommand being used to generate this VCF (call|merge)
    vh.add_meta("strkitCommand", command)
    # - indicating number of loci provided (i.e., catalogue size)
    vh.add_meta("strkitCatalogNumLoci", str(num_loci))
    # - indicating hash of STRkitLocus objects (for checking catalogue sameness)
    vh.add_meta("strkitCatalogLociHash", loci_hash)

    # Add CNV:TR alt type (symbolic allele: tandem repeat)
    # vh.add_meta("ALT", "<ID=CNV:TR,Description=\"Tandem repeat determined based on DNA abundance ratio\">")

    # Set up basic VCF formats
    vh.formats.add("AD", ".", "Integer", "Read depth for each allele")
    vh.formats.add("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence")
    vh.formats.add("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)")
    vh.formats.add("DP", 1, "Integer", "Read depth")
    vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)")
    vh.formats.add("GT", 1, "String", "Genotype")
    vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele")
    vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele")
    vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele")
    vh.formats.add("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads.")
    vh.formats.add("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call")
    vh.formats.add("PS", 1, "Integer", "Phase set")
    vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)")

    # Set up VCF info fields (mutates vh.info)
    VCF_INFO_VT.add_to_info(vh.info)
    VCF_INFO_MOTIF.add_to_info(vh.info)
    VCF_INFO_REFMC.add_to_info(vh.info)
    VCF_INFO_BED_START.add_to_info(vh.info)
    VCF_INFO_BED_END.add_to_info(vh.info)
    VCF_INFO_ANCH.add_to_info(vh.info)

    # Add INFO records for tandem repeat copies - these are new to VCF4.4! TODO
    # for iv in VCF_TR_INFO_RECORDS:
    #     vh.info.add(*iv)

    # Add the sample(s)
    for sample_id in sample_ids:
        vh.add_sample(sample_id)

    return vh