Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ The code for the software itself is also available and citeable in Zenodo: https
* [Usage](#usage)
* [Further documentation on the STRkit caller, including output format](#further-documentation-on-the-strkit-caller-including-output-format)
* [`strkit visualize`: Call visualizer](#strkit-visualize-call-visualizer)
* [`strkit merge`: VCF merging tool](#strkit-merge-vcf-merging-tool)
* [`strkit mi`: Mendelian inheritance analysis](#strkit-mi-mendelian-inheritance-analysis)
* [Usage](#usage-1)
* [Further documentation](#further-documentation)
Expand Down Expand Up @@ -296,6 +297,12 @@ the start-up instructions.



### `strkit merge`: VCF merging tool

Merges multiple STRkit-generated VCF files (produced by `strkit call`) into a single VCF. Usage: `strkit merge <vcf_file> [<vcf_file> ...]`



### `strkit mi`: Mendelian inheritance analysis

Using trio data, candidate de novo STR mutations (or genotyping errors/dropout rates) can be discovered
Expand Down
1 change: 1 addition & 0 deletions docs/output_formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ VCF meta fields (non-exhaustive):
* `phasing`: present and set to `partial` if using SNV/HP phasing
* `reference`: absolute file URI to FASTA reference (`file://<...>`)
* `strkitVersion`: STRkit version used to generate the file
* `strkitCommand`: STRkit subcommand used to generate the VCF (either `call` or `merge`)
* `strkitCatalogNumLoci`: Number of total loci in the catalog used for genotyping
* `strkitCatalogHash`: SHA256 hash of the loci loaded from the catalog

Expand Down
8 changes: 5 additions & 3 deletions strkit/call/call_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,17 +264,18 @@ def call_sample(

from heapq import merge as heapq_merge
from numpy.random import default_rng as np_default_rng
from pathlib import Path
from pysam import VariantFile

from strkit.logger import get_main_logger
from strkit.vcf_utils.header import build_vcf_header

from .loci import load_loci
from .output import (
output_json_report_header,
output_json_report_results,
output_json_report_footer,
output_tsv as output_tsv_fn,
build_vcf_header,
output_contig_vcf_lines,
)
from .utils import get_new_seed
Expand Down Expand Up @@ -312,8 +313,9 @@ def call_sample(
vf: VariantFile | None = None
if vcf_path is not None:
vh = build_vcf_header(
sample_id_str,
params.reference_file,
"call",
(sample_id_str,),
Path(params.reference_file),
partial_phasing=params.snv_vcf is not None or params.use_hp,
num_loci=num_loci,
loci_hash=loci_hash,
Expand Down
3 changes: 1 addition & 2 deletions strkit/call/output/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from .json_report import output_json_report_header, output_json_report_results, output_json_report_footer
from .tsv import output_tsv
from .vcf import build_vcf_header, output_contig_vcf_lines
from .vcf import output_contig_vcf_lines

__all__ = [
"output_json_report_header",
"output_json_report_results",
"output_json_report_footer",
"output_tsv",
"build_vcf_header",
"output_contig_vcf_lines",
]
154 changes: 19 additions & 135 deletions strkit/call/output/vcf.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
from __future__ import annotations

from collections import Counter
from datetime import datetime
from functools import cache
from os.path import commonprefix
from pathlib import Path
from pysam import FastaFile, VariantHeader
from typing import Iterable, TYPE_CHECKING

from strkit import __version__
from strkit import vcf_utils as vu
from strkit.utils import is_none, idx_0_getter
from ..utils import cn_getter

Expand All @@ -19,131 +15,19 @@
from ..types import LocusResult

__all__ = [
"build_vcf_header",
"output_contig_vcf_lines",
]


# VCF_ALLELE_CNV_TR = "<CNV:TR>"

# VCF_TR_INFO_RECORDS: tuple[tuple[str, str, str, str], ...] = (
# ("SVLEN", "A", "Integer", "Length of the structural variant"),
# ("CN", "A", "Float", "Copy number of allele"),
# ("RN", "A", "Integer", "Total number of repeat sequences in this allele"),
# ("RUS", ".", "String", "Repeat unit sequence of the corresponding repeat sequence"),
# ("RUL", ".", "Integer", "Repeat unit length of the corresponding repeat sequence"),
# ("RB", ".", "Integer", "Total number of bases in the corresponding repeat sequence"),
# ("CIRUC", ".", "Float", "Confidence interval around RUC"),
# ("CIRB", ".", "Integer", "Confidence interval around RB"),
# )

VCF_INFO_VT = "VT"
VCF_INFO_MOTIF = "MOTIF"
VCF_INFO_REFMC = "REFMC"
VCF_INFO_BED_START = "BED_START"
VCF_INFO_BED_END = "BED_END"
VCF_INFO_ANCH = "ANCH"

VT_STR = "str"
VT_SNV = "snv"


def iter_to_upper(x: Iterable[str]) -> Iterable[str]:
    """Lazily upper-case every string yielded by the given iterable."""
    return (s.upper() for s in x)


def build_vcf_header(
    sample_id: str, reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str
) -> VariantHeader:
    """Build the pysam VariantHeader for STRkit VCF output.

    Adds provenance meta lines, contigs taken from the reference FASTA,
    STRkit catalog metadata, the FORMAT/INFO field definitions used by the
    caller, and the single sample column.

    :param sample_id: Sample identifier added as the VCF's only sample column.
    :param reference_file: Path to the reference genome FASTA.
    :param partial_phasing: True if SNV/HP phasing was used (marks output as partially phased).
    :param num_loci: Total number of loci in the genotyping catalog.
    :param loci_hash: Hash of the loaded catalog loci (for catalog-sameness checks).
    :return: A fully-populated header, ready for opening a VariantFile for writing.
    """
    header = VariantHeader()  # pysam sets the VCF version to 4.2 automatically

    # File-level provenance meta lines (kept in this order for stable output).
    today = datetime.now()
    header.add_meta("fileDate", f"{today.year}{today.month:02d}{today.day:02d}")
    header.add_meta("source", "strkit")
    if partial_phasing:
        # Only some loci get phased via HP tags / SNVs, hence "partial".
        header.add_meta("phasing", "partial")
    # Absolute file URI pointing at the reference genome used for calling.
    header.add_meta("reference", f"file://{str(Path(reference_file).resolve().absolute())}")

    # Mirror every contig (with its length) from the reference FASTA index.
    with FastaFile(reference_file) as rf:
        for contig_name in rf.references:
            header.contigs.add(contig_name, length=rf.get_reference_length(contig_name))

    # STRkit-specific meta: version + catalog size/hash for sameness checks.
    for meta_key, meta_value in (
        ("strkitVersion", str(__version__)),
        ("strkitCatalogNumLoci", str(num_loci)),
        ("strkitCatalogLociHash", loci_hash),
    ):
        header.add_meta(meta_key, meta_value)

    # FORMAT (per-sample) field definitions: (ID, Number, Type, Description).
    for fmt_field in (
        ("AD", ".", "Integer", "Read depth for each allele"),
        ("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence"),
        ("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)"),
        ("DP", 1, "Integer", "Read depth"),
        ("DPS", 1, "Integer", "Read depth (supporting reads only)"),
        ("GT", 1, "String", "Genotype"),
        ("MC", ".", "Integer", "Motif copy number for each allele"),
        ("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele"),
        ("MCRL", ".", "String", "Read-level motif copy numbers for each allele"),
        ("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads."),
        ("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call"),
        ("PS", 1, "Integer", "Phase set"),
        ("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)"),
    ):
        header.formats.add(*fmt_field)

    # INFO (per-record) field definitions: (ID, Number, Type, Description).
    for info_field in (
        (VCF_INFO_VT, 1, "String", "Variant record type (str/snv)"),
        (VCF_INFO_MOTIF, 1, "String", "Motif string"),
        (VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome"),
        (
            VCF_INFO_BED_START,
            1,
            "Integer",
            "Original start position of the locus as defined in the catalog (0-based inclusive)",
        ),
        (
            VCF_INFO_BED_END,
            1,
            "Integer",
            "Original end position of the locus as defined in the catalog (0-based exclusive, i.e., 1-based)",
        ),
        (VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size"),
    ):
        header.info.add(*info_field)

    # TODO: VCF 4.4 tandem-repeat records (<CNV:TR> ALT + SVLEN/CN/RN/... INFO)
    #  are not emitted yet; see the module-level commented-out definitions.

    # Register the single sample column.
    header.add_sample(sample_id)

    return header


def _vr_pos_key(vr: VariantRecord) -> int:
return vr.pos


@cache
def _blank_entry(n_alleles: int) -> tuple[None, ...]:
return tuple([None] * n_alleles)


class SkipWritingLocus(Exception):
    """Control-flow exception: skip VCF output for one locus (cause is logged by the raiser before raising)."""
    pass


def create_result_vcf_records(
params: CallParams,
variant_file: VariantFile,
Expand All @@ -160,7 +44,7 @@ def create_result_vcf_records(

if "ref_start_anchor" not in result:
logger.debug("No ref anchor for %s:%d; skipping VCF output for locus", contig, start)
raise SkipWritingLocus()
raise vu.record.SkipWritingLocus()

ref_start_anchor = result["ref_start_anchor"].upper()
ref_seq = result["ref_seq"].upper()
Expand All @@ -176,11 +60,11 @@ def create_result_vcf_records(

if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks
logger.error("Encountered None in results[%d].peaks.seqs: %s", result_idx, peak_seqs)
raise SkipWritingLocus()
raise vu.record.SkipWritingLocus()

if any(map(is_none, peak_start_anchor_seqs)): # Occurs when no consensus for one of the peaks
logger.error("Encountered None in results[%d].peaks.start_anchor_seqs: %s", result_idx, peak_start_anchor_seqs)
raise SkipWritingLocus()
raise vu.record.SkipWritingLocus()

peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs))
common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper])
Expand Down Expand Up @@ -235,24 +119,24 @@ def create_result_vcf_records(
alleles=seq_alleles,
)

vr.info[VCF_INFO_VT] = VT_STR
vr.info[VCF_INFO_MOTIF] = result["motif"]
vr.info[VCF_INFO_REFMC] = result["ref_cn"]
vr.info[VCF_INFO_BED_START] = result["start"]
vr.info[VCF_INFO_BED_END] = result["end"]
vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset
vr.info[vu.header.VCF_INFO_VT.key] = vu.header.VT_STR
vr.info[vu.header.VCF_INFO_MOTIF.key] = result["motif"]
vr.info[vu.header.VCF_INFO_REFMC.key] = result["ref_cn"]
vr.info[vu.header.VCF_INFO_BED_START.key] = result["start"]
vr.info[vu.header.VCF_INFO_BED_END.key] = result["end"]
vr.info[vu.header.VCF_INFO_ANCH.key] = params.vcf_anchor_size - anchor_offset

try:
vr.samples[sample_id]["GT"] = (
tuple(map(seq_alleles_raw.index, seqs_with_anchors))
if call is not None and peak_seqs
else _blank_entry(n_alleles)
vr.samples[sample_id]["GT"] = vu.record.genotype_indices(
alleles=seq_alleles_raw,
call=None if call is None or not peak_seqs else seqs_with_anchors,
n_alleles=n_alleles,
)
except ValueError:
logger.error(
"results[%d] (locus_id=%s): one of %s not in %s",
result_idx, locus_id, seqs_with_anchors, seq_alleles_raw)
raise SkipWritingLocus()
result_idx, result["locus_id"], seqs_with_anchors, seq_alleles_raw)
raise vu.record.SkipWritingLocus()

if am := result.get("assign_method"):
vr.samples[sample_id]["PM"] = am
Expand Down Expand Up @@ -329,9 +213,9 @@ def create_result_vcf_records(
alleles=snv_alleles,
)

snv_vr.info[VCF_INFO_VT] = VT_SNV
snv_vr.info[vu.header.VCF_INFO_VT.key] = vu.header.VT_SNV

snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"]))
snv_vr.samples[sample_id]["GT"] = vu.record.genotype_indices(snv_alleles, snv["call"], n_alleles)
snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"])
snv_vr.samples[sample_id]["AD"] = snv["rcs"]

Expand Down Expand Up @@ -369,7 +253,7 @@ def output_contig_vcf_lines(
logger,
)
)
except SkipWritingLocus:
except vu.record.SkipWritingLocus:
pass # just skipping the locus, nothing to do here as we've already logged
except Exception as e: # fallback if we didn't handle a case properly in create_result_vcf_records
logger.exception("Error while writing VCF: unhandled exception at results[%d]", result_idx, exc_info=e)
Expand Down
16 changes: 16 additions & 0 deletions strkit/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,10 @@ def add_call_parser_args(call_parser):
call_parser.add_argument("--profile", action="store_true", help="Profile function calls (for development.)")


def add_merge_parser_args(merge_parser):
    """Register CLI arguments for the `strkit merge` subcommand."""
    merge_parser.add_argument(
        "vcf_files",
        nargs="+",
        type=str,
        help="STRkit-generated VCF files to merge together.",
    )


def add_mi_parser_args(mi_parser):
mi_parser.add_argument(
"--debug",
Expand Down Expand Up @@ -479,6 +483,12 @@ def _exec_call(p_args) -> None:
)


def _exec_merge(p_args) -> None:
    """Entry point for `strkit merge`: merge the given STRkit VCFs."""
    # Local import so this subcommand's dependencies load only when invoked.
    from strkit.merge.merge_vcfs import merge_vcfs
    logger = _main_logger(p_args)
    merge_vcfs(tuple(map(Path, p_args.vcf_files)), logger)


def _exec_mi(p_args) -> None:
from strkit.mi.base import BaseCalculator
from strkit.mi.expansionhunter import ExpansionHunterCalculator
Expand Down Expand Up @@ -688,6 +698,12 @@ def _make_subparser(*names: str, help_text: str, exec_func: Callable, arg_func:
exec_func=_exec_call,
arg_func=add_call_parser_args)

_make_subparser(
"merge",
help_text="A merging tool for STRkit VCF output.",
exec_func=_exec_merge,
arg_func=add_merge_parser_args)

_make_subparser(
"mi",
help_text="A Mendelian inheritance calculator for different TR genotyping callers.",
Expand Down
Empty file added strkit/merge/__init__.py
Empty file.
Loading