From 60e8e0ec0ff2454db7839b443a822b264cfa4c85 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 17 Feb 2026 17:02:49 -0500 Subject: [PATCH 01/16] feat: add STRkit VCF merge subcommand work in progress --- strkit/merge/__init__.py | 0 strkit/merge/merge_vcfs.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 strkit/merge/__init__.py create mode 100644 strkit/merge/merge_vcfs.py diff --git a/strkit/merge/__init__.py b/strkit/merge/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py new file mode 100644 index 0000000..3418e3d --- /dev/null +++ b/strkit/merge/merge_vcfs.py @@ -0,0 +1,35 @@ +from pathlib import Path +from pysam import VariantFile, VariantHeader + +from .. import __version__ + +__all__ = ["merge_vcfs"] + + +def check_headers(hdrs: tuple[VariantHeader, ...]): + # check contigs + # check STRkit versions + # check references + # check info field overlaps + # TODO: return some info to help? + pass + + +def merge_vcfs(paths: tuple[Path, ...], out_file: Path): + fhs: tuple[VariantFile, ...] = tuple(VariantFile(str(p)) for p in paths) + + try: + hdrs = tuple(f.header for f in fhs) + + # Check that headers indicate these files can be merged (they are STRkit headers and are compatible) + check_headers(hdrs) # TODO + + # TODO: for each STR, need to: + # - ensure they're compatible (same ID with same # of loci perhaps?) + # - resolve ref (which can vary between) - probably selecting the longest one and + # pasting any needed prefix (diff. between sample's ref and current ref) onto it. + # TODO: for SNVs, we can only deliver heterozygous calls. need to note this for the user. 
+ + finally: + for fh in fhs: + fh.close() From 8623edc1b9d8b4306deed7d52ccf9b5c4f0563dc Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 18 Feb 2026 17:17:29 -0500 Subject: [PATCH 02/16] add merge subcommand --- strkit/entry.py | 15 +++++++++++++++ strkit/merge/merge_vcfs.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/strkit/entry.py b/strkit/entry.py index e018458..cbfa338 100644 --- a/strkit/entry.py +++ b/strkit/entry.py @@ -272,6 +272,10 @@ def add_call_parser_args(call_parser): call_parser.add_argument("--profile", action="store_true", help="Profile function calls (for development.)") +def add_merge_parser_args(merge_parser): + merge_parser.add_argument("vcf_files", type=str, nargs="+", help="STRkit-generated VCF files to merge together.") + + def add_mi_parser_args(mi_parser): mi_parser.add_argument( "--debug", @@ -479,6 +483,11 @@ def _exec_call(p_args) -> None: ) +def _exec_merge(p_args) -> None: + from strkit.merge.merge_vcfs import merge_vcfs + merge_vcfs(tuple(Path(p) for p in p_args.vcf_files)) + + def _exec_mi(p_args) -> None: from strkit.mi.base import BaseCalculator from strkit.mi.expansionhunter import ExpansionHunterCalculator @@ -688,6 +697,12 @@ def _make_subparser(*names: str, help_text: str, exec_func: Callable, arg_func: exec_func=_exec_call, arg_func=add_call_parser_args) + _make_subparser( + "merge", + help_text="A merging tool for STRkit VCF output.", + exec_func=_exec_merge, + arg_func=add_merge_parser_args) + _make_subparser( "mi", help_text="A Mendelian inheritance calculator for different TR genotyping callers.", diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index 3418e3d..110bd48 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -15,7 +15,7 @@ def check_headers(hdrs: tuple[VariantHeader, ...]): pass -def merge_vcfs(paths: tuple[Path, ...], out_file: Path): +def merge_vcfs(paths: tuple[Path, ...]): fhs: tuple[VariantFile, ...] 
= tuple(VariantFile(str(p)) for p in paths) try: From 0aafe1c416230436dab03dfeb195736fc4d9cdd5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 18 Feb 2026 17:17:41 -0500 Subject: [PATCH 03/16] work on first steps of vcf merging --- strkit/merge/merge_vcfs.py | 106 ++++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 6 deletions(-) diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index 110bd48..cdab2e5 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -1,28 +1,122 @@ +from __future__ import annotations + from pathlib import Path from pysam import VariantFile, VariantHeader +from typing import TYPE_CHECKING from .. import __version__ +from ..logger import get_main_logger + +if TYPE_CHECKING: + from logging import Logger __all__ = ["merge_vcfs"] -def check_headers(hdrs: tuple[VariantHeader, ...]): - # check contigs - # check STRkit versions +def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): + header_contigs: set[tuple[str, ...]] = set() + strkit_versions: set[str] = set() + strkit_catalog_hashes: set[str] = set() + + for idx, hdr in enumerate(headers, 1): + header_contigs.add(tuple(sorted(hdr.contigs))) + strkit_version_found = False + strkit_catalog_hash_found = False + for line in hdr.records: + if line.key == "strkitVersion": + strkit_versions.add(line.value) + strkit_version_found = True + elif line.key == "strkitCatalogLociHash": + strkit_catalog_hashes.add(line.value) + strkit_catalog_hash_found = True + if not strkit_version_found: + logger.critical("did not find strkitVersion in VCF #%d", idx) + exit(1) + if not strkit_catalog_hash_found: + logger.critical("did not find strkitCatalogLociHash in VCF #%d", idx) + exit(1) + # check references # check info field overlaps + + passed_validation = True + + # check contigs + if len(header_contigs) > 1: + logger.warning("more than one header contig set found - are all VCFs using the same reference genome?") + 
passed_validation = False + # check STRkit versions + if len(strkit_versions) > 1: + logger.warning( + "more than one STRkit version found (%s) - output may be different across versions", + ", ".join(sorted(strkit_versions)), + ) + passed_validation = False + # check catalog hashes + if len(strkit_catalog_hashes) > 1: + logger.warning( + "more than one catalog hash found (%s) - are all locus catalogs the same? merging may not work with " + "different catalogs" + ) + passed_validation = False + + if passed_validation: + logger.info("VCF headers passed validation") + else: + logger.warning("VCF headers failed validation") + # TODO: return some info to help? - pass def merge_vcfs(paths: tuple[Path, ...]): + logger = get_main_logger() # TODO: inject + fhs: tuple[VariantFile, ...] = tuple(VariantFile(str(p)) for p in paths) try: - hdrs = tuple(f.header for f in fhs) + headers = tuple(f.header for f in fhs) # Check that headers indicate these files can be merged (they are STRkit headers and are compatible) - check_headers(hdrs) # TODO + check_headers(headers, logger) # TODO + + # -------------------------------------------------------------------------------------------------------------- + + # Try merging headers + merged_header = headers[0] + for hdr in headers[1:]: + merged_header = merged_header.merge(hdr) + + print(merged_header, end="") + + # -------------------------------------------------------------------------------------------------------------- + + iters = tuple(f.fetch() for f in fhs) + ptrs = [next(i, None) for i in iters] + + while any(p is not None for p in ptrs): + # find compatible records for merging based on ID (requires STRkit to output variant IDs for everything) + if len(set(p.id for p in ptrs)) == 1: + contig = ptrs[0].contig + + refs = tuple(p.ref for p in ptrs) + + if len(set(refs)) > 1: + print("TODO ref consolidation!") + ref = refs[0] + # TODO: if this happens, we need to also + else: + ref = refs[0] + + print(refs) + + # all same locus, can 
merge + rec = merged_header.new_record(contig, alleles=(ref,)) + pass + ptrs = [next(i, None) for i in iters] + else: + # we have a record which doesn't apply to all samples + print("TODO some samples!") + # TODO: selectively advance # TODO: for each STR, need to: # - ensure they're compatible (same ID with same # of loci perhaps?) From 3091ce5f29cb70cf8c3b403d7da897dbd7073b63 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 18 Feb 2026 17:30:14 -0500 Subject: [PATCH 04/16] docs: wip section for vcf merge --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index dbecdd8..304c155 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ The code for the software itself is also available and citeable in Zenodo: https * [Usage](#usage) * [Further documentation on the STRkit caller, including output format](#further-documentation-on-the-strkit-caller-including-output-format) * [`strkit visualize`: Call visualizer](#strkit-visualize-call-visualizer) + * [`strkit merge`: VCF merging tool](#strkit-merge-vcf-merging-tool) * [`strkit mi`: Mendelian inheritance analysis](#strkit-mi-mendelian-inheritance-analysis) * [Usage](#usage-1) * [Further documentation](#further-documentation) @@ -296,6 +297,12 @@ the start-up instructions. 
+### `strkit merge`: VCF merging tool + +TODO + + + ### `strkit mi`: Mendelian inheritance analysis Using trio data, candidate de novo STR mutations (or genotyping errors/dropout rates) can be discovered From 3a188f02b954f17d6c7216f191f3499f892f53c5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 19 Feb 2026 10:22:05 -0500 Subject: [PATCH 05/16] merge work --- strkit/merge/merge_vcfs.py | 91 +++++++++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 21 deletions(-) diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index cdab2e5..5ec9a29 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -1,14 +1,15 @@ from __future__ import annotations from pathlib import Path -from pysam import VariantFile, VariantHeader -from typing import TYPE_CHECKING +from pysam import VariantFile +from typing import TYPE_CHECKING, Callable -from .. import __version__ +from ..call.output import vcf from ..logger import get_main_logger if TYPE_CHECKING: from logging import Logger + from pysam import VariantHeader, VariantRecord __all__ = ["merge_vcfs"] @@ -68,6 +69,68 @@ def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): # TODO: return some info to help? 
+def merge_str_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: + contig = records[0].contig + + refs = tuple(p.ref for p in records) + + common_info_keys = ( + vcf.VCF_INFO_VT, + vcf.VCF_INFO_MOTIF, + vcf.VCF_INFO_REFMC, + vcf.VCF_INFO_BED_START, + vcf.VCF_INFO_BED_END, + ) + + infos = set(tuple(p.info[k] for k in common_info_keys) for p in records) + + if len(infos) > 1: + logger.warning("merge_str_records encountered more than one set of INFO field values") + # TODO: more reporting + + if len(set(refs)) > 1: + print("TODO ref consolidation!") + record_max_anchor = max(records, key=lambda p: p.info["ANCH"]) + max_anchor = record_max_anchor.info["ANCH"] + ref_max_anchor = record_max_anchor.ref + start = record_max_anchor.start + added_prefix_lengths = tuple(max_anchor - p.info["ANCH"] for p in records) + + # TODO: also consider if there's extra on the end or something? + ref = ref_max_anchor + # TODO: this misses SNVs that are in the VCF anchor. Maybe STRkit should genotype these. 
+ alts = sorted(set(ref_max_anchor[:added_prefix_lengths[i]] + a for i, p in enumerate(records) for a in p.alts)) + else: + ref = refs[0] + alts = sorted(set(a for p in records for a in (p.alts or ())), key=lambda a: (len(a), a)) + start = records[0].start + + # TODO: merge info/format fields + + # all same locus, can merge + alleles = (ref, *(alts or (".",))) + print(alleles) + rec = header.new_record(contig, start=start, alleles=alleles, id=records[0].id) + + common_info = next(iter(infos)) + for ki, k in enumerate(common_info_keys): + rec.info[k] = common_info[ki] + + return rec + + +def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: + rec = header.new_record() # TODO + print("TODO: SNV MERGE") + return rec + + +MERGE_FUNCTION: dict[str, Callable[[VariantHeader, list[VariantRecord], Logger], VariantRecord]] = { + vcf.VT_STR: merge_str_records, + vcf.VT_SNV: merge_snv_records, +} + + def merge_vcfs(paths: tuple[Path, ...]): logger = get_main_logger() # TODO: inject @@ -95,26 +158,12 @@ def merge_vcfs(paths: tuple[Path, ...]): while any(p is not None for p in ptrs): # find compatible records for merging based on ID (requires STRkit to output variant IDs for everything) - if len(set(p.id for p in ptrs)) == 1: - contig = ptrs[0].contig - - refs = tuple(p.ref for p in ptrs) - - if len(set(refs)) > 1: - print("TODO ref consolidation!") - ref = refs[0] - # TODO: if this happens, we need to also - else: - ref = refs[0] - - print(refs) - - # all same locus, can merge - rec = merged_header.new_record(contig, alleles=(ref,)) - pass + if all(p is not None for p in ptrs) and len(set(p.id for p in ptrs)) == 1: + print(MERGE_FUNCTION[ptrs[0].info["VT"]](merged_header, ptrs, logger), end="") ptrs = [next(i, None) for i in iters] else: - # we have a record which doesn't apply to all samples + # we have a record which doesn't apply to all samples - either some VCFs are finished, or the pointers + # are at different points 
in the file. print("TODO some samples!") # TODO: selectively advance From e6aad2e2892a3cadedcc1b44ba42a2d3b1a0e3f5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 19 Feb 2026 11:38:50 -0500 Subject: [PATCH 06/16] work on snv merging --- strkit/merge/merge_vcfs.py | 64 ++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index 5ec9a29..431d771 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -14,6 +14,18 @@ __all__ = ["merge_vcfs"] +COMMON_STR_INFO_KEYS = ( + vcf.VCF_INFO_VT, + vcf.VCF_INFO_MOTIF, + vcf.VCF_INFO_REFMC, + vcf.VCF_INFO_BED_START, + vcf.VCF_INFO_BED_END, +) +COMMON_SNV_INFO_KEYS = ( + vcf.VCF_INFO_VT, +) + + def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): header_contigs: set[tuple[str, ...]] = set() strkit_versions: set[str] = set() @@ -73,19 +85,13 @@ def merge_str_records(header: VariantHeader, records: list[VariantRecord], logge contig = records[0].contig refs = tuple(p.ref for p in records) + if len(set(refs)) > 1: + logger.debug("merge_str_records: encountered >1 STR reference sequence") - common_info_keys = ( - vcf.VCF_INFO_VT, - vcf.VCF_INFO_MOTIF, - vcf.VCF_INFO_REFMC, - vcf.VCF_INFO_BED_START, - vcf.VCF_INFO_BED_END, - ) - - infos = set(tuple(p.info[k] for k in common_info_keys) for p in records) + infos = set(tuple(p.info[k] for k in COMMON_STR_INFO_KEYS) for p in records) if len(infos) > 1: - logger.warning("merge_str_records encountered more than one set of INFO field values") + logger.warning("merge_str_records: encountered >1 set of INFO field values for STR") # TODO: more reporting if len(set(refs)) > 1: @@ -99,10 +105,11 @@ def merge_str_records(header: VariantHeader, records: list[VariantRecord], logge # TODO: also consider if there's extra on the end or something? ref = ref_max_anchor # TODO: this misses SNVs that are in the VCF anchor. Maybe STRkit should genotype these. 
+ # TODO: filter out ref allele alts = sorted(set(ref_max_anchor[:added_prefix_lengths[i]] + a for i, p in enumerate(records) for a in p.alts)) else: ref = refs[0] - alts = sorted(set(a for p in records for a in (p.alts or ())), key=lambda a: (len(a), a)) + alts = sorted(set(a for p in records for a in (p.alts or ()) if a != ref), key=lambda a: (len(a), a)) start = records[0].start # TODO: merge info/format fields @@ -112,16 +119,47 @@ def merge_str_records(header: VariantHeader, records: list[VariantRecord], logge print(alleles) rec = header.new_record(contig, start=start, alleles=alleles, id=records[0].id) + # -- add INFO (common across samples) to merged STR record --------------------------------------------------------- common_info = next(iter(infos)) - for ki, k in enumerate(common_info_keys): + for ki, k in enumerate(COMMON_STR_INFO_KEYS): rec.info[k] = common_info[ki] + # ------------------------------------------------------------------------------------------------------------------ + + # Done common stuff, now need to add/transform sample data + for sample in header.samples: + pass return rec def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: - rec = header.new_record() # TODO print("TODO: SNV MERGE") + print([str(r) for r in records]) + + # confirm we have a single base ref + refs = tuple(p.ref for p in records) + if len(ref_set := set(refs)) > 1: + logger.critical("merge_snv_records: encountered >1 SNP reference sequence (%s)", ref_set) + exit(1) + ref = refs[0] + + infos = set(tuple(p.info[k] for k in COMMON_SNV_INFO_KEYS) for p in records) + + if len(infos) > 1: + logger.warning("merge_str_records: encountered >1 set of INFO field values for STR") + + alts = sorted(set(a for p in records for a in (p.alts or ()) if a != ref)) + alleles = (ref, *alts) + rec = header.new_record(contig=records[0].contig, start=records[0].start, alleles=alleles) # TODO + + common_info = next(iter(infos)) + for ki, 
k in enumerate(COMMON_SNV_INFO_KEYS): + rec.info[k] = common_info[ki] + + # Done common stuff, now need to add/transform sample data + for sample in header.samples: + pass + return rec From e0bc4cbf536eff74fea4701ed87d649f19fb704b Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 08:24:01 -0500 Subject: [PATCH 07/16] fix(call): error within error handling in vcf writing --- strkit/call/output/vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 152a41c..adab8fa 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -251,7 +251,7 @@ def create_result_vcf_records( except ValueError: logger.error( "results[%d] (locus_id=%s): one of %s not in %s", - result_idx, locus_id, seqs_with_anchors, seq_alleles_raw) + result_idx, result["locus_id"], seqs_with_anchors, seq_alleles_raw) raise SkipWritingLocus() if am := result.get("assign_method"): From 399ad115bad340372576243e2eb13c01fbc785fb Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 08:33:24 -0500 Subject: [PATCH 08/16] refact: move some VCF writing functions to common utils file --- strkit/call/output/vcf.py | 21 ++++++--------------- strkit/vcf_utils.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 15 deletions(-) create mode 100644 strkit/vcf_utils.py diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index adab8fa..3ed4a6f 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -2,7 +2,6 @@ from collections import Counter from datetime import datetime -from functools import cache from os.path import commonprefix from pathlib import Path from pysam import FastaFile, VariantHeader @@ -10,6 +9,7 @@ from strkit import __version__ from strkit.utils import is_none, idx_0_getter +from strkit.vcf_utils import SkipWritingLocus, genotype_indices from ..utils import cn_getter if TYPE_CHECKING: @@ -135,15 +135,6 @@ def _vr_pos_key(vr: 
VariantRecord) -> int: return vr.pos -@cache -def _blank_entry(n_alleles: int) -> tuple[None, ...]: - return tuple([None] * n_alleles) - - -class SkipWritingLocus(Exception): - pass - - def create_result_vcf_records( params: CallParams, variant_file: VariantFile, @@ -243,10 +234,10 @@ def create_result_vcf_records( vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset try: - vr.samples[sample_id]["GT"] = ( - tuple(map(seq_alleles_raw.index, seqs_with_anchors)) - if call is not None and peak_seqs - else _blank_entry(n_alleles) + vr.samples[sample_id]["GT"] = genotype_indices( + alleles=seq_alleles_raw, + call=None if call is None or not peak_seqs else seqs_with_anchors, + n_alleles=n_alleles, ) except ValueError: logger.error( @@ -331,7 +322,7 @@ def create_result_vcf_records( snv_vr.info[VCF_INFO_VT] = VT_SNV - snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"])) + snv_vr.samples[sample_id]["GT"] = genotype_indices(snv_alleles, snv["call"], n_alleles) snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) snv_vr.samples[sample_id]["AD"] = snv["rcs"] diff --git a/strkit/vcf_utils.py b/strkit/vcf_utils.py new file mode 100644 index 0000000..90590a2 --- /dev/null +++ b/strkit/vcf_utils.py @@ -0,0 +1,30 @@ +__all__ = [ + "SkipWritingLocus", + "blank_vcf_genotype", + "genotype_indices", +] + + +class SkipWritingLocus(Exception): + pass + + +def blank_vcf_genotype(n_alleles: int) -> tuple[None, ...]: + """ + Generates a blank VCF genotype for a sample locus. + :param n_alleles: Number of alleles to use for the blank genotype. + :return: Tuple of [None] * + """ + return tuple([None] * n_alleles) + + +def genotype_indices(alleles: tuple[str, ...], call: tuple[str, ...] | None, n_alleles: int) -> tuple[int | None, ...]: + """ + Given possible variant alleles for a particular locus and a sample's call (the sample's alleles), plus the number of + possible alleles for the sample at the locus, return a VCF-compatible genotype index tuple. 
+ :param alleles: Possible variant alleles: (ref, alt1, alt2, alt3, ...) + :param call: Call for the sample: (alt3, alt1) or None if not called. + :param n_alleles: Number of alleles for the sample locus (used for computing blank genotypes if not called.) + :return: Genotype index tuple, e.g., (3, 1). + """ + return tuple(map(alleles.index, call)) if call is not None else blank_vcf_genotype(n_alleles) From 0e1a9a29c67faf9e0093551e541b6518661f193c Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 14:02:32 -0500 Subject: [PATCH 09/16] refact: move more VCF-writing utils/consts into vcf_utils --- strkit/call/output/vcf.py | 65 +++++++++++++-------------------------- strkit/vcf_utils.py | 57 ++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 43 deletions(-) diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 3ed4a6f..c101272 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -1,15 +1,13 @@ from __future__ import annotations from collections import Counter -from datetime import datetime from os.path import commonprefix from pathlib import Path from pysam import FastaFile, VariantHeader from typing import Iterable, TYPE_CHECKING -from strkit import __version__ +from strkit import __version__, vcf_utils as vu from strkit.utils import is_none, idx_0_getter -from strkit.vcf_utils import SkipWritingLocus, genotype_indices from ..utils import cn_getter if TYPE_CHECKING: @@ -37,13 +35,6 @@ # ("CIRB", ".", "Integer", "Confidence interval around RB"), # ) -VCF_INFO_VT = "VT" -VCF_INFO_MOTIF = "MOTIF" -VCF_INFO_REFMC = "REFMC" -VCF_INFO_BED_START = "BED_START" -VCF_INFO_BED_END = "BED_END" -VCF_INFO_ANCH = "ANCH" - VT_STR = "str" VT_SNV = "snv" @@ -59,8 +50,7 @@ def build_vcf_header( vh = VariantHeader() # automatically sets VCF version to 4.2 # Add file date - now = datetime.now() - vh.add_meta("fileDate", f"{now.year}{now.month:02d}{now.day:02d}") + vu.add_vcf_file_date(vh) # Add source 
vh.add_meta("source", "strkit") @@ -103,23 +93,12 @@ def build_vcf_header( vh.formats.add("PS", 1, "Integer", "Phase set") vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)") - # Set up VCF info fields - vh.info.add(VCF_INFO_VT, 1, "String", "Variant record type (str/snv)") - vh.info.add(VCF_INFO_MOTIF, 1, "String", "Motif string") - vh.info.add(VCF_INFO_REFMC, 1, "Integer", "Motif copy number in the reference genome") - vh.info.add( - VCF_INFO_BED_START, - 1, - "Integer", - "Original start position of the locus as defined in the catalog (0-based inclusive)", - ) - vh.info.add( - VCF_INFO_BED_END, - 1, - "Integer", - "Original end position of the locus as defined in the catalog (0-based exclusive, i.e., 1-based)", - ) - vh.info.add(VCF_INFO_ANCH, 1, "Integer", "Five-prime anchor size") + # Set up VCF info fields (mutates vh.info) + vu.VCF_INFO_VT.add_to_info(vh.info) + vu.VCF_INFO_MOTIF.add_to_info(vh.info) + vu.VCF_INFO_REFMC.add_to_info(vh.info) + vu.VCF_INFO_BED_START.add_to_info(vh.info) + vu.VCF_INFO_BED_END.add_to_info(vh.info) # Add INFO records for tandem repeat copies - these are new to VCF4.4! 
TODO # for iv in VCF_TR_INFO_RECORDS: @@ -151,7 +130,7 @@ def create_result_vcf_records( if "ref_start_anchor" not in result: logger.debug("No ref anchor for %s:%d; skipping VCF output for locus", contig, start) - raise SkipWritingLocus() + raise vu.SkipWritingLocus() ref_start_anchor = result["ref_start_anchor"].upper() ref_seq = result["ref_seq"].upper() @@ -167,11 +146,11 @@ def create_result_vcf_records( if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks logger.error("Encountered None in results[%d].peaks.seqs: %s", result_idx, peak_seqs) - raise SkipWritingLocus() + raise vu.SkipWritingLocus() if any(map(is_none, peak_start_anchor_seqs)): # Occurs when no consensus for one of the peaks logger.error("Encountered None in results[%d].peaks.start_anchor_seqs: %s", result_idx, peak_start_anchor_seqs) - raise SkipWritingLocus() + raise vu.SkipWritingLocus() peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs)) common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper]) @@ -226,15 +205,15 @@ def create_result_vcf_records( alleles=seq_alleles, ) - vr.info[VCF_INFO_VT] = VT_STR - vr.info[VCF_INFO_MOTIF] = result["motif"] - vr.info[VCF_INFO_REFMC] = result["ref_cn"] - vr.info[VCF_INFO_BED_START] = result["start"] - vr.info[VCF_INFO_BED_END] = result["end"] - vr.info[VCF_INFO_ANCH] = params.vcf_anchor_size - anchor_offset + vr.info[vu.VCF_INFO_VT.key] = VT_STR + vr.info[vu.VCF_INFO_MOTIF.key] = result["motif"] + vr.info[vu.VCF_INFO_REFMC.key] = result["ref_cn"] + vr.info[vu.VCF_INFO_BED_START.key] = result["start"] + vr.info[vu.VCF_INFO_BED_END.key] = result["end"] + vr.info[vu.VCF_INFO_ANCH.key] = params.vcf_anchor_size - anchor_offset try: - vr.samples[sample_id]["GT"] = genotype_indices( + vr.samples[sample_id]["GT"] = vu.genotype_indices( alleles=seq_alleles_raw, call=None if call is None or not peak_seqs else seqs_with_anchors, n_alleles=n_alleles, @@ -243,7 +222,7 @@ def 
create_result_vcf_records( logger.error( "results[%d] (locus_id=%s): one of %s not in %s", result_idx, result["locus_id"], seqs_with_anchors, seq_alleles_raw) - raise SkipWritingLocus() + raise vu.SkipWritingLocus() if am := result.get("assign_method"): vr.samples[sample_id]["PM"] = am @@ -320,9 +299,9 @@ def create_result_vcf_records( alleles=snv_alleles, ) - snv_vr.info[VCF_INFO_VT] = VT_SNV + snv_vr.info[vu.VCF_INFO_VT.key] = VT_SNV - snv_vr.samples[sample_id]["GT"] = genotype_indices(snv_alleles, snv["call"], n_alleles) + snv_vr.samples[sample_id]["GT"] = vu.genotype_indices(snv_alleles, snv["call"], n_alleles) snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) snv_vr.samples[sample_id]["AD"] = snv["rcs"] @@ -360,7 +339,7 @@ def output_contig_vcf_lines( logger, ) ) - except SkipWritingLocus: + except vu.SkipWritingLocus: pass # just skipping the locus, nothing to do here as we've already logged except Exception as e: # fallback if we didn't handle a case properly in create_result_vcf_records logger.exception("Error while writing VCF: unhandled exception at results[%d]", result_idx, exc_info=e) diff --git a/strkit/vcf_utils.py b/strkit/vcf_utils.py index 90590a2..5df8a3f 100644 --- a/strkit/vcf_utils.py +++ b/strkit/vcf_utils.py @@ -1,14 +1,71 @@ +from dataclasses import dataclass +from datetime import datetime +from pysam import VariantHeader, VariantHeaderMetadata + __all__ = [ + "VCF_INFO_VT", + "VCF_INFO_MOTIF", + "VCF_INFO_REFMC", + "VCF_INFO_BED_START", + "VCF_INFO_BED_END", + "VCF_INFO_ANCH", + "VT_STR", + "VT_SNV", "SkipWritingLocus", "blank_vcf_genotype", "genotype_indices", ] +@dataclass +class VcfInfo: + key: str + n: int | str # number of values in record row + type: str # VCF type + description: str + + def add_to_info(self, info: VariantHeaderMetadata): + """ + Add info field to variant file header metadata via mutation. + :param info: VariantHeaderMetadata instance to add the record to. This object is mutated. 
+ """ + info.add(self.key, self.n, self.type, self.description) + + +VCF_INFO_VT = VcfInfo("VT", 1, "String", "Variant record type (str/snv)") +VCF_INFO_MOTIF = VcfInfo("MOTIF", 1, "String", "Motif string") +VCF_INFO_REFMC = VcfInfo("REFMC", 1, "Integer", "Motif copy number in the reference genome") +VCF_INFO_BED_START = VcfInfo( + "BED_START", + 1, + "Integer", + "Original start position of the locus as defined in the catalog (0-based inclusive)", +) +VCF_INFO_BED_END = VcfInfo( + "BED_END", + 1, + "Integer", + "Original end position of the locus as defined in the catalog (0-based exclusive, i.e., 1-based)", +) +VCF_INFO_ANCH = VcfInfo("ANCH", 1, "Integer", "Five-prime anchor size") + +VT_STR = "str" +VT_SNV = "snv" + + class SkipWritingLocus(Exception): pass +def add_vcf_file_date(vh: VariantHeader): + """ + Adds a fileDate header metadata line to a variant header via mutation. + :param vh: The variant header to add to. + """ + now = datetime.now() + vh.add_meta("fileDate", f"{now.year}{now.month:02d}{now.day:02d}") + + def blank_vcf_genotype(n_alleles: int) -> tuple[None, ...]: """ Generates a blank VCF genotype for a sample locus. 
From d24508b0334a386a7e32826c6ed4f2590a1d8dd3 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 14:12:33 -0500 Subject: [PATCH 10/16] refact: accept multiple samples for the build_vcf_header helper --- strkit/call/call_sample.py | 2 +- strkit/call/output/vcf.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py index d157c12..cdfa183 100644 --- a/strkit/call/call_sample.py +++ b/strkit/call/call_sample.py @@ -312,7 +312,7 @@ def call_sample( vf: VariantFile | None = None if vcf_path is not None: vh = build_vcf_header( - sample_id_str, + (sample_id_str,), params.reference_file, partial_phasing=params.snv_vcf is not None or params.use_hp, num_loci=num_loci, diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index c101272..6d93d54 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -45,7 +45,7 @@ def iter_to_upper(x: Iterable[str]) -> Iterable[str]: def build_vcf_header( - sample_id: str, reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str + sample_ids: tuple[str, ...], reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str ) -> VariantHeader: vh = VariantHeader() # automatically sets VCF version to 4.2 @@ -104,8 +104,9 @@ def build_vcf_header( # for iv in VCF_TR_INFO_RECORDS: # vh.info.add(*iv) - # Add the sample - vh.add_sample(sample_id) + # Add the sample(s) + for sample_id in sample_ids: + vh.add_sample(sample_id) return vh From f46981c3fe7211ac5762e695e063fae6c605d8c2 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 14:21:34 -0500 Subject: [PATCH 11/16] refact: multi-file vcf_utils module --- strkit/call/output/vcf.py | 40 ++++++++++---------- strkit/vcf_utils/__init__.py | 3 ++ strkit/{vcf_utils.py => vcf_utils/header.py} | 28 -------------- strkit/vcf_utils/record.py | 23 +++++++++++ 4 files changed, 46 insertions(+), 48 deletions(-) create mode 100644 
strkit/vcf_utils/__init__.py rename strkit/{vcf_utils.py => vcf_utils/header.py} (58%) create mode 100644 strkit/vcf_utils/record.py diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 6d93d54..45b3f90 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -50,7 +50,7 @@ def build_vcf_header( vh = VariantHeader() # automatically sets VCF version to 4.2 # Add file date - vu.add_vcf_file_date(vh) + vu.header.add_vcf_file_date(vh) # Add source vh.add_meta("source", "strkit") @@ -94,11 +94,11 @@ def build_vcf_header( vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)") # Set up VCF info fields (mutates vh.info) - vu.VCF_INFO_VT.add_to_info(vh.info) - vu.VCF_INFO_MOTIF.add_to_info(vh.info) - vu.VCF_INFO_REFMC.add_to_info(vh.info) - vu.VCF_INFO_BED_START.add_to_info(vh.info) - vu.VCF_INFO_BED_END.add_to_info(vh.info) + vu.header.VCF_INFO_VT.add_to_info(vh.info) + vu.header.VCF_INFO_MOTIF.add_to_info(vh.info) + vu.header.VCF_INFO_REFMC.add_to_info(vh.info) + vu.header.VCF_INFO_BED_START.add_to_info(vh.info) + vu.header.VCF_INFO_BED_END.add_to_info(vh.info) # Add INFO records for tandem repeat copies - these are new to VCF4.4! 
TODO # for iv in VCF_TR_INFO_RECORDS: @@ -131,7 +131,7 @@ def create_result_vcf_records( if "ref_start_anchor" not in result: logger.debug("No ref anchor for %s:%d; skipping VCF output for locus", contig, start) - raise vu.SkipWritingLocus() + raise vu.record.SkipWritingLocus() ref_start_anchor = result["ref_start_anchor"].upper() ref_seq = result["ref_seq"].upper() @@ -147,11 +147,11 @@ def create_result_vcf_records( if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks logger.error("Encountered None in results[%d].peaks.seqs: %s", result_idx, peak_seqs) - raise vu.SkipWritingLocus() + raise vu.record.SkipWritingLocus() if any(map(is_none, peak_start_anchor_seqs)): # Occurs when no consensus for one of the peaks logger.error("Encountered None in results[%d].peaks.start_anchor_seqs: %s", result_idx, peak_start_anchor_seqs) - raise vu.SkipWritingLocus() + raise vu.record.SkipWritingLocus() peak_start_anchor_seqs_upper = tuple(iter_to_upper(peak_start_anchor_seqs)) common_anchor_prefix = commonprefix([ref_start_anchor, *peak_start_anchor_seqs_upper]) @@ -206,15 +206,15 @@ def create_result_vcf_records( alleles=seq_alleles, ) - vr.info[vu.VCF_INFO_VT.key] = VT_STR - vr.info[vu.VCF_INFO_MOTIF.key] = result["motif"] - vr.info[vu.VCF_INFO_REFMC.key] = result["ref_cn"] - vr.info[vu.VCF_INFO_BED_START.key] = result["start"] - vr.info[vu.VCF_INFO_BED_END.key] = result["end"] - vr.info[vu.VCF_INFO_ANCH.key] = params.vcf_anchor_size - anchor_offset + vr.info[vu.header.VCF_INFO_VT.key] = VT_STR + vr.info[vu.header.VCF_INFO_MOTIF.key] = result["motif"] + vr.info[vu.header.VCF_INFO_REFMC.key] = result["ref_cn"] + vr.info[vu.header.VCF_INFO_BED_START.key] = result["start"] + vr.info[vu.header.VCF_INFO_BED_END.key] = result["end"] + vr.info[vu.header.VCF_INFO_ANCH.key] = params.vcf_anchor_size - anchor_offset try: - vr.samples[sample_id]["GT"] = vu.genotype_indices( + vr.samples[sample_id]["GT"] = vu.record.genotype_indices( alleles=seq_alleles_raw, 
call=None if call is None or not peak_seqs else seqs_with_anchors, n_alleles=n_alleles, @@ -223,7 +223,7 @@ def create_result_vcf_records( logger.error( "results[%d] (locus_id=%s): one of %s not in %s", result_idx, result["locus_id"], seqs_with_anchors, seq_alleles_raw) - raise vu.SkipWritingLocus() + raise vu.record.SkipWritingLocus() if am := result.get("assign_method"): vr.samples[sample_id]["PM"] = am @@ -300,9 +300,9 @@ def create_result_vcf_records( alleles=snv_alleles, ) - snv_vr.info[vu.VCF_INFO_VT.key] = VT_SNV + snv_vr.info[vu.header.VCF_INFO_VT.key] = VT_SNV - snv_vr.samples[sample_id]["GT"] = vu.genotype_indices(snv_alleles, snv["call"], n_alleles) + snv_vr.samples[sample_id]["GT"] = vu.record.genotype_indices(snv_alleles, snv["call"], n_alleles) snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) snv_vr.samples[sample_id]["AD"] = snv["rcs"] @@ -340,7 +340,7 @@ def output_contig_vcf_lines( logger, ) ) - except vu.SkipWritingLocus: + except vu.record.SkipWritingLocus: pass # just skipping the locus, nothing to do here as we've already logged except Exception as e: # fallback if we didn't handle a case properly in create_result_vcf_records logger.exception("Error while writing VCF: unhandled exception at results[%d]", result_idx, exc_info=e) diff --git a/strkit/vcf_utils/__init__.py b/strkit/vcf_utils/__init__.py new file mode 100644 index 0000000..aaa6cc6 --- /dev/null +++ b/strkit/vcf_utils/__init__.py @@ -0,0 +1,3 @@ +__all__ = ["header", "record"] + +from . 
import header, record diff --git a/strkit/vcf_utils.py b/strkit/vcf_utils/header.py similarity index 58% rename from strkit/vcf_utils.py rename to strkit/vcf_utils/header.py index 5df8a3f..79d635c 100644 --- a/strkit/vcf_utils.py +++ b/strkit/vcf_utils/header.py @@ -11,9 +11,6 @@ "VCF_INFO_ANCH", "VT_STR", "VT_SNV", - "SkipWritingLocus", - "blank_vcf_genotype", - "genotype_indices", ] @@ -53,10 +50,6 @@ def add_to_info(self, info: VariantHeaderMetadata): VT_SNV = "snv" -class SkipWritingLocus(Exception): - pass - - def add_vcf_file_date(vh: VariantHeader): """ Adds a fileDate header metadata line to a variant header via mutation. @@ -64,24 +57,3 @@ def add_vcf_file_date(vh: VariantHeader): """ now = datetime.now() vh.add_meta("fileDate", f"{now.year}{now.month:02d}{now.day:02d}") - - -def blank_vcf_genotype(n_alleles: int) -> tuple[None, ...]: - """ - Generates a blank VCF genotype for a sample locus. - :param n_alleles: Number of alleles to use for the blank genotype. - :return: Tuple of [None] * - """ - return tuple([None] * n_alleles) - - -def genotype_indices(alleles: tuple[str, ...], call: tuple[str, ...] | None, n_alleles: int) -> tuple[int | None, ...]: - """ - Given possible variant alleles for a particular locus and a sample's call (the sample's alleles), plus the number of - possible alleles for the sample at the locus, return a VCF-compatible genotype index tuple. - :param alleles: Possible variant alleles: (ref, alt1, alt2, alt3, ...) - :param call: Call for the sample: (alt3, alt1) or None if not called. - :param n_alleles: Number of alleles for the sample locus (used for computing blank genotypes if not called.) - :return: Genotype index tuple, e.g., (3, 1). 
- """ - return tuple(map(alleles.index, call)) if call is not None else blank_vcf_genotype(n_alleles) diff --git a/strkit/vcf_utils/record.py b/strkit/vcf_utils/record.py new file mode 100644 index 0000000..39e3df5 --- /dev/null +++ b/strkit/vcf_utils/record.py @@ -0,0 +1,23 @@ +class SkipWritingLocus(Exception): + pass + + +def blank_vcf_genotype(n_alleles: int) -> tuple[None, ...]: + """ + Generates a blank VCF genotype for a sample locus. + :param n_alleles: Number of alleles to use for the blank genotype. + :return: Tuple of [None] * + """ + return tuple([None] * n_alleles) + + +def genotype_indices(alleles: tuple[str, ...], call: tuple[str, ...] | None, n_alleles: int) -> tuple[int | None, ...]: + """ + Given possible variant alleles for a particular locus and a sample's call (the sample's alleles), plus the number of + possible alleles for the sample at the locus, return a VCF-compatible genotype index tuple. + :param alleles: Possible variant alleles: (ref, alt1, alt2, alt3, ...) + :param call: Call for the sample: (alt3, alt1) or None if not called. + :param n_alleles: Number of alleles for the sample locus (used for computing blank genotypes if not called.) + :return: Genotype index tuple, e.g., (3, 1). 
+ """ + return tuple(map(alleles.index, call)) if call is not None else blank_vcf_genotype(n_alleles) From 3b8cc486de0bf1f48e6353d74bb7391e0229cc78 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 19:39:04 -0500 Subject: [PATCH 12/16] feat: add strkitCommand metadata to STRkit VCFs (call or merge) --- docs/output_formats.md | 1 + strkit/call/call_sample.py | 1 + strkit/call/output/vcf.py | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/output_formats.md b/docs/output_formats.md index c17230a..4f5a24b 100644 --- a/docs/output_formats.md +++ b/docs/output_formats.md @@ -168,6 +168,7 @@ VCF meta fields (non-exhaustive): * `phasing`: present and set to `partial` if using SNV/HP phasing * `reference`: absolute file URI to FASTA reference (`file://<...>`) * `strkitVersion`: STRkit version used to generate the file +* `strkitCommand`: STRkit subcommand used to generate the VCF (either `call` or `merge`) * `strkitCatalogNumLoci`: Number of total loci in the catalog used for genotyping * `strkitCatalogHash`: SHA256 hash of the loci loaded from the catalog diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py index cdfa183..5568333 100644 --- a/strkit/call/call_sample.py +++ b/strkit/call/call_sample.py @@ -312,6 +312,7 @@ def call_sample( vf: VariantFile | None = None if vcf_path is not None: vh = build_vcf_header( + "call", (sample_id_str,), params.reference_file, partial_phasing=params.snv_vcf is not None or params.use_hp, diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 45b3f90..c1ee213 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -45,7 +45,7 @@ def iter_to_upper(x: Iterable[str]) -> Iterable[str]: def build_vcf_header( - sample_ids: tuple[str, ...], reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str + command: str, sample_ids: tuple[str, ...], reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str ) -> VariantHeader: 
vh = VariantHeader() # automatically sets VCF version to 4.2 @@ -70,6 +70,8 @@ def build_vcf_header( # Add STRkit-specific fields: # - marking version vh.add_meta("strkitVersion", str(__version__)) + # - the subcommand being used to generate this VCF (call|merge) + vh.add_meta("strkitCommand", command) # - indicating number of loci provided (i.e., catalogue size) vh.add_meta("strkitCatalogNumLoci", str(num_loci)) # - indicating hash of STRkitLocus objects (for checking catalogue sameness) From 0f99102a4631f44379ac1b632ed2b1d38ceb54d3 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 20:22:26 -0500 Subject: [PATCH 13/16] chore: allow contigs to be passed directly to VCF header builder --- strkit/call/call_sample.py | 3 ++- strkit/call/output/vcf.py | 29 ++++++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py index 5568333..7d6e16b 100644 --- a/strkit/call/call_sample.py +++ b/strkit/call/call_sample.py @@ -264,6 +264,7 @@ def call_sample( from heapq import merge as heapq_merge from numpy.random import default_rng as np_default_rng + from pathlib import Path from pysam import VariantFile from strkit.logger import get_main_logger @@ -314,7 +315,7 @@ def call_sample( vh = build_vcf_header( "call", (sample_id_str,), - params.reference_file, + Path(params.reference_file), partial_phasing=params.snv_vcf is not None or params.use_hp, num_loci=num_loci, loci_hash=loci_hash, diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index c1ee213..7d271ea 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -45,7 +45,13 @@ def iter_to_upper(x: Iterable[str]) -> Iterable[str]: def build_vcf_header( - command: str, sample_ids: tuple[str, ...], reference_file: str, partial_phasing: bool, num_loci: int, loci_hash: str + command: str, + sample_ids: tuple[str, ...], + reference_file: Path | str, # if Path, turn into file URI. 
Otherwise, treat "as-is". + partial_phasing: bool, + num_loci: int, + loci_hash: str, + contigs: tuple[tuple[str, int], ...] | None = None, ) -> VariantHeader: vh = VariantHeader() # automatically sets VCF version to 4.2 @@ -59,13 +65,22 @@ def build_vcf_header( if partial_phasing: vh.add_meta("phasing", "partial") - # Add an absolute path to the reference genome - vh.add_meta("reference", f"file://{str(Path(reference_file).resolve().absolute())}") + # Add an absolute path to the reference genome. If we're passed a string (from merge), assume this is already a file + # URI. + vh.add_meta( + "reference", + f"file://{str(reference_file.resolve().absolute())}" if isinstance(reference_file, Path) else reference_file + ) - # Add all contigs from the reference genome file + lengths - with FastaFile(reference_file) as rf: - for contig in rf.references: - vh.contigs.add(contig, length=rf.get_reference_length(contig)) + if contigs is not None: + # If we're passed contig records directly (from merge), add them rather than pulling them from the reference. 
+ for contig, contig_length in contigs: + vh.contigs.add(contig, length=contig_length) + else: + # Add all contigs from the reference genome file + lengths + with FastaFile(str(reference_file)) as rf: + for contig in rf.references: + vh.contigs.add(contig, length=rf.get_reference_length(contig)) # Add STRkit-specific fields: # - marking version From e082f98aad375111baa9a79fc8df1c103427da6d Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 20:22:32 -0500 Subject: [PATCH 14/16] missing info add --- strkit/call/output/vcf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index 7d271ea..b9eb2f0 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -116,6 +116,7 @@ def build_vcf_header( vu.header.VCF_INFO_REFMC.add_to_info(vh.info) vu.header.VCF_INFO_BED_START.add_to_info(vh.info) vu.header.VCF_INFO_BED_END.add_to_info(vh.info) + vu.header.VCF_INFO_ANCH.add_to_info(vh.info) # Add INFO records for tandem repeat copies - these are new to VCF4.4! 
TODO # for iv in VCF_TR_INFO_RECORDS: From d6ee8c8016dde01eb19d5b40197c46c659c6ccb1 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 20:22:45 -0500 Subject: [PATCH 15/16] merge work (doesn't run) --- strkit/entry.py | 3 +- strkit/merge/merge_vcfs.py | 135 ++++++++++++++++++++++++------------- 2 files changed, 92 insertions(+), 46 deletions(-) diff --git a/strkit/entry.py b/strkit/entry.py index cbfa338..475211f 100644 --- a/strkit/entry.py +++ b/strkit/entry.py @@ -485,7 +485,8 @@ def _exec_call(p_args) -> None: def _exec_merge(p_args) -> None: from strkit.merge.merge_vcfs import merge_vcfs - merge_vcfs(tuple(Path(p) for p in p_args.vcf_files)) + logger = _main_logger(p_args) + merge_vcfs(tuple(Path(p) for p in p_args.vcf_files), logger) def _exec_mi(p_args) -> None: diff --git a/strkit/merge/merge_vcfs.py b/strkit/merge/merge_vcfs.py index 431d771..b1398ca 100644 --- a/strkit/merge/merge_vcfs.py +++ b/strkit/merge/merge_vcfs.py @@ -1,41 +1,62 @@ from __future__ import annotations -from pathlib import Path from pysam import VariantFile from typing import TYPE_CHECKING, Callable +from .. 
import vcf_utils as vu from ..call.output import vcf -from ..logger import get_main_logger if TYPE_CHECKING: from logging import Logger + from pathlib import Path from pysam import VariantHeader, VariantRecord __all__ = ["merge_vcfs"] -COMMON_STR_INFO_KEYS = ( - vcf.VCF_INFO_VT, - vcf.VCF_INFO_MOTIF, - vcf.VCF_INFO_REFMC, - vcf.VCF_INFO_BED_START, - vcf.VCF_INFO_BED_END, +COMMON_STR_INFO_ITEMS = ( + vu.header.VCF_INFO_VT, + vu.header.VCF_INFO_MOTIF, + vu.header.VCF_INFO_REFMC, + vu.header.VCF_INFO_BED_START, + vu.header.VCF_INFO_BED_END, ) -COMMON_SNV_INFO_KEYS = ( - vcf.VCF_INFO_VT, +COMMON_SNV_INFO_ITEMS = ( + vu.header.VCF_INFO_VT, ) -def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): +class VariantFileForMerge: + def __init__(self, path: Path): + self.path = path + self.f = VariantFile(str(path)) + self.h = self.f.header + self.samples = tuple(s for s in self.h.samples) + + def close(self): + self.f.close() + + +def merge_headers( + files: tuple[VariantFileForMerge, ...], logger: Logger +) -> tuple[VariantHeader, dict[str, VariantFileForMerge]]: + partial_phasing: bool = False header_contigs: set[tuple[str, ...]] = set() strkit_versions: set[str] = set() strkit_catalog_hashes: set[str] = set() + sample_id_file_map: dict[str, VariantFileForMerge] = {} - for idx, hdr in enumerate(headers, 1): + # -- Loop through headers to collect information for header validation and merging --------------------------------- + + for idx, f in enumerate(files, 1): + hdr = f.h header_contigs.add(tuple(sorted(hdr.contigs))) strkit_version_found = False strkit_catalog_hash_found = False for line in hdr.records: + if line.key == "phasing" and line.value == "partial": + # make phasing partial for the whole file if at least one file has it + partial_phasing = True if line.key == "strkitVersion": strkit_versions.add(line.value) strkit_version_found = True @@ -48,9 +69,13 @@ def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): if not 
strkit_catalog_hash_found: logger.critical("did not find strkitCatalogLociHash in VCF #%d", idx) exit(1) + if intersect := set(hdr.samples).intersection(set(sample_id_file_map.keys())): + logger.critical("found overlap between sample IDs: %s", intersect) + exit(1) + for s in hdr.samples: + sample_id_file_map[s] = f # add samples from file to map - # check references - # check info field overlaps + # -- Perform pre-merge validation on the header data --------------------------------------------------------------- passed_validation = True @@ -72,23 +97,36 @@ def check_headers(headers: tuple[VariantHeader, ...], logger: Logger): "different catalogs" ) passed_validation = False + # check references + # TODO + # check info field overlaps + # TODO if passed_validation: logger.info("VCF headers passed validation") else: logger.warning("VCF headers failed validation") - # TODO: return some info to help? + # -- Create header for merged VCF ---------------------------------------------------------------------------------- + # TODO: maybe just create a header from scratch... 
+ merged_header = files[0].h.copy() + for f in files[1:]: + for s in f.samples: + merged_header.add_sample(s) + + print(merged_header, end="") + + return merged_header, sample_id_file_map -def merge_str_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: +def merge_str_records(merged_header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: contig = records[0].contig refs = tuple(p.ref for p in records) if len(set(refs)) > 1: logger.debug("merge_str_records: encountered >1 STR reference sequence") - infos = set(tuple(p.info[k] for k in COMMON_STR_INFO_KEYS) for p in records) + infos = set(tuple(p.info[k.key] for k in COMMON_STR_INFO_ITEMS) for p in records) if len(infos) > 1: logger.warning("merge_str_records: encountered >1 set of INFO field values for STR") @@ -116,23 +154,23 @@ def merge_str_records(header: VariantHeader, records: list[VariantRecord], logge # all same locus, can merge alleles = (ref, *(alts or (".",))) - print(alleles) - rec = header.new_record(contig, start=start, alleles=alleles, id=records[0].id) + print("new str alleles", alleles) + rec = merged_header.new_record(contig, start=start, alleles=alleles, id=records[0].id) # -- add INFO (common across samples) to merged STR record --------------------------------------------------------- common_info = next(iter(infos)) - for ki, k in enumerate(COMMON_STR_INFO_KEYS): + for ki, k in enumerate(COMMON_STR_INFO_ITEMS): rec.info[k] = common_info[ki] # ------------------------------------------------------------------------------------------------------------------ # Done common stuff, now need to add/transform sample data - for sample in header.samples: + for sample in merged_header.samples: pass return rec -def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: +def merge_snv_records(merged_header: VariantHeader, records: list[VariantRecord], logger: Logger) -> VariantRecord: 
print("TODO: SNV MERGE") print([str(r) for r in records]) @@ -143,21 +181,39 @@ def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logge exit(1) ref = refs[0] - infos = set(tuple(p.info[k] for k in COMMON_SNV_INFO_KEYS) for p in records) + infos = set(tuple(p.info[k] for k in COMMON_SNV_INFO_ITEMS) for p in records) if len(infos) > 1: logger.warning("merge_str_records: encountered >1 set of INFO field values for STR") alts = sorted(set(a for p in records for a in (p.alts or ()) if a != ref)) alleles = (ref, *alts) - rec = header.new_record(contig=records[0].contig, start=records[0].start, alleles=alleles) # TODO + print(f"new snv alleles ({len(alleles)})", alleles) + rec = merged_header.new_record(contig=records[0].contig, start=records[0].start, alleles=alleles) # TODO common_info = next(iter(infos)) - for ki, k in enumerate(COMMON_SNV_INFO_KEYS): - rec.info[k] = common_info[ki] + for ki, k in enumerate(COMMON_SNV_INFO_ITEMS): + rec.info[k.key] = common_info[ki] + # ------------------------------------------------------------------------------------------------------------------ # Done common stuff, now need to add/transform sample data - for sample in header.samples: + + calls_by_sample_id = {} + for r in records: + for s_id in merged_header.samples: + calls_by_sample_id[s_id] = TODO + + for sample_id in merged_header.samples: + # TODO: preserve phasing + rec.samples[sample_id]["GT"] = vu.record.genotype_indices( + alleles, calls_by_sample_id[sample_id], TODO_N_ALLELES + ) + # TODO + # rec.samples[sample_id]["GT"] = ( + # tuple(map(alleles.index, seqs_with_anchors)) + # if rec[sample_id] is not None + # else _blank_entry(n_alleles) + # ) pass return rec @@ -169,29 +225,18 @@ def merge_snv_records(header: VariantHeader, records: list[VariantRecord], logge } -def merge_vcfs(paths: tuple[Path, ...]): - logger = get_main_logger() # TODO: inject - - fhs: tuple[VariantFile, ...] 
= tuple(VariantFile(str(p)) for p in paths) +def merge_vcfs(paths: tuple[Path, ...], logger: Logger): + fhs: tuple[VariantFileForMerge, ...] = tuple(VariantFileForMerge(p) for p in paths) try: - headers = tuple(f.header for f in fhs) - - # Check that headers indicate these files can be merged (they are STRkit headers and are compatible) - check_headers(headers, logger) # TODO - - # -------------------------------------------------------------------------------------------------------------- - - # Try merging headers - merged_header = headers[0] - for hdr in headers[1:]: - merged_header = merged_header.merge(hdr) - + # Check that headers indicate these files can be merged (they are STRkit headers and are compatible), + # then merge them. Also computes a map from sample ID --> file. + merged_header, sample_id_file_map = merge_headers(fhs, logger) # TODO print(merged_header, end="") # -------------------------------------------------------------------------------------------------------------- - iters = tuple(f.fetch() for f in fhs) + iters = tuple(f.f.fetch() for f in fhs) ptrs = [next(i, None) for i in iters] while any(p is not None for p in ptrs): @@ -202,7 +247,7 @@ def merge_vcfs(paths: tuple[Path, ...]): else: # we have a record which doesn't apply to all samples - either some VCFs are finished, or the pointers # are at different points in the file. 
- print("TODO some samples!") + print("TODO some vcfs!") # TODO: selectively advance # TODO: for each STR, need to: From 702d3fe93a712f0c82f662fe277022253e6067de Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 26 Feb 2026 20:47:39 -0500 Subject: [PATCH 16/16] refact: move vcf header build fn to vcf_utils --- strkit/call/call_sample.py | 2 +- strkit/call/output/__init__.py | 3 +- strkit/call/output/vcf.py | 111 +-------------------------------- strkit/vcf_utils/header.py | 107 +++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 111 deletions(-) diff --git a/strkit/call/call_sample.py b/strkit/call/call_sample.py index 7d6e16b..392f78a 100644 --- a/strkit/call/call_sample.py +++ b/strkit/call/call_sample.py @@ -268,6 +268,7 @@ def call_sample( from pysam import VariantFile from strkit.logger import get_main_logger + from strkit.vcf_utils.header import build_vcf_header from .loci import load_loci from .output import ( @@ -275,7 +276,6 @@ def call_sample( output_json_report_results, output_json_report_footer, output_tsv as output_tsv_fn, - build_vcf_header, output_contig_vcf_lines, ) from .utils import get_new_seed diff --git a/strkit/call/output/__init__.py b/strkit/call/output/__init__.py index 23a1322..5707b6a 100644 --- a/strkit/call/output/__init__.py +++ b/strkit/call/output/__init__.py @@ -1,12 +1,11 @@ from .json_report import output_json_report_header, output_json_report_results, output_json_report_footer from .tsv import output_tsv -from .vcf import build_vcf_header, output_contig_vcf_lines +from .vcf import output_contig_vcf_lines __all__ = [ "output_json_report_header", "output_json_report_results", "output_json_report_footer", "output_tsv", - "build_vcf_header", "output_contig_vcf_lines", ] diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py index b9eb2f0..94731da 100644 --- a/strkit/call/output/vcf.py +++ b/strkit/call/output/vcf.py @@ -2,11 +2,9 @@ from collections import Counter from os.path import commonprefix 
-from pathlib import Path -from pysam import FastaFile, VariantHeader from typing import Iterable, TYPE_CHECKING -from strkit import __version__, vcf_utils as vu +from strkit import vcf_utils as vu from strkit.utils import is_none, idx_0_getter from ..utils import cn_getter @@ -17,118 +15,15 @@ from ..types import LocusResult __all__ = [ - "build_vcf_header", "output_contig_vcf_lines", ] -# VCF_ALLELE_CNV_TR = "" - -# VCF_TR_INFO_RECORDS: tuple[tuple[str, str, str, str], ...] = ( -# ("SVLEN", "A", "Integer", "Length of the structural variant"), -# ("CN", "A", "Float", "Copy number of allele"), -# ("RN", "A", "Integer", "Total number of repeat sequences in this allele"), -# ("RUS", ".", "String", "Repeat unit sequence of the corresponding repeat sequence"), -# ("RUL", ".", "Integer", "Repeat unit length of the corresponding repeat sequence"), -# ("RB", ".", "Integer", "Total number of bases in the corresponding repeat sequence"), -# ("CIRUC", ".", "Float", "Confidence interval around RUC"), -# ("CIRB", ".", "Integer", "Confidence interval around RB"), -# ) - -VT_STR = "str" -VT_SNV = "snv" - - def iter_to_upper(x: Iterable[str]) -> Iterable[str]: # noinspection PyTypeChecker return map(str.upper, x) -def build_vcf_header( - command: str, - sample_ids: tuple[str, ...], - reference_file: Path | str, # if Path, turn into file URI. Otherwise, treat "as-is". - partial_phasing: bool, - num_loci: int, - loci_hash: str, - contigs: tuple[tuple[str, int], ...] | None = None, -) -> VariantHeader: - vh = VariantHeader() # automatically sets VCF version to 4.2 - - # Add file date - vu.header.add_vcf_file_date(vh) - - # Add source - vh.add_meta("source", "strkit") - - # Mark that we have partial phasing if we're using HP/SNVs - if partial_phasing: - vh.add_meta("phasing", "partial") - - # Add an absolute path to the reference genome. If we're passed a string (from merge), assume this is already a file - # URI. 
- vh.add_meta( - "reference", - f"file://{str(reference_file.resolve().absolute())}" if isinstance(reference_file, Path) else reference_file - ) - - if contigs is not None: - # If we're passed contig records directly (from merge), add them rather than pulling them from the reference. - for contig, contig_length in contigs: - vh.contigs.add(contig, length=contig_length) - else: - # Add all contigs from the reference genome file + lengths - with FastaFile(str(reference_file)) as rf: - for contig in rf.references: - vh.contigs.add(contig, length=rf.get_reference_length(contig)) - - # Add STRkit-specific fields: - # - marking version - vh.add_meta("strkitVersion", str(__version__)) - # - the subcommand being used to generate this VCF (call|merge) - vh.add_meta("strkitCommand", command) - # - indicating number of loci provided (i.e., catalogue size) - vh.add_meta("strkitCatalogNumLoci", str(num_loci)) - # - indicating hash of STRkitLocus objects (for checking catalogue sameness) - vh.add_meta("strkitCatalogLociHash", loci_hash) - - # Add CNV:TR alt type (symbolic allele: tandem repeat) - # vh.add_meta("ALT", "") - - # Set up basic VCF formats - vh.formats.add("AD", ".", "Integer", "Read depth for each allele") - vh.formats.add("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence") - vh.formats.add("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)") - vh.formats.add("DP", 1, "Integer", "Read depth") - vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)") - vh.formats.add("GT", 1, "String", "Genotype") - vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele") - vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele") - vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele") - vh.formats.add("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads.") - 
vh.formats.add("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call") - vh.formats.add("PS", 1, "Integer", "Phase set") - vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)") - - # Set up VCF info fields (mutates vh.info) - vu.header.VCF_INFO_VT.add_to_info(vh.info) - vu.header.VCF_INFO_MOTIF.add_to_info(vh.info) - vu.header.VCF_INFO_REFMC.add_to_info(vh.info) - vu.header.VCF_INFO_BED_START.add_to_info(vh.info) - vu.header.VCF_INFO_BED_END.add_to_info(vh.info) - vu.header.VCF_INFO_ANCH.add_to_info(vh.info) - - # Add INFO records for tandem repeat copies - these are new to VCF4.4! TODO - # for iv in VCF_TR_INFO_RECORDS: - # vh.info.add(*iv) - - # Add the sample(s) - for sample_id in sample_ids: - vh.add_sample(sample_id) - - return vh - - def _vr_pos_key(vr: VariantRecord) -> int: return vr.pos @@ -224,7 +119,7 @@ def create_result_vcf_records( alleles=seq_alleles, ) - vr.info[vu.header.VCF_INFO_VT.key] = VT_STR + vr.info[vu.header.VCF_INFO_VT.key] = vu.header.VT_STR vr.info[vu.header.VCF_INFO_MOTIF.key] = result["motif"] vr.info[vu.header.VCF_INFO_REFMC.key] = result["ref_cn"] vr.info[vu.header.VCF_INFO_BED_START.key] = result["start"] @@ -318,7 +213,7 @@ def create_result_vcf_records( alleles=snv_alleles, ) - snv_vr.info[vu.header.VCF_INFO_VT.key] = VT_SNV + snv_vr.info[vu.header.VCF_INFO_VT.key] = vu.header.VT_SNV snv_vr.samples[sample_id]["GT"] = vu.record.genotype_indices(snv_alleles, snv["call"], n_alleles) snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"]) diff --git a/strkit/vcf_utils/header.py b/strkit/vcf_utils/header.py index 79d635c..62ea574 100644 --- a/strkit/vcf_utils/header.py +++ b/strkit/vcf_utils/header.py @@ -1,6 +1,13 @@ +from __future__ import annotations + from dataclasses import dataclass from datetime import datetime from pysam import VariantHeader, VariantHeaderMetadata +from typing import TYPE_CHECKING +from .. 
def build_vcf_header(
    command: str,
    sample_ids: tuple[str, ...],
    reference_file: Path | str,  # if Path, turn into file URI. Otherwise, treat "as-is".
    partial_phasing: bool,
    num_loci: int,
    loci_hash: str,
    contigs: tuple[tuple[str, int], ...] | None = None,
) -> VariantHeader:
    """
    Build a pysam VariantHeader for STRkit VCF output (used by both the ``call`` and ``merge`` subcommands).

    :param command: The STRkit subcommand generating this VCF (call|merge); written as ``strkitCommand``.
    :param sample_ids: Sample IDs to register in the header, in output column order.
    :param reference_file: Reference genome. If a ``Path``, it is converted to an absolute ``file://`` URI;
        if a ``str`` (from merge), it is assumed to already be a file URI and is written as-is.
    :param partial_phasing: If True, mark the VCF as partially phased (HP/SNV-based phasing).
    :param num_loci: Catalog size (number of loci); written as ``strkitCatalogNumLoci``.
    :param loci_hash: Hash of the STRkitLocus objects, for checking catalog sameness
        (``strkitCatalogLociHash``).
    :param contigs: Optional (name, length) contig records (from merge). If None, contigs are read from the
        reference FASTA instead — in that case, ``reference_file`` must be a readable FASTA path, NOT a URI
        string, since it is opened with ``FastaFile``.
    :return: The constructed header, ready to be passed to a ``VariantFile`` for writing.
    """
    vh = VariantHeader()  # automatically sets VCF version to 4.2

    # Add file date
    add_vcf_file_date(vh)

    # Add source
    vh.add_meta("source", "strkit")

    # Mark that we have partial phasing if we're using HP/SNVs
    if partial_phasing:
        vh.add_meta("phasing", "partial")

    # Add an absolute path to the reference genome. If we're passed a string (from merge), assume this is already a
    # file URI. Path.resolve() already yields an absolute path, so no further normalization is needed, and the
    # f-string stringifies the Path implicitly.
    vh.add_meta(
        "reference",
        f"file://{reference_file.resolve()}" if isinstance(reference_file, Path) else reference_file
    )

    if contigs is not None:
        # If we're passed contig records directly (from merge), add them rather than pulling them from the reference.
        for contig, contig_length in contigs:
            vh.contigs.add(contig, length=contig_length)
    else:
        # Add all contigs from the reference genome file + lengths
        from pysam import FastaFile
        with FastaFile(str(reference_file)) as rf:
            for contig in rf.references:
                vh.contigs.add(contig, length=rf.get_reference_length(contig))

    # Add STRkit-specific fields:
    # - marking version
    vh.add_meta("strkitVersion", str(__version__))
    # - the subcommand being used to generate this VCF (call|merge)
    vh.add_meta("strkitCommand", command)
    # - indicating number of loci provided (i.e., catalogue size)
    vh.add_meta("strkitCatalogNumLoci", str(num_loci))
    # - indicating hash of STRkitLocus objects (for checking catalogue sameness)
    vh.add_meta("strkitCatalogLociHash", loci_hash)

    # Add CNV:TR alt type (symbolic allele: tandem repeat)
    # vh.add_meta("ALT", "<ID=CNV:TR,Description=\"Tandem repeat determined based on DNA abundance ratio\">")

    # Set up basic VCF formats
    vh.formats.add("AD", ".", "Integer", "Read depth for each allele")
    vh.formats.add("ANCL", ".", "Integer", "Anchor length for the ref and each alt, five-prime of TR sequence")
    vh.formats.add("CONS", ".", "String", "Consensus methods used for each alt (single/poa/best_rep)")
    vh.formats.add("DP", 1, "Integer", "Read depth")
    vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)")
    vh.formats.add("GT", 1, "String", "Genotype")
    vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele")
    vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele")
    vh.formats.add("MCRL", ".", "String", "Read-level motif copy numbers for each allele")
    vh.formats.add("MMAS", 1, "Float", "Mean model (candidate TR sequence) alignment score across reads.")
    vh.formats.add("NSNV", 1, "Integer", "Number of supporting SNVs for the STR peak-call")
    vh.formats.add("PS", 1, "Integer", "Phase set")
    vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)")

    # Set up VCF info fields (mutates vh.info)
    VCF_INFO_VT.add_to_info(vh.info)
    VCF_INFO_MOTIF.add_to_info(vh.info)
    VCF_INFO_REFMC.add_to_info(vh.info)
    VCF_INFO_BED_START.add_to_info(vh.info)
    VCF_INFO_BED_END.add_to_info(vh.info)
    VCF_INFO_ANCH.add_to_info(vh.info)

    # Add INFO records for tandem repeat copies - these are new to VCF4.4! TODO
    # for iv in VCF_TR_INFO_RECORDS:
    #     vh.info.add(*iv)

    # Add the sample(s)
    for sample_id in sample_ids:
        vh.add_sample(sample_id)

    return vh