From 2ed1b8821d52ac99c8d46751a969dc6fa88afaea Mon Sep 17 00:00:00 2001
From: David Lougheed <david.lougheed@gmail.com>
Date: Sat, 4 Apr 2026 12:12:41 -0400
Subject: [PATCH] feat(convert): add TRF .dat -> .bed converter

---
 README.md                         | 43 ++++++++++++++++++---
 strkit/convert/_bed_4.py          |  3 +-
 strkit/convert/constants.py       | 15 +++++---
 strkit/convert/converter.py       | 62 +++++++++++++++++++++----------
 strkit/convert/expansionhunter.py |  3 +-
 strkit/convert/gangstr.py         |  3 +-
 strkit/convert/hipstr.py          |  3 +-
 strkit/convert/trf.py             | 44 ++++++++++++++++++++++
 strkit/convert/trgt.py            | 12 +++---
 9 files changed, 146 insertions(+), 42 deletions(-)
 create mode 100644 strkit/convert/trf.py

diff --git a/README.md b/README.md
index d0f32b6..6f2a884 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,11 @@ STRkit's main advantages over other callers include:
 * A license which allows use on any long-read sequencing data, versus TRGT whose 
   [license](https://github.com/PacificBiosciences/trgt/blob/main/LICENSE.md) restricts it to only PacBio data.
 
+Some other useful features include:
+* An interactive [call visualizer](#strkit-visualize-call-visualizer)
+* A locus catalog [converter](#strkit-convert-str-catalog-conversion), which includes a TRF `.dat` to `BED` converter
+* A pre-built Docker image ([`ghcr.io/davidlougheed/strkit`](https://ghcr.io/davidlougheed/strkit))
+
 If you use STRkit in published work, please cite our paper in *Genome Research*:
 
 > [Read-level genotyping of short tandem repeats using long reads and single-nucleotide variation with STRkit.](https://doi.org/10.1101/gr.280766.125)
@@ -420,26 +425,52 @@ Any extraneous columns are removed, (internally) leaving a four-column STR locus
 Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a four-column STR
 BED as locus catalog input. However, other formats representing a catalog of STRs exist:
 
-* [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a TSV/BED with a lot 
-  of information. This can be used as-is with STRkit, but it's safer for other tools to convert to
-  a four-column BED format.
+* [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a `.dat` file with a lot 
+  of information. Once converted to a BED file (see below), this can be used as-is with STRkit, but 
+  it's safer for other tools to convert to a four-column BED format.
+* STRkit supports using [IUPAC codes] to specify motifs. Manual post-processing of TRF output would 
+  be needed to do this.
 * [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md),
   which can specify more advanced STR structures.
+* Some tools support multiple motifs (see, e.g., 
+  [STRchive catalogs of pathogenic loci](https://strchive.org/loci/#downloads)).
 
 #### Usage
 
-The `strkit convert` sub-command requires an input format (`trf` or `trgt`), an output format 
-(many, see `strkit convert --help`), and an input file. Output is written to `stdout`.
+The `strkit convert` sub-command requires an input format (`bed4`, `trf` or `trgt`), an output 
+format (many, see `strkit convert --help`), and an input file. Output is written to `stdout`.
 
 *Note:* Not all input/output format pairs have available converter functions; an error will be 
 printed to `stderr` if one does not exist.
 
-For example, to convert from a TRF BED to a TRGT repeat definition BED file:
+##### TRF DAT file to TRF BED file (*UCSC-style*)
+
+To convert from a TRF `.dat` file (as output by the TRF program) to a TRF BED file in the style of
+the one obtainable from UCSC (e.g., for 
+[hg38](https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/latest/)):
+
+```bash
+strkit convert --in-format trf --out-format trf in_file.trf.dat > out_file.trf.bed
+```
+
+##### TRF DAT file to 4-column BED file
+
+To convert from a TRF `.dat` file (as output by the TRF program) to a normal BED file:
+
+```bash
+strkit convert --in-format trf --out-format bed4 in_file.trf.dat > out_file.bed
+```
+
+##### TRF BED to TRGT repeat definition file
+
+To convert from a TRF BED to a TRGT repeat definition BED file:
 
 ```bash
 strkit convert --in-format trf --out-format trgt in_file.trf.bed > out_file.bed
 ```
 
+##### TRGT repeat definition file to STRkit BED
+
 To convert from a TRGT repeat definition file to a STRkit BED, including locus IDs if set:
 
 ```bash
diff --git a/strkit/convert/_bed_4.py b/strkit/convert/_bed_4.py
index 7479c55..4e1b998 100644
--- a/strkit/convert/_bed_4.py
+++ b/strkit/convert/_bed_4.py
@@ -1,11 +1,12 @@
 import sys
 from logging import Logger
+from typing import Iterable
 
 __all__ = [
     "trf_to_bed_4",
 ]
 
 
-def trf_to_bed_4(trf_data: list, _logger: Logger):
+def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger):
     for item in trf_data:
         sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n")
diff --git a/strkit/convert/constants.py b/strkit/convert/constants.py
index 730db39..5f71eb3 100644
--- a/strkit/convert/constants.py
+++ b/strkit/convert/constants.py
@@ -1,13 +1,16 @@
 __all__ = [
-    "IN_FORMAT_TRF",
-    "IN_FORMAT_TRGT",
+    "FORMAT_BED4",
+    "FORMAT_TRF",
+    "FORMAT_TRGT",
     "CONVERTER_IN_FORMATS",
 ]
 
-IN_FORMAT_TRF = "trf"
-IN_FORMAT_TRGT = "trgt"
+FORMAT_BED4 = "bed4"
+FORMAT_TRF = "trf"
+FORMAT_TRGT = "trgt"
 
 CONVERTER_IN_FORMATS = (
-    IN_FORMAT_TRF,
-    IN_FORMAT_TRGT,
+    FORMAT_BED4,
+    FORMAT_TRF,
+    FORMAT_TRGT,
 )
diff --git a/strkit/convert/converter.py b/strkit/convert/converter.py
index eca1fdb..33c7c28 100644
--- a/strkit/convert/converter.py
+++ b/strkit/convert/converter.py
@@ -1,11 +1,12 @@
 from logging import Logger
-from typing import Callable
+from typing import Callable, Iterable, TypeAlias
 
 from ._bed_4 import trf_to_bed_4
-from .constants import IN_FORMAT_TRF, IN_FORMAT_TRGT, CONVERTER_IN_FORMATS
+from .constants import FORMAT_BED4, FORMAT_TRF, FORMAT_TRGT, CONVERTER_IN_FORMATS
 from .expansionhunter import trf_bed_to_eh
 from .hipstr import trf_bed_to_hipstr
 from .gangstr import trf_bed_to_gangstr
+from .trf import trf_dat_to_bed, trf_passthrough
 from .trgt import trgt_bed_to_bed4, trgt_bed_to_strkit_bed, trf_or_strkit_bed_to_trgt
 
 import strkit.constants as c
@@ -15,20 +16,25 @@
     "convert",
 ]
 
-convert_formats: dict[tuple[str, str], Callable[[list, Logger], None]] = {
-    # TRF converters:
-    (IN_FORMAT_TRF, c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh,
-    (IN_FORMAT_TRF, c.CALLER_HIPSTR): trf_bed_to_hipstr,
-    (IN_FORMAT_TRF, c.CALLER_GANGSTR): trf_bed_to_gangstr,
-    (IN_FORMAT_TRF, c.CALLER_REPEATHMM): lambda x: x,
-    (IN_FORMAT_TRF, c.CALLER_STRAGLR): trf_to_bed_4,
-    (IN_FORMAT_TRF, c.CALLER_STRKIT): trf_to_bed_4,  # or can just leave -asis
-    (IN_FORMAT_TRF, c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4,
-    (IN_FORMAT_TRF, c.CALLER_TRGT): trf_or_strkit_bed_to_trgt,
+ConverterFn: TypeAlias = Callable[[Iterable, Logger], None]
+
+convert_formats: dict[tuple[str | tuple[str, ...], str], Callable[[Iterable, Logger], None]] = {
+    # TRF DAT to BED converter:
+    (FORMAT_TRF, FORMAT_TRF): trf_passthrough,
+    # BED4/TRF BED converters:
+    ((FORMAT_BED4, FORMAT_TRF), c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh,
+    ((FORMAT_BED4, FORMAT_TRF), c.CALLER_HIPSTR): trf_bed_to_hipstr,
+    ((FORMAT_BED4, FORMAT_TRF), c.CALLER_GANGSTR): trf_bed_to_gangstr,
+    ((FORMAT_BED4, FORMAT_TRF), c.CALLER_REPEATHMM): lambda x: x,
+    ((FORMAT_BED4, FORMAT_TRF), FORMAT_BED4): trf_to_bed_4,
+    ((FORMAT_BED4, FORMAT_TRF), c.CALLER_STRAGLR): trf_to_bed_4,
+    ((FORMAT_BED4, FORMAT_TRF), c.CALLER_STRKIT): trf_to_bed_4,  # or can just leave as-is
+    ((FORMAT_BED4, FORMAT_TRF), c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4,
+    ((FORMAT_BED4, FORMAT_TRF), c.CALLER_TRGT): trf_or_strkit_bed_to_trgt,
     # TRGT converters:
-    (IN_FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4,
-    (IN_FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_strkit_bed,
-    (IN_FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4,
+    (FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4,
+    (FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_strkit_bed,
+    (FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4,
 }
 
 CONVERTER_OUTPUT_FORMATS: tuple[str, ...] = tuple(sorted(set(k[1] for k in convert_formats)))
@@ -37,8 +43,15 @@
 def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int:
     out_format = out_format.lower()
 
-    if in_format == IN_FORMAT_TRF:
-        if out_format == c.CALLER_REPEATHMM:
+    is_trf_dat: bool = False
+
+    if in_format == FORMAT_TRF:
+        if in_file.endswith(".dat"):
+            is_trf_dat = True
+        elif out_format == FORMAT_TRF:
+            logger.critical(f"No need to convert from TRF BED to TRF BED")
+            exit(1)
+        elif out_format == c.CALLER_REPEATHMM:
             logger.critical(f"No need to convert for '{out_format}'; TRF BED files are accepted as input")
             return 1
         elif out_format == c.CALLER_STRKIT:
@@ -47,12 +60,21 @@ def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> in
     if in_format not in CONVERTER_IN_FORMATS:
         logger.critical(f"Unsupported input format: {in_format}")
 
-    if (in_format, out_format) not in convert_formats:
+    converter: ConverterFn | None = None
+    for (i, o), cc in convert_formats.items():
+        if in_format in i and out_format == o:
+            converter = cc
+
+    if converter is None:
         logger.critical(f"Unsupported conversion: {in_format} -> {out_format} (no converter defined)")
         return 1
 
     with open(in_file, "r") as tf:
-        data = [line.strip().split("\t") for line in tf]
+        if is_trf_dat:
+            data = trf_dat_to_bed(tf)
+        else:
+            data = [line.strip().split("\t") for line in tf]
+        # noinspection PyCallingNonCallable
+        converter(data, logger)
 
-    convert_formats[(in_format, out_format)](data, logger)
     return 0
diff --git a/strkit/convert/expansionhunter.py b/strkit/convert/expansionhunter.py
index b8bce31..34ef561 100644
--- a/strkit/convert/expansionhunter.py
+++ b/strkit/convert/expansionhunter.py
@@ -1,13 +1,14 @@
 import json
 import sys
 from logging import Logger
+from typing import Iterable
 
 __all__ = [
     "trf_bed_to_eh",
 ]
 
 
-def trf_bed_to_eh(trf_data: list, _logger: Logger):
+def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger):
     eh_formatted_loci = []
 
     for i, item in enumerate(trf_data, 1):
diff --git a/strkit/convert/gangstr.py b/strkit/convert/gangstr.py
index e73d01d..30a43bf 100644
--- a/strkit/convert/gangstr.py
+++ b/strkit/convert/gangstr.py
@@ -1,11 +1,12 @@
 import sys
 from logging import Logger
+from typing import Iterable
 
 __all__ = [
     "trf_bed_to_gangstr",
 ]
 
 
-def trf_bed_to_gangstr(trf_data: list, _logger: Logger):
+def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger):
     for i, item in enumerate(trf_data, 1):
         sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n")
diff --git a/strkit/convert/hipstr.py b/strkit/convert/hipstr.py
index 9366457..bfce7a3 100644
--- a/strkit/convert/hipstr.py
+++ b/strkit/convert/hipstr.py
@@ -1,11 +1,12 @@
 import sys
 from logging import Logger
+from typing import Iterable
 
 __all__ = [
     "trf_bed_to_hipstr",
 ]
 
 
-def trf_bed_to_hipstr(trf_data: list, _logger: Logger):
+def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger):
     for i, item in enumerate(trf_data, 1):
         sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n")
diff --git a/strkit/convert/trf.py b/strkit/convert/trf.py
new file mode 100644
index 0000000..460eaaa
--- /dev/null
+++ b/strkit/convert/trf.py
@@ -0,0 +1,44 @@
+import sys
+
+from logging import Logger
+from typing import Generator, Iterable, TextIO
+
+__all__ = ["trf_dat_to_bed", "trf_passthrough"]
+
+TRF_SEQUENCE_PREFIX = "Sequence: "
+
+# Input TRF dat file columns: see https://tandem.bu.edu/trf/definitions#table
+# start
+# end
+# period
+# copy number
+# consensus pattern size
+# % matches
+# % indels
+# align score
+# % A
+# % C
+# % G
+# % T
+# entropy
+# motif
+# sequence
+
+# Output BED columns: same as above, but with contig, with "trf" as the 4th column, and without sequence
+
+
+def trf_dat_to_bed(file: TextIO) -> Generator[list, None, None]:
+    current_contig = ""
+    for line in map(str.strip, file):
+        if not line:
+            continue
+        if line.startswith(TRF_SEQUENCE_PREFIX):
+            current_contig = line.removeprefix(TRF_SEQUENCE_PREFIX)
+        data = line.split(" ")
+        if data[0].isdigit() and len(data) == 15:  # data column
+            yield [current_contig, *data[:2], "trf", *data[2:]]
+
+
+def trf_passthrough(trf_data: Iterable[list], _logger: Logger):
+    for data in trf_data:
+        sys.stdout.write("\t".join(data) + "\n")
diff --git a/strkit/convert/trgt.py b/strkit/convert/trgt.py
index cfd2a99..99b9738 100644
--- a/strkit/convert/trgt.py
+++ b/strkit/convert/trgt.py
@@ -1,6 +1,6 @@
 import sys
 from logging import Logger
-from typing import Literal
+from typing import Literal, Iterable
 
 __all__ = [
     "trgt_bed_to_bed4",
@@ -11,7 +11,7 @@
 from strkit.iupac import get_iupac_code_for_nt_set
 
 
-def trgt_bed_to_bed4(trgt_data: list, logger: Logger, annotations: Literal["strkit"] | None = None):
+def trgt_bed_to_bed4(trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None):
     """
     Converts a TRGT repeat catalog to the STRkit/BED4 catalog format.
     :param trgt_data: The loaded TRGT catalog (split by tab).
@@ -66,11 +66,11 @@ def _fmt_last_col(v: str) -> str:
         sys.stdout.write("\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n")
 
 
-def trgt_bed_to_strkit_bed(trgt_data: list, logger: Logger):
+def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger):
     return trgt_bed_to_bed4(trgt_data, logger, annotations="strkit")
 
 
-def trf_or_strkit_bed_to_trgt(trf_data: list, _logger: Logger):
+def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger):
     """
     Convets a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog.
     :param trf_data: The loaded BED catalog data.
@@ -78,5 +78,5 @@ def trf_or_strkit_bed_to_trgt(trf_data: list, _logger: Logger):
     """
 
     for i, item in enumerate(trf_data):
-        motif = trf_data[-1]
-        sys.stdout.write("\t".join((*trf_data[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")
+        motif = item[-1]
+        sys.stdout.write("\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")