davidlougheed · davidlougheed · Apr 5, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/.gitignore b/.gitignore
@@ -15,6 +15,7 @@ test-*.bash
 *.bam
 *.bai
 *.bcf
+*.dat
 *.fasta
 *.fa.gz
 *.fa.gz.fai
@@ -24,6 +25,7 @@ test-*.bash
 *.log
 *.bed
 !tests/data/*.bed
+!tests/data/*.dat
 /*.json
 *.png
 *.time

diff --git a/README.md b/README.md
@@ -426,8 +426,9 @@ Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a
 BED as locus catalog input. However, other formats representing a catalog of STRs exist:
 
 * [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a `.dat` file with a lot 
-  of information. Once converted to a BED file (see below), this can be used as-is with STRkit, but 
+  of information. Once converted to a BED file, this can be used as-is with STRkit, but 
   it's safer for other tools to convert to a four-column BED format.
+  See "[Converting TRF DAT files to BED](./docs/convert_trf_to_bed.md)" for more on how to do this.
 * STRkit supports using [IUPAC codes] to specify motifs. Manual post-processing of TRF output would 
   be needed to do this.
 * [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md),
@@ -440,6 +441,8 @@ BED as locus catalog input. However, other formats representing a catalog of STR
 The `strkit convert` sub-command requires an input format (`bed4`, `trf` or `trgt`), an output 
 format (many, see `strkit convert --help`), and an input file. Output is written to `stdout`.
 
+If you want the output **sorted** (which many tools prefer or require), use the `--sort` flag.
+
 *Note:* Not all input/output format pairs have available converter functions; an error will be 
 printed to `stderr` if one does not exist.
 

diff --git a/docs/convert_trf_to_bed.md b/docs/convert_trf_to_bed.md
@@ -0,0 +1,20 @@
+# Converting TRF DAT files to BED
+
+[Tandem Repeats Finder](https://tandem.bu.edu/trf/trf.html) ([Benson 1999](https://pubmed.ncbi.nlm.nih.gov/9862982/)) 
+is a widely-used tool for discovering tandem repeat regions in genomes. However, its output is in the form of a `.dat` 
+file, which is not always convenient for use in other programs (e.g., TR callers, web browsers)
+
+STRkit provides a built-in subcommand, `strkit convert`, which can (among other things) convert TRF files from the TRF
+`.dat` format to either a standard four-column BED file, or a UCSC-style TRF BED file.
+
+To convert a TRF `.dat` file to a sorted four-column BED using STRkit, run the following command:
+
+```bash
+strkit convert --in-format trf --out-format bed4 --sort trf.dat > repeats.bed
+```
+
+To convert a TRF `.dat` file to a sorted UCSC-style TRF BED file, run the following command instead:
+
+```bash
+strkit convert --in-format trf --out-format trf --sort trf.dat > trf.bed
+```
diff --git a/strkit/convert/_bed_4.py b/strkit/convert/_bed_4.py
@@ -1,12 +1,11 @@
-import sys
 from logging import Logger
-from typing import Iterable
+from typing import Generator, Iterable
 
 __all__ = [
     "trf_to_bed_4",
 ]
 
 
-def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger):
+def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
     for item in trf_data:
-        sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n")
+        yield "\t".join((*item[:3], item[-1])) + "\n"
diff --git a/strkit/convert/converter.py b/strkit/convert/converter.py
@@ -1,5 +1,7 @@
+import sys
+
 from logging import Logger
-from typing import Callable, Iterable, TypeAlias
+from typing import Callable, Generator, Iterable, TextIO, TypeAlias
 
 from ._bed_4 import trf_to_bed_4
 from .constants import FORMAT_BED4, FORMAT_TRF, FORMAT_TRGT, CONVERTER_IN_FORMATS
@@ -16,9 +18,9 @@
     "convert",
 ]
 
-ConverterFn: TypeAlias = Callable[[Iterable, Logger], None]
+ConverterFn: TypeAlias = Callable[[Iterable, Logger], Generator[str, None, None]]
 
-convert_formats: dict[tuple[str | tuple[str, ...], str], Callable[[Iterable, Logger], None]] = {
+convert_formats: dict[tuple[str | tuple[str, ...], str], ConverterFn] = {
     # TRF DAT to BED converter:
     (FORMAT_TRF, FORMAT_TRF): trf_passthrough,
     # BED4/TRF BED converters:
@@ -40,7 +42,25 @@
 CONVERTER_OUTPUT_FORMATS: tuple[str, ...] = tuple(sorted(set(k[1] for k in convert_formats)))
 
 
-def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int:
+def _load_bed_like(fh: TextIO, sort: bool) -> list:
+    data = []
+    contig_order = []
+
+    for line in map(lambda s: s.split("\t"), map(str.strip, fh)):
+        if not contig_order or line[0] != contig_order[-1]:
+            contig_order.append(line[0])
+        data.append([line[0], int(line[1]), int(line[2]), *line[3:]] if sort else line)
+
+    if sort:
+        data.sort(key=lambda dd: (contig_order.index(dd[0]), dd[1], dd[2]))
+        for d in data:
+            d[1] = str(d[1])
+            d[2] = str(d[2])
+
+    return data
+
+
+def convert(in_file: str, in_format: str, out_format: str, sort: bool, logger: Logger) -> int:
     out_format = out_format.lower()
 
     is_trf_dat: bool = False
@@ -71,10 +91,12 @@ def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> in
 
     with open(in_file, "r") as tf:
         if is_trf_dat:
-            data = trf_dat_to_bed(tf)
+            data = trf_dat_to_bed(tf, sort)
         else:
-            data = [line.strip().split("\t") for line in tf]
+            # in all other cases, we have various forms of BED files
+            data = _load_bed_like(tf, sort)
         # noinspection PyCallingNonCallable
-        converter(data, logger)
+        for line in converter(data, logger):
+            sys.stdout.write(line)
 
     return 0
diff --git a/strkit/convert/expansionhunter.py b/strkit/convert/expansionhunter.py
@@ -1,14 +1,13 @@
 import json
-import sys
 from logging import Logger
-from typing import Iterable
+from typing import Generator, Iterable
 
 __all__ = [
     "trf_bed_to_eh",
 ]
 
 
-def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger):
+def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
     eh_formatted_loci = []
 
     for i, item in enumerate(trf_data, 1):
@@ -19,4 +18,4 @@ def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger):
             "VariantType": "Repeat",
         })
 
-    sys.stdout.write(json.dumps(eh_formatted_loci, indent=2))
+    yield json.dumps(eh_formatted_loci, indent=2)
diff --git a/strkit/convert/gangstr.py b/strkit/convert/gangstr.py
@@ -1,12 +1,11 @@
-import sys
 from logging import Logger
-from typing import Iterable
+from typing import Generator, Iterable
 
 __all__ = [
     "trf_bed_to_gangstr",
 ]
 
 
-def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger):
+def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
     for i, item in enumerate(trf_data, 1):
-        sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n")
+        yield "\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n"
diff --git a/strkit/convert/hipstr.py b/strkit/convert/hipstr.py
@@ -1,12 +1,11 @@
-import sys
 from logging import Logger
-from typing import Iterable
+from typing import Generator, Iterable
 
 __all__ = [
     "trf_bed_to_hipstr",
 ]
 
 
-def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger):
+def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
     for i, item in enumerate(trf_data, 1):
-        sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n")
+        yield "\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n"
diff --git a/strkit/convert/trf.py b/strkit/convert/trf.py
@@ -1,5 +1,3 @@
-import sys
-
 from logging import Logger
 from typing import Generator, Iterable, TextIO
 
@@ -27,18 +25,40 @@
 # Output BED columns: same as above, but with contig, with "trf" as the 4th column, and without sequence
 
 
-def trf_dat_to_bed(file: TextIO) -> Generator[list, None, None]:
+def trf_dat_to_bed(file: TextIO, sort: bool) -> Generator[list, None, None]:
+    res = []  # only used if sort=True
+
+    def _output_sorted_res():
+        res.sort(key=lambda dd: (dd[1], dd[2]))
+        for r in res:
+            r[1] = str(r[1])
+            r[2] = str(r[2])
+            yield r
+        res.clear()
+
     current_contig = ""
     for line in map(str.strip, file):
         if not line:
             continue
+
         if line.startswith(TRF_SEQUENCE_PREFIX):
             current_contig = line.removeprefix(TRF_SEQUENCE_PREFIX)
+            if sort:
+                yield from _output_sorted_res()
+
         data = line.split(" ")
         if data[0].isdigit() and len(data) == 15:  # data column
-            yield [current_contig, *data[:2], "trf", *data[2:]]
+            # TRF start coord is 1-indexed
+            start = int(data[0]) - 1
+            end = int(data[1])
+            if sort:
+                res.append([current_contig, start, end, "trf", *data[2:14]])
+            else:
+                yield [current_contig, str(start), str(end), "trf", *data[2:14]]
+
+    yield from _output_sorted_res()
 
 
-def trf_passthrough(trf_data: Iterable[list], _logger: Logger):
+def trf_passthrough(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
     for data in trf_data:
-        sys.stdout.write("\t".join(data) + "\n")
+        yield "\t".join(data) + "\n"
diff --git a/strkit/convert/trgt.py b/strkit/convert/trgt.py
@@ -1,6 +1,5 @@
-import sys
 from logging import Logger
-from typing import Literal, Iterable
+from typing import Generator, Literal, Iterable
 
 __all__ = [
     "trgt_bed_to_bed4",
@@ -11,7 +10,9 @@
 from strkit.iupac import get_iupac_code_for_nt_set
 
 
-def trgt_bed_to_bed4(trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None):
+def trgt_bed_to_bed4(
+    trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None
+) -> Generator[str, None, None]:
     """
     Converts a TRGT repeat catalog to the STRkit/BED4 catalog format.
     :param trgt_data: The loaded TRGT catalog (split by tab).
@@ -57,20 +58,20 @@ def _fmt_last_col(v: str) -> str:
                         break
 
                 if not failed:  # found a consensus base for the multiple-motif STR, so we can convert it
-                    sys.stdout.write("\t".join((*data[:3], _fmt_last_col("".join(combined_motif_bases)))) + "\n")
+                    yield "\t".join((*data[:3], _fmt_last_col("".join(combined_motif_bases)))) + "\n"
                     continue
 
             data_str = "\t".join(data)
             logger.warning(f"Could not convert complex locus at line {line}; taking first motif: {data_str}")
 
-        sys.stdout.write("\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n")
+        yield "\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n"
 
 
-def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger):
-    return trgt_bed_to_bed4(trgt_data, logger, annotations="strkit")
+def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger) -> Generator[str, None, None]:
+    yield from trgt_bed_to_bed4(trgt_data, logger, annotations="strkit")
 
 
-def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger):
+def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
     """
     Convets a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog.
     :param trf_data: The loaded BED catalog data.
@@ -79,4 +80,4 @@ def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger):
 
     for i, item in enumerate(trf_data):
         motif = item[-1]
-        sys.stdout.write("\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")
+        yield "\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n"
diff --git a/strkit/entry.py b/strkit/entry.py
@@ -463,13 +463,14 @@ def add_cc_parser_args(cc_parser):
     cc_parser.add_argument("paths", type=str, nargs="+", help="Paths to the BED catalogs to combine.")
 
 
-def add_cv_parser_args(al_parser):
+def add_cv_parser_args(cv_parser):
     from strkit.convert.converter import CONVERTER_OUTPUT_FORMATS
-    al_parser.add_argument("in_file", type=str, help="Input file to convert from.")
-    al_parser.add_argument(
+    cv_parser.add_argument("in_file", type=str, help="Input file to convert from.")
+    cv_parser.add_argument(
         "--in-format", type=str, choices=("trf", "trgt"), default="trf", help="Format to convert from."
     )
-    al_parser.add_argument("--out-format", type=str, choices=CONVERTER_OUTPUT_FORMATS, help="Format to convert to.")
+    cv_parser.add_argument("--out-format", type=str, choices=CONVERTER_OUTPUT_FORMATS, help="Format to convert to.")
+    cv_parser.add_argument("--sort", action="store_true", help="Whether to output a sorted BED file (slower).")
 
 
 def add_vs_parser_args(vs_parser):
@@ -632,7 +633,7 @@ def _exec_combine_catalogs(p_args):
 
 def _exec_convert(p_args):
     from strkit.convert.converter import convert
-    return convert(p_args.in_file, p_args.in_format, p_args.out_format, _main_logger(p_args))
+    return convert(p_args.in_file, p_args.in_format, p_args.out_format, p_args.sort, _main_logger(p_args))
 
 
 def _exec_viz_server(p_args):

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/data/trf_convert_sort.dat b/tests/data/trf_convert_sort.dat
@@ -0,0 +1,17 @@
+Tandem Repeats Finder Program written by:
+
+Gary Benson
+Program in Bioinformatics
+Boston University
+Version 4.09
+
+
+Sequence: Chr_Y.H1
+
+
+
+Parameters: 2 5 7 80 10 50 2000
+
+
+570 609 19 2.1 19 80 0 52 22 7 67 2 1.28 GGGCGGCGGGGGAAAAAAG GGGCGGTGGGGGAAAAGAGGGGCGGCGGGGGAGAGAAGGG
+1 62 28 2.2 28 91 0 103 22 16 43 17 1.87 TATCACCGGGGAGGAGGAGGGTAGTCAG TATCACCGGGGAGGAGGAGGGTGGTCAGTATCACCGGGGAGGAGGAGGGTAGTCGTTATCAC
diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -0,0 +1,22 @@
+import logging
+from pathlib import Path
+from strkit.convert.constants import FORMAT_TRF, FORMAT_BED4
+from strkit.convert.converter import convert
+
+TRF_CONVERT_SORT_DAT = Path(__file__).parent / "data" / "trf_convert_sort.dat"
+
+logger = logging.getLogger(__name__)
+
+
+def test_convert_trf_dat_to_bed4(capsys):
+    res = convert(str(TRF_CONVERT_SORT_DAT), FORMAT_TRF, FORMAT_BED4, sort=False, logger=logger)
+    assert res == 0
+    captured = capsys.readouterr()
+    assert captured.out == "Chr_Y.H1\t569\t609\tGGGCGGCGGGGGAAAAAAG\nChr_Y.H1\t0\t62\tTATCACCGGGGAGGAGGAGGGTAGTCAG\n"
+
+
+def test_convert_trf_dat_to_bed4_sorted(capsys):
+    res = convert(str(TRF_CONVERT_SORT_DAT), FORMAT_TRF, FORMAT_BED4, sort=True, logger=logger)
+    assert res == 0
+    captured = capsys.readouterr()
+    assert captured.out == "Chr_Y.H1\t0\t62\tTATCACCGGGGAGGAGGAGGGTAGTCAG\nChr_Y.H1\t569\t609\tGGGCGGCGGGGGAAAAAAG\n"
-Original file line number
+Diff line change
@@ Expand Up / @@ -15,6 +15,7 @@ test-*.bash @@
     *.bam
     *.bai
     *.bcf
+    *.dat
     *.fasta
     *.fa.gz
     *.fa.gz.fai
@@ Expand All / @@ -24,6 +25,7 @@ test-*.bash @@
     *.log
     *.bed
     !tests/data/*.bed
+    !tests/data/*.dat
     /*.json
     *.png
     *.time
@@ Expand Down @@