From 2ed1b8821d52ac99c8d46751a969dc6fa88afaea Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Sat, 4 Apr 2026 12:12:41 -0400 Subject: [PATCH] feat(convert): add TRF .dat -> .bed converter --- README.md | 43 ++++++++++++++++++--- strkit/convert/_bed_4.py | 3 +- strkit/convert/constants.py | 15 +++++--- strkit/convert/converter.py | 62 +++++++++++++++++++++---------- strkit/convert/expansionhunter.py | 3 +- strkit/convert/gangstr.py | 3 +- strkit/convert/hipstr.py | 3 +- strkit/convert/trf.py | 44 ++++++++++++++++++++++ strkit/convert/trgt.py | 12 +++--- 9 files changed, 146 insertions(+), 42 deletions(-) create mode 100644 strkit/convert/trf.py diff --git a/README.md b/README.md index d0f32b6..6f2a884 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,11 @@ STRkit's main advantages over other callers include: * A license which allows use on any long-read sequencing data, versus TRGT whose [license](https://github.com/PacificBiosciences/trgt/blob/main/LICENSE.md) restricts it to only PacBio data. +Some other useful features include: +* An interactive [call visualizer](#strkit-visualize-call-visualizer) +* A locus catalog [converter](#strkit-convert-str-catalog-conversion), which includes a TRF `.dat` to `BED` converter +* A pre-built Docker image ([`ghcr.io/davidlougheed/strkit`](https://ghcr.io/davidlougheed/strkit)) + If you use STRkit in published work, please cite our paper in *Genome Research*: > [Read-level genotyping of short tandem repeats using long reads and single-nucleotide variation with STRkit.](https://doi.org/10.1101/gr.280766.125) @@ -420,26 +425,52 @@ Any extraneous columns are removed, (internally) leaving a four-column STR locus Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a four-column STR BED as locus catalog input. However, other formats representing a catalog of STRs exist: -* [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a TSV/BED with a lot - of information. This can be used as-is with STRkit, but it's safer for other tools to convert to - a four-column BED format. +* [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a `.dat` file with a lot + of information. Once converted to a BED file (see below), this can be used as-is with STRkit, but + it's safer for other tools to convert to a four-column BED format. +* STRkit supports using [IUPAC codes] to specify motifs. Manual post-processing of TRF output would + be needed to do this. * [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md), which can specify more advanced STR structures. +* Some tools support multiple motifs (see, e.g., + [STRchive catalogs of pathogenic loci](https://strchive.org/loci/#downloads)). #### Usage -The `strkit convert` sub-command requires an input format (`trf` or `trgt`), an output format -(many, see `strkit convert --help`), and an input file. Output is written to `stdout`. +The `strkit convert` sub-command requires an input format (`bed4`, `trf` or `trgt`), an output +format (many, see `strkit convert --help`), and an input file. Output is written to `stdout`. *Note:* Not all input/output format pairs have available converter functions; an error will be printed to `stderr` if one does not exist. -For example, to convert from a TRF BED to a TRGT repeat definition BED file: +##### TRF DAT file to TRF BED file (*UCSC-style*) + +To convert from a TRF `.dat` file (as output by the TRF program) to a TRF BED file in the style of +the one obtainable from UCSC (e.g., for +[hg38](https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/latest/)): + +```bash +strkit convert --in-format trf --out-format trf in_file.trf.dat > out_file.trf.bed +``` + +##### TRF DAT file to 4-column BED file + +To convert from a TRF `.dat` file (as output by the TRF program) to a normal BED file: + +```bash +strkit convert --in-format trf --out-format bed4 in_file.trf.dat > out_file.bed +``` + +##### TRF BED to TRGT repeat definition file + +To convert from a TRF BED to a TRGT repeat definition BED file: ```bash strkit convert --in-format trf --out-format trgt in_file.trf.bed > out_file.bed ``` +##### TRGT repeat definition file to STRkit BED + To convert from a TRGT repeat definition file to a STRkit BED, including locus IDs if set: ```bash diff --git a/strkit/convert/_bed_4.py b/strkit/convert/_bed_4.py index 7479c55..4e1b998 100644 --- a/strkit/convert/_bed_4.py +++ b/strkit/convert/_bed_4.py @@ -1,11 +1,12 @@ import sys from logging import Logger +from typing import Iterable __all__ = [ "trf_to_bed_4", ] -def trf_to_bed_4(trf_data: list, _logger: Logger): +def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger): for item in trf_data: sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n") diff --git a/strkit/convert/constants.py b/strkit/convert/constants.py index 730db39..5f71eb3 100644 --- a/strkit/convert/constants.py +++ b/strkit/convert/constants.py @@ -1,13 +1,16 @@ __all__ = [ - "IN_FORMAT_TRF", - "IN_FORMAT_TRGT", + "FORMAT_BED4", + "FORMAT_TRF", + "FORMAT_TRGT", "CONVERTER_IN_FORMATS", ] -IN_FORMAT_TRF = "trf" -IN_FORMAT_TRGT = "trgt" +FORMAT_BED4 = "bed4" +FORMAT_TRF = "trf" +FORMAT_TRGT = "trgt" CONVERTER_IN_FORMATS = ( - IN_FORMAT_TRF, - IN_FORMAT_TRGT, + FORMAT_BED4, + FORMAT_TRF, + FORMAT_TRGT, ) diff --git a/strkit/convert/converter.py b/strkit/convert/converter.py index eca1fdb..33c7c28 100644 --- a/strkit/convert/converter.py +++ b/strkit/convert/converter.py @@ -1,11 +1,12 @@ from logging import Logger -from typing import Callable +from typing import Callable, Iterable, TypeAlias from ._bed_4 import trf_to_bed_4 -from .constants import IN_FORMAT_TRF, IN_FORMAT_TRGT, CONVERTER_IN_FORMATS +from .constants import FORMAT_BED4, FORMAT_TRF, FORMAT_TRGT, CONVERTER_IN_FORMATS from .expansionhunter import trf_bed_to_eh from .hipstr import trf_bed_to_hipstr from .gangstr import trf_bed_to_gangstr +from .trf import trf_dat_to_bed, trf_passthrough from .trgt import trgt_bed_to_bed4, trgt_bed_to_strkit_bed, trf_or_strkit_bed_to_trgt import strkit.constants as c @@ -15,20 +16,25 @@ "convert", ] -convert_formats: dict[tuple[str, str], Callable[[list, Logger], None]] = { - # TRF converters: - (IN_FORMAT_TRF, c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh, - (IN_FORMAT_TRF, c.CALLER_HIPSTR): trf_bed_to_hipstr, - (IN_FORMAT_TRF, c.CALLER_GANGSTR): trf_bed_to_gangstr, - (IN_FORMAT_TRF, c.CALLER_REPEATHMM): lambda x: x, - (IN_FORMAT_TRF, c.CALLER_STRAGLR): trf_to_bed_4, - (IN_FORMAT_TRF, c.CALLER_STRKIT): trf_to_bed_4, # or can just leave -asis - (IN_FORMAT_TRF, c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4, - (IN_FORMAT_TRF, c.CALLER_TRGT): trf_or_strkit_bed_to_trgt, +ConverterFn: TypeAlias = Callable[[Iterable, Logger], None] + +convert_formats: dict[tuple[str | tuple[str, ...], str], Callable[[Iterable, Logger], None]] = { + # TRF DAT to BED converter: + (FORMAT_TRF, FORMAT_TRF): trf_passthrough, + # BED4/TRF BED converters: + ((FORMAT_BED4, FORMAT_TRF), c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh, + ((FORMAT_BED4, FORMAT_TRF), c.CALLER_HIPSTR): trf_bed_to_hipstr, + ((FORMAT_BED4, FORMAT_TRF), c.CALLER_GANGSTR): trf_bed_to_gangstr, + ((FORMAT_BED4, FORMAT_TRF), c.CALLER_REPEATHMM): lambda x: x, + ((FORMAT_BED4, FORMAT_TRF), FORMAT_BED4): trf_to_bed_4, + ((FORMAT_BED4, FORMAT_TRF), c.CALLER_STRAGLR): trf_to_bed_4, + ((FORMAT_BED4, FORMAT_TRF), c.CALLER_STRKIT): trf_to_bed_4, # or can just leave as-is + ((FORMAT_BED4, FORMAT_TRF), c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4, + ((FORMAT_BED4, FORMAT_TRF), c.CALLER_TRGT): trf_or_strkit_bed_to_trgt, # TRGT converters: - (IN_FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4, - (IN_FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_strkit_bed, - (IN_FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4, + (FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4, + (FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_strkit_bed, + (FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4, } CONVERTER_OUTPUT_FORMATS: tuple[str, ...] = tuple(sorted(set(k[1] for k in convert_formats))) @@ -37,8 +43,15 @@ def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int: out_format = out_format.lower() - if in_format == IN_FORMAT_TRF: - if out_format == c.CALLER_REPEATHMM: + is_trf_dat: bool = False + + if in_format == FORMAT_TRF: + if in_file.endswith(".dat"): + is_trf_dat = True + elif out_format == FORMAT_TRF: + logger.critical(f"No need to convert from TRF BED to TRF BED") + exit(1) + elif out_format == c.CALLER_REPEATHMM: logger.critical(f"No need to convert for '{out_format}'; TRF BED files are accepted as input") return 1 elif out_format == c.CALLER_STRKIT: @@ -47,12 +60,21 @@ def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> in if in_format not in CONVERTER_IN_FORMATS: logger.critical(f"Unsupported input format: {in_format}") - if (in_format, out_format) not in convert_formats: + converter: ConverterFn | None = None + for (i, o), cc in convert_formats.items(): + if in_format in i and out_format == o: + converter = cc + + if converter is None: logger.critical(f"Unsupported conversion: {in_format} -> {out_format} (no converter defined)") return 1 with open(in_file, "r") as tf: - data = [line.strip().split("\t") for line in tf] + if is_trf_dat: + data = trf_dat_to_bed(tf) + else: + data = [line.strip().split("\t") for line in tf] + # noinspection PyCallingNonCallable + converter(data, logger) - convert_formats[(in_format, out_format)](data, logger) return 0 diff --git a/strkit/convert/expansionhunter.py b/strkit/convert/expansionhunter.py index b8bce31..34ef561 100644 --- a/strkit/convert/expansionhunter.py +++ b/strkit/convert/expansionhunter.py @@ -1,13 +1,14 @@ import json import sys from logging import Logger +from typing import Iterable __all__ = [ "trf_bed_to_eh", ] -def trf_bed_to_eh(trf_data: list, _logger: Logger): +def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger): eh_formatted_loci = [] for i, item in enumerate(trf_data, 1): diff --git a/strkit/convert/gangstr.py b/strkit/convert/gangstr.py index e73d01d..30a43bf 100644 --- a/strkit/convert/gangstr.py +++ b/strkit/convert/gangstr.py @@ -1,11 +1,12 @@ import sys from logging import Logger +from typing import Iterable __all__ = [ "trf_bed_to_gangstr", ] -def trf_bed_to_gangstr(trf_data: list, _logger: Logger): +def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger): for i, item in enumerate(trf_data, 1): sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n") diff --git a/strkit/convert/hipstr.py b/strkit/convert/hipstr.py index 9366457..bfce7a3 100644 --- a/strkit/convert/hipstr.py +++ b/strkit/convert/hipstr.py @@ -1,11 +1,12 @@ import sys from logging import Logger +from typing import Iterable __all__ = [ "trf_bed_to_hipstr", ] -def trf_bed_to_hipstr(trf_data: list, _logger: Logger): +def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger): for i, item in enumerate(trf_data, 1): sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n") diff --git a/strkit/convert/trf.py b/strkit/convert/trf.py new file mode 100644 index 0000000..460eaaa --- /dev/null +++ b/strkit/convert/trf.py @@ -0,0 +1,44 @@ +import sys + +from logging import Logger +from typing import Generator, Iterable, TextIO + +__all__ = ["trf_dat_to_bed", "trf_passthrough"] + +TRF_SEQUENCE_PREFIX = "Sequence: " + +# Input TRF dat file columns: see https://tandem.bu.edu/trf/definitions#table +# start +# end +# period +# copy number +# consensus pattern size +# % matches +# % indels +# align score +# % A +# % C +# % G +# % T +# entropy +# motif +# sequence + +# Output BED columns: same as above, but with contig, with "trf" as the 4th column, and without sequence + + +def trf_dat_to_bed(file: TextIO) -> Generator[list, None, None]: + current_contig = "" + for line in map(str.strip, file): + if not line: + continue + if line.startswith(TRF_SEQUENCE_PREFIX): + current_contig = line.removeprefix(TRF_SEQUENCE_PREFIX) + data = line.split(" ") + if data[0].isdigit() and len(data) == 15: # data column + yield [current_contig, *data[:2], "trf", *data[2:]] + + +def trf_passthrough(trf_data: Iterable[list], _logger: Logger): + for data in trf_data: + sys.stdout.write("\t".join(data) + "\n") diff --git a/strkit/convert/trgt.py b/strkit/convert/trgt.py index cfd2a99..99b9738 100644 --- a/strkit/convert/trgt.py +++ b/strkit/convert/trgt.py @@ -1,6 +1,6 @@ import sys from logging import Logger -from typing import Literal +from typing import Literal, Iterable __all__ = [ "trgt_bed_to_bed4", @@ -11,7 +11,7 @@ from strkit.iupac import get_iupac_code_for_nt_set -def trgt_bed_to_bed4(trgt_data: list, logger: Logger, annotations: Literal["strkit"] | None = None): +def trgt_bed_to_bed4(trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None): """ Converts a TRGT repeat catalog to the STRkit/BED4 catalog format. :param trgt_data: The loaded TRGT catalog (split by tab). @@ -66,11 +66,11 @@ def _fmt_last_col(v: str) -> str: sys.stdout.write("\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n") -def trgt_bed_to_strkit_bed(trgt_data: list, logger: Logger): +def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger): return trgt_bed_to_bed4(trgt_data, logger, annotations="strkit") -def trf_or_strkit_bed_to_trgt(trf_data: list, _logger: Logger): +def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger): """ Convets a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog. :param trf_data: The loaded BED catalog data. @@ -78,5 +78,5 @@ def trf_or_strkit_bed_to_trgt(trf_data: list, _logger: Logger): """ for i, item in enumerate(trf_data): - motif = trf_data[-1] - sys.stdout.write("\t".join((*trf_data[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n") + motif = item[-1] + sys.stdout.write("\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")