Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 37 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ STRkit's main advantages over other callers include:
* A license which allows use on any long-read sequencing data, versus TRGT whose
[license](https://github.com/PacificBiosciences/trgt/blob/main/LICENSE.md) restricts it to only PacBio data.

Some other useful features include:
* An interactive [call visualizer](#strkit-visualize-call-visualizer)
* A locus catalog [converter](#strkit-convert-str-catalog-conversion), which includes a TRF `.dat` to `BED` converter
* A pre-built Docker image ([`ghcr.io/davidlougheed/strkit`](https://ghcr.io/davidlougheed/strkit))

If you use STRkit in published work, please cite our paper in *Genome Research*:

> [Read-level genotyping of short tandem repeats using long reads and single-nucleotide variation with STRkit.](https://doi.org/10.1101/gr.280766.125)
Expand Down Expand Up @@ -420,26 +425,52 @@ Any extraneous columns are removed, (internally) leaving a four-column STR locus
Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a four-column STR
BED as locus catalog input. However, other formats representing a catalog of STRs exist:

* [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a TSV/BED with a lot
of information. This can be used as-is with STRkit, but it's safer for other tools to convert to
a four-column BED format.
* [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a `.dat` file with a lot
of information. Once converted to a BED file (see below), this can be used as-is with STRkit, but
it's safer for other tools to convert to a four-column BED format.
* STRkit supports using [IUPAC codes] to specify motifs. Manual post-processing of TRF output would
be needed to do this.
* [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md),
which can specify more advanced STR structures.
* Some tools support multiple motifs (see, e.g.,
[STRchive catalogs of pathogenic loci](https://strchive.org/loci/#downloads)).

#### Usage

The `strkit convert` sub-command requires an input format (`trf` or `trgt`), an output format
(many, see `strkit convert --help`), and an input file. Output is written to `stdout`.
The `strkit convert` sub-command requires an input format (`bed4`, `trf` or `trgt`), an output
format (many, see `strkit convert --help`), and an input file. Output is written to `stdout`.

*Note:* Not all input/output format pairs have available converter functions; an error will be
printed to `stderr` if one does not exist.

For example, to convert from a TRF BED to a TRGT repeat definition BED file:
##### TRF DAT file to TRF BED file (*UCSC-style*)

To convert from a TRF `.dat` file (as output by the TRF program) to a TRF BED file in the style of
the one obtainable from UCSC (e.g., for
[hg38](https://hgdownload.soe.ucsc.edu/goldenpath/hg38/bigZips/latest/)):

```bash
strkit convert --in-format trf --out-format trf in_file.trf.dat > out_file.trf.bed
```

##### TRF DAT file to 4-column BED file

To convert from a TRF `.dat` file (as output by the TRF program) to a normal BED file:

```bash
strkit convert --in-format trf --out-format bed4 in_file.trf.dat > out_file.bed
```

##### TRF BED to TRGT repeat definition file

To convert from a TRF BED to a TRGT repeat definition BED file:

```bash
strkit convert --in-format trf --out-format trgt in_file.trf.bed > out_file.bed
```

##### TRGT repeat definition file to STRkit BED

To convert from a TRGT repeat definition file to a STRkit BED, including locus IDs if set:

```bash
Expand Down
3 changes: 2 additions & 1 deletion strkit/convert/_bed_4.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import sys
from logging import Logger
from typing import Iterable

__all__ = [
"trf_to_bed_4",
]


def trf_to_bed_4(trf_data: list, _logger: Logger):
def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger):
for item in trf_data:
sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n")
15 changes: 9 additions & 6 deletions strkit/convert/constants.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
__all__ = [
"IN_FORMAT_TRF",
"IN_FORMAT_TRGT",
"FORMAT_BED4",
"FORMAT_TRF",
"FORMAT_TRGT",
"CONVERTER_IN_FORMATS",
]

IN_FORMAT_TRF = "trf"
IN_FORMAT_TRGT = "trgt"
FORMAT_BED4 = "bed4"
FORMAT_TRF = "trf"
FORMAT_TRGT = "trgt"

CONVERTER_IN_FORMATS = (
IN_FORMAT_TRF,
IN_FORMAT_TRGT,
FORMAT_BED4,
FORMAT_TRF,
FORMAT_TRGT,
)
62 changes: 42 additions & 20 deletions strkit/convert/converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from logging import Logger
from typing import Callable
from typing import Callable, Iterable, TypeAlias

from ._bed_4 import trf_to_bed_4
from .constants import IN_FORMAT_TRF, IN_FORMAT_TRGT, CONVERTER_IN_FORMATS
from .constants import FORMAT_BED4, FORMAT_TRF, FORMAT_TRGT, CONVERTER_IN_FORMATS
from .expansionhunter import trf_bed_to_eh
from .hipstr import trf_bed_to_hipstr
from .gangstr import trf_bed_to_gangstr
from .trf import trf_dat_to_bed, trf_passthrough
from .trgt import trgt_bed_to_bed4, trgt_bed_to_strkit_bed, trf_or_strkit_bed_to_trgt

import strkit.constants as c
Expand All @@ -15,20 +16,25 @@
"convert",
]

convert_formats: dict[tuple[str, str], Callable[[list, Logger], None]] = {
# TRF converters:
(IN_FORMAT_TRF, c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh,
(IN_FORMAT_TRF, c.CALLER_HIPSTR): trf_bed_to_hipstr,
(IN_FORMAT_TRF, c.CALLER_GANGSTR): trf_bed_to_gangstr,
(IN_FORMAT_TRF, c.CALLER_REPEATHMM): lambda x: x,
(IN_FORMAT_TRF, c.CALLER_STRAGLR): trf_to_bed_4,
(IN_FORMAT_TRF, c.CALLER_STRKIT): trf_to_bed_4, # or can just leave -asis
(IN_FORMAT_TRF, c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4,
(IN_FORMAT_TRF, c.CALLER_TRGT): trf_or_strkit_bed_to_trgt,
ConverterFn: TypeAlias = Callable[[Iterable, Logger], None]

convert_formats: dict[tuple[str | tuple[str, ...], str], Callable[[Iterable, Logger], None]] = {
# TRF DAT to BED converter:
(FORMAT_TRF, FORMAT_TRF): trf_passthrough,
# BED4/TRF BED converters:
((FORMAT_BED4, FORMAT_TRF), c.CALLER_EXPANSIONHUNTER): trf_bed_to_eh,
((FORMAT_BED4, FORMAT_TRF), c.CALLER_HIPSTR): trf_bed_to_hipstr,
((FORMAT_BED4, FORMAT_TRF), c.CALLER_GANGSTR): trf_bed_to_gangstr,
((FORMAT_BED4, FORMAT_TRF), c.CALLER_REPEATHMM): lambda x: x,
((FORMAT_BED4, FORMAT_TRF), FORMAT_BED4): trf_to_bed_4,
((FORMAT_BED4, FORMAT_TRF), c.CALLER_STRAGLR): trf_to_bed_4,
((FORMAT_BED4, FORMAT_TRF), c.CALLER_STRKIT): trf_to_bed_4, # or can just leave as-is
((FORMAT_BED4, FORMAT_TRF), c.CALLER_TANDEM_GENOTYPES): trf_to_bed_4,
((FORMAT_BED4, FORMAT_TRF), c.CALLER_TRGT): trf_or_strkit_bed_to_trgt,
# TRGT converters:
(IN_FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4,
(IN_FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_strkit_bed,
(IN_FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4,
(FORMAT_TRGT, c.CALLER_STRAGLR): trgt_bed_to_bed4,
(FORMAT_TRGT, c.CALLER_STRKIT): trgt_bed_to_strkit_bed,
(FORMAT_TRGT, c.CALLER_TANDEM_GENOTYPES): trgt_bed_to_bed4,
}

CONVERTER_OUTPUT_FORMATS: tuple[str, ...] = tuple(sorted(set(k[1] for k in convert_formats)))
Expand All @@ -37,8 +43,15 @@
def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int:
out_format = out_format.lower()

if in_format == IN_FORMAT_TRF:
if out_format == c.CALLER_REPEATHMM:
is_trf_dat: bool = False

if in_format == FORMAT_TRF:
if in_file.endswith(".dat"):
is_trf_dat = True
elif out_format == FORMAT_TRF:
logger.critical(f"No need to convert from TRF BED to TRF BED")
exit(1)
elif out_format == c.CALLER_REPEATHMM:
logger.critical(f"No need to convert for '{out_format}'; TRF BED files are accepted as input")
return 1
elif out_format == c.CALLER_STRKIT:
Expand All @@ -47,12 +60,21 @@ def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> in
if in_format not in CONVERTER_IN_FORMATS:
logger.critical(f"Unsupported input format: {in_format}")

if (in_format, out_format) not in convert_formats:
converter: ConverterFn | None = None
for (i, o), cc in convert_formats.items():
if in_format in i and out_format == o:
converter = cc

if converter is None:
logger.critical(f"Unsupported conversion: {in_format} -> {out_format} (no converter defined)")
return 1

with open(in_file, "r") as tf:
data = [line.strip().split("\t") for line in tf]
if is_trf_dat:
data = trf_dat_to_bed(tf)
else:
data = [line.strip().split("\t") for line in tf]
# noinspection PyCallingNonCallable
converter(data, logger)

convert_formats[(in_format, out_format)](data, logger)
return 0
3 changes: 2 additions & 1 deletion strkit/convert/expansionhunter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import json
import sys
from logging import Logger
from typing import Iterable

__all__ = [
"trf_bed_to_eh",
]


def trf_bed_to_eh(trf_data: list, _logger: Logger):
def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger):
eh_formatted_loci = []

for i, item in enumerate(trf_data, 1):
Expand Down
3 changes: 2 additions & 1 deletion strkit/convert/gangstr.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import sys
from logging import Logger
from typing import Iterable

__all__ = [
"trf_bed_to_gangstr",
]


def trf_bed_to_gangstr(trf_data: list, _logger: Logger):
def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger):
for i, item in enumerate(trf_data, 1):
sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n")
3 changes: 2 additions & 1 deletion strkit/convert/hipstr.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import sys
from logging import Logger
from typing import Iterable

__all__ = [
"trf_bed_to_hipstr",
]


def trf_bed_to_hipstr(trf_data: list, _logger: Logger):
def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger):
for i, item in enumerate(trf_data, 1):
sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n")
44 changes: 44 additions & 0 deletions strkit/convert/trf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import sys

from logging import Logger
from typing import Generator, Iterable, TextIO

__all__ = ["trf_dat_to_bed", "trf_passthrough"]

TRF_SEQUENCE_PREFIX = "Sequence: "

# Input TRF dat file columns: see https://tandem.bu.edu/trf/definitions#table
# start
# end
# period
# copy number
# consensus pattern size
# % matches
# % indels
# align score
# % A
# % C
# % G
# % T
# entropy
# motif
# sequence

# Output BED columns: same as above, but with contig, with "trf" as the 4th column, and without sequence


def trf_dat_to_bed(file: TextIO) -> Generator[list, None, None]:
current_contig = ""
for line in map(str.strip, file):
if not line:
continue
if line.startswith(TRF_SEQUENCE_PREFIX):
current_contig = line.removeprefix(TRF_SEQUENCE_PREFIX)
data = line.split(" ")
if data[0].isdigit() and len(data) == 15: # data column
yield [current_contig, *data[:2], "trf", *data[2:]]


def trf_passthrough(trf_data: Iterable[list], _logger: Logger):
for data in trf_data:
sys.stdout.write("\t".join(data) + "\n")
12 changes: 6 additions & 6 deletions strkit/convert/trgt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
from logging import Logger
from typing import Literal
from typing import Literal, Iterable

__all__ = [
"trgt_bed_to_bed4",
Expand All @@ -11,7 +11,7 @@
from strkit.iupac import get_iupac_code_for_nt_set


def trgt_bed_to_bed4(trgt_data: list, logger: Logger, annotations: Literal["strkit"] | None = None):
def trgt_bed_to_bed4(trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None):
"""
Converts a TRGT repeat catalog to the STRkit/BED4 catalog format.
:param trgt_data: The loaded TRGT catalog (split by tab).
Expand Down Expand Up @@ -66,17 +66,17 @@ def _fmt_last_col(v: str) -> str:
sys.stdout.write("\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n")


def trgt_bed_to_strkit_bed(trgt_data: list, logger: Logger):
def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger):
return trgt_bed_to_bed4(trgt_data, logger, annotations="strkit")


def trf_or_strkit_bed_to_trgt(trf_data: list, _logger: Logger):
def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger):
"""
Convets a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog.
:param trf_data: The loaded BED catalog data.
:param _logger: Logger instance (unused).
"""

for i, item in enumerate(trf_data):
motif = trf_data[-1]
sys.stdout.write("\t".join((*trf_data[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")
motif = item[-1]
sys.stdout.write("\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")