Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ test-*.bash
*.bam
*.bai
*.bcf
*.dat
*.fasta
*.fa.gz
*.fa.gz.fai
Expand All @@ -24,6 +25,7 @@ test-*.bash
*.log
*.bed
!tests/data/*.bed
!tests/data/*.dat
/*.json
*.png
*.time
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -426,8 +426,9 @@ Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a
BED as locus catalog input. However, other formats representing a catalog of STRs exist:

* [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a `.dat` file with a lot
of information. Once converted to a BED file (see below), this can be used as-is with STRkit, but
of information. Once converted to a BED file, this can be used as-is with STRkit, but
it's safer for other tools to convert to a four-column BED format.
See "[Converting TRF DAT files to BED](./docs/convert_trf_to_bed.md)" for more on how to do this.
* STRkit supports using [IUPAC codes] to specify motifs. Manual post-processing of TRF output would
be needed to do this.
* [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md),
Expand All @@ -440,6 +441,8 @@ BED as locus catalog input. However, other formats representing a catalog of STR
The `strkit convert` sub-command requires an input format (`bed4`, `trf` or `trgt`), an output
format (many, see `strkit convert --help`), and an input file. Output is written to `stdout`.

If you want the output **sorted** (which many tools prefer or require), use the `--sort` flag.

*Note:* Not all input/output format pairs have available converter functions; an error will be
printed to `stderr` if one does not exist.

Expand Down
20 changes: 20 additions & 0 deletions docs/convert_trf_to_bed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Converting TRF DAT files to BED

[Tandem Repeats Finder](https://tandem.bu.edu/trf/trf.html) ([Benson 1999](https://pubmed.ncbi.nlm.nih.gov/9862982/))
is a widely-used tool for discovering tandem repeat regions in genomes. However, its output is in the form of a `.dat`
file, which is not always convenient for use in other programs (e.g., TR callers, web browsers)

STRkit provides a built-in subcommand, `strkit convert`, which can (among other things) convert TRF files from the TRF
`.dat` format to either a standard four-column BED file, or a UCSC-style TRF BED file.

To convert a TRF `.dat` file to a sorted four-column BED using STRkit, run the following command:

```bash
strkit convert --in-format trf --out-format bed4 --sort trf.dat > repeats.bed
```

To convert a TRF `.dat` file to a sorted UCSC-style TRF BED file, run the following command instead:

```bash
strkit convert --in-format trf --out-format trf --sort trf.dat > trf.bed
```
7 changes: 3 additions & 4 deletions strkit/convert/_bed_4.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import sys
from logging import Logger
from typing import Iterable
from typing import Generator, Iterable

__all__ = [
"trf_to_bed_4",
]


def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger):
def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
for item in trf_data:
sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n")
yield "\t".join((*item[:3], item[-1])) + "\n"
36 changes: 29 additions & 7 deletions strkit/convert/converter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import sys

from logging import Logger
from typing import Callable, Iterable, TypeAlias
from typing import Callable, Generator, Iterable, TextIO, TypeAlias

from ._bed_4 import trf_to_bed_4
from .constants import FORMAT_BED4, FORMAT_TRF, FORMAT_TRGT, CONVERTER_IN_FORMATS
Expand All @@ -16,9 +18,9 @@
"convert",
]

ConverterFn: TypeAlias = Callable[[Iterable, Logger], None]
ConverterFn: TypeAlias = Callable[[Iterable, Logger], Generator[str, None, None]]

convert_formats: dict[tuple[str | tuple[str, ...], str], Callable[[Iterable, Logger], None]] = {
convert_formats: dict[tuple[str | tuple[str, ...], str], ConverterFn] = {
# TRF DAT to BED converter:
(FORMAT_TRF, FORMAT_TRF): trf_passthrough,
# BED4/TRF BED converters:
Expand All @@ -40,7 +42,25 @@
CONVERTER_OUTPUT_FORMATS: tuple[str, ...] = tuple(sorted(set(k[1] for k in convert_formats)))


def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int:
def _load_bed_like(fh: TextIO, sort: bool) -> list:
data = []
contig_order = []

for line in map(lambda s: s.split("\t"), map(str.strip, fh)):
if not contig_order or line[0] != contig_order[-1]:
contig_order.append(line[0])
data.append([line[0], int(line[1]), int(line[2]), *line[3:]] if sort else line)

if sort:
data.sort(key=lambda dd: (contig_order.index(dd[0]), dd[1], dd[2]))
for d in data:
d[1] = str(d[1])
d[2] = str(d[2])

return data


def convert(in_file: str, in_format: str, out_format: str, sort: bool, logger: Logger) -> int:
out_format = out_format.lower()

is_trf_dat: bool = False
Expand Down Expand Up @@ -71,10 +91,12 @@ def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> in

with open(in_file, "r") as tf:
if is_trf_dat:
data = trf_dat_to_bed(tf)
data = trf_dat_to_bed(tf, sort)
else:
data = [line.strip().split("\t") for line in tf]
# in all other cases, we have various forms of BED files
data = _load_bed_like(tf, sort)
# noinspection PyCallingNonCallable
converter(data, logger)
for line in converter(data, logger):
sys.stdout.write(line)

return 0
7 changes: 3 additions & 4 deletions strkit/convert/expansionhunter.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import json
import sys
from logging import Logger
from typing import Iterable
from typing import Generator, Iterable

__all__ = [
"trf_bed_to_eh",
]


def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger):
def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
eh_formatted_loci = []

for i, item in enumerate(trf_data, 1):
Expand All @@ -19,4 +18,4 @@ def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger):
"VariantType": "Repeat",
})

sys.stdout.write(json.dumps(eh_formatted_loci, indent=2))
yield json.dumps(eh_formatted_loci, indent=2)
7 changes: 3 additions & 4 deletions strkit/convert/gangstr.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import sys
from logging import Logger
from typing import Iterable
from typing import Generator, Iterable

__all__ = [
"trf_bed_to_gangstr",
]


def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger):
def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
for i, item in enumerate(trf_data, 1):
sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n")
yield "\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n"
7 changes: 3 additions & 4 deletions strkit/convert/hipstr.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import sys
from logging import Logger
from typing import Iterable
from typing import Generator, Iterable

__all__ = [
"trf_bed_to_hipstr",
]


def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger):
def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
for i, item in enumerate(trf_data, 1):
sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n")
yield "\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n"
32 changes: 26 additions & 6 deletions strkit/convert/trf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import sys

from logging import Logger
from typing import Generator, Iterable, TextIO

Expand Down Expand Up @@ -27,18 +25,40 @@
# Output BED columns: same as above, but with contig, with "trf" as the 4th column, and without sequence


def trf_dat_to_bed(file: TextIO) -> Generator[list, None, None]:
def trf_dat_to_bed(file: TextIO, sort: bool) -> Generator[list, None, None]:
res = [] # only used if sort=True

def _output_sorted_res():
res.sort(key=lambda dd: (dd[1], dd[2]))
for r in res:
r[1] = str(r[1])
r[2] = str(r[2])
yield r
res.clear()

current_contig = ""
for line in map(str.strip, file):
if not line:
continue

if line.startswith(TRF_SEQUENCE_PREFIX):
current_contig = line.removeprefix(TRF_SEQUENCE_PREFIX)
if sort:
yield from _output_sorted_res()

data = line.split(" ")
if data[0].isdigit() and len(data) == 15: # data column
yield [current_contig, *data[:2], "trf", *data[2:]]
# TRF start coord is 1-indexed
start = int(data[0]) - 1
end = int(data[1])
if sort:
res.append([current_contig, start, end, "trf", *data[2:14]])
else:
yield [current_contig, str(start), str(end), "trf", *data[2:14]]

yield from _output_sorted_res()


def trf_passthrough(trf_data: Iterable[list], _logger: Logger):
def trf_passthrough(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
for data in trf_data:
sys.stdout.write("\t".join(data) + "\n")
yield "\t".join(data) + "\n"
19 changes: 10 additions & 9 deletions strkit/convert/trgt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import sys
from logging import Logger
from typing import Literal, Iterable
from typing import Generator, Literal, Iterable

__all__ = [
"trgt_bed_to_bed4",
Expand All @@ -11,7 +10,9 @@
from strkit.iupac import get_iupac_code_for_nt_set


def trgt_bed_to_bed4(trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None):
def trgt_bed_to_bed4(
trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None
) -> Generator[str, None, None]:
"""
Converts a TRGT repeat catalog to the STRkit/BED4 catalog format.
:param trgt_data: The loaded TRGT catalog (split by tab).
Expand Down Expand Up @@ -57,20 +58,20 @@ def _fmt_last_col(v: str) -> str:
break

if not failed: # found a consensus base for the multiple-motif STR, so we can convert it
sys.stdout.write("\t".join((*data[:3], _fmt_last_col("".join(combined_motif_bases)))) + "\n")
yield "\t".join((*data[:3], _fmt_last_col("".join(combined_motif_bases)))) + "\n"
continue

data_str = "\t".join(data)
logger.warning(f"Could not convert complex locus at line {line}; taking first motif: {data_str}")

sys.stdout.write("\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n")
yield "\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n"


def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger):
return trgt_bed_to_bed4(trgt_data, logger, annotations="strkit")
def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger) -> Generator[str, None, None]:
yield from trgt_bed_to_bed4(trgt_data, logger, annotations="strkit")


def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger):
def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]:
"""
Convets a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog.
:param trf_data: The loaded BED catalog data.
Expand All @@ -79,4 +80,4 @@ def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger):

for i, item in enumerate(trf_data):
motif = item[-1]
sys.stdout.write("\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n")
yield "\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n"
11 changes: 6 additions & 5 deletions strkit/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,13 +463,14 @@ def add_cc_parser_args(cc_parser):
cc_parser.add_argument("paths", type=str, nargs="+", help="Paths to the BED catalogs to combine.")


def add_cv_parser_args(al_parser):
def add_cv_parser_args(cv_parser):
from strkit.convert.converter import CONVERTER_OUTPUT_FORMATS
al_parser.add_argument("in_file", type=str, help="Input file to convert from.")
al_parser.add_argument(
cv_parser.add_argument("in_file", type=str, help="Input file to convert from.")
cv_parser.add_argument(
"--in-format", type=str, choices=("trf", "trgt"), default="trf", help="Format to convert from."
)
al_parser.add_argument("--out-format", type=str, choices=CONVERTER_OUTPUT_FORMATS, help="Format to convert to.")
cv_parser.add_argument("--out-format", type=str, choices=CONVERTER_OUTPUT_FORMATS, help="Format to convert to.")
cv_parser.add_argument("--sort", action="store_true", help="Whether to output a sorted BED file (slower).")


def add_vs_parser_args(vs_parser):
Expand Down Expand Up @@ -632,7 +633,7 @@ def _exec_combine_catalogs(p_args):

def _exec_convert(p_args):
from strkit.convert.converter import convert
return convert(p_args.in_file, p_args.in_format, p_args.out_format, _main_logger(p_args))
return convert(p_args.in_file, p_args.in_format, p_args.out_format, p_args.sort, _main_logger(p_args))


def _exec_viz_server(p_args):
Expand Down
Empty file added tests/__init__.py
Empty file.
17 changes: 17 additions & 0 deletions tests/data/trf_convert_sort.dat
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Tandem Repeats Finder Program written by:

Gary Benson
Program in Bioinformatics
Boston University
Version 4.09


Sequence: Chr_Y.H1



Parameters: 2 5 7 80 10 50 2000


570 609 19 2.1 19 80 0 52 22 7 67 2 1.28 GGGCGGCGGGGGAAAAAAG GGGCGGTGGGGGAAAAGAGGGGCGGCGGGGGAGAGAAGGG
1 62 28 2.2 28 91 0 103 22 16 43 17 1.87 TATCACCGGGGAGGAGGAGGGTAGTCAG TATCACCGGGGAGGAGGAGGGTGGTCAGTATCACCGGGGAGGAGGAGGGTAGTCGTTATCAC
22 changes: 22 additions & 0 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import logging
from pathlib import Path
from strkit.convert.constants import FORMAT_TRF, FORMAT_BED4
from strkit.convert.converter import convert

TRF_CONVERT_SORT_DAT = Path(__file__).parent / "data" / "trf_convert_sort.dat"

logger = logging.getLogger(__name__)


def test_convert_trf_dat_to_bed4(capsys):
res = convert(str(TRF_CONVERT_SORT_DAT), FORMAT_TRF, FORMAT_BED4, sort=False, logger=logger)
assert res == 0
captured = capsys.readouterr()
assert captured.out == "Chr_Y.H1\t569\t609\tGGGCGGCGGGGGAAAAAAG\nChr_Y.H1\t0\t62\tTATCACCGGGGAGGAGGAGGGTAGTCAG\n"


def test_convert_trf_dat_to_bed4_sorted(capsys):
res = convert(str(TRF_CONVERT_SORT_DAT), FORMAT_TRF, FORMAT_BED4, sort=True, logger=logger)
assert res == 0
captured = capsys.readouterr()
assert captured.out == "Chr_Y.H1\t0\t62\tTATCACCGGGGAGGAGGAGGGTAGTCAG\nChr_Y.H1\t569\t609\tGGGCGGCGGGGGAAAAAAG\n"