diff --git a/.gitignore b/.gitignore index 911648e..27e686c 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ test-*.bash *.bam *.bai *.bcf +*.dat *.fasta *.fa.gz *.fa.gz.fai @@ -24,6 +25,7 @@ test-*.bash *.log *.bed !tests/data/*.bed +!tests/data/*.dat /*.json *.png *.time diff --git a/README.md b/README.md index 6f2a884..05f3b27 100644 --- a/README.md +++ b/README.md @@ -426,8 +426,9 @@ Some other tools, e.g., [Straglr](https://github.com/bcgsc/straglr), also take a BED as locus catalog input. However, other formats representing a catalog of STRs exist: * [Tandem Repeats Finder](https://github.com/Benson-Genomics-Lab/TRF) outputs a `.dat` file with a lot - of information. Once converted to a BED file (see below), this can be used as-is with STRkit, but + of information. Once converted to a BED file, this can be used as-is with STRkit, but it's safer for other tools to convert to a four-column BED format. + See "[Converting TRF DAT files to BED](./docs/convert_trf_to_bed.md)" for more on how to do this. * STRkit supports using [IUPAC codes] to specify motifs. Manual post-processing of TRF output would be needed to do this. * [TRGT uses a custom repeat definition format](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md), @@ -440,6 +441,8 @@ BED as locus catalog input. However, other formats representing a catalog of STR The `strkit convert` sub-command requires an input format (`bed4`, `trf` or `trgt`), an output format (many, see `strkit convert --help`), and an input file. Output is written to `stdout`. +If you want the output **sorted** (which many tools prefer or require), use the `--sort` flag. + *Note:* Not all input/output format pairs have available converter functions; an error will be printed to `stderr` if one does not exist. diff --git a/docs/convert_trf_to_bed.md b/docs/convert_trf_to_bed.md new file mode 100644 index 0000000..121eebb --- /dev/null +++ b/docs/convert_trf_to_bed.md @@ -0,0 +1,20 @@ +# Converting TRF DAT files to BED + +[Tandem Repeats Finder](https://tandem.bu.edu/trf/trf.html) ([Benson 1999](https://pubmed.ncbi.nlm.nih.gov/9862982/)) +is a widely-used tool for discovering tandem repeat regions in genomes. However, its output is in the form of a `.dat` +file, which is not always convenient for use in other programs (e.g., TR callers, web browsers) + +STRkit provides a built-in subcommand, `strkit convert`, which can (among other things) convert TRF files from the TRF +`.dat` format to either a standard four-column BED file, or a UCSC-style TRF BED file. + +To convert a TRF `.dat` file to a sorted four-column BED using STRkit, run the following command: + +```bash +strkit convert --in-format trf --out-format bed4 --sort trf.dat > repeats.bed +``` + +To convert a TRF `.dat` file to a sorted UCSC-style TRF BED file, run the following command instead: + +```bash +strkit convert --in-format trf --out-format trf --sort trf.dat > trf.bed +``` diff --git a/strkit/convert/_bed_4.py b/strkit/convert/_bed_4.py index 4e1b998..417398a 100644 --- a/strkit/convert/_bed_4.py +++ b/strkit/convert/_bed_4.py @@ -1,12 +1,11 @@ -import sys from logging import Logger -from typing import Iterable +from typing import Generator, Iterable __all__ = [ "trf_to_bed_4", ] -def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger): +def trf_to_bed_4(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]: for item in trf_data: - sys.stdout.write("\t".join((*item[:3], item[-1])) + "\n") + yield "\t".join((*item[:3], item[-1])) + "\n" diff --git a/strkit/convert/converter.py b/strkit/convert/converter.py index 33c7c28..79c660d 100644 --- a/strkit/convert/converter.py +++ b/strkit/convert/converter.py @@ -1,5 +1,7 @@ +import sys + from logging import Logger -from typing import Callable, Iterable, TypeAlias +from typing import Callable, Generator, Iterable, TextIO, TypeAlias from ._bed_4 import trf_to_bed_4 from .constants import FORMAT_BED4, FORMAT_TRF, FORMAT_TRGT, CONVERTER_IN_FORMATS @@ -16,9 +18,9 @@ "convert", ] -ConverterFn: TypeAlias = Callable[[Iterable, Logger], None] +ConverterFn: TypeAlias = Callable[[Iterable, Logger], Generator[str, None, None]] -convert_formats: dict[tuple[str | tuple[str, ...], str], Callable[[Iterable, Logger], None]] = { +convert_formats: dict[tuple[str | tuple[str, ...], str], ConverterFn] = { # TRF DAT to BED converter: (FORMAT_TRF, FORMAT_TRF): trf_passthrough, # BED4/TRF BED converters: @@ -40,7 +42,25 @@ CONVERTER_OUTPUT_FORMATS: tuple[str, ...] = tuple(sorted(set(k[1] for k in convert_formats))) -def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> int: +def _load_bed_like(fh: TextIO, sort: bool) -> list: + data = [] + contig_order = [] + + for line in map(lambda s: s.split("\t"), map(str.strip, fh)): + if not contig_order or line[0] != contig_order[-1]: + contig_order.append(line[0]) + data.append([line[0], int(line[1]), int(line[2]), *line[3:]] if sort else line) + + if sort: + data.sort(key=lambda dd: (contig_order.index(dd[0]), dd[1], dd[2])) + for d in data: + d[1] = str(d[1]) + d[2] = str(d[2]) + + return data + + +def convert(in_file: str, in_format: str, out_format: str, sort: bool, logger: Logger) -> int: out_format = out_format.lower() is_trf_dat: bool = False @@ -71,10 +91,12 @@ def convert(in_file: str, in_format: str, out_format: str, logger: Logger) -> in with open(in_file, "r") as tf: if is_trf_dat: - data = trf_dat_to_bed(tf) + data = trf_dat_to_bed(tf, sort) else: - data = [line.strip().split("\t") for line in tf] + # in all other cases, we have various forms of BED files + data = _load_bed_like(tf, sort) # noinspection PyCallingNonCallable - converter(data, logger) + for line in converter(data, logger): + sys.stdout.write(line) return 0 diff --git a/strkit/convert/expansionhunter.py b/strkit/convert/expansionhunter.py index 34ef561..e53733d 100644 --- a/strkit/convert/expansionhunter.py +++ b/strkit/convert/expansionhunter.py @@ -1,14 +1,13 @@ import json -import sys from logging import Logger -from typing import Iterable +from typing import Generator, Iterable __all__ = [ "trf_bed_to_eh", ] -def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger): +def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]: eh_formatted_loci = [] for i, item in enumerate(trf_data, 1): @@ -19,4 +18,4 @@ def trf_bed_to_eh(trf_data: Iterable[list], _logger: Logger): "VariantType": "Repeat", }) - sys.stdout.write(json.dumps(eh_formatted_loci, indent=2)) + yield json.dumps(eh_formatted_loci, indent=2) diff --git a/strkit/convert/gangstr.py b/strkit/convert/gangstr.py index 30a43bf..02c47d9 100644 --- a/strkit/convert/gangstr.py +++ b/strkit/convert/gangstr.py @@ -1,12 +1,11 @@ -import sys from logging import Logger -from typing import Iterable +from typing import Generator, Iterable __all__ = [ "trf_bed_to_gangstr", ] -def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger): +def trf_bed_to_gangstr(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]: for i, item in enumerate(trf_data, 1): - sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n") + yield "\t".join((*item[:3], str(len(item[-1])), item[-1])) + "\n" diff --git a/strkit/convert/hipstr.py b/strkit/convert/hipstr.py index bfce7a3..d7d8393 100644 --- a/strkit/convert/hipstr.py +++ b/strkit/convert/hipstr.py @@ -1,12 +1,11 @@ -import sys from logging import Logger -from typing import Iterable +from typing import Generator, Iterable __all__ = [ "trf_bed_to_hipstr", ] -def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger): +def trf_bed_to_hipstr(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]: for i, item in enumerate(trf_data, 1): - sys.stdout.write("\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n") + yield "\t".join((*item[:3], str(len(item[-1])), str(round(float(item[5]))))) + "\n" diff --git a/strkit/convert/trf.py b/strkit/convert/trf.py index 460eaaa..2ea2fcf 100644 --- a/strkit/convert/trf.py +++ b/strkit/convert/trf.py @@ -1,5 +1,3 @@ -import sys - from logging import Logger from typing import Generator, Iterable, TextIO @@ -27,18 +25,40 @@ # Output BED columns: same as above, but with contig, with "trf" as the 4th column, and without sequence -def trf_dat_to_bed(file: TextIO) -> Generator[list, None, None]: +def trf_dat_to_bed(file: TextIO, sort: bool) -> Generator[list, None, None]: + res = [] # only used if sort=True + + def _output_sorted_res(): + res.sort(key=lambda dd: (dd[1], dd[2])) + for r in res: + r[1] = str(r[1]) + r[2] = str(r[2]) + yield r + res.clear() + current_contig = "" for line in map(str.strip, file): if not line: continue + if line.startswith(TRF_SEQUENCE_PREFIX): current_contig = line.removeprefix(TRF_SEQUENCE_PREFIX) + if sort: + yield from _output_sorted_res() + data = line.split(" ") if data[0].isdigit() and len(data) == 15: # data column - yield [current_contig, *data[:2], "trf", *data[2:]] + # TRF start coord is 1-indexed + start = int(data[0]) - 1 + end = int(data[1]) + if sort: + res.append([current_contig, start, end, "trf", *data[2:14]]) + else: + yield [current_contig, str(start), str(end), "trf", *data[2:14]] + + yield from _output_sorted_res() -def trf_passthrough(trf_data: Iterable[list], _logger: Logger): +def trf_passthrough(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]: for data in trf_data: - sys.stdout.write("\t".join(data) + "\n") + yield "\t".join(data) + "\n" diff --git a/strkit/convert/trgt.py b/strkit/convert/trgt.py index 99b9738..137a491 100644 --- a/strkit/convert/trgt.py +++ b/strkit/convert/trgt.py @@ -1,6 +1,5 @@ -import sys from logging import Logger -from typing import Literal, Iterable +from typing import Generator, Literal, Iterable __all__ = [ "trgt_bed_to_bed4", @@ -11,7 +10,9 @@ from strkit.iupac import get_iupac_code_for_nt_set -def trgt_bed_to_bed4(trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None): +def trgt_bed_to_bed4( + trgt_data: Iterable[list], logger: Logger, annotations: Literal["strkit"] | None = None +) -> Generator[str, None, None]: """ Converts a TRGT repeat catalog to the STRkit/BED4 catalog format. :param trgt_data: The loaded TRGT catalog (split by tab). @@ -57,20 +58,20 @@ def _fmt_last_col(v: str) -> str: break if not failed: # found a consensus base for the multiple-motif STR, so we can convert it - sys.stdout.write("\t".join((*data[:3], _fmt_last_col("".join(combined_motif_bases)))) + "\n") + yield "\t".join((*data[:3], _fmt_last_col("".join(combined_motif_bases)))) + "\n" continue data_str = "\t".join(data) logger.warning(f"Could not convert complex locus at line {line}; taking first motif: {data_str}") - sys.stdout.write("\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n") + yield "\t".join((*data[:3], _fmt_last_col(motifs[0]))) + "\n" -def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger): - return trgt_bed_to_bed4(trgt_data, logger, annotations="strkit") +def trgt_bed_to_strkit_bed(trgt_data: Iterable[list], logger: Logger) -> Generator[str, None, None]: + yield from trgt_bed_to_bed4(trgt_data, logger, annotations="strkit") -def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger): +def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger) -> Generator[str, None, None]: """ Convets a TRF- or STRkit-formatted BED (motif-last) to a basic version of a TRGT catalog. :param trf_data: The loaded BED catalog data. @@ -79,4 +80,4 @@ def trf_or_strkit_bed_to_trgt(trf_data: Iterable[list], _logger: Logger): for i, item in enumerate(trf_data): motif = item[-1] - sys.stdout.write("\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n") + yield "\t".join((*item[:3], f"ID=locus{i};MOTIFS={motif};STRUC=({motif})n")) + "\n" diff --git a/strkit/entry.py b/strkit/entry.py index bfa02b1..ed405ea 100644 --- a/strkit/entry.py +++ b/strkit/entry.py @@ -463,13 +463,14 @@ def add_cc_parser_args(cc_parser): cc_parser.add_argument("paths", type=str, nargs="+", help="Paths to the BED catalogs to combine.") -def add_cv_parser_args(al_parser): +def add_cv_parser_args(cv_parser): from strkit.convert.converter import CONVERTER_OUTPUT_FORMATS - al_parser.add_argument("in_file", type=str, help="Input file to convert from.") - al_parser.add_argument( + cv_parser.add_argument("in_file", type=str, help="Input file to convert from.") + cv_parser.add_argument( "--in-format", type=str, choices=("trf", "trgt"), default="trf", help="Format to convert from." ) - al_parser.add_argument("--out-format", type=str, choices=CONVERTER_OUTPUT_FORMATS, help="Format to convert to.") + cv_parser.add_argument("--out-format", type=str, choices=CONVERTER_OUTPUT_FORMATS, help="Format to convert to.") + cv_parser.add_argument("--sort", action="store_true", help="Whether to output a sorted BED file (slower).") def add_vs_parser_args(vs_parser): @@ -632,7 +633,7 @@ def _exec_combine_catalogs(p_args): def _exec_convert(p_args): from strkit.convert.converter import convert - return convert(p_args.in_file, p_args.in_format, p_args.out_format, _main_logger(p_args)) + return convert(p_args.in_file, p_args.in_format, p_args.out_format, p_args.sort, _main_logger(p_args)) def _exec_viz_server(p_args): diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/trf_convert_sort.dat b/tests/data/trf_convert_sort.dat new file mode 100644 index 0000000..7602438 --- /dev/null +++ b/tests/data/trf_convert_sort.dat @@ -0,0 +1,17 @@ +Tandem Repeats Finder Program written by: + +Gary Benson +Program in Bioinformatics +Boston University +Version 4.09 + + +Sequence: Chr_Y.H1 + + + +Parameters: 2 5 7 80 10 50 2000 + + +570 609 19 2.1 19 80 0 52 22 7 67 2 1.28 GGGCGGCGGGGGAAAAAAG GGGCGGTGGGGGAAAAGAGGGGCGGCGGGGGAGAGAAGGG +1 62 28 2.2 28 91 0 103 22 16 43 17 1.87 TATCACCGGGGAGGAGGAGGGTAGTCAG TATCACCGGGGAGGAGGAGGGTGGTCAGTATCACCGGGGAGGAGGAGGGTAGTCGTTATCAC diff --git a/tests/test_convert.py b/tests/test_convert.py new file mode 100644 index 0000000..0f6b4a7 --- /dev/null +++ b/tests/test_convert.py @@ -0,0 +1,22 @@ +import logging +from pathlib import Path +from strkit.convert.constants import FORMAT_TRF, FORMAT_BED4 +from strkit.convert.converter import convert + +TRF_CONVERT_SORT_DAT = Path(__file__).parent / "data" / "trf_convert_sort.dat" + +logger = logging.getLogger(__name__) + + +def test_convert_trf_dat_to_bed4(capsys): + res = convert(str(TRF_CONVERT_SORT_DAT), FORMAT_TRF, FORMAT_BED4, sort=False, logger=logger) + assert res == 0 + captured = capsys.readouterr() + assert captured.out == "Chr_Y.H1\t569\t609\tGGGCGGCGGGGGAAAAAAG\nChr_Y.H1\t0\t62\tTATCACCGGGGAGGAGGAGGGTAGTCAG\n" + + +def test_convert_trf_dat_to_bed4_sorted(capsys): + res = convert(str(TRF_CONVERT_SORT_DAT), FORMAT_TRF, FORMAT_BED4, sort=True, logger=logger) + assert res == 0 + captured = capsys.readouterr() + assert captured.out == "Chr_Y.H1\t0\t62\tTATCACCGGGGAGGAGGAGGGTAGTCAG\nChr_Y.H1\t569\t609\tGGGCGGCGGGGGAAAAAAG\n"