Skip to content

Commit 6b05f15

Browse files
Mike Lee
authored and committed
help menu tweaks
1 parent 645f004 commit 6b05f15

3 files changed

Lines changed: 20 additions & 66 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
1414
-->
1515

16-
## v1.14.0 (NOT RELEASED YET)
16+
## v1.14.0 (07-Apr-2026)
1717

1818
### Added
1919
- to `bit-cov-analyzer`
@@ -33,6 +33,7 @@
3333
- default window size changed from 50 to 100, and default step size changed from 10 to 20
3434
- drastic improvements to efficiency when working with large genomes (e.g., 3GB)
3535
- histogram of coverages no longer plotted by default, only done now when adding the `--write-window-stats` flag
36+
- no longer produces window-coverage-overview.txt as all of that info is captured within window-coverage-overview.tsv
3637
- `bit-get-mapped-reads-pid`
3738
- minor improvements to efficiency
3839
- `bit-get-cov-stats`

bit/cli/cov_analyzer.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,17 @@
66
def main():
77

88
desc = """
9-
This script identifies and pulls out regions of zero-coverage and relatively higher and lower coverage when
10-
given a reference fasta and a bam file. It generates a bed file of the specified window-
11-
and step-size, utilizes mosdepth to get the coverage of those windows, then generates stats
12-
for those windows and pulls out regions with zero coverage and coverage above and below specified thresholds.
13-
It outputs a table of coverage stats for all windows (if requested), a table of merged adjacent windows
14-
("regions"), and identified regions in fasta format. By default it looks for coverage variations
15-
from the global mean coverage, but you can tell it to use per-contig coverages by adding the
16-
`--per-contig` flag. Additionally, it is recommended to exclude contigs holding mitochondrial
17-
genomes or chloroplasts due to their generally very relative high coverage, unless you
18-
specifically want to investigate them too (and if so, you should probably use `--per-contig` mode). For
19-
version info, run `bit-version`.
9+
This program analyzes coverage patterns given a reference fasta and a bam file as inputs.
10+
11+
It generates a bed file of the specified window- and step-size, utilizes mosdepth
12+
to get the coverage of those windows, then generates stats for those windows and
13+
pulls out regions with zero coverage and regions with coverage above and below specified thresholds.
14+
Primary outputs include tables and fastas of all merged adjacent windows ("regions"). By default, it
15+
looks for coverage variations from the global mean coverage, but you can tell it to use per-contig
16+
coverages by adding the `--per-contig` flag. Additionally, it is recommended to exclude contigs
17+
holding mitochondrial genomes or chloroplasts due to their relatively high coverage (if you
18+
specifically want to investigate them too, you should probably use `--per-contig` mode).
19+
For version info, run `bit-version`.
2020
"""
2121

2222
parser = argparse.ArgumentParser(
@@ -95,7 +95,7 @@ def main():
9595
"-w",
9696
"--window-size",
9797
metavar="<INT>",
98-
help='Sliding window size (default: 100)',
98+
help='Window size (default: 100)',
9999
type=int,
100100
default=100,
101101
)
@@ -111,21 +111,21 @@ def main():
111111
"-g",
112112
"--allowed-gap",
113113
metavar="<INT>",
114-
help='Number of bases allowed between qualifying windows (those with coverages above/below the specified thresholds) to still merge them into one contiguous region (default: 1000)',
114+
help='Number of bases allowed between qualifying windows to still merge them into one contiguous region (default: 1000)',
115115
type=int,
116116
default=1000,
117117
)
118118
optional.add_argument(
119119
"-B",
120120
"--buffer",
121121
metavar="<INT>",
122-
help='Add this length to each side of a high/low region of interest when pulled out as fasta (default: 100)',
122+
help='Add this length to each side of a region of interest when pulled out as fasta (default: 100)',
123123
type=int,
124124
default=100,
125125
)
126126
optional.add_argument(
127127
"--write-window-stats",
128-
help='Add this flag to also write out individual window stats and histogram of coverages (saves spacetime by not doing so)',
128+
help='Add this flag to also write out individual window stats and a histogram of coverages (saves spacetime by not doing so)',
129129
action="store_true",
130130
)
131131

bit/modules/cov_analyzer.py

Lines changed: 3 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,6 @@ def generate_outputs(reference_fasta, high_merged_regions, low_merged_regions,
563563

564564

565565
def write_window_cov_stats(cov_stats, output_dir, contig_lengths, log_file):
566-
out_txt = f"{output_dir}/window-coverage-overview.txt"
567566
out_tsv = f"{output_dir}/window-coverage-overview.tsv"
568567
percentiles = [0.01, 0.1, 1, 5, 10, 25, 50, 75, 90, 95, 99, 99.9, 99.99]
569568

@@ -598,58 +597,12 @@ def write_window_cov_stats(cov_stats, output_dir, contig_lengths, log_file):
598597
**{f"p{p}": group["cov"].quantile(p/100) for p in percentiles}
599598
})
600599

601-
# writing assembly level
602-
with open(out_txt, "w") as out:
603-
out.write("====================================\n")
604-
out.write("=== GLOBAL WINDOW COVERAGE STATS ===\n")
605-
out.write("====================================\n")
606-
out.write(f"Assembly size: {assembly_length:,} bp\n")
607-
out.write(f"Number of windows: {len(cov_stats.df)}\n")
608-
out.write(f"Mean: {cov_stats.global_mean:.2f}\n")
609-
out.write(f"Median: {cov_stats.global_median:.2f}\n")
610-
out.write(f"Std: {cov_stats.global_std:.2f}\n")
611-
out.write(f"Min: {cov_stats.global_min:.2f}\n")
612-
out.write(f"Max: {cov_stats.global_max:.2f}\n\n")
613-
614-
out.write("Percentiles (global):\n")
615-
for p in percentiles:
616-
val = cov_stats.df["cov"].quantile(p/100)
617-
out.write(f" {p:>6.2f}% : {val:.2f}\n")
618-
619-
# now writing per-contig
620-
out.write("\n========================================\n")
621-
out.write("=== PER-CONTIG WINDOW COVERAGE STATS ===\n")
622-
out.write("========================================\n")
623-
for contig, group in cov_stats.df.groupby("contig", sort=False):
624-
out.write(f"\n-- {contig} --\n")
625-
contig_length = contig_lengths.get(contig, 0)
626-
gm = group["cov"].mean()
627-
md = group["cov"].median()
628-
sd = group["cov"].std()
629-
mn = group["cov"].min()
630-
mx = group["cov"].max()
631-
out.write(f"Contig size: {contig_length:,} bp\n")
632-
out.write(f"Number of windows: {len(group)}\n")
633-
out.write(f"Mean: {gm:.2f}\n")
634-
out.write(f"Median: {md:.2f}\n")
635-
out.write(f"Std: {sd:.2f}\n")
636-
out.write(f"Min: {mn:.2f}\n")
637-
out.write(f"Max: {mx:.2f}\n")
638-
out.write("\nPercentiles:\n")
639-
for p in percentiles:
640-
val = group["cov"].quantile(p/100)
641-
out.write(f" {p:>6.2f}% : {val:.2f}\n")
642-
out.write("\n")
643-
644-
tee(f"\n Window-coverage summary written to:\n {Fore.YELLOW}{out_txt}{Fore.RESET}", log_file)
645-
646-
# writing out as tsv
647600
df_summary = pd.DataFrame(rows)
648601
cols = ["name","length_bp","num_windows","mean","median","std","min","max"] \
649602
+ [f"p{p}" for p in percentiles]
650603
df_summary.to_csv(out_tsv, sep="\t", index=False, float_format="%.2f", columns=cols)
651604

652-
tee(f" Window-coverage summary table written to:\n {Fore.YELLOW}{out_tsv}{Fore.RESET}", log_file)
605+
tee(f"\n Window-coverage overview written to:\n {Fore.YELLOW}{out_tsv}{Fore.RESET}", log_file)
653606

654607

655608
def write_windows_table(cov_stats, output_dir, log_file):
@@ -697,10 +650,10 @@ def write_regions_of_interest_table(merged_regions, output_dir, type, log_file):
697650
if len(sorted_df) > 0:
698651
n_low_complexity = int(sorted_df["low_complexity"].sum()) if "low_complexity" in sorted_df.columns else 0
699652
lc_note = f" ({n_low_complexity} flagged as low-complexity)" if n_low_complexity > 0 else ""
700-
tee(f"\n Number of {type}-coverage regions identified: {Fore.YELLOW}{len(sorted_df)}{Fore.RESET}{lc_note}", log_file)
653+
tee(f"\n\n Number of {type}-coverage regions identified: {Fore.YELLOW}{len(sorted_df)}{Fore.RESET}{lc_note}", log_file)
701654
tee(f" {type.capitalize()}-coverage regions-of-interest table written to:\n {Fore.YELLOW}{out_path}{Fore.RESET}", log_file)
702655
else:
703-
tee(f"\n No {type}-coverage regions-of-interest identified.", log_file)
656+
tee(f"\n\n No {type}-coverage regions-of-interest identified.", log_file)
704657

705658

706659
def write_regions_fasta(reference_fasta, regions_df, buffer, output_dir, type, contig_lengths):

0 commit comments

Comments
 (0)