Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Benchmark workflow: builds the RAM tools, converts a public alignment file,
# times conversion and region queries, and publishes the rendered report.
# Runs on manual dispatch, or on pull requests carrying the `run-benchmarks`
# label.
name: Benchmarks

on:
  workflow_dispatch:
    inputs:
      alignment_url:
        description: SAM/SAM.gz/BAM URL
        default: "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00097/alignment/HG00097.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam"
      reference_url:
        description: Reference FASTA/FASTA.gz URL
        default: "http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.11.fa.gz"
      regions:
        description: Space-separated regions
        default: "11:5000000-6000000 11:65000000-66000000 11:120000000-121000000"
  pull_request:
    types: [labeled, synchronize]

jobs:
  benchmark:
    # Pull-request runs are opt-in through the `run-benchmarks` label.
    if: >-
      github.event_name == 'workflow_dispatch' ||
      contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    runs-on: ubuntu-24.04
    permissions:
      contents: read
      pull-requests: write
    # `inputs.*` is empty on pull_request events, so the dispatch defaults are
    # repeated here as fallbacks.
    env:
      ALIGNMENT_URL: ${{ inputs.alignment_url || 'http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/HG00097/alignment/HG00097.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam' }}
      REFERENCE_URL: ${{ inputs.reference_url || 'http://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.11.fa.gz' }}
      REGIONS: ${{ inputs.regions || '11:5000000-6000000 11:65000000-66000000 11:120000000-121000000' }}
    steps:
      - uses: actions/checkout@v5
        with:
          # Full history so the base branch can be checked out as a worktree.
          fetch-depth: 0

      - name: Install dependencies
        run: |
          wget -O root.tar.gz https://root.cern/download/root_v6.36.00.Linux-ubuntu24.04-x86_64-gcc13.3.tar.gz
          sudo tar -xzf root.tar.gz -C /opt/
          echo "/opt/root/bin" >> "$GITHUB_PATH"
          sudo apt-get update
          sudo apt-get install -y libvdt-dev libtbb-dev libhts-dev samtools

      - name: Build
        run: |
          source /opt/root/bin/thisroot.sh
          cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DRAMTOOLS_BUILD_TESTS=OFF -DRAMTOOLS_BUILD_BENCHMARKS=OFF
          cmake --build build --target samtoramntuple ramntupleview -j "$(nproc)"

      # Normalises whatever was downloaded (BAM, gzipped SAM or plain SAM;
      # gzipped or plain FASTA) into data/input.sam and data/reference.fa.
      - name: Prepare data
        run: |
          mkdir -p data results
          curl -fL "$ALIGNMENT_URL" -o data/alignment
          if samtools quickcheck data/alignment 2>/dev/null; then
            samtools view -h -o data/input.sam data/alignment
          elif file data/alignment | grep -qi gzip; then
            gunzip -c data/alignment > data/input.sam
          else
            mv data/alignment data/input.sam
          fi
          rm -f data/alignment
          curl -fL "$REFERENCE_URL" -o data/ref.gz
          if file data/ref.gz | grep -qi gzip; then
            gunzip -c data/ref.gz > data/reference.fa
          else
            mv data/ref.gz data/reference.fa
          fi
          rm -f data/ref.gz
          samtools faidx data/reference.fa
          df -h /

      - name: Benchmark PR
        run: |
          source /opt/root/bin/thisroot.sh
          python3 scripts/run_benchmark.py --ram results/pr.ram --output results/pr.json

      # Best effort: if the base branch fails to build or run, the report
      # falls back to absolute numbers for the PR only.
      - name: Benchmark base branch
        if: github.event_name == 'pull_request'
        continue-on-error: true
        env:
          # Passed through the environment rather than interpolated into the
          # script, so the branch name is never evaluated as shell code.
          BASE_REF: ${{ github.event.pull_request.base.ref }}
        run: |
          source /opt/root/bin/thisroot.sh
          git worktree add ../ramtools-base "origin/${BASE_REF}"
          cmake -S ../ramtools-base -B ../ramtools-base/build -DCMAKE_BUILD_TYPE=Release -DRAMTOOLS_BUILD_TESTS=OFF -DRAMTOOLS_BUILD_BENCHMARKS=OFF
          cmake --build ../ramtools-base/build --target samtoramntuple ramntupleview -j "$(nproc)"
          python3 scripts/run_benchmark.py --build-dir ../ramtools-base/build --ram results/base.ram --no-cram --output results/base.json

      - name: Render report
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          PR_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          ARGS="results/pr.json --pr-sha $PR_SHA --output results/benchmark-summary.md"
          if [ -f results/base.json ]; then
            ARGS="$ARGS --base results/base.json --base-sha $BASE_SHA"
          fi
          # $ARGS is intentionally unquoted so it splits into separate arguments.
          python3 scripts/render_benchmark.py $ARGS

      - name: Step summary
        if: hashFiles('results/benchmark-summary.md') != ''
        run: cat results/benchmark-summary.md >> "$GITHUB_STEP_SUMMARY"

      - name: PR comment
        if: github.event_name == 'pull_request' && hashFiles('results/benchmark-summary.md') != ''
        uses: marocchino/sticky-pull-request-comment@v2
        with:
          header: ramtools-benchmark
          path: results/benchmark-summary.md

      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: |
            results/pr.json
            results/base.json
            results/benchmark-summary.md
134 changes: 134 additions & 0 deletions scripts/render_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import argparse
import json
import pathlib
import statistics


def fmt_size(n):
    """Format a byte count with two decimals and a binary-scaled unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (TB) is reached, e.g. ``1536`` -> ``"1.50 KB"``.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    idx = 0
    last = len(units) - 1
    # Step up one unit at a time; TB is the ceiling however large n is.
    while idx < last and not n < 1024:
        n /= 1024
        idx += 1
    return f"{n:.2f} {units[idx]}"


def fmt_dist(samples):
    """Summarise timing samples (in seconds) as ``mean ± stddev unit``.

    Returns ``"n/a"`` for an empty sample list. The unit (μs, ms or s) is
    chosen from the mean; the ``± stddev`` part is omitted when there is a
    single sample or the samples have zero spread.
    """
    if not samples:
        return "n/a"

    average = statistics.mean(samples)
    spread = 0.0 if len(samples) <= 1 else statistics.stdev(samples)

    # First threshold the mean falls under decides the unit; seconds otherwise.
    for limit, unit, scale in ((1e-3, "μs", 1e6), (1, "ms", 1e3)):
        if average < limit:
            break
    else:
        unit, scale = "s", 1.0

    text = f"{average * scale:.3g}"
    if spread > 0:
        text += f" ± {spread * scale:.2g}"
    return f"{text} {unit}"


def ratio(base, pr):
    """Return mean(base) / mean(pr) formatted to two decimals.

    Yields ``"-"`` when either sample list is empty or the PR mean is zero.
    """
    if base and pr:
        denominator = statistics.mean(pr)
        if denominator:
            return f"{statistics.mean(base) / denominator:.2f}"
    return "-"


def region_label(q):
    """Build the table label for a query entry.

    The region is rendered as inline code; when the entry carries a
    non-``None`` ``records`` count it is appended as ``(N reads)`` with
    thousands separators.
    """
    label = f"`{q['region']}`"
    count = q.get("records")
    if count is None:
        return label
    return f"{label} ({count:,} reads)"


def records_line(pr):
    """Render the ``**Records:**`` line comparing record counts.

    The SAM count is the reference; RAM and (if present) CRAM counts are
    each tagged ``match`` when equal to it and ``MISMATCH`` otherwise,
    including when the SAM count is unknown.
    """
    sam = pr.get("sam_records")
    ram = pr["ram"].get("records")
    cram = pr.get("cram", {}).get("records")

    parts = ["SAM `?`" if sam is None else f"SAM `{sam:,}`"]
    for label, count in (("RAM", ram), ("CRAM", cram)):
        if count is None:
            continue
        verdict = "MISMATCH" if sam is None or count != sam else "match"
        parts.append(f"{label} `{count:,}` ({verdict})")
    return "**Records:** " + " / ".join(parts)


def render_comparison(pr, base, base_sha, pr_sha):
    """Build the Markdown lines of the base-vs-PR comparison table.

    Covers RAM file size, RAM conversion time and one row per region query,
    each with a ``base / PR`` ratio column. Queries from the two result sets
    are paired by position. Returns a list of lines ending with an empty one.
    """
    base_label = "base" if not base_sha else f"base ({base_sha[:7]})"
    pr_label = "PR" if not pr_sha else f"PR ({pr_sha[:7]})"
    base_ram, pr_ram = base["ram"], pr["ram"]
    size_ratio = base_ram["size_bytes"] / pr_ram["size_bytes"]

    rows = [
        f"Comparing **{base_label}** vs **{pr_label}** on `{fmt_size(pr['sam_size_bytes'])}` SAM input",
        "",
        f"| Benchmark | {base_label} | {pr_label} | base / PR |",
        "|-----------|------|------|----------:|",
        f"| RAM size | {fmt_size(base_ram['size_bytes'])} | {fmt_size(pr_ram['size_bytes'])} "
        f"| {size_ratio:.2f} |",
        f"| RAM conversion | {fmt_dist(base_ram['conversion_seconds'])} "
        f"| {fmt_dist(pr_ram['conversion_seconds'])} "
        f"| {ratio(base_ram['conversion_seconds'], pr_ram['conversion_seconds'])} |",
    ]
    rows.extend(
        f"| query {region_label(pr_q)} | {fmt_dist(base_q['seconds'])} | {fmt_dist(pr_q['seconds'])} "
        f"| {ratio(base_q['seconds'], pr_q['seconds'])} |"
        for base_q, pr_q in zip(base_ram["queries"], pr_ram["queries"])
    )
    rows.append("")
    return rows


def render_absolute(pr, pr_sha):
    """Build the Markdown lines of the PR-only results table.

    Used when no base results are available: lists RAM size, RAM conversion
    time and one row per region query. Returns a list of lines ending with
    an empty one.
    """
    pr_label = "PR" if not pr_sha else f"PR ({pr_sha[:7]})"
    ram = pr["ram"]

    rows = [
        f"**{pr_label}** on `{fmt_size(pr['sam_size_bytes'])}` SAM input",
        "",
        "| Benchmark | Value |",
        "|-----------|-------|",
        f"| RAM size | {fmt_size(ram['size_bytes'])} |",
        f"| RAM conversion | {fmt_dist(ram['conversion_seconds'])} |",
    ]
    rows.extend(
        f"| query {region_label(query)} | {fmt_dist(query['seconds'])} |"
        for query in ram["queries"]
    )
    rows.append("")
    return rows


def render_cram(pr):
    """Build the Markdown lines of the CRAM reference section.

    Reports CRAM size (also as a percentage of the SAM input size),
    conversion and indexing times, and one row per region query. Returns a
    list of lines ending with an empty one.
    """
    cram = pr["cram"]
    sam_bytes = pr["sam_size_bytes"]
    share_of_sam = cram["size_bytes"] / sam_bytes

    table = [
        "### CRAM 3.1 reference (archive preset: fqzcomp + name tokenizer + rANS)",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| CRAM size | {fmt_size(cram['size_bytes'])} ({share_of_sam:.1%} of SAM) |",
        f"| CRAM conversion | {fmt_dist(cram['conversion_seconds'])} |",
        f"| CRAM index | {fmt_dist(cram['index_seconds'])} |",
    ]
    table.extend(
        f"| query {region_label(query)} (CRAM) | {fmt_dist(query['seconds'])} |"
        for query in cram["queries"]
    )
    table.append("")
    return table


def main():
    """Render benchmark JSON into a Markdown report.

    Command line:
        pr_json      path to the PR benchmark results (JSON)
        --base       optional path to base-branch results; enables the
                     comparison table instead of the PR-only table
        --base-sha   optional base commit SHA used in column labels
        --pr-sha     optional PR commit SHA used in column labels
        --output     path of the Markdown file to write (required)

    The report is written to ``--output`` and echoed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("pr_json")
    ap.add_argument("--base")
    ap.add_argument("--base-sha", default="")
    ap.add_argument("--pr-sha", default="")
    ap.add_argument("--output", required=True)
    args = ap.parse_args()

    # Explicit UTF-8: the report contains non-ASCII characters ("μ", "±"),
    # which a non-UTF-8 locale default encoding cannot represent.
    pr = json.loads(pathlib.Path(args.pr_json).read_text(encoding="utf-8"))
    base = json.loads(pathlib.Path(args.base).read_text(encoding="utf-8")) if args.base else None

    out = ["## Benchmark results", "", records_line(pr), ""]
    if base:
        out += render_comparison(pr, base, args.base_sha, args.pr_sha)
    else:
        out += render_absolute(pr, args.pr_sha)
    if "cram" in pr:
        out += render_cram(pr)

    text = "\n".join(out)
    out_path = pathlib.Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(text, encoding="utf-8")
    print(text)


if __name__ == "__main__":
    main()
123 changes: 123 additions & 0 deletions scripts/run_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import argparse
import json
import os
import pathlib
import re
import subprocess
import time

# SAM input that is counted with samtools and converted to RAM and CRAM.
SAM = "data/input.sam"
# Reference FASTA passed to samtools (-T) when writing the CRAM file.
REF = "data/reference.fa"
# Output path of the CRAM file; its index is expected at this path + ".crai".
CRAM = "results/input.cram"
# Number of timed runs per region query.
QUERY_REPEAT = 3


def measure(cmd, n, cleanup=None):
    """Run ``cmd`` ``n`` times and record the wall-clock time of each run.

    Args:
        cmd: Argument list passed to ``subprocess.run``.
        n: Number of runs.
        cleanup: Optional zero-argument callable invoked before every run,
            outside the timed section.

    Returns:
        ``(samples, last)`` where ``samples`` is the list of elapsed seconds
        and ``last`` is the combined stdout/stderr text of the final run
        (``""`` when ``n`` is 0).

    Raises:
        subprocess.CalledProcessError: If a run exits non-zero. The captured
            output is printed first, since it would otherwise be lost.
    """
    samples, last = [], ""
    for _ in range(n):
        if cleanup:
            cleanup()
        start = time.perf_counter()
        try:
            proc = subprocess.run(
                cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True
            )
        except subprocess.CalledProcessError as exc:
            # Output is captured, so without this the log would only show the
            # exit status and none of the tool's own diagnostics.
            if exc.stdout:
                print(exc.stdout, end="", flush=True)
            raise
        samples.append(time.perf_counter() - start)
        last = proc.stdout
    return samples, last


def parse_records(output):
    """Extract the record count from ``Found <N> records`` in tool output.

    When the phrase occurs several times the last occurrence wins; returns
    ``None`` when it does not occur at all.
    """
    count = None
    for hit in re.finditer(r"Found\s+(\d+)\s+records", output):
        count = int(hit.group(1))
    return count


def count_int(cmd):
    """Run ``cmd`` and return its stdout parsed as an integer.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit and
    ``ValueError`` when stdout is not a single integer.
    """
    completed = subprocess.run(cmd, check=True, capture_output=True, text=True)
    return int(completed.stdout.strip())


def count_ram(cmd):
    """Run ``cmd`` and return the record count reported on its stdout.

    The count is taken from the ``Found <N> records`` message via
    ``parse_records``; ``None`` is returned when the message is absent.
    """
    completed = subprocess.run(cmd, check=True, capture_output=True, text=True)
    return parse_records(completed.stdout)


def _last_int(output):
    """Return the last line of ``output`` that consists only of digits.

    Raises ``ValueError`` when no such line exists.
    """
    for line in reversed(output.splitlines()):
        token = line.strip()
        if re.fullmatch(r"[0-9]+", token):
            return int(token)
    raise ValueError(f"no integer count found in command output: {output!r}")


def main():
    """Benchmark SAM -> RAM conversion and region queries, optionally vs CRAM.

    Command line:
        --build-dir  build tree containing ``tools/samtoramntuple`` and
                     ``tools/ramntupleview`` (default ``build``)
        --ram        path of the RAM file to produce
                     (default ``results/input.ram``)
        --no-cram    skip the CRAM reference measurements
        --output     path of the JSON results file (required)

    Regions to query are read from the space-separated ``REGIONS``
    environment variable. Results are written to ``--output`` and echoed
    to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--build-dir", default="build")
    ap.add_argument("--ram", default="results/input.ram")
    ap.add_argument("--no-cram", action="store_true")
    ap.add_argument("--output", required=True)
    args = ap.parse_args()

    threads = str(os.cpu_count() or 2)
    regions = os.environ.get("REGIONS", "").split()
    ram = pathlib.Path(args.ram)
    cram = pathlib.Path(CRAM)
    ram.parent.mkdir(parents=True, exist_ok=True)

    sam_records = count_int(["samtools", "view", "-c", SAM])

    # Conversion is timed once; any stale output is removed before the run.
    convert, _ = measure(
        [f"{args.build_dir}/tools/samtoramntuple", SAM, str(ram)],
        1,
        cleanup=lambda: ram.unlink(missing_ok=True),
    )
    ram_total = count_ram([f"{args.build_dir}/tools/ramntupleview", str(ram)])
    ram_queries = []
    for r in regions:
        samples, out = measure([f"{args.build_dir}/tools/ramntupleview", str(ram), r], QUERY_REPEAT)
        ram_queries.append({"region": r, "seconds": samples, "records": parse_records(out)})

    result = {
        "sam_size_bytes": os.path.getsize(SAM),
        "sam_records": sam_records,
        "ram": {
            "size_bytes": ram.stat().st_size,
            "records": ram_total,
            "conversion_seconds": convert,
            "queries": ram_queries,
        },
    }

    if not args.no_cram:
        crai = pathlib.Path(str(cram) + ".crai")

        def rm_cram():
            cram.unlink(missing_ok=True)
            crai.unlink(missing_ok=True)

        cram_convert, _ = measure(
            [
                "samtools", "view", "-@", threads, "-C", "-T", REF,
                "--output-fmt-option", "version=3.1",
                "--output-fmt-option", "archive=1",
                "--output-fmt-option", "use_fqz=1",
                "--output-fmt-option", "use_tok=1",
                "--output-fmt-option", "use_arith=1",
                "-o", str(cram), SAM,
            ],
            1,
            cleanup=rm_cram,
        )
        cram_index, _ = measure(
            ["samtools", "index", "-@", threads, str(cram)],
            1,
            cleanup=lambda: crai.unlink(missing_ok=True),
        )
        cram_total = count_int(["samtools", "view", "-@", threads, "-c", str(cram)])
        cram_queries = []
        for r in regions:
            samples, out = measure(
                ["samtools", "view", "-@", threads, "-c", "-F", "2308", str(cram), r], QUERY_REPEAT
            )
            # measure() merges stderr into stdout, so warnings may surround
            # the count; take the last purely numeric line, not the whole text.
            cram_queries.append({"region": r, "seconds": samples, "records": _last_int(out)})
        result["cram"] = {
            "size_bytes": cram.stat().st_size,
            "records": cram_total,
            "conversion_seconds": cram_convert,
            "index_seconds": cram_index,
            "queries": cram_queries,
        }

    payload = json.dumps(result, indent=2)
    out_path = pathlib.Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(payload + "\n", encoding="utf-8")
    print(payload)


if __name__ == "__main__":
    main()
Loading