Skip to content

Add accuracy, correctness, and performance CI/CD workflows #2

Add accuracy, correctness, and performance CI/CD workflows

Add accuracy, correctness, and performance CI/CD workflows #2

Workflow file for this run

name: Performance Benchmarks
# IMPORTANT – Runner choice and benchmark precision
# ─────────────────────────────────────────────────
# Ephemeral GitHub-hosted runners (ubuntu-latest) share noisy cloud hardware.
# They are useful for *relative* regression detection (e.g. "this PR is 2×
# slower than main") but NOT for absolute latency numbers.
#
# For precise, publication-quality benchmarks the team uses the dedicated
# asap-tools benchmarking infra (used for TurboProm paper experiments).
# That infra must first be decoupled from Cloudlab and registered as a
# GitHub self-hosted runner. Until then, when dispatching this workflow
# manually, set the `runner` input to a self-hosted runner label (e.g.
# `self-hosted`) if such a machine is available; otherwise rely on the
# relative-regression job for PR gating.
#
# References:
# - asap-tools/execution-utilities/asap_benchmark_pipeline/ — H2O groupby
# - asap-tools/execution-utilities/asap_query_latency/ — ClickBench hits
# Triggers:
#   - pull_request: PRs targeting main that touch the benchmarked components,
#     the benchmark tooling, or this workflow file.
#   - workflow_dispatch: manual runs with a selectable runner, row cap and suite.
on:
  pull_request:
    branches:
      - main
    paths:
      - 'asap-summary-ingest/**'
      - 'asap-query-engine/**'
      - 'asap-common/sketch-core/**'
      - 'asap-common/dependencies/**'
      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
      - 'asap-tools/execution-utilities/asap_query_latency/**'
      - '.github/workflows/performance.yml'
  workflow_dispatch:
    inputs:
      runner:
        description: >
          Runner label to use. Use 'ubuntu-latest' for relative regression
          detection on GH-hosted VMs, or a self-hosted runner label (e.g.
          'self-hosted') for precise absolute benchmarks.
        required: false
        default: ubuntu-latest
        type: string
      max_rows:
        description: Rows to ingest (0 = full dataset; reduce for faster CI runs)
        required: false
        default: '100000'
        # Kept as a string so the value reaches the shell exactly as typed.
        type: string
      benchmark_suite:
        description: 'Which suite to run: h2o | query_latency | all'
        required: false
        default: all
        # A choice input prevents typos: the jobs compare this value against
        # exact strings, so a mistyped suite name would silently skip a job.
        type: choice
        options:
          - h2o
          - query_latency
          - all
# Workflow-wide environment, visible to every step of every job.
env:
  # Row cap applied when the run is not a manual dispatch (i.e. PR triggers);
  # sized to keep runtime under 30 min on GH-hosted runners.
  DEFAULT_MAX_ROWS: "100000"
  # Regression threshold read by the latency check: a query is flagged when
  # its ASAP latency exceeds the baseline latency measured in the same CI job
  # by more than this factor.
  LATENCY_REGRESSION_FACTOR: "2.0"
jobs:
  # ── 1. Relative performance regression (H2O groupby) ───────────────────────
  # Runs on every matching pull request (on ubuntu-latest) and on manual
  # dispatch when the selected suite is `h2o` or `all`. It benchmarks the
  # ClickHouse baseline and the ASAP path in the same job and fails when any
  # query's ASAP/baseline latency ratio exceeds LATENCY_REGRESSION_FACTOR.
  relative-regression:
    name: Relative performance regression (H2O groupby)
    # Honour the benchmark_suite input on manual runs; PR runs are never skipped.
    if: >-
      github.event_name != 'workflow_dispatch' ||
      inputs.benchmark_suite == 'h2o' ||
      inputs.benchmark_suite == 'all'
    # Use the workflow_dispatch runner input when triggered manually;
    # fall back to ubuntu-latest for PR triggers.
    runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || 'ubuntu-latest' }}
    timeout-minutes: 45
    # Least privilege: the job only needs to read the repository.
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests kafka-python gdown matplotlib
          if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then
            pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt
          fi
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build base image
        run: |
          docker build \
            -t sketchdb-base:latest \
            -f asap-common/installation/Dockerfile \
            asap-common
      - name: Build summary-ingest image
        run: |
          docker build \
            -t asap-summary-ingest:ci \
            -f asap-summary-ingest/Dockerfile \
            asap-summary-ingest
      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable
      - name: Install protoc
        run: |
          sudo apt-get update -qq
          sudo apt-get install -y protobuf-compiler
      - name: Run sccache
        uses: mozilla-actions/sccache-action@v0.0.4
      - name: Cache cargo
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
          # Fall back to the newest cache for this OS when manifests changed,
          # so a dependency bump does not force a fully cold build.
          restore-keys: |
            ${{ runner.os }}-cargo-perf-
      - name: Build query engine binary
        run: cargo build --release --bin query_engine_rust --locked
        env:
          RUSTC_WRAPPER: sccache
      - name: Resolve max_rows
        id: config
        env:
          # Passed through the environment rather than interpolated into the
          # script so the user-supplied value is never parsed as shell code.
          # Empty on PR triggers, where DEFAULT_MAX_ROWS applies.
          REQUESTED_MAX_ROWS: ${{ inputs.max_rows }}
        run: |
          max_rows="${REQUESTED_MAX_ROWS:-$DEFAULT_MAX_ROWS}"
          case "$max_rows" in
            ''|*[!0-9]*)
              echo "::error::max_rows must be a non-negative integer, got '$max_rows'"
              exit 1
              ;;
          esac
          echo "max_rows=$max_rows" >> "$GITHUB_OUTPUT"
      # ── Baseline (ClickHouse exact) ──
      - name: Run ClickHouse baseline benchmark
        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
        env:
          MAX_ROWS: ${{ steps.config.outputs.max_rows }}
        run: |
          python run_benchmark.py \
            --mode baseline \
            --load-data \
            --max-rows "$MAX_ROWS" \
            --output /tmp/baseline_perf_results.csv
      # ── ASAP path ──
      - name: Run ASAP benchmark
        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
        env:
          MAX_ROWS: ${{ steps.config.outputs.max_rows }}
        run: |
          python run_benchmark.py \
            --mode asap \
            --load-data \
            --max-rows "$MAX_ROWS" \
            --output /tmp/asap_perf_results.csv \
            --qe-bin "$GITHUB_WORKSPACE/target/release/query_engine_rust"
      # ── Regression check ──
      # Reads LATENCY_REGRESSION_FACTOR from the workflow-level env. Joins the
      # two result CSVs on query_id and exits 1 when any ASAP/baseline latency
      # ratio exceeds the factor. Missing files or an empty join do not fail
      # the job but are surfaced as warning annotations.
      - name: Check for latency regressions
        run: |
          python3 - <<'EOF'
          import csv, os, sys

          factor = float(os.environ["LATENCY_REGRESSION_FACTOR"])
          asap_file = "/tmp/asap_perf_results.csv"
          base_file = "/tmp/baseline_perf_results.csv"

          def load_latency(path):
              # Map query_id -> latency_ms, skipping rows with no latency value.
              with open(path, newline="") as f:
                  return {row["query_id"]: float(row["latency_ms"])
                          for row in csv.DictReader(f)
                          if row.get("latency_ms") not in (None, "", "null")}

          try:
              asap = load_latency(asap_file)
              base = load_latency(base_file)
          except FileNotFoundError as e:
              print(f"::warning::Result file missing: {e}. Skipping regression check.")
              sys.exit(0)

          regressions = []
          compared = 0
          print(f"{'Query':<30} {'ASAP (ms)':>12} {'Baseline (ms)':>14} {'Ratio':>8} {'Status'}")
          print("-" * 72)
          for qid, base_lat in base.items():
              if qid not in asap:
                  continue
              compared += 1
              ratio = asap[qid] / base_lat if base_lat > 0 else float("inf")
              status = "REGRESSION" if ratio > factor else "ok"
              print(f"{qid:<30} {asap[qid]:>12.1f} {base_lat:>14.1f} {ratio:>8.2f} {status}")
              if status == "REGRESSION":
                  regressions.append((qid, ratio))

          if regressions:
              print(f"\n{len(regressions)} regression(s) detected (threshold: {factor}x):")
              for qid, r in regressions:
                  print(f" - {qid}: {r:.2f}x slower than baseline")
              print("\nNOTE: This job runs on ephemeral GH-hosted VMs and is subject to")
              print(" cloud noise. For authoritative numbers use a self-hosted runner.")
              sys.exit(1)

          if compared == 0:
              # An empty comparison is not evidence of "no regressions".
              print("::warning::No query IDs are common to both result files; nothing was compared.")
          else:
              print(f"\nNo regressions detected (threshold: {factor}x).")
          EOF
      # Best-effort: a plotting failure never fails the job, but it is
      # reported as a warning instead of being discarded silently.
      - name: Generate latency comparison plot
        if: always()
        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
        run: |
          python plot_latency.py \
            --asap /tmp/asap_perf_results.csv \
            --baseline /tmp/baseline_perf_results.csv \
            --output /tmp/latency_comparison.png 2>/dev/null || \
          python plot_latency.py 2>/dev/null || \
          echo "::warning::plot_latency.py failed; no latency comparison plot was generated."
      - name: Upload benchmark results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: performance-results-${{ github.run_id }}
          path: |
            /tmp/asap_perf_results.csv
            /tmp/baseline_perf_results.csv
            /tmp/latency_comparison.png
          if-no-files-found: warn
  # ── 2. Query latency micro-benchmark (manual / self-hosted) ─────────────────
  query-latency:
    name: Query latency micro-benchmark
    # Skip on PRs — run manually or on a scheduled trigger once self-hosted
    # runners are available. On GH-hosted VMs the numbers are too noisy to be
    # actionable for absolute latency SLOs.
    if: >-
      github.event_name == 'workflow_dispatch' &&
      (inputs.benchmark_suite == 'query_latency' || inputs.benchmark_suite == 'all')
    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
    timeout-minutes: 60
    # Least privilege: the job only needs to read the repository.
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
      - name: Self-hosted runner notice
        if: ${{ inputs.runner == 'ubuntu-latest' || inputs.runner == '' }}
        run: |
          echo "::warning::Running query_latency benchmark on a GH-hosted VM."
          echo "::warning::Results are indicative only. Register the asap-tools"
          echo "::warning::benchmarking host as a self-hosted runner for"
          echo "::warning::publication-quality absolute latency measurements."
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests
          if [ -f asap-tools/execution-utilities/asap_query_latency/requirements.txt ]; then
            pip install -r asap-tools/execution-utilities/asap_query_latency/requirements.txt
          fi
      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable
      - name: Install protoc
        run: |
          sudo apt-get update -qq
          sudo apt-get install -y protobuf-compiler
      - name: Run sccache
        uses: mozilla-actions/sccache-action@v0.0.4
      - name: Cache cargo
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
          # Fall back to the newest cache for this OS when manifests changed.
          restore-keys: |
            ${{ runner.os }}-cargo-perf-
      - name: Build query engine binary
        run: cargo build --release --bin query_engine_rust --locked
        env:
          RUSTC_WRAPPER: sccache
      - name: Run query latency benchmark
        working-directory: asap-tools/execution-utilities/asap_query_latency
        run: |
          python run_benchmark.py \
            --output /tmp/query_latency_results.csv \
            --qe-bin "$GITHUB_WORKSPACE/target/release/query_engine_rust"
      - name: Upload query latency results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: query-latency-results-${{ github.run_id }}
          path: /tmp/query_latency_results.csv
          if-no-files-found: warn