Add accuracy, correctness, and performance CI/CD workflows #2
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Performance Benchmarks

# Runner choice and benchmark precision
# -------------------------------------
# GitHub-hosted runners (ubuntu-latest) are ephemeral and share noisy cloud
# hardware. Their numbers are good enough for *relative* regression detection
# (e.g. "this PR is 2x slower than main"), but must NOT be read as absolute
# latency figures.
#
# Precise, publication-quality benchmarks are produced on the dedicated
# asap-tools benchmarking infra (the one used for the TurboProm paper
# experiments). That infra still has to be decoupled from Cloudlab and
# registered as a GitHub self-hosted runner. Until that happens, either:
#   - set `runner: self-hosted` in the workflow_dispatch inputs below to
#     target a self-hosted machine when one is available, or
#   - rely on the relative-regression job for PR gating.
#
# References:
#   - asap-tools/execution-utilities/asap_benchmark_pipeline/ — H2O groupby
#   - asap-tools/execution-utilities/asap_query_latency/      — ClickBench hits
# Triggers:
#   - pull_request: PRs targeting main that touch benchmark-relevant paths
#     (or this workflow file itself).
#   - workflow_dispatch: manual runs with a selectable runner, row cap and
#     benchmark suite.
on:
  pull_request:
    branches: [main]
    paths:
      - 'asap-summary-ingest/**'
      - 'asap-query-engine/**'
      - 'asap-common/sketch-core/**'
      - 'asap-common/dependencies/**'
      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
      - 'asap-tools/execution-utilities/asap_query_latency/**'
      - '.github/workflows/performance.yml'
  workflow_dispatch:
    inputs:
      runner:
        description: >
          Runner label to use. Use 'ubuntu-latest' for relative regression
          detection on GH-hosted VMs, or a self-hosted runner label (e.g.
          'self-hosted') for precise absolute benchmarks.
        required: false
        type: string
        default: ubuntu-latest
      max_rows:
        description: Rows to ingest (0 = full dataset; reduce for faster CI runs)
        required: false
        # Kept as a string so the value is forwarded verbatim to the
        # benchmark script's --max-rows flag.
        type: string
        default: '100000'
      benchmark_suite:
        description: 'Which suite to run: h2o | query_latency | all'
        required: false
        # A closed choice list: a mistyped suite name is rejected at dispatch
        # time instead of silently matching no job condition.
        type: choice
        options:
          - all
          - h2o
          - query_latency
        default: all
# Least privilege for GITHUB_TOKEN: the jobs in this workflow only check out
# the repository; they never push, comment, or create releases.
permissions:
  contents: read

env:
  # Row cap used on PR triggers (keep runtime < 30 min on GH-hosted runners).
  # Manual runs can override it through the `max_rows` dispatch input.
  DEFAULT_MAX_ROWS: '100000'
  # Latency regression threshold: a query is flagged when its ASAP latency
  # exceeds the baseline latency measured in the same CI job by more than
  # this factor.
  LATENCY_REGRESSION_FACTOR: '2.0'
jobs:
  # ── 1. Relative performance regression (H2O groupby) ───────────────────────
  # Runs on every matching PR. On manual dispatch it runs only when the `h2o`
  # or `all` suite is selected, on the runner label given by the `runner`
  # input. It benchmarks the ClickHouse baseline and the ASAP path back to
  # back, then fails if any query's ASAP/baseline latency ratio exceeds
  # LATENCY_REGRESSION_FACTOR.
  relative-regression:
    name: Relative performance regression (H2O groupby)
    # Honour the benchmark_suite input on manual runs; PR triggers have no
    # inputs and always run this job.
    if: >-
      github.event_name != 'workflow_dispatch' ||
      inputs.benchmark_suite == 'h2o' ||
      inputs.benchmark_suite == 'all'
    # Use the workflow_dispatch runner input when triggered manually;
    # fall back to ubuntu-latest for PR triggers.
    runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || 'ubuntu-latest' }}
    timeout-minutes: 45
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests kafka-python gdown matplotlib
          if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then
            pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt
          fi

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build base image
        run: |
          docker build \
            -t sketchdb-base:latest \
            -f asap-common/installation/Dockerfile \
            asap-common

      - name: Build summary-ingest image
        run: |
          docker build \
            -t asap-summary-ingest:ci \
            -f asap-summary-ingest/Dockerfile \
            asap-summary-ingest

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Install protoc
        run: |
          sudo apt-get update -qq
          sudo apt-get install -y protobuf-compiler

      - name: Run sccache
        uses: mozilla-actions/sccache-action@v0.0.4

      - name: Cache cargo
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}

      - name: Build query engine binary
        run: cargo build --release --bin query_engine_rust --locked
        env:
          RUSTC_WRAPPER: sccache

      # Pick the row cap: the dispatch input on manual runs, DEFAULT_MAX_ROWS
      # otherwise. The input reaches the shell through an environment
      # variable (never through ${{ }} interpolation inside the script) and
      # is validated before being published as a step output.
      - name: Resolve max_rows
        id: config
        env:
          REQUESTED_MAX_ROWS: ${{ inputs.max_rows }}
        run: |
          if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ] && [ -n "$REQUESTED_MAX_ROWS" ]; then
            max_rows="$REQUESTED_MAX_ROWS"
          else
            max_rows="$DEFAULT_MAX_ROWS"
          fi
          case "$max_rows" in
            ''|*[!0-9]*)
              echo "::error::max_rows must be a non-negative integer, got '$max_rows'"
              exit 1
              ;;
          esac
          echo "max_rows=$max_rows" >> "$GITHUB_OUTPUT"

      # ── Baseline (ClickHouse exact) ──
      - name: Run ClickHouse baseline benchmark
        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
        env:
          MAX_ROWS: ${{ steps.config.outputs.max_rows }}
        run: |
          python run_benchmark.py \
            --mode baseline \
            --load-data \
            --max-rows "$MAX_ROWS" \
            --output /tmp/baseline_perf_results.csv

      # ── ASAP path ──
      - name: Run ASAP benchmark
        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
        env:
          MAX_ROWS: ${{ steps.config.outputs.max_rows }}
        run: |
          python run_benchmark.py \
            --mode asap \
            --load-data \
            --max-rows "$MAX_ROWS" \
            --output /tmp/asap_perf_results.csv \
            --qe-bin "$GITHUB_WORKSPACE/target/release/query_engine_rust"

      # ── Regression check ──
      # Compares latency_ms per query_id between the two result files and
      # exits non-zero when any ASAP/baseline ratio exceeds the threshold.
      - name: Check for latency regressions
        env:
          LATENCY_REGRESSION_FACTOR: ${{ env.LATENCY_REGRESSION_FACTOR }}
        run: |
          python3 - <<'EOF'
          import csv
          import os
          import sys

          factor = float(os.environ["LATENCY_REGRESSION_FACTOR"])
          asap_file = "/tmp/asap_perf_results.csv"
          base_file = "/tmp/baseline_perf_results.csv"


          def load_latency(path):
              """Return {query_id: latency_ms}, skipping rows without a latency."""
              with open(path, newline="") as f:
                  return {row["query_id"]: float(row["latency_ms"])
                          for row in csv.DictReader(f)
                          if row.get("latency_ms") not in (None, "", "null")}


          try:
              asap = load_latency(asap_file)
              base = load_latency(base_file)
          except FileNotFoundError as e:
              # Best-effort: a missing result file does not fail the job, but
              # it is raised as an annotation so the skipped check is visible.
              print(f"::warning::Result file missing: {e}. Skipping regression check.")
              sys.exit(0)

          regressions = []
          compared = 0
          print(f"{'Query':<30} {'ASAP (ms)':>12} {'Baseline (ms)':>14} {'Ratio':>8} {'Status'}")
          print("-" * 72)
          for qid, base_lat in base.items():
              if qid not in asap:
                  continue
              compared += 1
              # A zero baseline cannot be divided by; treat it as an unbounded ratio.
              ratio = asap[qid] / base_lat if base_lat > 0 else float("inf")
              status = "REGRESSION" if ratio > factor else "ok"
              print(f"{qid:<30} {asap[qid]:>12.1f} {base_lat:>14.1f} {ratio:>8.2f} {status}")
              if status == "REGRESSION":
                  regressions.append((qid, ratio))

          if compared == 0:
              # Nothing overlapped, so a "no regressions" result would be vacuous.
              print("::warning::No query ids are shared between the ASAP and "
                    "baseline results; nothing was compared.")

          if regressions:
              print(f"\n{len(regressions)} regression(s) detected (threshold: {factor}x):")
              for qid, r in regressions:
                  print(f"  - {qid}: {r:.2f}x slower than baseline")
              print("\nNOTE: This job runs on ephemeral GH-hosted VMs and is subject to")
              print("      cloud noise. For authoritative numbers use a self-hosted runner.")
              sys.exit(1)
          else:
              print(f"\nNo regressions detected (threshold: {factor}x).")
          EOF

      # Best-effort plot: try explicit arguments first, then a bare
      # invocation; errors are discarded and never fail the job.
      - name: Generate latency comparison plot
        if: always()
        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
        run: |
          python plot_latency.py \
            --asap /tmp/asap_perf_results.csv \
            --baseline /tmp/baseline_perf_results.csv \
            --output /tmp/latency_comparison.png 2>/dev/null || \
          python plot_latency.py 2>/dev/null || true

      - name: Upload benchmark results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: performance-results-${{ github.run_id }}
          path: |
            /tmp/asap_perf_results.csv
            /tmp/baseline_perf_results.csv
            /tmp/latency_comparison.png
          if-no-files-found: warn
# ── 2. Query latency micro-benchmark (manual / self-hosted) ──────────────────
  query-latency:
    name: Query latency micro-benchmark
    # Skip on PRs — run manually or on a scheduled trigger once self-hosted
    # runners are available. On GH-hosted VMs the numbers are too noisy to be
    # actionable for absolute latency SLOs.
    if: >
      github.event_name == 'workflow_dispatch' &&
      (inputs.benchmark_suite == 'query_latency' || inputs.benchmark_suite == 'all')
    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4

      # Emitted as ONE annotation: each `::warning::` command creates its own
      # annotation, so the notice must not be split across several echo lines.
      - name: Self-hosted runner notice
        if: ${{ inputs.runner == 'ubuntu-latest' || inputs.runner == '' }}
        run: |
          echo "::warning title=GH-hosted runner::Running query_latency benchmark on a GH-hosted VM. Results are indicative only. Register the asap-tools benchmarking host as a self-hosted runner for publication-quality absolute latency measurements."

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests
          if [ -f asap-tools/execution-utilities/asap_query_latency/requirements.txt ]; then
            pip install -r asap-tools/execution-utilities/asap_query_latency/requirements.txt
          fi

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Install protoc
        run: |
          sudo apt-get update -qq
          sudo apt-get install -y protobuf-compiler

      - name: Run sccache
        uses: mozilla-actions/sccache-action@v0.0.4

      - name: Cache cargo
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}

      - name: Build query engine binary
        run: cargo build --release --bin query_engine_rust --locked
        env:
          RUSTC_WRAPPER: sccache

      # The binary path is taken from the runner-provided GITHUB_WORKSPACE
      # variable and quoted so a workspace path containing spaces stays one
      # argument.
      - name: Run query latency benchmark
        working-directory: asap-tools/execution-utilities/asap_query_latency
        run: |
          python run_benchmark.py \
            --output /tmp/query_latency_results.csv \
            --qe-bin "$GITHUB_WORKSPACE/target/release/query_engine_rust"

      - name: Upload query latency results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: query-latency-results-${{ github.run_id }}
          path: /tmp/query_latency_results.csv
          if-no-files-found: warn