Add accuracy, correctness, and performance CI/CD workflows #2

Workflow file for this run

.github/workflows/performance.yml at 5152c32

	# Benchmarks the ASAP query path against a ClickHouse baseline on pull
	# requests to main and on manual dispatch.
	name: Performance Benchmarks

	# IMPORTANT – Runner choice and benchmark precision
	# ─────────────────────────────────────────────────
	# Ephemeral GitHub-hosted runners (ubuntu-latest) share noisy cloud hardware.
	# They are useful for relative regression detection (e.g. "this PR is 2×
	# slower than main") but NOT for absolute latency numbers.
	#
	# For precise, publication-quality benchmarks the team uses the dedicated
	# asap-tools benchmarking infra (used for TurboProm paper experiments).
	# That infra must first be decoupled from Cloudlab and registered as a
	# GitHub self-hosted runner. Until that happens, rely on the
	# relative-regression job for PR gating; when a self-hosted machine is
	# available, pass its label (e.g. `self-hosted`) as the `runner`
	# workflow_dispatch input below.
	#
	# References:
	# - asap-tools/execution-utilities/asap_benchmark_pipeline/ — H2O groupby
	# - asap-tools/execution-utilities/asap_query_latency/ — ClickBench hits

	on:
	  # PR gating: only benchmark when performance-relevant code (or this
	  # workflow itself) changes.
	  pull_request:
	    branches: [main]
	    paths:
	      - 'asap-summary-ingest/**'
	      - 'asap-query-engine/**'
	      - 'asap-common/sketch-core/**'
	      - 'asap-common/dependencies/**'
	      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
	      - 'asap-tools/execution-utilities/asap_query_latency/**'
	      - '.github/workflows/performance.yml'
	  # Manual runs: choose the runner, dataset size and benchmark suite.
	  workflow_dispatch:
	    inputs:
	      runner:
	        description: >
	          Runner label to use. Use 'ubuntu-latest' for relative regression
	          detection on GH-hosted VMs, or a self-hosted runner label (e.g.
	          'self-hosted') for precise absolute benchmarks.
	        required: false
	        type: string
	        default: ubuntu-latest
	      max_rows:
	        description: Rows to ingest (0 = full dataset; reduce for faster CI runs)
	        required: false
	        type: string
	        default: '100000'
	      benchmark_suite:
	        description: 'Which suite to run: h2o | query_latency | all'
	        required: false
	        # A choice input rejects typos up front instead of silently
	        # matching no job condition.
	        type: choice
	        options:
	          - h2o
	          - query_latency
	          - all
	        default: all

	env:
	  # Defaults used on PR triggers (keep runtime < 30 min on GH-hosted runners)
	  DEFAULT_MAX_ROWS: '100000'
	  # Latency regression threshold: a query is flagged when its ASAP latency
	  # exceeds the ClickHouse baseline latency measured in the same CI job by
	  # more than this factor.
	  LATENCY_REGRESSION_FACTOR: '2.0'

	jobs:
	  # ── 1. Relative performance regression (H2O groupby) ───────────────────────
	  relative-regression:
	    name: Relative performance regression (H2O groupby)
	    # Always runs on PR triggers. On manual dispatch, honour the
	    # benchmark_suite input so that selecting 'query_latency' does not also
	    # run the H2O suite.
	    if: >-
	      github.event_name != 'workflow_dispatch' ||
	      inputs.benchmark_suite == 'h2o' ||
	      inputs.benchmark_suite == 'all'
	    # Use the workflow_dispatch runner input when triggered manually;
	    # fall back to ubuntu-latest for PR triggers.
	    runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || 'ubuntu-latest' }}
	    timeout-minutes: 45
	    # Least privilege: the steps below only need to read the repository.
	    permissions:
	      contents: read

	    steps:
	      - uses: actions/checkout@v4

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.11'

	      - name: Install Python dependencies
	        run: |
	          python -m pip install --upgrade pip
	          pip install requests kafka-python gdown matplotlib
	          if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then
	            pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt
	          fi

	      - name: Set up Docker Buildx
	        uses: docker/setup-buildx-action@v3

	      - name: Build base image
	        run: |
	          docker build \
	            -t sketchdb-base:latest \
	            -f asap-common/installation/Dockerfile \
	            asap-common

	      - name: Build summary-ingest image
	        run: |
	          docker build \
	            -t asap-summary-ingest:ci \
	            -f asap-summary-ingest/Dockerfile \
	            asap-summary-ingest

	      - name: Install Rust
	        uses: dtolnay/rust-toolchain@stable

	      - name: Install protoc
	        run: |
	          sudo apt-get update -qq
	          sudo apt-get install -y protobuf-compiler

	      - name: Run sccache
	        uses: mozilla-actions/sccache-action@v0.0.4

	      - name: Cache cargo
	        uses: actions/cache@v4
	        with:
	          path: |
	            ~/.cargo/registry
	            ~/.cargo/git
	            target
	          # hashFiles patterns are resolved relative to the workspace; the
	          # '**/' glob makes the key change whenever any manifest or the
	          # lockfile changes.
	          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}

	      - name: Build query engine binary
	        run: cargo build --release --bin query_engine_rust --locked
	        env:
	          RUSTC_WRAPPER: sccache

	      - name: Resolve max_rows
	        id: config
	        # The user-supplied input is passed through the environment rather
	        # than interpolated into the script, so it cannot inject shell code.
	        env:
	          INPUT_MAX_ROWS: ${{ inputs.max_rows }}
	        run: |
	          # inputs.max_rows is empty on PR triggers (and when cleared on a
	          # manual run); fall back to the workflow-level default.
	          max_rows="${INPUT_MAX_ROWS:-$DEFAULT_MAX_ROWS}"
	          case "$max_rows" in
	            *[!0-9]*)
	              echo "::error::max_rows must be a non-negative integer, got '$max_rows'"
	              exit 1
	              ;;
	          esac
	          echo "max_rows=$max_rows" >> "$GITHUB_OUTPUT"

	      # ── Baseline (ClickHouse exact) ──
	      - name: Run ClickHouse baseline benchmark
	        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
	        env:
	          MAX_ROWS: ${{ steps.config.outputs.max_rows }}
	        run: |
	          python run_benchmark.py \
	            --mode baseline \
	            --load-data \
	            --max-rows "$MAX_ROWS" \
	            --output /tmp/baseline_perf_results.csv

	      # ── ASAP path ──
	      - name: Run ASAP benchmark
	        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
	        env:
	          MAX_ROWS: ${{ steps.config.outputs.max_rows }}
	        run: |
	          python run_benchmark.py \
	            --mode asap \
	            --load-data \
	            --max-rows "$MAX_ROWS" \
	            --output /tmp/asap_perf_results.csv \
	            --qe-bin "$GITHUB_WORKSPACE/target/release/query_engine_rust"

	      # ── Regression check ──
	      # Compares per-query latency_ms between the two result files and fails
	      # the job when any ASAP/baseline ratio exceeds LATENCY_REGRESSION_FACTOR.
	      - name: Check for latency regressions
	        run: |
	          python3 - <<'EOF'
	          import csv, os, sys

	          factor = float(os.environ["LATENCY_REGRESSION_FACTOR"])
	          asap_file = "/tmp/asap_perf_results.csv"
	          base_file = "/tmp/baseline_perf_results.csv"

	          def load_latency(path):
	              # Map query_id -> latency_ms, skipping rows without a latency.
	              with open(path) as f:
	                  return {row["query_id"]: float(row["latency_ms"])
	                          for row in csv.DictReader(f)
	                          if row.get("latency_ms") not in (None, "", "null")}

	          try:
	              asap = load_latency(asap_file)
	              base = load_latency(base_file)
	          except FileNotFoundError as e:
	              # Still best-effort (exit 0), but surface the skip as an
	              # annotation so a gate that checked nothing is visible.
	              print(f"::warning::Result file missing: {e}. Regression check skipped.")
	              print(f"Result file missing: {e}. Skipping regression check.")
	              sys.exit(0)

	          regressions = []
	          compared = 0
	          print(f"{'Query':<30} {'ASAP (ms)':>12} {'Baseline (ms)':>14} {'Ratio':>8} {'Status'}")
	          print("-" * 72)
	          for qid, base_lat in base.items():
	              if qid not in asap:
	                  continue
	              compared += 1
	              # A zero baseline cannot be divided by; treat it as an
	              # unbounded slowdown.
	              ratio = asap[qid] / base_lat if base_lat > 0 else float("inf")
	              status = "REGRESSION" if ratio > factor else "ok"
	              print(f"{qid:<30} {asap[qid]:>12.1f} {base_lat:>14.1f} {ratio:>8.2f} {status}")
	              if status == "REGRESSION":
	                  regressions.append((qid, ratio))

	          if compared == 0:
	              print("::warning::No query IDs are common to both result files; nothing was compared.")

	          if regressions:
	              print(f"\n{len(regressions)} regression(s) detected (threshold: {factor}x):")
	              for qid, r in regressions:
	                  print(f" - {qid}: {r:.2f}x slower than baseline")
	              print("\nNOTE: This job runs on ephemeral GH-hosted VMs and is subject to")
	              print(" cloud noise. For authoritative numbers use a self-hosted runner.")
	              sys.exit(1)
	          else:
	              print(f"\nNo regressions detected (threshold: {factor}x).")
	          EOF
	        env:
	          LATENCY_REGRESSION_FACTOR: ${{ env.LATENCY_REGRESSION_FACTOR }}

	      # Plotting is best-effort: try with explicit paths, then with the
	      # script's defaults, and never fail the job over a missing plot.
	      - name: Generate latency comparison plot
	        if: always()
	        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
	        run: |
	          python plot_latency.py \
	            --asap /tmp/asap_perf_results.csv \
	            --baseline /tmp/baseline_perf_results.csv \
	            --output /tmp/latency_comparison.png 2>/dev/null || \
	          python plot_latency.py 2>/dev/null || true

	      - name: Upload benchmark results
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: performance-results-${{ github.run_id }}
	          path: |
	            /tmp/asap_perf_results.csv
	            /tmp/baseline_perf_results.csv
	            /tmp/latency_comparison.png
	          if-no-files-found: warn

	# ── 2. Query latency micro-benchmark (manual / self-hosted) ─────────────────
	  query-latency:
	    name: Query latency micro-benchmark
	    # Skip on PRs — run manually or on a scheduled trigger once self-hosted
	    # runners are available. On GH-hosted VMs the numbers are too noisy to be
	    # actionable for absolute latency SLOs.
	    if: >-
	      github.event_name == 'workflow_dispatch' &&
	      (inputs.benchmark_suite == 'query_latency' || inputs.benchmark_suite == 'all')
	    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
	    timeout-minutes: 60
	    # Least privilege: the steps below only need to read the repository.
	    permissions:
	      contents: read

	    steps:
	      - uses: actions/checkout@v4

	      # Remind whoever triggered the run that GH-hosted numbers are noisy.
	      - name: Self-hosted runner notice
	        if: ${{ inputs.runner == 'ubuntu-latest' || inputs.runner == '' }}
	        run: |
	          echo "::warning::Running query_latency benchmark on a GH-hosted VM."
	          echo "::warning::Results are indicative only. Register the asap-tools"
	          echo "::warning::benchmarking host as a self-hosted runner for"
	          echo "::warning::publication-quality absolute latency measurements."

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: '3.11'

	      - name: Install Python dependencies
	        run: |
	          python -m pip install --upgrade pip
	          pip install requests
	          if [ -f asap-tools/execution-utilities/asap_query_latency/requirements.txt ]; then
	            pip install -r asap-tools/execution-utilities/asap_query_latency/requirements.txt
	          fi

	      - name: Install Rust
	        uses: dtolnay/rust-toolchain@stable

	      - name: Install protoc
	        run: |
	          sudo apt-get update -qq
	          sudo apt-get install -y protobuf-compiler

	      - name: Run sccache
	        uses: mozilla-actions/sccache-action@v0.0.4

	      - name: Cache cargo
	        uses: actions/cache@v4
	        with:
	          path: |
	            ~/.cargo/registry
	            ~/.cargo/git
	            target
	          # hashFiles patterns are resolved relative to the workspace; the
	          # '**/' glob makes the key change whenever any manifest or the
	          # lockfile changes.
	          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}

	      - name: Build query engine binary
	        run: cargo build --release --bin query_engine_rust --locked
	        env:
	          RUSTC_WRAPPER: sccache

	      - name: Run query latency benchmark
	        working-directory: asap-tools/execution-utilities/asap_query_latency
	        # Quote the workspace path via the runner-provided environment
	        # variable so paths containing spaces survive word splitting.
	        run: |
	          python run_benchmark.py \
	            --output /tmp/query_latency_results.csv \
	            --qe-bin "$GITHUB_WORKSPACE/target/release/query_engine_rust"

	      - name: Upload query latency results
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: query-latency-results-${{ github.run_id }}
	          path: /tmp/query_latency_results.csv
	          if-no-files-found: warn

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add accuracy, correctness, and performance CI/CD workflows #2

Workflow file

Add accuracy, correctness, and performance CI/CD workflows #2

Uh oh!

Workflow file for this run