diff --git a/crates/qfc-inference/examples/pipeline_latency.rs b/crates/qfc-inference/examples/pipeline_latency.rs new file mode 100644 index 0000000..66703e5 --- /dev/null +++ b/crates/qfc-inference/examples/pipeline_latency.rs @@ -0,0 +1,341 @@ +//! QFC B-2 spike: WAN pipeline-inference latency calculator. +//! +//! Pure-math decision tool. NO qfc-crate imports, pure std Rust, compiles with +//! default features (no candle). Run with: +//! cargo run -p qfc-inference --example pipeline_latency +//! +//! ## Model +//! Pipeline inference splits a transformer across K miners (stages). Activations +//! (layer-boundary hidden states) flow miner->miner over K-1 network hops. This +//! tool computes the NETWORK-ONLY latency floor (compute deliberately excluded: +//! we are measuring whether the WAN alone makes pipeline inference viable). +//! +//! ## Network scenario provenance (cite these in the spike report) +//! Each scenario is (rtt_ms, bw_mbit_s): +//! +//! - lan_same_vpc 0.25 ms / 5000 Mbit/s +//! MEASURED: A->B/C/D AWS us-east-1 intra-VPC RTT (~0.25 ms median). +//! Bandwidth ASSUMED conservative: real intra-VPC is ~10-25 Gbit/s; we use +//! 5 Gbit/s as a deliberately pessimistic floor. +//! - regional_cross_az 5.0 ms / 1000 Mbit/s +//! PUBLISHED: AWS cross-AZ / nearby-region typical RTT and bandwidth. +//! - continental 40.0 ms / 300 Mbit/s +//! PUBLISHED: AWS cross-region same continent (e.g. us-east<->us-west is +//! ~60 ms RTT); 40 ms is a mid-estimate. 300 Mbit/s single-stream typical. +//! - intercontinental 242.0 ms / 20 Mbit/s +//! MEASURED: Singapore (Singtel) <-> AWS us-east-1 (Virginia). RTT 242 ms +//! median, 20 Mbit/s sustained single-stream throughput. +//! +//! Derived: +//! one_way_ms = rtt_ms / 2 +//! transfer_ms(b) = one_way_ms + (b * 8 / (bw_mbit_s * 1e6)) * 1000 + +// ---------------------------------------------------------------------------- +// Network model +// ---------------------------------------------------------------------------- + +/// A measured/published network scenario between two pipeline stages. +struct Scenario { + name: &'static str, + rtt_ms: f64, + bw_mbit_s: f64, +} + +impl Scenario { + /// One-way propagation latency (half the round trip). + fn one_way_ms(&self) -> f64 { + self.rtt_ms / 2.0 + } + + /// Time to move `bytes` across one hop: one-way latency + serialization. + fn transfer_ms(&self, bytes: f64) -> f64 { + self.one_way_ms() + (bytes * 8.0 / (self.bw_mbit_s * 1e6)) * 1000.0 + } +} + +const SCENARIOS: [Scenario; 4] = [ + Scenario { + name: "lan_same_vpc", + rtt_ms: 0.25, + bw_mbit_s: 5000.0, + }, + Scenario { + name: "regional_cross_az", + rtt_ms: 5.0, + bw_mbit_s: 1000.0, + }, + Scenario { + name: "continental", + rtt_ms: 40.0, + bw_mbit_s: 300.0, + }, + Scenario { + name: "intercontinental", + rtt_ms: 242.0, + bw_mbit_s: 20.0, + }, +]; + +// ---------------------------------------------------------------------------- +// Model configs +// ---------------------------------------------------------------------------- + +/// fp16 activations: 2 bytes per element. +const DTYPE_BYTES: usize = 2; + +struct Model { + name: &'static str, + hidden: usize, + #[allow(dead_code)] + layers: usize, +} + +const MODELS: [Model; 4] = [ + Model { + name: "bert-base", + hidden: 768, + layers: 12, + }, + Model { + name: "qwen2.5-0.5b", + hidden: 896, + layers: 24, + }, + Model { + name: "qwen2.5-3b", + hidden: 2048, + layers: 36, + }, + Model { + name: "qwen2.5-7b", + hidden: 3584, + layers: 28, + }, +]; + +/// Activation bytes crossing one layer boundary = batch * seq_len * hidden * dtype. +fn activation_bytes(batch: usize, seq_len: usize, hidden: usize) -> f64 { + (batch * seq_len * hidden * DTYPE_BYTES) as f64 +} + +// ---------------------------------------------------------------------------- +// Workload latency models +// ---------------------------------------------------------------------------- + +/// Interactive autoregressive decode: generate `tokens` tokens. Each token is a +/// full forward pass traversing all K stages -> K-1 hops, decode seq_len=1 so the +/// per-token activation is tiny (RTT-dominated). Returns network-floor latency in +/// milliseconds for the whole generation (compute excluded). +fn decode_latency_ms(model: &Model, sc: &Scenario, stages: usize, tokens: usize) -> f64 { + let hops = (stages - 1) as f64; + let act = activation_bytes(1, 1, model.hidden); + tokens as f64 * hops * sc.transfer_ms(act) +} + +/// Single-request prefill traversal latency (ms): one pass across K-1 hops. +fn prefill_request_ms( + model: &Model, + sc: &Scenario, + stages: usize, + batch: usize, + seq: usize, +) -> f64 { + let hops = (stages - 1) as f64; + let act = activation_bytes(batch, seq, model.hidden); + hops * sc.transfer_ms(act) +} + +/// Per-hop transfer time (ms) for a prefill activation. Under pipelining this is +/// the steady-state throughput bottleneck (throughput ~ 1/max(stage_compute, per_hop_transfer)). +fn prefill_per_hop_ms(model: &Model, sc: &Scenario, batch: usize, seq: usize) -> f64 { + sc.transfer_ms(activation_bytes(batch, seq, model.hidden)) +} + +// ---------------------------------------------------------------------------- +// Sanity checks (run from main; panic on math regression) +// ---------------------------------------------------------------------------- + +fn sanity() { + let m7b = &MODELS[3]; + assert_eq!(m7b.name, "qwen2.5-7b"); + let inter = &SCENARIOS[3]; + assert_eq!(inter.name, "intercontinental"); + let lan = &SCENARIOS[0]; + + // intercontinental 7b K=4 decode > 30s + let dec_inter = decode_latency_ms(m7b, inter, 4, 100) / 1000.0; + assert!( + dec_inter > 30.0, + "intercontinental 7b K=4 decode should be >30s, got {dec_inter:.3}s" + ); + + // lan 7b K=4 decode < 0.1s + let dec_lan = decode_latency_ms(m7b, lan, 4, 100) / 1000.0; + assert!( + dec_lan < 0.1, + "lan 7b K=4 decode should be <0.1s, got {dec_lan:.4}s" + ); + + // intercontinental B8 S512 7b per-hop > 5000ms + let hop_inter = prefill_per_hop_ms(m7b, inter, 8, 512); + assert!( + hop_inter > 5000.0, + "intercontinental B8 S512 7b per-hop should be >5000ms, got {hop_inter:.1}ms" + ); +} + +// ---------------------------------------------------------------------------- +// Output +// ---------------------------------------------------------------------------- + +fn print_table_a() { + println!("## Table A — Activation transfer size per layer boundary (KB)\n"); + println!("| model | decode (b1,s1) | prefill (b1,s512) | prefill (b8,s512) |"); + println!("|---|---:|---:|---:|"); + for m in &MODELS { + let decode = activation_bytes(1, 1, m.hidden) / 1024.0; + let p1 = activation_bytes(1, 512, m.hidden) / 1024.0; + let p8 = activation_bytes(8, 512, m.hidden) / 1024.0; + println!("| {} | {:.3} | {:.1} | {:.1} |", m.name, decode, p1, p8); + } + println!(); +} + +fn print_table_b() { + println!("## Table B — Interactive decode: network-only latency for 100 tokens (seconds)\n"); + let ks = [2usize, 4, 8]; + // header + print!("| model | K |"); + for sc in &SCENARIOS { + print!(" {} |", sc.name); + } + println!(); + print!("|---|---:|"); + for _ in &SCENARIOS { + print!("---:|"); + } + println!(); + for m in &MODELS { + for &k in &ks { + print!("| {} | {} |", m.name, k); + for sc in &SCENARIOS { + let s = decode_latency_ms(m, sc, k, 100) / 1000.0; + print!(" {s:.3} |"); + } + println!(); + } + } + println!(); +} + +fn print_table_c() { + println!("## Table C — Batch prefill (B=8, S=512): per-hop transfer (ms) and single-request K=4 latency (ms)\n"); + print!("| model |"); + for sc in &SCENARIOS { + print!(" {} per-hop | {} K4-req |", sc.name, sc.name); + } + println!(); + print!("|---|"); + for _ in &SCENARIOS { + print!("---:|---:|"); + } + println!(); + for m in &MODELS { + print!("| {} |", m.name); + for sc in &SCENARIOS { + let hop = prefill_per_hop_ms(m, sc, 8, 512); + let req = prefill_request_ms(m, sc, 4, 8, 512); + print!(" {hop:.1} | {req:.1} |"); + } + println!(); + } + println!(); +} + +fn print_verdicts() { + let m7b = &MODELS[3]; + + println!("## Summary verdicts\n"); + + // Interactive viable? 100-token decode network-floor < 2000 ms, qwen2.5-7b K=4 + println!( + "### Interactive viable? (qwen2.5-7b, K=4, 100-token decode network-floor < 2000 ms)\n" + ); + println!("| scenario | network-floor (ms) | verdict |"); + println!("|---|---:|---|"); + for sc in &SCENARIOS { + let ms = decode_latency_ms(m7b, sc, 4, 100); + let verdict = if ms < 2000.0 { "PASS" } else { "FAIL" }; + println!("| {} | {:.1} | {} |", sc.name, ms, verdict); + } + println!(); + + // Batch transfer-bound? per-hop prefill transfer (B8 S512, 7b) > 200 ms + println!("### Batch transfer-bound? (qwen2.5-7b, B=8, S=512, per-hop transfer > 200 ms => bandwidth dominates)\n"); + println!("| scenario | per-hop (ms) | bandwidth-gated |"); + println!("|---|---:|---|"); + for sc in &SCENARIOS { + let hop = prefill_per_hop_ms(m7b, sc, 8, 512); + let flag = if hop > 200.0 { "YES" } else { "no" }; + println!("| {} | {:.1} | {} |", sc.name, hop, flag); + } + println!(); +} + +fn main() { + // Guard against math regressions before emitting any numbers. + sanity(); + + println!("# QFC B-2 spike — WAN pipeline-inference latency\n"); + println!( + "Network-only floor (compute excluded). dtype=fp16 (2B). \ + Decode T=100 tokens, batch=1, seq=1. Prefill S=512. \ + K (stages) = 4 representative; K in {{2,4,8}} for decode.\n" + ); + + print_table_a(); + print_table_b(); + print_table_c(); + print_verdicts(); + + println!( + "GO/NO-GO: interactive pipeline inference over real WAN = NO-GO \ + (RTT-dominated, 7B K=4 intercontinental ~{:.0}s for 100 tokens); \ + batch prefill viable but bandwidth-gated intercontinentally \ + (7B B8 S512 per-hop ~{:.0}s).", + decode_latency_ms(&MODELS[3], &SCENARIOS[3], 4, 100) / 1000.0, + prefill_per_hop_ms(&MODELS[3], &SCENARIOS[3], 8, 512) / 1000.0, + ); +} + +// ---------------------------------------------------------------------------- +// Unit tests (cargo test -p qfc-inference --example pipeline_latency) +// ---------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn intercontinental_7b_k4_decode_over_30s() { + let s = decode_latency_ms(&MODELS[3], &SCENARIOS[3], 4, 100) / 1000.0; + assert!(s > 30.0, "got {s:.3}s"); + } + + #[test] + fn lan_7b_k4_decode_under_100ms() { + let s = decode_latency_ms(&MODELS[3], &SCENARIOS[0], 4, 100) / 1000.0; + assert!(s < 0.1, "got {s:.4}s"); + } + + #[test] + fn intercontinental_b8_s512_7b_per_hop_over_5000ms() { + let ms = prefill_per_hop_ms(&MODELS[3], &SCENARIOS[3], 8, 512); + assert!(ms > 5000.0, "got {ms:.1}ms"); + } + + #[test] + fn sanity_runs() { + sanity(); + } +} diff --git a/docs/adr/0011-b2-pipeline-scope.md b/docs/adr/0011-b2-pipeline-scope.md new file mode 100644 index 0000000..1e9e30e --- /dev/null +++ b/docs/adr/0011-b2-pipeline-scope.md @@ -0,0 +1,69 @@ +# ADR-0011: B-2 scope — batch-only, locality-aware pipeline inference + +**Status:** accepted · **Date:** 2026-06-14 · **Context:** ROADMAP-AI-V3 +Feature B-2, gated on the WAN-latency spike +([docs/spikes/B2-wan-pipeline-latency.md](../spikes/B2-wan-pipeline-latency.md)). + +## Problem + +The roadmap gated B-2 (multi-miner pipeline execution) on a WAN-latency spike +and hypothesised "batch first, interactive only if latency supports it." The +spike measured real WAN (SG↔us-east-1: 242 ms RTT, 20 Mbit/s; intra-VPC LAN: +0.25 ms) and modelled pipeline cost across model sizes and stage counts. The +data decides the scope. + +## Decision + +### 1. Interactive autoregressive inference over WAN is OUT of B-2 + +Per-token full-pipeline traversal makes latency `T · (K−1) · RTT/2`. For +qwen2.5-7b, K=4, 100 tokens: 6 s continental, 37 s intercontinental — network +*alone*. Only same-datacenter/same-region paths pass a 2 s bar, and those negate +the point of *geographically distributed* miners. B-2 does **not** target +interactive inference. (Interactive stays single-miner, as today.) + +### 2. B-2 targets batch / async single-forward workloads only + +Prefill-style single-pass workloads (embeddings, classification, batched +scoring, async generation where latency is not user-facing) pipeline acceptably +**when network-local**. This is the B-2 product. + +### 3. Bandwidth is a first-class assignment constraint, co-equal with RTT + +The spike's correction to the roadmap: B-2's risk is not only RTT. At 20 Mbit/s, +a 7B prefill hop (B8/S512, 28.7 MB) is ~12 s — transfer-bound. Even regional +cross-AZ trips the 200 ms/hop threshold. Therefore the B3 shard-group assignment +(extending A3's assignment) must: +- group miners by **network proximity** (region/AS/measured RTT+BW), forming + pipeline groups whose inter-stage links clear a bandwidth+latency floor; +- carry **activation-transfer cost** (bytes/hop ÷ link BW + RTT) as an explicit + term in group formation and reject groups that exceed a per-hop budget; +- prefer **fewer stages** (smaller K) — latency and hop count both scale with K. + +### 4. Activation compression is a B-2 requirement, not an option + +Layer-boundary tensors must be quantized for transport (fp16→int8/fp8, ~2–4× +fewer bytes) before any cross-region group is viable. This becomes part of the +B4 pipeline-execution prototype and the B5 per-stage activation commitment +(hash the *quantized* transported tensor). + +## Consequences + +- **B3** (shard-group assignment): add the locality/bandwidth model above. +- **B4** (pipeline prototype): batch-only; include activation quantization; + measure on a real ≥2-region miner pair, not a single-region stand-in. +- **B5** (per-stage verification): activation commitments hash the quantized + transported bytes (so verifier re-execution compares the same artifact). +- The 12–16 h B-2 build estimate stands for the batch path; dropping interactive + removes the hardest latency problem, so the estimate is, if anything, safer. +- **Hard gate before committing B3–B5:** re-run + `cargo run -p qfc-inference --example pipeline_latency` against real + cross-region miner-to-miner RTT+BW (needs ≥2 miners in different regions — + the single-region testnet cannot produce this). Build only if a realistic + locality tier (same-region/continental, post-compression) clears the per-hop + budget for the target model sizes. + +## Non-goals (unchanged) + +Interactive WAN inference; any pipeline group that can't meet the per-hop +transfer budget; treating RTT as the sole network cost. diff --git a/docs/spikes/B2-wan-pipeline-latency.md b/docs/spikes/B2-wan-pipeline-latency.md new file mode 100644 index 0000000..ab60cc8 --- /dev/null +++ b/docs/spikes/B2-wan-pipeline-latency.md @@ -0,0 +1,105 @@ +# Spike: WAN latency for multi-miner pipeline inference (B-2 gate) + +**Date:** 2026-06-14 · **Owner:** Larry · **Status:** complete → decision in +[ADR-0011](../adr/0011-b2-pipeline-scope.md). + +ROADMAP-AI-V3 gates Feature B-2 (multi-miner pipeline execution) on "a +WAN-latency measurement spike before committing." The roadmap's own honest +constraint: *"pipeline parallelism over WAN latencies is unproven for +interactive inference — target batch/async workloads first… interactive only if +latency data supports it."* This spike gets the data and answers: does it? + +**Answer:** interactive = **no-go** over real WAN; batch = **viable but +bandwidth-gated** (not just RTT-gated — the dimension the roadmap under-weighted). + +## Method + +Pipeline inference splits a transformer across K miners; layer-boundary +activations cross K−1 network hops per forward pass. Per-hop cost is +`one_way_latency + activation_bytes / bandwidth`. The reproducible calculator is +`crates/qfc-inference/examples/pipeline_latency.rs` +(`cargo run -p qfc-inference --example pipeline_latency`) — re-run it with real +numbers once geo-distributed miners exist. + +### Measured anchors (real) + +| Path | RTT | Bandwidth | How | +|---|---|---|---| +| **LAN** — AWS us-east-1 intra-VPC (testnet VPS-A→B/C/D) | **0.25 ms** | ~5 Gbit/s (assumed; typical 10–25 Gbit/s) | `ping` A→{B,C,D}, 5 samples each: 0.22–0.28 ms avg | +| **Intercontinental** — Singapore (Singtel) ↔ AWS us-east-1 (Virginia) | **242 ms** median | **20 Mbit/s** single-stream | TCP-handshake RTT ×8 (238–284 ms); 20 MB over SSH = 2.5 MB/s | + +The live QFC testnet is **single-region** (B/C/D are `10.0.x.x` behind a +ProxyJump through A — LAN, not WAN), so it cannot directly measure cross-region +*miner* latency. The two measured anchors bracket reality: LAN is the +same-datacenter floor, SG↔Virginia is the intercontinental ceiling. The middle +two scenarios use published AWS inter-region figures: + +| Scenario | RTT (ms) | BW (Mbit/s) | Provenance | +|---|---:|---:|---| +| `lan_same_vpc` | 0.25 | 5000 | **measured** RTT; BW assumed | +| `regional_cross_az` | 5 | 1000 | published | +| `continental` | 40 | 300 | published (us-east↔us-west ~60 ms; 40 mid-estimate) | +| `intercontinental` | 242 | 20 | **measured** (both) | + +### Models (real registry configs), fp16 activations + +bert-base (h768), qwen2.5-0.5b (h896), qwen2.5-3b (h2048), qwen2.5-7b (h3584). +Activation bytes/boundary = `batch · seq · hidden · 2`. + +## Results + +### Interactive autoregressive decode — network-only floor, 100 tokens (seconds) + +Each generated token traverses the whole pipeline (K−1 hops); decode activations +are tiny (seq=1 → RTT-dominated). qwen2.5-7b: + +| K | lan | regional | continental | intercontinental | +|---:|---:|---:|---:|---:| +| 2 | 0.014 | 0.256 | 2.02 | 12.4 | +| 4 | 0.041 | 0.767 | **6.06** | **37.2** | +| 8 | 0.096 | 1.79 | 14.1 | 86.7 | + +This is **network alone**, compute excluded. Viability gate (100-token floor +< 2 s, qwen2.5-7b K=4): lan **PASS** (41 ms), regional **PASS** (767 ms), +continental **FAIL** (6.1 s), intercontinental **FAIL** (37 s). The only passing +scenarios are same-datacenter or same-region — i.e. *not* geographically +distributed miners. + +### Batch prefill (B=8, S=512) — per-hop transfer (ms), the throughput bottleneck + +Prefill activations are large (qwen2.5-7b B8/S512 = **28.7 MB/boundary**), so +bandwidth, not RTT, dominates. Per-hop transfer time: + +| model | lan | regional | continental | intercontinental | +|---|---:|---:|---:|---:| +| bert-base | 10 | 53 | 188 | 2638 | +| qwen2.5-3b | 27 | 137 | 467 | 6832 | +| qwen2.5-7b | 47 | **237** | 803 | **11865** | + +Bandwidth-bound flag (per-hop > 200 ms): regional **YES** (237 ms), continental +**YES**, intercontinental **YES** (≈12 s/hop). Only LAN keeps prefill transfer +under the compute timescale. **This is the spike's key finding the roadmap +missed:** B-2's risk was framed as RTT/latency, but for batch workloads the +20 Mbit/s measured inter-region bandwidth makes *transfer time* the wall — a 7B +prefill hop is ~12 s intercontinentally regardless of how the pipeline overlaps. + +## Conclusion + +- **Interactive inference over geo-distributed WAN: NO-GO.** Per-token RTT × token + count is prohibitive at any real WAN RTT (≥ tens of ms). Drop it from B-2. +- **Batch/async prefill: VIABLE only with network locality.** Same-region + (≤ regional, high-bandwidth) miner groups work; cross-region is bandwidth- + walled until activations are compressed or links are fat. +- **Bandwidth is co-equal with RTT** as a B-2 constraint — assignment must model + both. + +Decision and B-2 re-scoping: [ADR-0011](../adr/0011-b2-pipeline-scope.md). + +## Validation gate (before final B-2 commit) + +These projections use one measured intercontinental path + published mid-range +figures. Before building B3–B5, re-run the calculator against **real +cross-region miner-to-miner** measurements (RTT + sustained multi-stream +bandwidth on the activation transport actually chosen). The single-region +testnet can't produce that today; it needs ≥2 miners deployed in different +regions.