justrach · justrach · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/bench/iouring/.gitignore b/bench/iouring/.gitignore
@@ -0,0 +1,2 @@
+# Built artifact: the cross-compiled smoke-test binary (regenerate via ./build.sh)
+iouring_smoke
diff --git a/bench/iouring/Containerfile b/bench/iouring/Containerfile
@@ -0,0 +1,11 @@
+# Linux smoke-test image for the io_uring AcceptLoop in zig/src/iouring.zig.
+#
+# Pre-built static aarch64-linux-musl binary; no compiler needed at image
+# time. Build the binary on the host first with:
+#   ./bench/iouring/build.sh
+FROM alpine:3.20
+
+# --chmod sets the exec bit at copy time, avoiding a second layer for chmod.
+COPY --chmod=755 iouring_smoke /usr/local/bin/iouring_smoke
+
+ENTRYPOINT ["/usr/local/bin/iouring_smoke"]
diff --git a/bench/iouring/README.md b/bench/iouring/README.md
@@ -0,0 +1,59 @@
+# io_uring smoke test
+
+End-to-end correctness check for the Linux `io_uring` multishot accept loop
+(`IORING_OP_ACCEPT` + `IORING_ACCEPT_MULTISHOT`) in [`zig/src/iouring.zig`](../../zig/src/iouring.zig).
+
+This is **not a benchmark.** It exists to prove that:
+
+1. `zig/src/iouring.zig` compiles cleanly for `aarch64-linux-musl`.
+2. `std.os.linux.IoUring` works on the kernel the test is run against.
+3. Multishot accept (`IORING_ACCEPT_MULTISHOT`) actually delivers all expected
+   accepts when N clients connect to a listen socket.
+
+The smoke binary opens a TCP listener on `127.0.0.1:18080`, runs the
+`AcceptLoop` on a worker thread, dials the listener N times from the main
+thread, and asserts the accept callback fired N times. Exits 0 on success,
+non-zero otherwise.
+
+## Run
+
+Requires Apple `container` 0.11+ (or any compatible OCI runtime — set
+`RUNTIME=docker` / `RUNTIME=podman`). On macOS, start the container service
+first:
+
+```bash
+container system start
+./bench/iouring/run.sh
+```
+
+The script:
+
+1. Cross-compiles `iouring_smoke` for `aarch64-linux-musl` on the host.
+2. Builds the OCI image from this directory.
+3. Runs the smoke binary inside a fresh container.
+
+A passing run prints something like:
+
+```
+listening on 127.0.0.1:18080 (fd=3)
+  client 1/16 connected
+  ...
+  client 16/16 connected
+io_uring AcceptLoop saw 16 accepts (wanted >= 16)
+OK
+==> io_uring smoke test PASSED
+```
+
+Running `container run alpine:3.20 uname -a` on macOS 26 with `container` 0.11
+reports `Linux ... 6.18.5 ... aarch64`, well above the 5.19 minimum kernel
+for multishot accept (`IORING_ACCEPT_MULTISHOT`).
+
+## Limitations
+
+* Only the accept loop is exercised. Per-connection `recv` / `send` over
+  `io_uring` is not implemented yet — see the staged plan in
+  [`zig/src/iouring.zig`](../../zig/src/iouring.zig).
+* No request/response payload is sent; the test closes accepted fds
+  immediately.
+* No latency or throughput numbers are produced. Per `AGENTS.md`, do not
+  cite this script in any benchmark table or release note.
diff --git a/bench/iouring/build.sh b/bench/iouring/build.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Build the io_uring smoke-test binary for Linux (aarch64-linux-musl by
+# default; override with TARGET=...).
+#
+# The build runs on the host and relies on Zig's cross-compilation, so it
+# works from macOS, a Linux dev box, or a Linux CI runner alike. The
+# resulting binary is what the Containerfile in this directory copies into
+# the smoke-test image.
+set -euo pipefail
+
+# Resolve the repository root from this script's location so the relative
+# paths below work regardless of the caller's working directory.
+REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+cd "$REPO_ROOT"
+
+TARGET="${TARGET:-aarch64-linux-musl}"
+OUT="bench/iouring/iouring_smoke"
+
+# Stub turbo_build_options so iouring.zig compiles standalone without the
+# whole turbonet build graph. The stub lives in a temp dir that is removed
+# on exit, whether the build succeeds or fails.
+STUB_DIR="$(mktemp -d)"
+trap 'rm -rf "$STUB_DIR"' EXIT
+# Quoted delimiter: the Zig source is written verbatim, with no shell
+# expansion inside the heredoc.
+cat > "$STUB_DIR/turbo_build_options.zig" <<'EOF'
+pub const iouring_enabled: bool = true;
+EOF
+
+echo "==> cross-compiling iouring smoke for $TARGET"
+# Module graph: root imports `iouring`, which imports `turbo_build_options`.
+# Each --dep applies to the -M module that follows it, so order matters.
+zig build-exe -target "$TARGET" -O ReleaseSafe -femit-bin="$OUT" \
+    --dep iouring \
+    -Mroot=bench/iouring/iouring_smoke.zig \
+    --dep turbo_build_options \
+    -Miouring=zig/src/iouring.zig \
+    -Mturbo_build_options="$STUB_DIR/turbo_build_options.zig"
+
+# `file` is only a sanity print of the produced binary's format; skip it on
+# minimal hosts where it is not installed rather than failing a good build.
+if command -v file >/dev/null 2>&1; then
+    file "$OUT"
+fi
+echo "==> built $OUT"
diff --git a/bench/iouring/full_bench/.gitignore b/bench/iouring/full_bench/.gitignore
@@ -0,0 +1,2 @@
+# Raw wrk output directory; regenerated by build_and_bench.sh
+results/
diff --git a/bench/iouring/full_bench/Containerfile b/bench/iouring/full_bench/Containerfile
@@ -0,0 +1,60 @@
+# Linux io_uring vs blocking-accept benchmark image for TurboAPI.
+# Builds the Zig extension twice (once with -Diouring=true, once without),
+# runs the same TurboAPI app under each, and drives wrk against it from
+# inside the same container.
+#
+# Honest scope: the ONLY thing that differs between the two builds in the
+# current PR (#144) is the accept loop. Per-connection recv/send still goes
+# through the same thread-pool synchronous syscalls. Treat the deltas
+# accordingly.
+
+FROM debian:bookworm-slim
+
+# Build-time only: keeps apt non-interactive during the image build without
+# baking the setting into the runtime environment.
+ARG DEBIAN_FRONTEND=noninteractive
+
+ENV PATH="/root/.local/bin:/opt/zig:${PATH}"
+
+# Run shell-form RUN steps under bash with pipefail so a failed download in
+# `curl ... | sh` fails the build instead of being masked by the pipe.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    curl \
+    git \
+    pkg-config \
+    wrk \
+    xz-utils \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install Zig for the image's CPU architecture. `uname -m` reports
+# `aarch64` or `x86_64`, which is the arch component of Zig's Linux tarball
+# names, so the same Containerfile builds on either architecture.
+# Zig release to install; override with --build-arg ZIG_VERSION=<version>.
+ARG ZIG_VERSION=0.16.0
+RUN zig_arch="$(uname -m)" \
+ && curl -fsSL "https://ziglang.org/download/${ZIG_VERSION}/zig-${zig_arch}-linux-${ZIG_VERSION}.tar.xz" -o /tmp/zig.tar.xz \
+ && mkdir -p /opt/zig \
+ && tar -xJf /tmp/zig.tar.xz -C /opt/zig --strip-components=1 \
+ && rm /tmp/zig.tar.xz
+
+# Install uv to fetch Python 3.14 free-threaded.
+# NOTE(review): the installer is unpinned; consider pinning a uv release for
+# reproducible benchmark images.
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Pre-fetch Python 3.14t so the first run is fast.
+RUN /root/.local/bin/uv python install 3.14t
+
+WORKDIR /work
+
+# The repo will be bind-mounted at /work at run time. The driver script
+# (build_and_bench.sh) handles the per-build steps.
+# --chmod sets the exec bit at copy time instead of in an extra chmod layer.
+COPY --chmod=755 build_and_bench.sh /usr/local/bin/build_and_bench.sh
+COPY app.py /app/app.py
+
+ENTRYPOINT ["/usr/local/bin/build_and_bench.sh"]
diff --git a/bench/iouring/full_bench/RESULTS.md b/bench/iouring/full_bench/RESULTS.md
@@ -0,0 +1,83 @@
+# io_uring vs blocking-accept — A/B run
+
+> **Scope:** the *only* code path that differs between the two builds is
+> the listener accept loop. Per-connection `recv` / `send` still goes
+> through the existing thread-pool synchronous syscalls in both
+> variants. Treat the deltas accordingly. Per `AGENTS.md`, do not cite
+> these numbers in release notes, framework comparison tables, or any
+> public copy.
+
+## Environment
+
+- Apple `container` CLI (Linux microVM on macOS)
+- Kernel: `Linux 6.18.5 aarch64`
+- Python: `3.14.4 free-threaded` (GIL disabled)
+- Zig: `0.16.0`, `-Doptimize=ReleaseFast`
+- wrk: `t=4 c=64 d=10s` per iteration, 3s warmup
+- Iterations: **5 per (variant, workload)**, median reported
+- All traffic on loopback inside one container
+
+## Workloads
+
+| name      | request                                      | notes |
+|-----------|----------------------------------------------|-------|
+| `noargs`  | `GET /`                                      | trivial fast path |
+| `user_id` | `GET /user/{id}` with `id` random 1..10M     | path-param parsing every request, defeats per-path caching |
+| `query`   | `GET /q?id={id}` with `id` random 1..10M     | query-string parsing every request |
+| `items`   | `GET /items` returning a 50-record JSON body | bigger response (~2 KB) |
+
+`user_id` and `query` use `wrk -s vary_user_id.lua` / `vary_query.lua`
+which generate a fresh URL per request, so the radix-trie lookup runs
+cold every time.
+
+## Median of 5 runs
+
+| workload | variant   | req/s     | Δ vs blocking | p50      | p99      |
+|----------|-----------|-----------|---------------|----------|----------|
+| noargs   | blocking  | 697,933   | —             | 20 µs    | 86 µs    |
+| noargs   | iouring   | 713,240   | **+2.2 %**    | 21 µs    | 59 µs    |
+| user_id  | blocking  | 321,439   | —             | 43 µs    | 321 µs   |
+| user_id  | iouring   | 366,991   | **+14.2 %**   | 39 µs    | 261 µs   |
+| query    | blocking  | 235,954   | —             | 28 µs    | 7.87 ms  |
+| query    | iouring   | 235,270   | **−0.3 %**    | 28 µs    | 8.32 ms  |
+| items    | blocking  | 124,408   | —             | 170 µs   | 401 µs   |
+| items    | iouring   | 130,719   | **+5.1 %**    | 150 µs   | 533 µs   |
+
+Raw per-iteration `wrk` outputs are written to `results/` (git-ignored; re-run the benchmark to regenerate them).
+
+## Honest caveats
+
+- 5 samples is enough to spot order-of-magnitude differences but not
+  small ones; the `query` and `noargs` deltas are within run-to-run
+  noise on this VM.
+- p99 jitter is high (`query` shows multi-millisecond tails on both
+  builds — likely loopback + `wrk` timing artifacts, not server
+  pauses). Don't read the p99 column as a stable signal.
+- Single-container, loopback, single client. No multi-host, no
+  external network, no multi-worker deployment scenario.
+- `wrk -t4` is approaching saturation on the `noargs` route (~700k
+  rps). Some of the small delta there may be wrk-bound, not
+  server-bound.
+- This run was kicked off in a fresh container, so each variant got a
+  cold start; results were not interleaved.
+
+## Reproducing
+
+```bash
+container build -t turboapi-iouring-bench \
+    -f bench/iouring/full_bench/Containerfile \
+    bench/iouring/full_bench
+
+container run --rm -m 8G -c 4 \
+    -v "$PWD":/work \
+    turboapi-iouring-bench
+```
+
+Override env vars to scale up:
+
+```bash
+container run --rm -m 8G -c 4 \
+    -e DURATION=30s -e ITERS=10 -e CONNS=128 \
+    -v "$PWD":/work \
+    turboapi-iouring-bench
+```
diff --git a/bench/iouring/full_bench/app.py b/bench/iouring/full_bench/app.py
@@ -0,0 +1,64 @@
+"""Routes for io_uring vs blocking-accept A/B benchmarking.
+
+Intentionally small. Each route does the minimum work that exercises a
+different request path so we can check whether the accept-loop change
+moves the needle for anything other than the trivial `/` noargs case.
+
+Routes:
+  GET /              noargs fast path (baseline)
+  GET /user/{id}     path parameter — varied per request by wrk to
+                     defeat any per-path lookup caching
+  GET /q             query string — echoes a single ?id= param
+  GET /items         fixed 50-record JSON body (~2 KB) — larger response
+
+Environment (only when run as a script):
+  HOST               bind address (default 0.0.0.0)
+  PORT               bind port (default 8080)
+"""
+
+import os
+
+from turboapi import TurboAPI
+
+app = TurboAPI(title="iouring-bench")
+
+
+# Trivial fast path: constant body, no parameters.
+@app.get("/")
+def home():
+    return {"ok": True}
+
+
+# Path-parameter route: echoes the `id` path segment back as a string.
+@app.get("/user/{id}")
+def get_user(id: str):
+    return {"id": id}
+
+
+# Query-string route: echoes `?id=`, defaulting to "0" when absent.
+@app.get("/q")
+def get_q(id: str = "0"):
+    return {"id": id}
+
+
+# ~2 KB JSON body (50 records) — exercises more serializer + more bytes
+# on the wire than the trivial routes above. Built once at import time so
+# the handler itself does no per-request construction.
+_ITEMS = [
+    {"id": i, "name": f"item-{i}", "price": i * 1.5, "in_stock": (i % 3 == 0)}
+    for i in range(50)
+]
+
+
+@app.get("/items")
+def get_items():
+    return {"items": _ITEMS}
+
+
+if __name__ == "__main__":
+    host = os.environ.get("HOST", "0.0.0.0")
+    port = int(os.environ.get("PORT", "8080"))
+    # flush=True pushes the banner out immediately, so no separate
+    # sys.stdout.flush() is needed even when stdout is a pipe.
+    print(f"[bench-app] starting on {host}:{port}", flush=True)
+    app.run(host=host, port=port)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Built artifacts (regenerate via ./build.sh)
		iouring_smoke
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# wrk output is regenerated by build_and_bench.sh
		results/