Skip to content

Commit 737c721

Browse files
committed
Make scale benchmark provision users before fanout
1 parent 77034d4 commit 737c721

3 files changed

Lines changed: 225 additions & 17 deletions

File tree

harness/dim7_scale.py

Lines changed: 82 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import concurrent.futures
66

77
from .config import BenchConfig
8-
from .sse_client import chat, get_metrics
8+
from .sse_client import chat, get_metrics, provision_user
99

1010

1111
def _timed_chat(config: BenchConfig, message: str) -> dict:
@@ -122,23 +122,73 @@ def run(config: BenchConfig) -> dict:
122122
user_ids = [str(base_user_num + i) for i in range(concurrency)]
123123
except ValueError:
124124
user_ids = [f"{config.user_id}-bench-{i}" for i in range(concurrency)]
125-
multi_cfgs = [config.clone_for_user(uid) for uid in user_ids]
126-
multi_latencies, multi_errors, multi_wall_ms, multi_error_samples = _run_concurrency_scenario(
127-
multi_cfgs,
128-
"Multi-user concurrent request",
129-
)
130-
multi_stats = _latency_stats(multi_latencies)
131-
results["multi_user"] = {
132-
"requests": concurrency,
133-
"errors": multi_errors,
134-
"success": len(multi_latencies),
135-
"wall_time_ms": round(multi_wall_ms, 1),
136-
"p50_ms": multi_stats.get("p50_ms"),
137-
"p95_ms": multi_stats.get("p95_ms"),
138-
"p99_ms": multi_stats.get("p99_ms"),
139-
"error_samples": multi_error_samples,
125+
candidate_cfgs = [config.clone_for_user(uid) for uid in user_ids]
126+
127+
provisioned_cfgs: list[BenchConfig] = []
128+
provisioned_users: list[str] = []
129+
unavailable_users: list[str] = []
130+
provisioning_error_samples: list[str] = []
131+
for cfg in candidate_cfgs:
132+
provision_result = provision_user(cfg)
133+
if provision_result["ok"]:
134+
provisioned_cfgs.append(cfg)
135+
provisioned_users.append(cfg.user_id)
136+
continue
137+
138+
reason = provision_result.get("reason") or "unknown"
139+
if reason == "unknown_user_id":
140+
unavailable_users.append(cfg.user_id)
141+
elif len(provisioning_error_samples) < 3:
142+
sample = (
143+
f"user={cfg.user_id} status={provision_result.get('status_code')} "
144+
f"reason={reason}"
145+
)
146+
if sample not in provisioning_error_samples:
147+
provisioning_error_samples.append(sample)
148+
149+
results["multi_user_provisioning"] = {
150+
"requested_users": concurrency,
151+
"provisioned_users": len(provisioned_cfgs),
152+
"provisioned_user_ids": provisioned_users[:5],
153+
"unavailable_users": len(unavailable_users),
154+
"unavailable_user_ids": unavailable_users[:5],
155+
"error_samples": provisioning_error_samples,
140156
}
141157

158+
if len(provisioned_cfgs) >= 2:
159+
multi_latencies, multi_errors, multi_wall_ms, multi_error_samples = _run_concurrency_scenario(
160+
provisioned_cfgs,
161+
"Multi-user concurrent request",
162+
)
163+
multi_stats = _latency_stats(multi_latencies)
164+
results["multi_user"] = {
165+
"requests": len(provisioned_cfgs),
166+
"errors": multi_errors,
167+
"success": len(multi_latencies),
168+
"wall_time_ms": round(multi_wall_ms, 1),
169+
"p50_ms": multi_stats.get("p50_ms"),
170+
"p95_ms": multi_stats.get("p95_ms"),
171+
"p99_ms": multi_stats.get("p99_ms"),
172+
"error_samples": multi_error_samples,
173+
}
174+
identity_bootstrap_available = True
175+
else:
176+
results["multi_user"] = {
177+
"requests": len(provisioned_cfgs),
178+
"errors": 0,
179+
"success": 0,
180+
"wall_time_ms": 0.0,
181+
"p50_ms": None,
182+
"p95_ms": None,
183+
"p99_ms": None,
184+
"error_samples": [],
185+
}
186+
identity_bootstrap_available = False
187+
results["note"] = (
188+
"Multi-user scale measurement unavailable: benchmark users could not be "
189+
"provisioned through /api/v1/users/provision on this runtime."
190+
)
191+
142192
same_p95 = results["same_session"].get("p95_ms")
143193
multi_p95 = results["multi_user"].get("p95_ms")
144194
if same_p95 and multi_p95 and multi_p95 > 0:
@@ -159,7 +209,22 @@ def run(config: BenchConfig) -> dict:
159209
results["metrics_snapshot"][parts[0]] = parts[1]
160210

161211
# Score calculation
162-
multi_success_rate = len(multi_latencies) / concurrency if concurrency > 0 else 0
212+
if not identity_bootstrap_available:
213+
results["score"] = 0.0
214+
results["verified_score"] = 0.0
215+
results["projected_score"] = 0.0
216+
results["measured_coverage"] = 0.0
217+
return {
218+
"dimension": "scale_cost",
219+
"score": results["score"],
220+
"verified_score": results["verified_score"],
221+
"projected_score": results["projected_score"],
222+
"measured_coverage": results["measured_coverage"],
223+
"details": results,
224+
}
225+
226+
multi_requests = results["multi_user"]["requests"]
227+
multi_success_rate = len(multi_latencies) / multi_requests if multi_requests > 0 else 0
163228
multi_p95 = results["multi_user"].get("p95_ms") or 60000
164229

165230
# Primary measured scale signal should use multi-user throughput, not same-session contention.

harness/sse_client.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,43 @@ def get_metrics(config: BenchConfig) -> str | None:
168168
except requests.RequestException:
169169
pass
170170
return None
171+
172+
173+
def provision_user(config: BenchConfig) -> dict:
    """Attempt to provision the current benchmark user for tenant-aware runtimes.

    POSTs ``{"user_id": config.user_id}`` to ``/api/v1/users/provision`` on the
    configured base URL and normalizes the outcome into a dict with keys:

    * ``ok`` -- True only on an HTTP 200 response.
    * ``status_code`` -- HTTP status code, or None when the request never completed.
    * ``payload`` -- decoded JSON body ({} when the body is not valid JSON);
      absent on transport failure.
    * ``reason`` -- "provisioned" on success, "request_failed" on transport
      failure, the server-reported ``error`` string when present, else "http_error".
    * ``error`` -- exception text, present only on transport failure.
    """
    url = f"{config.base_url}/api/v1/users/provision"
    try:
        r = requests.post(
            url,
            headers=config.headers,
            json={"user_id": config.user_id},
            timeout=15,  # control-plane call; keep it short relative to chat timeouts
        )
    except requests.RequestException as e:
        # Transport-level failure (connection refused, DNS, timeout, ...):
        # report it distinctly so callers can separate it from HTTP rejections.
        return {
            "ok": False,
            "status_code": None,
            "error": str(e),
            "reason": "request_failed",
        }

    try:
        payload = r.json()
    except ValueError:
        # Catch ValueError rather than json.JSONDecodeError: Response.json()
        # raises requests.exceptions.JSONDecodeError, and with some
        # requests/simplejson combinations the raised type is not a
        # json.JSONDecodeError subclass -- but it is always a ValueError.
        payload = {}

    if r.status_code == 200:
        return {
            "ok": True,
            "status_code": r.status_code,
            "payload": payload,
            "reason": "provisioned",
        }

    # Prefer the server's own error string when the body carried one.
    reason = payload.get("error") if isinstance(payload, dict) else None
    return {
        "ok": False,
        "status_code": r.status_code,
        "payload": payload,
        "reason": reason or "http_error",
    }

results/nullalis-scale-probe.json

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
{
2+
"benchmark_version": "0.2",
3+
"runtime_name": "Nullalis scale probe",
4+
"url": "http://127.0.0.1:3000",
5+
"user_id": "1",
6+
"date": "2026-03-25",
7+
"artifact_type": "live_gateway_run",
8+
"dimension_scores": {
9+
"scale_cost": 76.2
10+
},
11+
"dimension_verified_scores": {
12+
"scale_cost": 96.0
13+
},
14+
"dimension_projected_scores": {
15+
"scale_cost": 76.2
16+
},
17+
"dimension_measured_coverage": {
18+
"scale_cost": 0.2
19+
},
20+
"dimension_details": {
21+
"scale_cost": {
22+
"baseline_p50_ms": 3797.0,
23+
"same_session": {
24+
"requests": 20,
25+
"errors": 0,
26+
"success": 20,
27+
"wall_time_ms": 40565.0,
28+
"p50_ms": 11195.5,
29+
"p95_ms": 40559.9,
30+
"p99_ms": 40559.9,
31+
"error_samples": []
32+
},
33+
"multi_user_provisioning": {
34+
"requested_users": 20,
35+
"provisioned_users": 2,
36+
"provisioned_user_ids": [
37+
"1",
38+
"2"
39+
],
40+
"unavailable_users": 18,
41+
"unavailable_user_ids": [
42+
"3",
43+
"4",
44+
"5",
45+
"6",
46+
"7"
47+
],
48+
"error_samples": []
49+
},
50+
"multi_user": {
51+
"requests": 2,
52+
"errors": 0,
53+
"success": 2,
54+
"wall_time_ms": 3988.4,
55+
"p50_ms": 3005.9,
56+
"p95_ms": 3988.2,
57+
"p99_ms": 3988.2,
58+
"error_samples": []
59+
},
60+
"contention_ratio_same_session_over_multi_user": 10.17,
61+
"metrics_snapshot": {
62+
"nullalis_http_transport_native_total{subsystem=\"tools\"}": "0",
63+
"nullalis_http_transport_native_total{subsystem=\"providers\"}": "0",
64+
"nullalis_http_transport_native_total{subsystem=\"channels\"}": "0",
65+
"nullalis_http_transport_native_total{subsystem=\"system\"}": "0",
66+
"nullalis_http_transport_curl_total{subsystem=\"tools\"}": "0",
67+
"nullalis_http_transport_curl_total{subsystem=\"providers\"}": "1688",
68+
"nullalis_http_transport_curl_total{subsystem=\"channels\"}": "0",
69+
"nullalis_http_transport_curl_total{subsystem=\"system\"}": "0",
70+
"nullalis_http_transport_fallback_total{subsystem=\"tools\"}": "0",
71+
"nullalis_http_transport_fallback_total{subsystem=\"providers\"}": "0",
72+
"nullalis_http_transport_fallback_total{subsystem=\"channels\"}": "0",
73+
"nullalis_http_transport_fallback_total{subsystem=\"system\"}": "0",
74+
"nullalis_http_pool_hits_total": "0",
75+
"nullalis_http_pool_misses_total": "0",
76+
"nullalis_http_pool_idle_connections": "0"
77+
},
78+
"score": 76.2,
79+
"verified_score": 96.0,
80+
"projected_score": 76.2,
81+
"measured_coverage": 0.2
82+
}
83+
},
84+
"verified_composite_score": 96.0,
85+
"projected_composite_score": 3.8,
86+
"coverage_adjusted_verified_score": 1.0,
87+
"measured_coverage": 0.01,
88+
"composite_score": 3.8,
89+
"rating": "Early Stage",
90+
"elapsed_seconds": 59.0,
91+
"runtime_timing": {
92+
"dynamic_enabled": false,
93+
"base_timeout_secs": 90,
94+
"last_timeout_used_secs": null,
95+
"timeout_floor_secs": 90,
96+
"timeout_ceiling_secs": 3600,
97+
"timeout_multiplier": 4.0,
98+
"timeout_grace_secs": 30,
99+
"latency_ewma_ms": 0.0,
100+
"latency_samples": 0
101+
},
102+
"method": "Direct harness run against live gateway using SSE stream endpoint; per-chat timeout policy described in runtime_timing."
103+
}

0 commit comments

Comments
 (0)