Skip to content

Commit e9f7a5f

Browse files
committed
harness: split scale test into same-session and multi-user concurrency
1 parent 196d37a commit e9f7a5f

3 files changed

Lines changed: 133 additions & 53 deletions

File tree

harness/config.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,26 @@ def timeout_state(self) -> dict:
100100
"latency_ewma_ms": round(self._latency_ewma_ms, 1),
101101
"latency_samples": self._latency_samples,
102102
}
103+
104+
def clone_for_user(self, user_id: str) -> "BenchConfig":
    """Create a per-user clone preserving harness behavior settings.

    Only configuration knobs are carried over; anything not listed here
    (e.g. the dynamic-timeout latency EWMA exposed by ``timeout_state``)
    is left to the constructor's fresh defaults on the clone.
    """
    carried = (
        "base_url",
        "token",
        "timeout",
        "timeout_dynamic",
        "timeout_floor_secs",
        "timeout_ceiling_secs",
        "timeout_multiplier",
        "timeout_grace_secs",
        "chat_endpoint",
        "health_endpoint",
        "ready_endpoint",
        "diagnostics_endpoint",
        "metrics_endpoint",
        "memory_sample_size",
        "scale_concurrency",
        "latency_requests",
        "schedule_wait_secs",
    )
    kwargs = {name: getattr(self, name) for name in carried}
    return BenchConfig(user_id=user_id, **kwargs)

harness/dim7_scale.py

Lines changed: 103 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,56 @@ def _timed_chat(config: BenchConfig, message: str) -> dict:
2121
}
2222

2323

24+
def _run_concurrency_scenario(configs: list[BenchConfig], message_prefix: str) -> tuple[list[float], int, float]:
25+
"""Run one concurrency scenario and return (latencies_ms, errors, wall_time_ms)."""
26+
latencies: list[float] = []
27+
errors = 0
28+
start_all = time.monotonic()
29+
30+
with concurrent.futures.ThreadPoolExecutor(max_workers=len(configs)) as pool:
31+
futures = []
32+
for i, cfg in enumerate(configs):
33+
futures.append(
34+
pool.submit(
35+
_timed_chat,
36+
cfg,
37+
f"{message_prefix} {i}: what is {i}+{i}?",
38+
)
39+
)
40+
for f in concurrent.futures.as_completed(futures):
41+
r = f.result()
42+
if r["error"]:
43+
errors += 1
44+
else:
45+
latencies.append(r["latency_ms"])
46+
47+
wall_time_ms = (time.monotonic() - start_all) * 1000
48+
return latencies, errors, wall_time_ms
49+
50+
51+
def _latency_stats(latencies: list[float]) -> dict:
52+
if not latencies:
53+
return {}
54+
ordered = sorted(latencies)
55+
return {
56+
"p50_ms": round(statistics.median(ordered), 1),
57+
"p95_ms": round(
58+
ordered[int(len(ordered) * 0.95)] if len(ordered) >= 5 else max(ordered),
59+
1,
60+
),
61+
"p99_ms": round(
62+
ordered[int(len(ordered) * 0.99)] if len(ordered) >= 10 else max(ordered),
63+
1,
64+
),
65+
}
66+
67+
2468
def run(config: BenchConfig) -> dict:
25-
"""Test scale under concurrent load."""
69+
"""Test scale in three modes:
70+
1) same-session contention (single user)
71+
2) multi-user parallel (one request per user)
72+
3) transport/pool metric snapshot
73+
"""
2674
results = {}
2775
concurrency = config.scale_concurrency
2876

@@ -40,47 +88,50 @@ def run(config: BenchConfig) -> dict:
4088
results["baseline_p50_ms"] = None
4189
results["baseline_error"] = "All baseline requests failed"
4290

43-
# Phase 2: Concurrent load
44-
latencies = []
45-
errors = 0
46-
start_all = time.monotonic()
47-
48-
with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as pool:
49-
futures = []
50-
for i in range(concurrency):
51-
futures.append(
52-
pool.submit(
53-
_timed_chat,
54-
config,
55-
f"Concurrent test request {i}: what is {i}+{i}?",
56-
)
57-
)
58-
for f in concurrent.futures.as_completed(futures):
59-
r = f.result()
60-
if r["error"]:
61-
errors += 1
62-
else:
63-
latencies.append(r["latency_ms"])
91+
# Phase 2a: Same-user same-session contention (diagnostic only; not primary scale metric)
92+
same_session_cfgs = [config for _ in range(concurrency)]
93+
same_latencies, same_errors, same_wall_ms = _run_concurrency_scenario(
94+
same_session_cfgs,
95+
"Same-session contention request",
96+
)
97+
same_stats = _latency_stats(same_latencies)
98+
results["same_session"] = {
99+
"requests": concurrency,
100+
"errors": same_errors,
101+
"success": len(same_latencies),
102+
"wall_time_ms": round(same_wall_ms, 1),
103+
"p50_ms": same_stats.get("p50_ms"),
104+
"p95_ms": same_stats.get("p95_ms"),
105+
"p99_ms": same_stats.get("p99_ms"),
106+
}
64107

65-
wall_time_ms = (time.monotonic() - start_all) * 1000
108+
# Phase 2b: Multi-user parallel (primary scale metric)
109+
try:
110+
base_user_num = int(config.user_id)
111+
user_ids = [str(base_user_num + i) for i in range(concurrency)]
112+
except ValueError:
113+
user_ids = [f"{config.user_id}-bench-{i}" for i in range(concurrency)]
114+
multi_cfgs = [config.clone_for_user(uid) for uid in user_ids]
115+
multi_latencies, multi_errors, multi_wall_ms = _run_concurrency_scenario(
116+
multi_cfgs,
117+
"Multi-user concurrent request",
118+
)
119+
multi_stats = _latency_stats(multi_latencies)
120+
results["multi_user"] = {
121+
"requests": concurrency,
122+
"errors": multi_errors,
123+
"success": len(multi_latencies),
124+
"wall_time_ms": round(multi_wall_ms, 1),
125+
"p50_ms": multi_stats.get("p50_ms"),
126+
"p95_ms": multi_stats.get("p95_ms"),
127+
"p99_ms": multi_stats.get("p99_ms"),
128+
}
66129

67-
results["concurrent_requests"] = concurrency
68-
results["concurrent_errors"] = errors
69-
results["concurrent_success"] = len(latencies)
70-
results["wall_time_ms"] = round(wall_time_ms, 1)
71-
72-
if latencies:
73-
latencies.sort()
74-
results["concurrent_p50_ms"] = round(statistics.median(latencies), 1)
75-
results["concurrent_p95_ms"] = (
76-
round(latencies[int(len(latencies) * 0.95)], 1)
77-
if len(latencies) >= 5
78-
else round(max(latencies), 1)
79-
)
80-
results["concurrent_p99_ms"] = (
81-
round(latencies[int(len(latencies) * 0.99)], 1)
82-
if len(latencies) >= 10
83-
else round(max(latencies), 1)
130+
same_p95 = results["same_session"].get("p95_ms")
131+
multi_p95 = results["multi_user"].get("p95_ms")
132+
if same_p95 and multi_p95 and multi_p95 > 0:
133+
results["contention_ratio_same_session_over_multi_user"] = round(
134+
same_p95 / multi_p95, 2
84135
)
85136

86137
# Phase 3: Parse metrics for pool/transport stats
@@ -96,23 +147,22 @@ def run(config: BenchConfig) -> dict:
96147
results["metrics_snapshot"][parts[0]] = parts[1]
97148

98149
# Score calculation
99-
success_rate = len(latencies) / concurrency if concurrency > 0 else 0
100-
p95 = results.get("concurrent_p95_ms", 60000)
150+
multi_success_rate = len(multi_latencies) / concurrency if concurrency > 0 else 0
151+
multi_p95 = results["multi_user"].get("p95_ms") or 60000
101152

102-
# inverse_rss (projected — can't measure from outside), inverse_p95, max_users, horizontal, inverse_cost
103-
p95_score = max(0, 100 - (p95 / 1000)) # 0ms = 100, 100s = 0
104-
success_score = success_rate * 100
153+
# Primary measured scale signal should use multi-user throughput, not same-session contention.
154+
multi_p95_score = max(0, 100 - (multi_p95 / 1000)) # 0ms = 100, 100s = 0
155+
verified_score = multi_p95_score * multi_success_rate
156+
157+
# Projection still includes unmeasured dimensions.
105158
projected_score = (
106-
0.70 * 0.25 # inverse_rss: projected (2.6MB binary, ~43MB test RSS)
107-
+ p95_score / 100 * 0.20
108-
+ 0.70 * 0.25 # max_users: projected (~1000/instance)
109-
+ 0.80 * 0.20 # horizontal: projected (tenant lock + Postgres)
159+
0.70 * 0.25 # inverse_rss: projected
160+
+ (multi_p95_score / 100) * 0.20
161+
+ 0.70 * 0.25 # max_users: projected
162+
+ 0.80 * 0.20 # horizontal: projected
110163
+ 0.60 * 0.10 # inverse_cost: projected
111164
) * 100
112-
113-
# Adjust based on actual success rate
114-
projected_score = projected_score * success_rate
115-
verified_score = p95_score * success_rate
165+
projected_score = projected_score * multi_success_rate
116166

117167
results["score"] = round(min(100, projected_score), 1)
118168
results["verified_score"] = round(min(100, verified_score), 1)

harness/runner.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ def main():
114114
action="store_true",
115115
help="Skip the 3-minute schedule execution wait",
116116
)
117+
parser.add_argument(
118+
"--scale-concurrency",
119+
type=int,
120+
default=20,
121+
help="Concurrent requests for scale dimension (default: 20)",
122+
)
117123
args = parser.parse_args()
118124

119125
config = BenchConfig(
@@ -126,6 +132,7 @@ def main():
126132
timeout_ceiling_secs=args.timeout_ceiling,
127133
timeout_multiplier=args.timeout_multiplier,
128134
timeout_grace_secs=args.timeout_grace,
135+
scale_concurrency=args.scale_concurrency,
129136
schedule_wait_secs=0 if args.skip_schedule_wait else 180,
130137
)
131138

0 commit comments

Comments
 (0)