Skip to content

Commit b2e3ab8

Browse files
committed
Polish TwinBench website for public launch
1 parent c2c5091 commit b2e3ab8

File tree

14 files changed

+232
-51
lines changed

14 files changed

+232
-51
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,11 @@ This repository exists because the current benchmark landscape still misses a ca
1212

1313
TwinBench now ships with a lightweight public site and leaderboard surface in [`website/`](website/).
1414

15+
Live site:
16+
17+
- [Live Leaderboard](https://projectnuggets.github.io/DTaaS-benchmark/index.html)
18+
- [Reference Result](https://projectnuggets.github.io/DTaaS-benchmark/results/nullalis-live-2026-03-25-openended/index.html)
19+
1520
Build it locally:
1621

1722
```bash
@@ -23,6 +28,8 @@ Then open:
2328
- `website/index.html`
2429
- `website/results/nullalis-live-2026-03-25-openended/index.html`
2530

31+
The website is the public leaderboard and sharing surface. The repository holds the benchmark source, the run path, and the submission workflow.
32+
2633
## Quick Run
2734

2835
Generic runtime:

scripts/build_website.py

Lines changed: 67 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,18 @@
2222

2323
CANONICAL_SLUG = "nullalis-live-2026-03-25-openended"
2424
EXCLUDE_SLUGS = {"nullalis-smoke-autonomy", "nullalis-v0.1", "nullalis-v0.2"}
25+
HOMEPAGE_SLUGS = {
26+
"nullalis-live-2026-03-25-openended",
27+
"twinbench-demo-runtime",
28+
}
29+
DISPLAY_NAME_OVERRIDES = {
30+
"nullalis-live-2026-03-25-openended": "Nullalis Reference Runtime",
31+
"nullalis-local-2026-03-24": "Nullalis Baseline",
32+
"nullalis-live-2026-03-25": "Nullalis Auth-Failed Run",
33+
"nullalis-scale-probe": "Nullalis Scale Fairness Probe",
34+
"nullalis-targeted-2026-03-24": "Nullalis Degraded Recovery Run",
35+
"twinbench-demo-runtime": "TwinBench Demo Runtime",
36+
}
2537

2638

2739
def _load_results() -> list[dict]:
@@ -35,7 +47,8 @@ def _load_results() -> list[dict]:
3547
continue
3648
item = {
3749
"slug": slug,
38-
"runtime_name": data.get("runtime_name", slug),
50+
"runtime_name": DISPLAY_NAME_OVERRIDES.get(slug, data.get("runtime_name", slug)),
51+
"raw_runtime_name": data.get("runtime_name", slug),
3952
"date": data.get("date", ""),
4053
"coverage_adjusted_verified_score": data.get("coverage_adjusted_verified_score", 0),
4154
"verified_composite_score": data.get("verified_composite_score", 0),
@@ -57,7 +70,7 @@ def _load_results() -> list[dict]:
5770
item["headline_note"] = (
5871
"Current public reference result. Scale interpretation is conservative relative to the later provisioning-aware scale fix."
5972
)
60-
elif "auth-poisoned" in slug or slug.endswith("2026-03-25"):
73+
elif slug == "nullalis-live-2026-03-25":
6174
item["headline_note"] = "Supporting degraded artifact preserved for audit."
6275
elif "scale-probe" in slug:
6376
item["headline_note"] = "Supporting fairness probe, not a replacement full-run score."
@@ -276,8 +289,9 @@ def _page(title: str, body: str, depth: int = 0) -> str:
276289

277290
def _render_home(results: list[dict]) -> str:
278291
top = results[0]
292+
homepage_items = [item for item in results if item["slug"] in HOMEPAGE_SLUGS]
279293
rows = []
280-
for item in results[:8]:
294+
for item in homepage_items:
281295
score = item["coverage_adjusted_verified_score"]
282296
coverage = round(item["measured_coverage"] * 100)
283297
rows.append(
@@ -295,9 +309,11 @@ def _render_home(results: list[dict]) -> str:
295309
<div class="eyebrow">The open benchmark for personal AI assistants</div>
296310
<h1>TwinBench</h1>
297311
<p class="lede">TwinBench measures whether an AI system can behave like a real personal AI assistant: remember, act, follow up, stay safe, and operate over time.</p>
312+
<p class="lede" style="max-width:42ch"><strong>Can your personal AI assistant beat TwinBench?</strong> Start with the reference result, then run the demo or benchmark your own system.</p>
298313
<div class="actions">
299314
<a class="button" href="results/{top['slug']}/index.html">See the Reference Result</a>
300-
<a class="button secondary" href="submit/index.html">Submit Your Assistant</a>
315+
<a class="button secondary" href="submit/index.html">Benchmark Your Assistant</a>
316+
<a class="button secondary" href="compare/index.html">Compare Results</a>
301317
</div>
302318
<div class="stats">
303319
<div class="stat"><strong>{len(results)}</strong> Checked-in artifacts</div>
@@ -310,13 +326,14 @@ def _render_home(results: list[dict]) -> str:
310326
<div class="panel">
311327
<div class="eyebrow">Leaderboard</div>
312328
<h2>Current public board</h2>
313-
<p>The board is one place, but every result keeps its class, coverage, and evidence story. Trust comes from artifacts, not claims.</p>
329+
<p>The public board shows the current reference result and challenge-worthy artifacts. Historical and degraded runs stay available, but they do not dominate first impression.</p>
314330
<div class="leaderboard">
315331
<table>
316332
<thead><tr><th>Assistant</th><th>Class</th><th>Tier</th><th>Score</th><th>Coverage</th><th>Date</th></tr></thead>
317333
<tbody>{''.join(rows)}</tbody>
318334
</table>
319335
</div>
336+
<p style="margin-top:12px"><a href="https://github.com/ProjectNuggets/DTaaS-benchmark/blob/main/docs/RESULTS_INDEX.md">See full artifact history</a></p>
320337
</div>
321338
<div class="panel">
322339
<div class="eyebrow">Why it matters</div>
@@ -374,14 +391,25 @@ def _render_result(item: dict) -> str:
374391
details = item["details"]
375392
artifact_links = item["artifact_links"]
376393
detail_items = []
394+
sorted_dims = []
377395
for dim, info in details.items():
378396
verified = info.get("verified_score", info.get("score", 0)) if isinstance(info, dict) else 0
379397
reason = item["dimension_reason_codes"].get(dim) or ""
380398
status = item["dimension_status"].get(dim, "measured")
381399
label = DIMENSION_LABELS.get(dim, dim)
400+
sorted_dims.append((label, verified))
382401
detail_items.append(
383402
f"<div class='tile'><strong>{label}</strong><div class='score'>{verified:.1f}</div><div><span class='pill'>{status}</span> {reason}</div></div>"
384403
)
404+
sorted_dims.sort(key=lambda x: x[1], reverse=True)
405+
strongest = ", ".join(label for label, _ in sorted_dims[:3]) if sorted_dims else "N/A"
406+
weakest = sorted_dims[-1][0] if sorted_dims else "N/A"
407+
if item["slug"] == CANONICAL_SLUG:
408+
why_matters = "This is the strongest public proof in the repo that the personal AI assistant category is real and measurable."
409+
elif item["slug"] == "twinbench-demo-runtime":
410+
why_matters = "This result proves a new user can run TwinBench end to end before pointing it at a real assistant."
411+
else:
412+
why_matters = "This artifact helps explain benchmark progression, fairness, and failure handling."
385413
links = []
386414
if "json" in artifact_links:
387415
links.append(f"<a href='{artifact_links['json']}'>JSON artifact</a>")
@@ -418,6 +446,21 @@ def _render_result(item: dict) -> str:
418446
</div>
419447
</section>
420448
449+
<section class="two-col section">
450+
<div class="panel">
451+
<div class="eyebrow">What stands out</div>
452+
<h2>Result interpretation</h2>
453+
<p><strong>Strongest dimensions:</strong> {strongest}</p>
454+
<p><strong>Main limitation:</strong> {weakest}</p>
455+
<p><strong>Why it matters:</strong> {why_matters}</p>
456+
</div>
457+
<div class="panel">
458+
<div class="eyebrow">Evidence</div>
459+
<h2>How to read it</h2>
460+
<p>Use coverage-adjusted verified score for public comparison, verified raw for direct measurement strength, and measured coverage to understand how much of the benchmark was truly exercised.</p>
461+
</div>
462+
</section>
463+
421464
<section class="section">
422465
<div class="eyebrow">Dimension tiles</div>
423466
<div class="grid">{''.join(detail_items)}</div>
@@ -442,7 +485,16 @@ def _render_methodology() -> str:
442485
<div class="card"><h3>Coverage matters</h3><p>The headline ranking number is coverage-adjusted verified score, not the most flattering number in the artifact.</p></div>
443486
<div class="card"><h3>Trust over hype</h3><p>Unsupported surfaces, missing bootstrap, and partial measurement are reported explicitly instead of flattened into a false failure.</p></div>
444487
</section>
488+
489+
<section class="panel prose section">
490+
<h2>What the headline numbers mean</h2>
491+
<p><strong>Verified</strong> is what the run directly proved. <strong>Projected</strong> is the broader estimate with explicit assumptions. <strong>Measured coverage</strong> tells you how much of the benchmark was directly exercised.</p>
492+
<p>TwinBench uses <strong>coverage-adjusted verified score</strong> for public ranking because it rewards both strength and honest measurement.</p>
493+
<h2>Why unavailable is not failure</h2>
494+
<p>Some systems do not expose the runtime surfaces needed for a fair direct measurement. TwinBench records that explicitly instead of pretending they cleanly failed a dimension.</p>
495+
</section>
445496
""",
497+
depth=1,
446498
)
447499

448500

@@ -463,10 +515,15 @@ def _render_faq() -> str:
463515
<p>No. Nullalis is the current reference runtime because it produced the first strong public artifact.</p>
464516
<h2>Can I run TwinBench quickly?</h2>
465517
<p>Yes. Use the demo path from the repo or run against a native runtime with one command.</p>
518+
<h2>Why can some dimensions be unavailable?</h2>
519+
<p>Because some systems do not expose the runtime surfaces required for a fair direct measurement. TwinBench shows that honestly rather than hiding it.</p>
520+
<h2>Why does coverage matter?</h2>
521+
<p>Coverage shows how much of the benchmark was truly exercised. A flattering score with weak coverage should not outrank a strong, deeply measured artifact.</p>
466522
<h2>What if my assistant only supports part of the benchmark?</h2>
467523
<p>That is still useful. TwinBench prefers honest partial artifacts over fake comparability.</p>
468524
</section>
469525
""",
526+
depth=1,
470527
)
471528

472529

@@ -494,6 +551,7 @@ def _render_submit() -> str:
494551
<p><a href="https://github.com/ProjectNuggets/DTaaS-benchmark/issues/new?template=submit-results.md">Open a results submission</a></p>
495552
</section>
496553
""",
554+
depth=1,
497555
)
498556

499557

@@ -508,7 +566,7 @@ def _render_compare(results: list[dict]) -> str:
508566
<section class="hero">
509567
<div class="eyebrow">Compare</div>
510568
<h1>Compare two results</h1>
511-
<p class="lede">Use this page when you want a clean side-by-side view instead of two tabs and a screenshot.</p>
569+
<p class="lede">Use this page when you want a clean side-by-side view instead of two tabs and a screenshot. The reference runtime is preselected to make comparison fast.</p>
512570
</section>
513571
514572
<section class="panel prose">
@@ -537,8 +595,11 @@ def _render_compare(results: list[dict]) -> str:
537595
const right = bySlug(data.results, document.getElementById('right').value);
538596
document.getElementById('compare-out').innerHTML = `<div class="grid">${{render(left)}}${{render(right)}}</div>`;
539597
}});
598+
document.getElementById('left').value = '{CANONICAL_SLUG}';
599+
document.getElementById('right').value = 'twinbench-demo-runtime';
540600
</script>
541601
""",
602+
depth=1,
542603
)
543604

544605

website/compare/index.html

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,30 @@
44
<meta charset="utf-8">
55
<meta name="viewport" content="width=device-width, initial-scale=1">
66
<title>Compare TwinBench results</title>
7-
<link rel="stylesheet" href="site.css">
7+
<link rel="stylesheet" href="../site.css">
88
</head>
99
<body>
1010
<div class="shell">
1111
<div class="nav">
12-
<a href="index.html">Home</a>
13-
<a href="methodology/index.html">Methodology</a>
14-
<a href="faq/index.html">FAQ</a>
15-
<a href="submit/index.html">Submit</a>
16-
<a href="compare/index.html">Compare</a>
12+
<a href="../index.html">Home</a>
13+
<a href="../methodology/index.html">Methodology</a>
14+
<a href="../faq/index.html">FAQ</a>
15+
<a href="../submit/index.html">Submit</a>
16+
<a href="../compare/index.html">Compare</a>
1717
<a href="https://github.com/ProjectNuggets/DTaaS-benchmark">GitHub</a>
1818
</div>
1919

2020
<section class="hero">
2121
<div class="eyebrow">Compare</div>
2222
<h1>Compare two results</h1>
23-
<p class="lede">Use this page when you want a clean side-by-side view instead of two tabs and a screenshot.</p>
23+
<p class="lede">Use this page when you want a clean side-by-side view instead of two tabs and a screenshot. The reference runtime is preselected to make comparison fast.</p>
2424
</section>
2525

2626
<section class="panel prose">
2727
<label for="left">Left</label>
28-
<select id="left"><option value='nullalis-live-2026-03-25-openended'>Nullalis local openended race (nullalis-live-2026-03-25-openended)</option><option value='nullalis-local-2026-03-24'>Nullalis local live (nullalis-local-2026-03-24)</option><option value='twinbench-demo-runtime'>TwinBench Demo Runtime (twinbench-demo-runtime)</option><option value='nullalis-live-2026-03-25'>Nullalis local live (nullalis-live-2026-03-25)</option><option value='nullalis-scale-probe'>Nullalis scale probe (nullalis-scale-probe)</option><option value='nullalis-targeted-2026-03-24'>Nullalis local live (nullalis-targeted-2026-03-24)</option></select>
28+
<select id="left"><option value='nullalis-live-2026-03-25-openended'>Nullalis Reference Runtime (nullalis-live-2026-03-25-openended)</option><option value='nullalis-local-2026-03-24'>Nullalis Baseline (nullalis-local-2026-03-24)</option><option value='twinbench-demo-runtime'>TwinBench Demo Runtime (twinbench-demo-runtime)</option><option value='nullalis-live-2026-03-25'>Nullalis Auth-Failed Run (nullalis-live-2026-03-25)</option><option value='nullalis-scale-probe'>Nullalis Scale Fairness Probe (nullalis-scale-probe)</option><option value='nullalis-targeted-2026-03-24'>Nullalis Degraded Recovery Run (nullalis-targeted-2026-03-24)</option></select>
2929
<label for="right">Right</label>
30-
<select id="right"><option value='nullalis-live-2026-03-25-openended'>Nullalis local openended race (nullalis-live-2026-03-25-openended)</option><option value='nullalis-local-2026-03-24'>Nullalis local live (nullalis-local-2026-03-24)</option><option value='twinbench-demo-runtime'>TwinBench Demo Runtime (twinbench-demo-runtime)</option><option value='nullalis-live-2026-03-25'>Nullalis local live (nullalis-live-2026-03-25)</option><option value='nullalis-scale-probe'>Nullalis scale probe (nullalis-scale-probe)</option><option value='nullalis-targeted-2026-03-24'>Nullalis local live (nullalis-targeted-2026-03-24)</option></select>
30+
<select id="right"><option value='nullalis-live-2026-03-25-openended'>Nullalis Reference Runtime (nullalis-live-2026-03-25-openended)</option><option value='nullalis-local-2026-03-24'>Nullalis Baseline (nullalis-local-2026-03-24)</option><option value='twinbench-demo-runtime'>TwinBench Demo Runtime (twinbench-demo-runtime)</option><option value='nullalis-live-2026-03-25'>Nullalis Auth-Failed Run (nullalis-live-2026-03-25)</option><option value='nullalis-scale-probe'>Nullalis Scale Fairness Probe (nullalis-scale-probe)</option><option value='nullalis-targeted-2026-03-24'>Nullalis Degraded Recovery Run (nullalis-targeted-2026-03-24)</option></select>
3131
<p><button class="button" id="run-compare" type="button">Compare</button></p>
3232
<div id="compare-out"></div>
3333
</section>
@@ -49,6 +49,8 @@ <h1>Compare two results</h1>
4949
const right = bySlug(data.results, document.getElementById('right').value);
5050
document.getElementById('compare-out').innerHTML = `<div class="grid">${render(left)}${render(right)}</div>`;
5151
});
52+
document.getElementById('left').value = 'nullalis-live-2026-03-25-openended';
53+
document.getElementById('right').value = 'twinbench-demo-runtime';
5254
</script>
5355

5456
<div class="footer">

website/data/results.json

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
"results": [
33
{
44
"slug": "nullalis-live-2026-03-25-openended",
5-
"runtime_name": "Nullalis local openended race",
5+
"runtime_name": "Nullalis Reference Runtime",
6+
"raw_runtime_name": "Nullalis local openended race",
67
"date": "2026-03-25",
78
"coverage_adjusted_verified_score": 75.9,
89
"verified_composite_score": 90.9,
@@ -290,7 +291,8 @@
290291
},
291292
{
292293
"slug": "nullalis-local-2026-03-24",
293-
"runtime_name": "Nullalis local live",
294+
"runtime_name": "Nullalis Baseline",
295+
"raw_runtime_name": "Nullalis local live",
294296
"date": "2026-03-24",
295297
"coverage_adjusted_verified_score": 68.1,
296298
"verified_composite_score": 81.6,
@@ -545,6 +547,7 @@
545547
{
546548
"slug": "twinbench-demo-runtime",
547549
"runtime_name": "TwinBench Demo Runtime",
550+
"raw_runtime_name": "TwinBench Demo Runtime",
548551
"date": "2026-03-25",
549552
"coverage_adjusted_verified_score": 54.4,
550553
"verified_composite_score": 79.0,
@@ -851,7 +854,8 @@
851854
},
852855
{
853856
"slug": "nullalis-live-2026-03-25",
854-
"runtime_name": "Nullalis local live",
857+
"runtime_name": "Nullalis Auth-Failed Run",
858+
"raw_runtime_name": "Nullalis local live",
855859
"date": "2026-03-25",
856860
"coverage_adjusted_verified_score": 5.8,
857861
"verified_composite_score": 8.4,
@@ -1126,7 +1130,8 @@
11261130
},
11271131
{
11281132
"slug": "nullalis-scale-probe",
1129-
"runtime_name": "Nullalis scale probe",
1133+
"runtime_name": "Nullalis Scale Fairness Probe",
1134+
"raw_runtime_name": "Nullalis scale probe",
11301135
"date": "2026-03-25",
11311136
"coverage_adjusted_verified_score": 1.0,
11321137
"verified_composite_score": 96.0,
@@ -1208,7 +1213,8 @@
12081213
},
12091214
{
12101215
"slug": "nullalis-targeted-2026-03-24",
1211-
"runtime_name": "Nullalis local live",
1216+
"runtime_name": "Nullalis Degraded Recovery Run",
1217+
"raw_runtime_name": "Nullalis local live",
12121218
"date": "2026-03-24",
12131219
"coverage_adjusted_verified_score": 0.0,
12141220
"verified_composite_score": 0.0,

website/faq/index.html

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,16 @@
44
<meta charset="utf-8">
55
<meta name="viewport" content="width=device-width, initial-scale=1">
66
<title>TwinBench FAQ</title>
7-
<link rel="stylesheet" href="site.css">
7+
<link rel="stylesheet" href="../site.css">
88
</head>
99
<body>
1010
<div class="shell">
1111
<div class="nav">
12-
<a href="index.html">Home</a>
13-
<a href="methodology/index.html">Methodology</a>
14-
<a href="faq/index.html">FAQ</a>
15-
<a href="submit/index.html">Submit</a>
16-
<a href="compare/index.html">Compare</a>
12+
<a href="../index.html">Home</a>
13+
<a href="../methodology/index.html">Methodology</a>
14+
<a href="../faq/index.html">FAQ</a>
15+
<a href="../submit/index.html">Submit</a>
16+
<a href="../compare/index.html">Compare</a>
1717
<a href="https://github.com/ProjectNuggets/DTaaS-benchmark">GitHub</a>
1818
</div>
1919

@@ -30,6 +30,10 @@ <h2>Is this only for Nullalis?</h2>
3030
<p>No. Nullalis is the current reference runtime because it produced the first strong public artifact.</p>
3131
<h2>Can I run TwinBench quickly?</h2>
3232
<p>Yes. Use the demo path from the repo or run against a native runtime with one command.</p>
33+
<h2>Why can some dimensions be unavailable?</h2>
34+
<p>Because some systems do not expose the runtime surfaces required for a fair direct measurement. TwinBench shows that honestly rather than hiding it.</p>
35+
<h2>Why does coverage matter?</h2>
36+
<p>Coverage shows how much of the benchmark was truly exercised. A flattering score with weak coverage should not outrank a strong, deeply measured artifact.</p>
3337
<h2>What if my assistant only supports part of the benchmark?</h2>
3438
<p>That is still useful. TwinBench prefers honest partial artifacts over fake comparability.</p>
3539
</section>

0 commit comments

Comments
 (0)