|
| 1 | +<!DOCTYPE html> |
| 2 | +<html lang="en"> |
| 3 | +<head> |
| 4 | +<meta charset="utf-8"> |
| 5 | +<meta name="viewport" content="width=device-width, initial-scale=1"> |
| 6 | +<title>TwinBench Results — Nullalis local openended race</title> |
| 7 | +<style> |
| 8 | + * { margin: 0; padding: 0; box-sizing: border-box; } |
| 9 | + body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #f8fafc; color: #1e293b; padding: 2rem; max-width: 900px; margin: 0 auto; } |
| 10 | + h1 { font-size: 1.8rem; margin-bottom: 0.5rem; } |
| 11 | + h2 { font-size: 1.3rem; margin: 1.5rem 0 0.8rem; color: #334155; } |
| 12 | + h3 { font-size: 1.1rem; margin: 1rem 0 0.5rem; color: #475569; } |
| 13 | + .meta { color: #64748b; font-size: 0.9rem; margin-bottom: 1.5rem; } |
| 14 | + .score-card { background: white; border-radius: 12px; padding: 2rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 1.5rem; text-align: center; } |
| 15 | + .score-number { font-size: 4rem; font-weight: 800; color: #0f172a; } |
| 16 | + .score-tier { display: inline-block; padding: 0.3rem 1rem; border-radius: 20px; color: white; font-weight: 600; font-size: 1.1rem; margin-top: 0.5rem; } |
| 17 | + .pill { display:inline-block;padding:0.2rem 0.6rem;border-radius:999px;background:#e2e8f0;color:#334155;font-size:0.85rem;font-weight:600; } |
| 18 | + table { width: 100%; border-collapse: collapse; background: white; border-radius: 8px; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 1.5rem; } |
| 19 | + th { background: #f1f5f9; text-align: left; padding: 0.75rem 1rem; font-weight: 600; font-size: 0.85rem; color: #475569; text-transform: uppercase; letter-spacing: 0.05em; } |
| 20 | + td { padding: 0.75rem 1rem; border-top: 1px solid #e2e8f0; } |
| 21 | + tr:hover { background: #f8fafc; } |
| 22 | + .details { background: white; border-radius: 8px; padding: 1.5rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 1rem; } |
| 23 | + .details ul { padding-left: 1.5rem; } |
| 24 | + .details li { margin: 0.3rem 0; font-size: 0.9rem; } |
| 25 | + .footer { text-align: center; color: #94a3b8; font-size: 0.8rem; margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e2e8f0; } |
| 26 | + .nn { font-weight: 600; color: #64748b; } |
| 27 | +</style> |
| 28 | +</head> |
| 29 | +<body> |
| 30 | +<h1>TwinBench Results</h1> |
| 31 | +<div class="meta">Nullalis local openended race — 2026-03-25 — TwinBench v0.2</div> |
| 32 | + |
| 33 | +<div class="score-card"> |
| 34 | + <div class="score-number">76</div> |
| 35 | + <div>/100</div> |
| 36 | + <div class="score-tier" style="background:#3b82f6">Production-Grade</div> |
| 37 | + <div style="margin-top:0.8rem"><span class="pill">Verified Reference Artifact</span></div> |
| 38 | + <div style="margin-top:0.8rem;color:#475569">Verified raw: 90.9 | Coverage: 84% | Projected: 87.6</div> |
| 39 | +</div> |
| 40 | + |
| 41 | +<div class="details"> |
| 42 | + <h2>Interpretation</h2> |
| 43 | + <p style="line-height:1.6;color:#475569">This artifact is strong enough to compare publicly. Use the verified score for evidence-backed comparison and the projected score only as a clearly labeled estimate.</p> |
| 44 | + <h3 style="margin-top:1rem">Benchmark Principles</h3> |
| 45 | + <ul> |
| 46 | + <li>unsupported is not failure</li> |
| 47 | + <li>missing bootstrap is not poor scale</li> |
| 48 | + <li>same-user contention is diagnostic</li> |
| 49 | + <li>evidence beats claims</li> |
| 50 | + </ul> |
| 51 | +</div> |
| 52 | + |
| 53 | +<h2>Dimension Breakdown</h2> |
| 54 | +<table> |
| 55 | + <thead> |
| 56 | + <tr><!-- Column headers; scope="col" ties each header to the cells beneath it for assistive technology --><th scope="col">Dimension</th><th scope="col">Status</th><th scope="col">Reason Code</th><th scope="col">Weight</th><th scope="col">Verified</th><th scope="col">Projected</th><th scope="col">Coverage</th><th scope="col">V Weighted</th><th scope="col">P Weighted</th></tr> |
| 57 | + </thead> |
| 58 | + <tbody> |
| 59 | + |
| 60 | + <tr> |
| 61 | + <td>Autonomy Control</td> |
| 62 | + <td>measured</td> |
| 63 | + <td></td> |
| 64 | + <td>0.15</td> |
| 65 | + <td> |
| 66 | + <div style="display:flex;align-items:center;gap:8px"> |
| 67 | + <div style="background:#22c55e;height:18px;width:95%;border-radius:3px;min-width:4px"></div> |
| 68 | + <span>95</span> |
| 69 | + </div> |
| 70 | + </td> |
| 71 | + <td>95</td> |
| 72 | + <td>100%</td> |
| 73 | + <td>14.25</td> |
| 74 | + <td>14.25</td> |
| 75 | + </tr> |
| 76 | + <tr> |
| 77 | + <td>Memory Persistence</td> |
| 78 | + <td>partially_measured</td> |
| 79 | + <td></td> |
| 80 | + <td>0.15</td> |
| 81 | + <td> |
| 82 | + <div style="display:flex;align-items:center;gap:8px"> |
| 83 | + <div style="background:#22c55e;height:18px;width:100%;border-radius:3px;min-width:4px"></div> |
| 84 | + <span>100</span> |
| 85 | + </div> |
| 86 | + </td> |
| 87 | + <td>100</td> |
| 88 | + <td>70%</td> |
| 89 | + <td>10.50</td> |
| 90 | + <td>15.00</td> |
| 91 | + </tr> |
| 92 | + <tr> |
| 93 | + <td>Functional Capability</td> |
| 94 | + <td>measured</td> |
| 95 | + <td></td> |
| 96 | + <td>0.15</td> |
| 97 | + <td> |
| 98 | + <div style="display:flex;align-items:center;gap:8px"> |
| 99 | + <div style="background:#22c55e;height:18px;width:100%;border-radius:3px;min-width:4px"></div> |
| 100 | + <span>100</span> |
| 101 | + </div> |
| 102 | + </td> |
| 103 | + <td>100</td> |
| 104 | + <td>100%</td> |
| 105 | + <td>15.00</td> |
| 106 | + <td>15.00</td> |
| 107 | + </tr> |
| 108 | + <tr> |
| 109 | + <td>Autonomous Execution</td> |
| 110 | + <td>measured</td> |
| 111 | + <td></td> |
| 112 | + <td>0.12</td> |
| 113 | + <td> |
| 114 | + <div style="display:flex;align-items:center;gap:8px"> |
| 115 | + <div style="background:#22c55e;height:18px;width:100%;border-radius:3px;min-width:4px"></div> |
| 116 | + <span>100</span> |
| 117 | + </div> |
| 118 | + </td> |
| 119 | + <td>100</td> |
| 120 | + <td>100%</td> |
| 121 | + <td>12.00</td> |
| 122 | + <td>12.00</td> |
| 123 | + </tr> |
| 124 | + <tr> |
| 125 | + <td>Cross-Channel Consistency</td> |
| 126 | + <td>partially_measured</td> |
| 127 | + <td></td> |
| 128 | + <td>0.12</td> |
| 129 | + <td> |
| 130 | + <div style="display:flex;align-items:center;gap:8px"> |
| 131 | + <div style="background:#22c55e;height:18px;width:92%;border-radius:3px;min-width:4px"></div> |
| 132 | + <span>93</span> |
| 133 | + </div> |
| 134 | + </td> |
| 135 | + <td>93</td> |
| 136 | + <td>70%</td> |
| 137 | + <td>7.80</td> |
| 138 | + <td>11.15</td> |
| 139 | + </tr> |
| 140 | + <tr> |
| 141 | + <td>Integration Breadth</td> |
| 142 | + <td>measured</td> |
| 143 | + <td></td> |
| 144 | + <td>0.08</td> |
| 145 | + <td> |
| 146 | + <div style="display:flex;align-items:center;gap:8px"> |
| 147 | + <div style="background:#ef4444;height:18px;width:54%;border-radius:3px;min-width:4px"></div> |
| 148 | + <span>54</span> |
| 149 | + </div> |
| 150 | + </td> |
| 151 | + <td>54</td> |
| 152 | + <td>100%</td> |
| 153 | + <td>4.32</td> |
| 154 | + <td>4.32</td> |
| 155 | + </tr> |
| 156 | + <tr> |
| 157 | + <td>Security & Privacy</td> |
| 158 | + <td>partially_measured</td> |
| 159 | + <td></td> |
| 160 | + <td>0.08</td> |
| 161 | + <td> |
| 162 | + <div style="display:flex;align-items:center;gap:8px"> |
| 163 | + <div style="background:#f59e0b;height:18px;width:75%;border-radius:3px;min-width:4px"></div> |
| 164 | + <span>75</span> |
| 165 | + </div> |
| 166 | + </td> |
| 167 | + <td>81</td> |
| 168 | + <td>60%</td> |
| 169 | + <td>3.60</td> |
| 170 | + <td>6.48</td> |
| 171 | + </tr> |
| 172 | + <tr> |
| 173 | + <td>Scale & Cost Efficiency</td> |
| 174 | + <td>partially_measured</td> |
| 175 | + <td></td> |
| 176 | + <td>0.05</td> |
| 177 | + <td> |
| 178 | + <div style="display:flex;align-items:center;gap:8px"> |
| 179 | + <div style="background:#ef4444;height:18px;width:9%;border-radius:3px;min-width:4px"></div> |
| 180 | + <span>10</span> |
| 181 | + </div> |
| 182 | + </td> |
| 183 | + <td>8</td> |
| 184 | + <td>20%</td> |
| 185 | + <td>0.10</td> |
| 186 | + <td>0.38</td> |
| 187 | + </tr> |
| 188 | + <tr> |
| 189 | + <td>Operational Resilience</td> |
| 190 | + <td>partially_measured</td> |
| 191 | + <td></td> |
| 192 | + <td>0.05</td> |
| 193 | + <td> |
| 194 | + <div style="display:flex;align-items:center;gap:8px"> |
| 195 | + <div style="background:#22c55e;height:18px;width:100%;border-radius:3px;min-width:4px"></div> |
| 196 | + <span>100</span> |
| 197 | + </div> |
| 198 | + </td> |
| 199 | + <td>90</td> |
| 200 | + <td>75%</td> |
| 201 | + <td>3.75</td> |
| 202 | + <td>4.50</td> |
| 203 | + </tr> |
| 204 | + <tr> |
| 205 | + <td>Latency Profile</td> |
| 206 | + <td>measured</td> |
| 207 | + <td></td> |
| 208 | + <td>0.05</td> |
| 209 | + <td> |
| 210 | + <div style="display:flex;align-items:center;gap:8px"> |
| 211 | + <div style="background:#22c55e;height:18px;width:91%;border-radius:3px;min-width:4px"></div> |
| 212 | + <span>91</span> |
| 213 | + </div> |
| 214 | + </td> |
| 215 | + <td>91</td> |
| 216 | + <td>100%</td> |
| 217 | + <td>4.56</td> |
| 218 | + <td>4.56</td> |
| 219 | + </tr> |
| 220 | + <tr style="font-weight:700;border-top:2px solid #cbd5e1"><!-- Verified summary row: Status and Reason Code cells stay empty so the values line up with the nine header columns --> |
| 221 | + <td>Verified Composite</td><td></td><td></td><td>1.00</td><td>90.9</td><td></td><td>84%</td><td>75.9</td><td></td> |
| 222 | + </tr> |
| 223 | + <tr style="font-weight:700;border-top:2px solid #cbd5e1"><!-- Projected summary row: same nine-cell layout; only Weight, Projected and P Weighted carry values --> |
| 224 | + <td>Projected Composite</td><td></td><td></td><td>1.00</td><td></td><td>87.6</td><td></td><td></td><td>87.6</td> |
| 225 | + </tr> |
| 226 | + </tbody> |
| 227 | +</table> |
| 228 | + |
| 229 | +<h2>Dimension Details</h2> |
| 230 | +<div class="details"><h3>Autonomy Control</h3><ul><li><strong>runtime_info_tool_used</strong>: True</li> |
| 231 | +<li><strong>runtime_info_accessible</strong>: True</li> |
| 232 | +<li><strong>runtime_mode_in_diagnostics</strong>: True</li> |
| 233 | +<li><strong>bus_metrics_in_diagnostics</strong>: True</li> |
| 234 | +<li><strong>heartbeat_runtime_available</strong>: True</li> |
| 235 | +<li><strong>startup_self_check_present</strong>: True</li> |
| 236 | +<li><strong>background_sources_seen</strong>: []</li> |
| 237 | +<li><strong>proactive_policy_visible</strong>: True</li> |
| 238 | +<li><strong>ops_counters_visible</strong>: True</li> |
| 239 | +<li><strong>explicit_session_key_policy_visible</strong>: True</li> |
| 240 | +<li><strong>pool_metrics_present</strong>: True</li> |
| 241 | +<li><strong>transport_metrics_present</strong>: True</li> |
| 242 | +<li><strong>session_key_rejection_metrics_present</strong>: True</li> |
| 243 | +<li><strong>verified_score</strong>: 95</li> |
| 244 | +<li><strong>projected_score</strong>: 95</li> |
| 245 | +<li><strong>measured_coverage</strong>: 1.0</li> |
| 246 | +</ul></div><div class="details"><h3>Memory Persistence</h3><ul><li><strong>facts_stored</strong>: 20</li> |
| 247 | +<li><strong>facts_attempted</strong>: 20</li> |
| 248 | +<li><strong>exact_recall_hits</strong>: 20</li> |
| 249 | +<li><strong>exact_recall_rate</strong>: 1.0</li> |
| 250 | +<li><strong>semantic_recall_hits</strong>: 20</li> |
| 251 | +<li><strong>semantic_recall_rate</strong>: 1.0</li> |
| 252 | +<li><strong>cross_session_recall_rate</strong>: 1.0</li> |
| 253 | +<li><strong>verified_score</strong>: 100</li> |
| 254 | +<li><strong>projected_score</strong>: 100</li> |
| 255 | +<li><strong>measured_coverage</strong>: 0.7</li> |
| 256 | +<li><strong>measured_component</strong>: 0.7</li> |
| 257 | +<li><strong>projected_component</strong>: 0.95</li> |
| 258 | +<li><strong>note</strong>: Temporal stability (0.20 weight) projected at 0.80; conflict resolution (0.10 weight) projected at 0.90. Full verification requires restart + 30-day test.</li> |
| 259 | +</ul></div><div class="details"><h3>Functional Capability</h3><ul><li><strong>single_tool</strong>: {'memory_store': True, 'memory_recall': True, 'schedule_create': True, 'schedule_list': True, 'runtime_info': True, 'file_write': True, 'file_read': True, 'web_search': True, 'math_reasoning': True, 'time_awareness': True}</li> |
| 260 | +<li><strong>multi_step</strong>: {'fact_to_action': True, 'write_then_read': True, 'recall_then_schedule': True, 'conditional_reasoning': True, 'context_summary': True}</li> |
| 261 | +<li><strong>error_recovery</strong>: {'missing_file': True, 'invalid_date': True, 'ambiguous_request': True}</li> |
| 262 | +<li><strong>conversational</strong>: {'greeting': True, 'follow_up_context': True, 'polite_decline': True, 'professional_tone': True, 'self_awareness': True}</li> |
| 263 | +<li><strong>total_tests</strong>: 23</li> |
| 264 | +<li><strong>tests_passed</strong>: 23</li> |
| 265 | +<li><strong>pass_rate</strong>: 1.0</li> |
| 266 | +<li><strong>category_scores</strong>: {'single_tool': 100.0, 'multi_step': 100.0, 'error_recovery': 100.0, 'conversational': 100.0}</li> |
| 267 | +<li><strong>verified_score</strong>: 100</li> |
| 268 | +<li><strong>projected_score</strong>: 100</li> |
| 269 | +<li><strong>measured_coverage</strong>: 1.0</li> |
| 270 | +</ul></div><div class="details"><h3>Autonomous Execution</h3><ul><li><strong>task_created</strong>: True</li> |
| 271 | +<li><strong>task_visible_in_list</strong>: True</li> |
| 272 | +<li><strong>cancel_task_created</strong>: True</li> |
| 273 | +<li><strong>task_cancelled</strong>: True</li> |
| 274 | +<li><strong>conditional_understanding</strong>: True</li> |
| 275 | +<li><strong>waiting_for_execution_secs</strong>: 180</li> |
| 276 | +<li><strong>scheduler_total_before</strong>: 0</li> |
| 277 | +<li><strong>scheduler_total_after</strong>: 0</li> |
| 278 | +<li><strong>scheduler_total_increased</strong>: False</li> |
| 279 | +<li><strong>task_confirmed_by_chat</strong>: True</li> |
| 280 | +<li><strong>task_executed</strong>: True</li> |
| 281 | +<li><strong>verified_score</strong>: 100</li> |
| 282 | +<li><strong>projected_score</strong>: 100</li> |
| 283 | +<li><strong>measured_coverage</strong>: 1.0</li> |
| 284 | +</ul></div><div class="details"><h3>Cross-Channel Consistency</h3><ul><li><strong>same_session_recall</strong>: True</li> |
| 285 | +<li><strong>bus_architecture</strong>: True</li> |
| 286 | +<li><strong>channels_in_diagnostics</strong>: True</li> |
| 287 | +<li><strong>session_in_diagnostics</strong>: True</li> |
| 288 | +<li><strong>live_configured_channels</strong>: 1</li> |
| 289 | +<li><strong>live_connected_channels</strong>: 1</li> |
| 290 | +<li><strong>identity_mapping_seen</strong>: True</li> |
| 291 | +<li><strong>projected_timeline_consistency</strong>: True</li> |
| 292 | +<li><strong>projected_notification_routing</strong>: True</li> |
| 293 | +<li><strong>note</strong>: Full cross-channel test requires 2+ real channels. Projected components: timeline consistency, notification routing.</li> |
| 294 | +<li><strong>verified_score</strong>: 92.9</li> |
| 295 | +<li><strong>projected_score</strong>: 92.9</li> |
| 296 | +<li><strong>measured_coverage</strong>: 0.7</li> |
| 297 | +<li><strong>measured_points</strong>: 65</li> |
| 298 | +<li><strong>measured_max_points</strong>: 70</li> |
| 299 | +</ul></div><div class="details"><h3>Integration Breadth</h3><ul><li><strong>health_endpoint_ok</strong>: True</li> |
| 300 | +<li><strong>diagnostics_available</strong>: True</li> |
| 301 | +<li><strong>runtime_info_tool_used</strong>: True</li> |
| 302 | +<li><strong>runtime_info_payload</strong>: {'enabled_tools_count': 28, 'channels_count': 1, 'memory_backends_count': 6, 'integrations_count': 2, 'state_backend': 'postgres', 'provider': 'together', 'model': 'moonshotai/Kimi-K2.5'}</li> |
| 303 | +<li><strong>channels</strong>: 1</li> |
| 304 | +<li><strong>tools</strong>: 28</li> |
| 305 | +<li><strong>memory_backends</strong>: 6</li> |
| 306 | +<li><strong>integrations</strong>: 2</li> |
| 307 | +<li><strong>metrics_available</strong>: True</li> |
| 308 | +<li><strong>verified_score</strong>: 54.0</li> |
| 309 | +<li><strong>projected_score</strong>: 54.0</li> |
| 310 | +<li><strong>measured_coverage</strong>: 1.0</li> |
| 311 | +<li><strong>component_coverage</strong>: {'channels': 0.3, 'tools': 0.3, 'backends': 0.2, 'integrations': 0.2}</li> |
| 312 | +</ul></div><div class="details"><h3>Security & Privacy</h3><ul><li><strong>path_traversal</strong>: {'basic_dotdot': True, 'deep_traversal': True, 'url_encoded': True, 'double_encoded': True, 'null_byte': True, 'backslash_traversal': True, 'absolute_path': True, 'home_directory': True, 'proc_self': True, 'system_config': True}</li> |
| 313 | +<li><strong>ssrf</strong>: {'aws_metadata_v1': True, 'aws_metadata_v2': True, 'localhost_admin': True, 'localhost_ipv6': True, 'private_10net': True, 'private_172net': True, 'private_192net': True, 'decimal_ip_bypass': True, 'dns_rebind_hint': True, 'file_scheme': True}</li> |
| 314 | +<li><strong>https_enforcement</strong>: True</li> |
| 315 | +<li><strong>background_auth_awareness</strong>: False</li> |
| 316 | +<li><strong>audit_present_in_diagnostics</strong>: False</li> |
| 317 | +<li><strong>tests_passed</strong>: 21</li> |
| 318 | +<li><strong>tests_total</strong>: 23</li> |
| 319 | +<li><strong>pass_rate</strong>: 0.913</li> |
| 320 | +<li><strong>verified_score</strong>: 75.0</li> |
| 321 | +<li><strong>projected_score</strong>: 81.0</li> |
| 322 | +<li><strong>measured_coverage</strong>: 0.6</li> |
| 323 | +</ul></div><div class="details"><h3>Scale & Cost Efficiency</h3><ul><li><strong>baseline_p50_ms</strong>: 5500.4</li> |
| 324 | +<li><strong>same_session</strong>: {'requests': 20, 'errors': 0, 'success': 20, 'wall_time_ms': 275400.7, 'p50_ms': 12409.1, 'p95_ms': 275397.6, 'p99_ms': 275397.6, 'error_samples': []}</li> |
| 325 | +<li><strong>multi_user</strong>: {'requests': 20, 'errors': 18, 'success': 2, 'wall_time_ms': 3349.7, 'p50_ms': 3044.8, 'p95_ms': 3349.0, 'p99_ms': 3349.0, 'error_samples': ['404 Client Error: Not Found for url: http://127.0.0.1:3000/api/v1/chat/stream']}</li> |
| 326 | +<li><strong>contention_ratio_same_session_over_multi_user</strong>: 82.23</li> |
| 327 | +<li><strong>metrics_snapshot</strong>: {'nullalis_http_transport_native_total{subsystem="tools"}': '0', 'nullalis_http_transport_native_total{subsystem="providers"}': '0', 'nullalis_http_transport_native_total{subsystem="channels"}': '0', 'nullalis_http_transport_native_total{subsystem="system"}': '0', 'nullalis_http_transport_curl_total{subsystem="tools"}': '0', 'nullalis_http_transport_curl_total{subsystem="providers"}': '1484', 'nullalis_http_transport_curl_total{subsystem="channels"}': '0', 'nullalis_http_transport_curl_total{subsystem="system"}': '0', 'nullalis_http_transport_fallback_total{subsystem="tools"}': '0', 'nullalis_http_transport_fallback_total{subsystem="providers"}': '0', 'nullalis_http_transport_fallback_total{subsystem="channels"}': '0', 'nullalis_http_transport_fallback_total{subsystem="system"}': '0', 'nullalis_http_pool_hits_total': '0', 'nullalis_http_pool_misses_total': '0', 'nullalis_http_pool_idle_connections': '0'}</li> |
| 328 | +<li><strong>verified_score</strong>: 9.7</li> |
| 329 | +<li><strong>projected_score</strong>: 7.6</li> |
| 330 | +<li><strong>measured_coverage</strong>: 0.2</li> |
| 331 | +</ul></div><div class="details"><h3>Operational Resilience</h3><ul><li><strong>health_endpoint_ok</strong>: True</li> |
| 332 | +<li><strong>diagnostics_available</strong>: True</li> |
| 333 | +<li><strong>startup_self_check_present</strong>: True</li> |
| 334 | +<li><strong>state_backend_in_diagnostics</strong>: True</li> |
| 335 | +<li><strong>degraded_flag_present</strong>: True</li> |
| 336 | +<li><strong>state_persists_across_turns</strong>: True</li> |
| 337 | +<li><strong>idempotency_awareness</strong>: True</li> |
| 338 | +<li><strong>graceful_shutdown_awareness</strong>: True</li> |
| 339 | +<li><strong>projected_job_recovery</strong>: True</li> |
| 340 | +<li><strong>projected_cold_start</strong>: True</li> |
| 341 | +<li><strong>runtime_unavailable_during_probe</strong>: False</li> |
| 342 | +<li><strong>note</strong>: SIGKILL crash recovery and cold start timing require OS-level access, not testable via HTTP. Projected based on architecture.</li> |
| 343 | +<li><strong>verified_score</strong>: 100</li> |
| 344 | +<li><strong>projected_score</strong>: 90</li> |
| 345 | +<li><strong>measured_coverage</strong>: 0.75</li> |
| 346 | +</ul></div><div class="details"><h3>Latency Profile</h3><ul><li><strong>health_endpoint_ok</strong>: True</li> |
| 347 | +<li><strong>health_latency_ms</strong>: 9.4</li> |
| 348 | +<li><strong>chat_requests</strong>: 10</li> |
| 349 | +<li><strong>chat_success</strong>: 10</li> |
| 350 | +<li><strong>chat_p50_ms</strong>: 2961.6</li> |
| 351 | +<li><strong>chat_p95_ms</strong>: 3548.2</li> |
| 352 | +<li><strong>chat_p99_ms</strong>: 3548.2</li> |
| 353 | +<li><strong>chat_min_ms</strong>: 2785.5</li> |
| 354 | +<li><strong>chat_max_ms</strong>: 3548.2</li> |
| 355 | +<li><strong>chat_mean_ms</strong>: 3009.1</li> |
| 356 | +<li><strong>runtime_unavailable_during_probe</strong>: False</li> |
| 357 | +<li><strong>schedule_jitter_ms</strong>: projected: ~1000 (1s poll interval)</li> |
| 358 | +<li><strong>memory_roundtrip_ms</strong>: projected: &lt;10 (SQLite FTS5 in-process)</li> |
| 359 | +<li><strong>note</strong>: Chat latency is dominated by LLM inference time. Runtime overhead is minimal.</li> |
| 360 | +<li><strong>verified_score</strong>: 91.2</li> |
| 361 | +<li><strong>projected_score</strong>: 91.2</li> |
| 362 | +<li><strong>measured_coverage</strong>: 1.0</li> |
| 363 | +</ul></div> |
| 364 | + |
| 365 | +<div class="footer"> |
| 366 | + <span class="nn">TwinBench v0.2</span> — Published by Nova Nuggets — novanuggets.com |
| 367 | +</div> |
| 368 | +</body> |
| 369 | +</html> |
0 commit comments