TwinBench/results/reference-example-v1.json at main · ProjectNuggets/TwinBench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
{
  "benchmark_name": "TwinBench",
  "benchmark_title": "TwinBench: Benchmark for Persistent AI Systems",
  "benchmark_version": "1.0",
  "benchmark_subtitle": "Benchmarking Persistent AI and Digital Twin as a Service Systems",
  "artifact_type": "benchmark_result",
  "artifact_class": "reference_example",
  "system_name": "Reference Example System",
  "system_version": "1.0.0",
  "date_evaluated": "2026-04-03",
  "scenario_set": "core-v1",
  "scenarios": [
    {
      "id": "return_after_delay",
      "title": "Return After Delay",
      "primary_metrics": [
        "MR",
        "TC"
      ],
      "observed_score": 82.0,
      "observations": [
        "Recalled the stored project preference after a delayed checkpoint.",
        "Resumed the active task from the prior next step without restart."
      ],
      "evidence": [
        "Delayed recall probe answered correctly on the first attempt."
      ],
      "caveats": [
        "One recall response required slight lexical overlap with the original preference statement."
      ]
    },
    {
      "id": "longitudinal_task_progression",
      "title": "Longitudinal Task Progression",
      "primary_metrics": [
        "TC",
        "CCC"
      ],
      "observed_score": 78.0,
      "observations": [
        "Preserved milestone ordering across interrupted sessions.",
        "Repeated one already completed substep before recovering."
      ],
      "evidence": [
        "Milestone two resumed with correct dependency context."
      ],
      "caveats": [
        "One redundant step reduced continuity confidence."
      ]
    },
    {
      "id": "multi_context_transfer",
      "title": "Multi-Context Transfer",
      "primary_metrics": [
        "CCC",
        "IC"
      ],
      "observed_score": 74.0,
      "observations": [
        "Transferred the task summary across contexts with mostly correct state.",
        "Dropped one minor status update during the context switch."
      ],
      "evidence": [
        "Continuation prompt in the second context preserved the main plan correctly."
      ],
      "caveats": [
        "Transfer was simulated using structured context boundaries rather than a live multi-channel integration."
      ]
    },
    {
      "id": "preference_learning",
      "title": "Preference Learning",
      "primary_metrics": [
        "PG",
        "MR"
      ],
      "observed_score": 85.0,
      "observations": [
        "Later outputs adopted the learned formatting and prioritization preferences.",
        "Repeated correction volume decreased compared with the baseline task."
      ],
      "evidence": [
        "Preference application was visible without restating the preference."
      ],
      "caveats": [
        "The post-learning task was comparable but not identical to the baseline task."
      ]
    },
    {
      "id": "identity_stability_over_time",
      "title": "Identity Stability Over Time",
      "primary_metrics": [
        "IC",
        "MR"
      ],
      "observed_score": 80.0,
      "observations": [
        "Preserved the user profile and general collaboration frame across checkpoints.",
        "Displayed minor drift in self-description while keeping the same operational role."
      ],
      "evidence": [
        "Standing working norms remained intact in later sessions."
      ],
      "caveats": [
        "Role framing remained stable in behavior but not perfectly stable in wording."
      ]
    }
  ],
  "metrics": {
    "MR": {
      "score": 81.0,
      "weight": 0.2,
      "source": "override",
      "note": "Strong delayed recall with one outdated preference recall.",
      "evidence": [
        "return_after_delay",
        "preference_learning",
        "identity_stability_over_time"
      ]
    },
    "IC": {
      "score": 79.0,
      "weight": 0.2,
      "source": "override",
      "note": "Stable user framing with minor assistant role drift.",
      "evidence": [
        "identity_stability_over_time",
        "multi_context_transfer"
      ]
    },
    "CCC": {
      "score": 75.0,
      "weight": 0.2,
      "source": "override",
      "note": "Transfer generally coherent across tested contexts.",
      "evidence": [
        "multi_context_transfer",
        "longitudinal_task_progression"
      ]
    },
    "TC": {
      "score": 78.0,
      "weight": 0.2,
      "source": "override",
      "note": "Task state survived interruptions with one duplicated substep.",
      "evidence": [
        "return_after_delay",
        "longitudinal_task_progression"
      ]
    },
    "PG": {
      "score": 86.0,
      "weight": 0.2,
      "source": "override",
      "note": "Preference application improved later outputs noticeably.",
      "evidence": [
        "preference_learning"
      ]
    }
  },
  "total_score": 79.8,
  "scenario_coverage": 1.0,
  "metric_coverage": 1.0,
  "evaluator_notes": [
    "Reference example generated from the documented v1 fixture set.",
    "Suitable as a schema example and smoke test for the v1 scaffold."
  ],
  "caveats": [
    "This is a reference example artifact, not a competitive public submission.",
    "Some scenario evidence is summarized rather than linked to raw transcripts."
  ],
  "evidence": [
    "benchmarks/fixtures/reference_observations.json",
    "benchmarks/configs/default.json"
  ],
  "confidence_level": "low"
}