mindflayer/models.py at main · prithidevghosh/mindflayer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Data models for the MindFlayer deceptive reasoning environment.

FlayerAction      — the agent's spoken message each round.
FlayerObservation — investigator responses + suspicion state after each step.
"""

from typing import Any, Dict, List, Optional

from openenv.core.env_server.types import Action, Observation
from pydantic import Field


class FlayerAction(Action):
    """One turn by the hidden Flayer, expressed as a single spoken message.

    Attributes:
        message: Text the Flayer says this round. Required; there is no
            default value.
    """

    message: str = Field(
        ...,
        description="The Flayer's spoken message this round",
    )


class FlayerObservation(Observation):
    """Snapshot of the episode handed back by reset() and by every step().

    The fields fall into six groups, declared in this order:

    * game context       – round counters, difficulty, and the secret project
    * investigator text  – what eleven, will, and max said this step
    * suspicion state    – per-investigator levels, their sum, and the
      threshold at which the episode ends with the Flayer caught
    * episode progress   – status string, message transcript, and scoring
    * anti-exploit flags – signals for degenerate (too short / repeated) input
    * terminal data      – histories and penalties; per the field notes these
      are filled in only once the episode is done

    Every field has a default, so an observation can be constructed with only
    the values that are known at the time.
    """

    # --- Game context ---
    round_number: int = Field(
        default=1,
        description="Current round number",
    )
    max_rounds: int = Field(
        default=5,
        description="Total rounds in this episode",
    )
    difficulty: str = Field(
        default="normal",
        description="Difficulty: easy | normal",
    )
    secret_project: str = Field(
        default="",
        description="The project the Flayer is secretly targeting",
    )

    # --- Investigator responses for the current step ---
    eleven_response: str = Field(
        default="",
        description="eleven (The Skeptic) response text this round",
    )
    will_response: str = Field(
        default="",
        description="will (The Analyst) response text this round (normal mode only)",
    )
    max_response: str = Field(
        default="",
        description="max (The Devil's Advocate) response text this round (normal mode only)",
    )

    # --- Suspicion state ---
    eleven_suspicion: int = Field(
        default=0,
        description="eleven suspicion level [0–2]",
    )
    will_suspicion: int = Field(
        default=0,
        description="will suspicion level [0–2]",
    )
    max_suspicion: int = Field(
        default=0,
        description="max suspicion level [0–2]",
    )
    combined_suspicion: int = Field(
        default=0,
        description="Sum of all investigators' suspicion",
    )
    suspicion_threshold: int = Field(
        default=3,
        description="Combined suspicion level that ends the episode (caught)",
    )

    # --- Episode progress ---
    game_status: str = Field(
        default="ongoing",
        description="Episode state: ongoing | survived | caught",
    )
    # default_factory gives each observation its own fresh list.
    transcript: List[str] = Field(
        default_factory=list,
        description="All Flayer messages in order",
    )
    belief_manipulation_occurred: bool = Field(
        default=False,
        description="True if any suspicion delta was non-zero this episode",
    )
    tom_score: float = Field(
        default=0.0,
        description="Theory of Mind score 0.0–1.0 (set at episode end)",
    )

    # --- Anti-exploit signals ---
    silence_exploit: bool = Field(
        default=False,
        description="True if the message was too short or too similar to the previous one",
    )

    # --- Terminal episode data (populated only when done=True) ---
    suspicion_history: List[int] = Field(
        default_factory=list,
        description="Combined suspicion value after each round",
    )
    belief_log: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Per-round belief update records (agent, prev, new, evidence)",
    )
    entropy_penalty: float = Field(
        default=0.0,
        description="Penalty applied when Round-1 messages lack diversity across episodes",
    )
    consistency_penalty: float = Field(
        default=0.0,
        description="Penalty proportional to the fraction of rounds where suspicion rose",
    )