# Source: rl-testing-env/openenv.yaml at main · AmulyaSKumar/rl-testing-env · GitHub
---
# OpenEnv Specification for rl-testing-env
# An environment for training and evaluating AI agents on automated software testing

# Environment identifier.
name: rl-testing-env
# Spec version. Quoted so it always loads as a string: a shorter value such as
# 1.0 or 1.10 would otherwise be read as a float.
version: "1.0.0"

# Long-form summary of the environment. Literal block scalar (`|`): the line
# breaks and the blank lines between paragraphs are part of the value.
description: |
  RL-Testing-Env is an OpenEnv environment designed to train and evaluate AI agents
  on automated software testing tasks. Agents learn to write effective pytest test
  suites that discover bugs, achieve high code coverage, and correctly classify
  test failures as regressions or intentional changes.

  The environment provides three difficulty levels:
  - Easy: Write unit tests to find seeded bugs in a single function
  - Medium: Achieve coverage targets while identifying buggy functions in a module
  - Hard: Audit code changes to distinguish real regressions from intentional updates

  Each task presents the agent with Python code containing subtle bugs (math errors,
  wrong operators, missing edge cases, incorrect conditionals). The agent submits
  pytest test code and receives rewards based on bugs caught, coverage achieved,
  and correct classification of issues.

# Attribution for this environment spec.
author: OpenEnv Community

# Keyword labels for this environment.
tags:
  - openenv
  - testing
  - software-quality
  - regression-testing
  - pytest
  - code-coverage
  - bug-detection
  - ai-evaluation

# Task catalogue, one entry per difficulty level. Every entry carries an id,
# a display name, a description, a step budget (max_steps), reward bounds
# (reward_range) and exploit_resistance: rules that keep trivial or gamed
# submissions from scoring. The ids are reused as keys under `reward` and
# `metadata.seed_documentation.task_effects`.
tasks:
  # Easy: catch the 3 seeded bugs in a single function.
  - id: unit_test_writer
    name: Unit Test Writer
    description: |
      Write pytest tests for a calculate_order_total() function that computes
      order totals with discounts and tax. The function contains exactly 3 subtle
      bugs selected from a pool of 6 possible bugs (wrong operators, incorrect
      divisors, precision errors). The agent must write tests that catch these bugs.
    difficulty: easy
    max_steps: 20
    reward_range: [0.0, 1.0]
    exploit_resistance:
      - "Tests must pass on correct code to count as bug catches"
      - "Trivially-failing tests (assert False) incur -0.15 penalty"
      - "Bug is only counted if tests fail on buggy code AND pass on correct code"

  # Medium: reach 80% line coverage and flag the buggy methods of a class.
  - id: coverage_audit
    name: Coverage Audit
    description: |
      Test an Inventory management class with 5 methods (add_item, remove_item,
      calculate_total, apply_discount, get_summary). The agent must achieve at
      least 80% line coverage AND identify which 2-3 functions contain bugs by
      writing tests that fail on buggy functions but pass on correct ones.
    difficulty: medium
    max_steps: 30
    reward_range: [0.0, 1.0]
    exploit_resistance:
      - "Test names must start with test_<function_name>_ to flag a function"
      - "Name-gaming (e.g., test_add_item_remove_item_...) is rejected"
      - "False positives (flagging clean functions) incur -0.05 penalty each"

  # Hard: separate real regressions from an intentional change between v1 and v2.
  - id: regression_audit
    name: Regression Audit
    description: |
      Analyze changes between v1 and v2 of an authentication module. The v1 code
      worked correctly with 8 passing tests. The v2 code contains 2 real regressions
      (bugs) and 1 intentional change (deliberate improvement). The agent must
      classify each failing test as either a regression or intentional change,
      write new tests to catch regressions, and update tests for intentional changes.
    difficulty: hard
    max_steps: 50
    reward_range: [0.0, 1.0]
    exploit_resistance:
      - "Regression tests must pass on v1 AND fail on v2 to earn +0.20"
      - "Trivially-failing tests (assert False) are rejected"
      - "Missing real regressions incur -0.20 penalty each"
      - "Wrongly retiring valid tests incurs -0.10 penalty each"

# Fields of the observation handed to the agent. Each field declares a `type`
# and a free-text `description`.
observation_space:
  # Task context: which task is running, its instructions, and the code to test.
  task_id:
    type: string
    description: |
      Unique identifier for the current task. One of: "unit_test_writer",
      "coverage_audit", or "regression_audit".

  task_description:
    type: string
    description: |
      Human-readable instructions explaining what the agent should do,
      including expected function behaviors, parameter descriptions,
      and hints about what types of bugs to look for.

  code_under_test:
    type: string
    description: |
      The Python source code that the agent must test. Contains one or more
      functions/classes with subtle bugs. For regression_audit tasks, this
      is the v2 code that may contain regressions.

  # Feedback from the most recent submission.
  previous_test_results:
    type: string
    description: |
      Pytest stdout from the most recent test submission. Empty string on
      the first step. Contains pass/fail information, assertion errors,
      and coverage reports when applicable.

  # Progress counters for the current episode.
  bugs_found_so_far:
    type: integer
    description: |
      Running count of distinct bugs the agent's tests have caught so far
      in this episode. Increments when tests fail due to actual bugs in
      the code under test.

  coverage_pct:
    type: float
    description: |
      Current line coverage percentage achieved by the agent's tests,
      ranging from 0.0 to 100.0. Primarily relevant for coverage_audit
      task where 80%+ coverage is required for full reward.

  step_number:
    type: integer
    description: |
      Current step number in the episode, starting at 0. Used to track
      progress toward the max_steps limit for the current task.

  # Conditional field: only populated under the thresholds stated below.
  hint:
    type: string
    description: |
      Optional hint provided to help struggling agents. Populated when
      reward < 0.2 and step_number > 3. Contains suggestions like
      "Try testing edge cases" or "Check boundary values".

# Fields of the action sent by the agent. The same three fields make up the
# /step request body listed under metadata.endpoints.
action_space:
  action_type:
    type: string
    description: The type of action to perform in the environment.
    # Each entry maps one allowed action_type value to its description.
    valid_values:
      - submit_tests: |
          Submit pytest test code for execution. The test code will be run
          against the buggy code and graded based on bugs caught, coverage,
          and correctness of classifications.
      - view_coverage: |
          Request current coverage information without submitting new tests.
          Returns the current observation with updated coverage_pct. Does
          not consume a grading attempt.
      - done: |
          Signal that the agent has finished the episode. Returns final
          state with done=True. Use when satisfied with results or when
          no further progress is possible.

  # NOTE: the `# FAILING_TESTS:` / `# CLASSIFICATION:` lines below sit inside
  # the block scalar, so they are literal text of the description (Python
  # comments the agent is expected to emit), not YAML comments.
  test_code:
    type: string
    description: |
      The pytest test code written by the agent. Should contain valid Python
      test functions using pytest conventions (def test_*). For regression_audit
      tasks, should also include special comments for classifications:
      # FAILING_TESTS: test_name1, test_name2
      # CLASSIFICATION: test_name1=regression
      # CLASSIFICATION: test_name2=intentional_change

  # The only field that declares a default (empty string).
  notes:
    type: string
    description: |
      Optional reasoning or notes from the agent explaining its testing
      strategy, hypotheses about bugs, or rationale for classifications.
      Not used in grading but useful for debugging and analysis.
    default: ""

# Per-task reward breakdown, keyed by task id (see `tasks`). Each `structure`
# entry reads "<delta>: <condition>"; `range` bounds the score and `optimal`
# describes the best achievable outcome.
# NOTE(review): negative terms alongside range [0.0, 1.0] suggest the summed
# score is clamped into that range - confirm against the grader.
reward:
  unit_test_writer:
    description: Rewards for the easy bug-finding task
    structure:
      - "+0.10: Test code runs without syntax errors"
      - "+0.30: Per bug caught (tests fail due to bug) - max 3 bugs = 0.90"
      # Listed for consistency: the unit_test_writer entry under `tasks`
      # declares this penalty in exploit_resistance, but the breakdown here
      # did not mention it.
      - "-0.15: Trivially-failing tests (assert False)"
      - "-0.10: Subprocess crash or timeout (10 second limit)"
    range: [0.0, 1.0]
    # Quoted: the value is prose that merely starts with a number.
    optimal: "1.0 (all 3 bugs caught with valid syntax)"

  coverage_audit:
    description: Rewards for the medium coverage and identification task
    structure:
      - "coverage_score: (actual_coverage / 80.0) * 0.5, clamped to max 0.5"
      - "+0.15: Per correctly identified buggy function (true positive)"
      - "-0.05: Per false positive (clean function wrongly flagged as buggy)"
    range: [0.0, 1.0]
    # NOTE(review): the listed terms top out at 0.50 + 3 x 0.15 = 0.95 (0.80
    # when only 2 functions are buggy), so 1.0 is not reachable from them
    # alone - confirm the real weights against the grader.
    optimal: "1.0 (80%+ coverage, all buggy functions correctly identified, no false positives)"

  regression_audit:
    description: Rewards for the hard regression classification task
    structure:
      - "+0.10: Correctly listing which of the 8 original tests now fail"
      - "+0.15: Per correct classification (regression vs intentional_change)"
      - "+0.20: New test catches a real regression (test named *regression*)"
      - "+0.20: Updated test for intentional change passes (test named *_updated)"
      - "-0.20: Per real regression missed entirely"
      - "-0.10: Per valid test wrongly retired (still_passing marked as failing)"
    range: [0.0, 1.0]
    # NOTE(review): the two +0.20 terms do not say whether they are awarded
    # once or per test, so the exact sum that reaches 1.0 cannot be derived
    # from this list - confirm against the grader.
    optimal: "1.0 (all failures identified, all correctly classified, regressions caught, updates working)"

# Deployment and API details for the environment server.
metadata:
  # Server stack. `port` is the same port published by `docker.run` below.
  runtime: python3.11
  port: 7860
  framework: fastapi

  # HTTP API. `body` entries describe request fields as human-readable
  # strings, not as a machine-checkable schema.
  endpoints:
    - path: /reset
      method: POST
      description: Reset environment and start new episode
      body:
        task_id: "unit_test_writer | coverage_audit | regression_audit"
        seed: "integer (default: 42) - controls which bugs are injected"
    - path: /step
      method: POST
      description: Execute one step with a TestAction
      # Mirrors the three fields declared under `action_space`.
      body:
        action_type: "submit_tests | view_coverage | done"
        test_code: "pytest test code string"
        notes: "optional reasoning string"
    - path: /state
      method: GET
      description: Get current episode state
    - path: /health
      method: GET
      description: Health check endpoint
    - path: /tasks
      method: GET
      description: List available task IDs
    - path: /leaderboard
      method: GET
      description: Top 5 scores per task across all episodes
    - path: /metrics
      method: GET
      description: Aggregate statistics (episodes, steps, avg scores)

  # How the /reset `seed` parameter affects task generation, per task id.
  seed_documentation:
    description: |
      The seed parameter enables deterministic task generation for reproducible
      evaluations. Each task uses seeded random number generation to select
      which bugs are injected into the code under test.

    behavior:
      - "Same seed + same task = identical bugs injected every time"
      - "Different seeds produce different bug combinations"
      - "Seed is passed to /reset endpoint when starting an episode"

    task_effects:
      unit_test_writer: |
        Selects 3 bugs from a pool of 6 possible bugs. Each bug affects a
        different line in calculate_order_total() (e.g., wrong operator,
        incorrect divisor, rounding error).
      coverage_audit: |
        Selects 2-3 functions from 5 to inject bugs into, and chooses
        which specific bug variant to inject per function (e.g., off-by-one,
        wrong comparison, missing cleanup).
      regression_audit: |
        Determines which 2 of 3 possible regressions are injected into v2
        code, and which 1 intentional change is made. Also affects which
        v1 tests will fail on v2.

    evaluation_usage: |
      For fair evaluation, run agents against multiple seeds (e.g., 42, 123, 456)
      and average the scores. The inference.py script accepts --seed argument:
        python inference.py --seed 42
        python inference.py --seed 123

  # Packages the environment depends on. No version constraints are given here.
  dependencies:
    - fastapi
    - uvicorn
    - pydantic
    - openenv-core
    - pytest
    - pytest-cov
    - openai

  # Container image name plus the build and run commands.
  # NOTE(review): the image is called autotest-env while the project is named
  # rl-testing-env - presumably intentional; confirm.
  docker:
    image: autotest-env
    build: docker build -t autotest-env .
    run: docker run -p 7860:7860 autotest-env