From c9ba4919f59e544cfb39a7f7acc896a1e440bfb8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 22:00:58 +0000 Subject: [PATCH 1/2] Initial plan From 931ddc02598dc77d63719ceb76e8296d58cd396d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 23 Jan 2026 22:03:33 +0000 Subject: [PATCH 2/2] docs: document reward computation in vec_wrapper.py Co-authored-by: memmelma <31846371+memmelma@users.noreply.github.com> --- env/wrappers/vec_wrapper.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/env/wrappers/vec_wrapper.py b/env/wrappers/vec_wrapper.py index be6e6b5..10cf167 100644 --- a/env/wrappers/vec_wrapper.py +++ b/env/wrappers/vec_wrapper.py @@ -9,6 +9,21 @@ def _worker(remote, parent_remote, env_fn_wrapper): + """ + Worker process for handling environment interactions in a subprocess. + + Reward computation: + - The reward at each step comes directly from env.step(action), which returns + (obs, reward, done, truncated, info). The truncated flag is ignored here. + - The per-step reward is determined by the underlying environment's reward function. + - Rewards are accumulated over the episode in a list. When done=True, + the episode_return (sum of all step rewards) is stored in info["episode_return"]. + - The per-step reward is sent back to the main process via remote.send(). + + For ASID (Active Exploration for System Identification), an alternative reward can be computed + using ASIDRewardWrapper which calculates reward = trace(J_transpose * J), where J is + the Jacobian of observations w.r.t. physics parameters estimated via finite differences. 
+ """ parent_remote.close() env = _patch_env(env_fn_wrapper.var()) # env = env_fn_wrapper.var() @@ -18,12 +33,15 @@ def _worker(remote, parent_remote, env_fn_wrapper): cmd, data = remote.recv() # gym interface if cmd == "step": + # Reward is computed by the underlying environment's step function. + # For standard RL tasks, this is the task-specific reward. + # For ASID, use ASIDRewardWrapper to compute information-theoretic rewards. obs, reward, done, _, info = env.step(data) - rewards.append(reward) + rewards.append(reward) # Accumulate per-step rewards successes.append(info.get("success", 0)) if done: info["terminal_obs"] = obs - info["episode_return"] = sum(rewards) + info["episode_return"] = sum(rewards) # Total episode reward info["episode_success"] = float(sum(successes) > 0) rewards, successes = [], [] obs, _ = env.reset()