<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="x-ua-compatible" content="ie=edge" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>MVR: Multi-view Video Reward Shaping for Reinforcement Learning</title>
<meta
name="description"
content="MVR learns state relevance from multi-view video-text similarity and turns it into state-dependent reward shaping for reinforcement learning."
/>
<meta property="og:title" content="MVR: Multi-view Video Reward Shaping for Reinforcement Learning" />
<meta
property="og:description"
content="Learn state relevance from multi-view videos and turn it into reward shaping that guides early exploration and fades as performance improves."
/>
<meta property="og:type" content="website" />
<link rel="icon" href="images/favicon.svg" type="image/svg+xml" />
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" />
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" />
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css" />
<link rel="stylesheet" href="index.css" />
<script defer src="js/main.js"></script>
</head>
<body>
<a class="skip-link" href="#content">Skip to content</a>
<div class="container page-container">
<div class="top-nav">
<div class="top-nav-inner px-3 py-2 d-flex align-items-center justify-content-between flex-wrap gap-2">
<div class="d-flex align-items-center gap-2">
<span class="nav-brand">MVR</span>
<span class="pill-hint">Project Page</span>
</div>
<div class="d-flex align-items-center gap-1 flex-wrap">
<a class="nav-pill" href="#tldr">Abstract</a>
<a class="nav-pill" href="#overview">Overview</a>
<a class="nav-pill" href="#method">Method</a>
<a class="nav-pill" href="#results">Results</a>
<a class="nav-pill" href="#videos">Videos</a>
<a class="nav-pill" href="#bibtex">BibTeX</a>
</div>
</div>
</div>
<header class="py-4">
<div class="hero-card">
<div class="hero-title">
<div class="venue-badge">Accepted at ICLR 2026</div>
<div class="kicker">Reinforcement Learning · Video-Language Models · Reward Shaping · Multi-view Videos</div>
<h1 class="display-title">MVR: Multi-view Video Reward Shaping for Reinforcement Learning</h1>
<div class="hero-subtitle">Learn state relevance from multi-view videos and turn it into reward shaping that guides early exploration and fades as performance improves.</div>
</div>
<div class="hero-links">
<a class="btn btn-outline-primary btn-sm" href="assets/MVR_ICLR2026.pdf">
<i class="ai ai-open-access"></i>
Paper (PDF)
</a>
<a class="btn btn-outline-primary btn-sm" href="https://github.com/mvr-rl/mvr" target="_blank" rel="noreferrer">
<i class="fab fa-github"></i>
Code
</a>
<a class="btn btn-outline-primary btn-sm" href="https://openreview.net/forum?id=7lw6s9ELfr" target="_blank" rel="noreferrer">
<i class="ai ai-openreview"></i>
OpenReview
</a>
<a class="btn btn-outline-primary btn-sm" href="https://arxiv.org/abs/2603.01694" target="_blank" rel="noreferrer">
<i class="ai ai-arxiv"></i>
arXiv
</a>
</div>
<div class="hero-authors">
<div class="authors">
<span><a href="https://liruiluo.github.io/" target="_blank" rel="noreferrer">Lirui Luo</a></span>
<span>·</span>
<span><a href="https://guoxizhang.com/" target="_blank" rel="noreferrer">Guoxi Zhang</a></span>
<span>·</span>
<span><a href="https://sbx126.github.io/" target="_blank" rel="noreferrer">Hongming Xu</a></span>
<span>·</span>
<span><a href="https://www.yangyaodong.com/" target="_blank" rel="noreferrer">Yaodong Yang</a></span>
<span>·</span>
<span><a href="https://congfang-ml.github.io/" target="_blank" rel="noreferrer">Cong Fang</a></span>
<span>·</span>
<span><a href="https://liqing.io" target="_blank" rel="noreferrer">Qing Li</a></span>
</div>
<div class="affiliations">
<span>Peking University</span>
<span>·</span>
<span>BIGAI</span>
</div>
</div>
</div>
</header>
<main id="content" class="pb-5">
<section class="section-card" id="tldr">
<h2 class="section-title">Abstract</h2>
<p class="section-text mb-0">
Reward design is of great importance for solving complex tasks with reinforcement learning. Recent studies have explored using image-text similarity
produced by vision-language models (VLMs) to augment rewards of a task with visual feedback. A common practice linearly adds VLM scores to task or
success rewards without explicit shaping, potentially altering the optimal policy. Moreover, such approaches, often relying on single static images,
struggle with tasks whose desired behavior involves complex, dynamic motions spanning multiple visually different states. Furthermore, single
viewpoints can occlude critical aspects of an agent's behavior. To address these issues, this paper presents Multi-View Video Reward Shaping (MVR),
a framework that models the relevance of states to the target task using videos captured from multiple viewpoints. MVR leverages video-text
similarity from a frozen pre-trained VLM to learn a state relevance function that mitigates the bias towards specific static poses inherent in
image-based methods. Additionally, we introduce a state-dependent reward shaping formulation that integrates task-specific rewards and VLM-based
guidance, automatically reducing the influence of VLM guidance once the desired motion pattern is achieved. We confirm the efficacy of the proposed
framework with extensive experiments on challenging humanoid locomotion tasks from HumanoidBench and manipulation tasks from MetaWorld, verifying
the design choices through ablation studies.
</p>
</section>
<section class="section-card" id="overview">
<h2 class="section-title">Overview</h2>
<div class="figure-card mb-3">
<div class="figure-title">Why reward shaping?</div>
<img class="img-fluid rounded" src="figures/teaser.png" alt="MVR teaser figure" />
<div class="figure-caption">
Directly adding VLM similarity to rewards can change the optimal policy and is brittle for dynamic skills. MVR learns state relevance from
multi-view trajectory videos (reducing occlusion and viewpoint bias) and turns it into a state-dependent reward shaping term that guides
exploration early and fades away once the desired motion pattern is achieved.
</div>
</div>
<div class="callouts">
<div class="callout">
<div class="k">Challenge</div>
<div class="v">VLM similarity can be noisy and viewpoint-biased for dynamic skills; directly adding it to rewards may change the optimal policy.</div>
</div>
<div class="callout">
<div class="k">Key Idea</div>
<div class="v">Learn state relevance from multi-view video-text similarity via paired comparisons (Bradley–Terry), instead of regressing raw scores.</div>
</div>
<div class="callout">
<div class="k">Reward Shaping</div>
<div class="v">Use a state-dependent shaping term that helps exploration early and decays once trajectories become indistinguishable from a reference set.</div>
</div>
</div>
</section>
<section class="section-card" id="method">
<h2 class="section-title">Method</h2>
<p class="section-text">
MVR periodically renders recent trajectories into multi-view videos, queries a frozen VLM for similarity scores and embeddings, and updates a
state-space relevance function. The learned relevance defines a shaped reward for online off-policy RL.
</p>
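<p class="section-text">
The paired-comparison idea can be illustrated with a minimal sketch. This is not the paper's implementation: the network architecture, batch format, and variable names below are illustrative assumptions. The point is that the relevance function is trained so that its score ordering matches which trajectory video the VLM rates as more similar to the task text, via a Bradley–Terry logistic loss, rather than regressing raw similarity values.
</p>

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class RelevanceNet(nn.Module):
    """Illustrative state-relevance network: state vector -> scalar score."""
    def __init__(self, state_dim: int, hidden: int = 256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, s: torch.Tensor) -> torch.Tensor:
        return self.net(s).squeeze(-1)

def bradley_terry_loss(rel_a: torch.Tensor,
                       rel_b: torch.Tensor,
                       a_preferred: torch.Tensor) -> torch.Tensor:
    """Fit the ordering induced by VLM video-text comparisons.

    rel_a, rel_b: relevance scores for states drawn from two trajectory
    segments; a_preferred is 1.0 where segment a received the higher
    video-text similarity. Only the ordering is fit, not raw scores.
    """
    return F.binary_cross_entropy_with_logits(rel_a - rel_b, a_preferred)
```

<p class="section-text">
A clearly ordered pair drives the loss toward zero, so the learned scores only need to rank states consistently with the VLM's comparisons.
</p>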
<div class="figure-card">
<div class="figure-title">MVR framework</div>
<img class="img-fluid rounded" src="figures/framework.png" alt="MVR framework figure" />
<div class="figure-caption">
MVR alternates between relevance learning (paired-comparison matching and representation regularization) and policy learning with reference-set-based reward shaping.
</div>
</div>
<div class="row g-3 mt-1">
<div class="col-12 col-lg-6">
<div class="figure-card h-100">
<div class="figure-title">State relevance learning</div>
<p class="section-text mb-0">
Instead of regressing raw similarity scores in state space, MVR preserves the ordering induced by video-text comparisons via a Bradley–Terry
model, and regularizes representations for stability.
</p>
</div>
</div>
<div class="col-12 col-lg-6">
<div class="figure-card h-100">
<div class="figure-title">State-dependent reward shaping</div>
<p class="section-text mb-0">
Shaping guides the policy towards relevant states early, then fades as the policy becomes “indistinguishable” from high-scoring reference
behaviors, reducing persistent conflict with task rewards.
</p>
</div>
</div>
</div>
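<p class="section-text">
One way to read the fading behavior is as a gate on the relevance bonus: weight it by how far the policy's recent states are from a reference set of high-scoring states. The sketch below is a hypothetical illustration under that reading, not the paper's formulation; the nearest-neighbor distance, the exponential gate, and the <code>sigma</code> scale are all illustrative assumptions.
</p>

```python
import numpy as np

def shaped_reward(task_reward: float,
                  relevance: float,
                  policy_states: np.ndarray,
                  reference_states: np.ndarray,
                  sigma: float = 1.0) -> float:
    """Hypothetical state-dependent shaping gate.

    policy_states: (N, D) recent states from the current policy.
    reference_states: (M, D) states from high-scoring reference behaviors.
    As the policy's states approach the reference set, the gate -> 0 and
    the shaped reward reduces to the plain task reward.
    """
    # Distance from each policy state to its nearest reference state.
    d = np.min(np.linalg.norm(
        policy_states[:, None, :] - reference_states[None, :, :], axis=-1),
        axis=1)
    gate = 1.0 - np.exp(-d.mean() / sigma)  # ~0 when indistinguishable
    return task_reward + gate * relevance
```

<p class="section-text">
When the policy's states coincide with the reference set the gate is zero and only the task reward remains, so the shaping term cannot keep pulling the policy away from the task optimum late in training.
</p>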
</section>
<section class="section-card" id="results">
<h2 class="section-title">Results</h2>
<p class="section-text">
We evaluate MVR on 19 robotics tasks from HumanoidBench (humanoid locomotion) and MetaWorld (manipulation). The figures below are from the paper.
</p>
<div class="row g-3">
<div class="col-12">
<div class="figure-card">
<div class="figure-title">Tasks</div>
<img class="w-100 rounded" src="figures/tasks.png?v=ee0687b" alt="Tasks figure" />
</div>
</div>
<div class="col-12">
<div class="figure-card">
<div class="figure-title">Ablation</div>
<img class="w-100 rounded" src="figures/ablation.png" alt="Ablation figure" />
</div>
</div>
<div class="col-12">
<div class="figure-card">
<div class="figure-title">Case study: identifying suboptimal states</div>
<img class="w-100 rounded" src="figures/demo.png?v=ee0687b" alt="Demo figure" />
</div>
</div>
</div>
</section>
<section class="section-card" id="videos">
<h2 class="section-title">Videos</h2>
<p class="section-text">
Qualitative comparison on HumanoidBench. MVR succeeds on Sit_Hard, while TQC fails under the same setting.
</p>
<div class="row g-3">
<div class="col-12 col-lg-6">
<div class="figure-card h-100">
<div class="figure-title">MVR (success): Sit_Hard</div>
<video class="w-100 rounded" controls muted playsinline autoplay loop preload="metadata">
<source src="videos/mvr_success_h1hand_sit_hard_seed0.mp4" type="video/mp4" />
Your browser does not support the video tag.
</video>
<div class="figure-caption">MVR reaches the target sitting behavior; the shaping term decays as task rewards dominate.</div>
</div>
</div>
<div class="col-12 col-lg-6">
<div class="figure-card h-100">
<div class="figure-title">TQC (failure): Sit_Hard</div>
<video class="w-100 rounded" controls muted playsinline autoplay loop preload="metadata">
<source src="videos/tqc_failure_seed2_envseed0.mp4" type="video/mp4" />
Your browser does not support the video tag.
</video>
<div class="figure-caption">TQC fails to reach the target behavior under the same setting.</div>
</div>
</div>
</div>
</section>
<section class="section-card" id="bibtex">
<h2 class="section-title">BibTeX</h2>
<div class="bibtex-toolbar">
<div class="footer-note">If you find our work useful, please cite:</div>
<button id="copyBibBtn" class="btn btn-sm btn-primary" type="button">
<i class="fa-regular fa-copy"></i>
Copy
</button>
</div>
<div class="codebox">
<pre id="bibtexBlock">@inproceedings{luo2026mvr,
title = {MVR: Multi-view Video Reward Shaping for Reinforcement Learning},
author = {Luo, Lirui and Zhang, Guoxi and Xu, Hongming and Yang, Yaodong and Fang, Cong and Li, Qing},
booktitle = {International Conference on Learning Representations},
year = {2026}
}</pre>
</div>
</section>
<section class="section-card" id="contact">
<h2 class="section-title">Contact</h2>
<p class="section-text mb-0">
Corresponding authors: <a href="mailto:fangcong@pku.edu.cn">fangcong@pku.edu.cn</a> and
<a href="mailto:dylan.liqing@gmail.com">dylan.liqing@gmail.com</a>.
</p>
</section>
<div class="mt-4 footer-note">
Built as a static project page for GitHub Pages.
</div>
</main>
</div>
</body>
</html>