<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description"
content="FutureSightDrive: Thinking Visually with Spatio-Temporal CoT for Autonomous Driving">
<meta name="keywords" content="VLA, Autonomous driving, VLM, LLM, MLLM, CoT,LLM,Visual reasoning,World model">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>FutureSightDrive: Thinking Visually with Spatio-Temporal CoT for Autonomous Driving</title>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<link rel="icon" href="./static/images/logo.png">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">FutureSightDrive: Thinking Visually with Spatio-Temporal CoT for Autonomous Driving</h1>
<div class="has-text-centered" style="font-size: 2rem; color: black;">
<b>🎉🎉NeurIPS 2025 spotlight🎉🎉</b>
</div>
<div class="is-size-4 publication-authors">
<span class="author-block">
</div>
<div class="is-size-4 publication-authors">
<span class="author-block">
Shuang Zeng<sup>1, 2 *</sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com.hk/citations?user=5OnPBVYAAAAJ&hl=zh-CN">Xinyuan Chang</a><sup>1 </sup>
</span>,
</span>
<span class="author-block">
Mengwei Xie<sup>1 </sup>,</span>
<span class="author-block">
Xinran Liu<sup>1</sup>,</span> <br>
<span class="author-block">
Yifan Bai<sup>3 </sup>,</span>
<span class="author-block">
Zheng Pan<sup>1</sup>,</span>
<span class="author-block">
Mu Xu<sup>1 </sup>,</span>
<span class="author-block">
<a href="https://scholar.google.com.hk/citations?user=KNyC5EUAAAAJ&hl=zh-CN&oi=ao/">Xing Wei</a><sup>2†</sup>
</span>
</div>
<div class="is-size-5 publication-affiliations">
<span class="author-block"> <sup>1</sup> Amap, Alibaba Group </span>
   
<span class="author-block"> <sup>2</sup> Xi’an Jiaotong University </span>
   
<span class="author-block"> <sup>3</sup> DAMO Academy, Alibaba Group </span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<span class="link-block">
<a href="https://arxiv.org/pdf/2505.17685"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<span class="link-block">
<a href="https://arxiv.org/abs/2505.17685"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/MIV-XJTU/FSDrive"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero teaser">
<div class="container is-max-desktop">
<div class="hero-body" style="text-align: center">
<div class="columns is-centered">
<img src="./static/images/compare.png" alt="Description of the image" style="max-width: 100%; height: auto;" />
</div>
</div>
</div>
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<div class="content has-text-justified">
<p>
Comparison of different CoT paradigms. A text CoT conveys insufficient information,
and an image-text CoT mixes inconsistent modalities. The proposed spatio-temporal
CoT captures both the temporal and spatial relationships of the future scene.
</p>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-2">Abstract</h2>
<div class="content has-text-justified">
<p>
Visual language models (VLMs) have attracted increasing interest in autonomous driving due to their powerful
reasoning capabilities. However, existing VLMs typically utilize a discrete text Chain-of-Thought (CoT) tailored
to the current scenario, which essentially represents a highly abstract and symbolic compression of visual information,
potentially leading to spatio-temporal relationship ambiguity and fine-grained information loss. Is autonomous driving
better modeled by real-world simulation and imagination than by pure symbolic logic?
</p>
<p>
In this paper, we propose a spatio-temporal CoT reasoning method that enables models to think visually. First, the VLM serves
as a world model that generates a unified image frame to predict future world states: perception results (e.g., lane
dividers and 3D detections) represent the future spatial relationships, and the ordinary future frame represents the temporal
evolution. This spatio-temporal CoT then serves as intermediate reasoning steps, enabling the VLM to function as an inverse
dynamics model that plans trajectories from current observations and future predictions. To implement visual generation in VLMs,
we propose a unified pretraining paradigm that integrates visual generation and understanding, along with a progressive visual CoT
that enhances autoregressive image generation. Extensive experimental results demonstrate the effectiveness of the proposed method,
advancing autonomous driving towards visual reasoning.
</p>
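<p>
To make the two-stage pipeline above concrete, here is a minimal Python sketch of
FSDrive-style inference. The <code>vlm.generate</code> interface and its parameters are
hypothetical placeholders, not the released API; the point is only that a single
autoregressive model first emits the spatio-temporal CoT frame and then plans on it.
</p>
<pre><code>def plan_trajectory(vlm, surround_images, instruction, horizon=6):
    """Two-stage inference sketch (hypothetical `vlm` interface)."""
    # Stage 1: the VLM acts as a world model and generates a unified future
    # frame -- visual tokens encoding lane dividers, 3D detections, and the
    # ordinary future image (the spatio-temporal CoT).
    future_cot = vlm.generate(
        images=surround_images,
        prompt=instruction + "\nPredict the future frame as spatio-temporal CoT.",
        output_modality="image",
    )
    # Stage 2: the VLM acts as an inverse dynamics model, planning waypoints
    # from the current observation plus the predicted future.
    return vlm.generate(
        images=surround_images + [future_cot],
        prompt=instruction + f"\nPlan the next {horizon} waypoints (x, y).",
        output_modality="text",
    )</code></pre>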
</div>
</div>
</div>
<!--/ Abstract. -->
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<h2 class="title is-2">Method</h2>
<div class="hero-body" style="text-align: center">
<div class="columns is-centered">
<img src="./static/images/method_final.png" alt="Description of the image" style="max-width: 100%; height: auto;" />
</div>
</div>
</div>
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<div class="content has-text-justified">
<p>
Overview of FSDrive. Taking the current surround-view images and task instructions as input,
the MLLM is trained via next-token prediction. It first predicts the future spatio-temporal
CoT, then generates a trajectory based on the current observation and the predicted future.
</p>
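<p>
Because both the spatio-temporal CoT and the trajectory are emitted as tokens, the training
objective reduces to ordinary next-token prediction over a joint vocabulary. A hedged
PyTorch sketch follows; tensor shapes and names are assumptions, not the released code.
</p>
<pre><code>import torch.nn.functional as F

def unified_ntp_loss(logits, targets):
    """Cross-entropy over a joint vocabulary of text and discrete visual tokens.

    logits:  (batch, seq_len, vocab) -- model predictions (assumed shapes).
    targets: (batch, seq_len)        -- interleaved text/visual token ids.
    """
    # Shift by one position: each token is predicted from its prefix, so the
    # CoT frame and the trajectory are supervised by the same objective.
    return F.cross_entropy(
        logits[:, :-1].reshape(-1, logits.size(-1)),
        targets[:, 1:].reshape(-1),
    )</code></pre>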
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop has-text-centered">
<!-- Centered title -->
<div class="columns is-centered">
<h2 class="title is-2">Video</h2>
</div>
<!-- Video container -->
<div class="video-container">
<div class="video-row">
<div class="video-wrapper">
<video controls>
<source src="./static/images/case1.mp4" type="video/mp4">
</video>
</div>
<div class="video-wrapper">
<video controls>
<source src="./static/images/case2.mp4" type="video/mp4">
</video>
</div>
</div>
<div class="video-row">
<div class="video-wrapper">
<video controls>
<source src="./static/images/case3.mp4" type="video/mp4">
</video>
</div>
<div class="video-wrapper">
<video controls>
<source src="./static/images/case4.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
<!-- Justified text -->
<p class="has-text-justified">
The VLM serves as a world model that generates a unified image frame to predict future world states:
perception results (e.g., lane dividers and 3D detections) represent the future spatial
relationships, and the ordinary future frame represents the temporal evolution.
This spatio-temporal CoT then serves as intermediate reasoning steps, enabling the VLM to function
as an inverse dynamics model that plans trajectories from current observations and future predictions.
</p>
</div>
</section>
<style>
/* Custom styles */
.video-container {
display: flex;
flex-direction: column;
gap: 1rem; /* spacing between video rows */
margin-bottom: 2rem;
}
.video-row {
display: flex;
justify-content: center;
gap: 1rem; /* spacing between video columns */
flex-wrap: wrap;
}
.video-wrapper {
flex: 1 1 45%; /* adapt to different screen sizes */
max-width: 400px;
}
.video-wrapper video {
width: 100%;
height: auto;
}
/* Responsive adjustments */
@media screen and (max-width: 768px) {
.video-wrapper {
flex: 1 1 100%;
}
}
/* Fix justified alignment for CJK text */
.has-text-justified {
text-align: justify;
text-justify: inter-ideograph; /* special handling for CJK text */
}
</style>
<section class="section">
<div class="container is-max-desktop">
<h2 class="title is-2">Qualitative results</h2>
<div class="hero-body" style="text-align: center">
<div class="columns is-centered">
<img src="./static/images/vis.png" alt="Description of the image" style="max-width: 100%; height: auto;" />
</div>
</div>
</div>
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<div class="content has-text-justified">
<p>
Qualitative analysis of our CoT. The red trajectory is the prediction and the green one is the ground truth.
Without the spatio-temporal CoT, erroneous navigation inputs cause significant trajectory deviations
and potential collisions. Here, the correct instruction is used when reasoning our CoT, while the wrong
instruction is still employed for planning; FSDrive nevertheless mitigates the instruction errors through
observation-based trajectory planning and future prediction, demonstrating its inverse dynamics modeling capability.
</p>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<h2 class="title is-2">Main results</h2>
<div class="hero-body" style="text-align: center">
<div class="columns is-centered">
<img src="./static/images/main.jpg" alt="Description of the image" style="max-width: 100%; height: auto;" />
</div>
</div>
</div>
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<div class="content has-text-justified">
<p>
End-to-end trajectory planning experiments on nuScenes. We evaluate the L2 and collision metrics
under the distinct computation protocols of ST-P3 and UniAD, respectively. * indicates that
the ego status is additionally used. VAD and UniAD results are taken from BEV-Planner, while the
remaining results are sourced from their respective papers.
</p>
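<p>
For reference, a hedged sketch of the L2 metric mentioned above. The exact averaging
differs between the ST-P3 and UniAD protocols, and the sampling rate and horizons here
are assumptions for illustration only.
</p>
<pre><code>import numpy as np

def l2_errors(pred, gt, steps_per_second=2):
    """Average L2 distance between predicted and ground-truth waypoints.

    pred, gt: (T, 2) arrays of future (x, y) waypoints in ego coordinates,
    assumed sampled at 2 Hz over a 3 s horizon.
    """
    dists = np.linalg.norm(pred - gt, axis=-1)  # per-step displacement error
    return {f"{s}s": dists[: s * steps_per_second].mean() for s in (1, 2, 3)}</code></pre>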
</div>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<p>
If you find our work useful in your research, please cite our paper:
</p>
<pre><code>@article{zeng2025futuresightdrive,
title={FutureSightDrive: Thinking Visually with Spatio-Temporal CoT for Autonomous Driving},
author={Zeng, Shuang and Chang, Xinyuan and Xie, Mengwei and Liu, Xinran and Bai, Yifan and Pan, Zheng and Xu, Mu and Wei, Xing},
journal={arXiv preprint arXiv:2505.17685},
year={2025}
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content has-text-centered">
<a class="icon-link" href="" class="external-link" disabled>
<i class="fab fa-github"></i>
</a>
</div>
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p style="text-align: center">
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>