From 50c4782ad3653e6aa51de2db73532b1af35234b6 Mon Sep 17 00:00:00 2001
From: Claas de Boer
Date: Fri, 10 Apr 2026 14:23:19 +0200
Subject: [PATCH] Fix pred_depth in V-JEPA 2.1 ViT-L configs to match released checkpoint

The released ViT-L checkpoint (vjepa2_1_vitl_dist_vitG_384.pt) has 12
predictor blocks, matching the distillation protocol described in the
paper: "we retrain a new predictor, which has 12 blocks (instead of 24
for pretraining)".

The ViT-B config already correctly specifies pred_depth: 12.

With pred_depth: 24, loading the checkpoint results in a silent partial
load where blocks 0-11 are loaded but blocks 12-23 remain randomly
initialized (strict=False).
---
 configs/train_2_1/vitl16/cooldown-256px-64f.yaml | 2 +-
 configs/train_2_1/vitl16/pretrain-256px-16f.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/train_2_1/vitl16/cooldown-256px-64f.yaml b/configs/train_2_1/vitl16/cooldown-256px-64f.yaml
index 73308856..6a74c2ea 100644
--- a/configs/train_2_1/vitl16/cooldown-256px-64f.yaml
+++ b/configs/train_2_1/vitl16/cooldown-256px-64f.yaml
@@ -118,7 +118,7 @@ model:
   model_name: vit_large
   n_registers: 0
   normalize_predictor: false
-  pred_depth: 24
+  pred_depth: 12
   pred_embed_dim: 384
   pred_num_heads: 12
   uniform_power: true
diff --git a/configs/train_2_1/vitl16/pretrain-256px-16f.yaml b/configs/train_2_1/vitl16/pretrain-256px-16f.yaml
index 3988cf9a..6ffe061e 100644
--- a/configs/train_2_1/vitl16/pretrain-256px-16f.yaml
+++ b/configs/train_2_1/vitl16/pretrain-256px-16f.yaml
@@ -120,7 +120,7 @@ model:
   n_registers: 0
   n_registers_predictor: 0
   normalize_predictor: false
-  pred_depth: 24
+  pred_depth: 12
   pred_embed_dim: 384
   pred_is_causal: false
   pred_local_window: