From 17cf1d34a65a5963d08322227ed892b2e4057298 Mon Sep 17 00:00:00 2001
From: Claas de Boer
Date: Fri, 10 Apr 2026 14:32:02 +0200
Subject: [PATCH] Add vit_base to embed_dim_encoder lookup in V-JEPA 2.1 training

The embed_dim_encoder lookup only handles vit_large, vit_giant_xformers,
and vit_gigantic_xformers. Add vit_base (768) to match the released ViT-B
config and checkpoint (vjepa2_1_vitb_dist_vitG_384.pt).
---
 app/vjepa_2_1/train.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/app/vjepa_2_1/train.py b/app/vjepa_2_1/train.py
index f8fe5949..abf0e209 100644
--- a/app/vjepa_2_1/train.py
+++ b/app/vjepa_2_1/train.py
@@ -114,7 +114,9 @@ def main(args, resume_preempt=False):
     normalize_predictor = cfgs_model.get("normalize_predictor", False)
     modality_embedding = cfgs_model.get("modality_embedding", False)
     levels_predictor = cfgs_model.get("levels_predictor", 4)
-    if model_name == "vit_large":
+    if model_name == "vit_base":
+        embed_dim_encoder = 768
+    elif model_name == "vit_large":
         embed_dim_encoder = 1024
     elif model_name == "vit_giant_xformers":
         embed_dim_encoder = 1408