diff --git a/evals/action_anticipation_frozen/eval.py b/evals/action_anticipation_frozen/eval.py index 9a15ee04..b81ec065 100644 --- a/evals/action_anticipation_frozen/eval.py +++ b/evals/action_anticipation_frozen/eval.py @@ -132,11 +132,13 @@ def main(args_eval, resume_preempt=False): except Exception: pass - if not torch.cuda.is_available(): - device = torch.device("cpu") - else: + if torch.cuda.is_available(): device = torch.device("cuda:0") torch.cuda.set_device(device) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") world_size, rank = init_distributed() logger.info(f"Initialized (rank/world-size) {rank}/{world_size}") diff --git a/evals/image_classification_frozen/eval.py b/evals/image_classification_frozen/eval.py index df0bf958..a0faf947 100644 --- a/evals/image_classification_frozen/eval.py +++ b/evals/image_classification_frozen/eval.py @@ -108,11 +108,13 @@ def main(args_eval, resume_preempt=False): except Exception: pass - if not torch.cuda.is_available(): - device = torch.device("cpu") - else: + if torch.cuda.is_available(): device = torch.device("cuda:0") torch.cuda.set_device(device) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") world_size, rank = init_distributed() logger.info(f"Initialized (rank/world-size) {rank}/{world_size}") diff --git a/evals/main.py b/evals/main.py index 3df8d4a9..9c6a0003 100644 --- a/evals/main.py +++ b/evals/main.py @@ -104,7 +104,11 @@ def process_main(args, rank, fname, world_size, devices): ) # Single-GPU debugging else: - process_main(args=args, rank=0, fname=args.fname, world_size=1, devices=["cuda:0"]) + if torch.backends.mps.is_available(): + device = "mps" + else: + device = "cuda:0" + process_main(args=args, rank=0, fname=args.fname, world_size=1, devices=[device]) else: num_gpus = len(args.devices) mp.set_start_method("spawn") diff --git a/evals/video_classification_frozen/eval.py b/evals/video_classification_frozen/eval.py index 87151254..5e1edc5c 100644 --- a/evals/video_classification_frozen/eval.py +++ b/evals/video_classification_frozen/eval.py @@ -113,11 +113,13 @@ def main(args_eval, resume_preempt=False): except Exception: pass - if not torch.cuda.is_available(): - device = torch.device("cpu") - else: + if torch.cuda.is_available(): device = torch.device("cuda:0") torch.cuda.set_device(device) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") world_size, rank = init_distributed() logger.info(f"Initialized (rank/world-size) {rank}/{world_size}") diff --git a/notebooks/vjepa2_demo.ipynb b/notebooks/vjepa2_demo.ipynb index 2a816bc5..719cc5ba 100644 --- a/notebooks/vjepa2_demo.ipynb +++ b/notebooks/vjepa2_demo.ipynb @@ -1,291 +1,312 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# V-JEPA 2 Demo Notebook\n", - "\n", - "This tutorial provides an example of how to load the V-JEPA 2 model in vanilla PyTorch and HuggingFace, extract a video embedding, and then predict an action class. For more details about the paper and model weights, please see https://github.com/facebookresearch/vjepa2." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's import the necessary libraries and load the necessary functions for this tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import os\n", - "import subprocess\n", - "\n", - "import numpy as np\n", - "import torch\n", - "import torch.nn.functional as F\n", - "from decord import VideoReader\n", - "from transformers import AutoVideoProcessor, AutoModel\n", - "\n", - "import src.datasets.utils.video.transforms as video_transforms\n", - "import src.datasets.utils.video.volume_transforms as volume_transforms\n", - "from src.models.attentive_pooler import AttentiveClassifier\n", - "from src.models.vision_transformer import vit_giant_xformers_rope\n", - "\n", - "IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)\n", - "IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)\n", - "\n", - "def load_pretrained_vjepa_pt_weights(model, pretrained_weights):\n", - " # Load weights of the VJEPA2 encoder\n", - " # The PyTorch state_dict is already preprocessed to have the right key names\n", - " pretrained_dict = torch.load(pretrained_weights, weights_only=True, map_location=\"cpu\")[\"encoder\"]\n", - " pretrained_dict = {k.replace(\"module.\", \"\"): v for k, v in pretrained_dict.items()}\n", - " pretrained_dict = {k.replace(\"backbone.\", \"\"): v for k, v in pretrained_dict.items()}\n", - " msg = model.load_state_dict(pretrained_dict, strict=False)\n", - " print(\"Pretrained weights found at {} and loaded with msg: {}\".format(pretrained_weights, msg))\n", - "\n", - "\n", - "def load_pretrained_vjepa_classifier_weights(model, pretrained_weights):\n", - " # Load weights of the VJEPA2 classifier\n", - " # The PyTorch state_dict is already preprocessed to have the right key names\n", - " pretrained_dict = torch.load(pretrained_weights, weights_only=True, map_location=\"cpu\")[\"classifiers\"][0]\n", - " pretrained_dict = {k.replace(\"module.\", \"\"): v for k, v in pretrained_dict.items()}\n", - " msg = model.load_state_dict(pretrained_dict, strict=False)\n", - " print(\"Pretrained weights found at {} and loaded with msg: {}\".format(pretrained_weights, msg))\n", - "\n", - "\n", - "def build_pt_video_transform(img_size):\n", - " short_side_size = int(256.0 / 224 * img_size)\n", - " # Eval transform has no random cropping nor flip\n", - " eval_transform = video_transforms.Compose(\n", - " [\n", - " video_transforms.Resize(short_side_size, interpolation=\"bilinear\"),\n", - " video_transforms.CenterCrop(size=(img_size, img_size)),\n", - " volume_transforms.ClipToTensor(),\n", - " video_transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),\n", - " ]\n", - " )\n", - " return eval_transform\n", - "\n", - "\n", - "def get_video():\n", - " vr = VideoReader(\"sample_video.mp4\")\n", - " # choosing some frames here, you can define more complex sampling strategy\n", - " frame_idx = np.arange(0, 128, 2)\n", - " video = vr.get_batch(frame_idx).asnumpy()\n", - " return video\n", - "\n", - "\n", - "def forward_vjepa_video(model_hf, model_pt, hf_transform, pt_transform):\n", - " # Run a sample inference with VJEPA\n", - " with torch.inference_mode():\n", - " # Read and pre-process the image\n", - " video = get_video() # T x H x W x C\n", - " video = torch.from_numpy(video).permute(0, 3, 1, 2) # T x C x H x W\n", - " x_pt = pt_transform(video).cuda().unsqueeze(0)\n", - " x_hf = hf_transform(video, return_tensors=\"pt\")[\"pixel_values_videos\"].to(\"cuda\")\n", - " # Extract the patch-wise features from the last layer\n", - " out_patch_features_pt = model_pt(x_pt)\n", - " out_patch_features_hf = model_hf.get_vision_features(x_hf)\n", - "\n", - " return out_patch_features_hf, out_patch_features_pt\n", - "\n", - "\n", - "def get_vjepa_video_classification_results(classifier, out_patch_features_pt):\n", - " SOMETHING_SOMETHING_V2_CLASSES = json.load(open(\"ssv2_classes.json\", \"r\"))\n", - "\n", - " with torch.inference_mode():\n", - " out_classifier = classifier(out_patch_features_pt)\n", - "\n", - " print(f\"Classifier output shape: {out_classifier.shape}\")\n", - "\n", - " print(\"Top 5 predicted class names:\")\n", - " top5_indices = out_classifier.topk(5).indices[0]\n", - " top5_probs = F.softmax(out_classifier.topk(5).values[0]) * 100.0 # convert to percentage\n", - " for idx, prob in zip(top5_indices, top5_probs):\n", - " str_idx = str(idx.item())\n", - " print(f\"{SOMETHING_SOMETHING_V2_CLASSES[str_idx]} ({prob}%)\")\n", - "\n", - " return" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, let's download a sample video to the local repository. If the video is already downloaded, the code will skip this step. Likewise, let's download a mapping for the action recognition classes used in Something-Something V2, so we can interpret the predicted action class from our model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sample_video_path = \"sample_video.mp4\"\n", - "# Download the video if not yet downloaded to local path\n", - "if not os.path.exists(sample_video_path):\n", - " video_url = \"https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/bowling/-WH-lxmGJVY_000005_000015.mp4\"\n", - " command = [\"wget\", video_url, \"-O\", sample_video_path]\n", - " subprocess.run(command)\n", - " print(\"Downloading video\")\n", - "\n", - "# Download SSV2 classes if not already present\n", - "ssv2_classes_path = \"ssv2_classes.json\"\n", - "if not os.path.exists(ssv2_classes_path):\n", - " command = [\n", - " \"wget\",\n", - " \"https://huggingface.co/datasets/huggingface/label-files/resolve/d79675f2d50a7b1ecf98923d42c30526a51818e2/\"\n", - " \"something-something-v2-id2label.json\",\n", - " \"-O\",\n", - " \"ssv2_classes.json\",\n", - " ]\n", - " subprocess.run(command)\n", - " print(\"Downloading SSV2 classes\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let's load the models in both vanilla Pytorch as well as through the HuggingFace API. Note that HuggingFace API will automatically load the weights through `from_pretrained()`, so there is no additional download required for HuggingFace.\n", - "\n", - "To download the PyTorch model weights, use wget and specify your preferred target path. See the README for the model weight URLs.\n", - "E.g. \n", - "```\n", - "wget https://dl.fbaipublicfiles.com/vjepa2/vitg-384.pt -P YOUR_DIR\n", - "```\n", - "Then update `pt_model_path` with `YOUR_DIR/vitg-384.pt`. Also note that you have the option to use `torch.hub.load`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# HuggingFace model repo name\n", - "hf_model_name = (\n", - " \"facebook/vjepa2-vitg-fpc64-384\" # Replace with your favored model, e.g. facebook/vjepa2-vitg-fpc64-384\n", - ")\n", - "# Path to local PyTorch weights\n", - "pt_model_path = \"YOUR_MODEL_PATH\"\n", - "\n", - "# Initialize the HuggingFace model, load pretrained weights\n", - "model_hf = AutoModel.from_pretrained(hf_model_name)\n", - "model_hf.cuda().eval()\n", - "\n", - "# Build HuggingFace preprocessing transform\n", - "hf_transform = AutoVideoProcessor.from_pretrained(hf_model_name)\n", - "img_size = hf_transform.crop_size[\"height\"] # E.g. 384, 256, etc.\n", - "\n", - "# Initialize the PyTorch model, load pretrained weights\n", - "model_pt = vit_giant_xformers_rope(img_size=(img_size, img_size), num_frames=64)\n", - "model_pt.cuda().eval()\n", - "load_pretrained_vjepa_pt_weights(model_pt, pt_model_path)\n", - "\n", - "### Can also use torch.hub to load the model\n", - "# model_pt, _ = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant_384')\n", - "# model_pt.cuda().eval()\n", - "\n", - "# Build PyTorch preprocessing transform\n", - "pt_video_transform = build_pt_video_transform(img_size=img_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can run the encoder on the video to get the patch-wise features from the last layer of the encoder. To verify that the HuggingFace and PyTorch models are equivalent, we will compare the values of the features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Inference on video to get the patch-wise features\n", - "out_patch_features_hf, out_patch_features_pt = forward_vjepa_video(\n", - " model_hf, model_pt, hf_transform, pt_video_transform\n", - ")\n", - "\n", - "print(\n", - " f\"\"\"\n", - " Inference results on video:\n", - " HuggingFace output shape: {out_patch_features_hf.shape}\n", - " PyTorch output shape: {out_patch_features_pt.shape}\n", - " Absolute difference sum: {torch.abs(out_patch_features_pt - out_patch_features_hf).sum():.6f}\n", - " Close: {torch.allclose(out_patch_features_pt, out_patch_features_hf, atol=1e-3, rtol=1e-3)}\n", - " \"\"\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Great! Now we know that the features from both models are equivalent. Now let's run a pretrained attentive probe classifier on top of the extracted features, to predict an action class for the video. Let's use the Something-Something V2 probe. Note that the repository also includes attentive probe weights for other evaluations such as EPIC-KITCHENS-100 and Diving48.\n", - "\n", - "To download the attentive probe weights, use wget and specify your preferred target path. E.g. `wget https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitg-384-64x2x3.pt -P YOUR_DIR`\n", - "\n", - "Then update `classifier_model_path` with `YOUR_DIR/ssv2-vitg-384-64x2x3.pt`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the classifier\n", - "classifier_model_path = \"YOUR_ATTENTIVE_PROBE_PATH\"\n", - "classifier = (\n", - " AttentiveClassifier(embed_dim=model_pt.embed_dim, num_heads=16, depth=4, num_classes=174).cuda().eval()\n", - ")\n", - "load_pretrained_vjepa_classifier_weights(classifier, classifier_model_path)\n", - "\n", - "# Get classification results\n", - "get_vjepa_video_classification_results(classifier, out_patch_features_pt)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The video features a man putting a bowling ball into a tube, so the predicted action of \"Putting [something] into [something]\" makes sense!\n", - "\n", - "This concludes the tutorial. Please see the README and paper for full details on the capabilities of V-JEPA 2 :)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# V-JEPA 2 Demo Notebook\n", + "\n", + "This tutorial provides an example of how to load the V-JEPA 2 model in vanilla PyTorch and HuggingFace, extract a video embedding, and then predict an action class. For more details about the paper and model weights, please see https://github.com/facebookresearch/vjepa2." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's import the necessary libraries and load the necessary functions for this tutorial." + ] + }, + { + "cell_type": "code", + "metadata": { + "jupyter": { + "is_executing": true + } + }, + "source": [ + "import json\n", + "import os\n", + "import subprocess\n", + "\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from decord import VideoReader\n", + "from transformers import AutoVideoProcessor, AutoModel\n", + "\n", + "import src.datasets.utils.video.transforms as video_transforms\n", + "import src.datasets.utils.video.volume_transforms as volume_transforms\n", + "from src.models.attentive_pooler import AttentiveClassifier\n", + "from src.models.vision_transformer import vit_giant_xformers_rope\n", + "\n", + "IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)\n", + "IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)\n", + "\n", + "def load_pretrained_vjepa_pt_weights(model, pretrained_weights):\n", + " # Load weights of the VJEPA2 encoder\n", + " # The PyTorch state_dict is already preprocessed to have the right key names\n", + " pretrained_dict = torch.load(pretrained_weights, weights_only=True, map_location=\"cpu\")[\"encoder\"]\n", + " pretrained_dict = {k.replace(\"module.\", \"\"): v for k, v in pretrained_dict.items()}\n", + " pretrained_dict = {k.replace(\"backbone.\", \"\"): v for k, v in pretrained_dict.items()}\n", + " msg = model.load_state_dict(pretrained_dict, strict=False)\n", + " print(\"Pretrained weights found at {} and loaded with msg: {}\".format(pretrained_weights, msg))\n", + "\n", + "\n", + "def load_pretrained_vjepa_classifier_weights(model, pretrained_weights):\n", + " # Load weights of the VJEPA2 classifier\n", + " # The PyTorch state_dict is already preprocessed to have the right key names\n", + " pretrained_dict = torch.load(pretrained_weights, weights_only=True, map_location=\"cpu\")[\"classifiers\"][0]\n", + " pretrained_dict = {k.replace(\"module.\", \"\"): v for k, v in pretrained_dict.items()}\n", + " msg = model.load_state_dict(pretrained_dict, strict=False)\n", + " print(\"Pretrained weights found at {} and loaded with msg: {}\".format(pretrained_weights, msg))\n", + "\n", + "\n", + "def build_pt_video_transform(img_size):\n", + " short_side_size = int(256.0 / 224 * img_size)\n", + " # Eval transform has no random cropping nor flip\n", + " eval_transform = video_transforms.Compose(\n", + " [\n", + " video_transforms.Resize(short_side_size, interpolation=\"bilinear\"),\n", + " video_transforms.CenterCrop(size=(img_size, img_size)),\n", + " volume_transforms.ClipToTensor(),\n", + " video_transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),\n", + " ]\n", + " )\n", + " return eval_transform\n", + "\n", + "\n", + "def get_video():\n", + " vr = VideoReader(\"sample_video.mp4\")\n", + " # choosing some frames here, you can define more complex sampling strategy\n", + " frame_idx = np.arange(0, 128, 2)\n", + " video = vr.get_batch(frame_idx).asnumpy()\n", + " return video\n", + "\n", + "\n", + "def forward_vjepa_video(model_hf, model_pt, hf_transform, pt_transform):\n", + " # Run a sample inference with VJEPA\n", + " with torch.inference_mode():\n", + " # Read and pre-process the image\n", + " video = get_video() # T x H x W x C\n", + " video = torch.from_numpy(video).permute(0, 3, 1, 2) # T x C x H x W\n", + " x_pt = pt_transform(video).cuda().unsqueeze(0)\n", + " x_hf = hf_transform(video, return_tensors=\"pt\")[\"pixel_values_videos\"].to(\"cuda\")\n", + " # Extract the patch-wise features from the last layer\n", + " out_patch_features_pt = model_pt(x_pt)\n", + " out_patch_features_hf = model_hf.get_vision_features(x_hf)\n", + "\n", + " return out_patch_features_hf, out_patch_features_pt\n", + "\n", + "\n", + "def get_vjepa_video_classification_results(classifier, out_patch_features_pt):\n", + " SOMETHING_SOMETHING_V2_CLASSES = json.load(open(\"ssv2_classes.json\", \"r\"))\n", + "\n", + " with torch.inference_mode():\n", + " out_classifier = classifier(out_patch_features_pt)\n", + "\n", + " print(f\"Classifier output shape: {out_classifier.shape}\")\n", + "\n", + " print(\"Top 5 predicted class names:\")\n", + " top5_indices = out_classifier.topk(5).indices[0]\n", + " top5_probs = F.softmax(out_classifier.topk(5).values[0]) * 100.0 # convert to percentage\n", + " for idx, prob in zip(top5_indices, top5_probs):\n", + " str_idx = str(idx.item())\n", + " print(f\"{SOMETHING_SOMETHING_V2_CLASSES[str_idx]} ({prob}%)\")\n", + "\n", + " return" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's download a sample video to the local repository. If the video is already downloaded, the code will skip this step. Likewise, let's download a mapping for the action recognition classes used in Something-Something V2, so we can interpret the predicted action class from our model." + ] + }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-21T12:20:03.166783Z", + "start_time": "2025-08-21T12:20:03.031248Z" } - ], - "metadata": { - "fileHeader": "", - "fileUid": "f0b70ba6-1c84-47e1-81bd-b7642f9acf50", - "isAdHoc": false, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" + }, + "source": [ + "sample_video_path = \"sample_video.mp4\"\n", + "# Download the video if not yet downloaded to local path\n", + "if not os.path.exists(sample_video_path):\n", + " video_url = \"https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/bowling/-WH-lxmGJVY_000005_000015.mp4\"\n", + " command = [\"wget\", video_url, \"-O\", sample_video_path]\n", + " subprocess.run(command)\n", + " print(\"Downloading video\")\n", + "\n", + "# Download SSV2 classes if not already present\n", + "ssv2_classes_path = \"ssv2_classes.json\"\n", + "if not os.path.exists(ssv2_classes_path):\n", + " command = [\n", + " \"wget\",\n", + " \"https://huggingface.co/datasets/huggingface/label-files/resolve/d79675f2d50a7b1ecf98923d42c30526a51818e2/\"\n", + " \"something-something-v2-id2label.json\",\n", + " \"-O\",\n", + " \"ssv2_classes.json\",\n", + " ]\n", + " subprocess.run(command)\n", + " print(\"Downloading SSV2 classes\")" + ], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'os' is not defined", + "output_type": "error", + "traceback": [ + "\u001B[31m---------------------------------------------------------------------------\u001B[39m", + "\u001B[31mNameError\u001B[39m Traceback (most recent call last)", + "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[1]\u001B[39m\u001B[32m, line 3\u001B[39m\n\u001B[32m 1\u001B[39m sample_video_path = \u001B[33m\"\u001B[39m\u001B[33msample_video.mp4\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 2\u001B[39m \u001B[38;5;66;03m# Download the video if not yet downloaded to local path\u001B[39;00m\n\u001B[32m----> \u001B[39m\u001B[32m3\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[43mos\u001B[49m.path.exists(sample_video_path):\n\u001B[32m 4\u001B[39m video_url = \u001B[33m\"\u001B[39m\u001B[33mhttps://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/bowling/-WH-lxmGJVY_000005_000015.mp4\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m 5\u001B[39m command = [\u001B[33m\"\u001B[39m\u001B[33mwget\u001B[39m\u001B[33m\"\u001B[39m, video_url, \u001B[33m\"\u001B[39m\u001B[33m-O\u001B[39m\u001B[33m\"\u001B[39m, sample_video_path]\n", + "\u001B[31mNameError\u001B[39m: name 'os' is not defined" + ] } + ], + "execution_count": 1 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's load the models in both vanilla Pytorch as well as through the HuggingFace API. Note that HuggingFace API will automatically load the weights through `from_pretrained()`, so there is no additional download required for HuggingFace.\n", + "\n", + "To download the PyTorch model weights, use wget and specify your preferred target path. See the README for the model weight URLs.\n", + "E.g. \n", + "```\n", + "wget https://dl.fbaipublicfiles.com/vjepa2/vitg-384.pt -P YOUR_DIR\n", + "```\n", + "Then update `pt_model_path` with `YOUR_DIR/vitg-384.pt`. Also note that you have the option to use `torch.hub.load`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# HuggingFace model repo name\n", + "hf_model_name = (\n", + " \"facebook/vjepa2-vitg-fpc64-384\" # Replace with your favored model, e.g. facebook/vjepa2-vitg-fpc64-384\n", + ")\n", + "# Path to local PyTorch weights\n", + "pt_model_path = \"YOUR_MODEL_PATH\"\n", + "\n", + "# Initialize the HuggingFace model, load pretrained weights\n", + "model_hf = AutoModel.from_pretrained(hf_model_name)\n", + "model_hf.cuda().eval()\n", + "\n", + "# Build HuggingFace preprocessing transform\n", + "hf_transform = AutoVideoProcessor.from_pretrained(hf_model_name)\n", + "img_size = hf_transform.crop_size[\"height\"] # E.g. 384, 256, etc.\n", + "\n", + "# Initialize the PyTorch model, load pretrained weights\n", + "model_pt = vit_giant_xformers_rope(img_size=(img_size, img_size), num_frames=64)\n", + "model_pt.cuda().eval()\n", + "load_pretrained_vjepa_pt_weights(model_pt, pt_model_path)\n", + "\n", + "### Can also use torch.hub to load the model\n", + "# model_pt, _ = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant_384')\n", + "# model_pt.cuda().eval()\n", + "\n", + "# Build PyTorch preprocessing transform\n", + "pt_video_transform = build_pt_video_transform(img_size=img_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can run the encoder on the video to get the patch-wise features from the last layer of the encoder. To verify that the HuggingFace and PyTorch models are equivalent, we will compare the values of the features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference on video to get the patch-wise features\n", + "out_patch_features_hf, out_patch_features_pt = forward_vjepa_video(\n", + " model_hf, model_pt, hf_transform, pt_video_transform\n", + ")\n", + "\n", + "print(\n", + " f\"\"\"\n", + " Inference results on video:\n", + " HuggingFace output shape: {out_patch_features_hf.shape}\n", + " PyTorch output shape: {out_patch_features_pt.shape}\n", + " Absolute difference sum: {torch.abs(out_patch_features_pt - out_patch_features_hf).sum():.6f}\n", + " Close: {torch.allclose(out_patch_features_pt, out_patch_features_hf, atol=1e-3, rtol=1e-3)}\n", + " \"\"\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great! Now we know that the features from both models are equivalent. Now let's run a pretrained attentive probe classifier on top of the extracted features, to predict an action class for the video. Let's use the Something-Something V2 probe. Note that the repository also includes attentive probe weights for other evaluations such as EPIC-KITCHENS-100 and Diving48.\n", + "\n", + "To download the attentive probe weights, use wget and specify your preferred target path. E.g. `wget https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitg-384-64x2x3.pt -P YOUR_DIR`\n", + "\n", + "Then update `classifier_model_path` with `YOUR_DIR/ssv2-vitg-384-64x2x3.pt`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the classifier\n", + "classifier_model_path = \"YOUR_ATTENTIVE_PROBE_PATH\"\n", + "classifier = (\n", + " AttentiveClassifier(embed_dim=model_pt.embed_dim, num_heads=16, depth=4, num_classes=174).cuda().eval()\n", + ")\n", + "load_pretrained_vjepa_classifier_weights(classifier, classifier_model_path)\n", + "\n", + "# Get classification results\n", + "get_vjepa_video_classification_results(classifier, out_patch_features_pt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The video features a man putting a bowling ball into a tube, so the predicted action of \"Putting [something] into [something]\" makes sense!\n", + "\n", + "This concludes the tutorial. Please see the README and paper for full details on the capabilities of V-JEPA 2 :)" + ] + } + ], + "metadata": { + "fileHeader": "", + "fileUid": "f0b70ba6-1c84-47e1-81bd-b7642f9acf50", + "isAdHoc": false, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 2 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/notebooks/vjepa2_demo.py b/notebooks/vjepa2_demo.py index 625c7112..a642d1b0 100644 --- a/notebooks/vjepa2_demo.py +++ b/notebooks/vjepa2_demo.py @@ -63,14 +63,14 @@ def get_video(): return video -def forward_vjepa_video(model_hf, model_pt, hf_transform, pt_transform): +def forward_vjepa_video(model_hf, model_pt, hf_transform, pt_transform, device="cuda"): # Run a sample inference with VJEPA with torch.inference_mode(): # Read and pre-process the image video = get_video() # T x H x W x C video = torch.from_numpy(video).permute(0, 3, 1, 2) # T x C x H x W - x_pt = pt_transform(video).cuda().unsqueeze(0) - x_hf = hf_transform(video, return_tensors="pt")["pixel_values_videos"].to("cuda") + x_pt = pt_transform(video).to(device).unsqueeze(0) + x_hf = hf_transform(video, return_tensors="pt")["pixel_values_videos"].to(device) # Extract the patch-wise features from the last layer out_patch_features_pt = model_pt(x_pt) out_patch_features_hf = model_hf.get_vision_features(x_hf) @@ -97,6 +97,14 @@ def get_vjepa_video_classification_results(classifier, out_patch_features_pt): def run_sample_inference(): + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" + else: + device = "cpu" + print(f"Using device: {device}") + # HuggingFace model repo name hf_model_name = ( "facebook/vjepa2-vitg-fpc64-384" # Replace with your favored model, e.g. facebook/vjepa2-vitg-fpc64-384 @@ -114,7 +122,7 @@ def run_sample_inference(): # Initialize the HuggingFace model, load pretrained weights model_hf = AutoModel.from_pretrained(hf_model_name) - model_hf.cuda().eval() + model_hf.to(device).eval() # Build HuggingFace preprocessing transform hf_transform = AutoVideoProcessor.from_pretrained(hf_model_name) @@ -122,7 +130,7 @@ def run_sample_inference(): # Initialize the PyTorch model, load pretrained weights model_pt = vit_giant_xformers_rope(img_size=(img_size, img_size), num_frames=64) - model_pt.cuda().eval() + model_pt.to(device).eval() load_pretrained_vjepa_pt_weights(model_pt, pt_model_path) # Build PyTorch preprocessing transform @@ -130,7 +138,7 @@ def run_sample_inference(): # Inference on video out_patch_features_hf, out_patch_features_pt = forward_vjepa_video( - model_hf, model_pt, hf_transform, pt_video_transform + model_hf, model_pt, hf_transform, pt_video_transform, device ) print( @@ -146,7 +154,7 @@ def run_sample_inference(): # Initialize the classifier classifier_model_path = "YOUR_ATTENTIVE_PROBE_PATH" classifier = ( - AttentiveClassifier(embed_dim=model_pt.embed_dim, num_heads=16, depth=4, num_classes=174).cuda().eval() + AttentiveClassifier(embed_dim=model_pt.embed_dim, num_heads=16, depth=4, num_classes=174).to(device).eval() ) load_pretrained_vjepa_classifier_weights(classifier, classifier_model_path)