diff --git a/evals/action_anticipation_frozen/eval.py b/evals/action_anticipation_frozen/eval.py
index 9a15ee04..b81ec065 100644
--- a/evals/action_anticipation_frozen/eval.py
+++ b/evals/action_anticipation_frozen/eval.py
@@ -132,11 +132,13 @@ def main(args_eval, resume_preempt=False):
     except Exception:
         pass
 
-    if not torch.cuda.is_available():
-        device = torch.device("cpu")
-    else:
+    if torch.cuda.is_available():
         device = torch.device("cuda:0")
         torch.cuda.set_device(device)
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+    else:
+        device = torch.device("cpu")
 
     world_size, rank = init_distributed()
     logger.info(f"Initialized (rank/world-size) {rank}/{world_size}")
diff --git a/evals/image_classification_frozen/eval.py b/evals/image_classification_frozen/eval.py
index df0bf958..a0faf947 100644
--- a/evals/image_classification_frozen/eval.py
+++ b/evals/image_classification_frozen/eval.py
@@ -108,11 +108,13 @@ def main(args_eval, resume_preempt=False):
     except Exception:
         pass
 
-    if not torch.cuda.is_available():
-        device = torch.device("cpu")
-    else:
+    if torch.cuda.is_available():
         device = torch.device("cuda:0")
         torch.cuda.set_device(device)
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+    else:
+        device = torch.device("cpu")
 
     world_size, rank = init_distributed()
     logger.info(f"Initialized (rank/world-size) {rank}/{world_size}")
diff --git a/evals/main.py b/evals/main.py
index 3df8d4a9..9c6a0003 100644
--- a/evals/main.py
+++ b/evals/main.py
@@ -104,7 +104,16 @@ def process_main(args, rank, fname, world_size, devices):
             )
         # Single-GPU debugging
         else:
-            process_main(args=args, rank=0, fname=args.fname, world_size=1, devices=["cuda:0"])
+            # NOTE(review): no `import torch` for this file is visible in this patch,
+            # so import it locally; otherwise the MPS probe below may raise NameError.
+            import torch
+
+            # Prefer Apple's MPS backend when present, else keep the previous "cuda:0" default.
+            if torch.backends.mps.is_available():
+                device = "mps"
+            else:
+                device = "cuda:0"
+            process_main(args=args, rank=0, fname=args.fname, world_size=1, devices=[device])
     else:
         num_gpus = len(args.devices)
         mp.set_start_method("spawn")
diff --git a/evals/video_classification_frozen/eval.py b/evals/video_classification_frozen/eval.py
index 87151254..5e1edc5c 100644
--- a/evals/video_classification_frozen/eval.py
+++ b/evals/video_classification_frozen/eval.py
@@ -113,11 +113,13 @@ def main(args_eval, resume_preempt=False):
     except Exception:
         pass
 
-    if not torch.cuda.is_available():
-        device = torch.device("cpu")
-    else:
+    if torch.cuda.is_available():
         device = torch.device("cuda:0")
         torch.cuda.set_device(device)
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+    else:
+        device = torch.device("cpu")
 
     world_size, rank = init_distributed()
     logger.info(f"Initialized (rank/world-size) {rank}/{world_size}")
diff --git a/notebooks/vjepa2_demo.ipynb b/notebooks/vjepa2_demo.ipynb
index 2a816bc5..719cc5ba 100644
--- a/notebooks/vjepa2_demo.ipynb
+++ b/notebooks/vjepa2_demo.ipynb
@@ -1,291 +1,312 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "# V-JEPA 2 Demo Notebook\n",
-        "\n",
-        "This tutorial provides an example of how to load the V-JEPA 2 model in vanilla PyTorch and HuggingFace, extract a video embedding, and then predict an action class. For more details about the paper and model weights, please see https://github.com/facebookresearch/vjepa2."
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "First, let's import the necessary libraries and load the necessary functions for this tutorial."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "import json\n",
-        "import os\n",
-        "import subprocess\n",
-        "\n",
-        "import numpy as np\n",
-        "import torch\n",
-        "import torch.nn.functional as F\n",
-        "from decord import VideoReader\n",
-        "from transformers import AutoVideoProcessor, AutoModel\n",
-        "\n",
-        "import src.datasets.utils.video.transforms as video_transforms\n",
-        "import src.datasets.utils.video.volume_transforms as volume_transforms\n",
-        "from src.models.attentive_pooler import AttentiveClassifier\n",
-        "from src.models.vision_transformer import vit_giant_xformers_rope\n",
-        "\n",
-        "IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)\n",
-        "IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)\n",
-        "\n",
-        "def load_pretrained_vjepa_pt_weights(model, pretrained_weights):\n",
-        "    # Load weights of the VJEPA2 encoder\n",
-        "    # The PyTorch state_dict is already preprocessed to have the right key names\n",
-        "    pretrained_dict = torch.load(pretrained_weights, weights_only=True, map_location=\"cpu\")[\"encoder\"]\n",
-        "    pretrained_dict = {k.replace(\"module.\", \"\"): v for k, v in pretrained_dict.items()}\n",
-        "    pretrained_dict = {k.replace(\"backbone.\", \"\"): v for k, v in pretrained_dict.items()}\n",
-        "    msg = model.load_state_dict(pretrained_dict, strict=False)\n",
-        "    print(\"Pretrained weights found at {} and loaded with msg: {}\".format(pretrained_weights, msg))\n",
-        "\n",
-        "\n",
-        "def load_pretrained_vjepa_classifier_weights(model, pretrained_weights):\n",
-        "    # Load weights of the VJEPA2 classifier\n",
-        "    # The PyTorch state_dict is already preprocessed to have the right key names\n",
-        "    pretrained_dict = torch.load(pretrained_weights, weights_only=True, map_location=\"cpu\")[\"classifiers\"][0]\n",
-        "    pretrained_dict = {k.replace(\"module.\", \"\"): v for k, v in pretrained_dict.items()}\n",
-        "    msg = model.load_state_dict(pretrained_dict, strict=False)\n",
-        "    print(\"Pretrained weights found at {} and loaded with msg: {}\".format(pretrained_weights, msg))\n",
-        "\n",
-        "\n",
-        "def build_pt_video_transform(img_size):\n",
-        "    short_side_size = int(256.0 / 224 * img_size)\n",
-        "    # Eval transform has no random cropping nor flip\n",
-        "    eval_transform = video_transforms.Compose(\n",
-        "        [\n",
-        "            video_transforms.Resize(short_side_size, interpolation=\"bilinear\"),\n",
-        "            video_transforms.CenterCrop(size=(img_size, img_size)),\n",
-        "            volume_transforms.ClipToTensor(),\n",
-        "            video_transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),\n",
-        "        ]\n",
-        "    )\n",
-        "    return eval_transform\n",
-        "\n",
-        "\n",
-        "def get_video():\n",
-        "    vr = VideoReader(\"sample_video.mp4\")\n",
-        "    # choosing some frames here, you can define more complex sampling strategy\n",
-        "    frame_idx = np.arange(0, 128, 2)\n",
-        "    video = vr.get_batch(frame_idx).asnumpy()\n",
-        "    return video\n",
-        "\n",
-        "\n",
-        "def forward_vjepa_video(model_hf, model_pt, hf_transform, pt_transform):\n",
-        "    # Run a sample inference with VJEPA\n",
-        "    with torch.inference_mode():\n",
-        "        # Read and pre-process the image\n",
-        "        video = get_video()  # T x H x W x C\n",
-        "        video = torch.from_numpy(video).permute(0, 3, 1, 2)  # T x C x H x W\n",
-        "        x_pt = pt_transform(video).cuda().unsqueeze(0)\n",
-        "        x_hf = hf_transform(video, return_tensors=\"pt\")[\"pixel_values_videos\"].to(\"cuda\")\n",
-        "        # Extract the patch-wise features from the last layer\n",
-        "        out_patch_features_pt = model_pt(x_pt)\n",
-        "        out_patch_features_hf = model_hf.get_vision_features(x_hf)\n",
-        "\n",
-        "    return out_patch_features_hf, out_patch_features_pt\n",
-        "\n",
-        "\n",
-        "def get_vjepa_video_classification_results(classifier, out_patch_features_pt):\n",
-        "    SOMETHING_SOMETHING_V2_CLASSES = json.load(open(\"ssv2_classes.json\", \"r\"))\n",
-        "\n",
-        "    with torch.inference_mode():\n",
-        "        out_classifier = classifier(out_patch_features_pt)\n",
-        "\n",
-        "    print(f\"Classifier output shape: {out_classifier.shape}\")\n",
-        "\n",
-        "    print(\"Top 5 predicted class names:\")\n",
-        "    top5_indices = out_classifier.topk(5).indices[0]\n",
-        "    top5_probs = F.softmax(out_classifier.topk(5).values[0]) * 100.0  # convert to percentage\n",
-        "    for idx, prob in zip(top5_indices, top5_probs):\n",
-        "        str_idx = str(idx.item())\n",
-        "        print(f\"{SOMETHING_SOMETHING_V2_CLASSES[str_idx]} ({prob}%)\")\n",
-        "\n",
-        "    return"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Next, let's download a sample video to the local repository. If the video is already downloaded, the code will skip this step. Likewise, let's download a mapping for the action recognition classes used in Something-Something V2, so we can interpret the predicted action class from our model."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "sample_video_path = \"sample_video.mp4\"\n",
-        "# Download the video if not yet downloaded to local path\n",
-        "if not os.path.exists(sample_video_path):\n",
-        "    video_url = \"https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/bowling/-WH-lxmGJVY_000005_000015.mp4\"\n",
-        "    command = [\"wget\", video_url, \"-O\", sample_video_path]\n",
-        "    subprocess.run(command)\n",
-        "    print(\"Downloading video\")\n",
-        "\n",
-        "# Download SSV2 classes if not already present\n",
-        "ssv2_classes_path = \"ssv2_classes.json\"\n",
-        "if not os.path.exists(ssv2_classes_path):\n",
-        "    command = [\n",
-        "        \"wget\",\n",
-        "        \"https://huggingface.co/datasets/huggingface/label-files/resolve/d79675f2d50a7b1ecf98923d42c30526a51818e2/\"\n",
-        "        \"something-something-v2-id2label.json\",\n",
-        "        \"-O\",\n",
-        "        \"ssv2_classes.json\",\n",
-        "    ]\n",
-        "    subprocess.run(command)\n",
-        "    print(\"Downloading SSV2 classes\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Now, let's load the models in both vanilla Pytorch as well as through the HuggingFace API. Note that HuggingFace API will automatically load the weights through `from_pretrained()`, so there is no additional download required for HuggingFace.\n",
-        "\n",
-        "To download the PyTorch model weights, use wget and specify your preferred target path. See the README for the model weight URLs.\n",
-        "E.g. \n",
-        "```\n",
-        "wget https://dl.fbaipublicfiles.com/vjepa2/vitg-384.pt -P YOUR_DIR\n",
-        "```\n",
-        "Then update `pt_model_path` with `YOUR_DIR/vitg-384.pt`. Also note that you have the option to use `torch.hub.load`."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# HuggingFace model repo name\n",
-        "hf_model_name = (\n",
-        "    \"facebook/vjepa2-vitg-fpc64-384\"  # Replace with your favored model, e.g. facebook/vjepa2-vitg-fpc64-384\n",
-        ")\n",
-        "# Path to local PyTorch weights\n",
-        "pt_model_path = \"YOUR_MODEL_PATH\"\n",
-        "\n",
-        "# Initialize the HuggingFace model, load pretrained weights\n",
-        "model_hf = AutoModel.from_pretrained(hf_model_name)\n",
-        "model_hf.cuda().eval()\n",
-        "\n",
-        "# Build HuggingFace preprocessing transform\n",
-        "hf_transform = AutoVideoProcessor.from_pretrained(hf_model_name)\n",
-        "img_size = hf_transform.crop_size[\"height\"]  # E.g. 384, 256, etc.\n",
-        "\n",
-        "# Initialize the PyTorch model, load pretrained weights\n",
-        "model_pt = vit_giant_xformers_rope(img_size=(img_size, img_size), num_frames=64)\n",
-        "model_pt.cuda().eval()\n",
-        "load_pretrained_vjepa_pt_weights(model_pt, pt_model_path)\n",
-        "\n",
-        "### Can also use torch.hub to load the model\n",
-        "# model_pt, _ = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant_384')\n",
-        "# model_pt.cuda().eval()\n",
-        "\n",
-        "# Build PyTorch preprocessing transform\n",
-        "pt_video_transform = build_pt_video_transform(img_size=img_size)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Now we can run the encoder on the video to get the patch-wise features from the last layer of the encoder. To verify that the HuggingFace and PyTorch models are equivalent, we will compare the values of the features."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Inference on video to get the patch-wise features\n",
-        "out_patch_features_hf, out_patch_features_pt = forward_vjepa_video(\n",
-        "    model_hf, model_pt, hf_transform, pt_video_transform\n",
-        ")\n",
-        "\n",
-        "print(\n",
-        "    f\"\"\"\n",
-        "    Inference results on video:\n",
-        "    HuggingFace output shape: {out_patch_features_hf.shape}\n",
-        "    PyTorch output shape:     {out_patch_features_pt.shape}\n",
-        "    Absolute difference sum:  {torch.abs(out_patch_features_pt - out_patch_features_hf).sum():.6f}\n",
-        "    Close: {torch.allclose(out_patch_features_pt, out_patch_features_hf, atol=1e-3, rtol=1e-3)}\n",
-        "    \"\"\"\n",
-        ")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Great! Now we know that the features from both models are equivalent. Now let's run a pretrained attentive probe classifier on top of the extracted features, to predict an action class for the video. Let's use the Something-Something V2 probe. Note that the repository also includes attentive probe weights for other evaluations such as EPIC-KITCHENS-100 and Diving48.\n",
-        "\n",
-        "To download the attentive probe weights, use wget and specify your preferred target path. E.g. `wget https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitg-384-64x2x3.pt -P YOUR_DIR`\n",
-        "\n",
-        "Then update `classifier_model_path` with `YOUR_DIR/ssv2-vitg-384-64x2x3.pt`."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Initialize the classifier\n",
-        "classifier_model_path = \"YOUR_ATTENTIVE_PROBE_PATH\"\n",
-        "classifier = (\n",
-        "    AttentiveClassifier(embed_dim=model_pt.embed_dim, num_heads=16, depth=4, num_classes=174).cuda().eval()\n",
-        ")\n",
-        "load_pretrained_vjepa_classifier_weights(classifier, classifier_model_path)\n",
-        "\n",
-        "# Get classification results\n",
-        "get_vjepa_video_classification_results(classifier, out_patch_features_pt)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "The video features a man putting a bowling ball into a tube, so the predicted action of \"Putting [something] into [something]\" makes sense!\n",
-        "\n",
-        "This concludes the tutorial. Please see the README and paper for full details on the capabilities of V-JEPA 2 :)"
-      ]
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# V-JEPA 2 Demo Notebook\n",
+    "\n",
+    "This tutorial provides an example of how to load the V-JEPA 2 model in vanilla PyTorch and HuggingFace, extract a video embedding, and then predict an action class. For more details about the paper and model weights, please see https://github.com/facebookresearch/vjepa2."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, let's import the necessary libraries and load the necessary functions for this tutorial."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "jupyter": {
+     "is_executing": true
+    }
+   },
+   "source": [
+    "import json\n",
+    "import os\n",
+    "import subprocess\n",
+    "\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "from decord import VideoReader\n",
+    "from transformers import AutoVideoProcessor, AutoModel\n",
+    "\n",
+    "import src.datasets.utils.video.transforms as video_transforms\n",
+    "import src.datasets.utils.video.volume_transforms as volume_transforms\n",
+    "from src.models.attentive_pooler import AttentiveClassifier\n",
+    "from src.models.vision_transformer import vit_giant_xformers_rope\n",
+    "\n",
+    "IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)\n",
+    "IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)\n",
+    "\n",
+    "def load_pretrained_vjepa_pt_weights(model, pretrained_weights):\n",
+    "    # Load weights of the VJEPA2 encoder\n",
+    "    # The PyTorch state_dict is already preprocessed to have the right key names\n",
+    "    pretrained_dict = torch.load(pretrained_weights, weights_only=True, map_location=\"cpu\")[\"encoder\"]\n",
+    "    pretrained_dict = {k.replace(\"module.\", \"\"): v for k, v in pretrained_dict.items()}\n",
+    "    pretrained_dict = {k.replace(\"backbone.\", \"\"): v for k, v in pretrained_dict.items()}\n",
+    "    msg = model.load_state_dict(pretrained_dict, strict=False)\n",
+    "    print(\"Pretrained weights found at {} and loaded with msg: {}\".format(pretrained_weights, msg))\n",
+    "\n",
+    "\n",
+    "def load_pretrained_vjepa_classifier_weights(model, pretrained_weights):\n",
+    "    # Load weights of the VJEPA2 classifier\n",
+    "    # The PyTorch state_dict is already preprocessed to have the right key names\n",
+    "    pretrained_dict = torch.load(pretrained_weights, weights_only=True, map_location=\"cpu\")[\"classifiers\"][0]\n",
+    "    pretrained_dict = {k.replace(\"module.\", \"\"): v for k, v in pretrained_dict.items()}\n",
+    "    msg = model.load_state_dict(pretrained_dict, strict=False)\n",
+    "    print(\"Pretrained weights found at {} and loaded with msg: {}\".format(pretrained_weights, msg))\n",
+    "\n",
+    "\n",
+    "def build_pt_video_transform(img_size):\n",
+    "    short_side_size = int(256.0 / 224 * img_size)\n",
+    "    # Eval transform has no random cropping nor flip\n",
+    "    eval_transform = video_transforms.Compose(\n",
+    "        [\n",
+    "            video_transforms.Resize(short_side_size, interpolation=\"bilinear\"),\n",
+    "            video_transforms.CenterCrop(size=(img_size, img_size)),\n",
+    "            volume_transforms.ClipToTensor(),\n",
+    "            video_transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),\n",
+    "        ]\n",
+    "    )\n",
+    "    return eval_transform\n",
+    "\n",
+    "\n",
+    "def get_video():\n",
+    "    vr = VideoReader(\"sample_video.mp4\")\n",
+    "    # choosing some frames here, you can define more complex sampling strategy\n",
+    "    frame_idx = np.arange(0, 128, 2)\n",
+    "    video = vr.get_batch(frame_idx).asnumpy()\n",
+    "    return video\n",
+    "\n",
+    "\n",
+    "def forward_vjepa_video(model_hf, model_pt, hf_transform, pt_transform):\n",
+    "    # Run a sample inference with VJEPA\n",
+    "    with torch.inference_mode():\n",
+    "        # Read and pre-process the image\n",
+    "        video = get_video()  # T x H x W x C\n",
+    "        video = torch.from_numpy(video).permute(0, 3, 1, 2)  # T x C x H x W\n",
+    "        x_pt = pt_transform(video).cuda().unsqueeze(0)\n",
+    "        x_hf = hf_transform(video, return_tensors=\"pt\")[\"pixel_values_videos\"].to(\"cuda\")\n",
+    "        # Extract the patch-wise features from the last layer\n",
+    "        out_patch_features_pt = model_pt(x_pt)\n",
+    "        out_patch_features_hf = model_hf.get_vision_features(x_hf)\n",
+    "\n",
+    "    return out_patch_features_hf, out_patch_features_pt\n",
+    "\n",
+    "\n",
+    "def get_vjepa_video_classification_results(classifier, out_patch_features_pt):\n",
+    "    SOMETHING_SOMETHING_V2_CLASSES = json.load(open(\"ssv2_classes.json\", \"r\"))\n",
+    "\n",
+    "    with torch.inference_mode():\n",
+    "        out_classifier = classifier(out_patch_features_pt)\n",
+    "\n",
+    "    print(f\"Classifier output shape: {out_classifier.shape}\")\n",
+    "\n",
+    "    print(\"Top 5 predicted class names:\")\n",
+    "    top5_indices = out_classifier.topk(5).indices[0]\n",
+    "    top5_probs = F.softmax(out_classifier.topk(5).values[0]) * 100.0  # convert to percentage\n",
+    "    for idx, prob in zip(top5_indices, top5_probs):\n",
+    "        str_idx = str(idx.item())\n",
+    "        print(f\"{SOMETHING_SOMETHING_V2_CLASSES[str_idx]} ({prob}%)\")\n",
+    "\n",
+    "    return"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, let's download a sample video to the local repository. If the video is already downloaded, the code will skip this step. Likewise, let's download a mapping for the action recognition classes used in Something-Something V2, so we can interpret the predicted action class from our model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-08-21T12:20:03.166783Z",
+     "start_time": "2025-08-21T12:20:03.031248Z"
     }
-  ],
-  "metadata": {
-    "fileHeader": "",
-    "fileUid": "f0b70ba6-1c84-47e1-81bd-b7642f9acf50",
-    "isAdHoc": false,
-    "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.12.9"
+   },
+   "source": [
+    "sample_video_path = \"sample_video.mp4\"\n",
+    "# Download the video if not yet downloaded to local path\n",
+    "if not os.path.exists(sample_video_path):\n",
+    "    video_url = \"https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/bowling/-WH-lxmGJVY_000005_000015.mp4\"\n",
+    "    command = [\"wget\", video_url, \"-O\", sample_video_path]\n",
+    "    subprocess.run(command)\n",
+    "    print(\"Downloading video\")\n",
+    "\n",
+    "# Download SSV2 classes if not already present\n",
+    "ssv2_classes_path = \"ssv2_classes.json\"\n",
+    "if not os.path.exists(ssv2_classes_path):\n",
+    "    command = [\n",
+    "        \"wget\",\n",
+    "        \"https://huggingface.co/datasets/huggingface/label-files/resolve/d79675f2d50a7b1ecf98923d42c30526a51818e2/\"\n",
+    "        \"something-something-v2-id2label.json\",\n",
+    "        \"-O\",\n",
+    "        \"ssv2_classes.json\",\n",
+    "    ]\n",
+    "    subprocess.run(command)\n",
+    "    print(\"Downloading SSV2 classes\")"
+   ],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'os' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[31m---------------------------------------------------------------------------\u001B[39m",
+      "\u001B[31mNameError\u001B[39m                                 Traceback (most recent call last)",
+      "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[1]\u001B[39m\u001B[32m, line 3\u001B[39m\n\u001B[32m      1\u001B[39m sample_video_path = \u001B[33m\"\u001B[39m\u001B[33msample_video.mp4\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m      2\u001B[39m \u001B[38;5;66;03m# Download the video if not yet downloaded to local path\u001B[39;00m\n\u001B[32m----> \u001B[39m\u001B[32m3\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[43mos\u001B[49m.path.exists(sample_video_path):\n\u001B[32m      4\u001B[39m     video_url = \u001B[33m\"\u001B[39m\u001B[33mhttps://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/bowling/-WH-lxmGJVY_000005_000015.mp4\u001B[39m\u001B[33m\"\u001B[39m\n\u001B[32m      5\u001B[39m     command = [\u001B[33m\"\u001B[39m\u001B[33mwget\u001B[39m\u001B[33m\"\u001B[39m, video_url, \u001B[33m\"\u001B[39m\u001B[33m-O\u001B[39m\u001B[33m\"\u001B[39m, sample_video_path]\n",
+      "\u001B[31mNameError\u001B[39m: name 'os' is not defined"
+     ]
     }
+   ],
+   "execution_count": 1
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's load the models in both vanilla PyTorch as well as through the HuggingFace API. Note that HuggingFace API will automatically load the weights through `from_pretrained()`, so there is no additional download required for HuggingFace.\n",
+    "\n",
+    "To download the PyTorch model weights, use wget and specify your preferred target path. See the README for the model weight URLs.\n",
+    "E.g. \n",
+    "```\n",
+    "wget https://dl.fbaipublicfiles.com/vjepa2/vitg-384.pt -P YOUR_DIR\n",
+    "```\n",
+    "Then update `pt_model_path` with `YOUR_DIR/vitg-384.pt`. Also note that you have the option to use `torch.hub.load`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# HuggingFace model repo name\n",
+    "hf_model_name = (\n",
+    "    \"facebook/vjepa2-vitg-fpc64-384\"  # Replace with your favored model, e.g. facebook/vjepa2-vitg-fpc64-384\n",
+    ")\n",
+    "# Path to local PyTorch weights\n",
+    "pt_model_path = \"YOUR_MODEL_PATH\"\n",
+    "\n",
+    "# Initialize the HuggingFace model, load pretrained weights\n",
+    "model_hf = AutoModel.from_pretrained(hf_model_name)\n",
+    "model_hf.cuda().eval()\n",
+    "\n",
+    "# Build HuggingFace preprocessing transform\n",
+    "hf_transform = AutoVideoProcessor.from_pretrained(hf_model_name)\n",
+    "img_size = hf_transform.crop_size[\"height\"]  # E.g. 384, 256, etc.\n",
+    "\n",
+    "# Initialize the PyTorch model, load pretrained weights\n",
+    "model_pt = vit_giant_xformers_rope(img_size=(img_size, img_size), num_frames=64)\n",
+    "model_pt.cuda().eval()\n",
+    "load_pretrained_vjepa_pt_weights(model_pt, pt_model_path)\n",
+    "\n",
+    "### Can also use torch.hub to load the model\n",
+    "# model_pt, _ = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_vit_giant_384')\n",
+    "# model_pt.cuda().eval()\n",
+    "\n",
+    "# Build PyTorch preprocessing transform\n",
+    "pt_video_transform = build_pt_video_transform(img_size=img_size)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can run the encoder on the video to get the patch-wise features from the last layer of the encoder. To verify that the HuggingFace and PyTorch models are equivalent, we will compare the values of the features."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inference on video to get the patch-wise features\n",
+    "out_patch_features_hf, out_patch_features_pt = forward_vjepa_video(\n",
+    "    model_hf, model_pt, hf_transform, pt_video_transform\n",
+    ")\n",
+    "\n",
+    "print(\n",
+    "    f\"\"\"\n",
+    "    Inference results on video:\n",
+    "    HuggingFace output shape: {out_patch_features_hf.shape}\n",
+    "    PyTorch output shape:     {out_patch_features_pt.shape}\n",
+    "    Absolute difference sum:  {torch.abs(out_patch_features_pt - out_patch_features_hf).sum():.6f}\n",
+    "    Close: {torch.allclose(out_patch_features_pt, out_patch_features_hf, atol=1e-3, rtol=1e-3)}\n",
+    "    \"\"\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Great! Now we know that the features from both models are equivalent. Now let's run a pretrained attentive probe classifier on top of the extracted features, to predict an action class for the video. Let's use the Something-Something V2 probe. Note that the repository also includes attentive probe weights for other evaluations such as EPIC-KITCHENS-100 and Diving48.\n",
+    "\n",
+    "To download the attentive probe weights, use wget and specify your preferred target path. E.g. `wget https://dl.fbaipublicfiles.com/vjepa2/evals/ssv2-vitg-384-64x2x3.pt -P YOUR_DIR`\n",
+    "\n",
+    "Then update `classifier_model_path` with `YOUR_DIR/ssv2-vitg-384-64x2x3.pt`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the classifier\n",
+    "classifier_model_path = \"YOUR_ATTENTIVE_PROBE_PATH\"\n",
+    "classifier = (\n",
+    "    AttentiveClassifier(embed_dim=model_pt.embed_dim, num_heads=16, depth=4, num_classes=174).cuda().eval()\n",
+    ")\n",
+    "load_pretrained_vjepa_classifier_weights(classifier, classifier_model_path)\n",
+    "\n",
+    "# Get classification results\n",
+    "get_vjepa_video_classification_results(classifier, out_patch_features_pt)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The video features a man putting a bowling ball into a tube, so the predicted action of \"Putting [something] into [something]\" makes sense!\n",
+    "\n",
+    "This concludes the tutorial. Please see the README and paper for full details on the capabilities of V-JEPA 2 :)"
+   ]
+  }
+ ],
+ "metadata": {
+  "fileHeader": "",
+  "fileUid": "f0b70ba6-1c84-47e1-81bd-b7642f9acf50",
+  "isAdHoc": false,
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
   },
-  "nbformat": 4,
-  "nbformat_minor": 2
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
 }
diff --git a/notebooks/vjepa2_demo.py b/notebooks/vjepa2_demo.py
index 625c7112..a642d1b0 100644
--- a/notebooks/vjepa2_demo.py
+++ b/notebooks/vjepa2_demo.py
@@ -63,14 +63,14 @@ def get_video():
     return video
 
 
-def forward_vjepa_video(model_hf, model_pt, hf_transform, pt_transform):
+def forward_vjepa_video(model_hf, model_pt, hf_transform, pt_transform, device="cuda"):
     # Run a sample inference with VJEPA
     with torch.inference_mode():
         # Read and pre-process the image
         video = get_video()  # T x H x W x C
         video = torch.from_numpy(video).permute(0, 3, 1, 2)  # T x C x H x W
-        x_pt = pt_transform(video).cuda().unsqueeze(0)
-        x_hf = hf_transform(video, return_tensors="pt")["pixel_values_videos"].to("cuda")
+        x_pt = pt_transform(video).to(device).unsqueeze(0)
+        x_hf = hf_transform(video, return_tensors="pt")["pixel_values_videos"].to(device)
         # Extract the patch-wise features from the last layer
         out_patch_features_pt = model_pt(x_pt)
         out_patch_features_hf = model_hf.get_vision_features(x_hf)
@@ -97,6 +97,14 @@ def get_vjepa_video_classification_results(classifier, out_patch_features_pt):
 
 
 def run_sample_inference():
+    if torch.cuda.is_available():
+        device = "cuda"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    else:
+        device = "cpu"
+    print(f"Using device: {device}")
+
     # HuggingFace model repo name
     hf_model_name = (
         "facebook/vjepa2-vitg-fpc64-384"  # Replace with your favored model, e.g. facebook/vjepa2-vitg-fpc64-384
@@ -114,7 +122,7 @@ def run_sample_inference():
 
     # Initialize the HuggingFace model, load pretrained weights
     model_hf = AutoModel.from_pretrained(hf_model_name)
-    model_hf.cuda().eval()
+    model_hf.to(device).eval()
 
     # Build HuggingFace preprocessing transform
     hf_transform = AutoVideoProcessor.from_pretrained(hf_model_name)
@@ -122,7 +130,7 @@ def run_sample_inference():
 
     # Initialize the PyTorch model, load pretrained weights
     model_pt = vit_giant_xformers_rope(img_size=(img_size, img_size), num_frames=64)
-    model_pt.cuda().eval()
+    model_pt.to(device).eval()
     load_pretrained_vjepa_pt_weights(model_pt, pt_model_path)
 
     # Build PyTorch preprocessing transform
@@ -130,7 +138,7 @@ def run_sample_inference():
 
     # Inference on video
     out_patch_features_hf, out_patch_features_pt = forward_vjepa_video(
-        model_hf, model_pt, hf_transform, pt_video_transform
+        model_hf, model_pt, hf_transform, pt_video_transform, device
     )
 
     print(
@@ -146,7 +154,7 @@ def run_sample_inference():
     # Initialize the classifier
     classifier_model_path = "YOUR_ATTENTIVE_PROBE_PATH"
     classifier = (
-        AttentiveClassifier(embed_dim=model_pt.embed_dim, num_heads=16, depth=4, num_classes=174).cuda().eval()
+        AttentiveClassifier(embed_dim=model_pt.embed_dim, num_heads=16, depth=4, num_classes=174).to(device).eval()
     )
     load_pretrained_vjepa_classifier_weights(classifier, classifier_model_path)