# Text/agent LLM endpoint (port 8000): vLLM serving
# nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 from the NVIDIA vLLM container.
#
# The reasoning-parser plugin is bind-mounted from the host into /workspace.
# Docker creates an empty *directory* at a bind-mount source path that does
# not exist, which would leave vLLM with no plugin file to load, so the
# container is only started when the plugin file is already present.
# Download it into ~/reachy-personal-assistant first if needed:
#wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8/resolve/main/nano_v3_reasoning_parser.py
reasoning_parser_plugin=~/reachy-personal-assistant/nano_v3_reasoning_parser.py
if [[ -f "${reasoning_parser_plugin}" ]]; then
  docker run --rm -it --gpus all --ipc=host \
    -p 8000:8000 \
    -e VLLM_FLASHINFER_MOE_BACKEND=latency \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -v "${reasoning_parser_plugin}:/workspace/nano_v3_reasoning_parser.py" \
    -w /workspace \
    nvcr.io/nvidia/vllm:26.02-py3 \
    vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
    --quantization modelopt_fp4 \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization 0.35 \
    --max-model-len 16384 \
    --max-num-seqs 4 \
    --port 8000 \
    --host 0.0.0.0 \
    --trust-remote-code \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser-plugin nano_v3_reasoning_parser.py \
    --reasoning-parser nano_v3
else
  # Diagnostics go to stderr; nothing is started and no host path is created.
  printf 'error: reasoning parser plugin not found: %s\n' \
    "${reasoning_parser_plugin}" >&2
fi
# Vision-language endpoint (port 8001): vLLM serving
# nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD from the NVIDIA vLLM image.
# Container options and engine options live in separate arrays so each group
# can be read and annotated on its own; the resulting argument order matches
# a single flat `docker run` command line.
vlm_container_opts=(
  --rm -it --gpus all --ipc=host
  -p 8001:8001
  -e VLLM_FLASHINFER_MOE_BACKEND=latency
  -e VLLM_VIDEO_LOADER_BACKEND=opencv
  # Share ~/.cache/huggingface with the container at /root/.cache/huggingface.
  -v ~/.cache/huggingface:/root/.cache/huggingface
)
vlm_engine_opts=(
  --quantization modelopt_fp4
  --kv-cache-dtype fp8
  --gpu-memory-utilization 0.35
  --max-model-len 8192
  --max-num-seqs 4
  --port 8001
  --host 0.0.0.0
  --trust-remote-code
  --media-io-kwargs '{"video": {"fps": 2, "num_frames": 128}}'
  # NOTE(review): "/" permits local media paths anywhere in the container
  # filesystem while the server listens on 0.0.0.0 — confirm this is intended.
  --allowed-local-media-path /
)
docker run "${vlm_container_opts[@]}" \
  nvcr.io/nvidia/vllm:26.02-py3 \
  vllm serve nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD \
  "${vlm_engine_opts[@]}"
# Build a fresh virtual environment for the NAT workflow and serve it on
# port 8002 using src/ces_tutorial/config.yml.
#
# UV_VENV_CLEAR=1 makes `uv venv` replace an existing environment without
# asking. The steps are therefore chained with && so that a failed `cd` can
# never let `uv venv` run (and clear an environment) in whatever directory
# the shell happens to be in, and so a failed sync never reaches `uv run`.
export UV_VENV_CLEAR=1
cd ~/reachy-personal-assistant/nat &&
  uv venv &&
  uv sync &&
  uv run nat serve --config_file src/ces_tutorial/config.yml --port 8002
# Routing LLM endpoint (port 8003): vLLM serving microsoft/Phi-4-mini-instruct
# from the NVIDIA vLLM image. Container options and engine options are held
# in separate arrays; the resulting argument order matches a single flat
# `docker run` command line.
router_container_opts=(
  --rm -it --gpus all --ipc=host
  -p 8003:8003
  # Share ~/.cache/huggingface with the container at /root/.cache/huggingface.
  -v ~/.cache/huggingface:/root/.cache/huggingface
)
router_engine_opts=(
  --max-model-len 4096
  --max-num-seqs 4
  --gpu-memory-utilization 0.15
  --port 8003
  --host 0.0.0.0
  --trust-remote-code
)
docker run "${router_container_opts[@]}" \
  nvcr.io/nvidia/vllm:26.02-py3 \
  vllm serve microsoft/Phi-4-mini-instruct \
  "${router_engine_opts[@]}"
anand@spark-2bb8:~/reachy-personal-assistant$ cat nat/src/ces_tutorial/config.yml
general:
  front_end:
    _type: fastapi
functions:
  wikipedia_search:
    _type: wiki_search
    max_results: 2
  router:
    _type: router
    route_config:
      - name: other
        description: Any question that requires careful thought, outside information, image understanding, or tool calling to take actions.
      - name: chit_chat
        description: Any simple chit chat, small talk, or casual conversation.
      - name: image_understanding
        description: A question that requires the assistant to see the user eg a question about their appearance, environment, scene or surroundings. Examples what am I holding, what am I wearing, what do I look like, what is in my surroundings, what does it say on the whiteboard. Questions about attire eg what color is my shirt/hat/jacket/etc
    llm_name: routing_llm
  react_agent:
    _type: react_agent
    llm_name: agent_llm
    verbose: true
    parse_agent_response_max_retries: 3
    tool_names: [wikipedia_search]
llms:
  agent_llm:
    _type: nim
    model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4
    base_url: http://localhost:8000/v1
    temperature: 0.0
  chitchat_llm:
    _type: nim
    model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4
    base_url: http://localhost:8000/v1
    temperature: 0.0
  routing_llm:
    _type: nim
    model_name: microsoft/Phi-4-mini-instruct
    base_url: http://localhost:8003/v1
    temperature: 0.0
  image_llm:
    _type: nim
    model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD
    base_url: http://localhost:8001/v1
    temperature: 0.0
workflow:
  _type: ces_tutorial_router_agent
  agent: react_agent
  router: router
  chitchat_llm: chitchat_llm
  image_llm: image_llm
anand@spark-2bb8:~/reachy-personal-assistant$ curl http://localhost:8000/v1/models
{"object":"list","data":[{"id":"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4","object":"model","created":1773548758,"owned_by":"vllm","root":"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4","parent":null,"max_model_len":16384,"permission":[{"id":"modelperm-967de90493ecdcd6","object":"model_permission","created":1773548758,"allow_create_engine":false,"allow_sampling":true,"allow_logprobs":true,"allow_search_indices":false,"allow_view":true,"allow_fine_tuning":false,"organization":"*","group":null,"is_blocking":false}]}]}
anand@spark-2bb8:~/reachy-personal-assistant$ curl http://localhost:8000/v1/models
anand@spark-2bb8:~/reachy-personal-assistant$ curl http://localhost:8001/v1/models
{"object":"list","data":[{"id":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD","object":"model","created":1773548783,"owned_by":"vllm","root":"nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD","parent":null,"max_model_len":8192,"permission":[{"id":"modelperm-a249036fddfb8ec0","object":"model_permission","created":1773548783,"allow_create_engine":false,"allow_sampling":true,"allow_logprobs":true,"allow_search_indices":false,"allow_view":true,"allow_fine_tuning":false,"organization":"*","group":null,"is_blocking":false}]}]}
anand@spark-2bb8:~/reachy-personal-assistant$ curl -s http://localhost:8002/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model": "test", "messages": [{"role": "user", "content": "What is the capital of France?"}]}'
{"id":"518f831f-6d04-4a0a-91bb-b722776dca5f","object":"chat.completion","model":"unknown-model","created":1773549795,"choices":[{"finish_reason":"stop","index":0,"message":{"content":"The capital of France is Paris.","role":"assistant"}}],"usage":{"prompt_tokens":6,"completion_tokens":6,"total_tokens":12},"system_fingerprint":null,"service_tier":null}
I was impressed by the video clip in the blog post (https://huggingface.co/blog/nvidia-reachy-mini).
I tried it a little, but loading the models locally is excruciatingly slow, so I decided to return the DGX Spark.
Here is roughly what I have so far, thanks to some vibe coding with Claude.