Infinirc · ricky-chaoju · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026
diff --git a/backend/app/api/huggingface.py b/backend/app/api/huggingface.py
@@ -616,3 +616,189 @@ async def get_model_readme(
 
     except httpx.RequestError as e:
         return {"content": None, "message": f"Failed to fetch README: {str(e)}"}
+
+
+class ModelFormatInfo(BaseModel):
+    """Response schema for /format-info: MLX and GGUF availability of one model."""
+
+    model_id: str  # Repository id exactly as received in the request path
+    is_mlx_ready: bool = False  # True when model_id starts with "mlx-community/"
+    is_gguf_ready: bool = False  # True when at least one repo file ends in ".gguf"
+    mlx_variants: list[str] = []  # Up to 5 mlx-community ids matching the model name
+    gguf_files: list[str] = []  # Repo filenames ending in ".gguf"
+
+
+def _is_mlx_ready(model_id: str) -> bool:
+    """Return True if model_id starts with "mlx-community/" (prefix check only)."""
+    return model_id.startswith("mlx-community/")
+
+
+def _is_gguf_ready(files: list[str]) -> bool:
+    """Return True if any filename in files ends with ".gguf" (case-sensitive)."""
+    return any(f.endswith(".gguf") for f in files)
+
+
+@router.get("/format-info/{model_id:path}", response_model=ModelFormatInfo)
+async def get_model_format_info(
+    model_id: str,
+    token: str | None = Query(None, description="HuggingFace API token"),
+):
+    """
+    Report MLX/GGUF availability for a model; best-effort, upstream failures are logged.
+
+    Returns ModelFormatInfo; fields that could not be fetched keep their defaults.
+    """
+    headers = {}
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+
+    result = ModelFormatInfo(
+        model_id=model_id,
+        is_mlx_ready=_is_mlx_ready(model_id),
+    )
+
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            # Model metadata; its "siblings" entries carry the repo filenames
+            response = await client.get(
+                f"{HF_API_URL}/models/{model_id}",
+                headers=headers,
+            )
+
+            if response.status_code == 200:
+                data = response.json()
+                siblings = data.get("siblings", [])
+                files = [s.get("rfilename", "") for s in siblings]
+
+                # A non-200 reply leaves the GGUF fields at their defaults
+                gguf_files = [f for f in files if f.endswith(".gguf")]
+                result.gguf_files = gguf_files
+                result.is_gguf_ready = len(gguf_files) > 0
+
+            # Search for MLX variants if not already MLX
+            if not result.is_mlx_ready:
+                model_name = model_id.split("/")[-1]
+                # Look up mlx-community repos by bare model name (no org prefix)
+                search_response = await client.get(
+                    f"{HF_API_URL}/models",
+                    params={
+                        "search": model_name,
+                        "author": "mlx-community",
+                        "limit": 5,
+                    },
+                )
+                if search_response.status_code == 200:
+                    mlx_models = search_response.json()
+                    result.mlx_variants = [m.get("modelId", m.get("id", "")) for m in mlx_models]
+
+    except (httpx.RequestError, ValueError) as e:
+        # ValueError: a 200 reply with a non-JSON body; like transport errors it must
+        # not turn this best-effort endpoint into a 500 - return the partial result
+        import logging
+        logging.getLogger(__name__).warning("Failed to fetch format info: %s", e)
+
+    return result
+
+
+@router.get("/search-mlx")
+async def search_mlx_models(
+    query: str = Query(..., min_length=2, description="Search query"),
+    limit: int = Query(20, ge=1, le=50, description="Number of results"),
+):
+    """
+    Search mlx-community models on HuggingFace, requesting download-count ordering.
+
+    Returns a list of model summaries; raises HTTPException(503) on any upstream failure.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            params = {
+                "search": query,
+                "author": "mlx-community",
+                "limit": limit,
+                "sort": "downloads",
+                "direction": -1,
+            }
+
+            response = await client.get(
+                f"{HF_API_URL}/models",
+                params=params,
+            )
+            response.raise_for_status()
+
+            models = response.json()
+            return [
+                {
+                    "id": m.get("modelId", m.get("id")),
+                    "author": m.get("author"),
+                    "downloads": m.get("downloads", 0),
+                    "likes": m.get("likes", 0),
+                    "pipeline_tag": m.get("pipeline_tag"),
+                    "tags": m.get("tags", [])[:5],
+                    "is_mlx_ready": True,
+                }
+                for m in models
+            ]
+
+    # HTTPError also covers the HTTPStatusError from raise_for_status(); RequestError does not
+    except httpx.HTTPError as e:
+        raise HTTPException(status_code=503, detail=f"Failed to search HuggingFace: {str(e)}")
+
+
+@router.get("/search-gguf")
+async def search_gguf_models(
+    query: str = Query(..., min_length=2, description="Search query"),
+    limit: int = Query(20, ge=1, le=50, description="Number of results"),
+):
+    """
+    Search HuggingFace for models whose id or tags mention GGUF.
+
+    Returns at most `limit` summaries; raises HTTPException(503) on any upstream failure.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            # Search with GGUF tag
+            params = {
+                "search": query,
+                "limit": limit * 2,  # Over-fetch: some hits are dropped by the filter below
+                "sort": "downloads",
+                "direction": -1,
+                "filter": "gguf",
+            }
+
+            response = await client.get(
+                f"{HF_API_URL}/models",
+                params=params,
+            )
+            response.raise_for_status()
+
+            models = response.json()
+
+            # Filter to only include models with GGUF in name or tags
+            gguf_models = []
+            for m in models:
+                model_id = m.get("modelId", m.get("id", ""))
+                tags = m.get("tags", [])
+
+                # Case-insensitive substring match on the id and on every tag
+                is_gguf = "gguf" in model_id.lower() or any("gguf" in t.lower() for t in tags)
+
+                if is_gguf:
+                    gguf_models.append(
+                        {
+                            "id": model_id,
+                            "author": m.get("author"),
+                            "downloads": m.get("downloads", 0),
+                            "likes": m.get("likes", 0),
+                            "pipeline_tag": m.get("pipeline_tag"),
+                            "tags": tags[:5],
+                            "is_gguf_ready": True,
+                        }
+                    )
+
+                if len(gguf_models) >= limit:
+                    break
+
+            return gguf_models
+
+    # HTTPError also covers the HTTPStatusError from raise_for_status(); RequestError does not
+    except httpx.HTTPError as e:
+        raise HTTPException(status_code=503, detail=f"Failed to search HuggingFace: {str(e)}")
diff --git a/backend/app/models/worker.py b/backend/app/models/worker.py
@@ -120,14 +120,17 @@ def available_backends(self) -> list[str]:
                 backends.extend(["vllm", "sglang", "ollama"])
             else:
                 backends.append("ollama")
-        else:
-            # Native backends (Mac)
-            if caps.get("ollama"):
+
+        # Mac native backends - always available (can be installed if missing)
+        if self.is_mac:
+            # vLLM-Metal, MLX, llama.cpp are all installable on Mac
+            mac_backends = ["vllm", "mlx", "llama_cpp"]
+            for b in mac_backends:
+                if b not in backends:
-        # Mac native backends - always available (can be installed if missing)
-        if self.is_mac:
-            # vLLM-Metal, MLX, llama.cpp are all installable on Mac
-            mac_backends = ["vllm", "mlx", "llama_cpp"]
-            for b in mac_backends:
-                if b not in backends:
+        # Mac native backends - conditionally available based on capabilities/health
+        if self.is_mac:
+            # vLLM-Metal, MLX, llama.cpp are installable on Mac, but should only be
+            # exposed as available if the worker has indicated support via capabilities.
+            mac_backends = ["vllm", "mlx", "llama_cpp"]
+            for b in mac_backends:
+                # Only include backend if explicitly marked as supported in capabilities
+                if caps.get(b) and b not in backends:
-        # Mac native backends - always available (can be installed if missing)
-        if self.is_mac:
-            # vLLM-Metal, MLX, llama.cpp are all installable on Mac
-            mac_backends = ["vllm", "mlx", "llama_cpp"]
-            for b in mac_backends:
-                if b not in backends:
+        # Mac native backends - conditionally available based on capabilities/health
+        if self.is_mac:
+            # vLLM-Metal, MLX, llama.cpp are installable on Mac, but should only be
+            # exposed as available if the worker has indicated support via capabilities.
+            mac_backends = ["vllm", "mlx", "llama_cpp"]
+            for b in mac_backends:
+                # Only include backend if explicitly marked as supported in capabilities
+                if caps.get(b) and b not in backends:
+                    backends.append(b)
+            # Ollama on Mac (if installed)
+            if caps.get("ollama") and "ollama" not in backends:
                 backends.append("ollama")
-            if caps.get("mlx"):
-                backends.append("mlx")
-            if caps.get("llama_cpp"):
-                backends.append("llama_cpp")
 
         return backends
 

diff --git a/backend/app/services/deployer/native.py b/backend/app/services/deployer/native.py
@@ -1,7 +1,9 @@
 """Native Mac deployment operations.
 
 This module handles native deployment operations for macOS,
-including Ollama, MLX, and llama.cpp backends.
+including Ollama, MLX, llama.cpp, and vLLM-Metal backends.
+
+Supports automatic model conversion from HuggingFace to MLX/GGUF formats.
 """
 
 import asyncio
@@ -15,17 +17,29 @@
 logger = logging.getLogger(__name__)
 
 
+def _is_mlx_ready(model_id: str) -> bool:
+    """Return True if model_id starts with "mlx-community/" (treated as already MLX)."""
+    return model_id.startswith("mlx-community/")
+
+
+def _is_gguf_file(model_id: str) -> bool:
+    """Return True if model_id ends with ".gguf" (case-sensitive suffix check)."""
+    return model_id.endswith(".gguf")
+
+
 async def deploy_native(deployment: Deployment, db) -> dict:
     """Deploy using native backend (Mac without Docker).
 
-    Supports Ollama, MLX, and llama.cpp backends on macOS.
+    Supports Ollama, MLX, llama.cpp, and vLLM-Metal backends on macOS.
+    Handles automatic conversion of HuggingFace models to MLX/GGUF formats.
     """
     # Import here to avoid circular imports
     from app.services.deployer.health import wait_for_native_api_ready
 
     worker = deployment.worker
     model = deployment.model
     backend = deployment.backend
+    model_id = model.model_id
 
     # Validate backend is supported
     available_backends = worker.available_backends
@@ -35,19 +49,37 @@ async def deploy_native(deployment: Deployment, db) -> dict:
             f"Available backends: {', '.join(available_backends)}"
         }
 
+    # Check if model needs conversion and update status
+    needs_conversion = False
+    if backend == "mlx" and not _is_mlx_ready(model_id):
+        needs_conversion = True
+        deployment.status_message = "Model may need conversion to MLX format..."
+        await db.commit()
+    elif backend == "llama_cpp" and not _is_gguf_file(model_id):
+        needs_conversion = True
+        deployment.status_message = "Model may need conversion to GGUF format..."
+        await db.commit()
+
     try:
         worker_url = f"http://{worker.effective_address}/native/deploy"
 
         deploy_request = {
             "deployment_id": deployment.id,
             "deployment_name": deployment.name,
-            "model_id": model.model_id,
+            "model_id": model_id,
             "backend": backend,
             "port": 0,  # Auto-assign
             "extra_params": deployment.extra_params,
         }
 
-        deployment.status_message = f"Starting {backend} deployment..."
+        # Set container_id early so logs can be fetched during deployment
+        expected_process_id = f"native-{deployment.id}"
+        deployment.container_id = expected_process_id
+
+        if needs_conversion:
+            deployment.status_message = f"Converting model and starting {backend} deployment..."
+        else:
+            deployment.status_message = f"Starting {backend} deployment..."
         await db.commit()
 
         async with httpx.AsyncClient(timeout=600.0) as client:
@@ -59,8 +91,10 @@ async def deploy_native(deployment: Deployment, db) -> dict:
 
             result = response.json()
             deployment.port = result.get("port")
-            # Use process_id as container_id for native deployments
-            deployment.container_id = result.get("process_id")
+            # Verify process_id matches expected
+            actual_process_id = result.get("process_id")
+            if actual_process_id and actual_process_id != expected_process_id:
+                deployment.container_id = actual_process_id
 
         # Wait for API to be ready
         deployment.status_message = "Waiting for model to be ready..."

diff --git a/backend/app/services/deployer/service.py b/backend/app/services/deployer/service.py
@@ -77,11 +77,18 @@ async def deploy(self, deployment_id: int) -> None:
                 worker = deployment.worker
                 backend = deployment.backend
 
-                # Mac with Ollama should always use native deployment (use local Ollama)
+                # Mac with Ollama, MLX, llama_cpp, or vLLM should use native deployment
+                # vLLM on Mac uses vLLM-Metal (native Apple Silicon acceleration)
                 # Mac without Docker should also use native deployment
                 is_mac = worker.os_type == OSType.DARWIN.value
+                native_backends = (
+                    BackendType.OLLAMA.value,
+                    BackendType.MLX.value,
+                    BackendType.LLAMA_CPP.value,
+                    BackendType.VLLM.value,  # vLLM-Metal on Mac
+                )
                 is_mac_native = is_mac and (
-                    backend == BackendType.OLLAMA.value or not worker.supports_docker
+                    backend in native_backends or not worker.supports_docker
                 )
 
                 # Use native deployment for Mac
@@ -315,7 +322,7 @@ async def stop(self, deployment_id: int) -> None:
     async def get_logs(self, deployment: Deployment, tail: int = 100) -> str:
         """Get logs from a deployment"""
         if not deployment.container_id or not deployment.worker:
-            return "No container running"
+            return "No deployment process running"
 
         try:
             worker = deployment.worker