Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions backend/app/api/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,3 +616,189 @@ async def get_model_readme(

except httpx.RequestError as e:
return {"content": None, "message": f"Failed to fetch README: {str(e)}"}


class ModelFormatInfo(BaseModel):
    """Format compatibility summary for a single HuggingFace model.

    Holds the MLX / GGUF readiness flags for the model together with the
    MLX variant IDs and GGUF filenames that were discovered for it.
    """

    # Repository ID the information refers to.
    model_id: str
    # True if the repository comes from mlx-community.
    is_mlx_ready: bool = False
    # True if the repository has .gguf files.
    is_gguf_ready: bool = False
    # Available MLX variants of this model.
    mlx_variants: list[str] = []
    # Available GGUF files in the repository.
    gguf_files: list[str] = []


def _is_mlx_ready(model_id: str) -> bool:
"""Check if model is from mlx-community."""
return model_id.startswith("mlx-community/")


def _is_gguf_ready(files: list[str]) -> bool:
"""Check if model has GGUF files."""
return any(f.endswith(".gguf") for f in files)


@router.get("/format-info/{model_id:path}", response_model=ModelFormatInfo)
async def get_model_format_info(
    model_id: str,
    token: str | None = Query(None, description="HuggingFace API token"),
):
    """
    Get model format compatibility information.

    Returns whether the model is MLX-ready, GGUF-ready, and lists available variants.

    The lookup is best-effort: if HuggingFace cannot be reached, or answers
    with a body that is not the expected JSON, a warning is logged and
    whatever was gathered so far is returned instead of failing the request.

    Args:
        model_id: HuggingFace repository ID (may contain slashes).
        token: Optional HuggingFace API token, sent as a Bearer token on the
            model metadata request.

    Returns:
        A ``ModelFormatInfo`` with the readiness flags, GGUF filenames and
        mlx-community variant IDs that could be determined.
    """
    import logging

    headers = {}
    if token:
        headers["Authorization"] = f"Bearer {token}"

    result = ModelFormatInfo(
        model_id=model_id,
        is_mlx_ready=_is_mlx_ready(model_id),
    )

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            # Get model files to check for GGUF
            response = await client.get(
                f"{HF_API_URL}/models/{model_id}",
                headers=headers,
            )

            if response.status_code == 200:
                payload = response.json()
                # Tolerate a body that is not the expected object shape.
                siblings = payload.get("siblings") if isinstance(payload, dict) else None
                filenames = [
                    entry.get("rfilename") or ""
                    for entry in siblings or []
                    if isinstance(entry, dict)
                ]

                # Check for GGUF files
                gguf_files = [name for name in filenames if name.endswith(".gguf")]
                result.gguf_files = gguf_files
                result.is_gguf_ready = bool(gguf_files)

            # Search for MLX variants if not already MLX
            if not result.is_mlx_ready:
                model_name = model_id.split("/")[-1]
                # Search mlx-community for this model
                search_response = await client.get(
                    f"{HF_API_URL}/models",
                    params={
                        "search": model_name,
                        "author": "mlx-community",
                        "limit": 5,
                    },
                )
                if search_response.status_code == 200:
                    hits = search_response.json()
                    if isinstance(hits, list):
                        variant_ids = (
                            hit.get("modelId") or hit.get("id")
                            for hit in hits
                            if isinstance(hit, dict)
                        )
                        # Drop hits that carry no usable ID rather than
                        # reporting empty-string variants.
                        result.mlx_variants = [vid for vid in variant_ids if vid]

    except (httpx.RequestError, ValueError) as e:
        # Log error but don't fail - return partial info.
        # ValueError covers a response body that is not valid JSON.
        logging.getLogger(__name__).warning("Failed to fetch format info: %s", e)

    return result


@router.get("/search-mlx")
async def search_mlx_models(
    query: str = Query(..., min_length=2, description="Search query"),
    limit: int = Query(20, ge=1, le=50, description="Number of results"),
):
    """
    Search for MLX-ready models from mlx-community.

    Returns models that are already converted to MLX format.

    Args:
        query: Search text forwarded to the HuggingFace model search.
        limit: Maximum number of results requested from HuggingFace.

    Returns:
        A list of dicts with ``id``, ``author``, ``downloads``, ``likes``,
        ``pipeline_tag``, up to five ``tags`` and ``is_mlx_ready`` set to True.

    Raises:
        HTTPException: 503 when HuggingFace cannot be reached, 502 when it
            answers with an error status.
    """
    params = {
        "search": query,
        "author": "mlx-community",
        "limit": limit,
        "sort": "downloads",
        "direction": -1,
    }

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(
                f"{HF_API_URL}/models",
                params=params,
            )
            response.raise_for_status()
            models = response.json()
    except httpx.HTTPStatusError as e:
        # raise_for_status() raises HTTPStatusError, which is not a
        # RequestError; without this branch an upstream 4xx/5xx would surface
        # as an unhandled 500.
        raise HTTPException(
            status_code=502,
            detail=f"HuggingFace search failed with status {e.response.status_code}",
        ) from e
    except httpx.RequestError as e:
        raise HTTPException(
            status_code=503, detail=f"Failed to search HuggingFace: {str(e)}"
        ) from e

    return [
        {
            "id": m.get("modelId") or m.get("id"),
            "author": m.get("author"),
            "downloads": m.get("downloads", 0),
            "likes": m.get("likes", 0),
            "pipeline_tag": m.get("pipeline_tag"),
            # ``tags`` may be present but null; treat that as no tags.
            "tags": (m.get("tags") or [])[:5],
            "is_mlx_ready": True,
        }
        for m in models
    ]


@router.get("/search-gguf")
async def search_gguf_models(
    query: str = Query(..., min_length=2, description="Search query"),
    limit: int = Query(20, ge=1, le=50, description="Number of results"),
):
    """
    Search for models with GGUF files available.

    Returns models that have pre-converted GGUF files.

    Args:
        query: Search text forwarded to the HuggingFace model search.
        limit: Maximum number of results to return.

    Returns:
        A list of at most ``limit`` dicts with ``id``, ``author``,
        ``downloads``, ``likes``, ``pipeline_tag``, up to five ``tags`` and
        ``is_gguf_ready`` set to True.

    Raises:
        HTTPException: 503 when HuggingFace cannot be reached, 502 when it
            answers with an error status.
    """
    # Search with GGUF tag
    params = {
        "search": query,
        "limit": limit * 2,  # Get more to filter
        "sort": "downloads",
        "direction": -1,
        "filter": "gguf",
    }

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.get(
                f"{HF_API_URL}/models",
                params=params,
            )
            response.raise_for_status()
            models = response.json()
    except httpx.HTTPStatusError as e:
        # raise_for_status() raises HTTPStatusError, which is not a
        # RequestError; without this branch an upstream 4xx/5xx would surface
        # as an unhandled 500.
        raise HTTPException(
            status_code=502,
            detail=f"HuggingFace search failed with status {e.response.status_code}",
        ) from e
    except httpx.RequestError as e:
        raise HTTPException(
            status_code=503, detail=f"Failed to search HuggingFace: {str(e)}"
        ) from e

    # Filter to only include models with GGUF in name or tags
    gguf_models = []
    for m in models:
        # Fall back to "" / [] when the fields are missing or null so the
        # string checks below cannot fail.
        model_id = m.get("modelId") or m.get("id") or ""
        tags = m.get("tags") or []

        # Check if model has GGUF indicator
        is_gguf = "gguf" in model_id.lower() or any("gguf" in t.lower() for t in tags)
        if not is_gguf:
            continue

        gguf_models.append(
            {
                "id": model_id,
                "author": m.get("author"),
                "downloads": m.get("downloads", 0),
                "likes": m.get("likes", 0),
                "pipeline_tag": m.get("pipeline_tag"),
                "tags": tags[:5],
                "is_gguf_ready": True,
            }
        )

        # Twice the limit was requested upstream; stop once enough matched.
        if len(gguf_models) >= limit:
            break

    return gguf_models
17 changes: 10 additions & 7 deletions backend/app/models/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,17 @@ def available_backends(self) -> list[str]:
backends.extend(["vllm", "sglang", "ollama"])
else:
backends.append("ollama")
else:
# Native backends (Mac)
if caps.get("ollama"):

# Mac native backends - always available (can be installed if missing)
if self.is_mac:
# vLLM-Metal, MLX, llama.cpp are all installable on Mac
mac_backends = ["vllm", "mlx", "llama_cpp"]
for b in mac_backends:
if b not in backends:
Comment on lines +124 to +129
Copy link

Copilot AI Feb 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The backend availability logic has changed to always include vLLM, MLX, and llama.cpp for Mac workers (lines 126-130), indicating they "can be installed if missing". However, there's no validation that these installations will actually succeed. If a user tries to deploy with a backend that fails to install (e.g., due to missing dependencies or network issues), the deployment will fail but the backend will still appear as available in the UI. Consider adding health checks or capability flags that indicate whether these backends are actually functional.

Suggested change
# Mac native backends - always available (can be installed if missing)
if self.is_mac:
# vLLM-Metal, MLX, llama.cpp are all installable on Mac
mac_backends = ["vllm", "mlx", "llama_cpp"]
for b in mac_backends:
if b not in backends:
# Mac native backends - conditionally available based on capabilities/health
if self.is_mac:
# vLLM-Metal, MLX, llama.cpp are installable on Mac, but should only be
# exposed as available if the worker has indicated support via capabilities.
mac_backends = ["vllm", "mlx", "llama_cpp"]
for b in mac_backends:
# Only include backend if explicitly marked as supported in capabilities
if caps.get(b) and b not in backends:

Copilot uses AI. Check for mistakes.
backends.append(b)
# Ollama on Mac (if installed)
if caps.get("ollama") and "ollama" not in backends:
backends.append("ollama")
if caps.get("mlx"):
backends.append("mlx")
if caps.get("llama_cpp"):
backends.append("llama_cpp")

return backends

Expand Down
46 changes: 40 additions & 6 deletions backend/app/services/deployer/native.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Native Mac deployment operations.

This module handles native deployment operations for macOS,
including Ollama, MLX, and llama.cpp backends.
including Ollama, MLX, llama.cpp, and vLLM-Metal backends.

Supports automatic model conversion from HuggingFace to MLX/GGUF formats.
"""

import asyncio
Expand All @@ -15,17 +17,29 @@
logger = logging.getLogger(__name__)


def _is_mlx_ready(model_id: str) -> bool:
"""Check if model is already in MLX format."""
return model_id.startswith("mlx-community/")


def _is_gguf_file(model_id: str) -> bool:
"""Check if model_id is a GGUF file path."""
return model_id.endswith(".gguf")


async def deploy_native(deployment: Deployment, db) -> dict:
"""Deploy using native backend (Mac without Docker).

Supports Ollama, MLX, and llama.cpp backends on macOS.
Supports Ollama, MLX, llama.cpp, and vLLM-Metal backends on macOS.
Handles automatic conversion of HuggingFace models to MLX/GGUF formats.
"""
# Import here to avoid circular imports
from app.services.deployer.health import wait_for_native_api_ready

worker = deployment.worker
model = deployment.model
backend = deployment.backend
model_id = model.model_id

# Validate backend is supported
available_backends = worker.available_backends
Expand All @@ -35,19 +49,37 @@ async def deploy_native(deployment: Deployment, db) -> dict:
f"Available backends: {', '.join(available_backends)}"
}

# Check if model needs conversion and update status
needs_conversion = False
if backend == "mlx" and not _is_mlx_ready(model_id):
needs_conversion = True
deployment.status_message = "Model may need conversion to MLX format..."
await db.commit()
elif backend == "llama_cpp" and not _is_gguf_file(model_id):
needs_conversion = True
deployment.status_message = "Model may need conversion to GGUF format..."
await db.commit()

try:
worker_url = f"http://{worker.effective_address}/native/deploy"

deploy_request = {
"deployment_id": deployment.id,
"deployment_name": deployment.name,
"model_id": model.model_id,
"model_id": model_id,
"backend": backend,
"port": 0, # Auto-assign
"extra_params": deployment.extra_params,
}

deployment.status_message = f"Starting {backend} deployment..."
# Set container_id early so logs can be fetched during deployment
expected_process_id = f"native-{deployment.id}"
deployment.container_id = expected_process_id
Comment on lines +75 to +77
Copy link

Copilot AI Feb 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the native deployment function, the container_id is set early (line 77) before the deployment actually starts. If the deployment fails before reaching the worker, the deployment record will have a container_id that doesn't correspond to any actual process. This could cause confusion when trying to debug failed deployments. Consider only setting container_id after the deployment request succeeds.

Copilot uses AI. Check for mistakes.

if needs_conversion:
deployment.status_message = f"Converting model and starting {backend} deployment..."
else:
deployment.status_message = f"Starting {backend} deployment..."
await db.commit()

async with httpx.AsyncClient(timeout=600.0) as client:
Expand All @@ -59,8 +91,10 @@ async def deploy_native(deployment: Deployment, db) -> dict:

result = response.json()
deployment.port = result.get("port")
# Use process_id as container_id for native deployments
deployment.container_id = result.get("process_id")
# Verify process_id matches expected
actual_process_id = result.get("process_id")
if actual_process_id and actual_process_id != expected_process_id:
deployment.container_id = actual_process_id

# Wait for API to be ready
deployment.status_message = "Waiting for model to be ready..."
Expand Down
13 changes: 10 additions & 3 deletions backend/app/services/deployer/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,18 @@ async def deploy(self, deployment_id: int) -> None:
worker = deployment.worker
backend = deployment.backend

# Mac with Ollama should always use native deployment (use local Ollama)
# Mac with Ollama, MLX, llama_cpp, or vLLM should use native deployment
# vLLM on Mac uses vLLM-Metal (native Apple Silicon acceleration)
# Mac without Docker should also use native deployment
is_mac = worker.os_type == OSType.DARWIN.value
native_backends = (
BackendType.OLLAMA.value,
BackendType.MLX.value,
BackendType.LLAMA_CPP.value,
BackendType.VLLM.value, # vLLM-Metal on Mac
)
is_mac_native = is_mac and (
backend == BackendType.OLLAMA.value or not worker.supports_docker
backend in native_backends or not worker.supports_docker
)

# Use native deployment for Mac
Expand Down Expand Up @@ -315,7 +322,7 @@ async def stop(self, deployment_id: int) -> None:
async def get_logs(self, deployment: Deployment, tail: int = 100) -> str:
"""Get logs from a deployment"""
if not deployment.container_id or not deployment.worker:
return "No container running"
return "No deployment process running"

try:
worker = deployment.worker
Expand Down
Loading
Loading