-
Notifications
You must be signed in to change notification settings - Fork 2
[Feature] Mac native backend support with vLLM-Metal, MLX, and llama.cpp #8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
05401b2
4c3caf0
5b0dc3f
9ba3215
e692d90
42783dd
03684a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,9 @@ | ||
| """Native Mac deployment operations. | ||
|
|
||
| This module handles native deployment operations for macOS, | ||
| including Ollama, MLX, and llama.cpp backends. | ||
| including Ollama, MLX, llama.cpp, and vLLM-Metal backends. | ||
|
|
||
| Supports automatic model conversion from HuggingFace to MLX/GGUF formats. | ||
| """ | ||
|
|
||
| import asyncio | ||
|
|
@@ -15,17 +17,29 @@ | |
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| def _is_mlx_ready(model_id: str) -> bool: | ||
| """Check if model is already in MLX format.""" | ||
| return model_id.startswith("mlx-community/") | ||
|
|
||
|
|
||
| def _is_gguf_file(model_id: str) -> bool: | ||
| """Check if model_id is a GGUF file path.""" | ||
| return model_id.endswith(".gguf") | ||
|
|
||
|
|
||
| async def deploy_native(deployment: Deployment, db) -> dict: | ||
| """Deploy using native backend (Mac without Docker). | ||
|
|
||
| Supports Ollama, MLX, and llama.cpp backends on macOS. | ||
| Supports Ollama, MLX, llama.cpp, and vLLM-Metal backends on macOS. | ||
| Handles automatic conversion of HuggingFace models to MLX/GGUF formats. | ||
| """ | ||
| # Import here to avoid circular imports | ||
| from app.services.deployer.health import wait_for_native_api_ready | ||
|
|
||
| worker = deployment.worker | ||
| model = deployment.model | ||
| backend = deployment.backend | ||
| model_id = model.model_id | ||
|
|
||
| # Validate backend is supported | ||
| available_backends = worker.available_backends | ||
|
|
@@ -35,19 +49,37 @@ async def deploy_native(deployment: Deployment, db) -> dict: | |
| f"Available backends: {', '.join(available_backends)}" | ||
| } | ||
|
|
||
| # Check if model needs conversion and update status | ||
| needs_conversion = False | ||
| if backend == "mlx" and not _is_mlx_ready(model_id): | ||
| needs_conversion = True | ||
| deployment.status_message = "Model may need conversion to MLX format..." | ||
| await db.commit() | ||
| elif backend == "llama_cpp" and not _is_gguf_file(model_id): | ||
| needs_conversion = True | ||
| deployment.status_message = "Model may need conversion to GGUF format..." | ||
| await db.commit() | ||
|
|
||
| try: | ||
| worker_url = f"http://{worker.effective_address}/native/deploy" | ||
|
|
||
| deploy_request = { | ||
| "deployment_id": deployment.id, | ||
| "deployment_name": deployment.name, | ||
| "model_id": model.model_id, | ||
| "model_id": model_id, | ||
| "backend": backend, | ||
| "port": 0, # Auto-assign | ||
| "extra_params": deployment.extra_params, | ||
| } | ||
|
|
||
| deployment.status_message = f"Starting {backend} deployment..." | ||
| # Set container_id early so logs can be fetched during deployment | ||
| expected_process_id = f"native-{deployment.id}" | ||
| deployment.container_id = expected_process_id | ||
|
Comment on lines +75 to +77
|
||
|
|
||
| if needs_conversion: | ||
| deployment.status_message = f"Converting model and starting {backend} deployment..." | ||
| else: | ||
| deployment.status_message = f"Starting {backend} deployment..." | ||
| await db.commit() | ||
|
|
||
| async with httpx.AsyncClient(timeout=600.0) as client: | ||
|
|
@@ -59,8 +91,10 @@ async def deploy_native(deployment: Deployment, db) -> dict: | |
|
|
||
| result = response.json() | ||
| deployment.port = result.get("port") | ||
| # Use process_id as container_id for native deployments | ||
| deployment.container_id = result.get("process_id") | ||
| # Verify process_id matches expected | ||
| actual_process_id = result.get("process_id") | ||
| if actual_process_id and actual_process_id != expected_process_id: | ||
| deployment.container_id = actual_process_id | ||
|
|
||
| # Wait for API to be ready | ||
| deployment.status_message = "Waiting for model to be ready..." | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The backend availability logic has changed to always include vLLM, MLX, and llama.cpp for Mac workers (lines 126-130), indicating that they "can be installed if missing". However, there is no validation that these installations will actually succeed. If a user tries to deploy with a backend that fails to install (e.g., due to missing dependencies or network issues), the deployment will fail, but the backend will still appear as available in the UI. Consider adding health checks or capability flags that indicate whether these backends are actually functional.