diff --git a/backend/app/api/huggingface.py b/backend/app/api/huggingface.py index 26f73cd..38d8f47 100644 --- a/backend/app/api/huggingface.py +++ b/backend/app/api/huggingface.py @@ -616,3 +616,189 @@ async def get_model_readme( except httpx.RequestError as e: return {"content": None, "message": f"Failed to fetch README: {str(e)}"} + + +class ModelFormatInfo(BaseModel): + """Model format compatibility information""" + + model_id: str + is_mlx_ready: bool = False # True if from mlx-community + is_gguf_ready: bool = False # True if has .gguf files + mlx_variants: list[str] = [] # Available MLX variants + gguf_files: list[str] = [] # Available GGUF files + + +def _is_mlx_ready(model_id: str) -> bool: + """Check if model is from mlx-community.""" + return model_id.startswith("mlx-community/") + + +def _is_gguf_ready(files: list[str]) -> bool: + """Check if model has GGUF files.""" + return any(f.endswith(".gguf") for f in files) + + +@router.get("/format-info/{model_id:path}", response_model=ModelFormatInfo) +async def get_model_format_info( + model_id: str, + token: str | None = Query(None, description="HuggingFace API token"), +): + """ + Get model format compatibility information. + + Returns whether the model is MLX-ready, GGUF-ready, and lists available variants. 
+ """ + headers = {} + if token: + headers["Authorization"] = f"Bearer {token}" + + result = ModelFormatInfo( + model_id=model_id, + is_mlx_ready=_is_mlx_ready(model_id), + ) + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + # Get model files to check for GGUF + response = await client.get( + f"{HF_API_URL}/models/{model_id}", + headers=headers, + ) + + if response.status_code == 200: + data = response.json() + siblings = data.get("siblings", []) + files = [s.get("rfilename", "") for s in siblings] + + # Check for GGUF files + gguf_files = [f for f in files if f.endswith(".gguf")] + result.gguf_files = gguf_files + result.is_gguf_ready = len(gguf_files) > 0 + + # Search for MLX variants if not already MLX + if not result.is_mlx_ready: + model_name = model_id.split("/")[-1] + # Search mlx-community for this model + search_response = await client.get( + f"{HF_API_URL}/models", + params={ + "search": model_name, + "author": "mlx-community", + "limit": 5, + }, + ) + if search_response.status_code == 200: + mlx_models = search_response.json() + result.mlx_variants = [m.get("modelId", m.get("id", "")) for m in mlx_models] + + except httpx.RequestError as e: + # Log error but don't fail - return partial info + import logging + + logging.getLogger(__name__).warning(f"Failed to fetch format info: {e}") + + return result + + +@router.get("/search-mlx") +async def search_mlx_models( + query: str = Query(..., min_length=2, description="Search query"), + limit: int = Query(20, ge=1, le=50, description="Number of results"), +): + """ + Search for MLX-ready models from mlx-community. + + Returns models that are already converted to MLX format. 
+ """ + try: + async with httpx.AsyncClient(timeout=30.0) as client: + params = { + "search": query, + "author": "mlx-community", + "limit": limit, + "sort": "downloads", + "direction": -1, + } + + response = await client.get( + f"{HF_API_URL}/models", + params=params, + ) + response.raise_for_status() + + models = response.json() + return [ + { + "id": m.get("modelId", m.get("id")), + "author": m.get("author"), + "downloads": m.get("downloads", 0), + "likes": m.get("likes", 0), + "pipeline_tag": m.get("pipeline_tag"), + "tags": m.get("tags", [])[:5], + "is_mlx_ready": True, + } + for m in models + ] + + except httpx.HTTPError as e: # HTTPError also covers the HTTPStatusError raised by raise_for_status() + raise HTTPException(status_code=503, detail=f"Failed to search HuggingFace: {str(e)}") + + +@router.get("/search-gguf") +async def search_gguf_models( + query: str = Query(..., min_length=2, description="Search query"), + limit: int = Query(20, ge=1, le=50, description="Number of results"), +): + """ + Search for models with GGUF files available. + + Returns models that have pre-converted GGUF files. 
+ """ + try: + async with httpx.AsyncClient(timeout=30.0) as client: + # Search with GGUF tag + params = { + "search": query, + "limit": limit * 2, # Get more to filter + "sort": "downloads", + "direction": -1, + "filter": "gguf", + } + + response = await client.get( + f"{HF_API_URL}/models", + params=params, + ) + response.raise_for_status() + + models = response.json() + + # Filter to only include models with GGUF in name or tags + gguf_models = [] + for m in models: + model_id = m.get("modelId", m.get("id", "")) + tags = m.get("tags", []) + + # Check if model has GGUF indicator + is_gguf = "gguf" in model_id.lower() or any("gguf" in t.lower() for t in tags) + + if is_gguf: + gguf_models.append( + { + "id": model_id, + "author": m.get("author"), + "downloads": m.get("downloads", 0), + "likes": m.get("likes", 0), + "pipeline_tag": m.get("pipeline_tag"), + "tags": tags[:5], + "is_gguf_ready": True, + } + ) + + if len(gguf_models) >= limit: + break + + return gguf_models + + except httpx.HTTPError as e: # HTTPError also covers the HTTPStatusError raised by raise_for_status() + raise HTTPException(status_code=503, detail=f"Failed to search HuggingFace: {str(e)}") diff --git a/backend/app/models/worker.py b/backend/app/models/worker.py index 3be352d..a4c7ed1 100644 --- a/backend/app/models/worker.py +++ b/backend/app/models/worker.py @@ -120,14 +120,17 @@ def available_backends(self) -> list[str]: backends.extend(["vllm", "sglang", "ollama"]) else: backends.append("ollama") - else: - # Native backends (Mac) - if caps.get("ollama"): + + # Mac native backends - always available (can be installed if missing) + if self.is_mac: + # vLLM-Metal, MLX, llama.cpp are all installable on Mac + mac_backends = ["vllm", "mlx", "llama_cpp"] + for b in mac_backends: + if b not in backends: + backends.append(b) + # Ollama on Mac (if installed) + if caps.get("ollama") and "ollama" not in backends: backends.append("ollama") - if caps.get("mlx"): - backends.append("mlx") - if caps.get("llama_cpp"): - backends.append("llama_cpp") return backends diff --git 
a/backend/app/services/deployer/native.py b/backend/app/services/deployer/native.py index 4127b41..4ed005d 100644 --- a/backend/app/services/deployer/native.py +++ b/backend/app/services/deployer/native.py @@ -1,7 +1,9 @@ """Native Mac deployment operations. This module handles native deployment operations for macOS, -including Ollama, MLX, and llama.cpp backends. +including Ollama, MLX, llama.cpp, and vLLM-Metal backends. + +Supports automatic model conversion from HuggingFace to MLX/GGUF formats. """ import asyncio @@ -15,10 +17,21 @@ logger = logging.getLogger(__name__) +def _is_mlx_ready(model_id: str) -> bool: + """Check if model is already in MLX format.""" + return model_id.startswith("mlx-community/") + + +def _is_gguf_file(model_id: str) -> bool: + """Check if model_id is a GGUF file path.""" + return model_id.endswith(".gguf") + + async def deploy_native(deployment: Deployment, db) -> dict: """Deploy using native backend (Mac without Docker). - Supports Ollama, MLX, and llama.cpp backends on macOS. + Supports Ollama, MLX, llama.cpp, and vLLM-Metal backends on macOS. + Handles automatic conversion of HuggingFace models to MLX/GGUF formats. """ # Import here to avoid circular imports from app.services.deployer.health import wait_for_native_api_ready @@ -26,6 +39,7 @@ async def deploy_native(deployment: Deployment, db) -> dict: worker = deployment.worker model = deployment.model backend = deployment.backend + model_id = model.model_id # Validate backend is supported available_backends = worker.available_backends @@ -35,19 +49,37 @@ async def deploy_native(deployment: Deployment, db) -> dict: f"Available backends: {', '.join(available_backends)}" } + # Check if model needs conversion and update status + needs_conversion = False + if backend == "mlx" and not _is_mlx_ready(model_id): + needs_conversion = True + deployment.status_message = "Model may need conversion to MLX format..." 
+ await db.commit() + elif backend == "llama_cpp" and not _is_gguf_file(model_id): + needs_conversion = True + deployment.status_message = "Model may need conversion to GGUF format..." + await db.commit() + try: worker_url = f"http://{worker.effective_address}/native/deploy" deploy_request = { "deployment_id": deployment.id, "deployment_name": deployment.name, - "model_id": model.model_id, + "model_id": model_id, "backend": backend, "port": 0, # Auto-assign "extra_params": deployment.extra_params, } - deployment.status_message = f"Starting {backend} deployment..." + # Set container_id early so logs can be fetched during deployment + expected_process_id = f"native-{deployment.id}" + deployment.container_id = expected_process_id + + if needs_conversion: + deployment.status_message = f"Converting model and starting {backend} deployment..." + else: + deployment.status_message = f"Starting {backend} deployment..." await db.commit() async with httpx.AsyncClient(timeout=600.0) as client: @@ -59,8 +91,10 @@ async def deploy_native(deployment: Deployment, db) -> dict: result = response.json() deployment.port = result.get("port") - # Use process_id as container_id for native deployments - deployment.container_id = result.get("process_id") + # Verify process_id matches expected + actual_process_id = result.get("process_id") + if actual_process_id and actual_process_id != expected_process_id: + deployment.container_id = actual_process_id # Wait for API to be ready deployment.status_message = "Waiting for model to be ready..." 
diff --git a/backend/app/services/deployer/service.py b/backend/app/services/deployer/service.py index 33b1aa3..a9b0e72 100644 --- a/backend/app/services/deployer/service.py +++ b/backend/app/services/deployer/service.py @@ -77,11 +77,18 @@ async def deploy(self, deployment_id: int) -> None: worker = deployment.worker backend = deployment.backend - # Mac with Ollama should always use native deployment (use local Ollama) + # Mac with Ollama, MLX, llama_cpp, or vLLM should use native deployment + # vLLM on Mac uses vLLM-Metal (native Apple Silicon acceleration) # Mac without Docker should also use native deployment is_mac = worker.os_type == OSType.DARWIN.value + native_backends = ( + BackendType.OLLAMA.value, + BackendType.MLX.value, + BackendType.LLAMA_CPP.value, + BackendType.VLLM.value, # vLLM-Metal on Mac + ) is_mac_native = is_mac and ( - backend == BackendType.OLLAMA.value or not worker.supports_docker + backend in native_backends or not worker.supports_docker ) # Use native deployment for Mac @@ -315,7 +322,7 @@ async def stop(self, deployment_id: int) -> None: async def get_logs(self, deployment: Deployment, tail: int = 100) -> str: """Get logs from a deployment""" if not deployment.container_id or not deployment.worker: - return "No container running" + return "No deployment process running" try: worker = deployment.worker diff --git a/backend/app/services/local_worker.py b/backend/app/services/local_worker.py index 4b5e499..c6be866 100644 --- a/backend/app/services/local_worker.py +++ b/backend/app/services/local_worker.py @@ -4,14 +4,19 @@ """ import logging +import os import platform +import shutil import socket import subprocess +import time import psutil logger = logging.getLogger(__name__) +OLLAMA_DEFAULT_PORT = 11434 + def get_local_hostname() -> str: """Get the local hostname.""" @@ -121,6 +126,80 @@ def get_local_worker_info() -> dict: } +def ensure_ollama_running_on_host(host: str = "0.0.0.0", port: int = OLLAMA_DEFAULT_PORT) -> bool: + """Ensure 
Ollama is running on the host with external access enabled. + + This is called BEFORE starting Docker worker so that the container + can access Ollama on the host via localhost (with --network host). + + Args: + host: Host to bind to (default 0.0.0.0 for external access) + port: Port to bind to (default 11434) + + Returns: + True if Ollama is running and accessible + """ + # Only run on macOS + if platform.system() != "Darwin": + return True # Not needed on Linux (Docker can use GPU directly) + + # Check if Ollama is installed + ollama_path = shutil.which("ollama") + if not ollama_path: + logger.info("Ollama is not installed on this Mac") + return False + + # Check if Ollama is already running + try: + import httpx + + with httpx.Client(timeout=2.0) as client: + response = client.get(f"http://localhost:{port}/api/tags") + if response.status_code == 200: + logger.info("Ollama service is already running") + return True + except Exception: + pass + + # Ollama not running, start it with external access + logger.info(f"Starting Ollama service on {host}:{port}") + + env = os.environ.copy() + env["OLLAMA_HOST"] = f"{host}:{port}" + + try: + # Start ollama serve in background + process = subprocess.Popen( + [ollama_path, "serve"], + stdout=subprocess.DEVNULL, # output is never read; an undrained PIPE would block ollama once the buffer fills + stderr=subprocess.DEVNULL, + env=env, + start_new_session=True, + ) + logger.info(f"Started Ollama service (PID {process.pid})") + + # Wait for Ollama to be ready (up to 30 seconds) + import httpx + + for _ in range(30): + time.sleep(1) + try: + with httpx.Client(timeout=2.0) as client: + response = client.get(f"http://localhost:{port}/api/tags") + if response.status_code == 200: + logger.info("Ollama service is ready") + return True + except Exception: + pass + + logger.error("Ollama service failed to start in time") + return False + + except Exception as e: + logger.error(f"Failed to start Ollama service: {e}") + return False + + def spawn_docker_worker( worker_name: str, backend_url: str, @@ -132,6 +211,11 @@ def 
spawn_docker_worker( Returns: dict with keys: success, message, container_id (if success) """ + # On Mac, ensure Ollama is running with external access before starting Docker + if platform.system() == "Darwin": + logger.info("Mac detected, ensuring Ollama is running with external access...") + ensure_ollama_running_on_host() + # First, check if container with same name exists and remove it try: check_result = subprocess.run( diff --git a/frontend/src/api/huggingface.ts b/frontend/src/api/huggingface.ts index bb56690..5310be9 100644 --- a/frontend/src/api/huggingface.ts +++ b/frontend/src/api/huggingface.ts @@ -50,6 +50,16 @@ export interface HFSearchResult { likes: number; pipeline_tag?: string; tags: string[]; + is_mlx_ready?: boolean; + is_gguf_ready?: boolean; +} + +export interface ModelFormatInfo { + model_id: string; + is_mlx_ready: boolean; + is_gguf_ready: boolean; + mlx_variants: string[]; + gguf_files: string[]; } export const huggingfaceApi = { @@ -124,4 +134,43 @@ export const huggingfaceApi = { }>(`/huggingface/readme/${encodeURIComponent(modelId)}`, { params }); return response.data; }, + + getFormatInfo: async ( + modelId: string, + token?: string, + ): Promise => { + const params: Record = {}; + if (token) params.token = token; + const response = await api.get( + `/huggingface/format-info/${encodeURIComponent(modelId)}`, + { params }, + ); + return response.data; + }, + + searchMLX: async ( + query: string, + limit?: number, + ): Promise => { + const response = await api.get( + "/huggingface/search-mlx", + { + params: { query, limit }, + }, + ); + return response.data; + }, + + searchGGUF: async ( + query: string, + limit?: number, + ): Promise => { + const response = await api.get( + "/huggingface/search-gguf", + { + params: { query, limit }, + }, + ); + return response.data; + }, }; diff --git a/frontend/src/api/index.ts b/frontend/src/api/index.ts index 48f8ec6..555a9fb 100644 --- a/frontend/src/api/index.ts +++ b/frontend/src/api/index.ts @@ 
-88,6 +88,7 @@ export type { VRAMEstimate, HFModelFile, HFSearchResult, + ModelFormatInfo, } from "./huggingface"; // Types - Ollama diff --git a/frontend/src/assets/mlx-logo-dark.png b/frontend/src/assets/mlx-logo-dark.png new file mode 100644 index 0000000..cda3c1f Binary files /dev/null and b/frontend/src/assets/mlx-logo-dark.png differ diff --git a/frontend/src/assets/mlx-logo.png b/frontend/src/assets/mlx-logo.png new file mode 100644 index 0000000..be122bf Binary files /dev/null and b/frontend/src/assets/mlx-logo.png differ diff --git a/frontend/src/components/DeploymentAdvancedForm.tsx b/frontend/src/components/DeploymentAdvancedForm.tsx index b141d1d..89fe3a5 100644 --- a/frontend/src/components/DeploymentAdvancedForm.tsx +++ b/frontend/src/components/DeploymentAdvancedForm.tsx @@ -21,7 +21,7 @@ import type { FormInstance } from "antd"; const { Text } = Typography; interface DeploymentAdvancedFormProps { - backend: "vllm" | "sglang" | "ollama"; + backend: "vllm" | "sglang" | "ollama" | "mlx" | "llama_cpp"; form: FormInstance; } @@ -304,6 +304,7 @@ export default function DeploymentAdvancedForm({ backend, }: DeploymentAdvancedFormProps) { const renderBackendParams = () => { + // Native Mac backends if (backend === "ollama") { return (
@@ -313,6 +314,12 @@ export default function DeploymentAdvancedForm({ ); } + // MLX and llama.cpp - no advanced settings for now + if (backend === "mlx" || backend === "llama_cpp") { + return null; + } + + // vLLM and SGLang (Docker-based) const isVllm = backend === "vllm"; const tabItems = [ { diff --git a/frontend/src/components/HuggingFaceModelPicker.tsx b/frontend/src/components/HuggingFaceModelPicker.tsx index 2533866..2741e83 100644 --- a/frontend/src/components/HuggingFaceModelPicker.tsx +++ b/frontend/src/components/HuggingFaceModelPicker.tsx @@ -18,6 +18,7 @@ import { Empty, Pagination, Divider, + Segmented, } from "antd"; import Loading from "./Loading"; import { @@ -40,15 +41,18 @@ import { type HFModelInfo, type VRAMEstimate, type HFSearchResult, + type ModelFormatInfo, } from "../services/api"; import { useAppTheme } from "../hooks/useTheme"; +type FormatFilter = "all" | "mlx_ready" | "gguf_ready"; + interface HuggingFaceModelPickerProps { open: boolean; onClose: () => void; onSelect: (modelId: string, modelInfo?: HFModelInfo) => void; gpuMemoryGb?: number; // For compatibility check - backend?: "vllm" | "sglang" | "ollama"; // Reserved for future use + backend?: "vllm" | "sglang" | "ollama" | "mlx" | "llama_cpp"; // Backend type affects filtering } const { Text, Title } = Typography; @@ -82,9 +86,8 @@ export default function HuggingFaceModelPicker({ onClose, onSelect, gpuMemoryGb, - backend: _backend = "vllm", + backend = "vllm", }: HuggingFaceModelPickerProps) { - void _backend; // Reserved for future use const [searchQuery, setSearchQuery] = useState(""); const [searchResults, setSearchResults] = useState([]); const [searching, setSearching] = useState(false); @@ -92,21 +95,45 @@ export default function HuggingFaceModelPicker({ const [modelInfo, setModelInfo] = useState(null); const [vramEstimate, setVramEstimate] = useState(null); const [readme, setReadme] = useState(null); + const [formatInfo, setFormatInfo] = useState(null); const [loadingDetail, 
setLoadingDetail] = useState(false); const [currentPage, setCurrentPage] = useState(1); const [totalResults, setTotalResults] = useState(0); const [showDetails, setShowDetails] = useState(false); // Mobile: toggle between list and details + const [formatFilter, setFormatFilter] = useState("all"); const searchTimeoutRef = useRef | null>(null); const { isDark, colors } = useAppTheme(); const { isMobile } = useResponsive(); const pageSize = 20; - // Load popular models + // Set default filter based on backend + useEffect(() => { + if (backend === "mlx") { + setFormatFilter("mlx_ready"); + } else if (backend === "llama_cpp") { + setFormatFilter("gguf_ready"); + } else { + setFormatFilter("all"); + } + }, [backend]); + + // Load popular models based on format filter const loadPopularModels = useCallback(async () => { setSearching(true); try { - const results = await huggingfaceApi.getPopular(pageSize); + let results: HFSearchResult[]; + + if (formatFilter === "mlx_ready") { + // Search for popular MLX models + results = await huggingfaceApi.searchMLX("llama", pageSize); + } else if (formatFilter === "gguf_ready") { + // Search for popular GGUF models + results = await huggingfaceApi.searchGGUF("llama", pageSize); + } else { + results = await huggingfaceApi.getPopular(pageSize); + } + setSearchResults(results); setTotalResults(results.length); } catch (error) { @@ -115,9 +142,9 @@ export default function HuggingFaceModelPicker({ } finally { setSearching(false); } - }, []); + }, [formatFilter]); - // Search models + // Search models with format filter const searchModels = useCallback( async (query: string) => { if (!query.trim()) { @@ -128,10 +155,19 @@ export default function HuggingFaceModelPicker({ setSearching(true); try { - const results = await huggingfaceApi.search(query, { - limit: pageSize, - filter_task: "text-generation", - }); + let results: HFSearchResult[]; + + if (formatFilter === "mlx_ready") { + results = await huggingfaceApi.searchMLX(query, pageSize); + 
} else if (formatFilter === "gguf_ready") { + results = await huggingfaceApi.searchGGUF(query, pageSize); + } else { + results = await huggingfaceApi.search(query, { + limit: pageSize, + filter_task: "text-generation", + }); + } + setSearchResults(results); // Estimate total (HF API doesn't return total count) setTotalResults( @@ -144,9 +180,20 @@ export default function HuggingFaceModelPicker({ setSearching(false); } }, - [loadPopularModels], + [loadPopularModels, formatFilter], ); + // Re-search when format filter changes + useEffect(() => { + if (open) { + if (searchQuery.trim()) { + searchModels(searchQuery); + } else { + loadPopularModels(); + } + } + }, [formatFilter, open]); + // Debounced search const handleSearchChange = (value: string) => { setSearchQuery(value); @@ -169,10 +216,11 @@ export default function HuggingFaceModelPicker({ setModelInfo(null); setVramEstimate(null); setReadme(null); + setFormatInfo(null); if (isMobile) setShowDetails(true); try { - const [info, estimate, readmeResult] = await Promise.all([ + const [info, estimate, readmeResult, format] = await Promise.all([ huggingfaceApi.getModelInfo(modelId).catch((err) => { console.error("Failed to get model info:", err); return null; @@ -190,10 +238,15 @@ export default function HuggingFaceModelPicker({ console.error("Failed to get README:", err); return { content: null }; }), + huggingfaceApi.getFormatInfo(modelId).catch((err) => { + console.error("Failed to get format info:", err); + return null; + }), ]); setModelInfo(info); setVramEstimate(estimate); + setFormatInfo(format); // Process README content if (readmeResult?.content) { @@ -225,6 +278,7 @@ export default function HuggingFaceModelPicker({ setModelInfo(null); setVramEstimate(null); setReadme(null); + setFormatInfo(null); setShowDetails(false); } }, [open]); @@ -345,7 +399,7 @@ export default function HuggingFaceModelPicker({ }} > {/* Search Input */} -
+
+ {/* Format Filter */} +
+ setFormatFilter(value as FormatFilter)} + block + size="small" + options={[ + { label: "All", value: "all" }, + { label: "MLX Ready", value: "mlx_ready" }, + { label: "GGUF Ready", value: "gguf_ready" }, + ]} + /> +
+ {/* Results List */}
{searching ? ( @@ -676,6 +745,102 @@ export default function HuggingFaceModelPicker({
)} + {/* Format Compatibility */} + {formatInfo && ( +
+
+ Format Compatibility +
+ + + + {formatInfo.is_mlx_ready ? ( + + ) : null}{" "} + MLX{" "} + {formatInfo.is_mlx_ready ? "Ready" : "Needs Conversion"} + + + {formatInfo.is_gguf_ready ? ( + + ) : null}{" "} + GGUF{" "} + {formatInfo.is_gguf_ready + ? "Ready" + : "Needs Conversion"} + + + + {formatInfo.mlx_variants.length > 0 && + !formatInfo.is_mlx_ready && ( +
+ + MLX variants available: + +
+ {formatInfo.mlx_variants + .slice(0, 3) + .map((variant) => ( + handleSelectModel(variant)} + > + {variant} + + ))} +
+
+ )} + + {formatInfo.gguf_files.length > 0 && ( +
+ + GGUF files ({formatInfo.gguf_files.length}): + +
+ {formatInfo.gguf_files.slice(0, 3).map((file) => ( + + {file} + + ))} + {formatInfo.gguf_files.length > 3 && ( + +{formatInfo.gguf_files.length - 3} more + )} +
+
+ )} + + {!formatInfo.is_mlx_ready && !formatInfo.is_gguf_ready && ( +
+ + This model will be automatically converted when + deployed with MLX or llama.cpp backend. + +
+ )} +
+ )} + {/* README Section */} diff --git a/frontend/src/components/ModelCompatibilityCheck.tsx b/frontend/src/components/ModelCompatibilityCheck.tsx index cd8edf8..67a7920 100644 --- a/frontend/src/components/ModelCompatibilityCheck.tsx +++ b/frontend/src/components/ModelCompatibilityCheck.tsx @@ -37,7 +37,7 @@ interface ModelCompatibilityCheckProps { precision?: string; // fp32, fp16, bf16, int8, int4 gpuMemoryGb?: number; // Available GPU memory for compatibility check contextLength?: number; - backend?: "vllm" | "sglang" | "ollama"; + backend?: "vllm" | "sglang" | "ollama" | "mlx" | "llama_cpp"; } const { Text, Title } = Typography; diff --git a/frontend/src/components/ModelFormatCompatibility.tsx b/frontend/src/components/ModelFormatCompatibility.tsx new file mode 100644 index 0000000..d8ad540 --- /dev/null +++ b/frontend/src/components/ModelFormatCompatibility.tsx @@ -0,0 +1,218 @@ +/** + * ModelFormatCompatibility Component + * + * Displays model format compatibility information and conversion warnings + * for MLX and GGUF formats used by Mac native backends. 
+ */ +import { useState, useEffect } from "react"; +import { Tag, Space, Alert, Tooltip, Typography } from "antd"; +import { + CheckCircleOutlined, + WarningOutlined, + SyncOutlined, + InfoCircleOutlined, +} from "@ant-design/icons"; +import { huggingfaceApi, type ModelFormatInfo } from "../services/api"; + +const { Text } = Typography; + +interface ModelFormatCompatibilityProps { + modelId: string; + backend?: "mlx" | "llama_cpp" | "vllm" | "sglang" | "ollama"; + showDetails?: boolean; + compact?: boolean; +} + +export default function ModelFormatCompatibility({ + modelId, + backend, + showDetails = true, + compact = false, +}: ModelFormatCompatibilityProps) { + const [formatInfo, setFormatInfo] = useState(null); + const [loading, setLoading] = useState(false); + + useEffect(() => { + const fetchFormatInfo = async () => { + if (!modelId) return; + + setLoading(true); + try { + const info = await huggingfaceApi.getFormatInfo(modelId); + setFormatInfo(info); + } catch (error) { + console.error("Failed to fetch format info:", error); + setFormatInfo(null); + } finally { + setLoading(false); + } + }; + + fetchFormatInfo(); + }, [modelId]); + + if (loading) { + return ( + } color="processing"> + Checking format... 
+ + ); + } + + if (!formatInfo) { + return null; + } + + // Determine if format is compatible with selected backend + const isCompatible = (() => { + if (!backend) return true; + if (backend === "mlx") return formatInfo.is_mlx_ready; + if (backend === "llama_cpp") return formatInfo.is_gguf_ready; + return true; // vllm, sglang, ollama don't need format conversion + })(); + + const needsConversion = backend === "mlx" || backend === "llama_cpp"; + + if (compact) { + return ( + + {formatInfo.is_mlx_ready && ( + + + MLX + + + )} + {formatInfo.is_gguf_ready && ( + + + GGUF + + + )} + {needsConversion && !isCompatible && ( + + + Convert + + + )} + + ); + } + + // Show warning if conversion is needed + if (needsConversion && !isCompatible) { + const conversionTarget = backend === "mlx" ? "MLX" : "GGUF"; + + return ( + } + message={`Model will be converted to ${conversionTarget} format`} + description={ + showDetails ? ( +
+ + This HuggingFace model is not in {conversionTarget} format. It + will be automatically converted on the worker before deployment. + This may take several minutes depending on model size. + + + {backend === "mlx" && formatInfo.mlx_variants.length > 0 && ( +
+ + Tip: Consider using an existing MLX model instead: + +
+ {formatInfo.mlx_variants.slice(0, 3).map((variant) => ( + + {variant} + + ))} +
+
+ )} + + {backend === "llama_cpp" && + formatInfo.gguf_files.length === 0 && ( +
+ + Tip: Search for "{modelId.split("/").pop()}-GGUF" to find + pre-converted models. + +
+ )} +
+ ) : null + } + style={{ marginBottom: 16 }} + /> + ); + } + + // Show success if format is ready + if (needsConversion && isCompatible) { + return ( + } + message={`Model is ${backend === "mlx" ? "MLX" : "GGUF"}-ready`} + description={ + showDetails ? ( + + {backend === "mlx" + ? "This model is from mlx-community and optimized for Apple Silicon." + : `This model has ${formatInfo.gguf_files.length} GGUF file(s) available.`} + + ) : null + } + style={{ marginBottom: 16 }} + /> + ); + } + + // General format info display + if (showDetails) { + return ( + } + message="Model Format Information" + description={ + +
+ + {formatInfo.is_mlx_ready ? : null} MLX{" "} + {formatInfo.is_mlx_ready ? "Ready" : "Needs Conversion"} + + + {formatInfo.is_gguf_ready ? : null} GGUF{" "} + {formatInfo.is_gguf_ready ? "Ready" : "Needs Conversion"} + +
+ {formatInfo.gguf_files.length > 0 && ( + + {formatInfo.gguf_files.length} GGUF file(s) available + + )} + {formatInfo.mlx_variants.length > 0 && ( + + {formatInfo.mlx_variants.length} MLX variant(s) found + + )} +
+ } + style={{ marginBottom: 16 }} + /> + ); + } + + return null; +} diff --git a/frontend/src/components/logos/index.tsx b/frontend/src/components/logos/index.tsx index ff49e6b..029b8b7 100644 --- a/frontend/src/components/logos/index.tsx +++ b/frontend/src/components/logos/index.tsx @@ -13,6 +13,8 @@ import ollamaLogoDark from "../../assets/ollama-dark.png"; import ollamaLogoLight from "../../assets/ollama-light.png"; import sglangLogo from "../../assets/sglang.png"; import huggingfaceLogo from "../../assets/huggingface-2.svg"; +import mlxLogo from "../../assets/mlx-logo.png"; +import mlxLogoDark from "../../assets/mlx-logo-dark.png"; // ============================================================================= // Props Types @@ -74,54 +76,47 @@ export function HuggingFaceLogo({ /** * MLX Logo - Apple's ML framework for Apple Silicon */ -export function MLXLogo({ height = 16, style }: Omit) { - // Use Apple-style gradient colors - const gradientId = `mlx-gradient-${Math.random().toString(36).substr(2, 9)}`; +export function MLXLogo({ height = 16, isDark = false, style }: LogoProps) { return ( - - - - - - - - - - MLX - - + MLX ); } /** - * Llama.cpp Logo + * Llama.cpp Logo - High-performance LLM inference + * Uses official branding colors: white text with orange C++ */ export function LlamaCppLogo({ height = 16, isDark = false, style, }: LogoProps) { - const textColor = isDark ? "#ffffff" : "#333333"; + const textColor = isDark ? 
"#ffffff" : "#1b1f20"; return ( - + - llama.cpp + llama + + + .cpp ); @@ -188,7 +183,7 @@ export function getBackendConfig( mlx: { label: "MLX", color: tagColor, - icon: , + icon: , }, llama_cpp: { label: "llama.cpp", diff --git a/frontend/src/pages/Deployments.tsx b/frontend/src/pages/Deployments.tsx index 922e529..e77be8f 100644 --- a/frontend/src/pages/Deployments.tsx +++ b/frontend/src/pages/Deployments.tsx @@ -43,6 +43,7 @@ import { useResponsive } from "../hooks"; import { useAuth } from "../contexts/AuthContext"; import DeploymentAdvancedForm from "../components/DeploymentAdvancedForm"; import ModelCompatibilityCheck from "../components/ModelCompatibilityCheck"; +import ModelFormatCompatibility from "../components/ModelFormatCompatibility"; import backendVersionsData from "../constants/backendVersions.json"; import dayjs from "dayjs"; import utc from "dayjs/plugin/utc"; @@ -86,7 +87,7 @@ export default function Deployments() { const [selectedWorkerId, setSelectedWorkerId] = useState(null); const [selectedGpuIndexes, setSelectedGpuIndexes] = useState([]); const [selectedBackend, setSelectedBackend] = useState< - "vllm" | "sglang" | "ollama" + "vllm" | "sglang" | "ollama" | "mlx" | "llama_cpp" >("vllm"); const [editingDeployment, setEditingDeployment] = useState( null, @@ -101,6 +102,11 @@ export default function Deployments() { // Get the selected worker's GPU info const selectedWorker = workers.find((w) => w.id === selectedWorkerId); + // Helper functions to check model format + const isMLXReady = (modelId: string) => modelId.startsWith("mlx-community/"); + const isGGUFReady = (modelId: string) => + modelId.toLowerCase().includes("gguf"); + // Determine available backends based on model source and worker capabilities const availableBackends = (() => { // Start with model-based restrictions @@ -108,29 +114,19 @@ export default function Deployments() { return ["ollama"] as const; } - // If no worker selected, show all HuggingFace-compatible backends + // If no 
worker selected, show all possible backends for HuggingFace models if (!selectedWorker) { - return ["vllm", "sglang"] as const; + return ["vllm", "sglang", "mlx", "llama_cpp"] as const; } - // macOS workers only support Ollama (vLLM/SGLang require NVIDIA GPU) + // macOS workers support vLLM-Metal, MLX, and llama.cpp for HuggingFace models + // Ollama is NOT shown for HF models since Ollama can't use HF models directly if (selectedWorker.os_type === "darwin") { - return ["ollama"] as const; - } - - // Use worker's available_backends if provided - if ( - selectedWorker.available_backends && - selectedWorker.available_backends.length > 0 - ) { - // Filter for HuggingFace-compatible backends from worker's list - const hfBackends = selectedWorker.available_backends.filter((b) => - ["vllm", "sglang", "ollama"].includes(b), - ); - return hfBackends.length > 0 ? hfBackends : (["vllm", "sglang"] as const); + return ["vllm", "mlx", "llama_cpp"] as const; } - // Default fallback for Linux workers + // Linux workers: only vLLM and SGLang for HuggingFace models + // Ollama can't use HF models directly, so don't show it return ["vllm", "sglang"] as const; })(); const workerGpus = selectedWorker?.gpu_info || []; @@ -829,16 +825,54 @@ export default function Deployments() { ); const sourceLabel = m.source === "ollama" ? "Ollama" : "HuggingFace"; + const mlxReady = + m.source !== "ollama" && isMLXReady(m.model_id); + const ggufReady = + m.source !== "ollama" && isGGUFReady(m.model_id); return { label: ( - - {sourceIcon} + + + {sourceIcon} + {sourceLabel} {m.name} + {mlxReady && ( + + MLX + + )} + {ggufReady && ( + + GGUF + + )} ), value: m.id, @@ -847,40 +881,6 @@ export default function Deployments() { /> - - setSelectedBackend(value)} + options={availableBackends.map((b) => { + const config = BACKEND_CONFIG[b]; + // Show "vLLM-Metal" for vllm on Mac workers + const label = + b === "vllm" && selectedWorker?.os_type === "darwin" + ? 
"vLLM-Metal" + : config.label; + return { + label: ( + + + {config.icon} + + {label} + + ), + value: b, + }; + })} + /> + + + {/* macOS Ollama Warning - only show when Ollama backend is selected */} {selectedWorker && selectedWorker.os_type === "darwin" && + selectedBackend === "ollama" && !selectedWorker.capabilities?.ollama && ( )} - {/* macOS Ollama Not Running Warning */} + {/* macOS Ollama Not Running Warning - only show when Ollama backend is selected */} {selectedWorker && selectedWorker.os_type === "darwin" && + selectedBackend === "ollama" && selectedWorker.capabilities?.ollama && !selectedWorker.capabilities?.ollama_running && ( )} + {/* macOS Backend Info - show auto-install message */} + {selectedWorker && + selectedWorker.os_type === "darwin" && + selectedBackend === "vllm" && ( + + Uses Apple Silicon GPU acceleration. Will be automatically + installed on first deployment. + + } + type="info" + showIcon + style={{ marginBottom: 16 }} + /> + )} + {selectedWorker && + selectedWorker.os_type === "darwin" && + selectedBackend === "mlx" && ( + + Native Apple Silicon ML framework. Will be automatically + installed on first deployment. + + } + type="info" + showIcon + style={{ marginBottom: 16 }} + /> + )} + {selectedWorker && + selectedWorker.os_type === "darwin" && + selectedBackend === "llama_cpp" && ( + + High-performance inference with Metal acceleration. Will be + automatically installed via Homebrew on first deployment. + + } + type="info" + showIcon + style={{ marginBottom: 16 }} + /> + )} + {/* macOS Info */} {selectedWorker && selectedWorker.os_type === "darwin" && selectedWorker.capabilities?.ollama_running && ( +

+ This worker supports native Apple Silicon backends: +

+
    +
  • + Ollama - Easiest, pull and run models + directly +
  • +
  • + MLX - Apple's ML framework, optimized + for Apple Silicon +
  • +
  • + llama.cpp - Cross-platform with Metal + acceleration +
  • +
+

+ For MLX/llama.cpp, HuggingFace models will be + automatically converted if needed. +

+
+ } type="info" showIcon style={{ marginBottom: 16 }} @@ -1054,63 +1206,78 @@ export default function Deployments() { {/* Model Compatibility Check - Show when model is selected for vLLM/SGLang */} - {selectedModel && selectedModel.source !== "ollama" && ( - - )} + {selectedModel && + selectedModel.source !== "ollama" && + !["mlx", "llama_cpp"].includes(selectedBackend) && ( + + )} - {/* Version Override - Show when model is selected */} - {selectedModelId && ( - - ; + } + > + )[selectedBackend]?.versions || [] + ).map((v) => ({ + label: ( + + {v.version} + {v.recommended && ( + + Recommended + + )} + + ), + value: v.image, + }))} + /> + + )} + + {/* Advanced Parameters - Show when model is selected (not for MLX/llama.cpp) */} + {selectedModelId && + !["mlx", "llama_cpp"].includes(selectedBackend) && ( + + )} diff --git a/worker/agent.py b/worker/agent.py index c493164..2b9b5b7 100644 --- a/worker/agent.py +++ b/worker/agent.py @@ -25,12 +25,14 @@ from docker_ops import ContainerManager, DockerRunner, GPUDetector, ImageManager, SystemDetector from routes import ( containers_router, + converter_router, deployment_router, images_router, native_router, storage_router, ) from routes.containers import set_agent as set_containers_agent + from routes.converter import set_agent as set_converter_agent from routes.deployment import set_agent as set_deployment_agent from routes.images import set_agent as set_images_agent from routes.native import set_agent as set_native_agent @@ -46,12 +48,14 @@ ) from worker.routes import ( containers_router, + converter_router, deployment_router, images_router, native_router, storage_router, ) from worker.routes.containers import set_agent as set_containers_agent + from worker.routes.converter import set_agent as set_converter_agent from worker.routes.deployment import set_agent as set_deployment_agent from worker.routes.images import set_agent as set_images_agent from worker.routes.native import set_agent as set_native_agent @@ -127,6 
+131,12 @@ def _is_local_worker(self) -> bool: async def register(self) -> bool: """Register this worker with the server.""" try: + # For Mac workers, ensure Ollama is running with external access + if self.os_type == "darwin" and self.native_manager: + if self.capabilities.get("ollama"): + logger.info("Ensuring Ollama service is running with external access...") + await self.native_manager.ensure_ollama_running() + gpu_info = self.gpu_detector.detect() system_info = self.system_detector.detect() @@ -296,6 +306,7 @@ def _set_agent_references(worker_agent: WorkerAgent): set_containers_agent(worker_agent) set_storage_agent(worker_agent) set_native_agent(worker_agent) + set_converter_agent(worker_agent) @asynccontextmanager @@ -330,6 +341,7 @@ async def lifespan(app: FastAPI): app.include_router(containers_router) app.include_router(storage_router) app.include_router(native_router) +app.include_router(converter_router) @app.get("/health") diff --git a/worker/docker_ops/gpu.py b/worker/docker_ops/gpu.py index f37cc4e..2f529fb 100644 --- a/worker/docker_ops/gpu.py +++ b/worker/docker_ops/gpu.py @@ -1,18 +1,24 @@ """GPU detection for LMStack Worker. -Provides NVIDIA GPU detection using pynvml (nvidia-ml-py). +Provides GPU detection for NVIDIA (using pynvml) and Apple Silicon. """ +import json import logging +import platform +import subprocess logger = logging.getLogger(__name__) class GPUDetector: - """Detect and report GPU information using pynvml (nvidia-ml-py).""" + """Detect and report GPU information. + + Supports NVIDIA GPUs (via pynvml) and Apple Silicon (via system_profiler). + """ def detect(self) -> list[dict]: - """Detect available GPUs with temperature using pynvml. + """Detect available GPUs. 
Returns: List of GPU information dictionaries with: @@ -24,6 +30,89 @@ def detect(self) -> list[dict]: - utilization: GPU utilization percentage - temperature: GPU temperature in Celsius """ + # Check platform + if platform.system() == "Darwin": + return self._detect_apple_silicon() + else: + return self._detect_nvidia() + + def _detect_apple_silicon(self) -> list[dict]: + """Detect Apple Silicon GPU information.""" + try: + # Check if this is Apple Silicon + machine = platform.machine() + if machine != "arm64": + # Intel Mac - no GPU info to report + return [] + + # Get GPU info from system_profiler + result = subprocess.run( + ["system_profiler", "SPDisplaysDataType", "-json"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode != 0: + logger.warning("system_profiler failed") + return [] + + data = json.loads(result.stdout) + displays = data.get("SPDisplaysDataType", []) + if not displays: + return [] + + gpus = [] + for idx, display in enumerate(displays): + gpu_name = display.get("sppci_model", "Apple Silicon GPU") + + # Get unified memory info (Apple Silicon uses unified memory) + # Use psutil to get system memory as reference + try: + import psutil + + mem = psutil.virtual_memory() + # Apple Silicon GPU can use up to ~75% of unified memory for GPU tasks + # Report system memory info as reference + memory_total = mem.total + memory_used = mem.used + memory_free = mem.available + except ImportError: + # Fallback: get memory from sysctl + try: + mem_result = subprocess.run( + ["sysctl", "-n", "hw.memsize"], + capture_output=True, + text=True, + timeout=5, + ) + memory_total = int(mem_result.stdout.strip()) + memory_used = 0 + memory_free = memory_total + except Exception: + memory_total = 0 + memory_used = 0 + memory_free = 0 + + gpus.append( + { + "index": idx, + "name": gpu_name, + "memory_total": memory_total, + "memory_used": memory_used, + "memory_free": memory_free, + "utilization": 0, # Would need powermetrics (requires sudo) + 
"temperature": 0, # Would need powermetrics (requires sudo) + } + ) + + return gpus + + except Exception as e: + logger.warning(f"Apple Silicon GPU detection failed: {e}") + return [] + + def _detect_nvidia(self) -> list[dict]: + """Detect NVIDIA GPUs using pynvml.""" try: import pynvml @@ -72,8 +161,8 @@ def detect(self) -> list[dict]: return gpus except ImportError: - logger.error("pynvml (nvidia-ml-py) not installed") + logger.debug("pynvml (nvidia-ml-py) not installed") return [] except Exception as e: - logger.warning(f"GPU detection failed: {e}") + logger.warning(f"NVIDIA GPU detection failed: {e}") return [] diff --git a/worker/native_ops/__init__.py b/worker/native_ops/__init__.py index 2acb027..f796cc9 100644 --- a/worker/native_ops/__init__.py +++ b/worker/native_ops/__init__.py @@ -1,7 +1,8 @@ """Native process operations for Mac workers without Docker.""" +from .converter import ModelConverter from .mlx import MLXManager from .ollama import OllamaManager from .process_manager import NativeProcessManager -__all__ = ["NativeProcessManager", "OllamaManager", "MLXManager"] +__all__ = ["NativeProcessManager", "OllamaManager", "MLXManager", "ModelConverter"] diff --git a/worker/native_ops/converter.py b/worker/native_ops/converter.py new file mode 100644 index 0000000..a4db5af --- /dev/null +++ b/worker/native_ops/converter.py @@ -0,0 +1,545 @@ +"""Model format converter for MLX and GGUF formats. + +This module handles converting HuggingFace models to formats +compatible with MLX and llama.cpp backends. 
+""" + +import asyncio +import logging +import shutil +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + +# Default cache directory +DEFAULT_CACHE_DIR = Path.home() / ".lmstack" / "converted_models" + + +@dataclass +class ConversionTask: + """Represents an ongoing conversion task.""" + + task_id: str + hf_model_id: str + target_format: str # "mlx" or "gguf" + status: str # "pending", "running", "completed", "failed" + progress: float # 0.0 to 1.0 + message: str + output_path: Optional[str] = None + error: Optional[str] = None + started_at: Optional[datetime] = None + completed_at: Optional[datetime] = None + + +class ModelConverter: + """Model format converter for MLX and GGUF formats. + + Handles: + - Converting HuggingFace models to MLX format + - Converting HuggingFace models to GGUF format + - Caching converted models + - Checking if models are already in compatible formats + """ + + def __init__(self, cache_dir: Optional[Path] = None): + self.cache_dir = cache_dir or DEFAULT_CACHE_DIR + self.cache_dir.mkdir(parents=True, exist_ok=True) + self._tasks: dict[str, ConversionTask] = {} + + def get_mlx_cache_path(self, hf_model_id: str) -> Path: + """Get the cache path for MLX converted model.""" + safe_name = hf_model_id.replace("/", "--") + return self.cache_dir / "mlx" / safe_name + + def get_gguf_cache_path(self, hf_model_id: str, quant_type: str = "q8_0") -> Path: + """Get the cache path for GGUF converted model.""" + safe_name = hf_model_id.replace("/", "--") + return self.cache_dir / "gguf" / f"{safe_name}-{quant_type}.gguf" + + def get_cached_model(self, hf_model_id: str, format: str) -> Optional[str]: + """Get the path to a cached converted model if it exists. 
+ + Args: + hf_model_id: HuggingFace model ID + format: Target format ("mlx" or "gguf") + + Returns: + Path to cached model if exists, None otherwise + """ + if format == "mlx": + cache_path = self.get_mlx_cache_path(hf_model_id) + # MLX models are directories with config.json and model files + if cache_path.exists() and (cache_path / "config.json").exists(): + return str(cache_path) + elif format == "gguf": + # Try common quantization types + for quant in ["q8_0", "q4_k_m", "q4_0", "f16"]: + cache_path = self.get_gguf_cache_path(hf_model_id, quant) + if cache_path.exists(): + return str(cache_path) + return None + + @staticmethod + def is_mlx_ready(model_id: str) -> bool: + """Check if model is from mlx-community (already MLX format). + + Args: + model_id: HuggingFace model ID + + Returns: + True if model is from mlx-community organization + """ + return model_id.startswith("mlx-community/") + + @staticmethod + def is_gguf_ready(model_id: str, files: Optional[list[str]] = None) -> bool: + """Check if model has GGUF files available. + + Args: + model_id: HuggingFace model ID + files: List of file names in the model repository + + Returns: + True if model has .gguf files + """ + if files: + return any(f.endswith(".gguf") for f in files) + # Common patterns for GGUF models + return any(pattern in model_id.lower() for pattern in ["gguf", "-gguf", "_gguf"]) + + async def download_gguf_model(self, hf_model_id: str) -> str: + """Download a GGUF model from HuggingFace. + + Uses huggingface_hub to download the .gguf file(s) from a repo. + + Args: + hf_model_id: HuggingFace model ID (e.g., "hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF") + + Returns: + Path to downloaded GGUF file + """ + try: + from huggingface_hub import hf_hub_download, list_repo_files + except ImportError: + raise RuntimeError( + "huggingface_hub is required. 
Install with: pip install huggingface_hub" + ) + + # Create cache directory for downloaded models + cache_dir = self.cache_dir / "gguf" + cache_dir.mkdir(parents=True, exist_ok=True) + + # List files in the repo to find .gguf files + try: + files = list_repo_files(hf_model_id) + gguf_files = [f for f in files if f.endswith(".gguf")] + + if not gguf_files: + raise RuntimeError(f"No .gguf files found in {hf_model_id}") + + # Pick the best file (prefer Q8_0 or largest quantization) + gguf_file = gguf_files[0] + for f in gguf_files: + # Prefer Q8_0 quantization + if "q8_0" in f.lower() or "Q8_0" in f: + gguf_file = f + break + + logger.info(f"Downloading {gguf_file} from {hf_model_id}...") + + # Download the file + local_path = hf_hub_download( + repo_id=hf_model_id, + filename=gguf_file, + cache_dir=str(cache_dir), + local_dir=str(cache_dir / hf_model_id.replace("/", "--")), + local_dir_use_symlinks=False, + ) + + logger.info(f"Downloaded GGUF model to {local_path}") + return local_path + + except Exception as e: + logger.error(f"Failed to download GGUF model: {e}") + raise RuntimeError(f"Failed to download GGUF model from {hf_model_id}: {e}") + + @staticmethod + def find_mlx_variant(hf_model_id: str) -> Optional[str]: + """Find MLX variant of a HuggingFace model. + + Searches mlx-community for a converted version of the model. 
+ + Args: + hf_model_id: Original HuggingFace model ID + + Returns: + MLX model ID if found, None otherwise + """ + # Try common naming patterns + model_name = hf_model_id.split("/")[-1] + patterns = [ + f"mlx-community/{model_name}", + f"mlx-community/{model_name}-mlx", + f"mlx-community/{model_name}-4bit", + f"mlx-community/{model_name}-8bit", + ] + return patterns[0] if patterns else None + + def get_task(self, task_id: str) -> Optional[ConversionTask]: + """Get conversion task by ID.""" + return self._tasks.get(task_id) + + def list_tasks(self) -> list[ConversionTask]: + """List all conversion tasks.""" + return list(self._tasks.values()) + + async def convert_to_mlx( + self, + hf_model_id: str, + quantize: bool = True, + bits: int = 4, + ) -> str: + """Convert a HuggingFace model to MLX format. + + Uses mlx_lm.convert to convert the model. + + Args: + hf_model_id: HuggingFace model ID + quantize: Whether to quantize the model + bits: Quantization bits (4 or 8) + + Returns: + Path to converted model + + Raises: + RuntimeError: If conversion fails + """ + # Check if already cached + cached = self.get_cached_model(hf_model_id, "mlx") + if cached: + logger.info(f"Using cached MLX model: {cached}") + return cached + + # Check if already MLX format + if self.is_mlx_ready(hf_model_id): + logger.info(f"Model {hf_model_id} is already MLX format") + return hf_model_id + + output_path = self.get_mlx_cache_path(hf_model_id) + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Create task + task_id = f"mlx-{hf_model_id.replace('/', '--')}" + task = ConversionTask( + task_id=task_id, + hf_model_id=hf_model_id, + target_format="mlx", + status="running", + progress=0.0, + message="Starting MLX conversion...", + started_at=datetime.now(), + ) + self._tasks[task_id] = task + + try: + # Check if mlx_lm is available + mlx_convert = shutil.which("mlx_lm.convert") + if not mlx_convert: + # Try using python module + cmd = [ + "python3", + "-m", + "mlx_lm.convert", + 
"--hf-path", + hf_model_id, + "--mlx-path", + str(output_path), + ] + else: + cmd = [ + mlx_convert, + "--hf-path", + hf_model_id, + "--mlx-path", + str(output_path), + ] + + if quantize: + cmd.extend(["-q", "--q-bits", str(bits)]) + + task.progress = 0.1 + task.message = f"Converting {hf_model_id} to MLX format..." + logger.info(f"Running: {' '.join(cmd)}") + + # Run conversion + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + + task.progress = 0.5 + stdout, _ = await process.communicate() + + if process.returncode != 0: + error_msg = stdout.decode() if stdout else "Unknown error" + raise RuntimeError(f"MLX conversion failed: {error_msg}") + + task.progress = 1.0 + task.status = "completed" + task.message = "Conversion completed" + task.output_path = str(output_path) + task.completed_at = datetime.now() + + logger.info(f"MLX conversion completed: {output_path}") + return str(output_path) + + except Exception as e: + task.status = "failed" + task.error = str(e) + task.message = f"Conversion failed: {e}" + logger.error(f"MLX conversion failed for {hf_model_id}: {e}") + raise + + async def convert_to_gguf( + self, + hf_model_id: str, + quant_type: str = "q8_0", + ) -> str: + """Convert a HuggingFace model to GGUF format. + + Uses llama.cpp's convert scripts to create GGUF. + + Args: + hf_model_id: HuggingFace model ID + quant_type: Quantization type (q4_0, q4_k_m, q8_0, f16, etc.) 
+ + Returns: + Path to converted model + + Raises: + RuntimeError: If conversion fails + """ + # Check if already cached + cached = self.get_cached_model(hf_model_id, "gguf") + if cached: + logger.info(f"Using cached GGUF model: {cached}") + return cached + + output_path = self.get_gguf_cache_path(hf_model_id, quant_type) + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Create task + task_id = f"gguf-{hf_model_id.replace('/', '--')}" + task = ConversionTask( + task_id=task_id, + hf_model_id=hf_model_id, + target_format="gguf", + status="running", + progress=0.0, + message="Starting GGUF conversion...", + started_at=datetime.now(), + ) + self._tasks[task_id] = task + + try: + # First, download the model using huggingface-cli + task.progress = 0.1 + task.message = f"Downloading {hf_model_id}..." + + hf_cache_dir = Path.home() / ".cache" / "huggingface" / "hub" + model_dir = hf_cache_dir / f"models--{hf_model_id.replace('/', '--')}" + + if not model_dir.exists(): + download_cmd = [ + "huggingface-cli", + "download", + hf_model_id, + "--local-dir", + str(self.cache_dir / "downloads" / hf_model_id.replace("/", "--")), + ] + process = await asyncio.create_subprocess_exec( + *download_cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + stdout, _ = await process.communicate() + if process.returncode != 0: + raise RuntimeError( + f"Download failed: {stdout.decode() if stdout else 'Unknown error'}" + ) + model_dir = self.cache_dir / "downloads" / hf_model_id.replace("/", "--") + + task.progress = 0.4 + task.message = "Converting to GGUF..." 
+ + # Find llama.cpp convert script + # Common locations + convert_script = None + for path in [ + shutil.which("convert_hf_to_gguf.py"), + Path.home() / "llama.cpp" / "convert_hf_to_gguf.py", + Path("/usr/local/share/llama.cpp/convert_hf_to_gguf.py"), + ]: + if path and Path(path).exists(): + convert_script = str(path) + break + + if not convert_script: + # Try using llama-quantize directly if model is already GGUF + raise RuntimeError( + "llama.cpp convert script not found. " + "Please install llama.cpp: brew install llama.cpp" + ) + + # Convert to GGUF + temp_gguf = output_path.parent / f"{output_path.stem}_temp.gguf" + convert_cmd = [ + "python3", + convert_script, + str(model_dir), + "--outfile", + str(temp_gguf), + "--outtype", + "f16", + ] + + process = await asyncio.create_subprocess_exec( + *convert_cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + stdout, _ = await process.communicate() + + if process.returncode != 0: + raise RuntimeError( + f"GGUF conversion failed: {stdout.decode() if stdout else 'Unknown error'}" + ) + + task.progress = 0.7 + task.message = f"Quantizing to {quant_type}..." + + # Quantize if needed + if quant_type != "f16": + llama_quantize = shutil.which("llama-quantize") + if not llama_quantize: + raise RuntimeError("llama-quantize not found. 
Please install llama.cpp") + + quant_cmd = [ + llama_quantize, + str(temp_gguf), + str(output_path), + quant_type.upper(), + ] + process = await asyncio.create_subprocess_exec( + *quant_cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + await process.communicate() + + # Remove temp file + temp_gguf.unlink(missing_ok=True) + else: + # Just rename + temp_gguf.rename(output_path) + + task.progress = 1.0 + task.status = "completed" + task.message = "Conversion completed" + task.output_path = str(output_path) + task.completed_at = datetime.now() + + logger.info(f"GGUF conversion completed: {output_path}") + return str(output_path) + + except Exception as e: + task.status = "failed" + task.error = str(e) + task.message = f"Conversion failed: {e}" + logger.error(f"GGUF conversion failed for {hf_model_id}: {e}") + raise + + def clear_cache(self, hf_model_id: Optional[str] = None, format: Optional[str] = None): + """Clear converted model cache. + + Args: + hf_model_id: Clear cache for specific model (None = all) + format: Clear cache for specific format (None = all) + """ + if hf_model_id: + if format in (None, "mlx"): + cache_path = self.get_mlx_cache_path(hf_model_id) + if cache_path.exists(): + shutil.rmtree(cache_path) + logger.info(f"Cleared MLX cache: {cache_path}") + + if format in (None, "gguf"): + for quant in ["q8_0", "q4_k_m", "q4_0", "f16"]: + cache_path = self.get_gguf_cache_path(hf_model_id, quant) + if cache_path.exists(): + cache_path.unlink() + logger.info(f"Cleared GGUF cache: {cache_path}") + else: + # Clear all + if format in (None, "mlx"): + mlx_dir = self.cache_dir / "mlx" + if mlx_dir.exists(): + shutil.rmtree(mlx_dir) + logger.info("Cleared all MLX cache") + + if format in (None, "gguf"): + gguf_dir = self.cache_dir / "gguf" + if gguf_dir.exists(): + shutil.rmtree(gguf_dir) + logger.info("Cleared all GGUF cache") + + def get_cache_info(self) -> dict: + """Get information about cached models. 
+ + Returns: + Dictionary with cache statistics + """ + info = { + "cache_dir": str(self.cache_dir), + "mlx_models": [], + "gguf_models": [], + "total_size_bytes": 0, + } + + mlx_dir = self.cache_dir / "mlx" + if mlx_dir.exists(): + for model_dir in mlx_dir.iterdir(): + if model_dir.is_dir(): + size = sum(f.stat().st_size for f in model_dir.rglob("*") if f.is_file()) + info["mlx_models"].append( + { + "model_id": model_dir.name.replace("--", "/"), + "path": str(model_dir), + "size_bytes": size, + } + ) + info["total_size_bytes"] += size + + gguf_dir = self.cache_dir / "gguf" + if gguf_dir.exists(): + for gguf_file in gguf_dir.glob("*.gguf"): + size = gguf_file.stat().st_size + # Parse model name from filename (name-quant.gguf) + parts = gguf_file.stem.rsplit("-", 1) + model_id = parts[0].replace("--", "/") if parts else gguf_file.stem + quant = parts[1] if len(parts) > 1 else "unknown" + info["gguf_models"].append( + { + "model_id": model_id, + "quant_type": quant, + "path": str(gguf_file), + "size_bytes": size, + } + ) + info["total_size_bytes"] += size + + return info diff --git a/worker/native_ops/process_manager.py b/worker/native_ops/process_manager.py index 79a57c6..7997378 100644 --- a/worker/native_ops/process_manager.py +++ b/worker/native_ops/process_manager.py @@ -1,17 +1,24 @@ """Native process manager for Mac workers. Manages LLM inference processes without Docker for macOS with Apple Silicon. -Supports Ollama, MLX-LM, and llama.cpp backends. +Supports Ollama, MLX-LM, llama.cpp, and vLLM-Metal backends. 
""" +import asyncio import logging import os +import shutil import subprocess from dataclasses import dataclass +from pathlib import Path from typing import Optional +from .converter import ModelConverter + logger = logging.getLogger(__name__) +OLLAMA_DEFAULT_PORT = 11434 + @dataclass class NativeProcess: @@ -19,10 +26,11 @@ class NativeProcess: process_id: str # Unique identifier (deployment_id based) pid: int # OS process ID - backend: str # ollama, mlx, llama_cpp + backend: str # ollama, mlx, llama_cpp, vllm model_id: str port: int process: Optional[subprocess.Popen] = None + log_file: Optional[Path] = None # Path to log file for this process class NativeProcessManager: @@ -30,6 +38,88 @@ class NativeProcessManager: def __init__(self): self._processes: dict[str, NativeProcess] = {} + self._ollama_process: Optional[subprocess.Popen] = None + self._converter = ModelConverter() + self._log_dir = Path.home() / ".lmstack" / "logs" + self._log_dir.mkdir(parents=True, exist_ok=True) + + def _write_log(self, process_id: str, message: str) -> None: + """Write a message to a process's log file.""" + log_file = self._log_dir / f"{process_id}.log" + with open(log_file, "a") as f: + from datetime import datetime + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + f.write(f"[{timestamp}] {message}\n") + f.flush() + + async def ensure_ollama_running( + self, host: str = "0.0.0.0", port: int = OLLAMA_DEFAULT_PORT + ) -> bool: + """Ensure Ollama service is running and accessible. + + If Ollama is not running, starts it with OLLAMA_HOST set to allow external connections. 
+ + Args: + host: Host to bind to (default 0.0.0.0 for external access) + port: Port to bind to (default 11434) + + Returns: + True if Ollama is running and accessible + """ + import httpx + + # Check if Ollama is already running + try: + async with httpx.AsyncClient(timeout=2.0) as client: + response = await client.get(f"http://localhost:{port}/api/tags") + if response.status_code == 200: + logger.info("Ollama service is already running") + return True + except Exception: + pass + + # Ollama not running, try to start it + ollama_path = shutil.which("ollama") + if not ollama_path: + logger.warning("Ollama is not installed") + return False + + logger.info(f"Starting Ollama service on {host}:{port}") + + # Set environment for Ollama to bind to all interfaces + env = os.environ.copy() + env["OLLAMA_HOST"] = f"{host}:{port}" + + try: + # Start ollama serve in background + self._ollama_process = subprocess.Popen( + [ollama_path, "serve"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, + start_new_session=True, + ) + logger.info(f"Started Ollama service (PID {self._ollama_process.pid})") + + # Wait for Ollama to be ready + for _ in range(30): # Wait up to 30 seconds + await asyncio.sleep(1) + try: + async with httpx.AsyncClient(timeout=2.0) as client: + response = await client.get(f"http://localhost:{port}/api/tags") + if response.status_code == 200: + logger.info("Ollama service is ready") + return True + except Exception: + pass + + logger.error("Ollama service failed to start in time") + return False + + except Exception as e: + logger.error(f"Failed to start Ollama service: {e}") + return False def get_process(self, process_id: str) -> Optional[NativeProcess]: """Get a managed process by ID.""" @@ -74,6 +164,8 @@ async def start_process( process = await self._start_mlx(process_id, model_id, port, **kwargs) elif backend == "llama_cpp": process = await self._start_llama_cpp(process_id, model_id, port, **kwargs) + elif backend == "vllm": + process = 
await self._start_vllm_metal(process_id, model_id, port, **kwargs) else: raise ValueError(f"Unknown backend: {backend}") @@ -145,17 +237,13 @@ async def _start_ollama( """ import httpx - ollama_port = 11434 # Ollama's default port + ollama_port = OLLAMA_DEFAULT_PORT - # Check if Ollama service is running - try: - async with httpx.AsyncClient(timeout=5.0) as client: - response = await client.get(f"http://localhost:{ollama_port}/api/tags") - if response.status_code != 200: - raise RuntimeError("Ollama service is not responding") - except httpx.ConnectError: + # Ensure Ollama service is running (starts it if needed) + if not await self.ensure_ollama_running(): raise RuntimeError( - "Ollama service is not running. " "Please start it with: ollama serve" + "Ollama service is not running and could not be started. " + "Please install Ollama: https://ollama.ai" ) # Pull the model if needed @@ -216,6 +304,71 @@ async def _unload_ollama_model(self, process: NativeProcess): except Exception as e: logger.warning(f"Failed to unload Ollama model: {e}") + async def _ensure_mlx_lm_installed(self) -> str: + """Ensure MLX-LM is installed in a virtual environment. + + Creates a virtual environment at ~/.lmstack/venvs/mlx-lm + and installs mlx-lm if not already present. 
+ + Returns: + Path to the python command in the virtual environment + """ + venv_dir = Path.home() / ".lmstack" / "venvs" / "mlx-lm" + python_cmd = venv_dir / "bin" / "python" + + # Check if mlx-lm is already installed in venv + if python_cmd.exists(): + # Verify mlx_lm is importable + check = await asyncio.create_subprocess_exec( + str(python_cmd), + "-c", + "import mlx_lm", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + await check.wait() + if check.returncode == 0: + logger.info(f"MLX-LM already installed at {venv_dir}") + return str(python_cmd) + + # Create virtual environment + logger.info(f"Creating virtual environment for MLX-LM at {venv_dir}") + venv_dir.parent.mkdir(parents=True, exist_ok=True) + + # Create venv + create_venv = await asyncio.create_subprocess_exec( + "python3", + "-m", + "venv", + str(venv_dir), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + await create_venv.wait() + + if create_venv.returncode != 0: + stderr = await create_venv.stderr.read() + raise RuntimeError(f"Failed to create virtual environment: {stderr.decode()}") + + # Install mlx-lm + pip_cmd = venv_dir / "bin" / "pip" + logger.info("Installing mlx-lm (this may take a few minutes)...") + + install_proc = await asyncio.create_subprocess_exec( + str(pip_cmd), + "install", + "mlx-lm", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await install_proc.communicate() + + if install_proc.returncode != 0: + raise RuntimeError(f"Failed to install mlx-lm: {stderr.decode()}") + + logger.info("MLX-LM installed successfully") + return str(python_cmd) + async def _start_mlx( self, process_id: str, @@ -226,14 +379,62 @@ async def _start_mlx( """Start MLX-LM server for Apple Silicon. MLX-LM provides OpenAI-compatible API via mlx_lm.server. + Automatically installs mlx-lm and converts models if needed. 
""" - # Build command + # Initialize log file early so we can track progress + self._write_log(process_id, f"Starting MLX deployment for {model_id}") + + # Ensure MLX-LM is installed (auto-install if needed) + self._write_log(process_id, "Checking MLX-LM installation...") + python_cmd = await self._ensure_mlx_lm_installed() + self._write_log(process_id, f"MLX-LM ready: {python_cmd}") + + effective_model_id = model_id + + # Check if model needs conversion + if not ModelConverter.is_mlx_ready(model_id): + # Check for cached conversion first + cached = self._converter.get_cached_model(model_id, "mlx") + if cached: + logger.info(f"Using cached MLX model: {cached}") + self._write_log(process_id, f"Using cached MLX model: {cached}") + effective_model_id = cached + else: + # Try to find an existing MLX variant on HuggingFace + mlx_variant = ModelConverter.find_mlx_variant(model_id) + if mlx_variant and ModelConverter.is_mlx_ready(mlx_variant): + logger.info(f"Using MLX variant: {mlx_variant}") + self._write_log(process_id, f"Using MLX variant: {mlx_variant}") + effective_model_id = mlx_variant + else: + # Convert the model + logger.info(f"Converting {model_id} to MLX format...") + self._write_log(process_id, f"Converting {model_id} to MLX format...") + self._write_log(process_id, "This may take a while...") + try: + quantize = kwargs.pop("mlx_quantize", True) + bits = kwargs.pop("mlx_bits", 4) + effective_model_id = await self._converter.convert_to_mlx( + model_id, quantize=quantize, bits=bits + ) + self._write_log(process_id, f"Conversion complete: {effective_model_id}") + except Exception as e: + logger.error(f"MLX conversion failed: {e}") + self._write_log(process_id, f"ERROR: MLX conversion failed: {e}") + raise RuntimeError( + f"Failed to convert model to MLX format: {e}. " + "Consider using an mlx-community model or Ollama backend." 
+ ) + else: + self._write_log(process_id, f"Model {model_id} is MLX-ready") + + # Build command using venv python cmd = [ - "python3", + python_cmd, "-m", "mlx_lm.server", "--model", - model_id, + effective_model_id, "--host", "0.0.0.0", "--port", @@ -244,27 +445,83 @@ async def _start_mlx( if kwargs.get("trust_remote_code"): cmd.append("--trust-remote-code") - # Start the process + # Create log file + log_file = self._log_dir / f"{process_id}.log" + + # Start the process with log file env = os.environ.copy() - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - env=env, - start_new_session=True, - ) + with open(log_file, "a") as f: + process = subprocess.Popen( + cmd, + stdout=f, + stderr=subprocess.STDOUT, + env=env, + start_new_session=True, + ) - logger.info(f"Started MLX-LM server (PID {process.pid}) for {model_id}") + logger.info(f"Started MLX-LM server (PID {process.pid}) for {effective_model_id}") return NativeProcess( process_id=process_id, pid=process.pid, backend="mlx", - model_id=model_id, + model_id=effective_model_id, port=port, process=process, + log_file=log_file, ) + async def _ensure_llama_cpp_installed(self) -> str: + """Ensure llama.cpp is installed. + + Installs llama.cpp via Homebrew if not already present. + + Returns: + Path to the llama-server command + """ + llama_server = shutil.which("llama-server") + if llama_server: + logger.info(f"llama.cpp already installed at {llama_server}") + return llama_server + + # Check if brew is available + brew = shutil.which("brew") + if not brew: + raise RuntimeError( + "llama-server not found and Homebrew is not installed. 
" + "Please install Homebrew first: https://brew.sh" + ) + + # Install llama.cpp via brew + logger.info("Installing llama.cpp via Homebrew (this may take a few minutes)...") + + install_proc = await asyncio.create_subprocess_exec( + brew, + "install", + "llama.cpp", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await install_proc.communicate() + + if install_proc.returncode != 0: + raise RuntimeError(f"Failed to install llama.cpp: {stderr.decode()}") + + # Find llama-server again + llama_server = shutil.which("llama-server") + if not llama_server: + # Try common Homebrew paths + for path in ["/opt/homebrew/bin/llama-server", "/usr/local/bin/llama-server"]: + if Path(path).exists(): + llama_server = path + break + + if not llama_server: + raise RuntimeError("llama.cpp installed but llama-server not found in PATH") + + logger.info("llama.cpp installed successfully") + return llama_server + async def _start_llama_cpp( self, process_id: str, @@ -275,22 +532,69 @@ async def _start_llama_cpp( """Start llama.cpp server with Metal acceleration. llama.cpp provides OpenAI-compatible API via llama-server. + Automatically installs llama.cpp and downloads/converts models if needed. """ - # Check for llama-server binary - import shutil + # Initialize log file early so we can track progress + self._write_log(process_id, f"Starting llama.cpp deployment for {model_id}") - llama_server = shutil.which("llama-server") + # Ensure llama.cpp is installed (auto-install if needed) + self._write_log(process_id, "Checking llama.cpp installation...") + llama_server = await self._ensure_llama_cpp_installed() + self._write_log(process_id, f"llama.cpp ready: {llama_server}") - if not llama_server: - raise RuntimeError( - "llama-server not found. 
" "Please install llama.cpp: brew install llama.cpp" - ) + effective_model_path = model_id + + # Check if model_id is already a local GGUF file path + if model_id.endswith(".gguf") and Path(model_id).exists(): + logger.info(f"Using local GGUF file: {model_id}") + self._write_log(process_id, f"Using local GGUF file: {model_id}") + effective_model_path = model_id + else: + # Check for cached model + cached = self._converter.get_cached_model(model_id, "gguf") + if cached: + logger.info(f"Using cached GGUF model: {cached}") + self._write_log(process_id, f"Using cached GGUF model: {cached}") + effective_model_path = cached + elif ModelConverter.is_gguf_ready(model_id): + # Model is already GGUF on HuggingFace, download it directly + logger.info(f"Downloading GGUF model from HuggingFace: {model_id}") + self._write_log(process_id, f"Downloading GGUF model from HuggingFace: {model_id}") + self._write_log(process_id, "This may take a while depending on model size...") + try: + effective_model_path = await self._converter.download_gguf_model(model_id) + self._write_log(process_id, f"Download complete: {effective_model_path}") + except Exception as e: + logger.error(f"GGUF download failed: {e}") + self._write_log(process_id, f"ERROR: GGUF download failed: {e}") + raise RuntimeError( + f"Failed to download GGUF model: {e}. " + "Check if the model exists and has .gguf files." 
+ ) + else: + # Need to convert from HuggingFace format + logger.info(f"Converting {model_id} to GGUF format...") + self._write_log(process_id, f"Converting {model_id} to GGUF format...") + self._write_log(process_id, "This may take a while...") + try: + quant_type = kwargs.pop("gguf_quant", "q8_0") + effective_model_path = await self._converter.convert_to_gguf( + model_id, quant_type=quant_type + ) + self._write_log(process_id, f"Conversion complete: {effective_model_path}") + except Exception as e: + logger.error(f"GGUF conversion failed: {e}") + self._write_log(process_id, f"ERROR: GGUF conversion failed: {e}") + raise RuntimeError( + f"Failed to convert model to GGUF format: {e}. " + "Consider using a pre-quantized GGUF model or Ollama backend." + ) # Build command cmd = [ llama_server, "--model", - model_id, + effective_model_path, "--host", "0.0.0.0", "--port", @@ -306,31 +610,165 @@ async def _start_llama_cpp( if n_threads := kwargs.get("n_threads"): cmd.extend(["-t", str(n_threads)]) - # Start the process + # Create log file + log_file = self._log_dir / f"{process_id}.log" + + # Start the process with log file env = os.environ.copy() - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - env=env, - start_new_session=True, - ) + with open(log_file, "a") as f: + process = subprocess.Popen( + cmd, + stdout=f, + stderr=subprocess.STDOUT, + env=env, + start_new_session=True, + ) - logger.info(f"Started llama.cpp server (PID {process.pid}) for {model_id}") + logger.info(f"Started llama.cpp server (PID {process.pid}) for {effective_model_path}") return NativeProcess( process_id=process_id, pid=process.pid, backend="llama_cpp", + model_id=effective_model_path, + port=port, + process=process, + log_file=log_file, + ) + + async def _ensure_vllm_metal_installed(self) -> str: + """Ensure vLLM-Metal is installed in a virtual environment. 
+ + Creates a virtual environment at ~/.lmstack/venvs/vllm-metal + and installs vllm-metal if not already present. + + Returns: + Path to the vllm command in the virtual environment + """ + venv_dir = Path.home() / ".lmstack" / "venvs" / "vllm-metal" + vllm_cmd = venv_dir / "bin" / "vllm" + + # Check if vllm is already installed in venv + if vllm_cmd.exists(): + logger.info(f"vLLM-Metal already installed at {vllm_cmd}") + return str(vllm_cmd) + + # Create virtual environment + logger.info(f"Creating virtual environment for vLLM-Metal at {venv_dir}") + venv_dir.parent.mkdir(parents=True, exist_ok=True) + + # Create venv + create_venv = await asyncio.create_subprocess_exec( + "python3", + "-m", + "venv", + str(venv_dir), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + await create_venv.wait() + + if create_venv.returncode != 0: + stderr = await create_venv.stderr.read() + raise RuntimeError(f"Failed to create virtual environment: {stderr.decode()}") + + # Install vllm-metal + pip_cmd = venv_dir / "bin" / "pip" + logger.info("Installing vllm-metal (this may take a few minutes)...") + + install_proc = await asyncio.create_subprocess_exec( + str(pip_cmd), + "install", + "vllm-metal", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await install_proc.communicate() + + if install_proc.returncode != 0: + raise RuntimeError( + f"Failed to install vllm-metal: {stderr.decode()}\n" + "You may need to install it manually: pip install vllm-metal" + ) + + logger.info("vLLM-Metal installed successfully") + return str(vllm_cmd) + + async def _start_vllm_metal( + self, + process_id: str, + model_id: str, + port: int, + **kwargs, + ) -> NativeProcess: + """Start vLLM-Metal server for Apple Silicon. + + vLLM-Metal provides OpenAI-compatible API via `vllm serve`. + Automatically installs vllm-metal in a virtual environment if needed. 
+ See: https://github.com/vllm-project/vllm-metal + """ + # Initialize log file early so we can track progress + self._write_log(process_id, f"Starting vLLM-Metal deployment for {model_id}") + + # Ensure vLLM-Metal is installed (auto-install if needed) + self._write_log(process_id, "Checking vLLM-Metal installation...") + vllm_cmd = await self._ensure_vllm_metal_installed() + self._write_log(process_id, f"vLLM-Metal ready: {vllm_cmd}") + + # Build command using vllm serve + cmd = [ + vllm_cmd, + "serve", + model_id, + "--host", + "0.0.0.0", + "--port", + str(port), + ] + + # Add optional parameters + if gpu_memory_util := kwargs.get("gpu_memory_utilization"): + cmd.extend(["--gpu-memory-utilization", str(gpu_memory_util)]) + + if max_model_len := kwargs.get("max_model_len"): + cmd.extend(["--max-model-len", str(max_model_len)]) + + if dtype := kwargs.get("dtype"): + cmd.extend(["--dtype", str(dtype)]) + + if kwargs.get("trust_remote_code"): + cmd.append("--trust-remote-code") + + # Create log file + log_file = self._log_dir / f"{process_id}.log" + + # Start the process with log file + env = os.environ.copy() + with open(log_file, "a") as f: + process = subprocess.Popen( + cmd, + stdout=f, + stderr=subprocess.STDOUT, + env=env, + start_new_session=True, + ) + + logger.info(f"Started vLLM-Metal server (PID {process.pid}) for {model_id}") + + return NativeProcess( + process_id=process_id, + pid=process.pid, + backend="vllm", model_id=model_id, port=port, process=process, + log_file=log_file, ) def get_logs(self, process_id: str, tail: int = 100) -> str: """Get logs from a process. - For subprocess-based backends, reads from stdout pipe. + Reads from log file for MLX, llama.cpp, and vLLM-Metal backends. For Ollama, returns status information about loaded models. 
""" process = self._processes.get(process_id) @@ -340,11 +778,15 @@ def get_logs(self, process_id: str, tail: int = 100) -> str: if process.backend == "ollama": return self._get_ollama_status(process) - if process.process and process.process.stdout: + # Read from log file + if process.log_file and process.log_file.exists(): try: - # This is a simple implementation - in production you'd want - # to capture logs to a file and read the tail - return "Logs are available but streaming is not yet implemented" + with open(process.log_file) as f: + lines = f.readlines() + # Return last 'tail' lines + if len(lines) > tail: + lines = lines[-tail:] + return "".join(lines) except Exception as e: return f"Error reading logs: {e}" diff --git a/worker/routes/__init__.py b/worker/routes/__init__.py index cd506d0..9ce61f8 100644 --- a/worker/routes/__init__.py +++ b/worker/routes/__init__.py @@ -6,9 +6,11 @@ - containers.py: Docker container management endpoints - storage.py: Storage and volume management endpoints - native.py: Native deployment endpoints (Mac without Docker) +- converter.py: Model format conversion endpoints (MLX/GGUF) """ from .containers import router as containers_router +from .converter import router as converter_router from .deployment import router as deployment_router from .images import router as images_router from .native import router as native_router @@ -20,4 +22,5 @@ "containers_router", "storage_router", "native_router", + "converter_router", ] diff --git a/worker/routes/converter.py b/worker/routes/converter.py new file mode 100644 index 0000000..a259ddb --- /dev/null +++ b/worker/routes/converter.py @@ -0,0 +1,233 @@ +"""Model conversion routes for the worker agent. + +Provides API endpoints for converting HuggingFace models to MLX/GGUF formats. 
+""" + +import logging +from typing import TYPE_CHECKING, Optional + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel + +if TYPE_CHECKING: + from worker.agent import WorkerAgent + +logger = logging.getLogger(__name__) + +router = APIRouter(tags=["converter"]) + +# Global agent reference (set by agent.py) +_agent: "WorkerAgent | None" = None + + +def set_agent(agent: "WorkerAgent"): + """Set the global agent reference.""" + global _agent + _agent = agent + + +def _get_converter(): + """Get the converter from native manager.""" + if not _agent: + raise HTTPException(status_code=500, detail="Agent not initialized") + if not hasattr(_agent, "native_manager") or not _agent.native_manager: + raise HTTPException( + status_code=400, + detail="Model conversion only available on Mac workers with native support", + ) + return _agent.native_manager._converter + + +class MLXConvertRequest(BaseModel): + """Request to convert a model to MLX format.""" + + hf_model_id: str + quantize: bool = True + bits: int = 4 # 4 or 8 + + +class GGUFConvertRequest(BaseModel): + """Request to convert a model to GGUF format.""" + + hf_model_id: str + quant_type: str = "q8_0" # q4_0, q4_k_m, q8_0, f16 + + +class ConvertResponse(BaseModel): + """Response from conversion request.""" + + task_id: str + status: str + message: str + output_path: Optional[str] = None + + +class ConversionProgress(BaseModel): + """Conversion task progress.""" + + task_id: str + status: str + progress: float + message: str + output_path: Optional[str] = None + error: Optional[str] = None + + +class FormatCheckRequest(BaseModel): + """Request to check model format compatibility.""" + + model_id: str + files: Optional[list[str]] = None + + +class FormatCheckResponse(BaseModel): + """Response with model format compatibility info.""" + + model_id: str + is_mlx_ready: bool + is_gguf_ready: bool + cached_mlx: Optional[str] = None + cached_gguf: Optional[str] = None + + +@router.post("/convert/mlx", 
response_model=ConvertResponse) +async def convert_to_mlx(request: MLXConvertRequest): + """Convert a HuggingFace model to MLX format. + + This endpoint starts the conversion process and returns immediately. + Use GET /convert/progress/{task_id} to check progress. + """ + converter = _get_converter() + + try: + # Start conversion + output_path = await converter.convert_to_mlx( + hf_model_id=request.hf_model_id, + quantize=request.quantize, + bits=request.bits, + ) + + task_id = f"mlx-{request.hf_model_id.replace('/', '--')}" + + return ConvertResponse( + task_id=task_id, + status="completed", + message="Conversion completed successfully", + output_path=output_path, + ) + + except RuntimeError as e: + raise HTTPException(status_code=500, detail=str(e)) + except Exception as e: + logger.exception(f"MLX conversion failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/convert/gguf", response_model=ConvertResponse) +async def convert_to_gguf(request: GGUFConvertRequest): + """Convert a HuggingFace model to GGUF format. + + This endpoint starts the conversion process and returns immediately. + Use GET /convert/progress/{task_id} to check progress. 
+ """ + converter = _get_converter() + + try: + output_path = await converter.convert_to_gguf( + hf_model_id=request.hf_model_id, + quant_type=request.quant_type, + ) + + task_id = f"gguf-{request.hf_model_id.replace('/', '--')}" + + return ConvertResponse( + task_id=task_id, + status="completed", + message="Conversion completed successfully", + output_path=output_path, + ) + + except RuntimeError as e: + raise HTTPException(status_code=500, detail=str(e)) + except Exception as e: + logger.exception(f"GGUF conversion failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/convert/progress/{task_id}", response_model=ConversionProgress) +async def get_conversion_progress(task_id: str): + """Get the progress of a conversion task.""" + converter = _get_converter() + + task = converter.get_task(task_id) + if not task: + raise HTTPException(status_code=404, detail=f"Task {task_id} not found") + + return ConversionProgress( + task_id=task.task_id, + status=task.status, + progress=task.progress, + message=task.message, + output_path=task.output_path, + error=task.error, + ) + + +@router.get("/convert/tasks") +async def list_conversion_tasks(): + """List all conversion tasks.""" + converter = _get_converter() + + tasks = converter.list_tasks() + return { + "tasks": [ + { + "task_id": t.task_id, + "hf_model_id": t.hf_model_id, + "target_format": t.target_format, + "status": t.status, + "progress": t.progress, + "message": t.message, + } + for t in tasks + ] + } + + +@router.post("/convert/check-format", response_model=FormatCheckResponse) +async def check_model_format(request: FormatCheckRequest): + """Check if a model is already in MLX or GGUF format.""" + converter = _get_converter() + + from worker.native_ops.converter import ModelConverter + + is_mlx = ModelConverter.is_mlx_ready(request.model_id) + is_gguf = ModelConverter.is_gguf_ready(request.model_id, request.files) + + return FormatCheckResponse( + model_id=request.model_id, + 
is_mlx_ready=is_mlx, + is_gguf_ready=is_gguf, + cached_mlx=converter.get_cached_model(request.model_id, "mlx") if not is_mlx else None, + cached_gguf=converter.get_cached_model(request.model_id, "gguf") if not is_gguf else None, + ) + + +@router.get("/convert/cache") +async def get_cache_info(): + """Get information about cached converted models.""" + converter = _get_converter() + return converter.get_cache_info() + + +@router.delete("/convert/cache") +async def clear_cache(model_id: Optional[str] = None, format: Optional[str] = None): + """Clear the model conversion cache. + + Args: + model_id: Clear cache for specific model (None = all) + format: Clear cache for specific format: "mlx" or "gguf" (None = all) + """ + converter = _get_converter() + converter.clear_cache(model_id, format) + return {"status": "ok", "message": "Cache cleared"}