diff --git a/backend/app/api/huggingface.py b/backend/app/api/huggingface.py
index 26f73cd..38d8f47 100644
--- a/backend/app/api/huggingface.py
+++ b/backend/app/api/huggingface.py
@@ -616,3 +616,189 @@ async def get_model_readme(
 
     except httpx.RequestError as e:
         return {"content": None, "message": f"Failed to fetch README: {str(e)}"}
+
+
+class ModelFormatInfo(BaseModel):
+    """MLX/GGUF format availability for a single HuggingFace model repo."""
+
+    model_id: str  # Repo id exactly as requested, e.g. "org/name"
+    is_mlx_ready: bool = False  # True when the id is under mlx-community/
+    is_gguf_ready: bool = False  # True when the repo lists any *.gguf file
+    mlx_variants: list[str] = []  # mlx-community repo ids matching the model name
+    gguf_files: list[str] = []  # Filenames of the *.gguf files found in the repo
+
+
+def _is_mlx_ready(model_id: str) -> bool:
+    """Return True when ``model_id`` starts with the ``mlx-community/`` org prefix."""
+    return model_id.startswith("mlx-community/")
+
+
+def _is_gguf_ready(files: list[str]) -> bool:
+    """Return True when at least one filename in ``files`` ends with ``.gguf``."""
+    return any(f.endswith(".gguf") for f in files)
+
+
+@router.get("/format-info/{model_id:path}", response_model=ModelFormatInfo)
+async def get_model_format_info(
+    model_id: str,
+    token: str | None = Query(None, description="HuggingFace API token"),
+):
+    """
+    Get model format compatibility information.
+
+    Returns whether the model is MLX-ready, GGUF-ready, and lists available variants.
+    """
+    headers = {}
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+
+    result = ModelFormatInfo(
+        model_id=model_id,
+        is_mlx_ready=_is_mlx_ready(model_id),
+    )
+
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            # Get model files to check for GGUF
+            response = await client.get(
+                f"{HF_API_URL}/models/{model_id}",
+                headers=headers,
+            )
+
+            if response.status_code == 200:
+                data = response.json()
+                siblings = data.get("siblings", [])
+                files = [s.get("rfilename", "") for s in siblings]
+
+                # Check for GGUF files
+                gguf_files = [f for f in files if f.endswith(".gguf")]
+                result.gguf_files = gguf_files
+                result.is_gguf_ready = len(gguf_files) > 0
+
+            # Search for MLX variants if not already MLX
+            if not result.is_mlx_ready:
+                model_name = model_id.split("/")[-1]
+                # Search mlx-community for this model
+                search_response = await client.get(
+                    f"{HF_API_URL}/models",
+                    params={
+                        "search": model_name,
+                        "author": "mlx-community",
+                        "limit": 5,
+                    },
+                )
+                if search_response.status_code == 200:
+                    mlx_models = search_response.json()
+                    result.mlx_variants = [m.get("modelId", m.get("id", "")) for m in mlx_models]
+
+    except httpx.RequestError as e:
+        # Log error but don't fail - return partial info
+        import logging
+
+        logging.getLogger(__name__).warning(f"Failed to fetch format info: {e}")
+
+    return result
+
+
+@router.get("/search-mlx")
+async def search_mlx_models(
+    query: str = Query(..., min_length=2, description="Search query"),
+    limit: int = Query(20, ge=1, le=50, description="Number of results"),
+):
+    """
+    Search for MLX-ready models from mlx-community.
+
+    Returns models that are already converted to MLX format.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            params = {
+                "search": query,
+                "author": "mlx-community",
+                "limit": limit,
+                "sort": "downloads",
+                "direction": -1,
+            }
+
+            response = await client.get(
+                f"{HF_API_URL}/models",
+                params=params,
+            )
+            response.raise_for_status()
+
+            models = response.json()
+            return [
+                {
+                    "id": m.get("modelId", m.get("id")),
+                    "author": m.get("author"),
+                    "downloads": m.get("downloads", 0),
+                    "likes": m.get("likes", 0),
+                    "pipeline_tag": m.get("pipeline_tag"),
+                    "tags": m.get("tags", [])[:5],
+                    "is_mlx_ready": True,
+                }
+                for m in models
+            ]
+
+    except httpx.RequestError as e:
+        raise HTTPException(status_code=503, detail=f"Failed to search HuggingFace: {str(e)}")
+
+
+@router.get("/search-gguf")
+async def search_gguf_models(
+    query: str = Query(..., min_length=2, description="Search query"),
+    limit: int = Query(20, ge=1, le=50, description="Number of results"),
+):
+    """
+    Search for models with GGUF files available.
+
+    Returns models that have pre-converted GGUF files.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            # Search with GGUF tag
+            params = {
+                "search": query,
+                "limit": limit * 2,  # Get more to filter
+                "sort": "downloads",
+                "direction": -1,
+                "filter": "gguf",
+            }
+
+            response = await client.get(
+                f"{HF_API_URL}/models",
+                params=params,
+            )
+            response.raise_for_status()
+
+            models = response.json()
+
+            # Filter to only include models with GGUF in name or tags
+            gguf_models = []
+            for m in models:
+                model_id = m.get("modelId", m.get("id", ""))
+                tags = m.get("tags", [])
+
+                # Check if model has GGUF indicator
+                is_gguf = "gguf" in model_id.lower() or any("gguf" in t.lower() for t in tags)
+
+                if is_gguf:
+                    gguf_models.append(
+                        {
+                            "id": model_id,
+                            "author": m.get("author"),
+                            "downloads": m.get("downloads", 0),
+                            "likes": m.get("likes", 0),
+                            "pipeline_tag": m.get("pipeline_tag"),
+                            "tags": tags[:5],
+                            "is_gguf_ready": True,
+                        }
+                    )
+
+                if len(gguf_models) >= limit:
+                    break
+
+            return gguf_models
+
+    except httpx.RequestError as e:
+        raise HTTPException(status_code=503, detail=f"Failed to search HuggingFace: {str(e)}")
diff --git a/backend/app/models/worker.py b/backend/app/models/worker.py
index 3be352d..a4c7ed1 100644
--- a/backend/app/models/worker.py
+++ b/backend/app/models/worker.py
@@ -120,14 +120,17 @@ def available_backends(self) -> list[str]:
                 backends.extend(["vllm", "sglang", "ollama"])
             else:
                 backends.append("ollama")
-        else:
-            # Native backends (Mac)
-            if caps.get("ollama"):
+
+        # Mac native backends - always available (can be installed if missing)
+        if self.is_mac:
+            # vLLM-Metal, MLX, llama.cpp are all installable on Mac
+            mac_backends = ["vllm", "mlx", "llama_cpp"]
+            for b in mac_backends:
+                if b not in backends:
+                    backends.append(b)
+            # Ollama on Mac (if installed)
+            if caps.get("ollama") and "ollama" not in backends:
                 backends.append("ollama")
-            if caps.get("mlx"):
-                backends.append("mlx")
-            if caps.get("llama_cpp"):
-                backends.append("llama_cpp")
 
         return backends
 
diff --git a/backend/app/services/deployer/native.py b/backend/app/services/deployer/native.py
index 4127b41..4ed005d 100644
--- a/backend/app/services/deployer/native.py
+++ b/backend/app/services/deployer/native.py
@@ -1,7 +1,9 @@
 """Native Mac deployment operations.
 
 This module handles native deployment operations for macOS,
-including Ollama, MLX, and llama.cpp backends.
+including Ollama, MLX, llama.cpp, and vLLM-Metal backends.
+
+Supports automatic model conversion from HuggingFace to MLX/GGUF formats.
 """
 
 import asyncio
@@ -15,10 +17,21 @@
 logger = logging.getLogger(__name__)
 
 
+def _is_mlx_ready(model_id: str) -> bool:
+    """Treat ids under ``mlx-community/`` as already MLX; only the prefix is checked."""
+    return model_id.startswith("mlx-community/")
+
+
+def _is_gguf_file(model_id: str) -> bool:
+    """Return True when ``model_id`` ends with ``.gguf`` (case-sensitive suffix test)."""
+    return model_id.endswith(".gguf")
+
+
 async def deploy_native(deployment: Deployment, db) -> dict:
     """Deploy using native backend (Mac without Docker).
 
-    Supports Ollama, MLX, and llama.cpp backends on macOS.
+    Supports Ollama, MLX, llama.cpp, and vLLM-Metal backends on macOS.
+    Handles automatic conversion of HuggingFace models to MLX/GGUF formats.
     """
     # Import here to avoid circular imports
     from app.services.deployer.health import wait_for_native_api_ready
@@ -26,6 +39,7 @@ async def deploy_native(deployment: Deployment, db) -> dict:
     worker = deployment.worker
     model = deployment.model
     backend = deployment.backend
+    model_id = model.model_id
 
     # Validate backend is supported
     available_backends = worker.available_backends
@@ -35,19 +49,37 @@ async def deploy_native(deployment: Deployment, db) -> dict:
             f"Available backends: {', '.join(available_backends)}"
         }
 
+    # Check if model needs conversion and update status
+    needs_conversion = False
+    if backend == "mlx" and not _is_mlx_ready(model_id):
+        needs_conversion = True
+        deployment.status_message = "Model may need conversion to MLX format..."
+        await db.commit()
+    elif backend == "llama_cpp" and not _is_gguf_file(model_id):
+        needs_conversion = True
+        deployment.status_message = "Model may need conversion to GGUF format..."
+        await db.commit()
+
     try:
         worker_url = f"http://{worker.effective_address}/native/deploy"
 
         deploy_request = {
             "deployment_id": deployment.id,
             "deployment_name": deployment.name,
-            "model_id": model.model_id,
+            "model_id": model_id,
             "backend": backend,
             "port": 0,  # Auto-assign
             "extra_params": deployment.extra_params,
         }
 
-        deployment.status_message = f"Starting {backend} deployment..."
+        # Set container_id early so logs can be fetched during deployment
+        expected_process_id = f"native-{deployment.id}"
+        deployment.container_id = expected_process_id
+
+        if needs_conversion:
+            deployment.status_message = f"Converting model and starting {backend} deployment..."
+        else:
+            deployment.status_message = f"Starting {backend} deployment..."
         await db.commit()
 
         async with httpx.AsyncClient(timeout=600.0) as client:
@@ -59,8 +91,10 @@ async def deploy_native(deployment: Deployment, db) -> dict:
 
             result = response.json()
             deployment.port = result.get("port")
-            # Use process_id as container_id for native deployments
-            deployment.container_id = result.get("process_id")
+            # Verify process_id matches expected
+            actual_process_id = result.get("process_id")
+            if actual_process_id and actual_process_id != expected_process_id:
+                deployment.container_id = actual_process_id
 
         # Wait for API to be ready
         deployment.status_message = "Waiting for model to be ready..."
diff --git a/backend/app/services/deployer/service.py b/backend/app/services/deployer/service.py
index 33b1aa3..a9b0e72 100644
--- a/backend/app/services/deployer/service.py
+++ b/backend/app/services/deployer/service.py
@@ -77,11 +77,18 @@ async def deploy(self, deployment_id: int) -> None:
                 worker = deployment.worker
                 backend = deployment.backend
 
-                # Mac with Ollama should always use native deployment (use local Ollama)
+                # Mac with Ollama, MLX, llama_cpp, or vLLM should use native deployment
+                # vLLM on Mac uses vLLM-Metal (native Apple Silicon acceleration)
                 # Mac without Docker should also use native deployment
                 is_mac = worker.os_type == OSType.DARWIN.value
+                native_backends = (
+                    BackendType.OLLAMA.value,
+                    BackendType.MLX.value,
+                    BackendType.LLAMA_CPP.value,
+                    BackendType.VLLM.value,  # vLLM-Metal on Mac
+                )
                 is_mac_native = is_mac and (
-                    backend == BackendType.OLLAMA.value or not worker.supports_docker
+                    backend in native_backends or not worker.supports_docker
                 )
 
                 # Use native deployment for Mac
@@ -315,7 +322,7 @@ async def stop(self, deployment_id: int) -> None:
     async def get_logs(self, deployment: Deployment, tail: int = 100) -> str:
         """Get logs from a deployment"""
         if not deployment.container_id or not deployment.worker:
-            return "No container running"
+            return "No deployment process running"
 
         try:
             worker = deployment.worker
diff --git a/backend/app/services/local_worker.py b/backend/app/services/local_worker.py
index 4b5e499..c6be866 100644
--- a/backend/app/services/local_worker.py
+++ b/backend/app/services/local_worker.py
@@ -4,14 +4,19 @@
 """
 
 import logging
+import os
 import platform
+import shutil
 import socket
 import subprocess
+import time
 
 import psutil
 
 logger = logging.getLogger(__name__)
 
+OLLAMA_DEFAULT_PORT = 11434
+
 
 def get_local_hostname() -> str:
     """Get the local hostname."""
@@ -121,6 +126,80 @@ def get_local_worker_info() -> dict:
     }
 
 
+def ensure_ollama_running_on_host(host: str = "0.0.0.0", port: int = OLLAMA_DEFAULT_PORT) -> bool:
+    """Ensure Ollama is running on the host with external access enabled.
+
+    This is called BEFORE starting Docker worker so that the container
+    can access Ollama on the host via localhost (with --network host).
+
+    Args:
+        host: Host to bind to (default 0.0.0.0 for external access)
+        port: Port to bind to (default 11434)
+
+    Returns:
+        True if Ollama is running and accessible
+    """
+    # Only run on macOS
+    if platform.system() != "Darwin":
+        return True  # Not needed on Linux (Docker can use GPU directly)
+
+    # Check if Ollama is installed
+    ollama_path = shutil.which("ollama")
+    if not ollama_path:
+        logger.info("Ollama is not installed on this Mac")
+        return False
+
+    # Check if Ollama is already running
+    try:
+        import httpx
+
+        with httpx.Client(timeout=2.0) as client:
+            response = client.get(f"http://localhost:{port}/api/tags")
+            if response.status_code == 200:
+                logger.info("Ollama service is already running")
+                return True
+    except Exception:
+        pass
+
+    # Ollama not running, start it with external access
+    logger.info(f"Starting Ollama service on {host}:{port}")
+
+    env = os.environ.copy()
+    env["OLLAMA_HOST"] = f"{host}:{port}"
+
+    try:
+        # Start ollama serve in background
+        process = subprocess.Popen(
+            [ollama_path, "serve"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            env=env,
+            start_new_session=True,
+        )
+        logger.info(f"Started Ollama service (PID {process.pid})")
+
+        # Wait for Ollama to be ready (up to 30 seconds)
+        import httpx
+
+        for _ in range(30):
+            time.sleep(1)
+            try:
+                with httpx.Client(timeout=2.0) as client:
+                    response = client.get(f"http://localhost:{port}/api/tags")
+                    if response.status_code == 200:
+                        logger.info("Ollama service is ready")
+                        return True
+            except Exception:
+                pass
+
+        logger.error("Ollama service failed to start in time")
+        return False
+
+    except Exception as e:
+        logger.error(f"Failed to start Ollama service: {e}")
+        return False
+
+
 def spawn_docker_worker(
     worker_name: str,
     backend_url: str,
@@ -132,6 +211,11 @@ def spawn_docker_worker(
     Returns:
         dict with keys: success, message, container_id (if success)
     """
+    # On Mac, ensure Ollama is running with external access before starting Docker
+    if platform.system() == "Darwin":
+        logger.info("Mac detected, ensuring Ollama is running with external access...")
+        ensure_ollama_running_on_host()
+
     # First, check if container with same name exists and remove it
     try:
         check_result = subprocess.run(
diff --git a/frontend/src/api/huggingface.ts b/frontend/src/api/huggingface.ts
index bb56690..5310be9 100644
--- a/frontend/src/api/huggingface.ts
+++ b/frontend/src/api/huggingface.ts
@@ -50,6 +50,16 @@ export interface HFSearchResult {
   likes: number;
   pipeline_tag?: string;
   tags: string[];
+  is_mlx_ready?: boolean;
+  is_gguf_ready?: boolean;
+}
+
+export interface ModelFormatInfo { // Payload of GET /huggingface/format-info/{modelId}
+  model_id: string; // Repo id as requested
+  is_mlx_ready: boolean; // True when the id is under mlx-community/
+  is_gguf_ready: boolean; // True when the repo lists any *.gguf file
+  mlx_variants: string[]; // mlx-community repo ids matching the model name
+  gguf_files: string[]; // Filenames of the *.gguf files in the repo
 }
 
 export const huggingfaceApi = {
@@ -124,4 +134,43 @@ export const huggingfaceApi = {
     }>(`/huggingface/readme/${encodeURIComponent(modelId)}`, { params });
     return response.data;
   },
+
+  /** Fetch MLX/GGUF format availability for one HuggingFace model id. */
+  getFormatInfo: async (
+    modelId: string,
+    token?: string,
+  ): Promise<ModelFormatInfo> => {
+    // encodeURIComponent escapes the "/" inside "org/name" style ids.
+    const url = `/huggingface/format-info/${encodeURIComponent(modelId)}`;
+    // Only attach `token` when the caller passed a non-empty value.
+    const params: Record<string, string> = token ? { token } : {};
+    const { data } = await api.get<ModelFormatInfo>(url, { params });
+    return data;
+  },
+
+  /** Search mlx-community for models already converted to MLX format. */
+  searchMLX: async (
+    query: string,
+    limit?: number,
+  ): Promise<HFSearchResult[]> => {
+    const endpoint = "/huggingface/search-mlx";
+    const { data } = await api.get<HFSearchResult[]>(endpoint, {
+      params: { query, limit },
+    });
+    // Unwrap the response envelope; callers only need the result list.
+    return data;
+  },
+
+  /** Search HuggingFace for models that advertise pre-converted GGUF files. */
+  searchGGUF: async (
+    query: string,
+    limit?: number,
+  ): Promise<HFSearchResult[]> => {
+    const endpoint = "/huggingface/search-gguf";
+    const { data } = await api.get<HFSearchResult[]>(endpoint, {
+      params: { query, limit },
+    });
+    // Unwrap the response envelope; callers only need the result list.
+    return data;
+  },
 };
diff --git a/frontend/src/api/index.ts b/frontend/src/api/index.ts
index 48f8ec6..555a9fb 100644
--- a/frontend/src/api/index.ts
+++ b/frontend/src/api/index.ts
@@ -88,6 +88,7 @@ export type {
   VRAMEstimate,
   HFModelFile,
   HFSearchResult,
+  ModelFormatInfo,
 } from "./huggingface";
 
 // Types - Ollama
diff --git a/frontend/src/assets/mlx-logo-dark.png b/frontend/src/assets/mlx-logo-dark.png
new file mode 100644
index 0000000..cda3c1f
Binary files /dev/null and b/frontend/src/assets/mlx-logo-dark.png differ
diff --git a/frontend/src/assets/mlx-logo.png b/frontend/src/assets/mlx-logo.png
new file mode 100644
index 0000000..be122bf
Binary files /dev/null and b/frontend/src/assets/mlx-logo.png differ
diff --git a/frontend/src/components/DeploymentAdvancedForm.tsx b/frontend/src/components/DeploymentAdvancedForm.tsx
index b141d1d..89fe3a5 100644
--- a/frontend/src/components/DeploymentAdvancedForm.tsx
+++ b/frontend/src/components/DeploymentAdvancedForm.tsx
@@ -21,7 +21,7 @@ import type { FormInstance } from "antd";
 const { Text } = Typography;
 
 interface DeploymentAdvancedFormProps {
-  backend: "vllm" | "sglang" | "ollama";
+  backend: "vllm" | "sglang" | "ollama" | "mlx" | "llama_cpp";
   form: FormInstance;
 }
 
@@ -304,6 +304,7 @@ export default function DeploymentAdvancedForm({
   backend,
 }: DeploymentAdvancedFormProps) {
   const renderBackendParams = () => {
+    // Native Mac backends
     if (backend === "ollama") {
       return (
         <div style={{ padding: "16px 0" }}>
@@ -313,6 +314,12 @@ export default function DeploymentAdvancedForm({
       );
     }
 
+    // MLX and llama.cpp - no advanced settings for now
+    if (backend === "mlx" || backend === "llama_cpp") {
+      return null;
+    }
+
+    // vLLM and SGLang (Docker-based)
     const isVllm = backend === "vllm";
     const tabItems = [
       {
diff --git a/frontend/src/components/HuggingFaceModelPicker.tsx b/frontend/src/components/HuggingFaceModelPicker.tsx
index 2533866..2741e83 100644
--- a/frontend/src/components/HuggingFaceModelPicker.tsx
+++ b/frontend/src/components/HuggingFaceModelPicker.tsx
@@ -18,6 +18,7 @@ import {
   Empty,
   Pagination,
   Divider,
+  Segmented,
 } from "antd";
 import Loading from "./Loading";
 import {
@@ -40,15 +41,18 @@ import {
   type HFModelInfo,
   type VRAMEstimate,
   type HFSearchResult,
+  type ModelFormatInfo,
 } from "../services/api";
 import { useAppTheme } from "../hooks/useTheme";
 
+type FormatFilter = "all" | "mlx_ready" | "gguf_ready";
+
 interface HuggingFaceModelPickerProps {
   open: boolean;
   onClose: () => void;
   onSelect: (modelId: string, modelInfo?: HFModelInfo) => void;
   gpuMemoryGb?: number; // For compatibility check
-  backend?: "vllm" | "sglang" | "ollama"; // Reserved for future use
+  backend?: "vllm" | "sglang" | "ollama" | "mlx" | "llama_cpp"; // Backend type affects filtering
 }
 
 const { Text, Title } = Typography;
@@ -82,9 +86,8 @@ export default function HuggingFaceModelPicker({
   onClose,
   onSelect,
   gpuMemoryGb,
-  backend: _backend = "vllm",
+  backend = "vllm",
 }: HuggingFaceModelPickerProps) {
-  void _backend; // Reserved for future use
   const [searchQuery, setSearchQuery] = useState("");
   const [searchResults, setSearchResults] = useState<HFSearchResult[]>([]);
   const [searching, setSearching] = useState(false);
@@ -92,21 +95,45 @@ export default function HuggingFaceModelPicker({
   const [modelInfo, setModelInfo] = useState<HFModelInfo | null>(null);
   const [vramEstimate, setVramEstimate] = useState<VRAMEstimate | null>(null);
   const [readme, setReadme] = useState<string | null>(null);
+  const [formatInfo, setFormatInfo] = useState<ModelFormatInfo | null>(null);
   const [loadingDetail, setLoadingDetail] = useState(false);
   const [currentPage, setCurrentPage] = useState(1);
   const [totalResults, setTotalResults] = useState(0);
   const [showDetails, setShowDetails] = useState(false); // Mobile: toggle between list and details
+  const [formatFilter, setFormatFilter] = useState<FormatFilter>("all");
   const searchTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);
   const { isDark, colors } = useAppTheme();
   const { isMobile } = useResponsive();
 
   const pageSize = 20;
 
-  // Load popular models
+  // Set default filter based on backend
+  useEffect(() => {
+    if (backend === "mlx") {
+      setFormatFilter("mlx_ready");
+    } else if (backend === "llama_cpp") {
+      setFormatFilter("gguf_ready");
+    } else {
+      setFormatFilter("all");
+    }
+  }, [backend]);
+
+  // Load popular models based on format filter
   const loadPopularModels = useCallback(async () => {
     setSearching(true);
     try {
-      const results = await huggingfaceApi.getPopular(pageSize);
+      let results: HFSearchResult[];
+
+      if (formatFilter === "mlx_ready") {
+        // Search for popular MLX models
+        results = await huggingfaceApi.searchMLX("llama", pageSize);
+      } else if (formatFilter === "gguf_ready") {
+        // Search for popular GGUF models
+        results = await huggingfaceApi.searchGGUF("llama", pageSize);
+      } else {
+        results = await huggingfaceApi.getPopular(pageSize);
+      }
+
       setSearchResults(results);
       setTotalResults(results.length);
     } catch (error) {
@@ -115,9 +142,9 @@ export default function HuggingFaceModelPicker({
     } finally {
       setSearching(false);
     }
-  }, []);
+  }, [formatFilter]);
 
-  // Search models
+  // Search models with format filter
   const searchModels = useCallback(
     async (query: string) => {
       if (!query.trim()) {
@@ -128,10 +155,19 @@ export default function HuggingFaceModelPicker({
 
       setSearching(true);
       try {
-        const results = await huggingfaceApi.search(query, {
-          limit: pageSize,
-          filter_task: "text-generation",
-        });
+        let results: HFSearchResult[];
+
+        if (formatFilter === "mlx_ready") {
+          results = await huggingfaceApi.searchMLX(query, pageSize);
+        } else if (formatFilter === "gguf_ready") {
+          results = await huggingfaceApi.searchGGUF(query, pageSize);
+        } else {
+          results = await huggingfaceApi.search(query, {
+            limit: pageSize,
+            filter_task: "text-generation",
+          });
+        }
+
         setSearchResults(results);
         // Estimate total (HF API doesn't return total count)
         setTotalResults(
@@ -144,9 +180,20 @@ export default function HuggingFaceModelPicker({
         setSearching(false);
       }
     },
-    [loadPopularModels],
+    [loadPopularModels, formatFilter],
   );
 
+  // Re-search when format filter changes
+  useEffect(() => {
+    if (open) {
+      if (searchQuery.trim()) {
+        searchModels(searchQuery);
+      } else {
+        loadPopularModels();
+      }
+    }
+  }, [formatFilter, open]);
+
   // Debounced search
   const handleSearchChange = (value: string) => {
     setSearchQuery(value);
@@ -169,10 +216,11 @@ export default function HuggingFaceModelPicker({
       setModelInfo(null);
       setVramEstimate(null);
       setReadme(null);
+      setFormatInfo(null);
       if (isMobile) setShowDetails(true);
 
       try {
-        const [info, estimate, readmeResult] = await Promise.all([
+        const [info, estimate, readmeResult, format] = await Promise.all([
           huggingfaceApi.getModelInfo(modelId).catch((err) => {
             console.error("Failed to get model info:", err);
             return null;
@@ -190,10 +238,15 @@ export default function HuggingFaceModelPicker({
             console.error("Failed to get README:", err);
             return { content: null };
           }),
+          huggingfaceApi.getFormatInfo(modelId).catch((err) => {
+            console.error("Failed to get format info:", err);
+            return null;
+          }),
         ]);
 
         setModelInfo(info);
         setVramEstimate(estimate);
+        setFormatInfo(format);
 
         // Process README content
         if (readmeResult?.content) {
@@ -225,6 +278,7 @@ export default function HuggingFaceModelPicker({
       setModelInfo(null);
       setVramEstimate(null);
       setReadme(null);
+      setFormatInfo(null);
       setShowDetails(false);
     }
   }, [open]);
@@ -345,7 +399,7 @@ export default function HuggingFaceModelPicker({
           }}
         >
           {/* Search Input */}
-          <div style={{ padding: 16 }}>
+          <div style={{ padding: 16, paddingBottom: 8 }}>
             <div
               style={{
                 display: "flex",
@@ -371,6 +425,21 @@ export default function HuggingFaceModelPicker({
             </div>
           </div>
 
+          {/* Format Filter */}
+          <div style={{ padding: "0 16px 12px" }}>
+            <Segmented
+              value={formatFilter}
+              onChange={(value) => setFormatFilter(value as FormatFilter)}
+              block
+              size="small"
+              options={[
+                { label: "All", value: "all" },
+                { label: "MLX Ready", value: "mlx_ready" },
+                { label: "GGUF Ready", value: "gguf_ready" },
+              ]}
+            />
+          </div>
+
           {/* Results List */}
           <div style={{ flex: 1, overflow: "auto" }}>
             {searching ? (
@@ -676,6 +745,102 @@ export default function HuggingFaceModelPicker({
                   </div>
                 )}
 
+                {/* Format Compatibility */}
+                {formatInfo && (
+                  <div
+                    style={{
+                      padding: 16,
+                      background: isDark ? "#1a1a1a" : "#fafafa",
+                      borderRadius: 8,
+                      border: `1px solid ${colors.border}`,
+                      marginBottom: 16,
+                    }}
+                  >
+                    <div
+                      style={{
+                        display: "flex",
+                        alignItems: "center",
+                        justifyContent: "space-between",
+                        marginBottom: 12,
+                      }}
+                    >
+                      <Text strong>Format Compatibility</Text>
+                    </div>
+
+                    <Space wrap>
+                      <Tag
+                        color={formatInfo.is_mlx_ready ? "green" : "default"}
+                      >
+                        {formatInfo.is_mlx_ready ? (
+                          <CheckCircleOutlined />
+                        ) : null}{" "}
+                        MLX{" "}
+                        {formatInfo.is_mlx_ready ? "Ready" : "Needs Conversion"}
+                      </Tag>
+                      <Tag
+                        color={formatInfo.is_gguf_ready ? "green" : "default"}
+                      >
+                        {formatInfo.is_gguf_ready ? (
+                          <CheckCircleOutlined />
+                        ) : null}{" "}
+                        GGUF{" "}
+                        {formatInfo.is_gguf_ready
+                          ? "Ready"
+                          : "Needs Conversion"}
+                      </Tag>
+                    </Space>
+
+                    {formatInfo.mlx_variants.length > 0 &&
+                      !formatInfo.is_mlx_ready && (
+                        <div style={{ marginTop: 12 }}>
+                          <Text type="secondary" style={{ fontSize: 12 }}>
+                            MLX variants available:
+                          </Text>
+                          <div style={{ marginTop: 4 }}>
+                            {formatInfo.mlx_variants
+                              .slice(0, 3)
+                              .map((variant) => (
+                                <Tag
+                                  key={variant}
+                                  style={{ cursor: "pointer", marginBottom: 4 }}
+                                  onClick={() => handleSelectModel(variant)}
+                                >
+                                  {variant}
+                                </Tag>
+                              ))}
+                          </div>
+                        </div>
+                      )}
+
+                    {formatInfo.gguf_files.length > 0 && (
+                      <div style={{ marginTop: 12 }}>
+                        <Text type="secondary" style={{ fontSize: 12 }}>
+                          GGUF files ({formatInfo.gguf_files.length}):
+                        </Text>
+                        <div style={{ marginTop: 4 }}>
+                          {formatInfo.gguf_files.slice(0, 3).map((file) => (
+                            <Tag key={file} style={{ marginBottom: 4 }}>
+                              {file}
+                            </Tag>
+                          ))}
+                          {formatInfo.gguf_files.length > 3 && (
+                            <Tag>+{formatInfo.gguf_files.length - 3} more</Tag>
+                          )}
+                        </div>
+                      </div>
+                    )}
+
+                    {!formatInfo.is_mlx_ready && !formatInfo.is_gguf_ready && (
+                      <div style={{ marginTop: 8 }}>
+                        <Text type="secondary" style={{ fontSize: 12 }}>
+                          This model will be automatically converted when
+                          deployed with MLX or llama.cpp backend.
+                        </Text>
+                      </div>
+                    )}
+                  </div>
+                )}
+
                 {/* README Section */}
                 <Divider orientation="left">
                   <Space>
diff --git a/frontend/src/components/ModelCompatibilityCheck.tsx b/frontend/src/components/ModelCompatibilityCheck.tsx
index cd8edf8..67a7920 100644
--- a/frontend/src/components/ModelCompatibilityCheck.tsx
+++ b/frontend/src/components/ModelCompatibilityCheck.tsx
@@ -37,7 +37,7 @@ interface ModelCompatibilityCheckProps {
   precision?: string; // fp32, fp16, bf16, int8, int4
   gpuMemoryGb?: number; // Available GPU memory for compatibility check
   contextLength?: number;
-  backend?: "vllm" | "sglang" | "ollama";
+  backend?: "vllm" | "sglang" | "ollama" | "mlx" | "llama_cpp";
 }
 
 const { Text, Title } = Typography;
diff --git a/frontend/src/components/ModelFormatCompatibility.tsx b/frontend/src/components/ModelFormatCompatibility.tsx
new file mode 100644
index 0000000..d8ad540
--- /dev/null
+++ b/frontend/src/components/ModelFormatCompatibility.tsx
@@ -0,0 +1,218 @@
+/**
+ * ModelFormatCompatibility Component
+ *
+ * Displays model format compatibility information and conversion warnings
+ * for MLX and GGUF formats used by Mac native backends.
+ */
+import { useState, useEffect } from "react";
+import { Tag, Space, Alert, Tooltip, Typography } from "antd";
+import {
+  CheckCircleOutlined,
+  WarningOutlined,
+  SyncOutlined,
+  InfoCircleOutlined,
+} from "@ant-design/icons";
+import { huggingfaceApi, type ModelFormatInfo } from "../services/api";
+
+const { Text } = Typography;
+
+interface ModelFormatCompatibilityProps {
+  modelId: string;
+  backend?: "mlx" | "llama_cpp" | "vllm" | "sglang" | "ollama";
+  showDetails?: boolean;
+  compact?: boolean;
+}
+
+export default function ModelFormatCompatibility({
+  modelId,
+  backend,
+  showDetails = true,
+  compact = false,
+}: ModelFormatCompatibilityProps) {
+  const [formatInfo, setFormatInfo] = useState<ModelFormatInfo | null>(null);
+  const [loading, setLoading] = useState(false);
+
+  useEffect(() => {
+    // Drop responses that resolve after modelId changes or after unmount.
+    let cancelled = false;
+    setFormatInfo(null);
+    setLoading(Boolean(modelId));
+    if (modelId) {
+      huggingfaceApi
+        .getFormatInfo(modelId)
+        .then((info) => !cancelled && setFormatInfo(info))
+        .catch((error) => {
+          console.error("Failed to fetch format info:", error);
+        })
+        .finally(() => !cancelled && setLoading(false));
+    }
+    return () => {
+      cancelled = true;
+    };
+  }, [modelId]);
+
+  if (loading) {
+    return (
+      <Tag icon={<SyncOutlined spin />} color="processing">
+        Checking format...
+      </Tag>
+    );
+  }
+
+  if (!formatInfo) {
+    return null;
+  }
+
+  // Determine if format is compatible with selected backend
+  const isCompatible = (() => {
+    if (!backend) return true;
+    if (backend === "mlx") return formatInfo.is_mlx_ready;
+    if (backend === "llama_cpp") return formatInfo.is_gguf_ready;
+    return true; // vllm, sglang, ollama don't need format conversion
+  })();
+
+  const needsConversion = backend === "mlx" || backend === "llama_cpp";
+
+  if (compact) {
+    return (
+      <Space size={4}>
+        {formatInfo.is_mlx_ready && (
+          <Tooltip title="This model is MLX-ready (from mlx-community)">
+            <Tag color="green" style={{ margin: 0 }}>
+              MLX
+            </Tag>
+          </Tooltip>
+        )}
+        {formatInfo.is_gguf_ready && (
+          <Tooltip title="This model has GGUF files available">
+            <Tag color="blue" style={{ margin: 0 }}>
+              GGUF
+            </Tag>
+          </Tooltip>
+        )}
+        {needsConversion && !isCompatible && (
+          <Tooltip title="Model will be converted automatically">
+            <Tag color="orange" style={{ margin: 0 }}>
+              <SyncOutlined /> Convert
+            </Tag>
+          </Tooltip>
+        )}
+      </Space>
+    );
+  }
+
+  // Show warning if conversion is needed
+  if (needsConversion && !isCompatible) {
+    const conversionTarget = backend === "mlx" ? "MLX" : "GGUF";
+
+    return (
+      <Alert
+        type="warning"
+        showIcon
+        icon={<WarningOutlined />}
+        message={`Model will be converted to ${conversionTarget} format`}
+        description={
+          showDetails ? (
+            <div style={{ marginTop: 8 }}>
+              <Text type="secondary" style={{ fontSize: 12 }}>
+                This HuggingFace model is not in {conversionTarget} format. It
+                will be automatically converted on the worker before deployment.
+                This may take several minutes depending on model size.
+              </Text>
+
+              {backend === "mlx" && formatInfo.mlx_variants.length > 0 && (
+                <div style={{ marginTop: 8 }}>
+                  <Text type="secondary" style={{ fontSize: 12 }}>
+                    Tip: Consider using an existing MLX model instead:
+                  </Text>
+                  <div style={{ marginTop: 4 }}>
+                    {formatInfo.mlx_variants.slice(0, 3).map((variant) => (
+                      <Tag
+                        key={variant}
+                        style={{ marginRight: 4, marginBottom: 4 }}
+                      >
+                        {variant}
+                      </Tag>
+                    ))}
+                  </div>
+                </div>
+              )}
+
+              {backend === "llama_cpp" &&
+                formatInfo.gguf_files.length === 0 && (
+                  <div style={{ marginTop: 8 }}>
+                    <Text type="secondary" style={{ fontSize: 12 }}>
+                      Tip: Search for "{modelId.split("/").pop()}-GGUF" to find
+                      pre-converted models.
+                    </Text>
+                  </div>
+                )}
+            </div>
+          ) : null
+        }
+        style={{ marginBottom: 16 }}
+      />
+    );
+  }
+
+  // Show success if format is ready
+  if (needsConversion && isCompatible) {
+    return (
+      <Alert
+        type="success"
+        showIcon
+        icon={<CheckCircleOutlined />}
+        message={`Model is ${backend === "mlx" ? "MLX" : "GGUF"}-ready`}
+        description={
+          showDetails ? (
+            <Text type="secondary" style={{ fontSize: 12 }}>
+              {backend === "mlx"
+                ? "This model is from mlx-community and optimized for Apple Silicon."
+                : `This model has ${formatInfo.gguf_files.length} GGUF file(s) available.`}
+            </Text>
+          ) : null
+        }
+        style={{ marginBottom: 16 }}
+      />
+    );
+  }
+
+  // General format info display
+  if (showDetails) {
+    return (
+      <Alert
+        type="info"
+        showIcon
+        icon={<InfoCircleOutlined />}
+        message="Model Format Information"
+        description={
+          <Space direction="vertical" size={4} style={{ marginTop: 8 }}>
+            <div>
+              <Tag color={formatInfo.is_mlx_ready ? "green" : "default"}>
+                {formatInfo.is_mlx_ready ? <CheckCircleOutlined /> : null} MLX{" "}
+                {formatInfo.is_mlx_ready ? "Ready" : "Needs Conversion"}
+              </Tag>
+              <Tag color={formatInfo.is_gguf_ready ? "green" : "default"}>
+                {formatInfo.is_gguf_ready ? <CheckCircleOutlined /> : null} GGUF{" "}
+                {formatInfo.is_gguf_ready ? "Ready" : "Needs Conversion"}
+              </Tag>
+            </div>
+            {formatInfo.gguf_files.length > 0 && (
+              <Text type="secondary" style={{ fontSize: 12 }}>
+                {formatInfo.gguf_files.length} GGUF file(s) available
+              </Text>
+            )}
+            {formatInfo.mlx_variants.length > 0 && (
+              <Text type="secondary" style={{ fontSize: 12 }}>
+                {formatInfo.mlx_variants.length} MLX variant(s) found
+              </Text>
+            )}
+          </Space>
+        }
+        style={{ marginBottom: 16 }}
+      />
+    );
+  }
+
+  return null;
+}
diff --git a/frontend/src/components/logos/index.tsx b/frontend/src/components/logos/index.tsx
index ff49e6b..029b8b7 100644
--- a/frontend/src/components/logos/index.tsx
+++ b/frontend/src/components/logos/index.tsx
@@ -13,6 +13,8 @@ import ollamaLogoDark from "../../assets/ollama-dark.png";
 import ollamaLogoLight from "../../assets/ollama-light.png";
 import sglangLogo from "../../assets/sglang.png";
 import huggingfaceLogo from "../../assets/huggingface-2.svg";
+import mlxLogo from "../../assets/mlx-logo.png";
+import mlxLogoDark from "../../assets/mlx-logo-dark.png";
 
 // =============================================================================
 // Props Types
@@ -74,54 +76,47 @@ export function HuggingFaceLogo({
 /**
  * MLX Logo - Apple's ML framework for Apple Silicon
  */
-export function MLXLogo({ height = 16, style }: Omit<LogoProps, "isDark">) {
-  // Use Apple-style gradient colors
-  const gradientId = `mlx-gradient-${Math.random().toString(36).substr(2, 9)}`;
+export function MLXLogo({ height = 16, isDark = false, style }: LogoProps) {
   return (
-    <svg width={height * 1.5} height={height} viewBox="0 0 36 24" style={style}>
-      <defs>
-        <linearGradient id={gradientId} x1="0%" y1="0%" x2="100%" y2="100%">
-          <stop offset="0%" stopColor="#FF6B6B" />
-          <stop offset="50%" stopColor="#9B59B6" />
-          <stop offset="100%" stopColor="#3498DB" />
-        </linearGradient>
-      </defs>
-      <text
-        x="18"
-        y="17"
-        fontSize="14"
-        fontWeight="700"
-        fontFamily="SF Pro Display, -apple-system, BlinkMacSystemFont, sans-serif"
-        textAnchor="middle"
-        fill={`url(#${gradientId})`}
-      >
-        MLX
-      </text>
-    </svg>
+    <img
+      src={isDark ? mlxLogoDark : mlxLogo}
+      alt="MLX"
+      style={{ height, width: "auto", objectFit: "contain", ...style }}
+    />
   );
 }
 
 /**
- * Llama.cpp Logo
+ * Llama.cpp Logo - High-performance LLM inference
+ * "llama" is white when isDark, #1b1f20 otherwise; ".cpp" is always orange
  */
 export function LlamaCppLogo({
   height = 16,
   isDark = false,
   style,
 }: LogoProps) {
-  const textColor = isDark ? "#ffffff" : "#333333";
+  const textColor = isDark ? "#ffffff" : "#1b1f20";
   return (
-    <svg width={height * 3} height={height} viewBox="0 0 72 24" style={style}>
+    <svg width={height * 4.5} height={height} viewBox="0 0 90 20" style={style}>
       <text
-        x="36"
-        y="17"
-        fontSize="12"
-        fontWeight="600"
-        fontFamily="Menlo, Monaco, monospace"
-        textAnchor="middle"
+        x="0"
+        y="15"
+        fontSize="14"
+        fontWeight="700"
+        fontFamily="system-ui, -apple-system, sans-serif"
         fill={textColor}
       >
-        llama.cpp
+        llama
+      </text>
+      <text
+        x="44"
+        y="15"
+        fontSize="14"
+        fontWeight="700"
+        fontFamily="system-ui, -apple-system, sans-serif"
+        fill="#ff8236"
+      >
+        .cpp
       </text>
     </svg>
   );
@@ -188,7 +183,7 @@ export function getBackendConfig(
     mlx: {
       label: "MLX",
       color: tagColor,
-      icon: <MLXLogo height={16} />,
+      icon: <MLXLogo height={16} isDark={isDark} />,
     },
     llama_cpp: {
       label: "llama.cpp",
diff --git a/frontend/src/pages/Deployments.tsx b/frontend/src/pages/Deployments.tsx
index 922e529..e77be8f 100644
--- a/frontend/src/pages/Deployments.tsx
+++ b/frontend/src/pages/Deployments.tsx
@@ -43,6 +43,7 @@ import { useResponsive } from "../hooks";
 import { useAuth } from "../contexts/AuthContext";
 import DeploymentAdvancedForm from "../components/DeploymentAdvancedForm";
 import ModelCompatibilityCheck from "../components/ModelCompatibilityCheck";
+import ModelFormatCompatibility from "../components/ModelFormatCompatibility";
 import backendVersionsData from "../constants/backendVersions.json";
 import dayjs from "dayjs";
 import utc from "dayjs/plugin/utc";
@@ -86,7 +87,7 @@ export default function Deployments() {
   const [selectedWorkerId, setSelectedWorkerId] = useState<number | null>(null);
   const [selectedGpuIndexes, setSelectedGpuIndexes] = useState<number[]>([]);
   const [selectedBackend, setSelectedBackend] = useState<
-    "vllm" | "sglang" | "ollama"
+    "vllm" | "sglang" | "ollama" | "mlx" | "llama_cpp"
   >("vllm");
   const [editingDeployment, setEditingDeployment] = useState<Deployment | null>(
     null,
@@ -101,6 +102,11 @@ export default function Deployments() {
   // Get the selected worker's GPU info
   const selectedWorker = workers.find((w) => w.id === selectedWorkerId);
 
+  // Name-based hints only: "mlx-community/" prefix = MLX, "gguf" in the id = GGUF
+  const isMLXReady = (modelId: string) => modelId.startsWith("mlx-community/");
+  const isGGUFReady = (modelId: string) =>
+    modelId.toLowerCase().includes("gguf");
+
   // Determine available backends based on model source and worker capabilities
   const availableBackends = (() => {
     // Start with model-based restrictions
@@ -108,29 +114,19 @@ export default function Deployments() {
       return ["ollama"] as const;
     }
 
-    // If no worker selected, show all HuggingFace-compatible backends
+    // If no worker selected, show all possible backends for HuggingFace models
     if (!selectedWorker) {
-      return ["vllm", "sglang"] as const;
+      return ["vllm", "sglang", "mlx", "llama_cpp"] as const;
     }
 
-    // macOS workers only support Ollama (vLLM/SGLang require NVIDIA GPU)
+    // macOS workers support vLLM-Metal, MLX, and llama.cpp for HuggingFace models
+    // Ollama is NOT shown for HF models since Ollama can't use HF models directly
     if (selectedWorker.os_type === "darwin") {
-      return ["ollama"] as const;
-    }
-
-    // Use worker's available_backends if provided
-    if (
-      selectedWorker.available_backends &&
-      selectedWorker.available_backends.length > 0
-    ) {
-      // Filter for HuggingFace-compatible backends from worker's list
-      const hfBackends = selectedWorker.available_backends.filter((b) =>
-        ["vllm", "sglang", "ollama"].includes(b),
-      );
-      return hfBackends.length > 0 ? hfBackends : (["vllm", "sglang"] as const);
+      return ["vllm", "mlx", "llama_cpp"] as const;
     }
 
-    // Default fallback for Linux workers
+    // Linux workers: only vLLM and SGLang for HuggingFace models
+    // Ollama can't use HF models directly, so don't show it
     return ["vllm", "sglang"] as const;
   })();
   const workerGpus = selectedWorker?.gpu_info || [];
@@ -829,16 +825,54 @@ export default function Deployments() {
                   );
                 const sourceLabel =
                   m.source === "ollama" ? "Ollama" : "HuggingFace";
+                const mlxReady =
+                  m.source !== "ollama" && isMLXReady(m.model_id);
+                const ggufReady =
+                  m.source !== "ollama" && isGGUFReady(m.model_id);
                 return {
                   label: (
                     <span
                       style={{ display: "flex", alignItems: "center", gap: 6 }}
                     >
-                      <Tag style={{ ...getTagStyle("small"), margin: 0 }}>
-                        {sourceIcon}
+                      <Tag
+                        style={{
+                          ...getTagStyle("small"),
+                          margin: 0,
+                          display: "inline-flex",
+                          alignItems: "center",
+                          gap: 4,
+                          width: 90,
+                        }}
+                      >
+                        <span
+                          style={{
+                            width: 16,
+                            display: "inline-flex",
+                            alignItems: "center",
+                            justifyContent: "center",
+                          }}
+                        >
+                          {sourceIcon}
+                        </span>
                         {sourceLabel}
                       </Tag>
                       {m.name}
+                      {mlxReady && (
+                        <Tag
+                          color="green"
+                          style={{ fontSize: 10, margin: 0, padding: "0 4px" }}
+                        >
+                          MLX
+                        </Tag>
+                      )}
+                      {ggufReady && (
+                        <Tag
+                          color="blue"
+                          style={{ fontSize: 10, margin: 0, padding: "0 4px" }}
+                        >
+                          GGUF
+                        </Tag>
+                      )}
                     </span>
                   ),
                   value: m.id,
@@ -847,40 +881,6 @@ export default function Deployments() {
             />
           </Form.Item>
 
-          <Form.Item
-            name="backend"
-            label="Inference Backend"
-            rules={[{ required: true, message: "Please select a backend" }]}
-            extra={
-              selectedModel?.source === "ollama"
-                ? "Ollama models can only use Ollama backend"
-                : selectedWorker?.os_type === "darwin"
-                  ? "macOS workers only support Ollama backend"
-                  : "HuggingFace models can use vLLM or SGLang"
-            }
-          >
-            <Select
-              placeholder="Select a backend"
-              disabled={!selectedModelId}
-              value={selectedBackend}
-              onChange={(value) => setSelectedBackend(value)}
-              options={availableBackends.map((b) => {
-                const config = BACKEND_CONFIG[b];
-                return {
-                  label: (
-                    <span
-                      style={{ display: "flex", alignItems: "center", gap: 8 }}
-                    >
-                      {config.icon}
-                      {config.label}
-                    </span>
-                  ),
-                  value: b,
-                };
-              })}
-            />
-          </Form.Item>
-
           <Form.Item
             name="worker_id"
             label="Worker"
@@ -893,6 +893,25 @@ export default function Deployments() {
                 // Reset GPU selection when worker changes
                 setSelectedGpuIndexes([]);
                 form.setFieldValue("gpu_indexes", undefined);
+                // Check if current backend is available on the new worker
+                const newWorker = workers.find((w) => w.id === value);
+                const isMac = newWorker?.os_type === "darwin";
+                const macBackends = ["ollama", "mlx", "llama_cpp", "vllm"];
+                const linuxBackends = ["vllm", "sglang", "ollama"];
+                const newAvailable = isMac ? macBackends : linuxBackends;
+                // Reset to first available backend if current is not available
+                if (!newAvailable.includes(selectedBackend)) {
+                  const defaultBackend = isMac ? "vllm" : "vllm";
+                  setSelectedBackend(
+                    defaultBackend as
+                      | "vllm"
+                      | "sglang"
+                      | "ollama"
+                      | "mlx"
+                      | "llama_cpp",
+                  );
+                  form.setFieldValue("backend", defaultBackend);
+                }
               }}
               options={workers.map((w) => ({
                 label: (
@@ -906,12 +925,17 @@ export default function Deployments() {
                     <span>
                       {w.name} ({w.address})
                     </span>
-                    {w.gpu_info && w.gpu_info.length > 0 && (
-                      <Tag color="blue" style={{ marginLeft: 8 }}>
-                        {w.gpu_info.length} GPU
-                        {w.gpu_info.length > 1 ? "s" : ""}
-                      </Tag>
-                    )}
+                    <span style={{ display: "flex", gap: 4 }}>
+                      {w.os_type === "darwin" && (
+                        <Tag color="purple">macOS</Tag>
+                      )}
+                      {w.gpu_info && w.gpu_info.length > 0 && (
+                        <Tag color="blue">
+                          {w.gpu_info.length} GPU
+                          {w.gpu_info.length > 1 ? "s" : ""}
+                        </Tag>
+                      )}
+                    </span>
                   </span>
                 ),
                 value: w.id,
@@ -919,9 +943,62 @@ export default function Deployments() {
             />
           </Form.Item>
 
-          {/* macOS Ollama Warning */}
+          <Form.Item
+            name="backend"
+            label="Inference Backend"
+            rules={[{ required: true, message: "Please select a backend" }]}
+            extra={
+              !selectedWorker
+                ? "Select a worker first"
+                : selectedModel?.source === "ollama"
+                  ? "Ollama models can only use Ollama backend"
+                  : selectedWorker?.os_type === "darwin"
+                    ? "macOS workers support vLLM-Metal, MLX, and llama.cpp with Apple Silicon acceleration"
+                    : "HuggingFace models can use vLLM or SGLang"
+            }
+          >
+            <Select
+              placeholder={
+                selectedWorker ? "Select a backend" : "Select a worker first"
+              }
+              disabled={!selectedModelId || !selectedWorkerId}
+              value={selectedBackend}
+              onChange={(value) => setSelectedBackend(value)}
+              options={availableBackends.map((b) => {
+                const config = BACKEND_CONFIG[b];
+                // Show "vLLM-Metal" for vllm on Mac workers
+                const label =
+                  b === "vllm" && selectedWorker?.os_type === "darwin"
+                    ? "vLLM-Metal"
+                    : config.label;
+                return {
+                  label: (
+                    <span
+                      style={{ display: "flex", alignItems: "center", gap: 8 }}
+                    >
+                      <span
+                        style={{
+                          width: 50,
+                          display: "flex",
+                          alignItems: "center",
+                          justifyContent: "center",
+                        }}
+                      >
+                        {config.icon}
+                      </span>
+                      {label}
+                    </span>
+                  ),
+                  value: b,
+                };
+              })}
+            />
+          </Form.Item>
+
+          {/* macOS Ollama Warning - only show when Ollama backend is selected */}
           {selectedWorker &&
             selectedWorker.os_type === "darwin" &&
+            selectedBackend === "ollama" &&
             !selectedWorker.capabilities?.ollama && (
               <Alert
                 message="Ollama Not Installed"
@@ -955,9 +1032,10 @@ export default function Deployments() {
               />
             )}
 
-          {/* macOS Ollama Not Running Warning */}
+          {/* macOS Ollama Not Running Warning - only show when Ollama backend is selected */}
           {selectedWorker &&
             selectedWorker.os_type === "darwin" &&
+            selectedBackend === "ollama" &&
             selectedWorker.capabilities?.ollama &&
             !selectedWorker.capabilities?.ollama_running && (
               <Alert
@@ -986,13 +1064,87 @@ export default function Deployments() {
               />
             )}
 
+          {/* macOS Backend Info - show auto-install message */}
+          {selectedWorker &&
+            selectedWorker.os_type === "darwin" &&
+            selectedBackend === "vllm" && (
+              <Alert
+                message="vLLM-Metal"
+                description={
+                  <span style={{ fontSize: 12 }}>
+                    Uses Apple Silicon GPU acceleration. Will be automatically
+                    installed on first deployment.
+                  </span>
+                }
+                type="info"
+                showIcon
+                style={{ marginBottom: 16 }}
+              />
+            )}
+          {selectedWorker &&
+            selectedWorker.os_type === "darwin" &&
+            selectedBackend === "mlx" && (
+              <Alert
+                message="MLX-LM"
+                description={
+                  <span style={{ fontSize: 12 }}>
+                    Native Apple Silicon ML framework. Will be automatically
+                    installed on first deployment.
+                  </span>
+                }
+                type="info"
+                showIcon
+                style={{ marginBottom: 16 }}
+              />
+            )}
+          {selectedWorker &&
+            selectedWorker.os_type === "darwin" &&
+            selectedBackend === "llama_cpp" && (
+              <Alert
+                message="llama.cpp"
+                description={
+                  <span style={{ fontSize: 12 }}>
+                    High-performance inference with Metal acceleration. Will be
+                    automatically installed via Homebrew on first deployment.
+                  </span>
+                }
+                type="info"
+                showIcon
+                style={{ marginBottom: 16 }}
+              />
+            )}
+
           {/* macOS Info */}
           {selectedWorker &&
             selectedWorker.os_type === "darwin" &&
             selectedWorker.capabilities?.ollama_running && (
               <Alert
-                message="macOS Worker"
-                description="This worker uses native Ollama with Metal GPU acceleration. Only Ollama backend is available."
+                message="macOS Worker with Apple Silicon"
+                description={
+                  <div>
+                    <p style={{ margin: "4px 0" }}>
+                      This worker supports native Apple Silicon backends:
+                    </p>
+                    <ul style={{ margin: "4px 0", paddingLeft: 20 }}>
+                      <li>
+                        <strong>Ollama</strong> - Easiest, pull and run models
+                        directly
+                      </li>
+                      <li>
+                        <strong>MLX</strong> - Apple's ML framework, optimized
+                        for Apple Silicon
+                      </li>
+                      <li>
+                        <strong>llama.cpp</strong> - Cross-platform with Metal
+                        acceleration
+                      </li>
+                    </ul>
+                    <p style={{ margin: "4px 0", fontSize: 12, color: "#666" }}>
+                      For MLX/llama.cpp, HuggingFace models will be
+                      automatically converted if needed.
+                    </p>
+                  </div>
+                }
                 type="info"
                 showIcon
                 style={{ marginBottom: 16 }}
@@ -1054,63 +1206,78 @@ export default function Deployments() {
           </Form.Item>
 
           {/* Model Compatibility Check - Show when model is selected for vLLM/SGLang */}
-          {selectedModel && selectedModel.source !== "ollama" && (
-            <ModelCompatibilityCheck
-              modelId={selectedModel.model_id}
-              backend={selectedBackend}
-              gpuMemoryGb={selectedGpuMemoryGb}
-              precision="fp16"
-            />
-          )}
+          {selectedModel &&
+            selectedModel.source !== "ollama" &&
+            !["mlx", "llama_cpp"].includes(selectedBackend) && (
+              <ModelCompatibilityCheck
+                modelId={selectedModel.model_id}
+                backend={selectedBackend}
+                gpuMemoryGb={selectedGpuMemoryGb}
+                precision="fp16"
+              />
+            )}
 
-          {/* Version Override - Show when model is selected */}
-          {selectedModelId && (
-            <Form.Item
-              name={["extra_params", "docker_image"]}
-              label={`${BACKEND_CONFIG[selectedBackend]?.label || "Backend"} Version`}
-              extra="Override the model's default backend version for this deployment"
-            >
-              <Select
-                placeholder="Use model default"
-                allowClear
-                showSearch
-                options={(
-                  (
-                    backendVersionsData as Record<
-                      string,
-                      {
-                        versions: Array<{
-                          version: string;
-                          image: string;
-                          recommended?: boolean;
-                        }>;
-                      }
-                    >
-                  )[selectedBackend]?.versions || []
-                ).map((v) => ({
-                  label: (
-                    <span>
-                      {v.version}
-                      {v.recommended && (
-                        <Tag
-                          color="green"
-                          style={{ marginLeft: 8, fontSize: 10 }}
-                        >
-                          Recommended
-                        </Tag>
-                      )}
-                    </span>
-                  ),
-                  value: v.image,
-                }))}
+          {/* Model Format Compatibility - Show for MLX/llama.cpp backends */}
+          {selectedModel &&
+            selectedModel.source !== "ollama" &&
+            ["mlx", "llama_cpp"].includes(selectedBackend) && (
+              <ModelFormatCompatibility
+                modelId={selectedModel.model_id}
+                backend={selectedBackend as "mlx" | "llama_cpp"}
+                showDetails={true}
               />
-            </Form.Item>
-          )}
+            )}
 
-          {/* Advanced Parameters - Show when model is selected */}
-          {selectedModelId && (
-            <DeploymentAdvancedForm backend={selectedBackend} form={form} />
-          )}
+          {/* Version Override - Show when model is selected (not for MLX/llama.cpp) */}
+          {selectedModelId &&
+            !["mlx", "llama_cpp"].includes(selectedBackend) && (
+              <Form.Item
+                name={["extra_params", "docker_image"]}
+                label={`${BACKEND_CONFIG[selectedBackend]?.label || "Backend"} Version`}
+                extra="Override the model's default backend version for this deployment"
+              >
+                <Select
+                  placeholder="Use model default"
+                  allowClear
+                  showSearch
+                  options={(
+                    (
+                      backendVersionsData as Record<
+                        string,
+                        {
+                          versions: Array<{
+                            version: string;
+                            image: string;
+                            recommended?: boolean;
+                          }>;
+                        }
+                      >
+                    )[selectedBackend]?.versions || []
+                  ).map((v) => ({
+                    label: (
+                      <span>
+                        {v.version}
+                        {v.recommended && (
+                          <Tag
+                            color="green"
+                            style={{ marginLeft: 8, fontSize: 10 }}
+                          >
+                            Recommended
+                          </Tag>
+                        )}
+                      </span>
+                    ),
+                    value: v.image,
+                  }))}
+                />
+              </Form.Item>
+            )}
+
+          {/* Advanced Parameters - Show when model is selected (not for MLX/llama.cpp) */}
+          {selectedModelId &&
+            !["mlx", "llama_cpp"].includes(selectedBackend) && (
+              <DeploymentAdvancedForm backend={selectedBackend} form={form} />
+            )}
 
           <Form.Item>
             <Space>
diff --git a/worker/agent.py b/worker/agent.py
index c493164..2b9b5b7 100644
--- a/worker/agent.py
+++ b/worker/agent.py
@@ -25,12 +25,14 @@
     from docker_ops import ContainerManager, DockerRunner, GPUDetector, ImageManager, SystemDetector
     from routes import (
         containers_router,
+        converter_router,
         deployment_router,
         images_router,
         native_router,
         storage_router,
     )
     from routes.containers import set_agent as set_containers_agent
+    from routes.converter import set_agent as set_converter_agent
     from routes.deployment import set_agent as set_deployment_agent
     from routes.images import set_agent as set_images_agent
     from routes.native import set_agent as set_native_agent
@@ -46,12 +48,14 @@
     )
     from worker.routes import (
         containers_router,
+        converter_router,
         deployment_router,
         images_router,
         native_router,
         storage_router,
     )
     from worker.routes.containers import set_agent as set_containers_agent
+    from worker.routes.converter import set_agent as set_converter_agent
     from worker.routes.deployment import set_agent as set_deployment_agent
     from worker.routes.images import set_agent as set_images_agent
     from worker.routes.native import set_agent as set_native_agent
@@ -127,6 +131,12 @@ def _is_local_worker(self) -> bool:
     async def register(self) -> bool:
         """Register this worker with the server."""
         try:
+            # For Mac workers, ensure Ollama is running with external access
+            if self.os_type == "darwin" and self.native_manager:
+                if self.capabilities.get("ollama"):
+                    logger.info("Ensuring Ollama service is running with external access...")
+                    await self.native_manager.ensure_ollama_running()
+
             gpu_info = self.gpu_detector.detect()
             system_info = self.system_detector.detect()
 
@@ -296,6 +306,7 @@ def _set_agent_references(worker_agent: WorkerAgent):
     set_containers_agent(worker_agent)
     set_storage_agent(worker_agent)
     set_native_agent(worker_agent)
+    set_converter_agent(worker_agent)
 
 
 @asynccontextmanager
@@ -330,6 +341,7 @@ async def lifespan(app: FastAPI):
 app.include_router(containers_router)
 app.include_router(storage_router)
 app.include_router(native_router)
+app.include_router(converter_router)
 
 
 @app.get("/health")
diff --git a/worker/docker_ops/gpu.py b/worker/docker_ops/gpu.py
index f37cc4e..2f529fb 100644
--- a/worker/docker_ops/gpu.py
+++ b/worker/docker_ops/gpu.py
@@ -1,18 +1,24 @@
 """GPU detection for LMStack Worker.
 
-Provides NVIDIA GPU detection using pynvml (nvidia-ml-py).
+Provides GPU detection for NVIDIA (using pynvml) and Apple Silicon.
 """
 
+import json
 import logging
+import platform
+import subprocess
 
 logger = logging.getLogger(__name__)
 
 
 class GPUDetector:
-    """Detect and report GPU information using pynvml (nvidia-ml-py)."""
+    """Detect and report GPU information.
+
+    Supports NVIDIA GPUs (via pynvml) and Apple Silicon (via system_profiler).
+    """
 
     def detect(self) -> list[dict]:
-        """Detect available GPUs with temperature using pynvml.
+        """Detect available GPUs.
 
         Returns:
             List of GPU information dictionaries with:
@@ -24,6 +30,89 @@ def detect(self) -> list[dict]:
             - utilization: GPU utilization percentage
             - temperature: GPU temperature in Celsius
         """
+        # Check platform
+        if platform.system() == "Darwin":
+            return self._detect_apple_silicon()
+        else:
+            return self._detect_nvidia()
+
+    def _detect_apple_silicon(self) -> list[dict]:
+        """Detect Apple Silicon GPU information.
+
+        Queries ``system_profiler SPDisplaysDataType -json`` and reports one
+        entry per item it lists. Memory figures are system-wide values
+        (unified memory), read once and shared by every entry; utilization
+        and temperature are always reported as 0.
+
+        Returns:
+            List of GPU dicts with index, name, memory_total, memory_used,
+            memory_free, utilization and temperature; empty on non-arm64
+            machines or when detection fails.
+        """
+        try:
+            # Intel Mac - no GPU info to report
+            if platform.machine() != "arm64":
+                return []
+
+            result = subprocess.run(
+                ["system_profiler", "SPDisplaysDataType", "-json"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            if result.returncode != 0:
+                logger.warning("system_profiler failed")
+                return []
+
+            displays = json.loads(result.stdout).get("SPDisplaysDataType", [])
+            if not displays:
+                return []
+
+            # Unified memory is the same for every entry, so probe it once
+            # instead of once per display.
+            try:
+                import psutil
+
+                mem = psutil.virtual_memory()
+                memory_total = mem.total
+                memory_used = mem.used
+                memory_free = mem.available
+            except ImportError:
+                # Fallback: total memory from sysctl; usage is unknown
+                try:
+                    mem_result = subprocess.run(
+                        ["sysctl", "-n", "hw.memsize"],
+                        capture_output=True,
+                        text=True,
+                        timeout=5,
+                    )
+                    memory_total = int(mem_result.stdout.strip())
+                    memory_used = 0
+                    memory_free = memory_total
+                except Exception:
+                    memory_total = 0
+                    memory_used = 0
+                    memory_free = 0
+
+            return [
+                {
+                    "index": idx,
+                    "name": display.get("sppci_model", "Apple Silicon GPU"),
+                    "memory_total": memory_total,
+                    "memory_used": memory_used,
+                    "memory_free": memory_free,
+                    "utilization": 0,  # Would need powermetrics (requires sudo)
+                    "temperature": 0,  # Would need powermetrics (requires sudo)
+                }
+                for idx, display in enumerate(displays)
+            ]
+
+        except Exception as e:
+            logger.warning(f"Apple Silicon GPU detection failed: {e}")
+            return []
+
+    def _detect_nvidia(self) -> list[dict]:
+        """Detect NVIDIA GPUs using pynvml."""
         try:
             import pynvml
 
@@ -72,8 +161,8 @@ def detect(self) -> list[dict]:
             return gpus
 
         except ImportError:
-            logger.error("pynvml (nvidia-ml-py) not installed")
+            logger.debug("pynvml (nvidia-ml-py) not installed")
             return []
         except Exception as e:
-            logger.warning(f"GPU detection failed: {e}")
+            logger.warning(f"NVIDIA GPU detection failed: {e}")
             return []
diff --git a/worker/native_ops/__init__.py b/worker/native_ops/__init__.py
index 2acb027..f796cc9 100644
--- a/worker/native_ops/__init__.py
+++ b/worker/native_ops/__init__.py
@@ -1,7 +1,8 @@
 """Native process operations for Mac workers without Docker."""
 
+from .converter import ModelConverter
 from .mlx import MLXManager
 from .ollama import OllamaManager
 from .process_manager import NativeProcessManager
 
-__all__ = ["NativeProcessManager", "OllamaManager", "MLXManager"]
+__all__ = ["NativeProcessManager", "OllamaManager", "MLXManager", "ModelConverter"]
diff --git a/worker/native_ops/converter.py b/worker/native_ops/converter.py
new file mode 100644
index 0000000..a4db5af
--- /dev/null
+++ b/worker/native_ops/converter.py
@@ -0,0 +1,545 @@
+"""Model format converter for MLX and GGUF formats.
+
+This module handles converting HuggingFace models to formats
+compatible with MLX and llama.cpp backends.
+"""
+
+import asyncio
+import logging
+import shutil
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Default cache directory
+DEFAULT_CACHE_DIR = Path.home() / ".lmstack" / "converted_models"
+
+
+@dataclass
+class ConversionTask:
+    """Progress record for one model conversion, as tracked by ModelConverter."""
+
+    task_id: str  # "<format>-<org>--<name>", built by the converter methods
+    hf_model_id: str  # Source HuggingFace model ID
+    target_format: str  # "mlx" or "gguf"
+    status: str  # "pending", "running", "completed", "failed"
+    progress: float  # 0.0 to 1.0
+    message: str  # Human-readable description of the current step
+    output_path: Optional[str] = None  # Set when the conversion completes
+    error: Optional[str] = None  # str(exception) when the conversion failed
+    started_at: Optional[datetime] = None  # Naive local time (datetime.now())
+    completed_at: Optional[datetime] = None  # Set on successful completion
+
+
+class ModelConverter:
+    """Model format converter for MLX and GGUF formats.
+
+    Handles:
+    - Converting HuggingFace models to MLX format
+    - Converting HuggingFace models to GGUF format
+    - Caching converted models
+    - Checking if models are already in compatible formats
+    """
+
+    def __init__(self, cache_dir: Optional[Path] = None):
+        self.cache_dir = cache_dir or DEFAULT_CACHE_DIR  # falsy cache_dir -> default location
+        self.cache_dir.mkdir(parents=True, exist_ok=True)  # side effect: creates the directory
+        self._tasks: dict[str, ConversionTask] = {}  # task_id -> task, kept in memory only
+
+    def get_mlx_cache_path(self, hf_model_id: str) -> Path:
+        """Return the directory where the MLX conversion of *hf_model_id* is cached."""
+        mlx_root = self.cache_dir / "mlx"
+        return mlx_root / hf_model_id.replace("/", "--")
+
+    def get_gguf_cache_path(self, hf_model_id: str, quant_type: str = "q8_0") -> Path:
+        """Return the cached GGUF file path for *hf_model_id* at *quant_type*."""
+        flattened = hf_model_id.replace("/", "--")
+        return self.cache_dir / "gguf" / f"{flattened}-{quant_type}.gguf"
+
+    def get_cached_model(self, hf_model_id: str, format: str) -> Optional[str]:
+        """Look up an already-converted copy of a model in the local cache.
+
+        Args:
+            hf_model_id: HuggingFace model ID
+            format: Target format ("mlx" or "gguf")
+
+        Returns:
+            Cached path as a string, or None when nothing usable is cached
+            (or *format* is not recognised)
+        """
+        if format == "mlx":
+            # A usable MLX conversion is a directory containing config.json
+            mlx_dir = self.get_mlx_cache_path(hf_model_id)
+            has_config = mlx_dir.exists() and (mlx_dir / "config.json").exists()
+            return str(mlx_dir) if has_config else None
+        if format != "gguf":
+            return None
+        # First hit wins, probing quantizations in this fixed order
+        quants = ["q8_0", "q4_k_m", "q4_0", "f16"]
+        candidates = (self.get_gguf_cache_path(hf_model_id, q) for q in quants)
+        return next((str(p) for p in candidates if p.exists()), None)
+
+    @staticmethod
+    def is_mlx_ready(model_id: str) -> bool:
+        """Tell whether *model_id* lives under the mlx-community organization.
+
+        Args:
+            model_id: HuggingFace model ID
+
+        Returns:
+            True when the ID begins with "mlx-community/" (already MLX format)
+        """
+        return model_id.startswith("mlx-community/")
+
+    @staticmethod
+    def is_gguf_ready(model_id: str, files: Optional[list[str]] = None) -> bool:
+        """Tell whether a repository appears to ship GGUF weights.
+
+        Args:
+            model_id: HuggingFace model ID
+            files: Repository file names; if empty/None the model ID is checked instead
+
+        Returns:
+            True if a listed file ends in ".gguf", else if the ID contains "gguf"
+        """
+        if not files:
+            # "-gguf" / "_gguf" are already covered by the plain "gguf" substring
+            return "gguf" in model_id.lower()
+        return any(name.endswith(".gguf") for name in files)
+
+    async def download_gguf_model(self, hf_model_id: str) -> str:
+        """Download a GGUF model from HuggingFace.
+
+        Picks the first repo file whose name contains "q8_0" (any case),
+        falling back to the first .gguf file listed, and stores it under
+        ``<cache_dir>/gguf/<org>--<name>``. The blocking huggingface_hub
+        calls run in a worker thread so the event loop stays responsive.
+
+        Args:
+            hf_model_id: HuggingFace model ID (e.g., "hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF")
+
+        Returns:
+            Path to downloaded GGUF file
+
+        Raises:
+            RuntimeError: If huggingface_hub is missing, the repo has no
+                .gguf files, or the listing/download fails
+        """
+        try:
+            from huggingface_hub import hf_hub_download, list_repo_files
+        except ImportError:
+            raise RuntimeError(
+                "huggingface_hub is required. Install with: pip install huggingface_hub"
+            )
+
+        # Create cache directory for downloaded models
+        cache_dir = self.cache_dir / "gguf"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+
+        try:
+            files = await asyncio.to_thread(list_repo_files, hf_model_id)
+            gguf_files = [f for f in files if f.endswith(".gguf")]
+
+            if not gguf_files:
+                raise RuntimeError(f"No .gguf files found in {hf_model_id}")
+
+            # Prefer a Q8_0 quantization, otherwise take the first file listed
+            gguf_file = next((f for f in gguf_files if "q8_0" in f.lower()), gguf_files[0])
+
+            logger.info(f"Downloading {gguf_file} from {hf_model_id}...")
+            local_path = await asyncio.to_thread(
+                hf_hub_download,
+                repo_id=hf_model_id,
+                filename=gguf_file,
+                cache_dir=str(cache_dir),
+                local_dir=str(cache_dir / hf_model_id.replace("/", "--")),
+                local_dir_use_symlinks=False,
+            )
+
+            logger.info(f"Downloaded GGUF model to {local_path}")
+            return local_path
+
+        except Exception as e:
+            logger.error(f"Failed to download GGUF model: {e}")
+            raise RuntimeError(f"Failed to download GGUF model from {hf_model_id}: {e}") from e
+
+    @staticmethod
+    def find_mlx_variant(hf_model_id: str) -> Optional[str]:
+        """Guess the mlx-community repo ID for a HuggingFace model.
+
+        No lookup is performed: the result is simply the model's base name
+        placed under the "mlx-community/" organization, and it is not
+        verified to exist.
+
+        Example:
+            "meta-llama/Llama-3.2-1B" -> "mlx-community/Llama-3.2-1B"
+
+        Args:
+            hf_model_id: Original HuggingFace model ID
+
+        Returns:
+            Candidate MLX model ID (never None in the current implementation)
+        """
+        # NOTE(review): the "-mlx", "-4bit" and "-8bit" suffixed names were
+        # listed as candidates but never tried; only the plain name is used.
+        _, _, base_name = hf_model_id.rpartition("/")
+        return f"mlx-community/{base_name}"
+
+    def get_task(self, task_id: str) -> Optional[ConversionTask]:
+        """Return the conversion task registered under *task_id*, or None if unknown."""
+        return self._tasks[task_id] if task_id in self._tasks else None
+
+    def list_tasks(self) -> list[ConversionTask]:
+        """Return a new list holding every tracked conversion task."""
+        return [*self._tasks.values()]
+
+    async def convert_to_mlx(
+        self,
+        hf_model_id: str,
+        quantize: bool = True,
+        bits: int = 4,
+    ) -> str:
+        """Convert a HuggingFace model to MLX format.
+
+        Runs the ``mlx_lm.convert`` executable when it is on PATH, otherwise
+        ``python3 -m mlx_lm.convert``, and records progress in a
+        ConversionTask keyed ``mlx-<org>--<name>``. If the conversion fails,
+        the partially written output directory is removed before re-raising.
+
+        Args:
+            hf_model_id: HuggingFace model ID
+            quantize: Whether to quantize the model
+            bits: Quantization bits (4 or 8)
+
+        Returns:
+            Path to converted model, or *hf_model_id* itself when the model
+            already comes from mlx-community
+
+        Raises:
+            RuntimeError: If the converter exits with a non-zero status; other
+                exceptions (e.g. failure to launch it) propagate unchanged
+        """
+        # Check if already cached
+        cached = self.get_cached_model(hf_model_id, "mlx")
+        if cached:
+            logger.info(f"Using cached MLX model: {cached}")
+            return cached
+
+        # Check if already MLX format
+        if self.is_mlx_ready(hf_model_id):
+            logger.info(f"Model {hf_model_id} is already MLX format")
+            return hf_model_id
+
+        output_path = self.get_mlx_cache_path(hf_model_id)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Create task
+        task_id = f"mlx-{hf_model_id.replace('/', '--')}"
+        task = ConversionTask(
+            task_id=task_id,
+            hf_model_id=hf_model_id,
+            target_format="mlx",
+            status="running",
+            progress=0.0,
+            message="Starting MLX conversion...",
+            started_at=datetime.now(),
+        )
+        self._tasks[task_id] = task
+
+        try:
+            # Prefer the installed entry point, else run it as a python module
+            mlx_convert = shutil.which("mlx_lm.convert")
+            launcher = [mlx_convert] if mlx_convert else ["python3", "-m", "mlx_lm.convert"]
+            cmd = [
+                *launcher,
+                "--hf-path",
+                hf_model_id,
+                "--mlx-path",
+                str(output_path),
+            ]
+
+            if quantize:
+                cmd.extend(["-q", "--q-bits", str(bits)])
+
+            task.progress = 0.1
+            task.message = f"Converting {hf_model_id} to MLX format..."
+            logger.info(f"Running: {' '.join(cmd)}")
+
+            # Run conversion
+            process = await asyncio.create_subprocess_exec(
+                *cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.STDOUT,
+            )
+
+            # communicate() reads the merged stdout/stderr until the process exits
+            task.progress = 0.5
+            stdout, _ = await process.communicate()
+
+            if process.returncode != 0:
+                # Converter output is not guaranteed to be valid UTF-8
+                error_msg = stdout.decode(errors="replace") if stdout else "Unknown error"
+                raise RuntimeError(f"MLX conversion failed: {error_msg}")
+
+            # Success: publish the result on the task record
+            task.progress = 1.0
+            task.status = "completed"
+            task.message = "Conversion completed"
+            task.output_path = str(output_path)
+            task.completed_at = datetime.now()
+
+            logger.info(f"MLX conversion completed: {output_path}")
+            return str(output_path)
+
+        except Exception as e:
+            # Drop partial output so a half-written directory is never
+            # mistaken for a cached model and a retry starts clean.
+            shutil.rmtree(output_path, ignore_errors=True)
+            task.status = "failed"
+            task.error = str(e)
+            task.message = f"Conversion failed: {e}"
+            logger.error(f"MLX conversion failed for {hf_model_id}: {e}")
+            raise
+
+    async def convert_to_gguf(self, hf_model_id: str, quant_type: str = "q8_0") -> str:
+        """Convert a HuggingFace model to GGUF format.
+
+        Uses llama.cpp's convert scripts to create GGUF. The model files are
+        taken from an existing HuggingFace hub-cache snapshot or downloaded
+        with ``huggingface-cli``, converted to an f16 GGUF, and then quantized
+        with ``llama-quantize`` unless *quant_type* is "f16".
+
+        Args:
+            hf_model_id: HuggingFace model ID
+            quant_type: Quantization type (q4_0, q4_k_m, q8_0, f16, etc.)
+
+        Returns:
+            Path to converted model
+
+        Raises:
+            RuntimeError: If the download, conversion or quantization step
+                fails, or a required llama.cpp tool cannot be found
+        """
+        # Check if already cached
+        # NOTE(review): this accepts any cached quantization, which may not
+        # be the requested quant_type - confirm that is intended.
+        cached = self.get_cached_model(hf_model_id, "gguf")
+        if cached:
+            logger.info(f"Using cached GGUF model: {cached}")
+            return cached
+
+        output_path = self.get_gguf_cache_path(hf_model_id, quant_type)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        # Intermediate f16 file; cleaned up on success and on failure
+        temp_gguf = output_path.parent / f"{output_path.stem}_temp.gguf"
+
+        # Create task
+        task_id = f"gguf-{hf_model_id.replace('/', '--')}"
+        task = ConversionTask(
+            task_id=task_id,
+            hf_model_id=hf_model_id,
+            target_format="gguf",
+            status="running",
+            progress=0.0,
+            message="Starting GGUF conversion...",
+            started_at=datetime.now(),
+        )
+        self._tasks[task_id] = task
+
+        try:
+            # First, locate or download the model files
+            task.progress = 0.1
+            task.message = f"Downloading {hf_model_id}..."
+
+            # The hub cache keeps the real files under snapshots/<revision>/,
+            # so the repo cache root itself cannot be handed to the converter.
+            hf_cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
+            repo_cache = hf_cache_dir / f"models--{hf_model_id.replace('/', '--')}"
+            snapshots = [p.parent for p in repo_cache.glob("snapshots/*/config.json")]
+
+            if snapshots:
+                # Several revisions may be cached; use the most recent one
+                model_dir = max(snapshots, key=lambda p: p.stat().st_mtime)
+            else:
+                model_dir = self.cache_dir / "downloads" / hf_model_id.replace("/", "--")
+                download_cmd = [
+                    "huggingface-cli", "download", hf_model_id, "--local-dir", str(model_dir),
+                ]
+                process = await asyncio.create_subprocess_exec(
+                    *download_cmd,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.STDOUT,
+                )
+                stdout, _ = await process.communicate()
+                if process.returncode != 0:
+                    detail = stdout.decode(errors="replace") if stdout else "Unknown error"
+                    raise RuntimeError(f"Download failed: {detail}")
+
+            task.progress = 0.4
+            task.message = "Converting to GGUF..."
+
+            # Find llama.cpp convert script
+            # Common locations
+            convert_script = None
+            for path in [
+                shutil.which("convert_hf_to_gguf.py"),
+                Path.home() / "llama.cpp" / "convert_hf_to_gguf.py",
+                Path("/usr/local/share/llama.cpp/convert_hf_to_gguf.py"),
+            ]:
+                if path and Path(path).exists():
+                    convert_script = str(path)
+                    break
+
+            if not convert_script:
+                raise RuntimeError(
+                    "llama.cpp convert script not found. "
+                    "Please install llama.cpp: brew install llama.cpp"
+                )
+
+            # Convert to GGUF
+            convert_cmd = [
+                "python3", convert_script, str(model_dir),
+                "--outfile", str(temp_gguf), "--outtype", "f16",
+            ]
+
+            process = await asyncio.create_subprocess_exec(
+                *convert_cmd,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.STDOUT,
+            )
+            stdout, _ = await process.communicate()
+
+            if process.returncode != 0:
+                detail = stdout.decode(errors="replace") if stdout else "Unknown error"
+                raise RuntimeError(f"GGUF conversion failed: {detail}")
+
+            task.progress = 0.7
+            task.message = f"Quantizing to {quant_type}..."
+
+            # Quantize if needed
+            if quant_type != "f16":
+                llama_quantize = shutil.which("llama-quantize")
+                if not llama_quantize:
+                    raise RuntimeError("llama-quantize not found. Please install llama.cpp")
+
+                quant_cmd = [llama_quantize, str(temp_gguf), str(output_path), quant_type.upper()]
+                process = await asyncio.create_subprocess_exec(
+                    *quant_cmd,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.STDOUT,
+                )
+                stdout, _ = await process.communicate()
+
+                # Remove temp file
+                temp_gguf.unlink(missing_ok=True)
+                if process.returncode != 0:
+                    # A failed run may leave a truncated file that would
+                    # later be picked up as a valid cached model
+                    output_path.unlink(missing_ok=True)
+                    detail = stdout.decode(errors="replace") if stdout else "Unknown error"
+                    raise RuntimeError(f"GGUF quantization failed: {detail}")
+            else:
+                # Just rename
+                temp_gguf.rename(output_path)
+
+            task.progress = 1.0
+            task.status = "completed"
+            task.message = "Conversion completed"
+            task.output_path = str(output_path)
+            task.completed_at = datetime.now()
+
+            logger.info(f"GGUF conversion completed: {output_path}")
+            return str(output_path)
+
+        except Exception as e:
+            # Never leave the multi-gigabyte intermediate file behind
+            temp_gguf.unlink(missing_ok=True)
+            task.status = "failed"
+            task.error = str(e)
+            task.message = f"Conversion failed: {e}"
+            logger.error(f"GGUF conversion failed for {hf_model_id}: {e}")
+            raise
+
+    def clear_cache(self, hf_model_id: Optional[str] = None, format: Optional[str] = None):
+        """Delete converted models from the local cache.
+
+        Args:
+            hf_model_id: Clear cache for specific model (None = all)
+            format: Clear cache for specific format (None = all)
+        """
+        wipe_mlx = format in (None, "mlx")
+        wipe_gguf = format in (None, "gguf")
+
+        if not hf_model_id:
+            mlx_dir = self.cache_dir / "mlx"
+            if wipe_mlx and mlx_dir.exists():
+                shutil.rmtree(mlx_dir)
+                logger.info("Cleared all MLX cache")
+            gguf_dir = self.cache_dir / "gguf"
+            if wipe_gguf and gguf_dir.exists():
+                shutil.rmtree(gguf_dir)
+                logger.info("Cleared all GGUF cache")
+            return
+
+        if wipe_mlx:
+            mlx_path = self.get_mlx_cache_path(hf_model_id)
+            if mlx_path.exists():
+                shutil.rmtree(mlx_path)
+                logger.info(f"Cleared MLX cache: {mlx_path}")
+
+        if wipe_gguf:
+            for quant in ["q8_0", "q4_k_m", "q4_0", "f16"]:
+                gguf_path = self.get_gguf_cache_path(hf_model_id, quant)
+                if gguf_path.exists():
+                    gguf_path.unlink()
+                    logger.info(f"Cleared GGUF cache: {gguf_path}")
+
+    def get_cache_info(self) -> dict:
+        """Summarise what is currently stored in the conversion cache.
+
+        Returns:
+            Dictionary with the cache directory, one entry per MLX model
+            directory and per top-level .gguf file, and the combined size
+        """
+        mlx_models, gguf_models, total_size = [], [], 0
+
+        mlx_dir = self.cache_dir / "mlx"
+        if mlx_dir.exists():
+            for entry in mlx_dir.iterdir():
+                if not entry.is_dir():
+                    continue
+                dir_size = sum(p.stat().st_size for p in entry.rglob("*") if p.is_file())
+                mlx_models.append(
+                    {
+                        "model_id": entry.name.replace("--", "/"),
+                        "path": str(entry),
+                        "size_bytes": dir_size,
+                    }
+                )
+                total_size += dir_size
+
+        gguf_dir = self.cache_dir / "gguf"
+        if gguf_dir.exists():
+            for gguf_file in gguf_dir.glob("*.gguf"):
+                file_size = gguf_file.stat().st_size
+                # File names look like <name>-<quant>.gguf; split at the last dash
+                name, dash, quant = gguf_file.stem.rpartition("-")
+                gguf_models.append(
+                    {
+                        "model_id": (name if dash else quant).replace("--", "/"),
+                        "quant_type": quant if dash else "unknown",
+                        "path": str(gguf_file),
+                        "size_bytes": file_size,
+                    }
+                )
+                total_size += file_size
+
+        return {
+            "cache_dir": str(self.cache_dir),
+            "mlx_models": mlx_models,
+            "gguf_models": gguf_models,
+            "total_size_bytes": total_size,
+        }
diff --git a/worker/native_ops/process_manager.py b/worker/native_ops/process_manager.py
index 79a57c6..7997378 100644
--- a/worker/native_ops/process_manager.py
+++ b/worker/native_ops/process_manager.py
@@ -1,17 +1,24 @@
 """Native process manager for Mac workers.
 
 Manages LLM inference processes without Docker for macOS with Apple Silicon.
-Supports Ollama, MLX-LM, and llama.cpp backends.
+Supports Ollama, MLX-LM, llama.cpp, and vLLM-Metal backends.
 """
 
+import asyncio
 import logging
 import os
+import shutil
 import subprocess
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Optional
 
+from .converter import ModelConverter
+
 logger = logging.getLogger(__name__)
 
+OLLAMA_DEFAULT_PORT = 11434
+
 
 @dataclass
 class NativeProcess:
@@ -19,10 +26,11 @@ class NativeProcess:
 
     process_id: str  # Unique identifier (deployment_id based)
     pid: int  # OS process ID
-    backend: str  # ollama, mlx, llama_cpp
+    backend: str  # ollama, mlx, llama_cpp, vllm
     model_id: str
     port: int
     process: Optional[subprocess.Popen] = None
+    log_file: Optional[Path] = None  # Path to log file for this process
 
 
 class NativeProcessManager:
@@ -30,6 +38,88 @@ class NativeProcessManager:
 
     def __init__(self):
         self._processes: dict[str, NativeProcess] = {}
+        self._ollama_process: Optional[subprocess.Popen] = None
+        self._converter = ModelConverter()
+        self._log_dir = Path.home() / ".lmstack" / "logs"
+        self._log_dir.mkdir(parents=True, exist_ok=True)
+
+    def _write_log(self, process_id: str, message: str) -> None:
+        """Append a timestamped line to ``<log_dir>/<process_id>.log``."""
+        # Function-local import, as in the rest of this helper's history
+        from datetime import datetime
+
+        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        with (self._log_dir / f"{process_id}.log").open("a") as log:
+            log.write(f"[{stamp}] {message}\n")
+            log.flush()
+
+    async def ensure_ollama_running(
+        self, host: str = "0.0.0.0", port: int = OLLAMA_DEFAULT_PORT
+    ) -> bool:
+        """Ensure Ollama service is running and accessible.
+
+        If Ollama is not running, starts it with OLLAMA_HOST set to allow external connections.
+        Output of a server started here is appended to ``ollama-serve.log`` in the log directory.
+
+        Args:
+            host: Host to bind to (default 0.0.0.0 for external access)
+            port: Port to bind to (default 11434)
+
+        Returns:
+            True if Ollama is running and accessible
+        """
+        import httpx
+
+        async def _is_ready(timeout: float = 2.0) -> bool:
+            # Any failure (refused, timeout, non-200) just means "not ready"
+            try:
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    response = await client.get(f"http://localhost:{port}/api/tags")
+                    return response.status_code == 200
+            except Exception:
+                return False
+
+        if await _is_ready():
+            logger.info("Ollama service is already running")
+            return True
+
+        # Ollama not running, try to start it
+        ollama_path = shutil.which("ollama")
+        if not ollama_path:
+            logger.warning("Ollama is not installed")
+            return False
+
+        logger.info(f"Starting Ollama service on {host}:{port}")
+        # Set environment for Ollama to bind to all interfaces
+        env = os.environ.copy()
+        env["OLLAMA_HOST"] = f"{host}:{port}"
+
+        try:
+            # Send output to a file: a PIPE that nobody reads would eventually
+            # fill up and block the long-running server on write.
+            with open(self._log_dir / "ollama-serve.log", "ab") as log_file:
+                self._ollama_process = subprocess.Popen(
+                    [ollama_path, "serve"],
+                    stdout=log_file,
+                    stderr=subprocess.STDOUT,
+                    env=env,
+                    start_new_session=True,
+                )
+            logger.info(f"Started Ollama service (PID {self._ollama_process.pid})")
+
+            # Wait for Ollama to be ready
+            for _ in range(30):  # Wait up to 30 seconds
+                await asyncio.sleep(1)
+                if await _is_ready():
+                    logger.info("Ollama service is ready")
+                    return True
+
+            logger.error("Ollama service failed to start in time")
+            return False
+
+        except Exception as e:
+            logger.error(f"Failed to start Ollama service: {e}")
+            return False
 
     def get_process(self, process_id: str) -> Optional[NativeProcess]:
         """Get a managed process by ID."""
@@ -74,6 +164,8 @@ async def start_process(
             process = await self._start_mlx(process_id, model_id, port, **kwargs)
         elif backend == "llama_cpp":
             process = await self._start_llama_cpp(process_id, model_id, port, **kwargs)
+        elif backend == "vllm":
+            process = await self._start_vllm_metal(process_id, model_id, port, **kwargs)
         else:
             raise ValueError(f"Unknown backend: {backend}")
 
@@ -145,17 +237,13 @@ async def _start_ollama(
         """
         import httpx
 
-        ollama_port = 11434  # Ollama's default port
+        ollama_port = OLLAMA_DEFAULT_PORT
 
-        # Check if Ollama service is running
-        try:
-            async with httpx.AsyncClient(timeout=5.0) as client:
-                response = await client.get(f"http://localhost:{ollama_port}/api/tags")
-                if response.status_code != 200:
-                    raise RuntimeError("Ollama service is not responding")
-        except httpx.ConnectError:
+        # Ensure Ollama service is running (starts it if needed)
+        if not await self.ensure_ollama_running():
             raise RuntimeError(
-                "Ollama service is not running. " "Please start it with: ollama serve"
+                "Ollama service is not running and could not be started. "
+                "Please install Ollama: https://ollama.ai"
             )
 
         # Pull the model if needed
@@ -216,6 +304,71 @@ async def _unload_ollama_model(self, process: NativeProcess):
         except Exception as e:
             logger.warning(f"Failed to unload Ollama model: {e}")
 
+    async def _ensure_mlx_lm_installed(self) -> str:
+        """Ensure MLX-LM is installed in a virtual environment.
+
+        Creates a virtual environment at ~/.lmstack/venvs/mlx-lm
+        and installs mlx-lm if not already present.
+
+        Returns:
+            Path to the python command in the virtual environment
+        """
+        venv_dir = Path.home() / ".lmstack" / "venvs" / "mlx-lm"
+        python_cmd = venv_dir / "bin" / "python"
+
+        # Check if mlx-lm is already installed in venv
+        if python_cmd.exists():
+            # Verify mlx_lm is importable
+            check = await asyncio.create_subprocess_exec(
+                str(python_cmd),
+                "-c",
+                "import mlx_lm",
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            await check.wait()
+            if check.returncode == 0:
+                logger.info(f"MLX-LM already installed at {venv_dir}")
+                return str(python_cmd)
+
+        # Create virtual environment
+        logger.info(f"Creating virtual environment for MLX-LM at {venv_dir}")
+        venv_dir.parent.mkdir(parents=True, exist_ok=True)
+
+        # Create venv
+        create_venv = await asyncio.create_subprocess_exec(
+            "python3",
+            "-m",
+            "venv",
+            str(venv_dir),
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        await create_venv.wait()
+
+        if create_venv.returncode != 0:
+            stderr = await create_venv.stderr.read()
+            raise RuntimeError(f"Failed to create virtual environment: {stderr.decode()}")
+
+        # Install mlx-lm
+        pip_cmd = venv_dir / "bin" / "pip"
+        logger.info("Installing mlx-lm (this may take a few minutes)...")
+
+        install_proc = await asyncio.create_subprocess_exec(
+            str(pip_cmd),
+            "install",
+            "mlx-lm",
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await install_proc.communicate()
+
+        if install_proc.returncode != 0:
+            raise RuntimeError(f"Failed to install mlx-lm: {stderr.decode()}")
+
+        logger.info("MLX-LM installed successfully")
+        return str(python_cmd)
+
     async def _start_mlx(
         self,
         process_id: str,
@@ -226,14 +379,62 @@ async def _start_mlx(
         """Start MLX-LM server for Apple Silicon.
 
         MLX-LM provides OpenAI-compatible API via mlx_lm.server.
+        Automatically installs mlx-lm and converts models if needed.
         """
-        # Build command
+        # Initialize log file early so we can track progress
+        self._write_log(process_id, f"Starting MLX deployment for {model_id}")
+
+        # Ensure MLX-LM is installed (auto-install if needed)
+        self._write_log(process_id, "Checking MLX-LM installation...")
+        python_cmd = await self._ensure_mlx_lm_installed()
+        self._write_log(process_id, f"MLX-LM ready: {python_cmd}")
+
+        effective_model_id = model_id
+
+        # Check if model needs conversion
+        if not ModelConverter.is_mlx_ready(model_id):
+            # Check for cached conversion first
+            cached = self._converter.get_cached_model(model_id, "mlx")
+            if cached:
+                logger.info(f"Using cached MLX model: {cached}")
+                self._write_log(process_id, f"Using cached MLX model: {cached}")
+                effective_model_id = cached
+            else:
+                # Try to find an existing MLX variant on HuggingFace
+                mlx_variant = ModelConverter.find_mlx_variant(model_id)
+                if mlx_variant and ModelConverter.is_mlx_ready(mlx_variant):
+                    logger.info(f"Using MLX variant: {mlx_variant}")
+                    self._write_log(process_id, f"Using MLX variant: {mlx_variant}")
+                    effective_model_id = mlx_variant
+                else:
+                    # Convert the model
+                    logger.info(f"Converting {model_id} to MLX format...")
+                    self._write_log(process_id, f"Converting {model_id} to MLX format...")
+                    self._write_log(process_id, "This may take a while...")
+                    try:
+                        quantize = kwargs.pop("mlx_quantize", True)
+                        bits = kwargs.pop("mlx_bits", 4)
+                        effective_model_id = await self._converter.convert_to_mlx(
+                            model_id, quantize=quantize, bits=bits
+                        )
+                        self._write_log(process_id, f"Conversion complete: {effective_model_id}")
+                    except Exception as e:
+                        logger.error(f"MLX conversion failed: {e}")
+                        self._write_log(process_id, f"ERROR: MLX conversion failed: {e}")
+                        raise RuntimeError(
+                            f"Failed to convert model to MLX format: {e}. "
+                            "Consider using an mlx-community model or Ollama backend."
+                        )
+        else:
+            self._write_log(process_id, f"Model {model_id} is MLX-ready")
+
+        # Build command using venv python
         cmd = [
-            "python3",
+            python_cmd,
             "-m",
             "mlx_lm.server",
             "--model",
-            model_id,
+            effective_model_id,
             "--host",
             "0.0.0.0",
             "--port",
@@ -244,27 +445,83 @@ async def _start_mlx(
         if kwargs.get("trust_remote_code"):
             cmd.append("--trust-remote-code")
 
-        # Start the process
+        # Create log file
+        log_file = self._log_dir / f"{process_id}.log"
+
+        # Start the process with log file
         env = os.environ.copy()
-        process = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            env=env,
-            start_new_session=True,
-        )
+        with open(log_file, "a") as f:
+            process = subprocess.Popen(
+                cmd,
+                stdout=f,
+                stderr=subprocess.STDOUT,
+                env=env,
+                start_new_session=True,
+            )
 
-        logger.info(f"Started MLX-LM server (PID {process.pid}) for {model_id}")
+        logger.info(f"Started MLX-LM server (PID {process.pid}) for {effective_model_id}")
 
         return NativeProcess(
             process_id=process_id,
             pid=process.pid,
             backend="mlx",
-            model_id=model_id,
+            model_id=effective_model_id,
             port=port,
             process=process,
+            log_file=log_file,
         )
 
+    async def _ensure_llama_cpp_installed(self) -> str:
+        """Ensure llama.cpp is installed.
+
+        Installs llama.cpp via Homebrew if not already present.
+
+        Returns:
+            Path to the llama-server command
+        """
+        llama_server = shutil.which("llama-server")
+        if llama_server:
+            logger.info(f"llama.cpp already installed at {llama_server}")
+            return llama_server
+
+        # Check if brew is available
+        brew = shutil.which("brew")
+        if not brew:
+            raise RuntimeError(
+                "llama-server not found and Homebrew is not installed. "
+                "Please install Homebrew first: https://brew.sh"
+            )
+
+        # Install llama.cpp via brew
+        logger.info("Installing llama.cpp via Homebrew (this may take a few minutes)...")
+
+        install_proc = await asyncio.create_subprocess_exec(
+            brew,
+            "install",
+            "llama.cpp",
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await install_proc.communicate()
+
+        if install_proc.returncode != 0:
+            raise RuntimeError(f"Failed to install llama.cpp: {stderr.decode()}")
+
+        # Find llama-server again
+        llama_server = shutil.which("llama-server")
+        if not llama_server:
+            # Try common Homebrew paths
+            for path in ["/opt/homebrew/bin/llama-server", "/usr/local/bin/llama-server"]:
+                if Path(path).exists():
+                    llama_server = path
+                    break
+
+        if not llama_server:
+            raise RuntimeError("llama.cpp installed but llama-server not found in PATH")
+
+        logger.info("llama.cpp installed successfully")
+        return llama_server
+
     async def _start_llama_cpp(
         self,
         process_id: str,
@@ -275,22 +532,69 @@ async def _start_llama_cpp(
         """Start llama.cpp server with Metal acceleration.
 
         llama.cpp provides OpenAI-compatible API via llama-server.
+        Automatically installs llama.cpp and downloads/converts models if needed.
         """
-        # Check for llama-server binary
-        import shutil
+        # Initialize log file early so we can track progress
+        self._write_log(process_id, f"Starting llama.cpp deployment for {model_id}")
 
-        llama_server = shutil.which("llama-server")
+        # Ensure llama.cpp is installed (auto-install if needed)
+        self._write_log(process_id, "Checking llama.cpp installation...")
+        llama_server = await self._ensure_llama_cpp_installed()
+        self._write_log(process_id, f"llama.cpp ready: {llama_server}")
 
-        if not llama_server:
-            raise RuntimeError(
-                "llama-server not found. " "Please install llama.cpp: brew install llama.cpp"
-            )
+        effective_model_path = model_id
+
+        # Check if model_id is already a local GGUF file path
+        if model_id.endswith(".gguf") and Path(model_id).exists():
+            logger.info(f"Using local GGUF file: {model_id}")
+            self._write_log(process_id, f"Using local GGUF file: {model_id}")
+            effective_model_path = model_id
+        else:
+            # Check for cached model
+            cached = self._converter.get_cached_model(model_id, "gguf")
+            if cached:
+                logger.info(f"Using cached GGUF model: {cached}")
+                self._write_log(process_id, f"Using cached GGUF model: {cached}")
+                effective_model_path = cached
+            elif ModelConverter.is_gguf_ready(model_id):
+                # Model is already GGUF on HuggingFace, download it directly
+                logger.info(f"Downloading GGUF model from HuggingFace: {model_id}")
+                self._write_log(process_id, f"Downloading GGUF model from HuggingFace: {model_id}")
+                self._write_log(process_id, "This may take a while depending on model size...")
+                try:
+                    effective_model_path = await self._converter.download_gguf_model(model_id)
+                    self._write_log(process_id, f"Download complete: {effective_model_path}")
+                except Exception as e:
+                    logger.error(f"GGUF download failed: {e}")
+                    self._write_log(process_id, f"ERROR: GGUF download failed: {e}")
+                    raise RuntimeError(
+                        f"Failed to download GGUF model: {e}. "
+                        "Check if the model exists and has .gguf files."
+                    )
+            else:
+                # Need to convert from HuggingFace format
+                logger.info(f"Converting {model_id} to GGUF format...")
+                self._write_log(process_id, f"Converting {model_id} to GGUF format...")
+                self._write_log(process_id, "This may take a while...")
+                try:
+                    quant_type = kwargs.pop("gguf_quant", "q8_0")
+                    effective_model_path = await self._converter.convert_to_gguf(
+                        model_id, quant_type=quant_type
+                    )
+                    self._write_log(process_id, f"Conversion complete: {effective_model_path}")
+                except Exception as e:
+                    logger.error(f"GGUF conversion failed: {e}")
+                    self._write_log(process_id, f"ERROR: GGUF conversion failed: {e}")
+                    raise RuntimeError(
+                        f"Failed to convert model to GGUF format: {e}. "
+                        "Consider using a pre-quantized GGUF model or Ollama backend."
+                    )
 
         # Build command
         cmd = [
             llama_server,
             "--model",
-            model_id,
+            effective_model_path,
             "--host",
             "0.0.0.0",
             "--port",
@@ -306,31 +610,165 @@ async def _start_llama_cpp(
         if n_threads := kwargs.get("n_threads"):
             cmd.extend(["-t", str(n_threads)])
 
-        # Start the process
+        # Create log file
+        log_file = self._log_dir / f"{process_id}.log"
+
+        # Start the process with log file
         env = os.environ.copy()
-        process = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            env=env,
-            start_new_session=True,
-        )
+        with open(log_file, "a") as f:
+            process = subprocess.Popen(
+                cmd,
+                stdout=f,
+                stderr=subprocess.STDOUT,
+                env=env,
+                start_new_session=True,
+            )
 
-        logger.info(f"Started llama.cpp server (PID {process.pid}) for {model_id}")
+        logger.info(f"Started llama.cpp server (PID {process.pid}) for {effective_model_path}")
 
         return NativeProcess(
             process_id=process_id,
             pid=process.pid,
             backend="llama_cpp",
+            model_id=effective_model_path,
+            port=port,
+            process=process,
+            log_file=log_file,
+        )
+
+    async def _ensure_vllm_metal_installed(self) -> str:
+        """Ensure vLLM-Metal is installed in a virtual environment.
+
+        Creates a virtual environment at ~/.lmstack/venvs/vllm-metal
+        and installs vllm-metal if not already present.
+
+        Returns:
+            Path to the vllm command in the virtual environment
+        """
+        venv_dir = Path.home() / ".lmstack" / "venvs" / "vllm-metal"
+        vllm_cmd = venv_dir / "bin" / "vllm"
+
+        # Check if vllm is already installed in venv
+        if vllm_cmd.exists():
+            logger.info(f"vLLM-Metal already installed at {vllm_cmd}")
+            return str(vllm_cmd)
+
+        # Create virtual environment
+        logger.info(f"Creating virtual environment for vLLM-Metal at {venv_dir}")
+        venv_dir.parent.mkdir(parents=True, exist_ok=True)
+
+        # Create venv
+        create_venv = await asyncio.create_subprocess_exec(
+            "python3",
+            "-m",
+            "venv",
+            str(venv_dir),
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        await create_venv.wait()
+
+        if create_venv.returncode != 0:
+            stderr = await create_venv.stderr.read()
+            raise RuntimeError(f"Failed to create virtual environment: {stderr.decode()}")
+
+        # Install vllm-metal
+        pip_cmd = venv_dir / "bin" / "pip"
+        logger.info("Installing vllm-metal (this may take a few minutes)...")
+
+        install_proc = await asyncio.create_subprocess_exec(
+            str(pip_cmd),
+            "install",
+            "vllm-metal",
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await install_proc.communicate()
+
+        if install_proc.returncode != 0:
+            raise RuntimeError(
+                f"Failed to install vllm-metal: {stderr.decode()}\n"
+                "You may need to install it manually: pip install vllm-metal"
+            )
+
+        logger.info("vLLM-Metal installed successfully")
+        return str(vllm_cmd)
+
+    async def _start_vllm_metal(
+        self,
+        process_id: str,
+        model_id: str,
+        port: int,
+        **kwargs,
+    ) -> NativeProcess:
+        """Start vLLM-Metal server for Apple Silicon.
+
+        vLLM-Metal provides OpenAI-compatible API via `vllm serve`.
+        Automatically installs vllm-metal in a virtual environment if needed.
+        See: https://github.com/vllm-project/vllm-metal
+        """
+        # Initialize log file early so we can track progress
+        self._write_log(process_id, f"Starting vLLM-Metal deployment for {model_id}")
+
+        # Ensure vLLM-Metal is installed (auto-install if needed)
+        self._write_log(process_id, "Checking vLLM-Metal installation...")
+        vllm_cmd = await self._ensure_vllm_metal_installed()
+        self._write_log(process_id, f"vLLM-Metal ready: {vllm_cmd}")
+
+        # Build command using vllm serve
+        cmd = [
+            vllm_cmd,
+            "serve",
+            model_id,
+            "--host",
+            "0.0.0.0",
+            "--port",
+            str(port),
+        ]
+
+        # Add optional parameters
+        if gpu_memory_util := kwargs.get("gpu_memory_utilization"):
+            cmd.extend(["--gpu-memory-utilization", str(gpu_memory_util)])
+
+        if max_model_len := kwargs.get("max_model_len"):
+            cmd.extend(["--max-model-len", str(max_model_len)])
+
+        if dtype := kwargs.get("dtype"):
+            cmd.extend(["--dtype", str(dtype)])
+
+        if kwargs.get("trust_remote_code"):
+            cmd.append("--trust-remote-code")
+
+        # Create log file
+        log_file = self._log_dir / f"{process_id}.log"
+
+        # Start the process with log file
+        env = os.environ.copy()
+        with open(log_file, "a") as f:
+            process = subprocess.Popen(
+                cmd,
+                stdout=f,
+                stderr=subprocess.STDOUT,
+                env=env,
+                start_new_session=True,
+            )
+
+        logger.info(f"Started vLLM-Metal server (PID {process.pid}) for {model_id}")
+
+        return NativeProcess(
+            process_id=process_id,
+            pid=process.pid,
+            backend="vllm",
             model_id=model_id,
             port=port,
             process=process,
+            log_file=log_file,
         )
 
     def get_logs(self, process_id: str, tail: int = 100) -> str:
         """Get logs from a process.
 
-        For subprocess-based backends, reads from stdout pipe.
+        Reads from log file for MLX, llama.cpp, and vLLM-Metal backends.
         For Ollama, returns status information about loaded models.
         """
         process = self._processes.get(process_id)
@@ -340,11 +778,15 @@ def get_logs(self, process_id: str, tail: int = 100) -> str:
         if process.backend == "ollama":
             return self._get_ollama_status(process)
 
-        if process.process and process.process.stdout:
+        # Read from log file
+        if process.log_file and process.log_file.exists():
             try:
-                # This is a simple implementation - in production you'd want
-                # to capture logs to a file and read the tail
-                return "Logs are available but streaming is not yet implemented"
+                with open(process.log_file) as f:
+                    lines = f.readlines()
+                    # Return last 'tail' lines
+                    if len(lines) > tail:
+                        lines = lines[-tail:]
+                    return "".join(lines)
             except Exception as e:
                 return f"Error reading logs: {e}"
 
diff --git a/worker/routes/__init__.py b/worker/routes/__init__.py
index cd506d0..9ce61f8 100644
--- a/worker/routes/__init__.py
+++ b/worker/routes/__init__.py
@@ -6,9 +6,11 @@
 - containers.py: Docker container management endpoints
 - storage.py: Storage and volume management endpoints
 - native.py: Native deployment endpoints (Mac without Docker)
+- converter.py: Model format conversion endpoints (MLX/GGUF)
 """
 
 from .containers import router as containers_router
+from .converter import router as converter_router
 from .deployment import router as deployment_router
 from .images import router as images_router
 from .native import router as native_router
@@ -20,4 +22,5 @@
     "containers_router",
     "storage_router",
     "native_router",
+    "converter_router",
 ]
diff --git a/worker/routes/converter.py b/worker/routes/converter.py
new file mode 100644
index 0000000..a259ddb
--- /dev/null
+++ b/worker/routes/converter.py
@@ -0,0 +1,233 @@
+"""Model conversion routes for the worker agent.
+
+Provides API endpoints for converting HuggingFace models to MLX/GGUF formats.
+"""
+
+import logging
+from typing import TYPE_CHECKING, Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+if TYPE_CHECKING:
+    from worker.agent import WorkerAgent
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(tags=["converter"])
+
+# Global agent reference (set by agent.py)
+_agent: "WorkerAgent | None" = None
+
+
+def set_agent(agent: "WorkerAgent"):
+    """Set the global agent reference."""
+    global _agent
+    _agent = agent
+
+
+def _get_converter():
+    """Get the converter from native manager."""
+    if not _agent:
+        raise HTTPException(status_code=500, detail="Agent not initialized")
+    if not hasattr(_agent, "native_manager") or not _agent.native_manager:
+        raise HTTPException(
+            status_code=400,
+            detail="Model conversion only available on Mac workers with native support",
+        )
+    return _agent.native_manager._converter
+
+
+class MLXConvertRequest(BaseModel):
+    """Request to convert a model to MLX format."""
+
+    hf_model_id: str
+    quantize: bool = True
+    bits: int = 4  # 4 or 8
+
+
+class GGUFConvertRequest(BaseModel):
+    """Request to convert a model to GGUF format."""
+
+    hf_model_id: str
+    quant_type: str = "q8_0"  # q4_0, q4_k_m, q8_0, f16
+
+
+class ConvertResponse(BaseModel):
+    """Response from conversion request."""
+
+    task_id: str
+    status: str
+    message: str
+    output_path: Optional[str] = None
+
+
+class ConversionProgress(BaseModel):
+    """Conversion task progress."""
+
+    task_id: str
+    status: str
+    progress: float
+    message: str
+    output_path: Optional[str] = None
+    error: Optional[str] = None
+
+
+class FormatCheckRequest(BaseModel):
+    """Request to check model format compatibility."""
+
+    model_id: str
+    files: Optional[list[str]] = None
+
+
+class FormatCheckResponse(BaseModel):
+    """Response with model format compatibility info."""
+
+    model_id: str
+    is_mlx_ready: bool
+    is_gguf_ready: bool
+    cached_mlx: Optional[str] = None
+    cached_gguf: Optional[str] = None
+
+
+@router.post("/convert/mlx", response_model=ConvertResponse)
+async def convert_to_mlx(request: MLXConvertRequest):
+    """Convert a HuggingFace model to MLX format.
+
+    The conversion is awaited, so the response is sent only after it finishes.
+    GET /convert/progress/{task_id} reports the state of a conversion task.
+    """
+    converter = _get_converter()
+
+    try:
+        # Run the conversion; this await returns only once it is done
+        output_path = await converter.convert_to_mlx(
+            hf_model_id=request.hf_model_id,
+            quantize=request.quantize,
+            bits=request.bits,
+        )
+
+        task_id = f"mlx-{request.hf_model_id.replace('/', '--')}"
+
+        return ConvertResponse(
+            task_id=task_id,
+            status="completed",
+            message="Conversion completed successfully",
+            output_path=output_path,
+        )
+
+    except RuntimeError as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    except Exception as e:
+        logger.exception(f"MLX conversion failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/convert/gguf", response_model=ConvertResponse)
+async def convert_to_gguf(request: GGUFConvertRequest):
+    """Convert a HuggingFace model to GGUF format.
+
+    The conversion is awaited, so the response is sent only after it finishes.
+    GET /convert/progress/{task_id} reports the state of a conversion task.
+    """
+    converter = _get_converter()
+
+    try:
+        output_path = await converter.convert_to_gguf(
+            hf_model_id=request.hf_model_id,
+            quant_type=request.quant_type,
+        )
+
+        task_id = f"gguf-{request.hf_model_id.replace('/', '--')}"
+
+        return ConvertResponse(
+            task_id=task_id,
+            status="completed",
+            message="Conversion completed successfully",
+            output_path=output_path,
+        )
+
+    except RuntimeError as e:
+        raise HTTPException(status_code=500, detail=str(e))
+    except Exception as e:
+        logger.exception(f"GGUF conversion failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/convert/progress/{task_id}", response_model=ConversionProgress)
+async def get_conversion_progress(task_id: str):
+    """Get the progress of a conversion task."""
+    converter = _get_converter()
+
+    task = converter.get_task(task_id)
+    if not task:
+        raise HTTPException(status_code=404, detail=f"Task {task_id} not found")
+
+    return ConversionProgress(
+        task_id=task.task_id,
+        status=task.status,
+        progress=task.progress,
+        message=task.message,
+        output_path=task.output_path,
+        error=task.error,
+    )
+
+
+@router.get("/convert/tasks")
+async def list_conversion_tasks():
+    """List all conversion tasks."""
+    converter = _get_converter()
+
+    tasks = converter.list_tasks()
+    return {
+        "tasks": [
+            {
+                "task_id": t.task_id,
+                "hf_model_id": t.hf_model_id,
+                "target_format": t.target_format,
+                "status": t.status,
+                "progress": t.progress,
+                "message": t.message,
+            }
+            for t in tasks
+        ]
+    }
+
+
+@router.post("/convert/check-format", response_model=FormatCheckResponse)
+async def check_model_format(request: FormatCheckRequest):
+    """Report whether a model is MLX/GGUF-ready and any cached converted copy."""
+    converter = _get_converter()
+
+    from worker.native_ops.converter import ModelConverter
+
+    is_mlx = ModelConverter.is_mlx_ready(request.model_id)
+    is_gguf = ModelConverter.is_gguf_ready(request.model_id, request.files)
+
+    return FormatCheckResponse(
+        model_id=request.model_id,
+        is_mlx_ready=is_mlx,
+        is_gguf_ready=is_gguf,
+        cached_mlx=converter.get_cached_model(request.model_id, "mlx") if not is_mlx else None,
+        cached_gguf=converter.get_cached_model(request.model_id, "gguf") if not is_gguf else None,
+    )
+
+
+@router.get("/convert/cache")
+async def get_cache_info():
+    """Get information about cached converted models."""
+    converter = _get_converter()
+    return converter.get_cache_info()
+
+
+@router.delete("/convert/cache")
+async def clear_cache(model_id: Optional[str] = None, format: Optional[str] = None):
+    """Clear the model conversion cache.
+
+    Args:
+        model_id: Clear cache for specific model (None = all)
+        format: Clear cache for specific format: "mlx" or "gguf" (None = all)
+    """
+    converter = _get_converter()
+    converter.clear_cache(model_id, format)
+    return {"status": "ok", "message": "Cache cleared"}