From ca6496efe02d12b0ca256ffdc66829b11e52db8e Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 19:57:32 +0800
Subject: [PATCH 01/27] fix: use bridge network for Windows Docker
 compatibility

---
 docker-compose.local.yml | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/docker-compose.local.yml b/docker-compose.local.yml
index 8e2d736..f4dee22 100644
--- a/docker-compose.local.yml
+++ b/docker-compose.local.yml
@@ -9,7 +9,9 @@ services:
     image: infinirc/lmstack-backend:local
     container_name: lmstack-backend
     user: root
-    network_mode: host
+    # Use bridge network for Windows compatibility (network_mode: host doesn't work on Windows)
+    ports:
+      - "52000:52000"
     volumes:
       - lmstack-data:/app/data
       - /var/run/docker.sock:/var/run/docker.sock
@@ -18,21 +20,27 @@ services:
       - LMSTACK_SECRET_KEY=${SECRET_KEY:-dev-secret-key}
       - LMSTACK_EXTERNAL_URL=${EXTERNAL_URL:-}
     restart: unless-stopped
+    networks:
+      - lmstack
 
   frontend:
     image: infinirc/lmstack-frontend:local
     container_name: lmstack-frontend
     ports:
       - "3000:80"
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
     environment:
-      - BACKEND_HOST=host.docker.internal
+      - BACKEND_HOST=server
       - NGINX_ENVSUBST_FILTER=BACKEND_HOST
     depends_on:
       - server
     restart: unless-stopped
+    networks:
+      - lmstack
 
 volumes:
   lmstack-data:
     driver: local
+
+networks:
+  lmstack:
+    driver: bridge

From 5f188a965a640acb3b3a8097317a06f7a464ffb9 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 20:00:03 +0800
Subject: [PATCH 02/27] fix: bind ports to all interfaces for LAN access

---
 docker-compose.local.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker-compose.local.yml b/docker-compose.local.yml
index f4dee22..4423911 100644
--- a/docker-compose.local.yml
+++ b/docker-compose.local.yml
@@ -11,7 +11,7 @@ services:
     user: root
     # Use bridge network for Windows compatibility (network_mode: host doesn't work on Windows)
     ports:
-      - "52000:52000"
+      - "0.0.0.0:52000:52000"
     volumes:
       - lmstack-data:/app/data
       - /var/run/docker.sock:/var/run/docker.sock
@@ -27,7 +27,7 @@ services:
     image: infinirc/lmstack-frontend:local
     container_name: lmstack-frontend
     ports:
-      - "3000:80"
+      - "0.0.0.0:3000:80"
     environment:
       - BACKEND_HOST=server
       - NGINX_ENVSUBST_FILTER=BACKEND_HOST

From f3948abaeaecf5d5c287539aa5d9a09860e5c2d8 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 20:10:14 +0800
Subject: [PATCH 03/27] docs: add Windows Docker Desktop LAN access
 instructions

---
 README.md       | 23 +++++++++++++++++++++++
 README_zh-TW.md | 23 +++++++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/README.md b/README.md
index 6e3ca7f..e230e4b 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,29 @@ docker compose -f docker-compose.deploy.yml up -d
 - Frontend: http://localhost:3000
 - Backend API: http://localhost:52000
 
+### Windows Docker Desktop - LAN Access
+
+Docker Desktop on Windows binds ports to `127.0.0.1` only. To allow LAN access, run these commands in PowerShell (Administrator):
+
+```powershell
+# Add firewall rule
+New-NetFirewallRule -DisplayName "LMStack" -Direction Inbound -LocalPort 3000,52000 -Protocol TCP -Action Allow
+
+# Port forwarding for LAN access
+netsh interface portproxy add v4tov4 listenport=3000 listenaddress=0.0.0.0 connectport=3000 connectaddress=127.0.0.1
+netsh interface portproxy add v4tov4 listenport=52000 listenaddress=0.0.0.0 connectport=52000 connectaddress=127.0.0.1
+
+# Verify port forwarding
+netsh interface portproxy show all
+```
+
+To remove port forwarding:
+
+```powershell
+netsh interface portproxy delete v4tov4 listenport=3000 listenaddress=0.0.0.0
+netsh interface portproxy delete v4tov4 listenport=52000 listenaddress=0.0.0.0
+```
+
 ### Usage
 
 1. Login with `admin` / `admin` (change password after first login)
diff --git a/README_zh-TW.md b/README_zh-TW.md
index 28c4500..89ee119 100644
--- a/README_zh-TW.md
+++ b/README_zh-TW.md
@@ -52,6 +52,29 @@ docker compose -f docker-compose.deploy.yml up -d
 - 前端: http://localhost:3000
 - 後端 API: http://localhost:52000
 
+### Windows Docker Desktop - 區域網路存取
+
+Windows 上的 Docker Desktop 預設只綁定到 `127.0.0.1`。若要允許區域網路存取，請在 PowerShell（系統管理員）中執行：
+
+```powershell
+# 新增防火牆規則
+New-NetFirewallRule -DisplayName "LMStack" -Direction Inbound -LocalPort 3000,52000 -Protocol TCP -Action Allow
+
+# 設定端口轉發以允許區域網路存取
+netsh interface portproxy add v4tov4 listenport=3000 listenaddress=0.0.0.0 connectport=3000 connectaddress=127.0.0.1
+netsh interface portproxy add v4tov4 listenport=52000 listenaddress=0.0.0.0 connectport=52000 connectaddress=127.0.0.1
+
+# 確認端口轉發設定
+netsh interface portproxy show all
+```
+
+移除端口轉發：
+
+```powershell
+netsh interface portproxy delete v4tov4 listenport=3000 listenaddress=0.0.0.0
+netsh interface portproxy delete v4tov4 listenport=52000 listenaddress=0.0.0.0
+```
+
 ### 使用方式
 
 1. 使用 `admin` / `admin` 登入（首次登入後請更改密碼）

From eb257e006cddb3df464366162b05ad76f9a8472f Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 20:13:49 +0800
Subject: [PATCH 04/27] fix: use single-line docker command for Windows
 compatibility

---
 backend/app/api/workers.py | 62 +++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/backend/app/api/workers.py b/backend/app/api/workers.py
index 8c5954d..1339df3 100644
--- a/backend/app/api/workers.py
+++ b/backend/app/api/workers.py
@@ -24,12 +24,7 @@
     WorkerResponse,
     WorkerUpdate,
 )
-from app.services.local_worker import (
-    get_local_hostname,
-    get_local_ip,
-    spawn_docker_worker,
-    stop_docker_worker,
-)
+from app.services.local_worker import get_local_hostname, spawn_docker_worker, stop_docker_worker
 
 router = APIRouter()
 
@@ -380,10 +375,9 @@ async def register_local_worker(
     hostname = get_local_hostname()
     worker_name = hostname
 
-    # Get backend URL - use local IP so the Docker container can reach it
-    local_ip = get_local_ip()
+    # Use localhost since local worker uses --network host mode
     settings = get_settings()
-    backend_url = f"http://{local_ip}:{settings.port}"
+    backend_url = f"http://localhost:{settings.port}"
 
     # Create a registration token for this worker
     token = RegistrationToken.create(
@@ -425,19 +419,39 @@ def _generate_docker_command(token: str, name: str, backend_url: str) -> str:
     1. Worker registers with localhost/host IP (not Docker internal IP)
     2. Apps deployed by Worker are accessible via host network
     3. Works seamlessly on both regular machines and WSL
+
+    Command is single-line for cross-platform compatibility (Linux/Mac/Windows).
     """
-    return f"""docker run -d \\
-  --name lmstack-worker \\
-  --network host \\
-  --gpus all \\
-  --privileged \\
-  -v /var/run/docker.sock:/var/run/docker.sock \\
-  -v ~/.cache/huggingface:/root/.cache/huggingface \\
-  -v /:/host:ro \\
-  -e BACKEND_URL={backend_url} \\
-  -e WORKER_NAME={name} \\
-  -e REGISTRATION_TOKEN={token} \\
-  infinirc/lmstack-worker:latest"""
+    return (
+        f"docker run -d --name lmstack-worker --network host --gpus all --privileged "
+        f"-v /var/run/docker.sock:/var/run/docker.sock "
+        f"-v ~/.cache/huggingface:/root/.cache/huggingface "
+        f"-v /:/host:ro "
+        f"-e BACKEND_URL={backend_url} "
+        f"-e WORKER_NAME={name} "
+        f"-e REGISTRATION_TOKEN={token} "
+        f"infinirc/lmstack-worker:latest"
+    )
+
+
+def _get_wsl_windows_ip() -> str | None:
+    """Get Windows host IP from WSL."""
+    try:
+        # Check if we're in WSL
+        with open("/proc/version") as f:
+            if "microsoft" not in f.read().lower():
+                return None
+        # Read Windows host IP from resolv.conf (nameserver is Windows host)
+        with open("/etc/resolv.conf") as f:
+            for line in f:
+                if line.startswith("nameserver"):
+                    ip = line.split()[1].strip()
+                    # Filter out localhost entries
+                    if not ip.startswith("127."):
+                        return ip
+    except Exception:
+        pass
+    return None
 
 
 def _get_backend_url(request: Request) -> str:
@@ -445,6 +459,12 @@ def _get_backend_url(request: Request) -> str:
     settings = get_settings()
     if settings.external_url:
         return settings.external_url.rstrip("/")
+
+    # For WSL, try to get the Windows host IP for external access
+    windows_ip = _get_wsl_windows_ip()
+    if windows_ip:
+        return f"http://{windows_ip}:{settings.port}"
+
     # Check X-Forwarded headers (from nginx/vite proxy)
     forwarded_host = request.headers.get("X-Forwarded-Host")
     if forwarded_host:

From 0245894d3ae2c6dfbce9bb0eb11e8c6f1e07ac0d Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 20:20:19 +0800
Subject: [PATCH 05/27] feat: add Windows local docker command option for
 same-machine worker

---
 frontend/src/pages/Workers.tsx | 63 ++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/frontend/src/pages/Workers.tsx b/frontend/src/pages/Workers.tsx
index 0ba1293..d4e85e4 100644
--- a/frontend/src/pages/Workers.tsx
+++ b/frontend/src/pages/Workers.tsx
@@ -857,6 +857,69 @@ export default function Workers() {
                     </div>
                   ),
                 },
+                {
+                  key: "windows-local",
+                  label: "Windows (Same Machine as Backend)",
+                  children: (
+                    <div
+                      style={{
+                        padding: 12,
+                        backgroundColor: "#1e1e1e",
+                        borderRadius: 6,
+                        position: "relative",
+                      }}
+                    >
+                      <pre
+                        style={{
+                          margin: 0,
+                          whiteSpace: "pre-wrap",
+                          wordBreak: "break-all",
+                          color: "#d4d4d4",
+                          fontSize: 12,
+                          fontFamily: "monospace",
+                        }}
+                      >
+                        {generatedToken.docker_command?.replace(
+                          /BACKEND_URL=http:\/\/[^:]+:52000/,
+                          "BACKEND_URL=http://host.docker.internal:52000",
+                        )}
+                      </pre>
+                      <Button
+                        type="primary"
+                        icon={<CopyOutlined />}
+                        size="small"
+                        onClick={() => {
+                          const windowsCmd =
+                            generatedToken.docker_command?.replace(
+                              /BACKEND_URL=http:\/\/[^:]+:52000/,
+                              "BACKEND_URL=http://host.docker.internal:52000",
+                            );
+                          if (windowsCmd) {
+                            if (navigator.clipboard && window.isSecureContext) {
+                              navigator.clipboard.writeText(windowsCmd);
+                              message.success("Command copied to clipboard");
+                            } else {
+                              const textArea =
+                                document.createElement("textarea");
+                              textArea.value = windowsCmd;
+                              textArea.style.position = "fixed";
+                              textArea.style.left = "-999999px";
+                              document.body.appendChild(textArea);
+                              textArea.focus();
+                              textArea.select();
+                              document.execCommand("copy");
+                              document.body.removeChild(textArea);
+                              message.success("Command copied to clipboard");
+                            }
+                          }
+                        }}
+                        style={{ position: "absolute", top: 8, right: 8 }}
+                      >
+                        Copy
+                      </Button>
+                    </div>
+                  ),
+                },
                 {
                   key: "dev-python",
                   label: "Development Mode (Python)",

From c1ab9d4dc9e2061e8ed6996165c172a851d4116a Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 20:33:44 +0800
Subject: [PATCH 06/27] fix: prevent progress reset during image extraction
 phase

---
 worker/docker_ops/runner.py | 45 ++++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/worker/docker_ops/runner.py b/worker/docker_ops/runner.py
index 8931c94..fd31790 100644
--- a/worker/docker_ops/runner.py
+++ b/worker/docker_ops/runner.py
@@ -131,23 +131,46 @@ def pull_image_with_progress(self, image: str, deployment_id: int) -> None:
 
         try:
             for line in self.client.api.pull(image, stream=True, decode=True):
-                if "id" in line and "progressDetail" in line:
+                if "id" in line:
                     layer_id = line["id"]
+                    status = line.get("status", "")
                     detail = line.get("progressDetail", {})
-                    current = detail.get("current", 0)
-                    total = detail.get("total", 0)
 
                     progress_data = _pull_progress.get(str(deployment_id), {})
                     layers = progress_data.get("layers", {})
-                    layers[layer_id] = {
-                        "status": line.get("status", ""),
-                        "current": current,
-                        "total": total,
-                    }
 
-                    # Calculate overall progress
-                    total_size = sum(layer.get("total", 0) for layer in layers.values())
-                    downloaded = sum(layer.get("current", 0) for layer in layers.values())
+                    # Only update download progress for "Downloading" status
+                    # Once downloaded, keep the layer at 100% (total = total, current = total)
+                    if status == "Downloading":
+                        current = detail.get("current", 0)
+                        total = detail.get("total", 0)
+                        layers[layer_id] = {
+                            "status": status,
+                            "current": current,
+                            "total": total,
+                        }
+                    elif status in ("Download complete", "Pull complete", "Already exists"):
+                        # Layer is complete, mark as 100%
+                        existing = layers.get(layer_id, {})
+                        total = existing.get("total", 0)
+                        layers[layer_id] = {
+                            "status": status,
+                            "current": total,  # current = total = 100%
+                            "total": total,
+                        }
+                    elif status == "Pulling fs layer":
+                        # New layer, initialize with 0
+                        layers[layer_id] = {
+                            "status": status,
+                            "current": 0,
+                            "total": 0,
+                        }
+                    # Ignore "Extracting" and other statuses to avoid progress reset
+
+                    # Calculate overall progress (only count layers with total > 0)
+                    layers_with_size = [lyr for lyr in layers.values() if lyr.get("total", 0) > 0]
+                    total_size = sum(lyr.get("total", 0) for lyr in layers_with_size)
+                    downloaded = sum(lyr.get("current", 0) for lyr in layers_with_size)
                     overall_progress = int((downloaded / total_size) * 100) if total_size > 0 else 0
 
                     _set_pull_progress(

From 6479e1a98c48d784ac6e120e506777b94efbda92 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 20:41:20 +0800
Subject: [PATCH 07/27] fix: infinite wait for model/app loading, show patience
 message after threshold

---
 backend/app/api/apps/deployment.py | 58 ++++++++++++++++++------------
 backend/app/services/deployer.py   | 39 ++++++++++++--------
 worker/docker_ops/images.py        | 43 +++++++++++++++++-----
 3 files changed, 94 insertions(+), 46 deletions(-)

diff --git a/backend/app/api/apps/deployment.py b/backend/app/api/apps/deployment.py
index 6400d42..1b18f95 100644
--- a/backend/app/api/apps/deployment.py
+++ b/backend/app/api/apps/deployment.py
@@ -145,7 +145,6 @@ async def wait_for_container_healthy(
     container_id: str,
     app_id: int,
     port: int,
-    max_wait: int = 600,
     poll_interval: int = 2,
 ) -> bool:
     """Wait for container to become healthy.
@@ -155,21 +154,22 @@ async def wait_for_container_healthy(
         container_id: Container ID to check
         app_id: App ID for progress tracking
         port: App port for HTTP health check
-        max_wait: Maximum wait time in seconds
         poll_interval: Time between checks in seconds
 
     Returns:
-        True if healthy, False if timeout
+        True if healthy (waits indefinitely until healthy or error)
     """
     waited = 0
     consecutive_failures = 0
     max_consecutive_failures = 10  # Fail after 20 seconds of no connection
+    slow_threshold = 1800  # 30 minutes before showing "check" message
+    shown_slow_message = False
 
     worker_host = worker_address.split(":")[0]
     app_url = f"http://{worker_host}:{port}"
 
     async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
-        while waited < max_wait:
+        while True:  # Wait indefinitely
             try:
                 response = await client.get(f"http://{worker_address}/containers/{container_id}")
 
@@ -205,13 +205,34 @@ async def wait_for_container_healthy(
 
                         elif "health:" in status or "starting)" in status:
                             # Health check still running
-                            progress_pct = min(50 + int(waited / max_wait * 40), 90)
-                            set_deployment_progress(
-                                app_id,
-                                "starting",
-                                progress_pct,
-                                f"App is initializing ({waited}s, please wait)...",
-                            )
+                            mins = waited // 60
+                            secs = waited % 60
+                            time_str = f"{mins}m {secs}s" if mins > 0 else f"{secs}s"
+
+                            if waited >= slow_threshold and not shown_slow_message:
+                                set_deployment_progress(
+                                    app_id,
+                                    "starting",
+                                    80,
+                                    f"App is initializing ({time_str}) - Taking longer than expected. "
+                                    "Please check container logs for issues.",
+                                )
+                                shown_slow_message = True
+                            elif shown_slow_message:
+                                set_deployment_progress(
+                                    app_id,
+                                    "starting",
+                                    80,
+                                    f"App is initializing ({time_str}) - Please check logs if needed.",
+                                )
+                            else:
+                                progress_pct = min(50 + int(waited / 600 * 40), 90)
+                                set_deployment_progress(
+                                    app_id,
+                                    "starting",
+                                    progress_pct,
+                                    f"App is initializing ({time_str}, please wait)...",
+                                )
 
                         elif "health" not in status:
                             # No health check defined, verify HTTP access directly
@@ -243,8 +264,6 @@ async def wait_for_container_healthy(
             await asyncio.sleep(poll_interval)
             waited += poll_interval
 
-    return False
-
 
 async def _verify_http_access(
     client: httpx.AsyncClient,
@@ -354,28 +373,21 @@ async def deploy_app_background(
             app.port = port
             await db.commit()
 
-            # Phase 3: Wait for health
+            # Phase 3: Wait for health (waits indefinitely until healthy or error)
             set_deployment_progress(
                 app_id,
                 "starting",
                 50,
-                "Waiting for app to start (this may take 1-3 minutes)...",
+                "Waiting for app to start...",
             )
 
-            is_healthy = await wait_for_container_healthy(
+            await wait_for_container_healthy(
                 worker_address=worker_address,
                 container_id=container_id,
                 app_id=app_id,
                 port=port,
             )
 
-            if not is_healthy:
-                app.status = AppStatus.ERROR.value
-                app.status_message = "Container health check timed out after 10 minutes"
-                await db.commit()
-                set_deployment_progress(app_id, "error", 0, "Container health check timed out")
-                return
-
             # Phase 4: Setup proxy
             if use_proxy:
                 await _setup_nginx_proxy(app_id, app_type, worker_address, port)
diff --git a/backend/app/services/deployer.py b/backend/app/services/deployer.py
index 103dcec..a455292 100644
--- a/backend/app/services/deployer.py
+++ b/backend/app/services/deployer.py
@@ -23,7 +23,7 @@ class DeployerService:
 
     # Health check configuration
     HEALTH_CHECK_INTERVAL = 5  # seconds between checks
-    HEALTH_CHECK_TIMEOUT = 600  # max seconds to wait (10 minutes for large models)
+    HEALTH_CHECK_SLOW_THRESHOLD = 600  # seconds before showing "slow loading" message (10 min)
     HEALTH_CHECK_REQUEST_TIMEOUT = 10  # timeout for each health check request
 
     async def deploy(self, deployment_id: int) -> None:
@@ -189,12 +189,10 @@ async def deploy(self, deployment_id: int) -> None:
                     # Deployment was cancelled, don't update status
                     logger.info(f"Deployment {deployment_id} cancelled during startup")
                     return
-                elif api_ready:
+                else:
+                    # api_ready is True, model is ready
                     deployment.status = DeploymentStatus.RUNNING.value
                     deployment.status_message = "Model ready"
-                else:
-                    deployment.status = DeploymentStatus.ERROR.value
-                    deployment.status_message = "Model failed to start within timeout"
 
             except httpx.ConnectError:
                 deployment.status = DeploymentStatus.ERROR.value
@@ -321,11 +319,10 @@ async def _wait_for_api_ready(
         backend: str = BackendType.VLLM.value,
     ) -> bool | None:
         """
-        Poll the OpenAI API endpoint until it's ready or timeout.
+        Poll the OpenAI API endpoint until it's ready or cancelled.
 
         Returns:
             True: API is ready
-            False: Timeout or error
             None: Cancelled (user stopped deployment)
         """
         worker_ip = worker_address.split(":")[0]
@@ -339,11 +336,12 @@ async def _wait_for_api_ready(
 
         elapsed = 0
         check_count = 0
+        shown_slow_message = False
 
         logger.info(f"Waiting for API to be ready at {health_endpoint} (backend={backend})")
 
         async with httpx.AsyncClient(timeout=self.HEALTH_CHECK_REQUEST_TIMEOUT) as client:
-            while elapsed < self.HEALTH_CHECK_TIMEOUT:
+            while True:  # Wait indefinitely until ready or cancelled
                 check_count += 1
 
                 # Check if deployment was cancelled
@@ -408,9 +406,25 @@ async def _wait_for_api_ready(
                             mins = elapsed // 60
                             secs = elapsed % 60
                             time_str = f"{mins}m {secs}s" if mins > 0 else f"{secs}s"
-                            deployment.status_message = (
-                                f"Loading model into GPU memory... ({time_str})"
-                            )
+
+                            # Show patience message after threshold
+                            if (
+                                elapsed >= self.HEALTH_CHECK_SLOW_THRESHOLD
+                                and not shown_slow_message
+                            ):
+                                deployment.status_message = (
+                                    f"Loading model... ({time_str}) - "
+                                    "Large model or slow network detected. Please be patient."
+                                )
+                                shown_slow_message = True
+                            elif shown_slow_message:
+                                deployment.status_message = (
+                                    f"Loading model... ({time_str}) - Please be patient."
+                                )
+                            else:
+                                deployment.status_message = (
+                                    f"Loading model into GPU memory... ({time_str})"
+                                )
                             await db.commit()
                     except Exception as e:
                         logger.debug(f"Error updating deployment status message: {e}")
@@ -418,9 +432,6 @@ async def _wait_for_api_ready(
                 await asyncio.sleep(self.HEALTH_CHECK_INTERVAL)
                 elapsed += self.HEALTH_CHECK_INTERVAL
 
-        logger.warning(f"API health check timed out after {elapsed}s ({check_count} checks)")
-        return False
-
     def _is_local_worker(self, address: str) -> bool:
         """Check if the worker address refers to the local machine."""
         if not address:
diff --git a/worker/docker_ops/images.py b/worker/docker_ops/images.py
index f0f3b86..7446be8 100644
--- a/worker/docker_ops/images.py
+++ b/worker/docker_ops/images.py
@@ -164,16 +164,41 @@ def pull_image(
         for line in self.client.api.pull(image, stream=True, decode=True, auth_config=auth):
             if progress_callback and "id" in line:
                 layer_id = line["id"]
+                status = line.get("status", "")
                 detail = line.get("progressDetail", {})
-                layers_progress[layer_id] = {
-                    "status": line.get("status", ""),
-                    "current": detail.get("current", 0),
-                    "total": detail.get("total", 0),
-                }
-
-                # Calculate overall progress
-                total_size = sum(lp.get("total", 0) for lp in layers_progress.values())
-                downloaded = sum(lp.get("current", 0) for lp in layers_progress.values())
+
+                # Only update download progress for "Downloading" status
+                # Once downloaded, keep the layer at 100% (current = total)
+                if status == "Downloading":
+                    layers_progress[layer_id] = {
+                        "status": status,
+                        "current": detail.get("current", 0),
+                        "total": detail.get("total", 0),
+                    }
+                elif status in ("Download complete", "Pull complete", "Already exists"):
+                    # Layer is complete, mark as 100%
+                    existing = layers_progress.get(layer_id, {})
+                    total = existing.get("total", 0)
+                    layers_progress[layer_id] = {
+                        "status": status,
+                        "current": total,
+                        "total": total,
+                    }
+                elif status == "Pulling fs layer":
+                    # New layer, initialize with 0
+                    layers_progress[layer_id] = {
+                        "status": status,
+                        "current": 0,
+                        "total": 0,
+                    }
+                # Ignore "Extracting" and other statuses to avoid progress reset
+
+                # Calculate overall progress (only count layers with total > 0)
+                layers_with_size = [
+                    lyr for lyr in layers_progress.values() if lyr.get("total", 0) > 0
+                ]
+                total_size = sum(lyr.get("total", 0) for lyr in layers_with_size)
+                downloaded = sum(lyr.get("current", 0) for lyr in layers_with_size)
                 progress = int((downloaded / total_size) * 100) if total_size > 0 else 0
 
                 progress_callback(progress, layers_progress)

From 1782711e1f13f21c03140e4e17e98e0ebf35d78b Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 20:51:05 +0800
Subject: [PATCH 08/27] fix: add auto-refresh toggle for logs, change Exit to
 Minimize

---
 frontend/src/pages/Containers.tsx  | 18 +++++++++++++-----
 frontend/src/pages/DeployApps.tsx  | 18 +++++++++++++-----
 frontend/src/pages/Deployments.tsx | 18 +++++++++++++-----
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/frontend/src/pages/Containers.tsx b/frontend/src/pages/Containers.tsx
index 08d88aa..d7bfae9 100644
--- a/frontend/src/pages/Containers.tsx
+++ b/frontend/src/pages/Containers.tsx
@@ -81,6 +81,7 @@ export default function Containers() {
   const [logs, setLogs] = useState<string>("");
   const [logsLoading, setLogsLoading] = useState(false);
   const [logsFullscreen, setLogsFullscreen] = useState(true);
+  const [logsAutoRefresh, setLogsAutoRefresh] = useState(true);
   const [autoScroll, setAutoScroll] = useState(true);
   const logsRef = useRef<HTMLPreElement>(null);
   const [execModal, setExecModal] = useState<Container | null>(null);
@@ -123,9 +124,9 @@ export default function Containers() {
     return () => clearInterval(interval);
   }, [fetchData]);
 
-  // Auto-refresh logs when modal is open
+  // Auto-refresh logs when modal is open and auto-refresh is enabled
   useEffect(() => {
-    if (!logsModal) return;
+    if (!logsModal || !logsAutoRefresh) return;
 
     const interval = setInterval(async () => {
       try {
@@ -141,7 +142,7 @@ export default function Containers() {
     }, 2000); // Refresh every 2 seconds
 
     return () => clearInterval(interval);
-  }, [logsModal]);
+  }, [logsModal, logsAutoRefresh]);
 
   // Auto-scroll logs to bottom
   useEffect(() => {
@@ -783,7 +784,13 @@ export default function Containers() {
             }}
           >
             <span>Logs: {logsModal?.name}</span>
-            <Tag color="green">Auto-refresh</Tag>
+            <Tag
+              color={logsAutoRefresh ? "green" : "default"}
+              style={{ cursor: "pointer" }}
+              onClick={() => setLogsAutoRefresh(!logsAutoRefresh)}
+            >
+              Auto-refresh {logsAutoRefresh ? "ON" : "OFF"}
+            </Tag>
           </div>
         }
         open={!!logsModal}
@@ -791,6 +798,7 @@ export default function Containers() {
           setLogsModal(null);
           setLogs("");
           setLogsFullscreen(true);
+          setLogsAutoRefresh(true);
           setAutoScroll(true);
         }}
         footer={
@@ -821,7 +829,7 @@ export default function Containers() {
               onClick={() => setLogsFullscreen(!logsFullscreen)}
               size="small"
             >
-              {logsFullscreen ? "Exit" : "Fullscreen"}
+              {logsFullscreen ? "Minimize" : "Fullscreen"}
             </Button>
           </Space>
         }
diff --git a/frontend/src/pages/DeployApps.tsx b/frontend/src/pages/DeployApps.tsx
index 2047199..be0782f 100644
--- a/frontend/src/pages/DeployApps.tsx
+++ b/frontend/src/pages/DeployApps.tsx
@@ -122,6 +122,7 @@ export default function DeployApps() {
   const [logs, setLogs] = useState<string>("");
   const [logsLoading, setLogsLoading] = useState(false);
   const [logsFullscreen, setLogsFullscreen] = useState(true);
+  const [logsAutoRefresh, setLogsAutoRefresh] = useState(true);
   const [autoScroll, setAutoScroll] = useState(true);
   const logsRef = useRef<HTMLPreElement>(null);
   const { isMobile } = useResponsive();
@@ -175,9 +176,9 @@ export default function DeployApps() {
     return () => clearInterval(interval);
   }, [deployedApps]);
 
-  // Auto-refresh logs when modal is open
+  // Auto-refresh logs when modal is open and auto-refresh is enabled
   useEffect(() => {
-    if (!logsModal) return;
+    if (!logsModal || !logsAutoRefresh) return;
 
     const interval = setInterval(async () => {
       try {
@@ -189,7 +190,7 @@ export default function DeployApps() {
     }, 2000); // Refresh every 2 seconds
 
     return () => clearInterval(interval);
-  }, [logsModal]);
+  }, [logsModal, logsAutoRefresh]);
 
   // Auto-scroll logs to bottom
   useEffect(() => {
@@ -728,13 +729,20 @@ export default function DeployApps() {
             }}
           >
             <span>Logs: {logsModal?.name}</span>
-            <Tag color="green">Auto-refresh</Tag>
+            <Tag
+              color={logsAutoRefresh ? "green" : "default"}
+              style={{ cursor: "pointer" }}
+              onClick={() => setLogsAutoRefresh(!logsAutoRefresh)}
+            >
+              Auto-refresh {logsAutoRefresh ? "ON" : "OFF"}
+            </Tag>
           </div>
         }
         open={!!logsModal}
         onCancel={() => {
           setLogsModal(null);
           setLogsFullscreen(false);
+          setLogsAutoRefresh(true);
           setAutoScroll(true);
         }}
         footer={
@@ -765,7 +773,7 @@ export default function DeployApps() {
               onClick={() => setLogsFullscreen(!logsFullscreen)}
               size="small"
             >
-              {logsFullscreen ? "Exit" : "Fullscreen"}
+              {logsFullscreen ? "Minimize" : "Fullscreen"}
             </Button>
           </Space>
         }
diff --git a/frontend/src/pages/Deployments.tsx b/frontend/src/pages/Deployments.tsx
index 9c69c11..50da57a 100644
--- a/frontend/src/pages/Deployments.tsx
+++ b/frontend/src/pages/Deployments.tsx
@@ -66,6 +66,7 @@ export default function Deployments() {
   const [logs, setLogs] = useState<string>("");
   const [logsLoading, setLogsLoading] = useState(false);
   const [logsFullscreen, setLogsFullscreen] = useState(true);
+  const [logsAutoRefresh, setLogsAutoRefresh] = useState(true);
   const [autoScroll, setAutoScroll] = useState(true);
   const logsRef = useRef<HTMLPreElement>(null);
   const [form] = Form.useForm();
@@ -148,9 +149,9 @@ export default function Deployments() {
     return () => clearInterval(interval);
   }, [fetchDeployments]);
 
-  // Auto-refresh logs when modal is open
+  // Auto-refresh logs when modal is open and auto-refresh is enabled
   useEffect(() => {
-    if (!logsModal) return;
+    if (!logsModal || !logsAutoRefresh) return;
 
     const interval = setInterval(async () => {
       try {
@@ -162,7 +163,7 @@ export default function Deployments() {
     }, 2000); // Refresh every 2 seconds
 
     return () => clearInterval(interval);
-  }, [logsModal]);
+  }, [logsModal, logsAutoRefresh]);
 
   // Auto-scroll logs to bottom
   useEffect(() => {
@@ -1021,13 +1022,20 @@ export default function Deployments() {
             }}
           >
             <span>Logs: {logsModal?.name}</span>
-            <Tag color="green">Auto-refresh</Tag>
+            <Tag
+              color={logsAutoRefresh ? "green" : "default"}
+              style={{ cursor: "pointer" }}
+              onClick={() => setLogsAutoRefresh(!logsAutoRefresh)}
+            >
+              Auto-refresh {logsAutoRefresh ? "ON" : "OFF"}
+            </Tag>
           </div>
         }
         open={!!logsModal}
         onCancel={() => {
           setLogsModal(null);
           setLogsFullscreen(false);
+          setLogsAutoRefresh(true);
           setAutoScroll(true);
         }}
         footer={
@@ -1058,7 +1066,7 @@ export default function Deployments() {
               onClick={() => setLogsFullscreen(!logsFullscreen)}
               size="small"
             >
-              {logsFullscreen ? "Exit" : "Fullscreen"}
+              {logsFullscreen ? "Minimize" : "Fullscreen"}
             </Button>
           </Space>
         }

From a0517beecbe22326e3f02bb6fa4d623f0f52cb59 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 21:24:06 +0800
Subject: [PATCH 09/27] feat: add Docker network support for Windows
 compatibility

- Local deployments now join lmstack network for container communication
- Backend uses container name instead of IP:port for local workers
- Worker runner.py supports optional network parameter
- Fixes Windows Docker Desktop networking issues where containers cannot communicate
---
 backend/app/services/deployer.py | 75 ++++++++++++++++++++++++++++----
 worker/docker_ops/runner.py      | 50 +++++++++++++++------
 2 files changed, 102 insertions(+), 23 deletions(-)

diff --git a/backend/app/services/deployer.py b/backend/app/services/deployer.py
index a455292..a68e94c 100644
--- a/backend/app/services/deployer.py
+++ b/backend/app/services/deployer.py
@@ -81,7 +81,10 @@ async def deploy(self, deployment_id: int) -> None:
                         return
                     deployment.container_id = result.get("container_id")
                     deployment.port = result.get("port")
+                    # Store container_name for internal Docker network communication
+                    local_container_name = result.get("container_name")
                 else:
+                    local_container_name = None  # Remote workers use IP:port
                     # Send to remote worker agent
                     worker_url = f"http://{deployment.worker.address}/deploy"
                     progress_url = (
@@ -149,6 +152,7 @@ async def deploy(self, deployment_id: int) -> None:
                     ollama_ready = await self._wait_for_ollama_ready(
                         deployment.worker.address,
                         deployment.port,
+                        container_name=local_container_name,
                     )
                     if not ollama_ready:
                         deployment.status = DeploymentStatus.ERROR.value
@@ -163,6 +167,7 @@ async def deploy(self, deployment_id: int) -> None:
                         deployment.worker.address,
                         deployment.port,
                         deployment.model.model_id,
+                        container_name=local_container_name,
                     )
                     if not pull_success:
                         deployment.status = DeploymentStatus.ERROR.value
@@ -180,6 +185,7 @@ async def deploy(self, deployment_id: int) -> None:
                     deployment_id,
                     db,
                     backend=deployment.backend,
+                    container_name=local_container_name,
                 )
 
                 # Refresh deployment object after health check updates
@@ -211,6 +217,7 @@ async def _wait_for_ollama_ready(
         worker_address: str,
         port: int,
         timeout: int = 60,
+        container_name: str | None = None,
     ) -> bool:
         """Wait for Ollama API to be available.
 
@@ -218,12 +225,17 @@ async def _wait_for_ollama_ready(
             worker_address: Worker address (host:port)
             port: Ollama container port
             timeout: Maximum wait time in seconds
+            container_name: Container name for Docker network (Windows compatibility)
 
         Returns:
             True if Ollama is ready, False on timeout
         """
-        worker_ip = worker_address.split(":")[0]
-        api_url = f"http://{worker_ip}:{port}/api/tags"
+        # Ollama is configured to use port 8000 (OLLAMA_HOST=0.0.0.0:8000)
+        if container_name:
+            api_url = f"http://{container_name}:8000/api/tags"
+        else:
+            worker_ip = worker_address.split(":")[0]
+            api_url = f"http://{worker_ip}:{port}/api/tags"
 
         logger.info(f"Waiting for Ollama API at {api_url}")
 
@@ -253,14 +265,19 @@ async def _ollama_pull_model(
         worker_address: str,
         port: int,
         model_id: str,
+        container_name: str | None = None,
     ) -> bool:
         """Pull a model using Ollama API.
 
         Ollama requires models to be pulled before they can be used.
         This method calls the /api/pull endpoint and waits for completion.
         """
-        worker_ip = worker_address.split(":")[0]
-        api_url = f"http://{worker_ip}:{port}/api/pull"
+        # Ollama is configured to use port 8000 (OLLAMA_HOST=0.0.0.0:8000)
+        if container_name:
+            api_url = f"http://{container_name}:8000/api/pull"
+        else:
+            worker_ip = worker_address.split(":")[0]
+            api_url = f"http://{worker_ip}:{port}/api/pull"
 
         logger.info(f"Pulling Ollama model: {model_id}")
 
@@ -317,16 +334,33 @@ async def _wait_for_api_ready(
         deployment_id: int,
         db,
         backend: str = BackendType.VLLM.value,
+        container_name: str | None = None,
     ) -> bool | None:
         """
         Poll the OpenAI API endpoint until it's ready or cancelled.
 
+        Args:
+            worker_address: Worker address (host:port)
+            port: Host port for the model API
+            deployment_id: Deployment ID for status updates
+            db: Database session
+            backend: Backend type (vllm, ollama, etc.)
+            container_name: Container name for local Docker network communication.
+                           If set, uses container_name:8000 instead of worker_ip:port.
+                           This is needed for Windows Docker Desktop compatibility.
+
         Returns:
             True: API is ready
             None: Cancelled (user stopped deployment)
         """
-        worker_ip = worker_address.split(":")[0]
-        api_base_url = f"http://{worker_ip}:{port}"
+        # For local deployments with container_name, use Docker internal networking
+        # All backends (vLLM, SGLang, Ollama) are configured to use port 8000
+        if container_name:
+            api_base_url = f"http://{container_name}:8000"
+            logger.info(f"Using Docker network for API: {api_base_url}")
+        else:
+            worker_ip = worker_address.split(":")[0]
+            api_base_url = f"http://{worker_ip}:{port}"
 
         # Both vLLM and Ollama support OpenAI-compatible /v1/models endpoint
         health_endpoint = f"{api_base_url}/v1/models"
@@ -490,6 +524,10 @@ async def _deploy_local(self, deploy_request: dict) -> dict:
 
         This is used for local workers where we don't need to go through
         a remote worker agent.
+
+        On Windows Docker Desktop, containers must be on the same network
+        to communicate. We put model containers on the 'lmstack' network
+        so the backend can reach them via container name.
         """
         try:
             client = docker.from_env()
@@ -500,12 +538,26 @@ async def _deploy_local(self, deploy_request: dict) -> dict:
             gpu_indexes = deploy_request.get("gpu_indexes", [0])
             deployment_name = deploy_request.get("deployment_name", "lmstack-deployment")
 
-            # Find available port
+            # Find available port (still used for external access)
             host_port = self._find_available_port()
 
-            # Container name
+            # Container name - used for internal Docker network communication
             container_name = f"lmstack-{deployment_name}-{deploy_request['deployment_id']}"
 
+            # Ensure lmstack network exists (for Windows Docker Desktop compatibility)
+            network_name = "lmstack_lmstack"
+            try:
+                client.networks.get(network_name)
+            except docker.errors.NotFound:
+                # Try alternative network name (depends on compose project name)
+                try:
+                    network_name = "lmstack"
+                    client.networks.get(network_name)
+                except docker.errors.NotFound:
+                    # Create the network if it doesn't exist
+                    logger.info(f"Creating Docker network: {network_name}")
+                    client.networks.create(network_name, driver="bridge")
+
             # Build GPU device requests
             device_requests = [
                 docker.types.DeviceRequest(
@@ -537,12 +589,17 @@ async def _deploy_local(self, deploy_request: dict) -> dict:
                 },
                 shm_size="16g",  # Required for large model inference
                 restart_policy={"Name": "unless-stopped"},
+                network=network_name,  # Join lmstack network for Windows compatibility
             )
 
-            logger.info(f"Started local container: {container.id[:12]} on port {host_port}")
+            logger.info(
+                f"Started local container: {container.id[:12]} "
+                f"(name={container_name}) on network={network_name}, port={host_port}"
+            )
 
             return {
                 "container_id": container.id,
+                "container_name": container_name,
                 "port": host_port,
             }
 
diff --git a/worker/docker_ops/runner.py b/worker/docker_ops/runner.py
index fd31790..a13bc0a 100644
--- a/worker/docker_ops/runner.py
+++ b/worker/docker_ops/runner.py
@@ -216,8 +216,20 @@ async def run(
         environment: dict[str, str],
         deployment_id: int = 0,
         port: Optional[int] = None,
+        network: Optional[str] = None,
     ) -> tuple[str, int]:
-        """Run a container and return (container_id, port)."""
+        """Run a container and return (container_id, port).
+
+        Args:
+            name: Container name
+            image: Docker image
+            command: Container command
+            gpu_indexes: GPU indices to use
+            environment: Environment variables
+            deployment_id: Deployment ID for progress tracking
+            port: Optional specific port to use
+            network: Optional Docker network to join (for Windows compatibility)
+        """
         loop = asyncio.get_event_loop()
         return await loop.run_in_executor(
             None,
@@ -229,6 +241,7 @@ async def run(
             environment,
             deployment_id,
             port,
+            network,
         )
 
     def _run_sync(
@@ -240,6 +253,7 @@ def _run_sync(
         environment: dict[str, str],
         deployment_id: int = 0,
         port: Optional[int] = None,
+        network: Optional[str] = None,
     ) -> tuple[str, int]:
         """Synchronous container run."""
         # Check if container with same name exists
@@ -286,24 +300,32 @@ def _run_sync(
             **environment,
         }
 
-        # Run container
-        container = self.client.containers.run(
-            image=image,
-            name=name,
-            command=command,
-            detach=True,
-            remove=False,
-            ports={"8000/tcp": host_port},
-            device_requests=device_requests,
-            environment=env,
-            shm_size="16g",
-            volumes={
+        # Build container run kwargs
+        run_kwargs = {
+            "image": image,
+            "name": name,
+            "command": command,
+            "detach": True,
+            "remove": False,
+            "ports": {"8000/tcp": host_port},
+            "device_requests": device_requests,
+            "environment": env,
+            "shm_size": "16g",
+            "volumes": {
                 "/root/.cache/huggingface": {
                     "bind": "/root/.cache/huggingface",
                     "mode": "rw",
                 },
             },
-        )
+        }
+
+        # Add network if specified (for Windows Docker Desktop compatibility)
+        if network:
+            run_kwargs["network"] = network
+            logger.info(f"Creating container on network: {network}")
+
+        # Run container
+        container = self.client.containers.run(**run_kwargs)
 
         return container.id, host_port
 

From a1f135ceb420110d97de64cfbae19db7a6af7de2 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Fri, 16 Jan 2026 21:53:15 +0800
Subject: [PATCH 10/27] fix: use container name for Docker internal network
 communication (Windows compatibility)

---
 backend/app/api/gateway.py                    |  4 ++
 backend/app/database.py                       | 21 ++++++-
 backend/app/models/deployment.py              |  1 +
 backend/app/services/deployer.py              |  1 +
 backend/app/services/gateway.py               | 21 ++++++-
 .../007_add_deployment_container_name.py      | 59 +++++++++++++++++++
 6 files changed, 103 insertions(+), 4 deletions(-)
 create mode 100644 backend/migrations/007_add_deployment_container_name.py

diff --git a/backend/app/api/gateway.py b/backend/app/api/gateway.py
index 50d7ad7..c814267 100644
--- a/backend/app/api/gateway.py
+++ b/backend/app/api/gateway.py
@@ -215,6 +215,7 @@ async def chat_completions(
     upstream_url = gateway_service.build_upstream_url(
         deployment.worker.address,
         deployment.port,
+        deployment.container_name,
     )
 
     # Replace model name with the actual model_id for vLLM
@@ -317,6 +318,7 @@ async def completions(
     upstream_url = gateway_service.build_upstream_url(
         deployment.worker.address,
         deployment.port,
+        deployment.container_name,
     )
 
     # Replace model name with the actual model_id for vLLM
@@ -587,6 +589,7 @@ async def embeddings(
     upstream_url = gateway_service.build_upstream_url(
         deployment.worker.address,
         deployment.port,
+        deployment.container_name,
     )
 
     # Replace model name with the actual model_id for vLLM
@@ -930,6 +933,7 @@ async def responses(
     upstream_url = gateway_service.build_upstream_url(
         deployment.worker.address,
         deployment.port,
+        deployment.container_name,
     )
 
     # Convert Responses API format to Chat Completions format
diff --git a/backend/app/database.py b/backend/app/database.py
index 19df838..961bdf9 100644
--- a/backend/app/database.py
+++ b/backend/app/database.py
@@ -37,11 +37,30 @@ async def get_db() -> AsyncSession:
             await session.close()
 
 
+async def _run_migrations(conn):
+    """Run schema migrations for new columns (SQLite compatible)."""
+    from sqlalchemy import text
+
+    async def column_exists(table_name: str, column_name: str) -> bool:
+        """Check if a column exists in a table."""
+        result = await conn.execute(text(f"PRAGMA table_info({table_name})"))
+        columns = [row[1] for row in result.fetchall()]
+        return column_name in columns
+
+    # Migration: Add container_name to deployments (for Windows Docker compatibility)
+    if not await column_exists("deployments", "container_name"):
+        logger.info("Adding 'container_name' column to deployments table...")
+        await conn.execute(text("ALTER TABLE deployments ADD COLUMN container_name VARCHAR(255)"))
+        logger.info("'container_name' column added!")
+
+
 async def init_db():
-    """Initialize database tables"""
+    """Initialize database tables and run migrations"""
     try:
         async with engine.begin() as conn:
             await conn.run_sync(Base.metadata.create_all)
+            # Run schema migrations for any new columns
+            await _run_migrations(conn)
     except Exception as e:
         # Ignore "already exists" errors from race conditions with multiple workers
         if "already exists" in str(e):
diff --git a/backend/app/models/deployment.py b/backend/app/models/deployment.py
index 9064637..83eff06 100644
--- a/backend/app/models/deployment.py
+++ b/backend/app/models/deployment.py
@@ -48,6 +48,7 @@ class Deployment(Base):
 
     # Container info
     container_id: Mapped[str | None] = mapped_column(String(255), nullable=True)
+    container_name: Mapped[str | None] = mapped_column(String(255), nullable=True)
     port: Mapped[int | None] = mapped_column(Integer, nullable=True)
 
     # Configuration
diff --git a/backend/app/services/deployer.py b/backend/app/services/deployer.py
index a68e94c..aff7a4c 100644
--- a/backend/app/services/deployer.py
+++ b/backend/app/services/deployer.py
@@ -83,6 +83,7 @@ async def deploy(self, deployment_id: int) -> None:
                     deployment.port = result.get("port")
                     # Store container_name for internal Docker network communication
                     local_container_name = result.get("container_name")
+                    deployment.container_name = local_container_name
                 else:
                     local_container_name = None  # Remote workers use IP:port
                     # Send to remote worker agent
diff --git a/backend/app/services/gateway.py b/backend/app/services/gateway.py
index 5439c1d..a1dc05e 100644
--- a/backend/app/services/gateway.py
+++ b/backend/app/services/gateway.py
@@ -263,12 +263,27 @@ async def record_usage(
         await db.commit()
 
     @staticmethod
-    def build_upstream_url(worker_address: str, port: int) -> str:
+    def build_upstream_url(
+        worker_address: str, port: int, container_name: str | None = None
+    ) -> str:
         """Build the upstream URL for the deployment.
 
-        worker_address may include port (e.g., "192.168.1.1:8080"),
-        we only need the host part.
+        For local deployments with container_name, use Docker internal networking
+        (container_name:8000) for container-to-container communication.
+        This is required for Windows Docker Desktop where host.docker.internal:port
+        doesn't work for backend-to-model communication.
+
+        For remote workers, use worker_address:port as before.
+
+        Args:
+            worker_address: Worker address (may include port, e.g., "192.168.1.1:8080")
+            port: Host port for the deployment
+            container_name: Docker container name for local deployments
         """
+        # For local deployments with container name, use Docker internal networking
+        if container_name:
+            return f"http://{container_name}:8000"
+
         # Extract host from worker address (remove agent port if present)
         host = worker_address.split(":")[0]
         return f"http://{host}:{port}"
diff --git a/backend/migrations/007_add_deployment_container_name.py b/backend/migrations/007_add_deployment_container_name.py
new file mode 100644
index 0000000..af4b769
--- /dev/null
+++ b/backend/migrations/007_add_deployment_container_name.py
@@ -0,0 +1,59 @@
+"""
+Migration: Add container_name column to deployments table
+
+This column stores the Docker container name for local deployments,
+enabling internal Docker network communication (container-to-container).
+Required for Windows Docker Desktop compatibility where host.docker.internal:port
+doesn't work for backend-to-model communication.
+
+Run with: python -m migrations.007_add_deployment_container_name
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import create_async_engine
+
+from app.config import get_settings
+
+
+async def column_exists(conn, table_name: str, column_name: str) -> bool:
+    """Check if a column exists in a table (SQLite compatible)"""
+    result = await conn.execute(text(f"PRAGMA table_info({table_name})"))
+    columns = [row[1] for row in result.fetchall()]
+    return column_name in columns
+
+
+async def migrate():
+    settings = get_settings()
+    engine = create_async_engine(settings.database_url, echo=True)
+
+    async with engine.begin() as conn:
+        # Add container_name column
+        if not await column_exists(conn, "deployments", "container_name"):
+            print("Adding 'container_name' column to deployments table...")
+            await conn.execute(
+                text(
+                    """
+                ALTER TABLE deployments ADD COLUMN container_name VARCHAR(255)
+            """
+                )
+            )
+            print("'container_name' column added!")
+        else:
+            print("'container_name' column already exists")
+
+        print("\n" + "=" * 50)
+        print("Migration completed successfully!")
+        print("=" * 50)
+
+    await engine.dispose()
+
+
+if __name__ == "__main__":
+    asyncio.run(migrate())

From 309df31c3b4888bf2778298d04b9064b1ea61d65 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 10:51:36 +0800
Subject: [PATCH 11/27] feat: add auto-restart, deployment sync, and chat proxy
 for Windows compatibility

---
 backend/app/api/deployments.py          | 142 +++++++++-
 backend/app/api/workers.py              |   5 +-
 backend/app/main.py                     |  65 ++++-
 backend/app/services/__init__.py        |   3 +
 backend/app/services/deployment_sync.py | 360 ++++++++++++++++++++++++
 backend/app/services/local_worker.py    |   3 +
 frontend/src/pages/Chat.tsx             |  13 +-
 7 files changed, 579 insertions(+), 12 deletions(-)
 create mode 100644 backend/app/services/deployment_sync.py

diff --git a/backend/app/api/deployments.py b/backend/app/api/deployments.py
index 75d60d6..e974213 100644
--- a/backend/app/api/deployments.py
+++ b/backend/app/api/deployments.py
@@ -1,6 +1,12 @@
 """Deployment API routes"""
 
-from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query
+import json
+import logging
+from collections.abc import AsyncGenerator
+
+import httpx
+from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request
+from fastapi.responses import StreamingResponse
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import selectinload
@@ -21,6 +27,9 @@
     WorkerSummary,
 )
 from app.services.deployer import DeployerService
+from app.services.gateway import gateway_service
+
+logger = logging.getLogger(__name__)
 
 router = APIRouter()
 
@@ -377,3 +386,134 @@ async def get_deployment_logs(
     logs = await deployer.get_logs(deployment, tail=tail)
 
     return DeploymentLogsResponse(deployment_id=deployment_id, logs=logs)
+
+
+# Chat proxy timeout (5 minutes for long model responses)
+CHAT_PROXY_TIMEOUT = 300.0
+
+
+@router.post("/{deployment_id}/chat")
+async def proxy_chat(
+    deployment_id: int,
+    request: Request,
+    db: AsyncSession = Depends(get_db),
+    current_user: User = Depends(require_viewer),
+):
+    """Proxy chat requests to deployment (requires viewer+).
+
+    This endpoint proxies chat completion requests to the model container,
+    allowing the frontend to communicate with models without needing direct
+    network access to Docker internal IPs.
+
+    The request body should be an OpenAI-compatible chat completion request.
+    Supports both streaming and non-streaming responses.
+    """
+    # Get deployment with worker info
+    result = await db.execute(
+        select(Deployment)
+        .where(Deployment.id == deployment_id)
+        .options(
+            selectinload(Deployment.worker),
+            selectinload(Deployment.model),
+        )
+    )
+    deployment = result.scalar_one_or_none()
+
+    if not deployment:
+        raise HTTPException(status_code=404, detail="Deployment not found")
+
+    if deployment.status != DeploymentStatus.RUNNING.value:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Deployment is not running (status: {deployment.status})",
+        )
+
+    if not deployment.worker or not deployment.port:
+        raise HTTPException(status_code=400, detail="Deployment has no worker or port assigned")
+
+    # Build upstream URL using the gateway service (handles Docker networking correctly)
+    upstream_url = gateway_service.build_upstream_url(
+        deployment.worker.address,
+        deployment.port,
+        deployment.container_name,
+    )
+    chat_endpoint = f"{upstream_url}/v1/chat/completions"
+
+    # Get request body
+    try:
+        body = await request.json()
+    except (json.JSONDecodeError, ValueError):
+        raise HTTPException(status_code=400, detail="Invalid JSON body")
+
+    # Check if streaming
+    is_streaming = body.get("stream", False)
+
+    logger.debug(f"Proxying chat to {chat_endpoint}, streaming={is_streaming}")
+
+    if is_streaming:
+        return await _proxy_streaming_chat(chat_endpoint, body)
+    else:
+        return await _proxy_chat(chat_endpoint, body)
+
+
+async def _proxy_chat(upstream_url: str, body: dict) -> dict:
+    """Proxy a non-streaming chat request."""
+    try:
+        async with httpx.AsyncClient(timeout=CHAT_PROXY_TIMEOUT) as client:
+            response = await client.post(
+                upstream_url,
+                json=body,
+                headers={"Content-Type": "application/json"},
+            )
+            return response.json()
+
+    except httpx.TimeoutException:
+        raise HTTPException(status_code=504, detail="Request to model timed out")
+    except httpx.ConnectError:
+        raise HTTPException(status_code=502, detail="Failed to connect to model")
+    except httpx.RequestError as e:
+        logger.error(f"Chat proxy request error: {e}")
+        raise HTTPException(status_code=502, detail=f"Request error: {str(e)}")
+
+
+async def _proxy_streaming_chat(upstream_url: str, body: dict) -> StreamingResponse:
+    """Proxy a streaming chat request."""
+
+    async def stream_generator() -> AsyncGenerator[bytes, None]:
+        try:
+            async with httpx.AsyncClient(timeout=CHAT_PROXY_TIMEOUT) as client:
+                async with client.stream(
+                    "POST",
+                    upstream_url,
+                    json=body,
+                    headers={"Content-Type": "application/json"},
+                ) as response:
+                    async for chunk in response.aiter_bytes():
+                        yield chunk
+
+        except httpx.TimeoutException:
+            logger.error(f"Streaming timeout for {upstream_url}")
+            error_data = {
+                "error": {"message": "Request to model timed out", "type": "timeout_error"}
+            }
+            yield f"data: {json.dumps(error_data)}\n\n".encode()
+        except httpx.ConnectError:
+            logger.error(f"Connection error for {upstream_url}")
+            error_data = {
+                "error": {"message": "Failed to connect to model", "type": "connection_error"}
+            }
+            yield f"data: {json.dumps(error_data)}\n\n".encode()
+        except httpx.RequestError as e:
+            logger.error(f"Streaming request error: {e}")
+            error_data = {"error": {"message": f"Request error: {str(e)}", "type": "request_error"}}
+            yield f"data: {json.dumps(error_data)}\n\n".encode()
+
+    return StreamingResponse(
+        stream_generator(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",
+        },
+    )
diff --git a/backend/app/api/workers.py b/backend/app/api/workers.py
index 1339df3..c2b2a0a 100644
--- a/backend/app/api/workers.py
+++ b/backend/app/api/workers.py
@@ -420,10 +420,13 @@ def _generate_docker_command(token: str, name: str, backend_url: str) -> str:
     2. Apps deployed by Worker are accessible via host network
     3. Works seamlessly on both regular machines and WSL
 
+    Uses --restart unless-stopped so worker auto-starts after system reboot.
+
     Command is single-line for cross-platform compatibility (Linux/Mac/Windows).
     """
     return (
-        f"docker run -d --name lmstack-worker --network host --gpus all --privileged "
+        f"docker run -d --name lmstack-worker --restart unless-stopped "
+        f"--network host --gpus all --privileged "
         f"-v /var/run/docker.sock:/var/run/docker.sock "
         f"-v ~/.cache/huggingface:/root/.cache/huggingface "
         f"-v /:/host:ro "
diff --git a/backend/app/main.py b/backend/app/main.py
index 3e0a87f..00e2068 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -17,6 +17,7 @@
 from app.core.exceptions import LMStackError
 from app.database import async_session_maker, init_db
 from app.models.worker import Worker, WorkerStatus
+from app.services.deployment_sync import deployment_sync_service
 
 # Configure logging
 logging.basicConfig(
@@ -29,6 +30,10 @@
 
 # Background task control
 _worker_check_task = None
+_deployment_health_task = None
+
+# Deployment health check interval (in seconds)
+DEPLOYMENT_HEALTH_CHECK_INTERVAL = 60  # Check every minute
 
 
 async def check_worker_status():
@@ -72,26 +77,74 @@ async def check_worker_status():
             await asyncio.sleep(10)  # Wait before retrying on error
 
 
+async def check_deployment_health():
+    """Background task to periodically check deployment health.
+
+    This ensures that deployments marked as 'starting' eventually become
+    'running' or 'error', and catches any containers that crash.
+    """
+    # Initial delay to let containers stabilize after startup sync
+    await asyncio.sleep(30)
+
+    while True:
+        try:
+            await asyncio.sleep(DEPLOYMENT_HEALTH_CHECK_INTERVAL)
+
+            # Only sync deployments that are in transitional states
+            stats = await deployment_sync_service.sync_all_deployments()
+
+            if stats["total"] > 0:
+                logger.debug(
+                    f"Deployment health check: {stats['running_verified']} healthy, "
+                    f"{stats['api_not_ready']} loading, {stats['container_missing']} missing"
+                )
+
+        except asyncio.CancelledError:
+            logger.info("Deployment health check task cancelled")
+            break
+        except Exception as e:
+            logger.error(f"Error in deployment health check: {e}")
+            await asyncio.sleep(30)  # Wait before retrying on error
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application lifespan handler"""
-    global _worker_check_task
+    global _worker_check_task, _deployment_health_task
 
     # Startup
     logger.info("Starting LMStack API Server...")
     await init_db()
     logger.info("Database initialized")
 
+    # Synchronize deployment status with actual container state
+    # This is important after system reboot
+    try:
+        logger.info("Synchronizing deployment status...")
+        sync_stats = await deployment_sync_service.sync_all_deployments()
+        if sync_stats["total"] > 0:
+            logger.info(
+                f"Deployment sync complete: {sync_stats['running_verified']} running, "
+                f"{sync_stats['restarting']} restarting, {sync_stats['api_not_ready']} loading, "
+                f"{sync_stats['container_missing']} missing"
+            )
+    except Exception as e:
+        logger.error(f"Failed to sync deployments on startup: {e}")
+
     # Start background task for checking worker status
     _worker_check_task = asyncio.create_task(check_worker_status())
     logger.info("Worker status check task started")
 
+    # Start background task for checking deployment health
+    _deployment_health_task = asyncio.create_task(check_deployment_health())
+    logger.info("Deployment health check task started")
+
     yield
 
     # Shutdown
     logger.info("Shutting down LMStack API Server...")
 
-    # Cancel background task
+    # Cancel background tasks
     if _worker_check_task:
         _worker_check_task.cancel()
         try:
@@ -100,6 +153,14 @@ async def lifespan(app: FastAPI):
             pass
         logger.info("Worker status check task stopped")
 
+    if _deployment_health_task:
+        _deployment_health_task.cancel()
+        try:
+            await _deployment_health_task
+        except asyncio.CancelledError:
+            pass
+        logger.info("Deployment health check task stopped")
+
 
 app = FastAPI(
     title=settings.app_name,
diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py
index 8f4b144..97b5f64 100644
--- a/backend/app/services/__init__.py
+++ b/backend/app/services/__init__.py
@@ -2,10 +2,13 @@
 
 from app.services.auth import AuthService, auth_service
 from app.services.deployer import DeployerService
+from app.services.deployment_sync import DeploymentSyncService, deployment_sync_service
 from app.services.gateway import GatewayService, gateway_service
 
 __all__ = [
     "DeployerService",
+    "DeploymentSyncService",
+    "deployment_sync_service",
     "GatewayService",
     "gateway_service",
     "AuthService",
diff --git a/backend/app/services/deployment_sync.py b/backend/app/services/deployment_sync.py
new file mode 100644
index 0000000..d99b7e0
--- /dev/null
+++ b/backend/app/services/deployment_sync.py
@@ -0,0 +1,360 @@
+"""Deployment Sync Service
+
+Synchronizes deployment status with actual container state.
+This is important after system reboot to ensure database status matches reality.
+"""
+
+import asyncio
+import logging
+
+import docker
+import httpx
+from sqlalchemy import select
+from sqlalchemy.orm import selectinload
+
+from app.database import async_session_maker
+from app.models.deployment import Deployment, DeploymentStatus
+from app.models.llm_model import BackendType
+
+logger = logging.getLogger(__name__)
+
+
+class DeploymentSyncService:
+    """Service for synchronizing deployment status with actual container state."""
+
+    # Health check configuration
+    HEALTH_CHECK_TIMEOUT = 10  # seconds
+    CONTAINER_CHECK_TIMEOUT = 5  # seconds
+    MAX_CONCURRENT_CHECKS = 5  # limit concurrent health checks
+
+    def __init__(self):
+        self._docker_client: docker.DockerClient | None = None
+
+    @property
+    def docker_client(self) -> docker.DockerClient:
+        """Lazy-load Docker client."""
+        if self._docker_client is None:
+            try:
+                self._docker_client = docker.from_env()
+            except docker.errors.DockerException as e:
+                logger.warning(f"Failed to connect to Docker: {e}")
+                raise
+        return self._docker_client
+
+    async def sync_all_deployments(self) -> dict:
+        """Synchronize all deployment statuses on startup.
+
+        Returns:
+            dict with sync statistics
+        """
+        logger.info("Starting deployment status synchronization...")
+
+        stats = {
+            "total": 0,
+            "running_verified": 0,
+            "restarting": 0,
+            "container_missing": 0,
+            "api_not_ready": 0,
+            "errors": 0,
+            "skipped": 0,
+        }
+
+        async with async_session_maker() as db:
+            # Get all deployments that should be running
+            result = await db.execute(
+                select(Deployment)
+                .where(
+                    Deployment.status.in_(
+                        [
+                            DeploymentStatus.RUNNING.value,
+                            DeploymentStatus.STARTING.value,
+                        ]
+                    )
+                )
+                .options(
+                    selectinload(Deployment.worker),
+                    selectinload(Deployment.model),
+                )
+            )
+            deployments = result.scalars().all()
+            stats["total"] = len(deployments)
+
+            if not deployments:
+                logger.info("No active deployments to sync")
+                return stats
+
+            logger.info(f"Found {len(deployments)} active deployments to check")
+
+            # Check deployments with limited concurrency
+            semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_CHECKS)
+
+            async def check_with_semaphore(deployment: Deployment):
+                async with semaphore:
+                    return await self._check_and_update_deployment(deployment, db)
+
+            tasks = [check_with_semaphore(d) for d in deployments]
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Aggregate results
+            for result in results:
+                if isinstance(result, Exception):
+                    logger.error(f"Deployment check failed: {result}")
+                    stats["errors"] += 1
+                elif isinstance(result, str):
+                    if result == "running_verified":
+                        stats["running_verified"] += 1
+                    elif result == "restarting":
+                        stats["restarting"] += 1
+                    elif result == "container_missing":
+                        stats["container_missing"] += 1
+                    elif result == "api_not_ready":
+                        stats["api_not_ready"] += 1
+                    elif result == "skipped":
+                        stats["skipped"] += 1
+
+            await db.commit()
+
+        logger.info(
+            f"Deployment sync complete: {stats['running_verified']} running, "
+            f"{stats['restarting']} restarting, {stats['container_missing']} missing, "
+            f"{stats['api_not_ready']} not ready, {stats['errors']} errors"
+        )
+
+        return stats
+
+    async def _check_and_update_deployment(self, deployment: Deployment, db) -> str:
+        """Check a single deployment and update its status.
+
+        Returns:
+            Status string: running_verified, restarting, container_missing, api_not_ready, skipped
+        """
+        logger.debug(f"Checking deployment {deployment.id}: {deployment.name}")
+
+        if not deployment.worker:
+            logger.warning(f"Deployment {deployment.id} has no worker, skipping")
+            return "skipped"
+
+        if not deployment.container_id:
+            logger.warning(f"Deployment {deployment.id} has no container_id, marking as error")
+            deployment.status = DeploymentStatus.ERROR.value
+            deployment.status_message = "Container ID missing after restart"
+            return "container_missing"
+
+        # Check if this is a local worker
+        is_local = self._is_local_worker(deployment.worker.address)
+
+        if is_local:
+            return await self._check_local_deployment(deployment)
+        else:
+            return await self._check_remote_deployment(deployment)
+
+    def _is_local_worker(self, address: str) -> bool:
+        """Check if the worker address refers to the local machine."""
+        if not address:
+            return False
+        host = address.split(":")[0].lower()
+        return host in ("localhost", "127.0.0.1", "local")
+
+    async def _check_local_deployment(self, deployment: Deployment) -> str:
+        """Check a local deployment's container and API status."""
+        try:
+            # Check container status
+            container = self.docker_client.containers.get(deployment.container_id)
+            container_status = container.status
+
+            if container_status == "running":
+                # Container is running, check API health
+                api_healthy = await self._check_api_health(
+                    deployment.worker.address,
+                    deployment.port,
+                    deployment.backend,
+                    deployment.container_name,
+                )
+
+                if api_healthy:
+                    # Everything is good
+                    if deployment.status != DeploymentStatus.RUNNING.value:
+                        deployment.status = DeploymentStatus.RUNNING.value
+                        deployment.status_message = "Model ready (verified after restart)"
+                    logger.info(f"Deployment {deployment.name}: running and healthy")
+                    return "running_verified"
+                else:
+                    # Container running but API not ready yet
+                    deployment.status = DeploymentStatus.STARTING.value
+                    deployment.status_message = "Container running, waiting for model to load..."
+                    logger.info(f"Deployment {deployment.name}: container running, API not ready")
+                    return "api_not_ready"
+
+            elif container_status == "restarting":
+                deployment.status = DeploymentStatus.STARTING.value
+                deployment.status_message = "Container restarting after system reboot..."
+                logger.info(f"Deployment {deployment.name}: container restarting")
+                return "restarting"
+
+            elif container_status in ("exited", "dead"):
+                # Container exists but stopped - try to restart it
+                logger.info(
+                    f"Deployment {deployment.name}: container {container_status}, attempting restart"
+                )
+                try:
+                    container.start()
+                    deployment.status = DeploymentStatus.STARTING.value
+                    deployment.status_message = "Restarting container after system reboot..."
+                    return "restarting"
+                except docker.errors.APIError as e:
+                    logger.error(f"Failed to restart container: {e}")
+                    deployment.status = DeploymentStatus.ERROR.value
+                    deployment.status_message = f"Failed to restart container: {e}"
+                    return "container_missing"
+
+            else:
+                # Unknown status
+                deployment.status = DeploymentStatus.STARTING.value
+                deployment.status_message = f"Container status: {container_status}"
+                logger.warning(
+                    f"Deployment {deployment.name}: unknown container status {container_status}"
+                )
+                return "restarting"
+
+        except docker.errors.NotFound:
+            # Container doesn't exist
+            logger.warning(f"Deployment {deployment.name}: container not found")
+            deployment.status = DeploymentStatus.ERROR.value
+            deployment.status_message = "Container not found after restart. Please redeploy."
+            return "container_missing"
+
+        except docker.errors.DockerException as e:
+            logger.error(f"Docker error checking deployment {deployment.name}: {e}")
+            deployment.status = DeploymentStatus.ERROR.value
+            deployment.status_message = f"Docker error: {e}"
+            return "container_missing"
+
+    async def _check_remote_deployment(self, deployment: Deployment) -> str:
+        """Check a remote deployment's status via worker API."""
+        try:
+            # For remote workers, check if worker is online first
+            if deployment.worker.status != "online":
+                deployment.status = DeploymentStatus.ERROR.value
+                deployment.status_message = f"Worker {deployment.worker.name} is offline"
+                logger.warning(f"Deployment {deployment.name}: worker offline")
+                return "container_missing"
+
+            # Check API health
+            api_healthy = await self._check_api_health(
+                deployment.worker.address,
+                deployment.port,
+                deployment.backend,
+                None,  # No container_name for remote
+            )
+
+            if api_healthy:
+                if deployment.status != DeploymentStatus.RUNNING.value:
+                    deployment.status = DeploymentStatus.RUNNING.value
+                    deployment.status_message = "Model ready (verified after restart)"
+                logger.info(f"Deployment {deployment.name}: remote deployment healthy")
+                return "running_verified"
+            else:
+                deployment.status = DeploymentStatus.STARTING.value
+                deployment.status_message = "Waiting for model to load..."
+                logger.info(f"Deployment {deployment.name}: remote API not ready")
+                return "api_not_ready"
+
+        except Exception as e:
+            logger.error(f"Error checking remote deployment {deployment.name}: {e}")
+            deployment.status = DeploymentStatus.ERROR.value
+            deployment.status_message = f"Error checking status: {e}"
+            return "container_missing"
+
+    async def _check_api_health(
+        self,
+        worker_address: str,
+        port: int,
+        backend: str,
+        container_name: str | None = None,
+    ) -> bool:
+        """Check if the deployment API is healthy.
+
+        Args:
+            worker_address: Worker address (host:port)
+            port: Host port for the model API
+            backend: Backend type (vllm, ollama, etc.)
+            container_name: Container name for Docker network (local deployments)
+
+        Returns:
+            True if API is healthy, False otherwise
+        """
+        # Build API URL
+        if container_name:
+            # Local deployment - use container name on Docker network
+            api_base_url = f"http://{container_name}:8000"
+        else:
+            # Remote deployment - use worker IP and port
+            worker_ip = worker_address.split(":")[0]
+            api_base_url = f"http://{worker_ip}:{port}"
+
+        # Check /v1/models endpoint (supported by vLLM, SGLang, and Ollama)
+        health_endpoint = f"{api_base_url}/v1/models"
+
+        try:
+            async with httpx.AsyncClient(timeout=self.HEALTH_CHECK_TIMEOUT) as client:
+                response = await client.get(health_endpoint)
+
+                if response.status_code == 200:
+                    data = response.json()
+                    # Check if models are loaded
+                    if data.get("data") and len(data["data"]) > 0:
+                        return True
+
+                # For Ollama, also try native endpoint
+                if backend == BackendType.OLLAMA.value:
+                    ollama_endpoint = f"{api_base_url}/api/tags"
+                    ollama_response = await client.get(ollama_endpoint)
+                    if ollama_response.status_code == 200:
+                        ollama_data = ollama_response.json()
+                        if ollama_data.get("models") and len(ollama_data["models"]) > 0:
+                            return True
+
+                return False
+
+        except httpx.ConnectError:
+            logger.debug(f"API not reachable at {health_endpoint}")
+            return False
+        except httpx.ReadTimeout:
+            logger.debug(f"API timeout at {health_endpoint}")
+            return False
+        except Exception as e:
+            logger.debug(f"API health check error: {e}")
+            return False
+
+    async def check_deployment_health(self, deployment_id: int) -> dict:
+        """Check health of a single deployment.
+
+        Returns:
+            dict with status and message
+        """
+        async with async_session_maker() as db:
+            result = await db.execute(
+                select(Deployment)
+                .where(Deployment.id == deployment_id)
+                .options(
+                    selectinload(Deployment.worker),
+                    selectinload(Deployment.model),
+                )
+            )
+            deployment = result.scalar_one_or_none()
+
+            if not deployment:
+                return {"status": "error", "message": "Deployment not found"}
+
+            status_result = await self._check_and_update_deployment(deployment, db)
+            await db.commit()
+
+            return {
+                "status": deployment.status,
+                "message": deployment.status_message,
+                "check_result": status_result,
+            }
+
+
+# Global instance
+deployment_sync_service = DeploymentSyncService()
diff --git a/backend/app/services/local_worker.py b/backend/app/services/local_worker.py
index 28f4456..4b5e499 100644
--- a/backend/app/services/local_worker.py
+++ b/backend/app/services/local_worker.py
@@ -160,12 +160,15 @@ def spawn_docker_worker(
 
     # Build the docker run command
     # Use --network host so worker can access backend and deployed apps
+    # Use --restart unless-stopped so worker auto-starts after reboot
     cmd = [
         "docker",
         "run",
         "-d",
         "--name",
         container_name,
+        "--restart",
+        "unless-stopped",
         "--network",
         "host",
         "--gpus",
diff --git a/frontend/src/pages/Chat.tsx b/frontend/src/pages/Chat.tsx
index a456701..0935a02 100644
--- a/frontend/src/pages/Chat.tsx
+++ b/frontend/src/pages/Chat.tsx
@@ -278,18 +278,15 @@ export default function Chat() {
   }, [isStreaming]);
 
   /**
-   * Get endpoint URL for deployment
+   * Get endpoint URL for deployment (uses backend proxy to handle Docker networking)
    */
   const getEndpointUrl = (deployment: Deployment): string | null => {
-    if (
-      !deployment.worker ||
-      !deployment.port ||
-      deployment.status !== "running"
-    ) {
+    if (deployment.status !== "running") {
       return null;
     }
-    const workerIp = deployment.worker.address.split(":")[0];
-    return `http://${workerIp}:${deployment.port}/v1/chat/completions`;
+    // Use backend proxy endpoint instead of direct model URL
+    // This handles Docker internal networking correctly (especially on Windows)
+    return `/api/deployments/${deployment.id}/chat`;
   };
 
   /**

From 07bb66e55a8ab00280cc58c51f1cbbbd1f5131de Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 10:56:51 +0800
Subject: [PATCH 12/27] fix: add credentials to chat proxy request

---
 frontend/src/pages/Chat.tsx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/frontend/src/pages/Chat.tsx b/frontend/src/pages/Chat.tsx
index 0935a02..7b3d692 100644
--- a/frontend/src/pages/Chat.tsx
+++ b/frontend/src/pages/Chat.tsx
@@ -354,6 +354,7 @@ export default function Chat() {
         const response = await fetch(endpoint, {
           method: "POST",
           headers: { "Content-Type": "application/json" },
+          credentials: "include",
           body: JSON.stringify({
             model: selectedDeployment.model?.model_id || "default",
             messages: [

From 81ffa82e9e8c450a59aad7d41af70c1200ee214c Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 11:00:53 +0800
Subject: [PATCH 13/27] fix: add Authorization header to chat proxy request

---
 frontend/src/pages/Chat.tsx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/frontend/src/pages/Chat.tsx b/frontend/src/pages/Chat.tsx
index 7b3d692..963e5ea 100644
--- a/frontend/src/pages/Chat.tsx
+++ b/frontend/src/pages/Chat.tsx
@@ -32,6 +32,7 @@ import type {
   ConversationMessage,
 } from "../services/api";
 import { useResponsive } from "../hooks";
+import { STORAGE_KEYS } from "../constants";
 
 // Conversation type for UI (maps from API type)
 interface Conversation {
@@ -351,10 +352,13 @@ export default function Chat() {
       try {
         abortControllerRef.current = new AbortController();
 
+        const token = localStorage.getItem(STORAGE_KEYS.TOKEN);
         const response = await fetch(endpoint, {
           method: "POST",
-          headers: { "Content-Type": "application/json" },
-          credentials: "include",
+          headers: {
+            "Content-Type": "application/json",
+            ...(token && { Authorization: `Bearer ${token}` }),
+          },
           body: JSON.stringify({
             model: selectedDeployment.model?.model_id || "default",
             messages: [

From fbddc24649dbbdd94610b6f6c37b1c0239c18046 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 11:22:02 +0800
Subject: [PATCH 14/27] fix: handle missing container_id in app start/stop

---
 backend/app/api/apps/lifecycle.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/backend/app/api/apps/lifecycle.py b/backend/app/api/apps/lifecycle.py
index 80e6d21..53ff99c 100644
--- a/backend/app/api/apps/lifecycle.py
+++ b/backend/app/api/apps/lifecycle.py
@@ -44,6 +44,12 @@ async def stop_app(
     if app.status != AppStatus.RUNNING.value:
         raise HTTPException(status_code=400, detail="App is not running")
 
+    if not app.container_id:
+        # No container to stop, just mark as stopped
+        app.status = AppStatus.STOPPED.value
+        await db.commit()
+        return app_to_response(app, request)
+
     await db.refresh(app, ["worker"])
     worker = app.worker
 
@@ -92,6 +98,12 @@ async def start_app(
     if app.status not in [AppStatus.STOPPED.value, AppStatus.ERROR.value]:
         raise HTTPException(status_code=400, detail="App is not stopped")
 
+    if not app.container_id:
+        raise HTTPException(
+            status_code=400,
+            detail="Container not found. Please delete and redeploy the app.",
+        )
+
     await db.refresh(app, ["worker"])
     worker = app.worker
 

From 1290f8ed9ff359ee00a3a1c7303e6726c20092e3 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 11:54:16 +0800
Subject: [PATCH 15/27] docs: simplify Windows firewall instructions, show port
 in DeployApps

---
 README.md                         | 30 +++++++++++++++++-------------
 README_zh-TW.md                   | 30 +++++++++++++++++-------------
 frontend/src/pages/DeployApps.tsx | 28 +++++++++++++++++++---------
 3 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index e230e4b..91ac717 100644
--- a/README.md
+++ b/README.md
@@ -54,27 +54,31 @@ docker compose -f docker-compose.deploy.yml up -d
 
 ### Windows Docker Desktop - LAN Access
 
-Docker Desktop on Windows binds ports to `127.0.0.1` only. To allow LAN access, run these commands in PowerShell (Administrator):
+Windows Firewall blocks LAN access by default. Choose one of the following options:
 
-```powershell
-# Add firewall rule
-New-NetFirewallRule -DisplayName "LMStack" -Direction Inbound -LocalPort 3000,52000 -Protocol TCP -Action Allow
+**Option 1: Disable Firewall (Simplest)**
 
-# Port forwarding for LAN access
-netsh interface portproxy add v4tov4 listenport=3000 listenaddress=0.0.0.0 connectport=3000 connectaddress=127.0.0.1
-netsh interface portproxy add v4tov4 listenport=52000 listenaddress=0.0.0.0 connectport=52000 connectaddress=127.0.0.1
-
-# Verify port forwarding
-netsh interface portproxy show all
+```powershell
+# Run in PowerShell (Administrator)
+Set-NetFirewallProfile -Profile Domain,Public,Private -Enabled False
 ```
 
-To remove port forwarding:
+**Option 2: Add Firewall Rules (More Secure)**
 
 ```powershell
-netsh interface portproxy delete v4tov4 listenport=3000 listenaddress=0.0.0.0
-netsh interface portproxy delete v4tov4 listenport=52000 listenaddress=0.0.0.0
+# Run in PowerShell (Administrator)
+# Base ports (Frontend + Backend API)
+New-NetFirewallRule -DisplayName "LMStack" -Direction Inbound -LocalPort 3000,52000 -Protocol TCP -Action Allow
+
+# Model deployment ports (add ports as needed, e.g., 40000-40100)
+New-NetFirewallRule -DisplayName "LMStack Models" -Direction Inbound -LocalPort 40000-40100 -Protocol TCP -Action Allow
+
+# App ports (e.g., Open WebUI on 46488)
+New-NetFirewallRule -DisplayName "LMStack Apps" -Direction Inbound -LocalPort 46000-46500 -Protocol TCP -Action Allow
 ```
 
+> **Note**: When you deploy models or apps, check the assigned port in the UI and ensure it's allowed through the firewall.
+
 ### Usage
 
 1. Login with `admin` / `admin` (change password after first login)
diff --git a/README_zh-TW.md b/README_zh-TW.md
index 89ee119..561d06f 100644
--- a/README_zh-TW.md
+++ b/README_zh-TW.md
@@ -54,27 +54,31 @@ docker compose -f docker-compose.deploy.yml up -d
 
 ### Windows Docker Desktop - 區域網路存取
 
-Windows 上的 Docker Desktop 預設只綁定到 `127.0.0.1`。若要允許區域網路存取，請在 PowerShell（系統管理員）中執行：
+Windows 防火牆預設會阻擋區域網路存取。請選擇以下其中一種方式：
 
-```powershell
-# 新增防火牆規則
-New-NetFirewallRule -DisplayName "LMStack" -Direction Inbound -LocalPort 3000,52000 -Protocol TCP -Action Allow
+**方式一：關閉防火牆（最簡單）**
 
-# 設定端口轉發以允許區域網路存取
-netsh interface portproxy add v4tov4 listenport=3000 listenaddress=0.0.0.0 connectport=3000 connectaddress=127.0.0.1
-netsh interface portproxy add v4tov4 listenport=52000 listenaddress=0.0.0.0 connectport=52000 connectaddress=127.0.0.1
-
-# 確認端口轉發設定
-netsh interface portproxy show all
+```powershell
+# 在 PowerShell（系統管理員）中執行
+Set-NetFirewallProfile -Profile Domain,Public,Private -Enabled False
 ```
 
-移除端口轉發：
+**方式二：新增防火牆規則（較安全）**
 
 ```powershell
-netsh interface portproxy delete v4tov4 listenport=3000 listenaddress=0.0.0.0
-netsh interface portproxy delete v4tov4 listenport=52000 listenaddress=0.0.0.0
+# 在 PowerShell（系統管理員）中執行
+# 基本端口（前端 + 後端 API）
+New-NetFirewallRule -DisplayName "LMStack" -Direction Inbound -LocalPort 3000,52000 -Protocol TCP -Action Allow
+
+# 模型部署端口（依需求新增，例如 40000-40100）
+New-NetFirewallRule -DisplayName "LMStack Models" -Direction Inbound -LocalPort 40000-40100 -Protocol TCP -Action Allow
+
+# App 端口（例如 Open WebUI 使用 46488）
+New-NetFirewallRule -DisplayName "LMStack Apps" -Direction Inbound -LocalPort 46000-46500 -Protocol TCP -Action Allow
 ```
 
+> **注意**：部署模型或 App 時，請在 UI 中查看分配的端口，並確保該端口已在防火牆中開放。
+
 ### 使用方式
 
 1. 使用 `admin` / `admin` 登入（首次登入後請更改密碼）
diff --git a/frontend/src/pages/DeployApps.tsx b/frontend/src/pages/DeployApps.tsx
index be0782f..4df4c3c 100644
--- a/frontend/src/pages/DeployApps.tsx
+++ b/frontend/src/pages/DeployApps.tsx
@@ -561,15 +561,25 @@ export default function DeployApps() {
                       )}
 
                       {app.status === "running" && appUrl && (
-                        <Button
-                          type="link"
-                          icon={<LinkOutlined />}
-                          href={appUrl}
-                          target="_blank"
-                          style={{ padding: 0, height: "auto" }}
-                        >
-                          Open {app.name}
-                        </Button>
+                        <div>
+                          <Button
+                            type="link"
+                            icon={<LinkOutlined />}
+                            href={appUrl}
+                            target="_blank"
+                            style={{ padding: 0, height: "auto" }}
+                          >
+                            Open {app.name}
+                          </Button>
+                          {app.port && (
+                            <Text
+                              type="secondary"
+                              style={{ fontSize: 12, marginLeft: 8 }}
+                            >
+                              Port: {app.port}
+                            </Text>
+                          )}
+                        </div>
                       )}
 
                       <div style={{ display: "flex", gap: 8, marginTop: 8 }}>

From ffc007a8bf27ed2b743cc78dbc4dfc32159c73d2 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 12:26:02 +0800
Subject: [PATCH 16/27] fix: verify image exists before container creation in
 worker

---
 worker/docker_ops/containers.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/worker/docker_ops/containers.py b/worker/docker_ops/containers.py
index d4e501e..7b2d438 100644
--- a/worker/docker_ops/containers.py
+++ b/worker/docker_ops/containers.py
@@ -8,7 +8,7 @@
 from typing import Any, Optional
 
 import docker
-from docker.errors import APIError, NotFound
+from docker.errors import APIError, ImageNotFound, NotFound
 
 logger = logging.getLogger(__name__)
 
@@ -288,6 +288,15 @@ def create_container(
         """
         logger.info(f"Creating container: {name} from image {image}")
 
+        # Verify image exists, pull if not found
+        try:
+            self.client.images.get(image)
+            logger.info(f"Image {image} found locally")
+        except ImageNotFound:
+            logger.info(f"Image {image} not found, pulling...")
+            self.client.images.pull(image)
+            logger.info(f"Image {image} pulled successfully")
+
         # Remove existing container with same name
         try:
             existing = self.client.containers.get(name)

From 5b8f6534bd8627246f0e80749b353d758ff28051 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 12:37:34 +0800
Subject: [PATCH 17/27] fix: proxy option based on worker labels, prevent
 progress bar regression

---
 frontend/src/pages/DeployApps.tsx | 16 ++++++++--------
 worker/docker_ops/images.py       |  7 +++++++
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/frontend/src/pages/DeployApps.tsx b/frontend/src/pages/DeployApps.tsx
index 4df4c3c..782957e 100644
--- a/frontend/src/pages/DeployApps.tsx
+++ b/frontend/src/pages/DeployApps.tsx
@@ -670,12 +670,14 @@ export default function DeployApps() {
             value={selectedWorker}
             onChange={(value) => {
               setSelectedWorker(value);
-              // Auto-disable proxy for localhost workers
+              // Auto-disable proxy for local workers (same machine as LMStack)
               const worker = workers.find((w) => w.id === value);
               if (worker) {
-                const workerHost = worker.address.split(":")[0];
-                if (workerHost === "localhost" || workerHost === "127.0.0.1") {
+                const isLocalWorker = worker.labels?.type === "local";
+                if (isLocalWorker) {
                   setUseProxy(false);
+                } else {
+                  setUseProxy(true);
                 }
               }
             }}
@@ -686,13 +688,11 @@ export default function DeployApps() {
           />
         </div>
 
-        {/* Hide proxy option for localhost workers (they use direct connection) */}
+        {/* Hide proxy option for local workers (same machine as LMStack) */}
         {(() => {
           const worker = workers.find((w) => w.id === selectedWorker);
-          const workerHost = worker?.address.split(":")[0];
-          const isLocalhost =
-            workerHost === "localhost" || workerHost === "127.0.0.1";
-          if (isLocalhost) return null;
+          const isLocalWorker = worker?.labels?.type === "local";
+          if (isLocalWorker) return null;
           return (
             <div style={{ marginBottom: 16 }}>
               <div
diff --git a/worker/docker_ops/images.py b/worker/docker_ops/images.py
index 7446be8..9689a3f 100644
--- a/worker/docker_ops/images.py
+++ b/worker/docker_ops/images.py
@@ -160,6 +160,7 @@ def pull_image(
 
         # Pull with progress tracking
         layers_progress: dict[str, dict] = {}
+        max_progress = 0  # Track max progress to prevent going backwards
 
         for line in self.client.api.pull(image, stream=True, decode=True, auth_config=auth):
             if progress_callback and "id" in line:
@@ -201,6 +202,12 @@ def pull_image(
                 downloaded = sum(lyr.get("current", 0) for lyr in layers_with_size)
                 progress = int((downloaded / total_size) * 100) if total_size > 0 else 0
 
+                # Never let progress go backwards
+                if progress > max_progress:
+                    max_progress = progress
+                else:
+                    progress = max_progress
+
                 progress_callback(progress, layers_progress)
 
         # Get the pulled image info

From 72856f03bb11b60f801034078a1c412d29af4eb7 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 13:02:05 +0800
Subject: [PATCH 18/27] fix: detect local workers via token.is_local instead of
 IP

---
 backend/app/api/workers.py               | 16 ++++++++++++++--
 backend/app/models/registration_token.py |  8 +++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/backend/app/api/workers.py b/backend/app/api/workers.py
index c2b2a0a..7e2e047 100644
--- a/backend/app/api/workers.py
+++ b/backend/app/api/workers.py
@@ -145,6 +145,12 @@ async def create_worker(
             existing_worker.status = WorkerStatus.ONLINE.value
             existing_worker.last_heartbeat = datetime.now(UTC)
 
+            # Update labels to mark as local if token is for local worker
+            if token.is_local:
+                worker_labels = dict(existing_worker.labels) if existing_worker.labels else {}
+                worker_labels["type"] = "local"
+                existing_worker.labels = worker_labels
+
             await db.commit()
             await db.refresh(existing_worker)
 
@@ -187,11 +193,16 @@ async def create_worker(
         reported_port = worker_in.address.split(":")[-1]
     real_address = f"{client_ip}:{reported_port}"
 
+    # Set labels for local workers (created via /local endpoint)
+    worker_labels = dict(worker_in.labels) if worker_in.labels else {}
+    if token.is_local:
+        worker_labels["type"] = "local"
+
     worker = Worker(
         name=worker_in.name,
         address=real_address,
         description=worker_in.description,
-        labels=worker_in.labels,
+        labels=worker_labels if worker_labels else None,
         gpu_info=([gpu.model_dump() for gpu in worker_in.gpu_info] if worker_in.gpu_info else None),
         system_info=(worker_in.system_info.model_dump() if worker_in.system_info else None),
         status=WorkerStatus.ONLINE.value,
@@ -379,10 +390,11 @@ async def register_local_worker(
     settings = get_settings()
     backend_url = f"http://localhost:{settings.port}"
 
-    # Create a registration token for this worker
+    # Create a registration token for this worker (marked as local)
     token = RegistrationToken.create(
         name=worker_name,
         expires_in_hours=24,  # Token valid for 24 hours
+        is_local=True,  # Mark as local worker
     )
 
     db.add(token)
diff --git a/backend/app/models/registration_token.py b/backend/app/models/registration_token.py
index a26d84d..9dfdfd8 100644
--- a/backend/app/models/registration_token.py
+++ b/backend/app/models/registration_token.py
@@ -25,6 +25,9 @@ class RegistrationToken(Base):
     )
     name: Mapped[str] = mapped_column(String(255), nullable=False)  # Suggested worker name
     is_used: Mapped[bool] = mapped_column(Boolean, default=False)
+    is_local: Mapped[bool] = mapped_column(
+        Boolean, default=False
+    )  # True if created via /local endpoint
     used_by_worker_id: Mapped[int | None] = mapped_column(Integer, nullable=True)
 
     created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
@@ -32,11 +35,14 @@ class RegistrationToken(Base):
     used_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
 
     @classmethod
-    def create(cls, name: str, expires_in_hours: int = 24) -> "RegistrationToken":
+    def create(
+        cls, name: str, expires_in_hours: int = 24, is_local: bool = False
+    ) -> "RegistrationToken":
         """Create a new registration token"""
         return cls(
             token=generate_token(),
             name=name,
+            is_local=is_local,
             expires_at=datetime.utcnow() + timedelta(hours=expires_in_hours),
         )
 

From 5572874239d7b7f01c10943be0fe14e8e32170bd Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 13:05:17 +0800
Subject: [PATCH 19/27] fix: add database migration for is_local column

---
 backend/app/database.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/backend/app/database.py b/backend/app/database.py
index 961bdf9..eaa925a 100644
--- a/backend/app/database.py
+++ b/backend/app/database.py
@@ -53,6 +53,14 @@ async def column_exists(table_name: str, column_name: str) -> bool:
         await conn.execute(text("ALTER TABLE deployments ADD COLUMN container_name VARCHAR(255)"))
         logger.info("'container_name' column added!")
 
+    # Migration: Add is_local to registration_tokens (for local worker detection)
+    if not await column_exists("registration_tokens", "is_local"):
+        logger.info("Adding 'is_local' column to registration_tokens table...")
+        await conn.execute(
+            text("ALTER TABLE registration_tokens ADD COLUMN is_local BOOLEAN DEFAULT 0")
+        )
+        logger.info("'is_local' column added!")
+
 
 async def init_db():
     """Initialize database tables and run migrations"""

From cf2250108b0aad5728c5959956ec7e031e90a507 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 13:28:55 +0800
Subject: [PATCH 20/27] fix: worker auto-detects local via BACKEND_URL, use
 indeterminate progress for starting stage

---
 frontend/src/pages/DeployApps.tsx | 17 +++++++++--------
 worker/agent.py                   | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/frontend/src/pages/DeployApps.tsx b/frontend/src/pages/DeployApps.tsx
index 782957e..b4a1a96 100644
--- a/frontend/src/pages/DeployApps.tsx
+++ b/frontend/src/pages/DeployApps.tsx
@@ -511,18 +511,19 @@ export default function DeployApps() {
                             {app.status === "starting" && <LoadingOutlined />}
                             {app.status === "pending" && <SyncOutlined spin />}
                             <Text type="secondary" style={{ fontSize: 13 }}>
-                              {progress?.stage === "unknown" ||
-                              !progress?.message
-                                ? app.status === "starting"
-                                  ? "Starting app (first startup may take 1-3 minutes)..."
-                                  : app.status === "pulling"
+                              {app.status === "starting"
+                                ? "Starting app (first startup may take 1-3 minutes)..."
+                                : progress?.stage === "unknown" ||
+                                    !progress?.message
+                                  ? app.status === "pulling"
                                     ? "Pulling image..."
                                     : "Preparing..."
-                                : progress.message}
+                                  : progress.message}
                             </Text>
                           </div>
-                          {/* Use indeterminate style when no real progress data */}
-                          {!progress ||
+                          {/* Use indeterminate style for starting stage or when no real progress data */}
+                          {app.status === "starting" ||
+                          !progress ||
                           progress.stage === "unknown" ||
                           (app.status === "pulling" &&
                             progress.progress === 0) ? (
diff --git a/worker/agent.py b/worker/agent.py
index c1dbafa..3fbe87a 100644
--- a/worker/agent.py
+++ b/worker/agent.py
@@ -80,17 +80,31 @@ def __init__(
         self._heartbeat_task: Optional[asyncio.Task] = None
         self._running = True
 
+    def _is_local_worker(self) -> bool:
+        """Check if this worker is running on the same machine as the server."""
+        # Check if server_url contains localhost or host.docker.internal
+        local_indicators = ["localhost", "127.0.0.1", "host.docker.internal", "::1"]
+        server_host = self.server_url.split("://")[-1].split(":")[0].split("/")[0].lower()
+        return server_host in local_indicators
+
     async def register(self) -> bool:
         """Register this worker with the server."""
         try:
             gpu_info = self.gpu_detector.detect()
             system_info = self.system_detector.detect()
 
+            # Build labels - mark as local if connecting to localhost/host.docker.internal
+            labels = {}
+            if self._is_local_worker():
+                labels["type"] = "local"
+                logger.info("Detected local worker (same machine as LMStack server)")
+
             registration_data = {
                 "name": self.name,
                 "address": f"{self._get_advertise_address()}:{self.port}",
                 "gpu_info": gpu_info,
                 "system_info": system_info,
+                "labels": labels if labels else None,
             }
 
             # Include registration token if provided

From 012d5dbb06e53044085afad6743b870a26253f73 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 13:33:48 +0800
Subject: [PATCH 21/27] fix: use browser hostname for apps with internal worker
 IP

---
 frontend/src/pages/DeployApps.tsx | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/frontend/src/pages/DeployApps.tsx b/frontend/src/pages/DeployApps.tsx
index b4a1a96..56102e0 100644
--- a/frontend/src/pages/DeployApps.tsx
+++ b/frontend/src/pages/DeployApps.tsx
@@ -304,10 +304,24 @@ export default function DeployApps() {
       // Use current hostname (LMStack IP) + app port
       return `http://${window.location.hostname}:${app.port}`;
     } else {
-      // Direct connection to worker
+      // Direct connection - for local workers, use current hostname
+      // Check if worker_address is a private/internal IP (Docker network, etc.)
       if (app.worker_address) {
         const workerHost = app.worker_address.split(":")[0];
-        return `http://${workerHost}:${app.port}`;
+        const isInternalIp =
+          workerHost.startsWith("172.") ||
+          workerHost.startsWith("10.") ||
+          workerHost.startsWith("192.168.") ||
+          workerHost === "localhost" ||
+          workerHost === "127.0.0.1";
+
+        if (isInternalIp) {
+          // Local worker - use current browser hostname
+          return `http://${window.location.hostname}:${app.port}`;
+        } else {
+          // Remote worker - use worker's IP
+          return `http://${workerHost}:${app.port}`;
+        }
       }
       return null;
     }

From d1df7a95a1176299e1e38b21c69abc20c5b36299 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 20:09:58 +0800
Subject: [PATCH 22/27] fix: handle stale image tags pointing to pruned SHA

When a Docker image tag exists but points to a deleted/pruned SHA,
Docker returns APIError (404) instead of ImageNotFound. Now catching
both exceptions to properly trigger image re-pull.
---
 worker/docker_ops/containers.py | 7 +++++--
 worker/docker_ops/runner.py     | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/worker/docker_ops/containers.py b/worker/docker_ops/containers.py
index 7b2d438..02a9017 100644
--- a/worker/docker_ops/containers.py
+++ b/worker/docker_ops/containers.py
@@ -289,11 +289,14 @@ def create_container(
         logger.info(f"Creating container: {name} from image {image}")
 
         # Verify image exists, pull if not found
+        # Note: We catch both ImageNotFound and APIError because when a tag exists
+        # but points to a deleted/pruned image SHA, Docker returns a 404 APIError
+        # instead of ImageNotFound.
         try:
             self.client.images.get(image)
             logger.info(f"Image {image} found locally")
-        except ImageNotFound:
-            logger.info(f"Image {image} not found, pulling...")
+        except (ImageNotFound, APIError) as e:
+            logger.info(f"Image {image} not found or invalid ({type(e).__name__}), pulling...")
             self.client.images.pull(image)
             logger.info(f"Image {image} pulled successfully")
 
diff --git a/worker/docker_ops/runner.py b/worker/docker_ops/runner.py
index a13bc0a..62a97bb 100644
--- a/worker/docker_ops/runner.py
+++ b/worker/docker_ops/runner.py
@@ -265,10 +265,13 @@ def _run_sync(
             pass
 
         # Pull image if not exists
+        # Note: We catch both NotFound and APIError because when a tag exists
+        # but points to a deleted/pruned image SHA, Docker returns a 404 APIError
+        # instead of NotFound.
         try:
             self.client.images.get(image)
-        except NotFound:
-            logger.info(f"Pulling image {image}...")
+        except (NotFound, APIError) as e:
+            logger.info(f"Image {image} not found or invalid ({type(e).__name__}), pulling...")
             self.pull_image_with_progress(image, deployment_id)
 
         # Update progress to starting

From 847db2dcf0dcd8c012f3325b4975c7036402a8fc Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sat, 17 Jan 2026 21:27:04 +0800
Subject: [PATCH 23/27] fix: prevent progress bar regression during image pull

Track last known progress and only update when progress moves forward.
Ignore 'unknown' status to avoid UI flicker.
---
 backend/app/api/apps/deployment.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/backend/app/api/apps/deployment.py b/backend/app/api/apps/deployment.py
index 1b18f95..503aaad 100644
--- a/backend/app/api/apps/deployment.py
+++ b/backend/app/api/apps/deployment.py
@@ -96,6 +96,9 @@ async def pull_image_with_progress(
             )
 
             # Poll for progress while waiting
+            # Track last known progress to avoid regression when status is "unknown"
+            last_known_progress = 0
+
             while not pull_task.done():
                 try:
                     progress_resp = await client.get(progress_url, timeout=5.0)
@@ -105,12 +108,15 @@ async def pull_image_with_progress(
                         progress = progress_data.get("progress", 0)
 
                         if status == "pulling":
-                            set_deployment_progress(
-                                app_id,
-                                "pulling",
-                                progress,
-                                f"Pulling image {image}... ({progress}%)",
-                            )
+                            # Only update if progress is moving forward (avoid regression)
+                            if progress >= last_known_progress:
+                                last_known_progress = progress
+                                set_deployment_progress(
+                                    app_id,
+                                    "pulling",
+                                    progress,
+                                    f"Pulling image {image}... ({progress}%)",
+                                )
                         elif status == "completed":
                             set_deployment_progress(
                                 app_id,
@@ -118,6 +124,7 @@ async def pull_image_with_progress(
                                 100,
                                 "Image pulled successfully",
                             )
+                        # Ignore "unknown" status - keep showing last known progress
                 except Exception:
                     pass  # Progress polling is best-effort
 

From 63703dfbb33e1e851ed7c5647bf49daeedaf0063 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sun, 18 Jan 2026 09:04:35 +0800
Subject: [PATCH 24/27] fix: use Docker bridge gateway IP for container API
 access

---
 backend/app/api/apps/routes.py    | 10 +++----
 backend/app/api/apps/utils.py     | 44 ++++++++++++++++++++-----------
 frontend/src/pages/ApiKeys.tsx    |  9 ++++++-
 frontend/src/pages/DeployApps.tsx |  6 +++++
 4 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/backend/app/api/apps/routes.py b/backend/app/api/apps/routes.py
index 675bd51..f694a9f 100644
--- a/backend/app/api/apps/routes.py
+++ b/backend/app/api/apps/routes.py
@@ -216,9 +216,8 @@ async def deploy_app(
     # Initialize progress
     set_deployment_progress(app.id, "pending", 0, "Deployment queued...")
 
-    # Extract lmstack_port for background task
-    lmstack_host = request.headers.get("host", "localhost:52000")
-    lmstack_port = lmstack_host.split(":")[-1] if ":" in lmstack_host else "8000"
+    # Always use backend API port (52000)
+    lmstack_port = "52000"
 
     # Start background deployment
     background_tasks.add_task(
@@ -296,8 +295,9 @@ async def _build_env_vars(
     db: AsyncSession,
 ) -> dict:
     """Build environment variables for the app container."""
-    lmstack_host = request.headers.get("host", "localhost:52000")
-    lmstack_port = lmstack_host.split(":")[-1] if ":" in lmstack_host else "8000"
+    # Always use backend API port (52000), not the frontend port from request
+    # The request may come from frontend (port 3000) but API is on 52000
+    lmstack_port = "52000"
 
     host_ip = get_host_ip(request, worker)
 
diff --git a/backend/app/api/apps/utils.py b/backend/app/api/apps/utils.py
index 94e16a9..d6cba48 100644
--- a/backend/app/api/apps/utils.py
+++ b/backend/app/api/apps/utils.py
@@ -141,6 +141,9 @@ async def call_worker_api(
 def get_host_ip(request: Request, worker: Worker) -> str:
     """Determine the host IP that the container can use to reach LMStack.
 
+    For Docker containers to reach the host, we use the Docker bridge gateway IP
+    (typically 172.17.0.1) which is more reliable than the host's LAN IP.
+
     Args:
         request: FastAPI request object
         worker: Worker where app is deployed
@@ -148,27 +151,36 @@ def get_host_ip(request: Request, worker: Worker) -> str:
     Returns:
         Host IP address string
     """
-    import socket
+    # Check if worker is local (on same machine as LMStack)
+    worker_ip = worker.address.split(":")[0]
+    worker_labels = worker.labels or {}
+    is_local_worker = (
+        worker_ip in ("localhost", "127.0.0.1") or worker_labels.get("type") == "local"
+    )
 
+    if is_local_worker:
+        # For local workers, containers need to use Docker bridge gateway
+        # to reach host services. 172.17.0.1 is the default Docker bridge gateway.
+        # This works on Linux. For Docker Desktop (Windows/Mac), host.docker.internal
+        # would be used, but that's handled by Docker's DNS.
+        return "172.17.0.1"
+
+    # For remote workers, use the LMStack host IP that the worker can reach
     lmstack_host = request.headers.get("host", "localhost:52000")
     host_ip = lmstack_host.split(":")[0] if ":" in lmstack_host else lmstack_host
 
-    # If host is localhost, try alternatives
+    # If host is localhost, try to find our external IP
     if host_ip in ("localhost", "127.0.0.1"):
-        forwarded_host = request.headers.get("x-forwarded-host")
-        if forwarded_host:
-            host_ip = forwarded_host.split(":")[0]
-        else:
-            # Try to get our IP on the same network as the worker
-            try:
-                worker_ip = worker.address.split(":")[0]
-                s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-                s.connect((worker_ip, 80))
-                host_ip = s.getsockname()[0]
-                s.close()
-            except OSError as e:
-                logger.warning(f"Could not determine host IP for worker {worker_ip}: {e}")
-                host_ip = "host.docker.internal"  # Fallback for Docker Desktop
+        import socket
+
+        try:
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            s.connect((worker_ip, 80))
+            host_ip = s.getsockname()[0]
+            s.close()
+        except OSError as e:
+            logger.warning(f"Could not determine host IP for worker {worker_ip}: {e}")
+            host_ip = "172.17.0.1"  # Fallback to Docker bridge
 
     return host_ip
 
diff --git a/frontend/src/pages/ApiKeys.tsx b/frontend/src/pages/ApiKeys.tsx
index bb3580f..082f96c 100644
--- a/frontend/src/pages/ApiKeys.tsx
+++ b/frontend/src/pages/ApiKeys.tsx
@@ -764,11 +764,18 @@ export default function ApiKeys() {
           <div style={{ padding: 24 }}>
             <Text
               type="secondary"
-              style={{ display: "block", marginBottom: 16 }}
+              style={{ display: "block", marginBottom: 8 }}
             >
               Use the OpenAI SDK with your LMStack endpoint. Base URL:{" "}
               <code>{baseUrl}/v1</code>
             </Text>
+            <Text
+              type="secondary"
+              style={{ display: "block", marginBottom: 16, fontSize: 12 }}
+            >
+              For Docker containers (e.g., Open WebUI, n8n), use:{" "}
+              <code>http://172.17.0.1:52000/v1</code>
+            </Text>
             <Tabs
               items={[
                 {
diff --git a/frontend/src/pages/DeployApps.tsx b/frontend/src/pages/DeployApps.tsx
index 56102e0..5d26315 100644
--- a/frontend/src/pages/DeployApps.tsx
+++ b/frontend/src/pages/DeployApps.tsx
@@ -675,6 +675,12 @@ export default function DeployApps() {
             Select a worker to deploy {selectedApp?.name}. An API key will be
             automatically created.
           </Text>
+          <div style={{ marginTop: 8 }}>
+            <Text type="secondary" style={{ fontSize: 12 }}>
+              The app will connect to LMStack API at{" "}
+              <code>http://172.17.0.1:52000/v1</code> (Docker bridge gateway).
+            </Text>
+          </div>
         </div>
 
         <div style={{ marginBottom: 16 }}>

From 8158906c85a77a12f7b0fe8c6dc84d54cfb9bc5c Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sun, 18 Jan 2026 09:16:44 +0800
Subject: [PATCH 25/27] feat: use host.docker.internal for apps, reset Open
 WebUI config on start

---
 backend/app/api/apps/deployment.py |  2 ++
 backend/app/api/apps/utils.py      | 15 +++++++--------
 backend/app/models/app.py          |  1 +
 frontend/src/pages/ApiKeys.tsx     |  2 +-
 frontend/src/pages/DeployApps.tsx  | 12 ++++++------
 worker/docker_ops/containers.py    |  3 +++
 worker/models.py                   |  1 +
 worker/routes/containers.py        |  1 +
 8 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/backend/app/api/apps/deployment.py b/backend/app/api/apps/deployment.py
index 503aaad..4002c5d 100644
--- a/backend/app/api/apps/deployment.py
+++ b/backend/app/api/apps/deployment.py
@@ -470,6 +470,8 @@ async def _create_container(
             "lmstack.app.type": app_type.value,
             "lmstack.app.id": str(app_id),
         },
+        # Add host.docker.internal mapping for container to access host services
+        "extra_hosts": {"host.docker.internal": "host-gateway"},
     }
 
     # Add Linux capabilities if specified (e.g., SYS_ADMIN for AnythingLLM)
diff --git a/backend/app/api/apps/utils.py b/backend/app/api/apps/utils.py
index d6cba48..36b52b2 100644
--- a/backend/app/api/apps/utils.py
+++ b/backend/app/api/apps/utils.py
@@ -141,8 +141,8 @@ async def call_worker_api(
 def get_host_ip(request: Request, worker: Worker) -> str:
     """Determine the host IP that the container can use to reach LMStack.
 
-    For Docker containers to reach the host, we use the Docker bridge gateway IP
-    (typically 172.17.0.1) which is more reliable than the host's LAN IP.
+    For Docker containers to reach the host, we use host.docker.internal which
+    is mapped via extra_hosts to host-gateway when creating the container.
 
     Args:
         request: FastAPI request object
@@ -159,11 +159,10 @@ def get_host_ip(request: Request, worker: Worker) -> str:
     )
 
     if is_local_worker:
-        # For local workers, containers need to use Docker bridge gateway
-        # to reach host services. 172.17.0.1 is the default Docker bridge gateway.
-        # This works on Linux. For Docker Desktop (Windows/Mac), host.docker.internal
-        # would be used, but that's handled by Docker's DNS.
-        return "172.17.0.1"
+        # For local workers, use host.docker.internal which is mapped to
+        # host-gateway via extra_hosts when creating the container.
+        # This works on all platforms (Linux, Windows, Mac).
+        return "host.docker.internal"
 
     # For remote workers, use the LMStack host IP that the worker can reach
     lmstack_host = request.headers.get("host", "localhost:52000")
@@ -180,7 +179,7 @@ def get_host_ip(request: Request, worker: Worker) -> str:
             s.close()
         except OSError as e:
             logger.warning(f"Could not determine host IP for worker {worker_ip}: {e}")
-            host_ip = "172.17.0.1"  # Fallback to Docker bridge
+            host_ip = "host.docker.internal"  # Fallback
 
     return host_ip
 
diff --git a/backend/app/models/app.py b/backend/app/models/app.py
index 0c6a553..845d1af 100644
--- a/backend/app/models/app.py
+++ b/backend/app/models/app.py
@@ -49,6 +49,7 @@ class AppStatus(str, Enum):
             "OLLAMA_BASE_URL": "",  # Disable Ollama
             "WEBUI_SECRET_KEY": "",
             "ENABLE_OLLAMA_API": "false",
+            "RESET_CONFIG_ON_START": "true",  # Force use env vars on restart
         },
         "volumes": [{"name": "open-webui-data", "destination": "/app/backend/data"}],
     },
diff --git a/frontend/src/pages/ApiKeys.tsx b/frontend/src/pages/ApiKeys.tsx
index 082f96c..561d8d9 100644
--- a/frontend/src/pages/ApiKeys.tsx
+++ b/frontend/src/pages/ApiKeys.tsx
@@ -774,7 +774,7 @@ export default function ApiKeys() {
               style={{ display: "block", marginBottom: 16, fontSize: 12 }}
             >
               For Docker containers (e.g., Open WebUI, n8n), use:{" "}
-              <code>http://172.17.0.1:52000/v1</code>
+              <code>http://host.docker.internal:52000/v1</code>
             </Text>
             <Tabs
               items={[
diff --git a/frontend/src/pages/DeployApps.tsx b/frontend/src/pages/DeployApps.tsx
index 5d26315..96ec530 100644
--- a/frontend/src/pages/DeployApps.tsx
+++ b/frontend/src/pages/DeployApps.tsx
@@ -358,6 +358,12 @@ export default function DeployApps() {
         <Text type="secondary" style={{ fontSize: isMobile ? 13 : 14 }}>
           Deploy companion applications that integrate with LMStack
         </Text>
+        <div style={{ marginTop: 8 }}>
+          <Text type="secondary" style={{ fontSize: 12 }}>
+            Apps connect to LMStack API via{" "}
+            <code>http://host.docker.internal:52000/v1</code>
+          </Text>
+        </div>
       </div>
 
       {/* Available Apps */}
@@ -675,12 +681,6 @@ export default function DeployApps() {
             Select a worker to deploy {selectedApp?.name}. An API key will be
             automatically created.
           </Text>
-          <div style={{ marginTop: 8 }}>
-            <Text type="secondary" style={{ fontSize: 12 }}>
-              The app will connect to LMStack API at{" "}
-              <code>http://172.17.0.1:52000/v1</code> (Docker bridge gateway).
-            </Text>
-          </div>
         </div>
 
         <div style={{ marginBottom: 16 }}>
diff --git a/worker/docker_ops/containers.py b/worker/docker_ops/containers.py
index 02a9017..4259f64 100644
--- a/worker/docker_ops/containers.py
+++ b/worker/docker_ops/containers.py
@@ -265,6 +265,7 @@ def create_container(
         cpu_limit: Optional[float] = None,
         memory_limit: Optional[int] = None,
         cap_add: Optional[list[str]] = None,
+        extra_hosts: Optional[dict[str, str]] = None,
     ) -> dict[str, Any]:
         """Create and start a new container.
 
@@ -282,6 +283,7 @@ def create_container(
             cpu_limit: CPU limit (number of CPUs)
             memory_limit: Memory limit in bytes
             cap_add: Linux capabilities to add (e.g., ["SYS_ADMIN"])
+            extra_hosts: Extra hostname mappings (e.g., {"host.docker.internal": "host-gateway"})
 
         Returns:
             Created container information
@@ -363,6 +365,7 @@ def create_container(
             cpu_quota=int(cpu_limit * 100000) if cpu_limit else None,
             mem_limit=memory_limit,
             cap_add=cap_add,
+            extra_hosts=extra_hosts,
         )
 
         logger.info(f"Container {name} created with ID {container.short_id}")
diff --git a/worker/models.py b/worker/models.py
index 99057cd..ec982c9 100644
--- a/worker/models.py
+++ b/worker/models.py
@@ -87,6 +87,7 @@ class ContainerCreateRequest(BaseModel):
     cpu_limit: Optional[float] = None
     memory_limit: Optional[int] = None
     cap_add: Optional[list[str]] = None
+    extra_hosts: Optional[dict[str, str]] = None
 
 
 class ContainerActionRequest(BaseModel):
diff --git a/worker/routes/containers.py b/worker/routes/containers.py
index 5c7c4e1..1c38c4b 100644
--- a/worker/routes/containers.py
+++ b/worker/routes/containers.py
@@ -95,6 +95,7 @@ async def create_container(request: ContainerCreateRequest):
             cpu_limit=request.cpu_limit,
             memory_limit=request.memory_limit,
             cap_add=request.cap_add,
+            extra_hosts=request.extra_hosts,
         )
     except Exception as e:
         logger.error(f"Failed to create container {request.name}: {e}")

From 34f589024d7413f1ad1573cc261859d81bd755f1 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sun, 18 Jan 2026 09:28:08 +0800
Subject: [PATCH 26/27] feat: add app status sync and fix stale image SHA error

---
 backend/app/main.py              |  60 ++++++++++-
 backend/app/services/app_sync.py | 173 +++++++++++++++++++++++++++++++
 worker/docker_ops/containers.py  |  69 +++++++-----
 3 files changed, 275 insertions(+), 27 deletions(-)
 create mode 100644 backend/app/services/app_sync.py

diff --git a/backend/app/main.py b/backend/app/main.py
index 00e2068..703a75f 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -17,6 +17,7 @@
 from app.core.exceptions import LMStackError
 from app.database import async_session_maker, init_db
 from app.models.worker import Worker, WorkerStatus
+from app.services.app_sync import app_sync_service
 from app.services.deployment_sync import deployment_sync_service
 
 # Configure logging
@@ -31,9 +32,11 @@
 # Background task control
 _worker_check_task = None
 _deployment_health_task = None
+_app_health_task = None
 
-# Deployment health check interval (in seconds)
+# Health check interval (in seconds)
 DEPLOYMENT_HEALTH_CHECK_INTERVAL = 60  # Check every minute
+APP_HEALTH_CHECK_INTERVAL = 30  # Check apps more frequently
 
 
 async def check_worker_status():
@@ -107,10 +110,39 @@ async def check_deployment_health():
             await asyncio.sleep(30)  # Wait before retrying on error
 
 
+async def check_app_health():
+    """Background task to periodically check app container health.
+
+    This ensures that apps marked as 'running' are actually running,
+    and catches any containers that are manually deleted or crashed.
+    """
+    # Initial delay to let system stabilize
+    await asyncio.sleep(15)
+
+    while True:
+        try:
+            await asyncio.sleep(APP_HEALTH_CHECK_INTERVAL)
+
+            stats = await app_sync_service.sync_all_apps()
+
+            if stats["total"] > 0:
+                logger.debug(
+                    f"App health check: {stats['running_verified']} healthy, "
+                    f"{stats['container_missing']} missing"
+                )
+
+        except asyncio.CancelledError:
+            logger.info("App health check task cancelled")
+            break
+        except Exception as e:
+            logger.error(f"Error in app health check: {e}")
+            await asyncio.sleep(30)  # Wait before retrying on error
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application lifespan handler"""
-    global _worker_check_task, _deployment_health_task
+    global _worker_check_task, _deployment_health_task, _app_health_task
 
     # Startup
     logger.info("Starting LMStack API Server...")
@@ -131,6 +163,18 @@ async def lifespan(app: FastAPI):
     except Exception as e:
         logger.error(f"Failed to sync deployments on startup: {e}")
 
+    # Synchronize app status with actual container state
+    try:
+        logger.info("Synchronizing app status...")
+        app_stats = await app_sync_service.sync_all_apps()
+        if app_stats["total"] > 0:
+            logger.info(
+                f"App sync complete: {app_stats['running_verified']} running, "
+                f"{app_stats['container_missing']} missing"
+            )
+    except Exception as e:
+        logger.error(f"Failed to sync apps on startup: {e}")
+
     # Start background task for checking worker status
     _worker_check_task = asyncio.create_task(check_worker_status())
     logger.info("Worker status check task started")
@@ -139,6 +183,10 @@ async def lifespan(app: FastAPI):
     _deployment_health_task = asyncio.create_task(check_deployment_health())
     logger.info("Deployment health check task started")
 
+    # Start background task for checking app health
+    _app_health_task = asyncio.create_task(check_app_health())
+    logger.info("App health check task started")
+
     yield
 
     # Shutdown
@@ -161,6 +209,14 @@ async def lifespan(app: FastAPI):
             pass
         logger.info("Deployment health check task stopped")
 
+    if _app_health_task:
+        _app_health_task.cancel()
+        try:
+            await _app_health_task
+        except asyncio.CancelledError:
+            pass
+        logger.info("App health check task stopped")
+
 
 app = FastAPI(
     title=settings.app_name,
diff --git a/backend/app/services/app_sync.py b/backend/app/services/app_sync.py
new file mode 100644
index 0000000..b8ddaef
--- /dev/null
+++ b/backend/app/services/app_sync.py
@@ -0,0 +1,173 @@
+"""App Sync Service
+
+Synchronizes app status with actual container state.
+This is important after system reboot to ensure database status matches reality.
+"""
+
+import asyncio
+import logging
+
+import httpx
+from sqlalchemy import select
+from sqlalchemy.orm import selectinload
+
+from app.database import async_session_maker
+from app.models.app import App, AppStatus
+
+logger = logging.getLogger(__name__)
+
+
+class AppSyncService:
+    """Service for synchronizing app status with actual container state."""
+
+    # Configuration
+    CONTAINER_CHECK_TIMEOUT = 10  # seconds
+    MAX_CONCURRENT_CHECKS = 5  # limit concurrent checks
+
+    async def sync_all_apps(self) -> dict:
+        """Synchronize all app statuses.
+
+        Returns:
+            dict with sync statistics
+        """
+        logger.info("Starting app status synchronization...")
+
+        stats = {
+            "total": 0,
+            "running_verified": 0,
+            "container_missing": 0,
+            "errors": 0,
+            "skipped": 0,
+        }
+
+        async with async_session_maker() as db:
+            # Get all apps that should be running or are in transitional states
+            result = await db.execute(
+                select(App)
+                .where(
+                    App.status.in_(
+                        [
+                            AppStatus.RUNNING.value,
+                            AppStatus.STARTING.value,
+                            AppStatus.PULLING.value,
+                        ]
+                    )
+                )
+                .options(selectinload(App.worker))
+            )
+            apps = result.scalars().all()
+            stats["total"] = len(apps)
+
+            if not apps:
+                logger.info("No active apps to sync")
+                return stats
+
+            logger.info(f"Found {len(apps)} active apps to check")
+
+            # Check apps with limited concurrency
+            semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_CHECKS)
+
+            async def check_with_semaphore(app: App):
+                async with semaphore:
+                    return await self._check_and_update_app(app, db)
+
+            tasks = [check_with_semaphore(a) for a in apps]
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Aggregate results
+            for result in results:
+                if isinstance(result, Exception):
+                    logger.error(f"App check failed: {result}")
+                    stats["errors"] += 1
+                elif isinstance(result, str):
+                    if result == "running_verified":
+                        stats["running_verified"] += 1
+                    elif result == "container_missing":
+                        stats["container_missing"] += 1
+                    elif result == "skipped":
+                        stats["skipped"] += 1
+
+            await db.commit()
+
+        logger.info(
+            f"App sync complete: {stats['running_verified']} running, "
+            f"{stats['container_missing']} missing, {stats['errors']} errors"
+        )
+
+        return stats
+
+    async def _check_and_update_app(self, app: App, db) -> str:
+        """Check a single app and update its status.
+
+        Returns:
+            Status string: running_verified, container_missing, skipped
+        """
+        logger.debug(f"Checking app {app.id}: {app.name}")
+
+        if not app.worker:
+            logger.warning(f"App {app.id} has no worker, skipping")
+            return "skipped"
+
+        if not app.container_id:
+            logger.warning(f"App {app.id} has no container_id, marking as error")
+            app.status = AppStatus.ERROR.value
+            app.status_message = "Container ID missing"
+            return "container_missing"
+
+        # Check if worker is online
+        if app.worker.status != "online":
+            logger.warning(f"App {app.id}: worker offline")
+            app.status = AppStatus.ERROR.value
+            app.status_message = f"Worker {app.worker.name} is offline"
+            return "container_missing"
+
+        # Check container status via worker API
+        try:
+            async with httpx.AsyncClient(timeout=self.CONTAINER_CHECK_TIMEOUT) as client:
+                response = await client.get(
+                    f"http://{app.worker.address}/containers/{app.container_id}"
+                )
+
+                if response.status_code == 404:
+                    # Container doesn't exist
+                    logger.warning(f"App {app.name}: container not found")
+                    app.status = AppStatus.ERROR.value
+                    app.status_message = "Container not found. Please redeploy."
+                    return "container_missing"
+
+                if response.status_code == 200:
+                    container_info = response.json()
+                    state = container_info.get("state", "").lower()
+
+                    if state == "running":
+                        if app.status != AppStatus.RUNNING.value:
+                            app.status = AppStatus.RUNNING.value
+                            app.status_message = None
+                        logger.debug(f"App {app.name}: running and verified")
+                        return "running_verified"
+
+                    elif state in ("exited", "dead"):
+                        logger.warning(f"App {app.name}: container {state}")
+                        app.status = AppStatus.STOPPED.value
+                        app.status_message = f"Container {state}"
+                        return "container_missing"
+
+                    else:
+                        logger.debug(f"App {app.name}: container state {state}")
+                        return "running_verified"
+
+        except httpx.ConnectError:
+            logger.warning(f"App {app.id}: cannot connect to worker")
+            app.status = AppStatus.ERROR.value
+            app.status_message = f"Cannot connect to worker {app.worker.name}"
+            return "container_missing"
+
+        except Exception as e:
+            logger.error(f"Error checking app {app.name}: {e}")
+            return "skipped"
+
+        return "skipped"
+
+
+# Global instance
+app_sync_service = AppSyncService()
diff --git a/worker/docker_ops/containers.py b/worker/docker_ops/containers.py
index 4259f64..3ace542 100644
--- a/worker/docker_ops/containers.py
+++ b/worker/docker_ops/containers.py
@@ -290,15 +290,21 @@ def create_container(
         """
         logger.info(f"Creating container: {name} from image {image}")
 
-        # Verify image exists, pull if not found
-        # Note: We catch both ImageNotFound and APIError because when a tag exists
-        # but points to a deleted/pruned image SHA, Docker returns a 404 APIError
-        # instead of ImageNotFound.
+        # Verify image exists and is valid, pull if needed
+        # Note: We need to handle the case where a tag exists but points to a
+        # deleted/pruned image SHA. In this case, images.get() may succeed but
+        # container creation will fail with a 404 error.
+        image_valid = False
         try:
-            self.client.images.get(image)
-            logger.info(f"Image {image} found locally")
+            img = self.client.images.get(image)
+            # Try to inspect the image to verify the SHA exists
+            img.attrs  # This will fail if SHA is pruned
+            logger.info(f"Image {image} found locally and valid")
+            image_valid = True
         except (ImageNotFound, APIError) as e:
             logger.info(f"Image {image} not found or invalid ({type(e).__name__}), pulling...")
+
+        if not image_valid:
             self.client.images.pull(image)
             logger.info(f"Image {image} pulled successfully")
 
@@ -348,25 +354,38 @@ def create_container(
         if restart_policy == "on-failure":
             restart_config["MaximumRetryCount"] = 3
 
-        container = self.client.containers.run(
-            image=image,
-            name=name,
-            command=command,
-            entrypoint=entrypoint,
-            detach=True,
-            remove=False,
-            ports=port_bindings if port_bindings else None,
-            volumes=volume_bindings if volume_bindings else None,
-            device_requests=device_requests,
-            environment=environment,
-            labels=container_labels,
-            restart_policy=restart_config,
-            cpu_period=100000 if cpu_limit else None,
-            cpu_quota=int(cpu_limit * 100000) if cpu_limit else None,
-            mem_limit=memory_limit,
-            cap_add=cap_add,
-            extra_hosts=extra_hosts,
-        )
+        # Try to create container, retry with fresh pull if image is stale
+        run_kwargs = {
+            "image": image,
+            "name": name,
+            "command": command,
+            "entrypoint": entrypoint,
+            "detach": True,
+            "remove": False,
+            "ports": port_bindings if port_bindings else None,
+            "volumes": volume_bindings if volume_bindings else None,
+            "device_requests": device_requests,
+            "environment": environment,
+            "labels": container_labels,
+            "restart_policy": restart_config,
+            "cpu_period": 100000 if cpu_limit else None,
+            "cpu_quota": int(cpu_limit * 100000) if cpu_limit else None,
+            "mem_limit": memory_limit,
+            "cap_add": cap_add,
+            "extra_hosts": extra_hosts,
+        }
+
+        try:
+            container = self.client.containers.run(**run_kwargs)
+        except APIError as e:
+            # If error is related to missing image SHA, re-pull and retry
+            if "No such image" in str(e) or "not found" in str(e).lower():
+                logger.warning(f"Image {image} appears stale, re-pulling...")
+                self.client.images.pull(image)
+                logger.info(f"Image {image} re-pulled, retrying container creation")
+                container = self.client.containers.run(**run_kwargs)
+            else:
+                raise
 
         logger.info(f"Container {name} created with ID {container.short_id}")
         return self.get_container_detail(container.id)

From 6d8a678b36d1537a9f8897e154d55c8c749e09d3 Mon Sep 17 00:00:00 2001
From: rickychen-infinirc <ricky.chen@infinirc.com>
Date: Sun, 18 Jan 2026 11:48:17 +0800
Subject: [PATCH 27/27] fix: update Docker CLI to 27.x and mount docker.sock
 for local worker

---
 backend/Dockerfile | 3 ++-
 docker-compose.yml | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/backend/Dockerfile b/backend/Dockerfile
index 11fc738..323cd1e 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -20,9 +20,10 @@ FROM python:3.11-slim
 WORKDIR /app
 
 # Install docker CLI for local worker spawn feature
+# Using Docker 27.x for API version 1.47 compatibility
 RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
-    && curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-24.0.7.tgz | tar xz --strip-components=1 -C /usr/local/bin docker/docker \
+    && curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-27.4.1.tgz | tar xz --strip-components=1 -C /usr/local/bin docker/docker \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy installed packages from builder
diff --git a/docker-compose.yml b/docker-compose.yml
index 1254427..388d427 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -36,6 +36,9 @@ services:
       LMSTACK_CORS_ORIGINS: ${CORS_ORIGINS:-*}
     ports:
       - "${BACKEND_PORT:-52000}:52000"
+    volumes:
+      # Mount docker socket for local worker management
+      - /var/run/docker.sock:/var/run/docker.sock
     depends_on:
       db:
         condition: service_healthy