diff --git a/README.md b/README.md index 216cb14..6e3ca7f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ -# LMStack +

+ LMStack +

-[中文文檔](README_zh-TW.md) +

+ 中文文檔 +

LLM Deployment Management Platform - Deploy and manage Large Language Models on distributed GPU workers. @@ -48,29 +52,14 @@ docker compose -f docker-compose.deploy.yml up -d - Frontend: http://localhost:3000 - Backend API: http://localhost:52000 -### Start Worker (on GPU machine) - -```bash -docker run -d \ - --name lmstack-worker \ - --gpus all \ - --privileged \ - -p 52001:52001 \ - -v /var/run/docker.sock:/var/run/docker.sock \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -v /:/host:ro \ - -e BACKEND_URL=http://YOUR_SERVER_IP:52000 \ - -e WORKER_NAME=gpu-worker-01 \ - infinirc/lmstack-worker:latest -``` - ### Usage 1. Login with `admin` / `admin` (change password after first login) -2. Check **Workers** page - workers auto-register -3. Add model in **Models** page -4. Create deployment in **Deployments** page -5. Use OpenAI-compatible API: +2. Go to **Workers** page and click **Add Worker** to get the Docker command +3. Run the Docker command on your GPU machine to register a worker +4. Add model in **Models** page +5. Create deployment in **Deployments** page +6. Use OpenAI-compatible API: ```bash curl http://localhost:52000/v1/chat/completions \ @@ -96,21 +85,10 @@ Build and run Docker images locally: # Run locally built backend + frontend docker compose -f docker-compose.local.yml up -d - -# Run locally built worker (on GPU machine) -docker run -d \ - --name lmstack-worker \ - --gpus all \ - --privileged \ - -p 52001:52001 \ - -v /var/run/docker.sock:/var/run/docker.sock \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -v /:/host:ro \ - -e BACKEND_URL=http://YOUR_SERVER_IP:52000 \ - -e WORKER_NAME=gpu-worker-01 \ - infinirc/lmstack-worker:local ``` +Then go to **Workers** page in the UI to add a worker. + ### Without Docker ```bash diff --git a/README_zh-TW.md b/README_zh-TW.md index 5e48087..28c4500 100644 --- a/README_zh-TW.md +++ b/README_zh-TW.md @@ -1,6 +1,10 @@ -# LMStack +

+ LMStack +

-[English](README.md) +

+ English +

LLM 部署管理平台 - 在分散式 GPU 節點上部署和管理大型語言模型。 @@ -23,8 +27,8 @@ LLM 部署管理平台 - 在分散式 GPU 節點上部署和管理大型語言 ┌────────────┴────────────┐ ▼ ▼ ┌──────────────┐ ┌──────────────┐ - │ Worker Agent │ │ Worker Agent │ - │ (GPU 節點) │ │ (GPU 節點) │ + │ Worker │ │ Worker │ + │ (GPU 節點) │ │ (GPU 節點) │ └──────────────┘ └──────────────┘ ``` @@ -48,29 +52,14 @@ docker compose -f docker-compose.deploy.yml up -d - 前端: http://localhost:3000 - 後端 API: http://localhost:52000 -### 啟動 Worker(在 GPU 機器上) - -```bash -docker run -d \ - --name lmstack-worker \ - --gpus all \ - --privileged \ - -p 52001:52001 \ - -v /var/run/docker.sock:/var/run/docker.sock \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -v /:/host:ro \ - -e BACKEND_URL=http://你的伺服器IP:52000 \ - -e WORKER_NAME=gpu-worker-01 \ - infinirc/lmstack-worker:latest -``` - ### 使用方式 1. 使用 `admin` / `admin` 登入(首次登入後請更改密碼) -2. 查看 **Workers** 頁面 - Workers 會自動註冊 -3. 在 **Models** 頁面新增模型 -4. 在 **Deployments** 頁面建立部署 -5. 使用 OpenAI 相容 API: +2. 前往 **Workers** 頁面,點擊 **Add Worker** 取得 Docker 指令 +3. 在 GPU 機器上執行該 Docker 指令以註冊 Worker +4. 在 **Models** 頁面新增模型 +5. 在 **Deployments** 頁面建立部署 +6. 使用 OpenAI 相容 API: ```bash curl http://localhost:52000/v1/chat/completions \ @@ -96,21 +85,10 @@ curl http://localhost:52000/v1/chat/completions \ # 運行本地構建的 backend + frontend docker compose -f docker-compose.local.yml up -d - -# 運行本地構建的 worker(在 GPU 機器上) -docker run -d \ - --name lmstack-worker \ - --gpus all \ - --privileged \ - -p 52001:52001 \ - -v /var/run/docker.sock:/var/run/docker.sock \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -v /:/host:ro \ - -e BACKEND_URL=http://你的伺服器IP:52000 \ - -e WORKER_NAME=gpu-worker-01 \ - infinirc/lmstack-worker:local ``` +然後前往 UI 中的 **Workers** 頁面新增 Worker。 + ### 不使用 Docker ```bash diff --git a/backend/Dockerfile b/backend/Dockerfile index 8259ba4..11fc738 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -19,6 +19,12 @@ FROM python:3.11-slim WORKDIR /app +# Install docker CLI for local worker spawn feature +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && curl -fsSL https://download.docker.com/linux/static/stable/x86_64/docker-24.0.7.tgz | tar xz --strip-components=1 -C /usr/local/bin docker/docker \ + && rm -rf /var/lib/apt/lists/* + # Copy installed packages from builder COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages COPY --from=builder /usr/local/bin /usr/local/bin diff --git a/backend/app/api/apps/deployment.py b/backend/app/api/apps/deployment.py index 0a1bdd1..6400d42 100644 --- a/backend/app/api/apps/deployment.py +++ b/backend/app/api/apps/deployment.py @@ -84,12 +84,47 @@ async def pull_image_with_progress( Exception: On pull failure """ url = f"http://{worker.address}/images/pull" + progress_url = f"http://{worker.address}/images/pull-progress/{app_id}" set_deployment_progress(app_id, "pulling", 0, f"Pulling image {image}...") try: async with httpx.AsyncClient(timeout=IMAGE_PULL_TIMEOUT) as client: - response = await client.post(url, json={"image": image}) + # Start the pull request in a task with app_id for progress tracking + pull_task = asyncio.create_task( + client.post(url, json={"image": image, "app_id": app_id}) + ) + + # Poll for progress while waiting + while not pull_task.done(): + try: + progress_resp = await client.get(progress_url, timeout=5.0) + if progress_resp.status_code == 200: + progress_data = progress_resp.json() + status = progress_data.get("status", "") + progress = progress_data.get("progress", 0) + + if status == "pulling": + set_deployment_progress( + app_id, + "pulling", + progress, + f"Pulling image {image}... ({progress}%)", + ) + elif status == "completed": + set_deployment_progress( + app_id, + "pulling", + 100, + "Image pulled successfully", + ) + except Exception: + pass # Progress polling is best-effort + + await asyncio.sleep(2) + + # Get the final response + response = await pull_task if response.status_code >= 400: raise Exception(f"Failed to pull image: {response.text}") diff --git a/backend/app/api/apps/routes.py b/backend/app/api/apps/routes.py index d8323ff..675bd51 100644 --- a/backend/app/api/apps/routes.py +++ b/backend/app/api/apps/routes.py @@ -179,6 +179,15 @@ async def deploy_app( proxy_path = f"/apps/{app_type.value}" port = await _find_available_port(db, worker.id) + # Auto-disable proxy for localhost workers to avoid port conflicts + # When worker is on localhost (using --network host), app container binds + # directly to host port, so proxy would conflict + worker_host = worker.address.split(":")[0] + use_proxy = deploy_request.use_proxy + if worker_host in ("localhost", "127.0.0.1"): + use_proxy = False + logger.info(f"Auto-disabled proxy for localhost worker {worker.name}") + # Create app record app = App( app_type=app_type.value, @@ -188,7 +197,7 @@ async def deploy_app( status=AppStatus.PENDING.value, proxy_path=proxy_path, port=port, - use_proxy=deploy_request.use_proxy, + use_proxy=use_proxy, ) db.add(app) await db.commit() @@ -222,7 +231,7 @@ async def deploy_app( port=port, app_def=app_def, lmstack_port=lmstack_port, - use_proxy=deploy_request.use_proxy, + use_proxy=use_proxy, ) return app_to_response(app, request) diff --git a/backend/app/api/headscale.py b/backend/app/api/headscale.py index f83d74d..9e743d1 100644 --- a/backend/app/api/headscale.py +++ b/backend/app/api/headscale.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, Field from app.api.auth import require_admin -from app.services.headscale_manager import LMSTACK_USER, get_headscale_manager +from app.services.headscale_manager import LMSTACK_USER, get_headscale_manager, get_startup_progress logger = logging.getLogger(__name__) router = APIRouter() @@ -94,6 +94,14 @@ async def get_headscale_status( return HeadscaleStatusResponse(enabled=False, running=False) +@router.get("/progress") +async def get_headscale_progress( + _: dict = Depends(require_admin), +): + """Get Headscale startup progress.""" + return get_startup_progress() + + @router.post("/start", response_model=HeadscaleStatusResponse) async def start_headscale( request: Request, diff --git a/backend/app/api/workers.py b/backend/app/api/workers.py index c497b16..8c5954d 100644 --- a/backend/app/api/workers.py +++ b/backend/app/api/workers.py @@ -24,7 +24,12 @@ WorkerResponse, WorkerUpdate, ) -from app.services.local_worker import get_local_worker_info +from app.services.local_worker import ( + get_local_hostname, + get_local_ip, + spawn_docker_worker, + stop_docker_worker, +) router = APIRouter() @@ -326,6 +331,10 @@ async def delete_worker( if deployment_count and deployment_count > 0: raise HTTPException(status_code=400, detail="Cannot delete worker with active deployments") + # Try to stop and remove Docker container if it's a local worker + # Container name is "lmstack-worker" by default + stop_docker_worker("lmstack-worker") + await db.delete(worker) await db.commit() @@ -356,90 +365,72 @@ async def worker_heartbeat( return {"status": "ok"} -@router.post("/local", response_model=WorkerResponse, status_code=201) +@router.post("/local", status_code=201) async def register_local_worker( + request: Request, db: AsyncSession = Depends(get_db), current_user: User = Depends(require_operator), ): - """Register the local machine as a worker (requires operator+)""" - # Get local machine info - info = get_local_worker_info() - - worker_name = f"local-{info['hostname']}" - - # Check if local worker already exists - existing = await db.execute(select(Worker).where(Worker.name == worker_name)) - existing_worker = existing.scalar_one_or_none() + """Spawn a Docker worker on the local machine (requires operator+). - if existing_worker: - # Update existing worker info - existing_worker.gpu_info = info["gpu_info"] - existing_worker.system_info = info["system_info"] - existing_worker.status = WorkerStatus.ONLINE.value - existing_worker.last_heartbeat = datetime.now(UTC) - - await db.commit() - await db.refresh(existing_worker) + This creates a registration token, then runs docker to start a worker container. + The worker will register itself using the token. + """ + # Get local hostname for worker name + hostname = get_local_hostname() + worker_name = hostname - deployment_count_query = select(func.count()).where( - Deployment.worker_id == existing_worker.id - ) - deployment_count = await db.scalar(deployment_count_query) or 0 - - return WorkerResponse( - id=existing_worker.id, - name=existing_worker.name, - address=existing_worker.address, - description=existing_worker.description, - labels=existing_worker.labels, - status=existing_worker.status, - gpu_info=existing_worker.gpu_info, - system_info=existing_worker.system_info, - created_at=existing_worker.created_at, - updated_at=existing_worker.updated_at, - last_heartbeat=existing_worker.last_heartbeat, - deployment_count=deployment_count, - ) + # Get backend URL - use local IP so the Docker container can reach it + local_ip = get_local_ip() + settings = get_settings() + backend_url = f"http://{local_ip}:{settings.port}" - # Create new local worker - worker = Worker( + # Create a registration token for this worker + token = RegistrationToken.create( name=worker_name, - address="localhost", - description=f"Local worker on {info['hostname']} ({info['platform']} {info['platform_release']})", - labels={"type": "local", "hostname": info["hostname"]}, - gpu_info=info["gpu_info"], - system_info=info["system_info"], - status=WorkerStatus.ONLINE.value, - last_heartbeat=datetime.now(UTC), + expires_in_hours=24, # Token valid for 24 hours ) - db.add(worker) + db.add(token) await db.commit() - await db.refresh(worker) + await db.refresh(token) - return WorkerResponse( - id=worker.id, - name=worker.name, - address=worker.address, - description=worker.description, - labels=worker.labels, - status=worker.status, - gpu_info=worker.gpu_info, - system_info=worker.system_info, - created_at=worker.created_at, - updated_at=worker.updated_at, - last_heartbeat=worker.last_heartbeat, - deployment_count=0, + # Spawn the Docker worker + result = spawn_docker_worker( + worker_name=worker_name, + backend_url=backend_url, + registration_token=token.token, + container_name="lmstack-worker", ) + if not result["success"]: + # Clean up the token if spawn failed + await db.delete(token) + await db.commit() + raise HTTPException(status_code=500, detail=result["message"]) + + return { + "success": True, + "message": result["message"], + "worker_name": worker_name, + "container_id": result.get("container_id"), + "backend_url": backend_url, + } + def _generate_docker_command(token: str, name: str, backend_url: str) -> str: - """Generate docker run command for worker registration.""" + """Generate docker run command for worker registration. + + Uses --network host mode so that: + 1. Worker registers with localhost/host IP (not Docker internal IP) + 2. Apps deployed by Worker are accessible via host network + 3. Works seamlessly on both regular machines and WSL + """ return f"""docker run -d \\ --name lmstack-worker \\ + --network host \\ --gpus all \\ --privileged \\ - -p 52001:52001 \\ -v /var/run/docker.sock:/var/run/docker.sock \\ -v ~/.cache/huggingface:/root/.cache/huggingface \\ -v /:/host:ro \\ diff --git a/backend/app/services/deployer.py b/backend/app/services/deployer.py index 42565aa..103dcec 100644 --- a/backend/app/services/deployer.py +++ b/backend/app/services/deployer.py @@ -84,9 +84,46 @@ async def deploy(self, deployment_id: int) -> None: else: # Send to remote worker agent worker_url = f"http://{deployment.worker.address}/deploy" + progress_url = ( + f"http://{deployment.worker.address}/pull-progress/{deployment.id}" + ) + # Start deployment request and poll for progress async with httpx.AsyncClient(timeout=300.0) as client: - response = await client.post(worker_url, json=deploy_request) + # Start the deployment in a task + deploy_task = asyncio.create_task( + client.post(worker_url, json=deploy_request) + ) + + # Poll for progress while waiting + while not deploy_task.done(): + try: + progress_resp = await client.get(progress_url, timeout=5.0) + if progress_resp.status_code == 200: + progress_data = progress_resp.json() + status = progress_data.get("status", "") + image = progress_data.get("image", "") + progress = progress_data.get("progress", 0) + + if status == "pulling": + deployment.status_message = ( + f"Pulling image {image}... ({progress}%)" + ) + await db.commit() + elif status == "completed": + deployment.status_message = ( + "Image pulled, starting container..." + ) + await db.commit() + elif status == "starting": + deployment.status_message = "Starting container..." + await db.commit() + except Exception: + pass # Progress polling is best-effort + + await asyncio.sleep(2) + + response = await deploy_task if response.status_code != 200: deployment.status = DeploymentStatus.ERROR.value @@ -105,6 +142,20 @@ async def deploy(self, deployment_id: int) -> None: # For Ollama, we need to pull the model first if deployment.backend == BackendType.OLLAMA.value: + deployment.status_message = "Waiting for Ollama container to start..." + await db.commit() + + # Wait for Ollama API to be available before pulling + ollama_ready = await self._wait_for_ollama_ready( + deployment.worker.address, + deployment.port, + ) + if not ollama_ready: + deployment.status = DeploymentStatus.ERROR.value + deployment.status_message = "Ollama container failed to start" + await db.commit() + return + deployment.status_message = "Pulling model with Ollama..." await db.commit() @@ -157,6 +208,48 @@ async def deploy(self, deployment_id: int) -> None: await db.commit() + async def _wait_for_ollama_ready( + self, + worker_address: str, + port: int, + timeout: int = 60, + ) -> bool: + """Wait for Ollama API to be available. + + Args: + worker_address: Worker address (host:port) + port: Ollama container port + timeout: Maximum wait time in seconds + + Returns: + True if Ollama is ready, False on timeout + """ + worker_ip = worker_address.split(":")[0] + api_url = f"http://{worker_ip}:{port}/api/tags" + + logger.info(f"Waiting for Ollama API at {api_url}") + + elapsed = 0 + check_interval = 2 + + async with httpx.AsyncClient(timeout=10.0) as client: + while elapsed < timeout: + try: + response = await client.get(api_url) + if response.status_code == 200: + logger.info(f"Ollama API ready after {elapsed}s") + return True + except httpx.ConnectError: + logger.debug(f"Ollama not ready yet ({elapsed}s)") + except Exception as e: + logger.debug(f"Ollama check error: {e}") + + await asyncio.sleep(check_interval) + elapsed += check_interval + + logger.error(f"Ollama API not ready after {timeout}s") + return False + async def _ollama_pull_model( self, worker_address: str, diff --git a/backend/app/services/headscale_manager.py b/backend/app/services/headscale_manager.py index 693f9b6..83caf62 100644 --- a/backend/app/services/headscale_manager.py +++ b/backend/app/services/headscale_manager.py @@ -17,6 +17,26 @@ HEADSCALE_CONTAINER_NAME = "lmstack-headscale" HEADSCALE_IMAGE = "headscale/headscale:latest" + +# ============================================================================= +# Progress Tracking +# ============================================================================= + +# In-memory store for Headscale startup progress +_startup_progress: dict = {"status": "idle", "progress": 0, "message": ""} + + +def get_startup_progress() -> dict: + """Get Headscale startup progress.""" + return _startup_progress.copy() + + +def _set_startup_progress(status: str, progress: int, message: str) -> None: + """Set Headscale startup progress.""" + global _startup_progress + _startup_progress = {"status": status, "progress": progress, "message": message} + + # Docker volume name for headscale data (shared between backend and headscale containers) HEADSCALE_VOLUME_NAME = "lmstack-headscale-data" # Path inside containers where the volume is mounted @@ -170,28 +190,36 @@ async def start( self.server_url = server_url self.http_port = http_port + _set_startup_progress("starting", 0, "Checking container status...") + container = self._get_container() if container: if container.status == "running": logger.info("Headscale is already running") + _set_startup_progress("completed", 100, "Headscale is already running") return True # Remove old container to ensure port config is correct logger.info("Removing old Headscale container") + _set_startup_progress("starting", 5, "Removing old container...") container.remove(force=True) # Write config to volume + _set_startup_progress("starting", 10, "Writing configuration...") self._write_config_to_volume(server_url, http_port, grpc_port) # Pull image if needed try: self.client.images.get(HEADSCALE_IMAGE) + _set_startup_progress("starting", 50, "Image already exists") except NotFound: logger.info(f"Pulling {HEADSCALE_IMAGE}...") - self.client.images.pull(HEADSCALE_IMAGE) + _set_startup_progress("pulling", 15, f"Pulling image {HEADSCALE_IMAGE}...") + await self._pull_image_with_progress(HEADSCALE_IMAGE) # Create and start container logger.info("Creating Headscale container") + _set_startup_progress("starting", 70, "Creating container...") try: self.client.containers.run( HEADSCALE_IMAGE, @@ -209,18 +237,57 @@ async def start( ) # Wait for Headscale to start + _set_startup_progress("starting", 85, "Waiting for service to start...") await asyncio.sleep(3) # Create default user + _set_startup_progress("starting", 95, "Creating default user...") await self._create_user(LMSTACK_USER) + _set_startup_progress("completed", 100, "Headscale started successfully") logger.info("Headscale started successfully") return True except APIError as e: logger.error(f"Failed to start Headscale: {e}") + _set_startup_progress("error", 0, f"Failed to start: {e}") return False + async def _pull_image_with_progress(self, image: str) -> None: + """Pull image with progress tracking.""" + loop = asyncio.get_event_loop() + await loop.run_in_executor(None, self._pull_image_sync, image) + + def _pull_image_sync(self, image: str) -> None: + """Synchronously pull image with progress updates.""" + layers_progress: dict[str, dict] = {} + + for line in self.client.api.pull(image, stream=True, decode=True): + if "id" in line and "progressDetail" in line: + layer_id = line["id"] + detail = line.get("progressDetail", {}) + current = detail.get("current", 0) + total = detail.get("total", 0) + + layers_progress[layer_id] = { + "status": line.get("status", ""), + "current": current, + "total": total, + } + + # Calculate overall progress (15% to 65%) + total_size = sum(lp.get("total", 0) for lp in layers_progress.values()) + downloaded = sum(lp.get("current", 0) for lp in layers_progress.values()) + if total_size > 0: + pull_progress = int((downloaded / total_size) * 100) + # Map to 15-65% range + overall_progress = 15 + int(pull_progress * 0.5) + _set_startup_progress( + "pulling", + overall_progress, + f"Pulling image {image}... ({pull_progress}%)", + ) + async def stop(self) -> bool: """Stop Headscale server.""" container = self._get_container() diff --git a/backend/app/services/local_worker.py b/backend/app/services/local_worker.py index f884e3b..28f4456 100644 --- a/backend/app/services/local_worker.py +++ b/backend/app/services/local_worker.py @@ -119,3 +119,188 @@ def get_local_worker_info() -> dict: "platform": platform.system(), "platform_release": platform.release(), } + + +def spawn_docker_worker( + worker_name: str, + backend_url: str, + registration_token: str, + container_name: str = "lmstack-worker", +) -> dict: + """Spawn a Docker worker container on the local machine. + + Returns: + dict with keys: success, message, container_id (if success) + """ + # First, check if container with same name exists and remove it + try: + check_result = subprocess.run( + ["docker", "ps", "-a", "-q", "-f", f"name=^{container_name}$"], + capture_output=True, + text=True, + timeout=10, + ) + if check_result.stdout.strip(): + # Container exists, stop and remove it + logger.info(f"Removing existing container: {container_name}") + subprocess.run( + ["docker", "stop", container_name], + capture_output=True, + timeout=30, + ) + subprocess.run( + ["docker", "rm", container_name], + capture_output=True, + timeout=10, + ) + except subprocess.TimeoutExpired: + logger.warning("Timeout while checking/removing existing container") + except Exception as e: + logger.warning(f"Error checking existing container: {e}") + + # Build the docker run command + # Use --network host so worker can access backend and deployed apps + cmd = [ + "docker", + "run", + "-d", + "--name", + container_name, + "--network", + "host", + "--gpus", + "all", + "--privileged", + "-v", + "/var/run/docker.sock:/var/run/docker.sock", + "-v", + f"{_get_huggingface_cache()}:/root/.cache/huggingface", + "-v", + "/:/host:ro", + "-e", + f"BACKEND_URL={backend_url}", + "-e", + f"WORKER_NAME={worker_name}", + "-e", + f"REGISTRATION_TOKEN={registration_token}", + "infinirc/lmstack-worker:latest", + ] + + try: + logger.info(f"Spawning Docker worker: {worker_name}") + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=60, + ) + + if result.returncode == 0: + container_id = result.stdout.strip()[:12] + logger.info(f"Docker worker spawned successfully: {container_id}") + return { + "success": True, + "message": f"Worker container started: {container_id}", + "container_id": container_id, + } + else: + error_msg = result.stderr.strip() or result.stdout.strip() + logger.error(f"Failed to spawn Docker worker: {error_msg}") + return { + "success": False, + "message": f"Failed to start worker: {error_msg}", + } + except subprocess.TimeoutExpired: + return { + "success": False, + "message": "Timeout while starting Docker container", + } + except FileNotFoundError: + return { + "success": False, + "message": "Docker not found. Please install Docker first.", + } + except Exception as e: + return { + "success": False, + "message": f"Error starting Docker worker: {str(e)}", + } + + +def _get_huggingface_cache() -> str: + """Get the HuggingFace cache directory path.""" + import os + + # Check HF_HOME first, then default to ~/.cache/huggingface + hf_home = os.environ.get("HF_HOME") + if hf_home: + return hf_home + return os.path.expanduser("~/.cache/huggingface") + + +def stop_docker_worker(container_name: str = "lmstack-worker") -> dict: + """Stop and remove a Docker worker container. + + Returns: + dict with keys: success, message + """ + try: + # Check if container exists + check_result = subprocess.run( + ["docker", "ps", "-a", "-q", "-f", f"name=^{container_name}$"], + capture_output=True, + text=True, + timeout=10, + ) + + if not check_result.stdout.strip(): + return { + "success": True, + "message": f"Container {container_name} not found (already removed)", + } + + # Stop the container + logger.info(f"Stopping container: {container_name}") + stop_result = subprocess.run( + ["docker", "stop", container_name], + capture_output=True, + text=True, + timeout=30, + ) + + # Remove the container + logger.info(f"Removing container: {container_name}") + rm_result = subprocess.run( + ["docker", "rm", container_name], + capture_output=True, + text=True, + timeout=10, + ) + + if rm_result.returncode == 0: + logger.info(f"Container {container_name} stopped and removed") + return { + "success": True, + "message": f"Container {container_name} stopped and removed", + } + else: + error_msg = rm_result.stderr.strip() or stop_result.stderr.strip() + return { + "success": False, + "message": f"Failed to remove container: {error_msg}", + } + except subprocess.TimeoutExpired: + return { + "success": False, + "message": "Timeout while stopping Docker container", + } + except FileNotFoundError: + return { + "success": True, + "message": "Docker not found (container may not exist)", + } + except Exception as e: + return { + "success": False, + "message": f"Error stopping Docker worker: {str(e)}", + } diff --git a/docs/LMStack-dark.png b/docs/LMStack-dark.png new file mode 100644 index 0000000..5fe4f2b Binary files /dev/null and b/docs/LMStack-dark.png differ diff --git a/docs/LMStack-light.png b/docs/LMStack-light.png new file mode 100644 index 0000000..8782d39 Binary files /dev/null and b/docs/LMStack-light.png differ diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 5d8fbc9..85adeb4 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -118,7 +118,7 @@ const DockerIcon = ({ size = 14 }: { size?: number }) => ( function getMenuItems(isAdmin: boolean) { const workersChildren: any[] = [ - { key: "/workers", icon: , label: "Overview" }, + { key: "/workers", icon: , label: "Worker Nodes" }, { key: "docker-group", icon: , diff --git a/frontend/src/api/headscale.ts b/frontend/src/api/headscale.ts index 7b244b9..ea4516a 100644 --- a/frontend/src/api/headscale.ts +++ b/frontend/src/api/headscale.ts @@ -28,6 +28,12 @@ export interface PreauthKeyResponse { join_command: string; } +export interface HeadscaleProgress { + status: string; + progress: number; + message: string; +} + export const headscaleApi = { getStatus: async (): Promise => { const response = await api.get("/headscale/status"); @@ -73,4 +79,9 @@ export const headscaleApi = { deleteNode: async (nodeId: number): Promise => { await api.delete(`/headscale/nodes/${nodeId}`); }, + + getProgress: async (): Promise => { + const response = await api.get("/headscale/progress"); + return response.data; + }, }; diff --git a/frontend/src/api/index.ts b/frontend/src/api/index.ts index d0975e9..f7974c2 100644 --- a/frontend/src/api/index.ts +++ b/frontend/src/api/index.ts @@ -109,4 +109,5 @@ export type { HeadscaleStatus, HeadscaleNode, PreauthKeyResponse, + HeadscaleProgress, } from "./headscale"; diff --git a/frontend/src/api/workers.ts b/frontend/src/api/workers.ts index 9e94fdd..e127f51 100644 --- a/frontend/src/api/workers.ts +++ b/frontend/src/api/workers.ts @@ -8,6 +8,7 @@ import type { ListResponse, RegistrationToken, RegistrationTokenCreate, + LocalWorkerSpawnResponse, } from "../types"; export interface WorkerListParams { @@ -43,8 +44,8 @@ export const workersApi = { await api.delete(`/workers/${id}`); }, - registerLocal: async (): Promise => { - const response = await api.post("/workers/local"); + registerLocal: async (): Promise => { + const response = await api.post("/workers/local"); return response.data; }, diff --git a/frontend/src/assets/logo/LMStack-dark.png b/frontend/src/assets/logo/LMStack-dark.png new file mode 100644 index 0000000..5fe4f2b Binary files /dev/null and b/frontend/src/assets/logo/LMStack-dark.png differ diff --git a/frontend/src/assets/logo/LMStack-light.png b/frontend/src/assets/logo/LMStack-light.png new file mode 100644 index 0000000..8782d39 Binary files /dev/null and b/frontend/src/assets/logo/LMStack-light.png differ diff --git a/frontend/src/components/layout/Logo.tsx b/frontend/src/components/layout/Logo.tsx index a51a35e..a9c4a38 100644 --- a/frontend/src/components/layout/Logo.tsx +++ b/frontend/src/components/layout/Logo.tsx @@ -1,36 +1,31 @@ /** * Logo Icon Component */ +import logoLight from "../../assets/logo/LMStack-light.png"; +import logoDark from "../../assets/logo/LMStack-dark.png"; interface LogoIconProps { - color: string; - size?: number; + color?: string; + width?: number; + height?: number; + isDark?: boolean; } -export function LogoIcon({ color, size = 18 }: LogoIconProps) { +export function LogoIcon({ + width = 160, + height = 36, + isDark = false, +}: LogoIconProps) { return ( - - - - - + LMStack ); } diff --git a/frontend/src/components/layout/Sidebar.tsx b/frontend/src/components/layout/Sidebar.tsx index 8f95b4f..ead02c4 100644 --- a/frontend/src/components/layout/Sidebar.tsx +++ b/frontend/src/components/layout/Sidebar.tsx @@ -52,7 +52,7 @@ export function Sidebar({ {/* Logo and Collapse Button */}
collapsed && onCollapse(false)} @@ -71,17 +70,11 @@ export function Sidebar({ >
{collapsed && logoHovered ? ( @@ -89,22 +82,13 @@ export function Sidebar({ style={{ fontSize: 14, color: colors.text }} /> ) : ( - + )}
- {!collapsed && ( - - LMStack - - )}
{!collapsed && ( -
- Development Mode (Python): -
-
-                  {`cd worker
+            
+                      
+                        {generatedToken.docker_command?.replace(
+                          "infinirc/lmstack-worker:latest",
+                          "infinirc/lmstack-worker:local",
+                        )}
+                      
+ +
+ ), + }, + { + key: "dev-python", + label: "Development Mode (Python)", + children: ( +
+
+                        {`cd worker
 pip install -r requirements.txt
 python agent.py \\
   --name ${generatedToken.name} \\
   --server-url ${window.location.protocol}//${window.location.hostname}:52000 \\
   --registration-token ${generatedToken.token}`}
-                
- -
-
+ if (navigator.clipboard && window.isSecureContext) { + navigator.clipboard.writeText(devCommand); + message.success("Command copied to clipboard"); + } else { + const textArea = document.createElement("textarea"); + textArea.value = devCommand; + textArea.style.position = "fixed"; + textArea.style.left = "-999999px"; + document.body.appendChild(textArea); + textArea.focus(); + textArea.select(); + document.execCommand("copy"); + document.body.removeChild(textArea); + message.success("Command copied to clipboard"); + } + }} + style={{ position: "absolute", top: 8, right: 8 }} + > + Copy + + + ), + }, + ]} + />
@@ -1110,6 +1238,49 @@ python agent.py \\ /> )} + + {/* Edit Worker Modal */} + { + setEditModal(null); + editForm.resetFields(); + }} + footer={null} + width={isMobile ? "100%" : 400} + style={ + isMobile ? { top: 20, maxWidth: "100%", margin: "0 8px" } : undefined + } + > +
+ + + + + + + + + + + + +
+
); } diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index 67e56a7..2fe8829 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -19,6 +19,7 @@ export type { WorkerCreate, RegistrationToken, RegistrationTokenCreate, + LocalWorkerSpawnResponse, } from "./worker"; // Model Types diff --git a/frontend/src/types/worker.ts b/frontend/src/types/worker.ts index 92d89c5..7a0dc40 100644 --- a/frontend/src/types/worker.ts +++ b/frontend/src/types/worker.ts @@ -86,3 +86,11 @@ export interface RegistrationTokenCreate { name: string; expires_in_hours?: number; } + +export interface LocalWorkerSpawnResponse { + success: boolean; + message: string; + worker_name: string; + container_id?: string; + backend_url: string; +} diff --git a/worker/docker_ops/runner.py b/worker/docker_ops/runner.py index b9e2968..8931c94 100644 --- a/worker/docker_ops/runner.py +++ b/worker/docker_ops/runner.py @@ -234,6 +234,16 @@ def _run_sync( logger.info(f"Pulling image {image}...") self.pull_image_with_progress(image, deployment_id) + # Update progress to starting + _set_pull_progress( + deployment_id, + { + "status": "starting", + "image": image, + "progress": 100, + }, + ) + # Use specified port or get a new one host_port = port if port else self._get_next_port() diff --git a/worker/models.py b/worker/models.py index 7018991..99057cd 100644 --- a/worker/models.py +++ b/worker/models.py @@ -48,6 +48,7 @@ class ImagePullRequest(BaseModel): image: str registry_auth: Optional[dict[str, str]] = None + app_id: Optional[int] = None # Optional app ID for progress tracking class ImageBuildRequest(BaseModel): diff --git a/worker/routes/images.py b/worker/routes/images.py index 535c204..a9c17dc 100644 --- a/worker/routes/images.py +++ b/worker/routes/images.py @@ -3,6 +3,7 @@ Contains endpoints for listing, pulling, building, and removing Docker images. """ +import asyncio import logging from typing import Optional @@ -20,6 +21,36 @@ # Agent reference - set by main app _agent = None +# ============================================================================= +# Progress Tracking for App Image Pulls +# ============================================================================= + +# In-memory store for app image pull progress +_app_pull_progress: dict[int, dict] = {} +_MAX_PROGRESS_ENTRIES = 100 + + +def get_app_pull_progress(app_id: int) -> dict: + """Get pull progress for an app.""" + return _app_pull_progress.get(app_id, {"status": "unknown", "progress": 0}) + + +def _set_app_pull_progress(app_id: int, data: dict) -> None: + """Set pull progress for an app.""" + _app_pull_progress[app_id] = data + + +def _cleanup_old_app_progress() -> None: + """Remove old progress entries to prevent memory leaks.""" + if len(_app_pull_progress) > _MAX_PROGRESS_ENTRIES: + completed_keys = [ + key + for key, val in _app_pull_progress.items() + if val.get("status") in ("completed", "error") + ] + for key in completed_keys[: len(_app_pull_progress) - _MAX_PROGRESS_ENTRIES // 2]: + _app_pull_progress.pop(key, None) + def set_agent(agent): """Set the agent reference for route handlers.""" @@ -59,17 +90,109 @@ async def get_image(image_id: str): raise HTTPException(status_code=404, detail=f"Image not found: {image_id}") +@router.get("/pull-progress/{app_id}") +async def pull_progress(app_id: int): + """Get image pull progress for an app.""" + return get_app_pull_progress(app_id) + + +def _pull_image_with_tracking(agent, image: str, app_id: int, auth_config: dict | None = None): + """Pull image with progress tracking (runs in background thread).""" + _cleanup_old_app_progress() + + logger.info(f"Starting image pull for app {app_id}: {image}") + + _set_app_pull_progress( + app_id, + { + "status": "pulling", + "image": image, + "progress": 0, + "layers": {}, + }, + ) + + last_logged_progress = 0 + + def progress_callback(progress: int, layers: dict): + """Update progress during pull.""" + nonlocal last_logged_progress + _set_app_pull_progress( + app_id, + { + "status": "pulling", + "image": image, + "progress": progress, + "layers": layers, + }, + ) + # Log progress every 10% + if progress >= last_logged_progress + 10: + logger.info(f"Pulling image {image} for app {app_id}: {progress}%") + last_logged_progress = progress + + try: + result = agent.image_manager.pull_image( + image=image, + auth_config=auth_config, + progress_callback=progress_callback, + ) + logger.info(f"Image pull completed for app {app_id}: {image}") + _set_app_pull_progress( + app_id, + { + "status": "completed", + "image": image, + "progress": 100, + }, + ) + return result + except Exception as e: + logger.error(f"Image pull failed for app {app_id}: {image} - {e}") + _set_app_pull_progress( + app_id, + { + "status": "error", + "image": image, + "progress": 0, + "error": str(e), + }, + ) + raise + + @router.post("/pull") async def pull_image(request: ImagePullRequest): """Pull an image from registry.""" agent = get_agent() + logger.info( + f"Received pull request for image: {request.image}" + + (f" (app_id: {request.app_id})" if request.app_id else "") + ) + try: - result = agent.image_manager.pull_image( - image=request.image, - auth_config=request.registry_auth, - ) - return result + if request.app_id: + # Pull with progress tracking in thread pool + loop = asyncio.get_event_loop() + result = await loop.run_in_executor( + None, + _pull_image_with_tracking, + agent, + request.image, + request.app_id, + request.registry_auth, + ) + return result + else: + # Simple pull without tracking + logger.info(f"Pulling image without tracking: {request.image}") + result = agent.image_manager.pull_image( + image=request.image, + auth_config=request.registry_auth, + ) + logger.info(f"Image pull completed: {request.image}") + return result except Exception as e: logger.error(f"Failed to pull image {request.image}: {e}") raise HTTPException(status_code=500, detail=str(e))