Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions apps/api/src/ailiance_demo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,16 @@ class Settings(BaseSettings):
)
dataset_flags_dir: Path = Path("/dataset-flags")
machine_label: str = "studio"
# Serving is consolidated onto the omlx multi-model server (:8500) plus
# the two qwen36 multi-LoRA instances (:9360 / :9361), all on Mac Studio.
# The old per-port workers (9301/9303/9304, macm1:8502/9302,
# kxkm-ai:8002) are decommissioned and no longer probed.
workers_to_check: list[dict] = Field(
default_factory=lambda: [
{"name": "gateway", "url": "http://host.docker.internal:9300/health"},
{"name": "mistral-medium-3.5", "url": "http://studio:9301/health"},
{"name": "gemma4-e4b-curriculum", "url": "http://macm1:8502/health"},
{"name": "eurollm", "url": "http://studio:9303/health"},
{"name": "gemma3", "url": "http://tower:9304/health"},
{"name": "qwen3-next", "url": "http://host.docker.internal:8002/health"},
{"name": "omlx", "url": "http://100.116.92.12:8500/health"},
{"name": "qwen36-hardware", "url": "http://100.116.92.12:9360/health"},
{"name": "qwen36-code", "url": "http://100.116.92.12:9361/health"},
],
)

Expand Down
269 changes: 154 additions & 115 deletions apps/api/src/ailiance_demo/routers/public/models.py

Large diffs are not rendered by default.

41 changes: 15 additions & 26 deletions apps/api/src/ailiance_demo/services/chat_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,35 +18,23 @@
# gateway fall through to its domain router, which produced garbled output
# in tests because the request reaches a worker that doesn't recognize the
# model id and degenerates.
# --- generalist base models served by the gateway ---
"ailiance/mistral-medium-3.5-128b": "ailiance-mistral",
"ailiance/gemma4-e4b-curriculum": "ailiance-gemma4",
"ailiance/gemma3-4b": "ailiance-gemma",
"ailiance/qwen3-next-80b-a3b-instruct": "ailiance-qwen",
# --- generalist base models served by the omlx consolidated server ---
"ailiance/mistral-medium-3.5-128b": "ailiance-mistral-medium",
"ailiance/gemma4-e4b-curriculum": "ailiance-gemma4-omlx",
"ailiance/qwen3-coder-next-80b": "ailiance-qwen36",
"ailiance/granite-30b": "ailiance-granite",
"ailiance/ministral-14b": "ailiance-ministral",
"ailiance/ministral-14b-reasoning": "ailiance-ministral-reasoning",
"ailiance/eurollm-22b": "ailiance-eurollm",
"ailiance/apertus-70b": "ailiance-apertus",
# --- additional gateway-exposed flagship / variant aliases ---
"ailiance/pixtral-12b": "ailiance-pixtral",
"ailiance/reasoning-r1": "ailiance-reasoning-r1",
"ailiance/coder-pro": "ailiance-coder-pro",
"ailiance/mistral-small-3.5": "ailiance-mistral-small",
"ailiance/devstral-base": "ailiance-devstral-base",
"ailiance/mixtral-8x22b": "ailiance-mixtral-8x22b",
# --- mascarade family card routes to auto-router (auto-classifies which
# mascarade specialist to use) ---
# qwen36 LoRA specialist to use) ---
"ailiance/mascarade": "ailiance",
# --- mascarade hardware specialists (Qwen3-4B LoRA on Tower :8004) ---
"ailiance/mascarade-kicad": "ailiance-kicad",
"ailiance/mascarade-spice": "ailiance-spice",
"ailiance/mascarade-stm32": "ailiance-stm32",
"ailiance/mascarade-emc": "ailiance-emc",
"ailiance/mascarade-embedded": "ailiance-embedded",
"ailiance/mascarade-platformio": "ailiance-platformio",
"ailiance/mascarade-freecad": "ailiance-freecad",
"ailiance/mascarade-dsp": "ailiance-dsp",
"ailiance/mascarade-iot": "ailiance-iot",
"ailiance/mascarade-power": "ailiance-power",
"ailiance/mascarade-components-review": "ailiance-components-review",
"ailiance/mascarade-coder": "ailiance-coder",
# The bare "ailiance" alias triggers the gateway's domain router
# (MiniLM L6 v2 embeddings + MLP classifier) — not in MODEL_FORCE_MAP on
# purpose. We surface the decision in the chat stream via a route
Expand All @@ -55,12 +43,13 @@
}

# Worker port → human-readable label, used for the route preamble.
# Serving is consolidated onto the omlx multi-model server (:8500) plus the
# two qwen36 multi-LoRA instances (:9360 / :9361), all on Mac Studio. The old
# per-port workers (9301/8502/9303/9304/8002) are decommissioned.
_PORT_LABELS: dict[int, str] = {
9301: "Mistral Medium 3.5 128B (studio)",
8502: "Gemma 4 E4B + ailiance curriculum LoRA (macm1)",
9303: "EuroLLM 22B (studio)",
9304: "Gemma 3 4B (tower)",
8002: "Qwen3.5 35B (kxkm-ai)",
8500: "omlx multi-model server (studio)",
9360: "Qwen3.6-35B multi-LoRA · hardware/EDA/math (studio)",
9361: "Qwen3.6-35B multi-LoRA · code/web/lang (studio)",
}
AILIANCE_ALIASES: frozenset[str] = frozenset(ALIAS_TO_GATEWAY_MODEL)

Expand Down
194 changes: 42 additions & 152 deletions apps/api/src/ailiance_demo/services/gateway_probe.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,173 +18,64 @@

log = structlog.get_logger()

# Serving is consolidated onto Mac Studio (M3 Ultra). Three live endpoints:
# - omlx :8500 — multi-model server (Mistral-Medium, DeepSeek-R1,
# Qwen3-Coder-30B/Next-80B, EuroLLM, granite, Mixtral,
# Pixtral, gemma-4, Devstral, ...)
# - qwen36 :9360 — Qwen3.6-35B multi-LoRA, hardware/EDA/math specialists
# - qwen36 :9361 — Qwen3.6-35B multi-LoRA, code/web/lang specialists
# The old per-port workers (studio:9301/9323/9325/9326/9327/9330/9340,
# macm1:8502/9302, tower:9304/8004, kxkm-ai:8002/8003) are decommissioned.
WORKERS = [
# --- Studio (M3 Ultra, 512 GB unified) ---
{
"id": "studio-mistral-medium",
"label": "Mac Studio · Mistral-Medium-128B :9301",
"url": "http://studio:9301",
"id": "studio-omlx",
"label": "Mac Studio · omlx multi-model :8500",
"url": "http://100.116.92.12:8500",
"host": "studio",
"gpu": "Apple M3 Ultra (76-core GPU)",
"vram_gb": 512.0,
"tdp_w": 215,
"gateway_aliases": ["ailiance-mistral", "ailiance-mistral-medium"],
"served_models": ["Mistral-Medium-3.5-128B-MLX-Q8"],
},
{
"id": "studio-reasoning-r1",
"label": "Mac Studio · DeepSeek-R1 :9323",
"url": "http://studio:9323",
"host": "studio",
"gpu": "Apple M3 Ultra (76-core GPU)",
"vram_gb": 512.0,
"tdp_w": 215,
"gateway_aliases": ["ailiance-reasoning-r1"],
"served_models": ["DeepSeek-R1-Distill-Qwen-32B-MLX-4bit"],
},
{
"id": "studio-pixtral",
"label": "Mac Studio · Pixtral-12B :9325",
"url": "http://studio:9325",
"host": "studio",
"gpu": "Apple M3 Ultra (76-core GPU)",
"vram_gb": 512.0,
"tdp_w": 215,
"gateway_aliases": ["ailiance-pixtral"],
"served_models": ["Pixtral-12B-MLX-4bit"],
},
{
"id": "studio-mistral-small",
"label": "Mac Studio · Mistral-Small-24B :9326",
"url": "http://studio:9326",
"host": "studio",
"gpu": "Apple M3 Ultra (76-core GPU)",
"vram_gb": 512.0,
"tdp_w": 215,
"gateway_aliases": ["ailiance-mistral-small"],
"served_models": ["Mistral-Small-3.1-24B-Instruct-MLX-4bit"],
"gateway_aliases": [
"ailiance-mistral-medium", "ailiance-mistral", "ailiance-eurollm",
"ailiance-apertus", "ailiance-gemma", "ailiance-granite",
"ailiance-devstral-base", "ailiance-flagship", "ailiance-qwen-235b",
"ailiance-reasoning-r1", "ailiance-llama", "ailiance-pixtral",
"ailiance-gemma4-omlx", "ailiance-mistral-small", "ailiance-coder-pro",
"ailiance-mixtral", "ailiance-mixtral-8x22b",
],
"served_models": [
"Mistral-Medium-3.5-128B-MLX-Q8",
"DeepSeek-R1-Distill-Qwen-32B",
"Qwen3-Coder-30B-A3B", "Qwen3-Coder-Next-8bit (80B MoE)",
"EuroLLM-22B", "granite-4.1-30b", "Mixtral-8x22B",
"Devstral-Small-2-24B", "Pixtral-12B", "gemma-4-E4B",
],
},
{
"id": "studio-coder-pro",
"label": "Mac Studio · Qwen3-Coder-30B :9327",
"url": "http://studio:9327",
"id": "studio-qwen36-hardware",
"label": "Mac Studio · Qwen3.6-35B multi-LoRA (hardware/EDA/math) :9360",
"url": "http://100.116.92.12:9360",
"host": "studio",
"gpu": "Apple M3 Ultra (76-core GPU)",
"vram_gb": 512.0,
"tdp_w": 215,
"gateway_aliases": ["ailiance-coder-pro"],
"served_models": ["Qwen3-Coder-30B-A3B-Instruct-MLX-4bit"],
"gateway_aliases": ["ailiance-qwen36"],
"served_models": ["Qwen3.6-35B-A3B-MLX-BF16 + 30 LoRA hot-swap"],
},
{
"id": "studio-devstral-multi",
"label": "Mac Studio · Devstral multi-LoRA :9330",
"url": "http://studio:9330",
"id": "studio-qwen36-code",
"label": "Mac Studio · Qwen3.6-35B multi-LoRA (code/web/lang) :9361",
"url": "http://100.116.92.12:9361",
"host": "studio",
"gpu": "Apple M3 Ultra (76-core GPU)",
"vram_gb": 512.0,
"tdp_w": 215,
"gateway_aliases": [
"ailiance-devstral-base", "ailiance-python", "ailiance-cpp",
"ailiance-rust-emb", "ailiance-html", "ailiance-ml-training",
"ailiance-python", "ailiance-cpp", "ailiance-rust-emb",
"ailiance-html", "ailiance-ml-training",
"ailiance-components-review", "ailiance-coder",
],
"served_models": ["Devstral-Small-2-24B-MLX-4bit + 5 LoRA hot-swap"],
},
# --- macM1 (M1, 32 GB) ---
{
"id": "macm1-mlx",
"label": "macM1 · mlx_lm.server :8502",
"url": "http://macm1:8502",
"host": "macm1",
"gpu": "Apple M1 (8-core GPU)",
"vram_gb": 32.0,
"tdp_w": 30,
# ailiance-granite is NOT here: the gateway force-maps that alias to
# kxkm-ai :8003, not macM1. macM1 hosts a granite-4.1-30b model but
# the gateway never routes the alias to it.
"gateway_aliases": [
"ailiance-gemma2", "ailiance-gemma4", "ailiance-ministral",
"ailiance-ministral-reasoning",
],
"served_models": [
"gemma-4-E4B-it-MLX-4bit",
"Ministral-3-14B-Instruct-2512-4bit",
"Ministral-3-14B-Reasoning-2512-4bit",
],
},
# --- Tower (NVIDIA Quadro P2000, 5 GB) ---
{
"id": "tower-gemma",
"label": "Tower · llama.cpp Gemma 3 :9304",
"url": "http://tower:9304",
"host": "tower (NVIDIA Quadro P2000)",
"gpu": "NVIDIA Quadro P2000",
"vram_gb": 5.0,
"tdp_w": 75,
"gateway_aliases": ["ailiance-gemma"],
"served_models": ["gemma-3-4b-it (Q4 GGUF)"],
},
{
# The 10 hardware mascarade aliases (kicad/spice/stm32/emc/embedded/
# platformio/freecad/dsp/iot/power) moved to the Studio MLX worker
# :9340 with PR #100/#102. Tower Ollama now only backs the two
# aliases the gateway still force-maps to :8004, plus the embed
# surface.
"id": "tower-ollama",
"label": "Tower · Ollama mascarade :8004",
"url": "http://host.docker.internal:8004",
"host": "tower (autossh tunnel)",
"gpu": "NVIDIA Quadro P2000",
"vram_gb": 5.0,
"tdp_w": 75,
"gateway_aliases": [
"ailiance-components-review", "ailiance-coder", "ailiance-embed",
],
"served_models": [
"mascarade-components-review", "mascarade-coder-v2", "bge-m3",
],
},
# --- Studio (M3 Ultra) MLX bf16 mascarade experts ---
{
"id": "studio-mascarade",
"label": "Mac Studio · MLX mascarade :9340",
"url": "http://host.docker.internal:9340",
"host": "studio (autossh tunnel)",
"gpu": "Apple M3 Ultra (76-core GPU)",
"vram_gb": 512.0,
"tdp_w": 215,
"gateway_aliases": [
"ailiance-kicad", "ailiance-spice", "ailiance-stm32", "ailiance-emc",
"ailiance-embedded", "ailiance-platformio", "ailiance-freecad",
"ailiance-dsp", "ailiance-iot", "ailiance-power",
],
"served_models": [
"mascarade-kicad", "mascarade-spice", "mascarade-stm32",
"mascarade-emc", "mascarade-embedded", "mascarade-platformio",
"mascarade-freecad", "mascarade-dsp", "mascarade-iot", "mascarade-power",
],
},
# --- kxkm-ai (RTX 4090, 24 GB) ---
{
"id": "kxkm-qwen",
"label": "kxkm-ai · llama.cpp Qwen3-Next 80B :8002",
"url": "http://host.docker.internal:8002",
"host": "kxkm-ai (RTX 4090, autossh tunnel)",
"gpu": "NVIDIA RTX 4090",
"vram_gb": 24.0,
"tdp_w": 450,
"gateway_aliases": ["ailiance-qwen"],
"served_models": ["Qwen3-Next-80B-A3B-Instruct (Q4_K_M MoE)"],
},
{
"id": "kxkm-granite",
"label": "kxkm-ai · llama.cpp Granite 30B :8003",
"url": "http://host.docker.internal:8003",
"host": "kxkm-ai (RTX 4090, autossh tunnel)",
"gpu": "NVIDIA RTX 4090",
"vram_gb": 24.0,
"tdp_w": 450,
"gateway_aliases": ["ailiance-granite"],
"served_models": ["granite-4.1-30b-instruct (Q4_K_M)"],
"served_models": ["Qwen3.6-35B-A3B-MLX-BF16 + 30 LoRA hot-swap"],
},
]

Expand All @@ -196,10 +87,9 @@
# `nvidia-smi` (Linux/NVIDIA) or `ioreg` (Apple Silicon). The api container
# has openssh-client and /root/.ssh mounted RO from /home/electron/.ssh.
_HOST_PROBES: dict[str, dict[str, str]] = {
# Serving consolidated onto Mac Studio — the other physical hosts no
# longer serve LLM workers, so we only probe studio's GPU.
"studio": {"ssh": "studio", "kind": "apple"},
"macm1": {"ssh": "electron@macm1", "kind": "apple"},
"tower": {"ssh": "clems@tower", "kind": "nvidia"},
"kxkm-ai": {"ssh": "kxkm@10.2.0.237", "kind": "nvidia"},
}


Expand Down Expand Up @@ -592,8 +482,8 @@ async def _produce() -> list[WorkerStatus]:
request_counts = await _fetch_gateway_request_counts(client, gateway_url)
host_probes = await _gather_host_probes()
# Probe all workers in parallel to bound total latency to
# ~max(probe), not sum(probe). 11 workers * 300 ms sequential
# = 3.3 s -> ~500 ms. Fixes "probe indisponible" on cockpit.
# ~max(probe), not sum(probe). Keeps the cockpit page render
# fast and fixes "probe indisponible".
return list(
await asyncio.gather(
*(_probe_one(client, w, request_counts, host_probes) for w in WORKERS)
Expand Down
9 changes: 5 additions & 4 deletions apps/api/tests/integration/test_models_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ def test_list_models_returns_cards(client_with_cache: TestClient) -> None:
assert response.status_code == 200
cards = response.json()
ids = {c["id"] for c in cards}
# Live workers + auto-router + 12 mascarade specialists + mocked HF entry.
# Live workers (omlx :8500 + qwen36 :9360/:9361) + auto-router +
# consolidated mascarade card + mocked HF entry.
assert {
"ailiance/mistral-medium-3.5-128b",
"ailiance/gemma3-4b",
"ailiance/qwen3-next-80b-a3b-instruct",
"ailiance/gemma4-e4b-curriculum",
"ailiance/qwen3-coder-next-80b",
"ailiance/granite-30b",
"ailiance/ministral-14b",
"ailiance/eurollm-22b",
"ailiance/mascarade",
"ailiance/auto",
"Ailiance-fr/micro-kiki-v3",
Expand Down
24 changes: 10 additions & 14 deletions apps/api/tests/integration/test_status_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,21 +68,17 @@ def test_workers_constant_matches_production_fleet():
"""The hard-coded WORKERS list is the single source of truth for /status."""
from ailiance_demo.services.gateway_probe import WORKERS

# Serving is consolidated onto Mac Studio: the omlx multi-model server
# (:8500) plus the two Qwen3.6-35B multi-LoRA instances (:9360 hardware/
# EDA/math, :9361 code/web/lang). The old per-port / multi-host fleet
# (studio:9301/9323/..., macm1, tower, kxkm-ai) is decommissioned.
ids = {w["id"] for w in WORKERS}
assert ids == {
"studio-mistral-medium", "studio-reasoning-r1", "studio-pixtral",
"studio-mistral-small", "studio-coder-pro", "studio-devstral-multi",
"studio-mascarade", "macm1-mlx", "tower-gemma", "tower-ollama",
"kxkm-qwen", "kxkm-granite",
"studio-omlx", "studio-qwen36-hardware", "studio-qwen36-code",
}
by_id = {w["id"]: w for w in WORKERS}
# kxkm-*, tower-ollama and studio-mascarade reach the cockpit via autossh
# tunnels owned by the gateway host; from inside the api container we must
# talk to host.docker.internal.
assert "host.docker.internal" in by_id["kxkm-qwen"]["url"]
assert "host.docker.internal" in by_id["kxkm-granite"]["url"]
assert "host.docker.internal" in by_id["tower-ollama"]["url"]
assert "host.docker.internal" in by_id["studio-mascarade"]["url"]
# Other workers are addressed over Tailscale magic DNS.
assert by_id["studio-mistral-medium"]["url"] == "http://studio:9301"
assert by_id["tower-gemma"]["url"] == "http://tower:9304"
# All three workers live on Mac Studio, reached over Tailscale by IP.
assert by_id["studio-omlx"]["url"] == "http://100.116.92.12:8500"
assert by_id["studio-qwen36-hardware"]["url"] == "http://100.116.92.12:9360"
assert by_id["studio-qwen36-code"]["url"] == "http://100.116.92.12:9361"
assert all(w["host"] == "studio" for w in WORKERS)
14 changes: 6 additions & 8 deletions apps/api/tests/integration/test_workers_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,15 @@ def test_workers_status_returns_list(empty_hf_cache, empty_eval_index) -> None:
)
assert response.status_code == 200
workers = response.json()
# 6 default workers configured: gateway + 5-worker production fleet
# (mistral-medium-3.5, gemma4-e4b-curriculum, eurollm, gemma3, qwen3-next).
assert len(workers) == 6
# 4 default workers configured: gateway + the consolidated Mac Studio
# serving fleet (omlx multi-model :8500, qwen36 multi-LoRA :9360/:9361).
assert len(workers) == 4
names = {w["name"] for w in workers}
assert names == {
"gateway",
"mistral-medium-3.5",
"gemma4-e4b-curriculum",
"eurollm",
"gemma3",
"qwen3-next",
"omlx",
"qwen36-hardware",
"qwen36-code",
}
# Each entry must report a valid health status; we don't assert "down"
# because this test sometimes runs from a host that can actually reach
Expand Down
Loading
Loading