ailiance · electron-rare · May 30, 2026 · May 30, 2026
diff --git a/apps/api/src/ailiance_demo/config.py b/apps/api/src/ailiance_demo/config.py
@@ -49,14 +49,16 @@ class Settings(BaseSettings):
     )
     dataset_flags_dir: Path = Path("/dataset-flags")
     machine_label: str = "studio"
+    # Serving is consolidated onto the omlx multi-model server (:8500) plus
+    # the two qwen36 multi-LoRA instances (:9360 / :9361), all on Mac Studio.
+    # The old per-port workers (9301/9303/9304, macm1:8502/9302, kxkm-ai:8002)
+    # are decommissioned and no longer probed.
     workers_to_check: list[dict] = Field(
         default_factory=lambda: [
             {"name": "gateway", "url": "http://host.docker.internal:9300/health"},
-            {"name": "mistral-medium-3.5", "url": "http://studio:9301/health"},
-            {"name": "gemma4-e4b-curriculum", "url": "http://macm1:8502/health"},
-            {"name": "eurollm", "url": "http://studio:9303/health"},
-            {"name": "gemma3", "url": "http://tower:9304/health"},
-            {"name": "qwen3-next", "url": "http://host.docker.internal:8002/health"},
+            {"name": "omlx", "url": "http://100.116.92.12:8500/health"},
+            {"name": "qwen36-hardware", "url": "http://100.116.92.12:9360/health"},
+            {"name": "qwen36-code", "url": "http://100.116.92.12:9361/health"},
         ],
     )
 

diff --git a/apps/api/src/ailiance_demo/routers/public/models.py b/apps/api/src/ailiance_demo/routers/public/models.py
diff --git a/apps/api/src/ailiance_demo/services/chat_proxy.py b/apps/api/src/ailiance_demo/services/chat_proxy.py
@@ -18,35 +18,23 @@
     # gateway fall through to its domain router, which produced garbled output
     # in tests because the request reaches a worker that doesn't recognize the
     # model id and degenerates.
-    # --- generalist base models served by the gateway ---
-    "ailiance/mistral-medium-3.5-128b": "ailiance-mistral",
-    "ailiance/gemma4-e4b-curriculum": "ailiance-gemma4",
-    "ailiance/gemma3-4b": "ailiance-gemma",
-    "ailiance/qwen3-next-80b-a3b-instruct": "ailiance-qwen",
+    # --- generalist base models served by the omlx consolidated server ---
+    "ailiance/mistral-medium-3.5-128b": "ailiance-mistral-medium",
+    "ailiance/gemma4-e4b-curriculum": "ailiance-gemma4-omlx",
+    "ailiance/qwen3-coder-next-80b": "ailiance-qwen36",
     "ailiance/granite-30b": "ailiance-granite",
-    "ailiance/ministral-14b": "ailiance-ministral",
-    "ailiance/ministral-14b-reasoning": "ailiance-ministral-reasoning",
+    "ailiance/eurollm-22b": "ailiance-eurollm",
+    "ailiance/apertus-70b": "ailiance-apertus",
     # --- additional gateway-exposed flagship / variant aliases ---
     "ailiance/pixtral-12b": "ailiance-pixtral",
     "ailiance/reasoning-r1": "ailiance-reasoning-r1",
     "ailiance/coder-pro": "ailiance-coder-pro",
     "ailiance/mistral-small-3.5": "ailiance-mistral-small",
+    "ailiance/devstral-base": "ailiance-devstral-base",
+    "ailiance/mixtral-8x22b": "ailiance-mixtral-8x22b",
     # --- mascarade family card routes to auto-router (auto-classifies which
-    # mascarade specialist to use) ---
+    # qwen36 LoRA specialist to use) ---
     "ailiance/mascarade": "ailiance",
-    # --- mascarade hardware specialists (Qwen3-4B LoRA on Tower :8004) ---
-    "ailiance/mascarade-kicad": "ailiance-kicad",
-    "ailiance/mascarade-spice": "ailiance-spice",
-    "ailiance/mascarade-stm32": "ailiance-stm32",
-    "ailiance/mascarade-emc": "ailiance-emc",
-    "ailiance/mascarade-embedded": "ailiance-embedded",
-    "ailiance/mascarade-platformio": "ailiance-platformio",
-    "ailiance/mascarade-freecad": "ailiance-freecad",
-    "ailiance/mascarade-dsp": "ailiance-dsp",
-    "ailiance/mascarade-iot": "ailiance-iot",
-    "ailiance/mascarade-power": "ailiance-power",
-    "ailiance/mascarade-components-review": "ailiance-components-review",
-    "ailiance/mascarade-coder": "ailiance-coder",
     # The bare "ailiance" alias triggers the gateway's domain router
     # (MiniLM L6 v2 embeddings + MLP classifier) — not in MODEL_FORCE_MAP on
     # purpose. We surface the decision in the chat stream via a route
@@ -55,12 +43,13 @@
 }
 
 # Worker port → human-readable label, used for the route preamble.
+# Serving is consolidated onto the omlx multi-model server (:8500) plus the
+# two qwen36 multi-LoRA instances (:9360 / :9361), all on Mac Studio. The old
+# per-port workers (9301/8502/9303/9304/8002) are decommissioned.
 _PORT_LABELS: dict[int, str] = {
-    9301: "Mistral Medium 3.5 128B (studio)",
-    8502: "Gemma 4 E4B + ailiance curriculum LoRA (macm1)",
-    9303: "EuroLLM 22B (studio)",
-    9304: "Gemma 3 4B (tower)",
-    8002: "Qwen3.5 35B (kxkm-ai)",
+    8500: "omlx multi-model server (studio)",
+    9360: "Qwen3.6-35B multi-LoRA · hardware/EDA/math (studio)",
+    9361: "Qwen3.6-35B multi-LoRA · code/web/lang (studio)",
 }
 AILIANCE_ALIASES: frozenset[str] = frozenset(ALIAS_TO_GATEWAY_MODEL)
 

diff --git a/apps/api/src/ailiance_demo/services/gateway_probe.py b/apps/api/src/ailiance_demo/services/gateway_probe.py
@@ -18,173 +18,64 @@
 
 log = structlog.get_logger()
 
+# Serving is consolidated onto Mac Studio (M3 Ultra). Three live endpoints:
+#   - omlx :8500     — multi-model server (Mistral-Medium, DeepSeek-R1,
+#                      Qwen3-Coder-30B/Next-80B, EuroLLM, granite, Mixtral,
+#                      Pixtral, gemma-4, Devstral, ...)
+#   - qwen36 :9360   — Qwen3.6-35B multi-LoRA, hardware/EDA/math specialists
+#   - qwen36 :9361   — Qwen3.6-35B multi-LoRA, code/web/lang specialists
+# The old per-port workers (studio:9301/9323/9325/9326/9327/9330/9340,
+# macm1:8502/9302, tower:9304/8004, kxkm-ai:8002/8003) are decommissioned.
 WORKERS = [
-    # --- Studio (M3 Ultra, 512 GB unified) ---
     {
-        "id": "studio-mistral-medium",
-        "label": "Mac Studio · Mistral-Medium-128B :9301",
-        "url": "http://studio:9301",
+        "id": "studio-omlx",
+        "label": "Mac Studio · omlx multi-model :8500",
+        "url": "http://100.116.92.12:8500",
         "host": "studio",
         "gpu": "Apple M3 Ultra (76-core GPU)",
         "vram_gb": 512.0,
         "tdp_w": 215,
-        "gateway_aliases": ["ailiance-mistral", "ailiance-mistral-medium"],
-        "served_models": ["Mistral-Medium-3.5-128B-MLX-Q8"],
-    },
-    {
-        "id": "studio-reasoning-r1",
-        "label": "Mac Studio · DeepSeek-R1 :9323",
-        "url": "http://studio:9323",
-        "host": "studio",
-        "gpu": "Apple M3 Ultra (76-core GPU)",
-        "vram_gb": 512.0,
-        "tdp_w": 215,
-        "gateway_aliases": ["ailiance-reasoning-r1"],
-        "served_models": ["DeepSeek-R1-Distill-Qwen-32B-MLX-4bit"],
-    },
-    {
-        "id": "studio-pixtral",
-        "label": "Mac Studio · Pixtral-12B :9325",
-        "url": "http://studio:9325",
-        "host": "studio",
-        "gpu": "Apple M3 Ultra (76-core GPU)",
-        "vram_gb": 512.0,
-        "tdp_w": 215,
-        "gateway_aliases": ["ailiance-pixtral"],
-        "served_models": ["Pixtral-12B-MLX-4bit"],
-    },
-    {
-        "id": "studio-mistral-small",
-        "label": "Mac Studio · Mistral-Small-24B :9326",
-        "url": "http://studio:9326",
-        "host": "studio",
-        "gpu": "Apple M3 Ultra (76-core GPU)",
-        "vram_gb": 512.0,
-        "tdp_w": 215,
-        "gateway_aliases": ["ailiance-mistral-small"],
-        "served_models": ["Mistral-Small-3.1-24B-Instruct-MLX-4bit"],
+        "gateway_aliases": [
+            "ailiance-mistral-medium", "ailiance-mistral", "ailiance-eurollm",
+            "ailiance-apertus", "ailiance-gemma", "ailiance-granite",
+            "ailiance-devstral-base", "ailiance-flagship", "ailiance-qwen-235b",
+            "ailiance-reasoning-r1", "ailiance-llama", "ailiance-pixtral",
+            "ailiance-gemma4-omlx", "ailiance-mistral-small", "ailiance-coder-pro",
+            "ailiance-mixtral", "ailiance-mixtral-8x22b",
+        ],
+        "served_models": [
+            "Mistral-Medium-3.5-128B-MLX-Q8",
+            "DeepSeek-R1-Distill-Qwen-32B",
+            "Qwen3-Coder-30B-A3B", "Qwen3-Coder-Next-8bit (80B MoE)",
+            "EuroLLM-22B", "granite-4.1-30b", "Mixtral-8x22B",
+            "Devstral-Small-2-24B", "Pixtral-12B", "gemma-4-E4B",
+        ],
     },
     {
-        "id": "studio-coder-pro",
-        "label": "Mac Studio · Qwen3-Coder-30B :9327",
-        "url": "http://studio:9327",
+        "id": "studio-qwen36-hardware",
+        "label": "Mac Studio · Qwen3.6-35B multi-LoRA (hardware/EDA/math) :9360",
+        "url": "http://100.116.92.12:9360",
         "host": "studio",
         "gpu": "Apple M3 Ultra (76-core GPU)",
         "vram_gb": 512.0,
         "tdp_w": 215,
-        "gateway_aliases": ["ailiance-coder-pro"],
-        "served_models": ["Qwen3-Coder-30B-A3B-Instruct-MLX-4bit"],
+        "gateway_aliases": ["ailiance-qwen36"],
+        "served_models": ["Qwen3.6-35B-A3B-MLX-BF16 + 30 LoRA hot-swap"],
     },
     {
-        "id": "studio-devstral-multi",
-        "label": "Mac Studio · Devstral multi-LoRA :9330",
-        "url": "http://studio:9330",
+        "id": "studio-qwen36-code",
+        "label": "Mac Studio · Qwen3.6-35B multi-LoRA (code/web/lang) :9361",
+        "url": "http://100.116.92.12:9361",
         "host": "studio",
         "gpu": "Apple M3 Ultra (76-core GPU)",
         "vram_gb": 512.0,
         "tdp_w": 215,
         "gateway_aliases": [
-            "ailiance-devstral-base", "ailiance-python", "ailiance-cpp",
-            "ailiance-rust-emb", "ailiance-html", "ailiance-ml-training",
+            "ailiance-python", "ailiance-cpp", "ailiance-rust-emb",
+            "ailiance-html", "ailiance-ml-training",
+            "ailiance-components-review", "ailiance-coder",
         ],
-        "served_models": ["Devstral-Small-2-24B-MLX-4bit + 5 LoRA hot-swap"],
-    },
-    # --- macM1 (M1, 32 GB) ---
-    {
-        "id": "macm1-mlx",
-        "label": "macM1 · mlx_lm.server :8502",
-        "url": "http://macm1:8502",
-        "host": "macm1",
-        "gpu": "Apple M1 (8-core GPU)",
-        "vram_gb": 32.0,
-        "tdp_w": 30,
-        # ailiance-granite is NOT here: the gateway force-maps that alias to
-        # kxkm-ai :8003, not macM1. macM1 hosts a granite-4.1-30b model but
-        # the gateway never routes the alias to it.
-        "gateway_aliases": [
-            "ailiance-gemma2", "ailiance-gemma4", "ailiance-ministral",
-            "ailiance-ministral-reasoning",
-        ],
-        "served_models": [
-            "gemma-4-E4B-it-MLX-4bit",
-            "Ministral-3-14B-Instruct-2512-4bit",
-            "Ministral-3-14B-Reasoning-2512-4bit",
-        ],
-    },
-    # --- Tower (NVIDIA Quadro P2000, 5 GB) ---
-    {
-        "id": "tower-gemma",
-        "label": "Tower · llama.cpp Gemma 3 :9304",
-        "url": "http://tower:9304",
-        "host": "tower (NVIDIA Quadro P2000)",
-        "gpu": "NVIDIA Quadro P2000",
-        "vram_gb": 5.0,
-        "tdp_w": 75,
-        "gateway_aliases": ["ailiance-gemma"],
-        "served_models": ["gemma-3-4b-it (Q4 GGUF)"],
-    },
-    {
-        # The 10 hardware mascarade aliases (kicad/spice/stm32/emc/embedded/
-        # platformio/freecad/dsp/iot/power) moved to the Studio MLX worker
-        # :9340 with PR #100/#102. Tower Ollama now only backs the two
-        # aliases the gateway still force-maps to :8004, plus the embed
-        # surface.
-        "id": "tower-ollama",
-        "label": "Tower · Ollama mascarade :8004",
-        "url": "http://host.docker.internal:8004",
-        "host": "tower (autossh tunnel)",
-        "gpu": "NVIDIA Quadro P2000",
-        "vram_gb": 5.0,
-        "tdp_w": 75,
-        "gateway_aliases": [
-            "ailiance-components-review", "ailiance-coder", "ailiance-embed",
-        ],
-        "served_models": [
-            "mascarade-components-review", "mascarade-coder-v2", "bge-m3",
-        ],
-    },
-    # --- Studio (M3 Ultra) MLX bf16 mascarade experts ---
-    {
-        "id": "studio-mascarade",
-        "label": "Mac Studio · MLX mascarade :9340",
-        "url": "http://host.docker.internal:9340",
-        "host": "studio (autossh tunnel)",
-        "gpu": "Apple M3 Ultra (76-core GPU)",
-        "vram_gb": 512.0,
-        "tdp_w": 215,
-        "gateway_aliases": [
-            "ailiance-kicad", "ailiance-spice", "ailiance-stm32", "ailiance-emc",
-            "ailiance-embedded", "ailiance-platformio", "ailiance-freecad",
-            "ailiance-dsp", "ailiance-iot", "ailiance-power",
-        ],
-        "served_models": [
-            "mascarade-kicad", "mascarade-spice", "mascarade-stm32",
-            "mascarade-emc", "mascarade-embedded", "mascarade-platformio",
-            "mascarade-freecad", "mascarade-dsp", "mascarade-iot", "mascarade-power",
-        ],
-    },
-    # --- kxkm-ai (RTX 4090, 24 GB) ---
-    {
-        "id": "kxkm-qwen",
-        "label": "kxkm-ai · llama.cpp Qwen3-Next 80B :8002",
-        "url": "http://host.docker.internal:8002",
-        "host": "kxkm-ai (RTX 4090, autossh tunnel)",
-        "gpu": "NVIDIA RTX 4090",
-        "vram_gb": 24.0,
-        "tdp_w": 450,
-        "gateway_aliases": ["ailiance-qwen"],
-        "served_models": ["Qwen3-Next-80B-A3B-Instruct (Q4_K_M MoE)"],
-    },
-    {
-        "id": "kxkm-granite",
-        "label": "kxkm-ai · llama.cpp Granite 30B :8003",
-        "url": "http://host.docker.internal:8003",
-        "host": "kxkm-ai (RTX 4090, autossh tunnel)",
-        "gpu": "NVIDIA RTX 4090",
-        "vram_gb": 24.0,
-        "tdp_w": 450,
-        "gateway_aliases": ["ailiance-granite"],
-        "served_models": ["granite-4.1-30b-instruct (Q4_K_M)"],
+        "served_models": ["Qwen3.6-35B-A3B-MLX-BF16 + 30 LoRA hot-swap"],
     },
 ]
 
@@ -196,10 +87,9 @@
 # `nvidia-smi` (Linux/NVIDIA) or `ioreg` (Apple Silicon). The api container
 # has openssh-client and /root/.ssh mounted RO from /home/electron/.ssh.
 _HOST_PROBES: dict[str, dict[str, str]] = {
+    # Serving consolidated onto Mac Studio — the other physical hosts no
+    # longer serve LLM workers, so we only probe studio's GPU.
     "studio": {"ssh": "studio", "kind": "apple"},
-    "macm1": {"ssh": "electron@macm1", "kind": "apple"},
-    "tower": {"ssh": "clems@tower", "kind": "nvidia"},
-    "kxkm-ai": {"ssh": "kxkm@10.2.0.237", "kind": "nvidia"},
 }
 
 
@@ -592,8 +482,8 @@ async def _produce() -> list[WorkerStatus]:
             request_counts = await _fetch_gateway_request_counts(client, gateway_url)
             host_probes = await _gather_host_probes()
             # Probe all workers in parallel to bound total latency to
-            # ~max(probe), not sum(probe). 11 workers * 300 ms sequential
-            # = 3.3 s -> ~500 ms. Fixes "probe indisponible" on cockpit.
+            # ~max(probe), not sum(probe). Keeps the cockpit page render
+            # fast and fixes "probe indisponible".
             return list(
                 await asyncio.gather(
                     *(_probe_one(client, w, request_counts, host_probes) for w in WORKERS)

diff --git a/apps/api/tests/integration/test_models_endpoint.py b/apps/api/tests/integration/test_models_endpoint.py
@@ -8,13 +8,14 @@ def test_list_models_returns_cards(client_with_cache: TestClient) -> None:
     assert response.status_code == 200
     cards = response.json()
     ids = {c["id"] for c in cards}
-    # Live workers + auto-router + 12 mascarade specialists + mocked HF entry.
+    # Live workers (omlx :8500 + qwen36 :9360/:9361) + auto-router +
+    # consolidated mascarade card + mocked HF entry.
     assert {
         "ailiance/mistral-medium-3.5-128b",
-        "ailiance/gemma3-4b",
-        "ailiance/qwen3-next-80b-a3b-instruct",
+        "ailiance/gemma4-e4b-curriculum",
+        "ailiance/qwen3-coder-next-80b",
         "ailiance/granite-30b",
-        "ailiance/ministral-14b",
+        "ailiance/eurollm-22b",
         "ailiance/mascarade",
         "ailiance/auto",
         "Ailiance-fr/micro-kiki-v3",

diff --git a/apps/api/tests/integration/test_status_endpoint.py b/apps/api/tests/integration/test_status_endpoint.py
@@ -68,21 +68,17 @@ def test_workers_constant_matches_production_fleet():
     """The hard-coded WORKERS list is the single source of truth for /status."""
     from ailiance_demo.services.gateway_probe import WORKERS
 
+    # Serving is consolidated onto Mac Studio: the omlx multi-model server
+    # (:8500) plus the two Qwen3.6-35B multi-LoRA instances (:9360 hardware/
+    # EDA/math, :9361 code/web/lang). The old per-port / multi-host fleet
+    # (studio:9301/9323/.., macm1, tower, kxkm-ai) is decommissioned.
     ids = {w["id"] for w in WORKERS}
     assert ids == {
-        "studio-mistral-medium", "studio-reasoning-r1", "studio-pixtral",
-        "studio-mistral-small", "studio-coder-pro", "studio-devstral-multi",
-        "studio-mascarade", "macm1-mlx", "tower-gemma", "tower-ollama",
-        "kxkm-qwen", "kxkm-granite",
+        "studio-omlx", "studio-qwen36-hardware", "studio-qwen36-code",
     }
     by_id = {w["id"]: w for w in WORKERS}
-    # kxkm-*, tower-ollama and studio-mascarade reach the cockpit via autossh
-    # tunnels owned by the gateway host; from inside the api container we must
-    # talk to host.docker.internal.
-    assert "host.docker.internal" in by_id["kxkm-qwen"]["url"]
-    assert "host.docker.internal" in by_id["kxkm-granite"]["url"]
-    assert "host.docker.internal" in by_id["tower-ollama"]["url"]
-    assert "host.docker.internal" in by_id["studio-mascarade"]["url"]
-    # Other workers are addressed over Tailscale magic DNS.
-    assert by_id["studio-mistral-medium"]["url"] == "http://studio:9301"
-    assert by_id["tower-gemma"]["url"] == "http://tower:9304"
+    # All three workers live on Mac Studio, reached over Tailscale by IP.
+    assert by_id["studio-omlx"]["url"] == "http://100.116.92.12:8500"
+    assert by_id["studio-qwen36-hardware"]["url"] == "http://100.116.92.12:9360"
+    assert by_id["studio-qwen36-code"]["url"] == "http://100.116.92.12:9361"
+    assert all(w["host"] == "studio" for w in WORKERS)
diff --git a/apps/api/tests/integration/test_workers_endpoint.py b/apps/api/tests/integration/test_workers_endpoint.py
@@ -27,17 +27,15 @@ def test_workers_status_returns_list(empty_hf_cache, empty_eval_index) -> None:
     )
     assert response.status_code == 200
     workers = response.json()
-    # 6 default workers configured: gateway + 5-worker production fleet
-    # (mistral-medium-3.5, gemma4-e4b-curriculum, eurollm, gemma3, qwen3-next).
-    assert len(workers) == 6
+    # 4 default workers configured: gateway + the consolidated Mac Studio
+    # serving fleet (omlx multi-model :8500, qwen36 multi-LoRA :9360/:9361).
+    assert len(workers) == 4
     names = {w["name"] for w in workers}
     assert names == {
         "gateway",
-        "mistral-medium-3.5",
-        "gemma4-e4b-curriculum",
-        "eurollm",
-        "gemma3",
-        "qwen3-next",
+        "omlx",
+        "qwen36-hardware",
+        "qwen36-code",
     }
     # Each entry must report a valid health status; we don't assert "down"
     # because this test sometimes runs from a host that can actually reach