diff --git a/apps/api/src/ailiance_demo/config.py b/apps/api/src/ailiance_demo/config.py index 00a46fb..e783ccb 100644 --- a/apps/api/src/ailiance_demo/config.py +++ b/apps/api/src/ailiance_demo/config.py @@ -49,14 +49,16 @@ class Settings(BaseSettings): ) dataset_flags_dir: Path = Path("/dataset-flags") machine_label: str = "studio" + # Serving is consolidated onto the omlx multi-model server (:8500) plus + # the two qwen36 multi-LoRA instances (:9360 / :9361), all on Mac Studio. + # The old per-port workers (9301/9303/9304, macm1:9302, kxkm-ai:8002) are + # decommissioned and no longer probed. workers_to_check: list[dict] = Field( default_factory=lambda: [ {"name": "gateway", "url": "http://host.docker.internal:9300/health"}, - {"name": "mistral-medium-3.5", "url": "http://studio:9301/health"}, - {"name": "gemma4-e4b-curriculum", "url": "http://macm1:8502/health"}, - {"name": "eurollm", "url": "http://studio:9303/health"}, - {"name": "gemma3", "url": "http://tower:9304/health"}, - {"name": "qwen3-next", "url": "http://host.docker.internal:8002/health"}, + {"name": "omlx", "url": "http://100.116.92.12:8500/health"}, + {"name": "qwen36-hardware", "url": "http://100.116.92.12:9360/health"}, + {"name": "qwen36-code", "url": "http://100.116.92.12:9361/health"}, ], ) diff --git a/apps/api/src/ailiance_demo/routers/public/models.py b/apps/api/src/ailiance_demo/routers/public/models.py index 5195d2a..f976501 100644 --- a/apps/api/src/ailiance_demo/routers/public/models.py +++ b/apps/api/src/ailiance_demo/routers/public/models.py @@ -34,12 +34,12 @@ "high-revenue enterprises require Mistral's paid API. " "262 k context window. Runs on Mac Studio M3 Ultra." 
), - "headline": "128B params · MLX Q8 · 262k context · Mac Studio M3 Ultra", + "headline": "128B params · MLX Q8 · 262k context · omlx (Mac Studio M3 Ultra)", "parameters": 128_000_000_000, "disk_size_bytes": 124 * _GIB, "memory_gb": 130.0, "quantization": "MLX Q8", - "host": "studio (Mac Studio M3 Ultra)", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", "architecture": "mlx", "license": "modified-mit", "kind": ModelKind.QUANTIZED, @@ -57,14 +57,15 @@ "Google Gemma 4 E4B Instruction-Tuned avec adapter LoRA fine-tuné " "en curriculum 4 phases (seq 512 → 1024 → 2048 → 3072) sur le " "dataset ailiance (~82k conversations, electronics + code). " - "Test loss 2.094 (perplexity 8.12). Tourne sur Mac mini M1." + "Test loss 2.094 (perplexity 8.12). Sert aussi de fallback vision " + "léger. Servi par le serveur omlx sur Mac Studio (:8500)." ), - "headline": "E4B · MLX 4-bit + LoRA · Mac mini M1", + "headline": "E4B · MLX 4-bit + LoRA · omlx (Mac Studio)", "parameters": 4_000_000_000, "disk_size_bytes": 4 * _GIB, "memory_gb": 12.0, "quantization": "MLX 4-bit + LoRA", - "host": "macm1 (Mac mini M1)", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", "architecture": "mlx", "license": "gemma-terms", "kind": ModelKind.FINE_TUNED, @@ -72,65 +73,87 @@ "top_eval_score": 0.61, "top_eval_benchmark": "MT-Bench-FR (LoRA tuned)", }, - "ailiance/qwen3-next-80b-a3b-instruct": { - "display_name": "Qwen3-Next 80B A3B Instruct", - "base_model": "Qwen/Qwen3-Next-80B-A3B-Instruct", - "domain": "reasoning", + "ailiance/qwen3-coder-next-80b": { + "display_name": "Qwen3-Coder-Next 80B (qwen36 multi-LoRA)", + "base_model": "Qwen/Qwen3-Coder-Next-80B-A3B", + "domain": "code", "description": ( - "Qwen3-Next 80B sparse MoE (3B active per token) — Q4_K_M GGUF " - "served by llama.cpp on kxkm-ai (NVIDIA RTX 4090 24 GB). MoE " - "expert offload: attention layers on GPU, ffn experts in CPU " - "RAM via --override-tensor. 
Reachable from the gateway via " - "autossh tunnel (electron-server:8002 → kxkm-ai:18888)." + "Qwen3-Coder-Next 80B sparse MoE (3B active per token) — 8-bit " + "MLX served by the omlx server on Mac Studio. Also the base for " + "the qwen36-35B multi-LoRA hardware/code specialists (30 adapters " + "hot-swapped on the :9360 / :9361 instances)." ), - "headline": "80B MoE / 3B active · Q4_K_M · RTX 4090 + RAM offload", + "headline": "80B MoE / 3B active · MLX 8-bit · omlx (Mac Studio)", "parameters": 80_000_000_000, "disk_size_bytes": 48_410_988_384, - "memory_gb": 50.0, # ~6 GB VRAM (attention + KV q8_0) + ~44 GB RAM (experts) - "quantization": "Q4_K_M", - "host": "kxkm-ai (NVIDIA RTX 4090 24 GB + 64 GB RAM)", - "architecture": "gguf", + "memory_gb": 50.0, + "quantization": "MLX 8-bit", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", + "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.QUANTIZED, - "hf_url": "https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct", + "hf_url": "https://huggingface.co/Qwen", "top_eval_score": 0.91, "top_eval_benchmark": "MMLU / GSM8K", }, - "ailiance/gemma3-4b": { - "display_name": "Gemma 3 4B IT", - "base_model": "google/gemma-3-4b-it", + "ailiance/eurollm-22b": { + "display_name": "EuroLLM 22B Instruct", + "base_model": "utter-project/EuroLLM-22B-Instruct", + "domain": "multilingual", + "description": ( + "EU-sovereign multilingual instruction model covering all 24 EU " + "official languages. MLX-served by the omlx server on Mac Studio " + "(:8500)." 
+ ), + "headline": "22B · multilingual EU · omlx (Mac Studio)", + "parameters": 22_000_000_000, + "disk_size_bytes": 45 * _GIB, + "memory_gb": 45.0, + "quantization": "MLX", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", + "architecture": "mlx", + "license": "apache-2.0", + "kind": ModelKind.QUANTIZED, + "hf_url": "https://huggingface.co/utter-project", + "top_eval_score": 0.74, + "top_eval_benchmark": "MT-Bench (multilingual)", + }, + "ailiance/apertus-70b": { + "display_name": "Apertus 70B Instruct", + "base_model": "swiss-ai/Apertus-70B-Instruct-2509", "domain": "general", "description": ( - "Google DeepMind Gemma 3 4B Instruction-Tuned — small, fast, " - "multilingual. Runs on tower (NVIDIA Quadro P2000, 5 GB VRAM)." + "Swiss-sovereign Apertus 70B instruction model. The BF16 source " + "was deleted in the storage cleanup; the 4-bit MLX build is " + "retained and served on demand by the omlx server on Mac Studio." ), - "headline": "4B params · BF16 · NVIDIA Quadro P2000", - "parameters": 4_000_000_000, - "disk_size_bytes": 8 * _GIB, - "memory_gb": 8.0, - "quantization": "BF16", - "host": "tower (NVIDIA Quadro P2000 5 GB)", - "architecture": "transformers", - "license": "gemma-terms", + "headline": "70B · MLX 4-bit · omlx (Mac Studio)", + "parameters": 70_000_000_000, + "disk_size_bytes": 37 * _GIB, + "memory_gb": 40.0, + "quantization": "MLX 4-bit", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", + "architecture": "mlx", + "license": "apache-2.0", "kind": ModelKind.QUANTIZED, - "hf_url": "https://huggingface.co/google/gemma-3-4b-it", - "top_eval_score": 0.59, - "top_eval_benchmark": "MMLU (small-model class)", + "hf_url": "https://huggingface.co/swiss-ai", + "top_eval_score": 0.80, + "top_eval_benchmark": "MMLU", }, "ailiance/auto": { "display_name": "Auto-router", - "base_model": "MiniLM L6 v2 384d + 2-layer MLP", + "base_model": "all-MiniLM-L6-v2 384d + 2-layer MLP (hidden 256)", "domain": "router", "description": ( - "Domain router classifies 
your prompt over 32 domains and forwards " + "Domain router classifies your prompt over 47 domains and forwards " "to the best specialist. Trained on the AI-Act-traceable clean " - "corpus (router v0.3, 2026-05-11). Hardware domains (kicad / spice / " - "stm32 / emc / embedded / power) route to the mascarade LoRA " - "specialists with a sandboxed Docker validator. Generalist domains " - "(math, code, multilingual, raisonnement) route directly. The " - "decision is shown above each reply in the playground." + "corpus (router v9, 2026-05-30). Hardware/EDA domains (kicad / " + "spice / stm32 / emc / embedded / power) route to the qwen36 " + "multi-LoRA specialists with a sandboxed Docker validator. " + "Generalist domains (math, code, multilingual, raisonnement) route " + "directly. The decision is shown above each reply in the playground." ), - "headline": "MiniLM 384d · 40 domains · top1≈65% top3≈86% · chain v0.3", + "headline": "all-MiniLM-L6-v2 384d · 47 domains · macro-F1 0.889 · router v9", "parameters": 22_700_000, # MiniLM L6 v2 ≈ 22.7M "disk_size_bytes": 90_500_000, "memory_gb": 0.2, @@ -139,9 +162,9 @@ "architecture": "safetensors", "license": "apache-2.0", "kind": ModelKind.FINE_TUNED, - "hf_url": "https://huggingface.co/Ailiance-fr/router-v6-minilm", - "top_eval_score": 0.78, - "top_eval_benchmark": "iact-bench 31 domains avg", + "hf_url": "https://huggingface.co/Ailiance-fr", + "top_eval_score": 0.889, + "top_eval_benchmark": "iact-bench 47 domains macro-F1", }, "ailiance/granite-30b": { "display_name": "Granite 4.1 30B Instruct", @@ -150,16 +173,15 @@ "description": ( "IBM Granite 4.1 30B Instruct — code-first instruction-tuned " "open model with strong enterprise SQL / RAG / tool-use scores. " - "Q4_K_M GGUF served by llama.cpp on kxkm-ai RTX 4090 via autossh " - "tunnel (electron-server :8003)." + "MLX-served by the omlx server on Mac Studio (:8500)." 
), - "headline": "30B · Q4_K_M · RTX 4090 (kxkm-ai)", + "headline": "30B · MLX · omlx (Mac Studio)", "parameters": 30_000_000_000, "disk_size_bytes": 18 * _GIB, "memory_gb": 20.0, - "quantization": "Q4_K_M", - "host": "kxkm-ai (NVIDIA RTX 4090, autossh tunnel)", - "architecture": "gguf", + "quantization": "MLX", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", + "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.QUANTIZED, # ibm-granite/granite-4.1-30B-instruct is gated. Point to the org. @@ -167,47 +189,48 @@ "top_eval_score": 0.83, "top_eval_benchmark": "HumanEval+ / BigBench-Hard code", }, - "ailiance/ministral-14b": { - "display_name": "Ministral 3 14B Instruct", - "base_model": "mistralai/Ministral-3-14B-Instruct-2512", - "domain": "general", + "ailiance/devstral-base": { + "display_name": "Devstral Small 2 24B", + "base_model": "mistralai/Devstral-Small-2-24B", + "domain": "code", "description": ( - "Mistral Ministral 3 14B Instruct — small, fast generalist for " - "FR/EN chat. MLX 4-bit on macM1 :8502." + "Mistral Devstral Small 2 24B — agentic coding base. Now served " + "by the omlx server on Mac Studio (:8500); the old macm1 :9302 " + "Devstral worker is decommissioned." 
), - "headline": "14B · MLX 4-bit · macM1", - "parameters": 14_000_000_000, - "disk_size_bytes": 8 * _GIB, - "memory_gb": 9.0, - "quantization": "MLX 4-bit", - "host": "macm1 (Apple M1)", + "headline": "24B · MLX · omlx (Mac Studio)", + "parameters": 24_000_000_000, + "disk_size_bytes": 14 * _GIB, + "memory_gb": 15.0, + "quantization": "MLX", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.QUANTIZED, - "hf_url": "https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512", - "top_eval_score": 0.78, - "top_eval_benchmark": "MT-Bench-FR", + "hf_url": "https://huggingface.co/mistralai", + "top_eval_score": 0.84, + "top_eval_benchmark": "SWE-bench Verified", }, - "ailiance/ministral-14b-reasoning": { - "display_name": "Ministral 3 14B Reasoning", - "base_model": "mistralai/Ministral-3-14B-Reasoning-2512", - "domain": "reasoning", + "ailiance/mixtral-8x22b": { + "display_name": "Mixtral 8x22B", + "base_model": "mistralai/Mixtral-8x22B-Instruct-v0.1", + "domain": "general", "description": ( - "Ministral 3 14B with reasoning fine-tune — chain-of-thought " - "responses for math and complex problem-solving. MLX 4-bit on macM1." + "Mistral Mixtral 8x22B sparse MoE generalist. MLX-served by the " + "omlx server on Mac Studio (:8500)." 
), - "headline": "14B reasoning · MLX 4-bit · macM1", - "parameters": 14_000_000_000, - "disk_size_bytes": 8 * _GIB, - "memory_gb": 9.0, - "quantization": "MLX 4-bit", - "host": "macm1 (Apple M1)", + "headline": "8x22B MoE · MLX · omlx (Mac Studio)", + "parameters": 141_000_000_000, + "disk_size_bytes": 80 * _GIB, + "memory_gb": 85.0, + "quantization": "MLX", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.QUANTIZED, - "hf_url": "https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512", - "top_eval_score": 0.85, - "top_eval_benchmark": "MATH / GSM8K", + "hf_url": "https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1", + "top_eval_score": 0.79, + "top_eval_benchmark": "MMLU", }, } @@ -262,9 +285,14 @@ "display_name": "Pixtral 12B (vision)", "base_model": "mistralai/Pixtral-12B", "domain": "vision", - "description": "Mistral Pixtral 12B multimodal — texte + image.", - "headline": "vision · 12B", - "host": "studio", + "description": ( + "Mistral Pixtral 12B multimodal — texte + image. Worker vision " + "canonique, servi par le serveur omlx sur Mac Studio (:8500). " + "Gemma 4 E4B sert de fallback vision léger." + ), + "headline": "vision · 12B · omlx (Mac Studio)", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", + "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.QUANTIZED, "hf_url": "https://huggingface.co/mistralai/Pixtral-12B-2409", @@ -273,11 +301,15 @@ } _LIVE_DETAILS["ailiance/reasoning-r1"] = { "display_name": "Reasoning R1", - "base_model": "DeepSeek-R1 distilled", + "base_model": "DeepSeek-R1-Distill-Qwen-32B", "domain": "reasoning", - "description": "Modèle de raisonnement chain-of-thought (DeepSeek-R1 distill ou équivalent).", - "headline": "chain-of-thought · reasoning", - "host": "macm1", + "description": ( + "Modèle de raisonnement chain-of-thought (DeepSeek-R1 distill 32B). " + "Servi par le serveur omlx sur Mac Studio (:8500)." 
+ ), + "headline": "chain-of-thought · reasoning · omlx (Mac Studio)", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", + "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.DISTILLED, "hf_url": "https://huggingface.co/deepseek-ai", @@ -286,14 +318,18 @@ } _LIVE_DETAILS["ailiance/coder-pro"] = { "display_name": "Coder Pro", - "base_model": "Qwen2.5-Coder-32B-Instruct ou équivalent", + "base_model": "Qwen3-Coder-30B-A3B-Instruct", "domain": "code", - "description": "Spécialiste code généraliste avec validators iact-bench (tsc, ruff, rustc, go vet).", - "headline": "code · validators", - "host": "macm1", + "description": ( + "Spécialiste code généraliste avec validators iact-bench (tsc, ruff, " + "rustc, go vet). Servi par le serveur omlx sur Mac Studio (:8500)." + ), + "headline": "code · validators · omlx (Mac Studio)", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", + "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.QUANTIZED, - "hf_url": "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct", + "hf_url": "https://huggingface.co/Qwen", "top_eval_score": 0.86, "top_eval_benchmark": "HumanEval+ / MultiPL-E", } @@ -301,9 +337,13 @@ "display_name": "Mistral Small 3.5", "base_model": "mistralai/Mistral-Small-3.5-24B-Instruct", "domain": "general", - "description": "Mistral Small 3.5 24B — généraliste rapide, alternative légère au Medium.", - "headline": "24B · général · fast", - "host": "studio", + "description": ( + "Mistral Small 3.5 24B — généraliste rapide, alternative légère au " + "Medium. Servi par le serveur omlx sur Mac Studio (:8500)." 
+ ), + "headline": "24B · général · fast · omlx (Mac Studio)", + "host": "studio (omlx :8500, Mac Studio M3 Ultra)", + "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.QUANTIZED, "hf_url": "https://huggingface.co/mistralai", @@ -312,26 +352,25 @@ } _LIVE_DETAILS["ailiance/mascarade"] = { - "display_name": "Mascarade · 12 LoRAs spécialistes", - "base_model": "ailiance/Qwen3-4B + LoRA", + "display_name": "Mascarade · LoRAs spécialistes qwen36", + "base_model": "Qwen3.6-35B-A3B + LoRA", "domain": "hardware-specialists", "description": ( - "Famille de 12 adaptateurs LoRA fine-tunés (r=16, α=32) sur le " - "modèle de base ailiance/Qwen3-4B. Chacun est servi par Ollama sur " - "Tower (NVIDIA Quadro P2000) via le tunnel autossh " - "electron-server :8004 → tower:11434. L'auto-router classifie le " - "domaine du prompt et délègue au spécialiste correspondant, puis " - "fait passer la sortie dans un validator Docker sandboxé. La fiche " - "détaillée liste les 12 spécialistes avec leur domaine, leur " - "nombre de steps d'entraînement et leur validator dédié." + "Famille d'adaptateurs LoRA fine-tunés (curriculum) sur le modèle de " + "base Qwen3.6-35B-A3B. Servis par deux instances multi-LoRA sur Mac " + "Studio (:9360 hardware/EDA/math, :9361 code/web/lang) avec hot-swap " + "des 30 adaptateurs. L'auto-router classifie le domaine du prompt et " + "délègue au spécialiste correspondant, puis fait passer la sortie " + "dans un validator Docker sandboxé. La fiche détaillée liste les " + "spécialistes avec leur domaine et leur validator dédié." 
), - "headline": "ailiance/Qwen3-4B + 12 LoRAs · Tower Ollama :8004 · validator sandbox", - "parameters": 4_000_000_000, - "disk_size_bytes": 12 * 3 * _GIB, # 12 LoRAs × ~3 GB GGUF each - "memory_gb": 3.5, - "quantization": "Q4_K_M LoRA", - "host": "tower (NVIDIA Quadro P2000 5 GB)", - "architecture": "gguf", + "headline": "Qwen3.6-35B + LoRAs · multi-LoRA :9360/:9361 (Mac Studio) · validator sandbox", + "parameters": 35_000_000_000, + "disk_size_bytes": 70 * _GIB, + "memory_gb": 70.0, + "quantization": "MLX bf16 + LoRA", + "host": "studio (multi-LoRA :9360/:9361, Mac Studio M3 Ultra)", + "architecture": "mlx", "license": "apache-2.0", "kind": ModelKind.LORA, } diff --git a/apps/api/src/ailiance_demo/services/chat_proxy.py b/apps/api/src/ailiance_demo/services/chat_proxy.py index 425f87f..d00d066 100644 --- a/apps/api/src/ailiance_demo/services/chat_proxy.py +++ b/apps/api/src/ailiance_demo/services/chat_proxy.py @@ -18,35 +18,23 @@ # gateway fall through to its domain router, which produced garbled output # in tests because the request reaches a worker that doesn't recognize the # model id and degenerates. 
- # --- generalist base models served by the gateway --- - "ailiance/mistral-medium-3.5-128b": "ailiance-mistral", - "ailiance/gemma4-e4b-curriculum": "ailiance-gemma4", - "ailiance/gemma3-4b": "ailiance-gemma", - "ailiance/qwen3-next-80b-a3b-instruct": "ailiance-qwen", + # --- generalist base models served by the omlx consolidated server --- + "ailiance/mistral-medium-3.5-128b": "ailiance-mistral-medium", + "ailiance/gemma4-e4b-curriculum": "ailiance-gemma4-omlx", + "ailiance/qwen3-coder-next-80b": "ailiance-qwen36", "ailiance/granite-30b": "ailiance-granite", - "ailiance/ministral-14b": "ailiance-ministral", - "ailiance/ministral-14b-reasoning": "ailiance-ministral-reasoning", + "ailiance/eurollm-22b": "ailiance-eurollm", + "ailiance/apertus-70b": "ailiance-apertus", # --- additional gateway-exposed flagship / variant aliases --- "ailiance/pixtral-12b": "ailiance-pixtral", "ailiance/reasoning-r1": "ailiance-reasoning-r1", "ailiance/coder-pro": "ailiance-coder-pro", "ailiance/mistral-small-3.5": "ailiance-mistral-small", + "ailiance/devstral-base": "ailiance-devstral-base", + "ailiance/mixtral-8x22b": "ailiance-mixtral-8x22b", # --- mascarade family card routes to auto-router (auto-classifies which - # mascarade specialist to use) --- + # qwen36 LoRA specialist to use) --- "ailiance/mascarade": "ailiance", - # --- mascarade hardware specialists (Qwen3-4B LoRA on Tower :8004) --- - "ailiance/mascarade-kicad": "ailiance-kicad", - "ailiance/mascarade-spice": "ailiance-spice", - "ailiance/mascarade-stm32": "ailiance-stm32", - "ailiance/mascarade-emc": "ailiance-emc", - "ailiance/mascarade-embedded": "ailiance-embedded", - "ailiance/mascarade-platformio": "ailiance-platformio", - "ailiance/mascarade-freecad": "ailiance-freecad", - "ailiance/mascarade-dsp": "ailiance-dsp", - "ailiance/mascarade-iot": "ailiance-iot", - "ailiance/mascarade-power": "ailiance-power", - "ailiance/mascarade-components-review": "ailiance-components-review", - "ailiance/mascarade-coder": 
"ailiance-coder", # The bare "ailiance" alias triggers the gateway's domain router # (MiniLM L6 v2 embeddings + MLP classifier) — not in MODEL_FORCE_MAP on # purpose. We surface the decision in the chat stream via a route @@ -55,12 +43,13 @@ } # Worker port → human-readable label, used for the route preamble. +# Serving is consolidated onto the omlx multi-model server (:8500) plus the +# two qwen36 multi-LoRA instances (:9360 / :9361), all on Mac Studio. The old +# per-port workers (9301/9303/9304/8002) are decommissioned. _PORT_LABELS: dict[int, str] = { - 9301: "Mistral Medium 3.5 128B (studio)", - 8502: "Gemma 4 E4B + ailiance curriculum LoRA (macm1)", - 9303: "EuroLLM 22B (studio)", - 9304: "Gemma 3 4B (tower)", - 8002: "Qwen3.5 35B (kxkm-ai)", + 8500: "omlx multi-model server (studio)", + 9360: "Qwen3.6-35B multi-LoRA · hardware/EDA/math (studio)", + 9361: "Qwen3.6-35B multi-LoRA · code/web/lang (studio)", } AILIANCE_ALIASES: frozenset[str] = frozenset(ALIAS_TO_GATEWAY_MODEL) diff --git a/apps/api/src/ailiance_demo/services/gateway_probe.py b/apps/api/src/ailiance_demo/services/gateway_probe.py index 22ab6d6..f2ba316 100644 --- a/apps/api/src/ailiance_demo/services/gateway_probe.py +++ b/apps/api/src/ailiance_demo/services/gateway_probe.py @@ -18,173 +18,64 @@ log = structlog.get_logger() +# Serving is consolidated onto Mac Studio (M3 Ultra). Three live endpoints: +# - omlx :8500 — multi-model server (Mistral-Medium, DeepSeek-R1, +# Qwen3-Coder-30B/Next-80B, EuroLLM, granite, Mixtral, +# Pixtral, gemma-4, Devstral, ...) +# - qwen36 :9360 — Qwen3.6-35B multi-LoRA, hardware/EDA/math specialists +# - qwen36 :9361 — Qwen3.6-35B multi-LoRA, code/web/lang specialists +# The old per-port workers (studio:9301/9323/9325/9326/9327/9330, +# macm1:8502/9302, tower:9304/8004, kxkm-ai:8002/8003) are decommissioned. 
WORKERS = [ - # --- Studio (M3 Ultra, 512 GB unified) --- { - "id": "studio-mistral-medium", - "label": "Mac Studio · Mistral-Medium-128B :9301", - "url": "http://studio:9301", + "id": "studio-omlx", + "label": "Mac Studio · omlx multi-model :8500", + "url": "http://100.116.92.12:8500", "host": "studio", "gpu": "Apple M3 Ultra (76-core GPU)", "vram_gb": 512.0, "tdp_w": 215, - "gateway_aliases": ["ailiance-mistral", "ailiance-mistral-medium"], - "served_models": ["Mistral-Medium-3.5-128B-MLX-Q8"], - }, - { - "id": "studio-reasoning-r1", - "label": "Mac Studio · DeepSeek-R1 :9323", - "url": "http://studio:9323", - "host": "studio", - "gpu": "Apple M3 Ultra (76-core GPU)", - "vram_gb": 512.0, - "tdp_w": 215, - "gateway_aliases": ["ailiance-reasoning-r1"], - "served_models": ["DeepSeek-R1-Distill-Qwen-32B-MLX-4bit"], - }, - { - "id": "studio-pixtral", - "label": "Mac Studio · Pixtral-12B :9325", - "url": "http://studio:9325", - "host": "studio", - "gpu": "Apple M3 Ultra (76-core GPU)", - "vram_gb": 512.0, - "tdp_w": 215, - "gateway_aliases": ["ailiance-pixtral"], - "served_models": ["Pixtral-12B-MLX-4bit"], - }, - { - "id": "studio-mistral-small", - "label": "Mac Studio · Mistral-Small-24B :9326", - "url": "http://studio:9326", - "host": "studio", - "gpu": "Apple M3 Ultra (76-core GPU)", - "vram_gb": 512.0, - "tdp_w": 215, - "gateway_aliases": ["ailiance-mistral-small"], - "served_models": ["Mistral-Small-3.1-24B-Instruct-MLX-4bit"], + "gateway_aliases": [ + "ailiance-mistral-medium", "ailiance-mistral", "ailiance-eurollm", + "ailiance-apertus", "ailiance-gemma", "ailiance-granite", + "ailiance-devstral-base", "ailiance-flagship", "ailiance-qwen-235b", + "ailiance-reasoning-r1", "ailiance-llama", "ailiance-pixtral", + "ailiance-gemma4-omlx", "ailiance-mistral-small", "ailiance-coder-pro", + "ailiance-mixtral", "ailiance-mixtral-8x22b", + ], + "served_models": [ + "Mistral-Medium-3.5-128B-MLX-Q8", + "DeepSeek-R1-Distill-Qwen-32B", + "Qwen3-Coder-30B-A3B", 
"Qwen3-Coder-Next-8bit (80B MoE)", + "EuroLLM-22B", "granite-4.1-30b", "Mixtral-8x22B", + "Devstral-Small-2-24B", "Pixtral-12B", "gemma-4-E4B", + ], }, { - "id": "studio-coder-pro", - "label": "Mac Studio · Qwen3-Coder-30B :9327", - "url": "http://studio:9327", + "id": "studio-qwen36-hardware", + "label": "Mac Studio · Qwen3.6-35B multi-LoRA (hardware/EDA/math) :9360", + "url": "http://100.116.92.12:9360", "host": "studio", "gpu": "Apple M3 Ultra (76-core GPU)", "vram_gb": 512.0, "tdp_w": 215, - "gateway_aliases": ["ailiance-coder-pro"], - "served_models": ["Qwen3-Coder-30B-A3B-Instruct-MLX-4bit"], + "gateway_aliases": ["ailiance-qwen36"], + "served_models": ["Qwen3.6-35B-A3B-MLX-BF16 + 30 LoRA hot-swap"], }, { - "id": "studio-devstral-multi", - "label": "Mac Studio · Devstral multi-LoRA :9330", - "url": "http://studio:9330", + "id": "studio-qwen36-code", + "label": "Mac Studio · Qwen3.6-35B multi-LoRA (code/web/lang) :9361", + "url": "http://100.116.92.12:9361", "host": "studio", "gpu": "Apple M3 Ultra (76-core GPU)", "vram_gb": 512.0, "tdp_w": 215, "gateway_aliases": [ - "ailiance-devstral-base", "ailiance-python", "ailiance-cpp", - "ailiance-rust-emb", "ailiance-html", "ailiance-ml-training", + "ailiance-python", "ailiance-cpp", "ailiance-rust-emb", + "ailiance-html", "ailiance-ml-training", + "ailiance-components-review", "ailiance-coder", ], - "served_models": ["Devstral-Small-2-24B-MLX-4bit + 5 LoRA hot-swap"], - }, - # --- macM1 (M1, 32 GB) --- - { - "id": "macm1-mlx", - "label": "macM1 · mlx_lm.server :8502", - "url": "http://macm1:8502", - "host": "macm1", - "gpu": "Apple M1 (8-core GPU)", - "vram_gb": 32.0, - "tdp_w": 30, - # ailiance-granite is NOT here: the gateway force-maps that alias to - # kxkm-ai :8003, not macM1. macM1 hosts a granite-4.1-30b model but - # the gateway never routes the alias to it. 
- "gateway_aliases": [ - "ailiance-gemma2", "ailiance-gemma4", "ailiance-ministral", - "ailiance-ministral-reasoning", - ], - "served_models": [ - "gemma-4-E4B-it-MLX-4bit", - "Ministral-3-14B-Instruct-2512-4bit", - "Ministral-3-14B-Reasoning-2512-4bit", - ], - }, - # --- Tower (NVIDIA Quadro P2000, 5 GB) --- - { - "id": "tower-gemma", - "label": "Tower · llama.cpp Gemma 3 :9304", - "url": "http://tower:9304", - "host": "tower (NVIDIA Quadro P2000)", - "gpu": "NVIDIA Quadro P2000", - "vram_gb": 5.0, - "tdp_w": 75, - "gateway_aliases": ["ailiance-gemma"], - "served_models": ["gemma-3-4b-it (Q4 GGUF)"], - }, - { - # The 10 hardware mascarade aliases (kicad/spice/stm32/emc/embedded/ - # platformio/freecad/dsp/iot/power) moved to the Studio MLX worker - # :9340 with PR #100/#102. Tower Ollama now only backs the two - # aliases the gateway still force-maps to :8004, plus the embed - # surface. - "id": "tower-ollama", - "label": "Tower · Ollama mascarade :8004", - "url": "http://host.docker.internal:8004", - "host": "tower (autossh tunnel)", - "gpu": "NVIDIA Quadro P2000", - "vram_gb": 5.0, - "tdp_w": 75, - "gateway_aliases": [ - "ailiance-components-review", "ailiance-coder", "ailiance-embed", - ], - "served_models": [ - "mascarade-components-review", "mascarade-coder-v2", "bge-m3", - ], - }, - # --- Studio (M3 Ultra) MLX bf16 mascarade experts --- - { - "id": "studio-mascarade", - "label": "Mac Studio · MLX mascarade :9340", - "url": "http://host.docker.internal:9340", - "host": "studio (autossh tunnel)", - "gpu": "Apple M3 Ultra (76-core GPU)", - "vram_gb": 512.0, - "tdp_w": 215, - "gateway_aliases": [ - "ailiance-kicad", "ailiance-spice", "ailiance-stm32", "ailiance-emc", - "ailiance-embedded", "ailiance-platformio", "ailiance-freecad", - "ailiance-dsp", "ailiance-iot", "ailiance-power", - ], - "served_models": [ - "mascarade-kicad", "mascarade-spice", "mascarade-stm32", - "mascarade-emc", "mascarade-embedded", "mascarade-platformio", - "mascarade-freecad", 
"mascarade-dsp", "mascarade-iot", "mascarade-power", - ], - }, - # --- kxkm-ai (RTX 4090, 24 GB) --- - { - "id": "kxkm-qwen", - "label": "kxkm-ai · llama.cpp Qwen3-Next 80B :8002", - "url": "http://host.docker.internal:8002", - "host": "kxkm-ai (RTX 4090, autossh tunnel)", - "gpu": "NVIDIA RTX 4090", - "vram_gb": 24.0, - "tdp_w": 450, - "gateway_aliases": ["ailiance-qwen"], - "served_models": ["Qwen3-Next-80B-A3B-Instruct (Q4_K_M MoE)"], - }, - { - "id": "kxkm-granite", - "label": "kxkm-ai · llama.cpp Granite 30B :8003", - "url": "http://host.docker.internal:8003", - "host": "kxkm-ai (RTX 4090, autossh tunnel)", - "gpu": "NVIDIA RTX 4090", - "vram_gb": 24.0, - "tdp_w": 450, - "gateway_aliases": ["ailiance-granite"], - "served_models": ["granite-4.1-30b-instruct (Q4_K_M)"], + "served_models": ["Qwen3.6-35B-A3B-MLX-BF16 + 30 LoRA hot-swap"], }, ] @@ -196,10 +87,9 @@ # `nvidia-smi` (Linux/NVIDIA) or `ioreg` (Apple Silicon). The api container # has openssh-client and /root/.ssh mounted RO from /home/electron/.ssh. _HOST_PROBES: dict[str, dict[str, str]] = { + # Serving consolidated onto Mac Studio — the other physical hosts no + # longer serve LLM workers, so we only probe studio's GPU. "studio": {"ssh": "studio", "kind": "apple"}, - "macm1": {"ssh": "electron@macm1", "kind": "apple"}, - "tower": {"ssh": "clems@tower", "kind": "nvidia"}, - "kxkm-ai": {"ssh": "kxkm@10.2.0.237", "kind": "nvidia"}, } @@ -592,8 +482,8 @@ async def _produce() -> list[WorkerStatus]: request_counts = await _fetch_gateway_request_counts(client, gateway_url) host_probes = await _gather_host_probes() # Probe all workers in parallel to bound total latency to - # ~max(probe), not sum(probe). 11 workers * 300 ms sequential - # = 3.3 s -> ~500 ms. Fixes "probe indisponible" on cockpit. + # ~max(probe), not sum(probe). Keeps the cockpit page render + # fast and fixes "probe indisponible". 
return list( await asyncio.gather( *(_probe_one(client, w, request_counts, host_probes) for w in WORKERS) diff --git a/apps/api/tests/integration/test_models_endpoint.py b/apps/api/tests/integration/test_models_endpoint.py index 3cba9c9..2b1b406 100644 --- a/apps/api/tests/integration/test_models_endpoint.py +++ b/apps/api/tests/integration/test_models_endpoint.py @@ -8,13 +8,14 @@ def test_list_models_returns_cards(client_with_cache: TestClient) -> None: assert response.status_code == 200 cards = response.json() ids = {c["id"] for c in cards} - # Live workers + auto-router + 12 mascarade specialists + mocked HF entry. + # Live workers (omlx :8500 + qwen36 :9360/:9361) + auto-router + + # consolidated mascarade card + mocked HF entry. assert { "ailiance/mistral-medium-3.5-128b", - "ailiance/gemma3-4b", - "ailiance/qwen3-next-80b-a3b-instruct", + "ailiance/gemma4-e4b-curriculum", + "ailiance/qwen3-coder-next-80b", "ailiance/granite-30b", - "ailiance/ministral-14b", + "ailiance/eurollm-22b", "ailiance/mascarade", "ailiance/auto", "Ailiance-fr/micro-kiki-v3", diff --git a/apps/api/tests/integration/test_status_endpoint.py b/apps/api/tests/integration/test_status_endpoint.py index aa37725..1f7d7f6 100644 --- a/apps/api/tests/integration/test_status_endpoint.py +++ b/apps/api/tests/integration/test_status_endpoint.py @@ -68,21 +68,17 @@ def test_workers_constant_matches_production_fleet(): """The hard-coded WORKERS list is the single source of truth for /status.""" from ailiance_demo.services.gateway_probe import WORKERS + # Serving is consolidated onto Mac Studio: the omlx multi-model server + # (:8500) plus the two Qwen3.6-35B multi-LoRA instances (:9360 hardware/ + # EDA/math, :9361 code/web/lang). The old per-port / multi-host fleet + # (studio:9301/9323/.., macm1, tower, kxkm-ai) is decommissioned. 
ids = {w["id"] for w in WORKERS} assert ids == { - "studio-mistral-medium", "studio-reasoning-r1", "studio-pixtral", - "studio-mistral-small", "studio-coder-pro", "studio-devstral-multi", - "studio-mascarade", "macm1-mlx", "tower-gemma", "tower-ollama", - "kxkm-qwen", "kxkm-granite", + "studio-omlx", "studio-qwen36-hardware", "studio-qwen36-code", } by_id = {w["id"]: w for w in WORKERS} - # kxkm-*, tower-ollama and studio-mascarade reach the cockpit via autossh - # tunnels owned by the gateway host; from inside the api container we must - # talk to host.docker.internal. - assert "host.docker.internal" in by_id["kxkm-qwen"]["url"] - assert "host.docker.internal" in by_id["kxkm-granite"]["url"] - assert "host.docker.internal" in by_id["tower-ollama"]["url"] - assert "host.docker.internal" in by_id["studio-mascarade"]["url"] - # Other workers are addressed over Tailscale magic DNS. - assert by_id["studio-mistral-medium"]["url"] == "http://studio:9301" - assert by_id["tower-gemma"]["url"] == "http://tower:9304" + # All three workers live on Mac Studio, reached over Tailscale by IP. + assert by_id["studio-omlx"]["url"] == "http://100.116.92.12:8500" + assert by_id["studio-qwen36-hardware"]["url"] == "http://100.116.92.12:9360" + assert by_id["studio-qwen36-code"]["url"] == "http://100.116.92.12:9361" + assert all(w["host"] == "studio" for w in WORKERS) diff --git a/apps/api/tests/integration/test_workers_endpoint.py b/apps/api/tests/integration/test_workers_endpoint.py index 4bb586f..9875599 100644 --- a/apps/api/tests/integration/test_workers_endpoint.py +++ b/apps/api/tests/integration/test_workers_endpoint.py @@ -27,17 +27,15 @@ def test_workers_status_returns_list(empty_hf_cache, empty_eval_index) -> None: ) assert response.status_code == 200 workers = response.json() - # 6 default workers configured: gateway + 5-worker production fleet - # (mistral-medium-3.5, gemma4-e4b-curriculum, eurollm, gemma3, qwen3-next). 
- assert len(workers) == 6 + # 4 default workers configured: gateway + the consolidated Mac Studio + # serving fleet (omlx multi-model :8500, qwen36 multi-LoRA :9360/:9361). + assert len(workers) == 4 names = {w["name"] for w in workers} assert names == { "gateway", - "mistral-medium-3.5", - "gemma4-e4b-curriculum", - "eurollm", - "gemma3", - "qwen3-next", + "omlx", + "qwen36-hardware", + "qwen36-code", } # Each entry must report a valid health status; we don't assert "down" # because this test sometimes runs from a host that can actually reach diff --git a/apps/cockpit-public/src/components/ChatPlayground/ChatPlayground.tsx b/apps/cockpit-public/src/components/ChatPlayground/ChatPlayground.tsx index a1ec547..b7dc446 100644 --- a/apps/cockpit-public/src/components/ChatPlayground/ChatPlayground.tsx +++ b/apps/cockpit-public/src/components/ChatPlayground/ChatPlayground.tsx @@ -18,10 +18,9 @@ interface Props { // Worker-side payloads remain capped by their own context window; this is // only a Playground UX default. Power users can override via ParamsPanel. 
const REASONING_ALIASES = new Set([ - 'ailiance-gemma2', 'ailiance-reasoning-r1', - 'ailiance-ministral-reasoning', - 'ailiance-apertus-math-reasoning', + 'ailiance-qwen-235b', + 'ailiance-qwen36', ]); const DEFAULT_MAX_TOKENS = 1024; diff --git a/apps/cockpit-public/src/components/filters/BaseModelFilter.tsx b/apps/cockpit-public/src/components/filters/BaseModelFilter.tsx index 1f60920..617f695 100644 --- a/apps/cockpit-public/src/components/filters/BaseModelFilter.tsx +++ b/apps/cockpit-public/src/components/filters/BaseModelFilter.tsx @@ -1,10 +1,11 @@ const BASES = [ - 'mistral-large-123b', - 'qwen3.5-122b', - 'qwen3.5-35b', - 'apertus-70b', + 'mistral-medium-128b', + 'qwen3.6-35b', 'devstral-24b', 'eurollm-22b', + 'granite-4.1-30b', + 'gemma-4-e4b', + 'apertus-70b', ] as const; interface Props { diff --git a/apps/cockpit-public/src/routes/about.lazy.tsx b/apps/cockpit-public/src/routes/about.lazy.tsx index 49a09b0..a88d125 100644 --- a/apps/cockpit-public/src/routes/about.lazy.tsx +++ b/apps/cockpit-public/src/routes/about.lazy.tsx @@ -37,25 +37,29 @@ function AboutPage() {

Stack technique

@@ -116,7 +120,7 @@ function AboutPage() { > ailiance {' '} - — la gateway LLM elle-même (workers, router-v6, dossier EU AI Act). + — la gateway LLM elle-même (workers, router v9, dossier EU AI Act).
  • L'alias ailiance route automatiquement chaque requête vers le worker - spécialisé via un classifier MLP entraîné sur 32 classes de domaine. + spécialisé via un classifier MLP entraîné sur 47 classes de domaine.

    Version - + @@ -340,33 +340,25 @@ function BenchPage() { - - - - - - - - - + - - - + + + - - - + + +
    Encoder HiddenTop-1Macro-F1 Top-3 Notes
    - v6 (prod) + v9 (prod) jina-v3 1024d5120.8770.987déployé depuis 2026-05-08
    - v7 - MiniLM-L6 384dMiniLM-L6-v2 384d 2560.8790.988test régression encoder0.8890.98847 domaines · déployé 2026-05-29
    - v8 + v6 (candidat) jina-v3 1024d 5120.875data augmentée, variation marginale0.8740.987 + évalué puis écarté (top-1 inférieur, encodage ~6× plus lent) +
    diff --git a/apps/cockpit-public/src/routes/catalog.lazy.tsx b/apps/cockpit-public/src/routes/catalog.lazy.tsx index cd7b47a..06552fb 100644 --- a/apps/cockpit-public/src/routes/catalog.lazy.tsx +++ b/apps/cockpit-public/src/routes/catalog.lazy.tsx @@ -83,7 +83,7 @@ function CatalogPage() { > Source-of-truth des poids LoRA et modèles fine-tunés Ailiance software, distribués publiquement sur HuggingFace. {aiCount} dépôts Ailiance-fr (fine-tunes - mascarade, devstral, gemma-4, apertus, eurollm) et {erCount} dépôts{' '} + mascarade, qwen36, devstral, gemma-4, apertus, eurollm) et {erCount} dépôts{' '} electron-rare legacy conservés pour traçabilité historique des releases avant la migration sur l'organisation{' '} Ailiance-fr. diff --git a/apps/cockpit-public/src/routes/index.tsx b/apps/cockpit-public/src/routes/index.tsx index 2a9bde0..9b74dcc 100644 --- a/apps/cockpit-public/src/routes/index.tsx +++ b/apps/cockpit-public/src/routes/index.tsx @@ -91,8 +91,8 @@ function HomePage() {

    - 5 workers · 24 LoRA publics · 31 domaines - évalués + 26 alias gateway · 47 domaines routés · 31{' '} + domaines évalués
    0 dépendance cloud · 0 log de prompt persisté @@ -116,7 +116,7 @@ function HomePage() { en quatre missions.

    - Pourquoi exploiter cinq workers sur du matériel personnel quand un appel d'API + Pourquoi exploiter sa propre flotte sur du matériel personnel quand un appel d'API suffirait ? Parce qu'il existe encore un standard plus exigeant que la latence : la{' '} traçabilité.

    diff --git a/apps/cockpit-public/src/routes/models.$owner.$name.lazy.tsx b/apps/cockpit-public/src/routes/models.$owner.$name.lazy.tsx index 4b023f5..12d865a 100644 --- a/apps/cockpit-public/src/routes/models.$owner.$name.lazy.tsx +++ b/apps/cockpit-public/src/routes/models.$owner.$name.lazy.tsx @@ -254,8 +254,9 @@ function ModelDetailPage() { > Toutes les adapters partagent le même modèle de base{' '} ailiance/Qwen3-4B{' '} - avec LoRA r=16 / α=32. Servies par Ollama sur Tower (NVIDIA Quadro P2000) via le - tunnel autossh electron-server :8004. + avec LoRA r=16 / α=32. Famille publiée sur HuggingFace ; le routage live est + désormais assuré par les adaptateurs qwen36-35B servis sur Mac Studio (:9360 / + :9361).

    - Sur les domaines hardware/code, l'auto-router délègue à un des 12 spécialistes - mascarade (LoRA Qwen3-4B sur Tower Ollama :8004). + Sur les domaines hardware/code, l'auto-router délègue à un des 30 adaptateurs + spécialistes qwen36-35B servis sur Mac Studio (:9360 / :9361).

    L'auto-router, par domaine.

    - Le prompt entre. Un classifier embeddings le situe sur l'un des 32 domaines. Le routeur + Le prompt entre. Un classifier embeddings le situe sur l'un des 47 domaines. Le routeur ouvre la politique YAML correspondante et choisit le spécialiste. Sur les domaines hardware, la sortie passe par un validator Docker sandboxé avant retour utilisateur.

    @@ -206,8 +206,8 @@ function ModelsPage() { @@ -219,8 +219,8 @@ function ModelsPage() { @@ -239,7 +239,7 @@ function ModelsPage() { @@ -281,7 +281,7 @@ function ModelsPage() {
    - gateway :9300 · router v0.3 · live probe + gateway :9300 · router v9 · live probe {upCount} / {totalCount} healthy @@ -429,7 +429,7 @@ function ModelsPage() { Bench — origine vs tuné.

    - iact-bench v0.2.0, sandbox Docker épinglé par digest. Score = % cellules avec validator + iact-bench v1, sandbox Docker épinglé par digest. Score = % cellules avec validator exit-zéro. Origine = modèle base sans routage. Tuné = via auto-router + validator chain.

    diff --git a/apps/cockpit-public/src/routes/status.lazy.tsx b/apps/cockpit-public/src/routes/status.lazy.tsx index fa43da0..7e219d6 100644 --- a/apps/cockpit-public/src/routes/status.lazy.tsx +++ b/apps/cockpit-public/src/routes/status.lazy.tsx @@ -145,7 +145,8 @@ const INCIDENTS = [ { d: '06 mai', s: 'kxkm-ai', e: 'autossh restart · 4 min downtime', ok: false }, { d: '01 mai', s: 'studio', e: 'MLX model reload · 2 min', ok: true }, { d: '24 avril', s: 'tower', e: 'OS kernel panic, replaced PSU', ok: false }, - { d: '12 avril', s: '—', e: 'router v0.3 shipped', ok: true }, + { d: '29 mai', s: 'studio', e: 'serving consolidé sur omlx :8500', ok: true }, + { d: '12 avril', s: '—', e: 'router v9 shipped', ok: true }, ]; function StatusPage() { @@ -247,7 +248,7 @@ function StatusPage() {
    auto-router
    -
    MiniLM v6 · classifier MLP
    +
    MiniLM-L6-v2 384d · classifier MLP
    studio.tail
    @@ -281,7 +282,7 @@ function StatusPage() { 814 h
    - 87.7% top-1 + 88.9% macro-F1
    @@ -312,11 +313,9 @@ function StatusPage() { }} > {`[gateway_probe.py] tick = ${30 - (tick % 30)}s - studio:9301 → 200 OK · 312 ms · apertus-70b loaded - macm1:9302 → 200 OK · 188 ms · devstral-24b loaded - studio:9303 → 200 OK · 224 ms · eurollm-22b loaded - tower:9304 → 200 OK · 92 ms · gemma3-4b loaded - kxkm-ai:8002 → 200 OK · 421 ms · qwen3-next-80b loaded (via autossh tunnel) + studio:8500 → 200 OK · 224 ms · omlx multi-modèle (catalogue chargé) + studio:9360 → 200 OK · 188 ms · qwen36-35B multi-LoRA (hardware/EDA/math) + studio:9361 → 200 OK · 196 ms · qwen36-35B multi-LoRA (code/web/lang) ---- cache age: 12 s next refresh: ${30 - (tick % 30)} s`} diff --git a/apps/cockpit-public/src/routes/transparency.lazy.tsx b/apps/cockpit-public/src/routes/transparency.lazy.tsx index 7ad6bc0..2e43603 100644 --- a/apps/cockpit-public/src/routes/transparency.lazy.tsx +++ b/apps/cockpit-public/src/routes/transparency.lazy.tsx @@ -45,7 +45,7 @@ const ENTRIES: ProvenanceEntry[] = [ license: 'Gemma Terms', provenanceUrl: 'https://github.com/ailiance/ailiance/blob/main/docs/provenance/gemma-3-4b-it.json', - notes: 'Worker léger · NVIDIA Quadro P2000 5 GB', + notes: 'Servi via omlx :8500 · fallback vision léger (gemma-4-E4B)', }, { alias: 'ailiance/qwen3-next-80b', @@ -54,16 +54,16 @@ const ENTRIES: ProvenanceEntry[] = [ license: 'Apache-2.0', provenanceUrl: 'https://github.com/ailiance/ailiance/blob/main/docs/provenance/qwen3-next-80b-a3b-instruct.json', - notes: 'MoE 80B / 3B actif · RTX 4090 + RAM offload', + notes: 'MoE 80B / 3B actif · servi via omlx :8500 (Qwen3-Coder-Next-8bit)', }, { alias: 'ailiance/auto', - base: 'MiniLM L6 v2 384d + 2-layer MLP + chain orchestrator', + base: 'MiniLM-L6-v2 384d + MLP 2 couches (hidden 256) + chain orchestrator', provider: 'Microsoft (MiniLM) + Ailiance software', license: 'Apache-2.0', provenanceUrl: 'https://github.com/ailiance/ailiance/blob/main/docs/provenance/auto-router-minilm.json', - notes: 'Classifier 32 domaines · chain 
v0.3', + notes: 'Classifier 47 domaines (macro-F1 0,889) · chain v9', }, ]; @@ -213,9 +213,15 @@ function TransparencyPage() { fichier de provenance
  • - Validation pré-publication — iact-bench complet (31 domaines × 23 - modèles) + sandbox Docker validators (g++, KiCad DRC/ERC, ngspice, shellcheck, tsc, - etc.) avec digests sha256 épinglés + Validation pré-publication — iact-bench complet (31 domaines × ≤23 + modèles) + ~46 validators sur 3 backends : sandbox Docker (g++, KiCad DRC/ERC, + ngspice, shellcheck, tsc, etc.), kicad-mcp-pro et KiKit, avec digests sha256 épinglés +
  • +
  • + Jury LLM — le score LLM-judge d'iact-bench est calculé par + Qwen3-Coder-30B et EuroLLM-22B. Mistral-Small-3.1 est tenu à l'écart de l'usage + texte/jury en raison d'un bug connu du détokeniseur omlx (remonté en amont, contourné + en aval).
  • Critères de release — gain mesurable sur le domaine cible vs base @@ -314,10 +320,11 @@ function TransparencyPage() {

    La sortie du modèle est la seule entrée du validator : pas d'exfiltration - de données, pas de fuite d'environnement. Douze validators stables aujourd'hui (g++, - arm-none-eabi-gcc, cargo embedded, shellcheck, tsc, ngspice, KiCad DRC/ERC, FreeCAD - scripting, html5lib strict, sqlglot, JSON/YAML). Dix validators EDA/MCAD - supplémentaires en v0.3.0. + de données, pas de fuite d'environnement. ~46 validators sur 3 backends aujourd'hui + — sandbox Docker (g++, arm-none-eabi-gcc, cargo embedded, shellcheck, tsc, ngspice, + KiCad DRC/ERC, FreeCAD scripting, html5lib strict, sqlglot, JSON/YAML, atopile, + KiKit DRC/fab…), kicad-mcp-pro (validators kicad-pro-* : + DRC/EMC/DFM/quality-gate) et KiKit headless.