17 changes: 9 additions & 8 deletions .claude/skills/setup-local/SKILL.md

@@ -38,7 +38,7 @@ See `docker/README.md` for details. Report success and stop.
 
 ```bash
 # Python deps (from repo root)
-uv sync --extra local --extra teacher --extra dev
+uv sync --extra local --extra dev
 
 # pyproject.toml pins torch to CPU — reinstall with CUDA
 # Match cu1XX to your CUDA version from nvidia-smi (e.g. cu124, cu128)
@@ -71,7 +71,7 @@ mkdir -p "$LORA_ROOT" "$OPENCLAW_HOME"
 CLAAS_CONFIG_NAME=local \
 CLAAS_LORA_ROOT="$LORA_ROOT" \
 LORA_NAME=openclaw/assistant \
-MODEL=Qwen/Qwen3-8B \
+MODEL=Qwen/Qwen3.5-9B \
 VLLM_BASE_URL=http://localhost:8000/v1 \
 API_KEY=sk-local \
 TELEGRAM_BOT_TOKEN=<token> \
@@ -117,17 +117,18 @@ LORA_ROOT="${HOME}/.local/share/claas/loras"
 [ -f "$LORA_ROOT/.aliases.json" ] || echo '{}' > "$LORA_ROOT/.aliases.json"
 
 export PATH="$(pwd)/.venv/bin:$PATH"  # puts 'vllm' on PATH
-export MODEL=Qwen/Qwen3-8B HOST=0.0.0.0 PORT=8000 API_KEY=sk-local
-export SERVED_MODEL_NAMES=qwen3-8b MAX_MODEL_LEN=32768 GPU_MEMORY_UTILIZATION=0.70
+export MODEL=Qwen/Qwen3.5-9B HOST=0.0.0.0 PORT=8000 API_KEY=sk-local
+export SERVED_MODEL_NAMES=qwen3.5-9b MAX_MODEL_LEN=32768 GPU_MEMORY_UTILIZATION=0.70
 export ENABLE_SLEEP_MODE=1 VLLM_SERVER_DEV_MODE=1 VLLM_ALLOW_RUNTIME_LORA_UPDATING=1
-export ENABLE_AUTO_TOOL_CHOICE=1 TOOL_CALL_PARSER=qwen3_xml
+export ENABLE_AUTO_TOOL_CHOICE=1 TOOL_CALL_PARSER=qwen3_coder
 export LORA_ROOT="$LORA_ROOT" LORA_ALIAS_FILE="$LORA_ROOT/.aliases.json" INCLUDE_ALIAS_LORAS=1
 # Enable LoRA even with no initial adapters — needed for runtime LoRA loading
-export EXTRA_ARGS='--enable-lora --max-lora-rank 32'
+# --enforce-eager required: CUDA graph capture has a bug in Qwen3.5's GDN causal conv1d layer
+export EXTRA_ARGS='--enable-lora --max-lora-rank 32 --enforce-eager'
 
-bash docker/scripts/start_vllm_qwen3_8b.sh >> /tmp/vllm.log 2>&1 &
+bash docker/scripts/start_vllm.sh >> /tmp/vllm.log 2>&1 &
 
-# First run downloads Qwen3-8B (~16 GB) — expect 5-20 min
+# First run downloads Qwen3.5-9B — expect 5-20 min
 until curl -sf http://localhost:8000/health; do sleep 5; done && echo "vLLM ready"
 ```
2 changes: 1 addition & 1 deletion .claude/skills/setup-modal/SKILL.md

@@ -26,7 +26,7 @@ When this skill is invoked, perform the following steps:
 ## Deployed Services
 
 The deployment includes:
-- **DistillWorker**: L40S GPU worker with Qwen3-8B student model (self-distillation)
+- **DistillWorker**: L40S GPU worker with Qwen3.5-9B student model (self-distillation)
 - **FastAPI endpoint**: REST API for feedback and distillation requests
 
 ## Health Check
6 changes: 3 additions & 3 deletions README.md

@@ -42,7 +42,7 @@ cp .env.local.example .env
 docker compose --profile local up --build
 ```
 
-This brings up vLLM with Qwen3-8B, the CLaaS feedback API, and OpenClaw's Telegram gateway. See [`docker/README.md`](docker/README.md) for details.
+This brings up vLLM with Qwen3.5-9B, the CLaaS feedback API, and OpenClaw's Telegram gateway. See [`docker/README.md`](docker/README.md) for details.
 
 **Manual install:**
 
@@ -90,7 +90,7 @@ Deploy:
 ```bash
 # Set HF_TOKEN if using gated models
 export HF_TOKEN=...
-export CLAAS_BASE_MODEL_ID=Qwen/Qwen3-8B
+export CLAAS_BASE_MODEL_ID=Qwen/Qwen3.5-9B
 uv run modal deploy -m claas.modal.deploy
 ```
 
@@ -104,7 +104,7 @@ For manual (non-Docker) local setup:
 
 ```bash
 # 1. Start vLLM with LoRA support
-vllm serve Qwen/Qwen3-8B --host 0.0.0.0 --port 8000 \
+vllm serve Qwen/Qwen3.5-9B --host 0.0.0.0 --port 8000 \
   --enable-lora --lora-modules my-lora=/loras/user/my-lora-init
 
 # 2. Start the CLaaS API
4 changes: 2 additions & 2 deletions claas/core/config.py

@@ -43,7 +43,7 @@ class LocalConfig(CLaaSConfig):
     feedback_min_free_vram_gb: float = 20.0
     feedback_sleep_verify_timeout_s: float = 30.0
     feedback_drain_timeout_s: float = 30.0
-    base_model_id: str = "Qwen/Qwen3-8B"
+    base_model_id: str = "Qwen/Qwen3.5-9B"
     attn_implementation: str = "flash_attention_2"
 
 
@@ -59,7 +59,7 @@ class ModalConfig(CLaaSConfig):
     feedback_min_free_vram_gb: float = 20.0
     feedback_sleep_verify_timeout_s: float = 30.0
     feedback_drain_timeout_s: float = 30.0
-    base_model_id: str = "Qwen/Qwen3-8B"
+    base_model_id: str = "Qwen/Qwen3.5-9B"
     hf_secret_name: str = ""
 
4 changes: 2 additions & 2 deletions claas/core/configs/local.yaml

@@ -7,13 +7,13 @@ feedback_log_dir: ./data/feedback
 lora_root: /loras
 storage_backend: local_fs
 allowed_init_base_models:
-  - Qwen/Qwen3-8B
+  - Qwen/Qwen3.5-9B
 vllm_base_url: http://127.0.0.1:8000
 feedback_lock_timeout_s: 120.0
 feedback_wake_on_failure: true
 feedback_min_free_vram_gb: 20.0
 feedback_sleep_verify_timeout_s: 30.0
 feedback_drain_timeout_s: 30.0
-base_model_id: Qwen/Qwen3-8B
+base_model_id: Qwen/Qwen3.5-9B
 attn_implementation: flash_attention_2
 completion_cache_size: 100
2 changes: 1 addition & 1 deletion claas/core/configs/modal.yaml

@@ -7,7 +7,7 @@ feedback_log_dir: ./data/feedback
 lora_root: /loras
 storage_backend: modal_volume
 allowed_init_base_models:
-  - Qwen/Qwen3-8B
+  - Qwen/Qwen3.5-9B
 vllm_base_url: http://127.0.0.1:8000
 feedback_lock_timeout_s: 120.0
 feedback_wake_on_failure: true
2 changes: 1 addition & 1 deletion claas/core/configs/tinker.yaml

@@ -7,7 +7,7 @@ feedback_log_dir: ./data/feedback
 lora_root: /loras
 storage_backend: local_fs
 allowed_init_base_models:
-  - Qwen/Qwen3-8B
+  - Qwen/Qwen3.5-9B
 vllm_base_url: http://127.0.0.1:8000
 tinker_base_model: Qwen/Qwen3-30B-A3B
 tinker_state_path: /data/tinker_state.json
2 changes: 1 addition & 1 deletion claas/core/types.py

@@ -264,7 +264,7 @@ class LoraInitRequest(BaseModel):
         description="LoRA identifier (e.g., 'user123/coder-v1')",
     )
     base_model: str = Field(
-        default="Qwen/Qwen3-8B",
+        default="Qwen/Qwen3.5-9B",
         description="Base model the LoRA will be applied to",
     )
     lora_r: int = Field(
2 changes: 1 addition & 1 deletion claas/eval/types.py

@@ -92,7 +92,7 @@ class EvalConfig:
     lora_id_prefix: str = "eval"
     seed: int = 42
     openclaw_url: Optional[str] = None
-    base_model: str = "Qwen/Qwen3-8B"
+    base_model: str = "Qwen/Qwen3.5-9B"
     batch_size: int = 4
     training: TrainingConfig = field(default_factory=TrainingConfig)
 
12 changes: 9 additions & 3 deletions claas/inference/helpers.py

@@ -127,11 +127,17 @@ def bounded_float(
 
 
 def coerce_template_ids(result: Any) -> list[int]:
-    """Normalize ``tokenizer.apply_chat_template`` output to a plain list[int]."""
+    """Normalize ``tokenizer.apply_chat_template`` output to a plain list[int].
+
+    Handles plain list[int], dict-like objects with ``input_ids`` key
+    (including ``BatchEncoding`` from transformers), and tensor-like
+    objects with a ``tolist()`` method.
+    """
     if isinstance(result, list):
         return [int(tok) for tok in result]
-    if isinstance(result, dict):
-        maybe_ids = result.get("input_ids")
+    # BatchEncoding (from transformers) is a Mapping but not a plain dict
+    if hasattr(result, "__getitem__") and "input_ids" in result:
+        maybe_ids = result["input_ids"]
         if isinstance(maybe_ids, list):
             return [int(tok) for tok in maybe_ids]
     if hasattr(result, "tolist"):
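Pulling the new branches out of the diff above, the whole helper reads roughly as below. This is a reconstruction from the visible hunk; the trailing TypeError fallback is an assumption, since the function's tail is hidden by the collapsed context.

```python
from typing import Any


def coerce_template_ids(result: Any) -> list[int]:
    """Normalize tokenizer.apply_chat_template output to a plain list[int]."""
    # Plain list of token ids.
    if isinstance(result, list):
        return [int(tok) for tok in result]
    # Dict-like objects, including transformers' BatchEncoding (a Mapping
    # but not a plain dict), expose token ids under "input_ids".
    if hasattr(result, "__getitem__") and "input_ids" in result:
        maybe_ids = result["input_ids"]
        if isinstance(maybe_ids, list):
            return [int(tok) for tok in maybe_ids]
        if hasattr(maybe_ids, "tolist"):
            return [int(tok) for tok in maybe_ids.tolist()]
    # Tensor-like objects (torch / numpy) flatten via tolist().
    if hasattr(result, "tolist"):
        return [int(tok) for tok in result.tolist()]
    raise TypeError(f"Unsupported apply_chat_template output: {type(result)!r}")
```

A BatchEncoding passed here takes the second branch even though `isinstance(result, dict)` is False, which is exactly the case the old `isinstance` check missed.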
2 changes: 1 addition & 1 deletion claas/modal/worker.py

@@ -67,7 +67,7 @@ class DistillWorker:
 
     # Defaults are captured at deploy/import time. Move env reads into runtime
    # initialization if container-level overrides are required.
-    base_model_id: str = os.environ.get("CLAAS_BASE_MODEL_ID", "Qwen/Qwen3-8B")
+    base_model_id: str = os.environ.get("CLAAS_BASE_MODEL_ID", "Qwen/Qwen3.5-9B")
     attn_implementation: str = os.environ.get("CLAAS_ATTN_IMPLEMENTATION", "sdpa")
 
     @modal.enter(snap=True)
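The caveat in the worker's comment, defaults captured at deploy/import time, is easy to reproduce with a toy class; `Worker` here is a stand-in, not the actual Modal `DistillWorker`:

```python
import os

os.environ.pop("CLAAS_BASE_MODEL_ID", None)


class Worker:
    # The right-hand side runs once, when the class body executes at import
    # time, so this value gets frozen into the deployed image.
    base_model_id: str = os.environ.get("CLAAS_BASE_MODEL_ID", "Qwen/Qwen3.5-9B")


# Setting the variable afterwards does not change the captured default.
os.environ["CLAAS_BASE_MODEL_ID"] = "some-other/model"
print(Worker.base_model_id)  # Qwen/Qwen3.5-9B


# A runtime read is what picks up container-level overrides.
def runtime_base_model_id() -> str:
    return os.environ.get("CLAAS_BASE_MODEL_ID", "Qwen/Qwen3.5-9B")


print(runtime_base_model_id())  # some-other/model
```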
34 changes: 26 additions & 8 deletions claas/training/storage.py

@@ -454,15 +454,28 @@ def create_initial_lora(
 
     # Resolve layer dimensions from the base model config (no weights downloaded).
     model_config = AutoConfig.from_pretrained(base_model_name, trust_remote_code=True)
-    hidden_size = model_config.hidden_size
-    intermediate_size = getattr(model_config, "intermediate_size", hidden_size * 4)
-    num_heads = model_config.num_attention_heads
-    head_dim = hidden_size // num_heads
-    num_kv_heads = getattr(model_config, "num_key_value_heads", num_heads)
-    num_layers = model_config.num_hidden_layers
+    # Multimodal models (e.g. Qwen3.5) nest text dimensions under text_config.
+    text_config = getattr(model_config, "text_config", model_config)
+    hidden_size = text_config.hidden_size
+    intermediate_size = getattr(text_config, "intermediate_size", hidden_size * 4)
+    num_heads = text_config.num_attention_heads
+    head_dim = getattr(text_config, "head_dim", hidden_size // num_heads)
+    num_kv_heads = getattr(text_config, "num_key_value_heads", num_heads)
+    num_layers = text_config.num_hidden_layers
 
+    # Hybrid models (e.g. Qwen3.5) mix layer types: full_attention layers have
+    # standard q/k/v/o_proj, while linear_attention (GDN) layers use different
+    # projections (in_proj_qkv, out_proj). Detect via layer_types config.
+    layer_types: list[str] | None = getattr(text_config, "layer_types", None)
+    # Full attention q_proj may be doubled for output gating (attn_output_gate).
+    attn_output_gate = getattr(text_config, "attn_output_gate", False)
+    q_proj_multiplier = 2 if attn_output_gate else 1
+
+    attn_modules = {"q_proj", "k_proj", "v_proj", "o_proj"}
+    mlp_modules = {"gate_proj", "up_proj", "down_proj"}
+
     dim_map = {
-        "q_proj": (num_heads * head_dim, hidden_size),
+        "q_proj": (num_heads * head_dim * q_proj_multiplier, hidden_size),
         "k_proj": (num_kv_heads * head_dim, hidden_size),
         "v_proj": (num_kv_heads * head_dim, hidden_size),
         "o_proj": (hidden_size, num_heads * head_dim),
@@ -488,10 +501,15 @@ def create_initial_lora(
     # while allowing gradients to propagate through A.
     tensors: dict[str, torch.Tensor] = {}
     for layer_idx in range(num_layers):
+        layer_type = layer_types[layer_idx] if layer_types else "full_attention"

Review comment on the added line above (⚠️ Potential issue | 🟡 Minor):

Potential IndexError if `layer_types` length doesn't match `num_layers`. If a model's config has a `layer_types` list with a different length than `num_hidden_layers`, this line will raise an IndexError. Consider adding a length validation or a bounds-checked lookup with a fallback.

🛡️ Proposed defensive check:

     tensors: dict[str, torch.Tensor] = {}
+    if layer_types and len(layer_types) != num_layers:
+        raise ValueError(
+            f"layer_types length ({len(layer_types)}) != num_hidden_layers ({num_layers})"
+        )
     for layer_idx in range(num_layers):
         layer_type = layer_types[layer_idx] if layer_types else "full_attention"
         for mod_name in target_modules:
+            # Skip attention modules for non-full-attention layers (e.g. GDN
+            # layers use in_proj_qkv/out_proj instead of q/k/v/o_proj).
+            if mod_name in attn_modules and layer_type != "full_attention":
+                continue
             out_dim, in_dim = dim_map[mod_name]
             prefix = f"base_model.model.model.layers.{layer_idx}.self_attn.{mod_name}"
-            if mod_name in ("gate_proj", "up_proj", "down_proj"):
+            if mod_name in mlp_modules:
                 prefix = f"base_model.model.model.layers.{layer_idx}.mlp.{mod_name}"
             lora_a = torch.empty(lora_r, in_dim)
             torch.nn.init.kaiming_uniform_(lora_a, a=5**0.5)
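To make the shape bookkeeping above concrete, here is a toy run of the same `dim_map` logic with made-up dimensions (not the real Qwen3.5-9B config), including the bounds-checked `layer_type` lookup the review comment asks for:

```python
# Toy dimensions, illustrative only: 2 layers, one GDN, one full attention.
hidden_size, num_heads, head_dim, num_kv_heads = 64, 8, 8, 2
lora_r = 4
q_proj_multiplier = 2  # attn_output_gate doubles q_proj's output dim
layer_types = ["linear_attention", "full_attention"]

dim_map = {
    "q_proj": (num_heads * head_dim * q_proj_multiplier, hidden_size),
    "k_proj": (num_kv_heads * head_dim, hidden_size),
    "v_proj": (num_kv_heads * head_dim, hidden_size),
    "o_proj": (hidden_size, num_heads * head_dim),
}


def layer_type_at(idx: int) -> str:
    # Bounds-safe lookup with a fallback, as the review comment suggests.
    if layer_types and 0 <= idx < len(layer_types):
        return layer_types[idx]
    return "full_attention"


# Shapes only: lora_A is (r, in_dim), lora_B is (out_dim, r). In the real
# code B starts at zero so the freshly created adapter is a no-op.
shapes: dict[str, tuple[int, int]] = {}
for layer_idx in range(len(layer_types)):
    if layer_type_at(layer_idx) != "full_attention":
        continue  # GDN layers use in_proj_qkv/out_proj, nothing to adapt here
    for mod, (out_dim, in_dim) in dim_map.items():
        shapes[f"layers.{layer_idx}.{mod}.lora_A"] = (lora_r, in_dim)
        shapes[f"layers.{layer_idx}.{mod}.lora_B"] = (out_dim, lora_r)

print(shapes["layers.1.q_proj.lora_B"])  # (128, 4): gating doubled 8*8 to 128
```

Layer 0 produces no entries at all, which is the intended behavior for linear-attention layers.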
2 changes: 1 addition & 1 deletion docker/.env.local.example

@@ -7,7 +7,7 @@
 
 # REQUIRED
 TELEGRAM_BOT_TOKEN=your-telegram-bot-token-here
-MODEL=Qwen/Qwen3-8B
+MODEL=Qwen/Qwen3.5-9B
 LORA_NAME=openclaw/assistant
 MAX_MODEL_LEN=32768
 GPU_MEMORY_UTILIZATION=0.70
6 changes: 6 additions & 0 deletions docker/Dockerfile.claas-api

@@ -16,6 +16,12 @@ RUN mkdir -p claas && touch claas/__init__.py
 
 RUN pip install --no-cache-dir ".[local]" modal
 
+# Qwen3.5 GDN (Gated Delta Networks) layers require these CUDA kernels.
+# Without them, transformers falls back to a buggy torch implementation.
+# causal-conv1d must be built from source to match the container's torch ABI.
+RUN pip install --no-cache-dir flash-linear-attention \
+    && pip install --no-cache-dir --no-build-isolation causal-conv1d
+
 # Now copy the full source and reinstall the package (no-deps: deps cached above)
 COPY . /app
 RUN pip install --no-cache-dir --no-deps --force-reinstall "."
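To confirm that the two kernel packages from this layer are actually importable inside the built image, a small probe can run at container start. The pip-name-to-module mapping here (flash-linear-attention importing as `fla`, causal-conv1d as `causal_conv1d`) is my assumption; adjust if the packages rename their modules:

```python
import importlib.util


def kernels_available() -> dict[str, bool]:
    # A missing module means transformers silently falls back to the slower
    # (and, per the Dockerfile comment, buggy) pure-torch GDN path.
    return {
        pip_name: importlib.util.find_spec(module) is not None
        for pip_name, module in {
            "flash-linear-attention": "fla",
            "causal-conv1d": "causal_conv1d",
        }.items()
    }


print(kernels_available())
```

Running this in the image's entrypoint (or a health check) surfaces a broken kernel install early instead of as a slow, wrong inference path later.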
9 changes: 6 additions & 3 deletions docker/Dockerfile.init

@@ -12,12 +12,15 @@ WORKDIR /app
 # Copy only dependency files first for layer caching
 COPY pyproject.toml uv.lock ./
 
-# Install core dependencies only (no extras)
-RUN uv sync --frozen --no-install-project --no-dev
+# Install core + local deps (torch, transformers, safetensors for LoRA init)
+RUN uv sync --frozen --no-install-project --no-dev --extra local
 
+# Override GPU torch with CPU-only to keep image small
+RUN uv pip install torch --index-url https://download.pytorch.org/whl/cpu --reinstall
+
 # Copy source and install just the project (deps already cached)
 COPY . .
-RUN uv sync --frozen --no-dev
+RUN uv sync --frozen --no-dev --extra local
 
 ENV PATH="/app/.venv/bin:$PATH"
 
10 changes: 5 additions & 5 deletions docker/README.md

@@ -24,7 +24,7 @@ cp .env.local.example .env
 docker compose --profile local up --build
 ```
 
-The first run downloads Qwen3-8B (~16 GB) — expect the vLLM health check to take 10-20 minutes. Subsequent runs use the cached model.
+The first run downloads Qwen3.5-9B (~16 GB) — expect the vLLM health check to take 10-20 minutes. Subsequent runs use the cached model.
 
 ## Tinker Profile
 
@@ -49,7 +49,7 @@ This stack brings up:
 
 | Service | Port | Description |
 |---------|------|-------------|
-| `vllm` | 8000 | Qwen3-8B with LoRA serving and sleep/wake support |
+| `vllm` | 8000 | Qwen3.5-9B with LoRA serving and sleep/wake support |
 | `claas-api` | 8080 | CLaaS feedback API and distill worker |
 | `openclaw-local` | 18789 | OpenClaw gateway with Telegram bot |
 | `init-local` | — | One-shot: creates LoRA adapter + writes OpenClaw config |
@@ -75,7 +75,7 @@ This stack brings up:
 ▼ ▼ ▼
 ┌────────────────┐ ┌─────────────────┐ ┌────────────────┐
 │ vllm (:8000) │◄───────│ claas-api(:8080) │ │ openclaw(:18789)│
-│ Qwen3-8B + │ sleep │ Feedback API │ │ Telegram bot │
+│ Qwen3.5-9B + │ sleep │ Feedback API │ │ Telegram bot │
 │ LoRA serving │ /wake │ Distill worker │ │ Uses LoRA │
 └────────────────┘ └─────────────────┘ └────────────────┘
 │ │ │
@@ -133,7 +133,7 @@ Settings live in `.env` (local profile) and `.env.tinker` (tinker profile).
 | `TELEGRAM_BOT_TOKEN` | *(required)* | Bot token from @BotFather |
 | `TINKER_API_KEY` | *(tinker only)* | API key for Tinker SDK |
 | `HF_TOKEN` | — | HuggingFace token for gated models (local only) |
-| `MODEL` | `Qwen/Qwen3-8B` (local) / *(required, tinker)* | Base model ID |
+| `MODEL` | `Qwen/Qwen3.5-9B` (local) / *(required, tinker)* | Base model ID (local uses Qwen3.5-9B, tinker uses Qwen3-30B-A3B) |
 | `MAX_MODEL_LEN` | `32768` | Max sequence length (local only) |
 | `GPU_MEMORY_UTILIZATION` | `0.70` | GPU VRAM fraction (local only) |
 | `LORA_NAME` | `openclaw/assistant` | LoRA adapter identity |
@@ -163,7 +163,7 @@ Only secrets should be passed via environment variables:
 
 ## Troubleshooting
 
-**vLLM takes forever to start**: The first run downloads Qwen3-8B. Check progress with `docker compose --profile local logs -f vllm`.
+**vLLM takes forever to start**: The first run downloads Qwen3.5-9B. Check progress with `docker compose --profile local logs -f vllm`.
 
 **Out of GPU memory**: Lower `GPU_MEMORY_UTILIZATION` in `.env` (e.g., `0.60`). The sleep/wake mechanism ensures vLLM and CLaaS don't use GPU simultaneously.
 
11 changes: 6 additions & 5 deletions docker/docker-compose.yml

@@ -3,26 +3,27 @@ services:
 
   vllm:
     profiles: [local]
-    image: vllm/vllm-openai:v0.15.1
-    entrypoint: ["/scripts/start_vllm_qwen3_8b.sh"]
+    image: vllm/vllm-openai:qwen3_5
+    entrypoint: ["/scripts/start_vllm.sh"]
     environment:
       - MODEL=${MODEL}
       - HOST=0.0.0.0
       - API_KEY=sk-local
-      - SERVED_MODEL_NAMES=qwen3-8b
+      - SERVED_MODEL_NAMES=qwen3.5-9b
       - MAX_MODEL_LEN=${MAX_MODEL_LEN}
       - GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION}
       - ENABLE_SLEEP_MODE=1
       - VLLM_SERVER_DEV_MODE=1
       - VLLM_ALLOW_RUNTIME_LORA_UPDATING=1
       - ENABLE_AUTO_TOOL_CHOICE=1
-      - TOOL_CALL_PARSER=qwen3_xml
+      - TOOL_CALL_PARSER=qwen3_coder
       - LORA_ROOT=/loras
       - LORA_ALIAS_FILE=/loras/.aliases.json
       - INCLUDE_ALIAS_LORAS=1
       - HF_TOKEN=${HF_TOKEN:-}
+      - EXTRA_ARGS=--enforce-eager
     volumes:
-      - ./scripts/start_vllm_qwen3_8b.sh:/scripts/start_vllm_qwen3_8b.sh:ro
+      - ./scripts/start_vllm.sh:/scripts/start_vllm.sh:ro
       - hf-cache:/root/.cache/huggingface
       - lora-storage:/loras
     ports:
8 changes: 4 additions & 4 deletions docker/scripts/init-stack.py

@@ -24,7 +24,7 @@
 # ---------------------------------------------------------------------------
 LORA_NAME = os.environ.get("LORA_NAME", "openclaw/assistant")
 DISTILL_MODE = "local"
-BASE_MODEL = "Qwen/Qwen3-8B"
+BASE_MODEL = "Qwen/Qwen3.5-9B"
 LORA_ROOT = "/loras"
 OPENCLAW_HOME = Path(os.environ.get("OPENCLAW_HOME", "/openclaw-config"))
 VLLM_BASE_URL = "http://vllm:8000/v1"
@@ -46,7 +46,7 @@ def _parse_feedback_batch_size() -> int:
 def _default_base_model(config_name: str) -> str:
     if config_name == "tinker":
         return "Qwen/Qwen3-30B-A3B"
-    return "Qwen/Qwen3-8B"
+    return "Qwen/Qwen3.5-9B"
 
 
 def _default_vllm_base_url(config_name: str) -> str:
@@ -184,7 +184,7 @@ def _model_entry(model_id: str) -> dict:
 
 
 def _normalize_lora_alias(name: str) -> str:
-    """Slash-separated LoRA alias → vLLM-safe name (matches start_vllm_qwen3_8b.sh)."""
+    """Slash-separated LoRA alias → vLLM-safe name (matches start_vllm.sh)."""
     import re
 
     normalized = re.sub(r"[^a-zA-Z0-9._-]+", "-", name).strip("-")
@@ -195,7 +195,7 @@ def write_openclaw_config() -> None:
     lora_alias = f"{LORA_NAME}-latest"
     vllm_model_name = _normalize_lora_alias(lora_alias)
 
-    model_ids = ["qwen3-8b", vllm_model_name]
+    model_ids = ["qwen3.5-9b", vllm_model_name]
     primary_model = vllm_model_name
 
     now = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
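The normalization that `_normalize_lora_alias` performs can be exercised on its own. The regex matches the diff; the empty-string fallback is a hypothetical guard, since the function's tail is hidden by the collapsed context:

```python
import re


def normalize_lora_alias(name: str) -> str:
    # Collapse any run of characters outside [a-zA-Z0-9._-] into a single
    # hyphen, then trim leading/trailing hyphens, matching the regex in
    # init-stack.py. The "lora" fallback for empty results is assumed.
    normalized = re.sub(r"[^a-zA-Z0-9._-]+", "-", name).strip("-")
    return normalized or "lora"


print(normalize_lora_alias("openclaw/assistant-latest"))  # openclaw-assistant-latest
```

This is how the slash-separated `LORA_NAME` from `.env` becomes the vLLM-safe model id that `write_openclaw_config` registers.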