From 390771bc1d6dd0ed23dc0bdfce57528d57b11b0b Mon Sep 17 00:00:00 2001 From: Pushkinist <4850452+Pushkinist@users.noreply.github.com> Date: Fri, 19 Jun 2026 12:38:18 +0700 Subject: [PATCH] chore(release): 0.2.4 --- CHANGELOG.md | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++- Cargo.lock | 24 +++++++++--------- Cargo.toml | 2 +- 3 files changed, 82 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3911ea..6dfd611 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,73 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.2.4] - 2026-06-19 + +Vision, KV, and embedding-lookup bug-fix batch for Qwen3-VL and Gemma 4, plus a +`/metrics/cache` recording/docs fix and a Homebrew bottle build+publish flow. +Highlights: Qwen3-VL large images now work end to end (KV sized from `--max-ctx`; +the O(seq²) embedding lookup that tripped the Metal GPU watchdog is gone), and +Gemma 4 image grounding is fixed by placing image tokens inside the user turn. No +breaking changes. + +### Added + +- **Homebrew bottle build+publish flow.** `scripts/release/build_bottle.sh` + + `make bottle` drive `brew bottle` against an installed keg, rename the local + bottle to the GitHub-Release asset name, and emit the ready-to-paste + `bottle do` block; documented as a release-time step in `docs/RELEASING.md`. + The committed formula stays source-build until a real bottle is uploaded, so + existing tap installs are unaffected. (#143, #139) + +### Fixed + +- **`/metrics/cache` TTFT empty for non-streaming completions.** Both + non-streaming paths (`generate_blocking`, OpenAI + Anthropic) measured TTFT + but never pushed it into the in-memory `ttft_store` ring — only the streaming + path did, so `ttft` stayed `[]` for non-streaming traffic. The ring is now + written on both paths. `docs/SERVER.md` is realigned to the endpoint's actual + shape (`models[]`, `itl`, `tokens_in/out`), dropping the never-emitted + `prompt_cache` / `last_itl` keys. (#142, #141) +- **Gemma 4 image grounding (degenerate / image-independent output).** The + per-image token block was spliced after BOS but *before* the user-turn opener, + leaving the image outside the user message; the model then ignored it. Image + blocks are now spliced inside the (final) user turn via a shared + `splice_image_block`, matching the HF/mlx-vlm placeholder substitution. Fixes + the reported e4b QAT-fp4 degeneration (the soft tokens were correct all along) + and a latent flakiness that affected all Gemma 4 image requests; Qwen3-VL is + unified onto the same path. (#144, #140) +- **Qwen3-VL ignored `--max-ctx`; large images failed with a `slice_update` + broadcast.** The image and text generate paths built KV with the bare 4096 + default and never bracketed prefill, so any prompt over 4096 tokens (a large + image tiles to thousands of soft tokens) overran the fixed buffer. Both paths + now size the KV ring from the effective `--max-ctx` and chunk the prefill; + an over-cap prompt returns a clean `context_overflow` instead of the broadcast + panic. (#145, #138) +- **Qwen3-VL large images hit the Metal GPU watchdog.** The quantized embedding + lookup used an O(seq²) `eye(seq) @ w` identity-matmul on CPU (plus a GPU↔CPU + round-trip); embedding the whole augmented prompt for a large image produced a + single command buffer that overran the ~10 s watchdog. Replaced with on-device + `take + dequantize` (O(seq)); added query-tiled ViT attention as a faithful + defense for very large single images. (#147, #146) +- **Qwen3.6 (`qwen3_5_moe`) embedding lookup** carried the same O(seq²) + `eye(seq) @ w`-on-CPU trick (plus an `unsafe` block); ported to the same + on-device `take + dequantize`. Numerically faithful, removes a per-step CPU + round-trip. (#149, #148) + +### Performance + +- Qwen3-VL: large images (e.g. 2560×2560 → 6400 soft tokens) now complete + end-to-end instead of aborting the process at the Metal GPU watchdog. (#145, #147) + +### Tested + +- New CI-gated tests: image-token placement (in-turn, last-turn, multi-image, + after-BOS fallback), ViT attention tiling equals a single SDPA, and + `qwen3_5_moe` embed_lookup numeric equivalence across both dtype arms (the + prior coverage was `#[ignore]` + env-gated). Real-model proofs across Qwen3-VL + (KV + large-image), Gemma 4 e4b QAT-fp4 vision, and Qwen3.6 (decode-TPS + same-session A/B: no regression). + ## [0.2.3] - 2026-06-18 Multi-model registry hardening. Two `--registry` serving bugs fixed: the @@ -372,7 +439,8 @@ inference + conversion backend for Apple Silicon — no Python at runtime. - Speculative drafters validated against their verifiers: Qwen 3.6 MTP sidecar and the Gemma 4 assistant drafter. -[Unreleased]: https://github.com/Pushkinist/rMLX/compare/v0.2.3...HEAD +[Unreleased]: https://github.com/Pushkinist/rMLX/compare/v0.2.4...HEAD +[0.2.4]: https://github.com/Pushkinist/rMLX/releases/tag/v0.2.4 [0.2.3]: https://github.com/Pushkinist/rMLX/releases/tag/v0.2.3 [0.2.2]: https://github.com/Pushkinist/rMLX/releases/tag/v0.2.2 [0.2.1]: https://github.com/Pushkinist/rMLX/releases/tag/v0.2.1 diff --git a/Cargo.lock b/Cargo.lock index 6377e33..1025394 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1921,7 +1921,7 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "rmlx-audio" -version = "0.2.3" +version = "0.2.4" dependencies = [ "miniz_oxide", "rmlx-core", @@ -1939,7 +1939,7 @@ dependencies = [ [[package]] name = "rmlx-cli" -version = "0.2.3" +version = "0.2.4" dependencies = [ "anyhow", "chrono", @@ -1974,7 +1974,7 @@ dependencies = [ [[package]] name = "rmlx-core" -version = "0.2.3" +version = "0.2.4" dependencies = [ "chrono", "libc", @@ -1988,7 +1988,7 @@ dependencies = [ [[package]] name = "rmlx-kv-quant" -version = "0.2.3" +version = "0.2.4" dependencies = [ "rmlx-core", "rmlx-mlx", @@ -1998,7 +1998,7 @@ dependencies = [ [[package]] name = "rmlx-kv-ssd" -version = "0.2.3" +version = "0.2.4" dependencies = [ "rmlx-core", "rmlx-kv-quant", @@ -2014,7 +2014,7 @@ dependencies = [ [[package]] name = "rmlx-loader" -version = "0.2.3" +version = "0.2.4" dependencies = [ "memmap2", "rayon", @@ -2030,7 +2030,7 @@ dependencies = [ [[package]] name = "rmlx-metrics" -version = "0.2.3" +version = "0.2.4" dependencies = [ "csv", "regex-lite", @@ -2049,7 +2049,7 @@ dependencies = [ [[package]] name = "rmlx-mlx" -version = "0.2.3" +version = "0.2.4" dependencies = [ "bindgen", "rmlx-core", @@ -2061,7 +2061,7 @@ dependencies = [ [[package]] name = "rmlx-models" -version = "0.2.3" +version = "0.2.4" dependencies = [ "criterion", "image", @@ -2088,7 +2088,7 @@ dependencies = [ [[package]] name = "rmlx-quant" -version = "0.2.3" +version = "0.2.4" dependencies = [ "criterion", "rmlx-core", @@ -2099,7 +2099,7 @@ dependencies = [ [[package]] name = "rmlx-runtime" -version = "0.2.3" +version = "0.2.4" dependencies = [ "rmlx-core", "rmlx-mlx", @@ -2109,7 +2109,7 @@ dependencies = [ [[package]] name = "rmlx-server" -version = "0.2.3" +version = "0.2.4" dependencies = [ "anyhow", "axum", diff --git a/Cargo.toml b/Cargo.toml index 4339d82..4db2bfd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ members = [ ] [workspace.package] -version = "0.2.3" +version = "0.2.4" edition = "2021" rust-version = "1.95" license = "MIT OR Apache-2.0"