Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
ea47938
read engine metrics from PROMETHEUS_MULTIPROC_DIR in grpc mode
Mar 25, 2026
65b14bb
lint and format
Mar 25, 2026
d8616a8
fix(tool_parser): fix function call parsing for models with native to…
ConnorLi96 Apr 7, 2026
f58f15b
fix(gateway): comprehensive func call and response quality fixes
ConnorLi96 Apr 8, 2026
bfab5ad
fix(trtllm): add <|im_end|> stop string for TRT-LLM gRPC requests
ConnorLi96 Apr 8, 2026
46b284f
fix(tokenizer): load merged EOS token IDs from config.json + generati…
ConnorLi96 Apr 8, 2026
98bd903
fix(metrics): guard against empty prometheus multiproc metrics output
ConnorLi96 Apr 9, 2026
de6058e
fix(tokenizer): reduce tiktoken partial UTF-8 decode log from warn to…
ConnorLi96 Apr 10, 2026
9cdaa5d
fix(gateway): graceful degradation for unsupported multimodal requests
ConnorLi96 Apr 10, 2026
bc34b81
Revert "fix(gateway): graceful degradation for unsupported multimodal…
ConnorLi96 Apr 10, 2026
20aa536
feat(multimodal): add Kimi-K2.5 vision model spec and image processor
ConnorLi96 Apr 10, 2026
2bc8274
feat(protocol): add thinking param to Chat API and support bare strin…
ConnorLi96 Apr 11, 2026
28218b4
feat(observability): add worker selection and backend response INFO logs
ConnorLi96 Apr 15, 2026
4be6380
feat(metrics): merge engine metrics into :29000/metrics endpoint
ConnorLi96 Apr 15, 2026
58fe788
fix: resolve all clippy warnings for clean --all-targets build
ConnorLi96 Apr 15, 2026
7f21625
fix(reasoning): run reasoning parser before JSON/tool post-processing…
ConnorLi96 Apr 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions bindings/python/src/smg/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import logging
import os
import random
import shutil
import signal
import tempfile
import socket
import subprocess
import sys
Expand Down Expand Up @@ -550,6 +552,7 @@ def __init__(self, backend: str, args: argparse.Namespace, backend_args: list[st
self.launcher: WorkerLauncher = BACKEND_LAUNCHERS[backend]()
self.workers: list[tuple[subprocess.Popen, int]] = []
self._shutting_down = False
self._prometheus_dir: str | None = None

# -- public API ---------------------------------------------------------

Expand All @@ -571,6 +574,15 @@ def run(self) -> None:
def _launch_workers(self) -> None:
ports = _find_available_ports(self.args.worker_base_port, self.args.data_parallel_size)
host = self.args.worker_host

if getattr(self.args, "connection_mode", "grpc") == "grpc":
self._prometheus_dir = tempfile.mkdtemp(prefix="smg_prometheus_")
os.environ["PROMETHEUS_MULTIPROC_DIR"] = self._prometheus_dir
logger.info(
"Set PROMETHEUS_MULTIPROC_DIR=%s for gRPC metrics collection",
self._prometheus_dir,
)

for dp_rank, port in enumerate(ports):
env = self.launcher.gpu_env(self.args, dp_rank)
proc = self.launcher.launch(self.args, self.backend_args, host, port, env)
Expand Down Expand Up @@ -641,6 +653,23 @@ def _cleanup_workers(self) -> None:
except (ProcessLookupError, OSError):
pass

self._cleanup_prometheus_dir()

def _cleanup_prometheus_dir(self) -> None:
    """Delete the temporary PROMETHEUS_MULTIPROC_DIR created for gRPC metrics.

    Safe to call multiple times: a no-op when no directory was created,
    and the stored path is cleared after the first attempt. Removal
    failures are logged, never raised, so shutdown keeps going.
    """
    prom_dir = self._prometheus_dir
    if prom_dir is None:
        return
    try:
        shutil.rmtree(prom_dir)
        logger.info("Cleaned up PROMETHEUS_MULTIPROC_DIR=%s", prom_dir)
    except OSError as e:
        # Best-effort cleanup: a leftover temp dir is not worth failing shutdown.
        logger.warning(
            "Failed to clean up PROMETHEUS_MULTIPROC_DIR=%s: %s", prom_dir, e
        )
    self._prometheus_dir = None


# ---------------------------------------------------------------------------
# Entry point
Expand Down
12 changes: 11 additions & 1 deletion crates/grpc_client/src/trtllm_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ impl TrtllmServiceClient {
clippy::unused_self,
reason = "method receiver kept for consistent public API across gRPC backends"
)]
#[expect(
clippy::too_many_arguments,
reason = "mirrors vLLM backend API; refactoring to a params struct is tracked separately"
)]
pub fn build_generate_request_from_chat(
&self,
request_id: String,
Expand All @@ -273,6 +277,7 @@ impl TrtllmServiceClient {
token_ids: Vec<u32>,
multimodal_input: Option<proto::MultimodalInput>,
tool_call_constraint: Option<(String, String)>, // (constraint_type, constraint_value)
eos_token_ids: &[u32],
) -> Result<proto::GenerateRequest, String> {
// Build sampling config
let sampling_config = Self::build_sampling_config_from_chat(body);
Expand All @@ -287,6 +292,11 @@ impl TrtllmServiceClient {

let max_tokens = body.max_completion_tokens.unwrap_or(2048);

// Pass merged EOS token IDs from config.json + generation_config.json.
// TRT-LLM's gRPC path does not reliably merge these internally,
// so we provide them explicitly via the standard stop_token_ids field.
let stop_token_ids: Vec<u32> = eos_token_ids.to_vec();
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Respect ignore_eos when injecting EOS stop token IDs

This unconditionally copies merged EOS IDs into stop_token_ids, but stop_token_ids are hard stop conditions while ignore_eos is meant to continue generation past EOS. As written, requests with ignore_eos=true will still terminate when an EOS token appears, which breaks the documented ignore_eos behavior for TRT-LLM chat/harmony requests.

Useful? React with 👍 / 👎.


let grpc_request = proto::GenerateRequest {
request_id,
tokenized: Some(proto::TokenizedInput {
Expand All @@ -299,7 +309,7 @@ impl TrtllmServiceClient {
max_tokens,
streaming: body.stream,
stop,
stop_token_ids: vec![],
stop_token_ids,
ignore_eos: body.ignore_eos,
bad: vec![],
bad_token_ids: vec![],
Expand Down
167 changes: 167 additions & 0 deletions crates/multimodal/src/registry/kimi_k25.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
use std::collections::HashMap;

use serde_json::{json, Value};

use crate::{
registry::{ModelMetadata, ModelProcessorSpec, ModelRegistryError, RegistryResult},
types::{FieldLayout, Modality, PromptReplacement, TokenId},
vision::image_processor::PreprocessedImages,
};

pub(super) struct KimiK25Spec;

impl KimiK25Spec {
    /// Read `media_placeholder_token_id` from the model's config.
    ///
    /// Returns `ModelRegistryError::MissingConfigField` when the config
    /// does not provide the field — Kimi-K2.5 checkpoints are expected
    /// to declare it.
    fn media_placeholder_token_id(metadata: &ModelMetadata) -> RegistryResult<TokenId> {
        metadata
            .config_u32(&["media_placeholder_token_id"])
            .ok_or_else(|| ModelRegistryError::MissingConfigField {
                field: "media_placeholder_token_id".to_string(),
            })
            .map(|id| id as TokenId)
    }
}

impl ModelProcessorSpec for KimiK25Spec {
    /// Registry name for this spec.
    fn name(&self) -> &'static str {
        "kimi_k25"
    }

    /// Match on the explicit `model_type` ("kimi_k25") from config.json,
    /// falling back to a case-insensitive model-id heuristic ("kimi" plus
    /// "k2.5"/"k25") for checkpoints whose config lacks the type.
    fn matches(&self, metadata: &ModelMetadata) -> bool {
        metadata
            .config_model_type()
            .is_some_and(|mt| mt == "kimi_k25")
            || {
                let id = metadata.model_id.to_ascii_lowercase();
                id.contains("kimi") && (id.contains("k2.5") || id.contains("k25"))
            }
    }

    /// Textual placeholder inserted into the prompt for each image.
    fn placeholder_token(&self, _metadata: &ModelMetadata) -> RegistryResult<String> {
        Ok("<|media_pad|>".to_string())
    }

    /// Token id backing the placeholder, taken from config.json.
    fn placeholder_token_id(&self, metadata: &ModelMetadata) -> RegistryResult<TokenId> {
        Self::media_placeholder_token_id(metadata)
    }

    /// Per-request modality caps.
    fn modality_limits(
        &self,
        _metadata: &ModelMetadata,
    ) -> RegistryResult<HashMap<Modality, usize>> {
        // NOTE(review): 10 images per request is a hard-coded cap here —
        // confirm against the model's serving limits.
        Ok(HashMap::from([(Modality::Image, 10)]))
    }

    /// No extra processor kwargs are needed for Kimi-K2.5.
    fn processor_kwargs(&self, _metadata: &ModelMetadata) -> RegistryResult<Value> {
        Ok(json!({}))
    }

    /// One single-token replacement per preprocessed image.
    fn prompt_replacements(
        &self,
        metadata: &ModelMetadata,
        preprocessed: &PreprocessedImages,
    ) -> RegistryResult<Vec<PromptReplacement>> {
        let pad_token_id = Self::media_placeholder_token_id(metadata)?;
        let placeholder_token = self.placeholder_token(metadata)?;
        // Keep 1 placeholder per image — TRT-LLM's KimiK25InputProcessor
        // handles expansion to N vision tokens server-side based on grid_thws.
        // SMG must NOT pre-expand or TRT-LLM will see N placeholders and
        // attempt to expand each one again.
        Ok(preprocessed
            .num_img_tokens
            .iter()
            .map(|_| {
                // Exactly one pad token per image (see note above).
                let tokens = vec![pad_token_id];
                PromptReplacement::sequence(Modality::Image, &placeholder_token, tokens)
            })
            .collect())
    }

    /// How each tensor field is laid out across the image batch.
    fn field_layouts(&self) -> HashMap<String, FieldLayout> {
        // pixel_values is patchified: [total_patches, C, patch_h, patch_w].
        // grid_thws is [num_images, 3] with (t, h, w) per image.
        HashMap::from([
            (
                "pixel_values".to_string(),
                FieldLayout::flat("patches_per_image"),
            ),
            ("grid_thws".to_string(), FieldLayout::Batched),
            ("patches_per_image".to_string(), FieldLayout::Batched),
        ])
    }

    /// grid_thws stays on CPU (consumed as metadata, not as model input).
    fn keep_on_cpu_keys(&self) -> Vec<String> {
        vec!["grid_thws".to_string()]
    }
}

#[cfg(test)]
mod tests {
    use serde_json::json;

    use crate::{
        registry::{test_helpers::*, ModelMetadata, ModelRegistry},
        types::ImageSize,
    };

    // The spec should be selected purely from config.json's model_type,
    // even when the model id gives no hint.
    #[test]
    fn kimi_k25_matches_by_model_type() {
        let tokenizer = TestTokenizer::new(&[("<|media_pad|>", 163605)]);
        let config = json!({
            "model_type": "kimi_k25",
            "media_placeholder_token_id": 163605,
        });
        let metadata = ModelMetadata {
            model_id: "some-custom-name",
            tokenizer: &tokenizer,
            config: &config,
        };
        let registry = ModelRegistry::new();
        let spec = registry.lookup(&metadata).expect("should match kimi_k25");
        assert_eq!(spec.name(), "kimi_k25");
    }

    // Fallback path: an unknown model_type but a "Kimi ... K2.5" model id
    // should still resolve to the kimi_k25 spec.
    #[test]
    fn kimi_k25_matches_by_model_id() {
        let tokenizer = TestTokenizer::new(&[("<|media_pad|>", 163605)]);
        let config = json!({
            "model_type": "unknown",
            "media_placeholder_token_id": 163605,
        });
        let metadata = ModelMetadata {
            model_id: "nvidia/Kimi-K2.5-NVFP4",
            tokenizer: &tokenizer,
            config: &config,
        };
        let registry = ModelRegistry::new();
        let spec = registry
            .lookup(&metadata)
            .expect("should match kimi by name");
        assert_eq!(spec.name(), "kimi_k25");
    }

    // Each image must map to exactly ONE placeholder token (the configured
    // media_placeholder_token_id) — TRT-LLM expands it server-side.
    #[test]
    fn kimi_k25_prompt_replacements() {
        let tokenizer = TestTokenizer::new(&[("<|media_pad|>", 163605)]);
        let config = json!({
            "model_type": "kimi_k25",
            "media_placeholder_token_id": 163605,
        });
        let metadata = ModelMetadata {
            model_id: "nvidia/Kimi-K2.5-NVFP4",
            tokenizer: &tokenizer,
            config: &config,
        };
        let registry = ModelRegistry::new();
        let spec = registry.lookup(&metadata).expect("kimi spec");
        // 1 placeholder per image (TRT-LLM expands server-side)
        let replacements = spec
            .prompt_replacements(
                &metadata,
                &test_preprocessed_with_tokens(&[ImageSize::new(448, 448)], &[256]),
            )
            .unwrap();
        assert_eq!(replacements.len(), 1);
        assert_eq!(replacements[0].tokens.len(), 1);
        assert_eq!(replacements[0].tokens[0], 163605);
    }
}
4 changes: 4 additions & 0 deletions crates/multimodal/src/registry/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
mod kimi_k25;
mod llama4;
mod llava;
mod phi3_v;
mod qwen3_vl;
mod qwen_vl;
mod traits;

use kimi_k25::KimiK25Spec;
use llama4::Llama4Spec;
use llava::{LlavaNextSpec, LlavaSpec};
use once_cell::sync::Lazy;
Expand All @@ -22,6 +24,7 @@ impl ModelRegistry {
pub fn new() -> Self {
Self {
specs: vec![
LazySpec::new("kimi_k25", || Box::new(KimiK25Spec)),
LazySpec::new("llama4", || Box::new(Llama4Spec)),
// LlavaNext must be registered before Llava so "llava_next" model_type matches first.
LazySpec::new("llava_next", || Box::new(LlavaNextSpec)),
Expand Down Expand Up @@ -131,6 +134,7 @@ pub(super) mod test_helpers {
cls_token: None,
mask_token: None,
additional_special_tokens: vec![],
eos_token_ids: vec![],
});
&TOKENS
}
Expand Down
3 changes: 3 additions & 0 deletions crates/multimodal/src/vision/image_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,9 @@ impl ImageProcessorRegistry {
Box::new(super::processors::Llama4VisionProcessor::new()),
);

// Register Kimi-K2.5 (MoonViT3d, NaViT-style)
registry.register("kimi", Box::new(super::processors::KimiK25Processor::new()));

registry
}
}
Expand Down
67 changes: 67 additions & 0 deletions crates/multimodal/src/vision/processors/kimi_k25.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
//! Kimi-K2.5 image processor.
//!
//! Kimi-K2.5 uses MoonViT3d with a NaViT-style architecture very similar
//! to Qwen VL: patch_size=14, merge_kernel_size=(2,2), dynamic resolution.
//! We reuse the QwenVLProcessorBase with Kimi-specific defaults.

use image::DynamicImage;

use super::qwen_vl_base::{QwenVLConfig, QwenVLProcessorBase};
use crate::vision::{
image_processor::{ImagePreProcessor, PreprocessedImages},
preprocessor_config::PreProcessorConfig,
transforms::TransformError,
};

/// Image processor for Kimi-K2.5; delegates all work to the shared
/// Qwen-VL NaViT-style base configured with Kimi defaults (see `new`).
pub struct KimiK25Processor {
    // Configured Qwen-VL base that performs the actual preprocessing.
    inner: QwenVLProcessorBase,
}

impl Default for KimiK25Processor {
    /// Equivalent to [`KimiK25Processor::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl KimiK25Processor {
    /// Build a processor with Kimi-K2.5 defaults on top of the Qwen-VL base:
    /// 14px patches, 2x2 merge kernel, and a symmetric 0.5 mean/std
    /// normalization. Pixel bounds are expressed in units of one patch
    /// (14*14 pixels).
    pub fn new() -> Self {
        let config = QwenVLConfig {
            patch_size: 14,
            merge_size: 2,
            min_pixels: 14 * 14 * 4,
            max_pixels: 14 * 14 * 16384,
            temporal_patch_size: 1,
            mean: [0.5, 0.5, 0.5],
            std: [0.5, 0.5, 0.5],
            model_name: "kimi_k25",
        };
        Self {
            inner: QwenVLProcessorBase::new(config),
        }
    }
}

impl ImagePreProcessor for KimiK25Processor {
    /// Normalization mean when the preprocessor config does not override it.
    fn default_mean(&self) -> [f64; 3] {
        [0.5, 0.5, 0.5]
    }

    /// Normalization std when the preprocessor config does not override it.
    fn default_std(&self) -> [f64; 3] {
        [0.5, 0.5, 0.5]
    }

    /// Preprocess a batch of images by delegating to the Qwen-VL base
    /// configured in `new` (patch_size=14, merge_size=2 — see `new`).
    fn preprocess(
        &self,
        images: &[DynamicImage],
        config: &PreProcessorConfig,
    ) -> Result<PreprocessedImages, TransformError> {
        self.inner.preprocess(images, config)
    }

    /// Token count for a width x height image, delegated to the base.
    fn calculate_num_tokens(&self, width: u32, height: u32, config: &PreProcessorConfig) -> usize {
        self.inner.calculate_num_tokens(width, height, config)
    }

    /// Registry identifier; matches the registry spec's name.
    fn model_name(&self) -> &'static str {
        "kimi_k25"
    }
}
2 changes: 2 additions & 0 deletions crates/multimodal/src/vision/processors/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
//! - **LLaMA 4 Vision** (`llama4_vision`): Tile-based processing with 336x336 tiles and global tile
//! - **Pixtral/Mistral3** (`pixtral`): CLIP-based preprocessing with dynamic resolution

pub mod kimi_k25;
pub mod llama4_vision;
pub mod llava;
pub mod phi3_vision;
Expand All @@ -24,6 +25,7 @@ pub mod qwen2_vl;
pub mod qwen3_vl;
pub mod qwen_vl_base;

pub use kimi_k25::KimiK25Processor;
pub use llama4_vision::Llama4VisionProcessor;
pub use llava::{ImageAspectRatio, LlavaNextProcessor, LlavaProcessor};
pub use phi3_vision::Phi3VisionProcessor;
Expand Down
Loading