Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
ea47938
read engine metrics from PROMETHEUS_MULTIPROC_DIR in grpc mode
Mar 25, 2026
65b14bb
lint and format
Mar 25, 2026
d8616a8
fix(tool_parser): fix function call parsing for models with native to…
ConnorLi96 Apr 7, 2026
f58f15b
fix(gateway): comprehensive func call and response quality fixes
ConnorLi96 Apr 8, 2026
bfab5ad
fix(trtllm): add <|im_end|> stop string for TRT-LLM gRPC requests
ConnorLi96 Apr 8, 2026
46b284f
fix(tokenizer): load merged EOS token IDs from config.json + generati…
ConnorLi96 Apr 8, 2026
98bd903
fix(metrics): guard against empty prometheus multiproc metrics output
ConnorLi96 Apr 9, 2026
de6058e
fix(tokenizer): reduce tiktoken partial UTF-8 decode log from warn to…
ConnorLi96 Apr 10, 2026
9cdaa5d
fix(gateway): graceful degradation for unsupported multimodal requests
ConnorLi96 Apr 10, 2026
bc34b81
Revert "fix(gateway): graceful degradation for unsupported multimodal…
ConnorLi96 Apr 10, 2026
20aa536
feat(multimodal): add Kimi-K2.5 vision model spec and image processor
ConnorLi96 Apr 10, 2026
2bc8274
feat(protocol): add thinking param to Chat API and support bare strin…
ConnorLi96 Apr 11, 2026
28218b4
feat(observability): add worker selection and backend response INFO logs
ConnorLi96 Apr 15, 2026
4be6380
feat(metrics): merge engine metrics into :29000/metrics endpoint
ConnorLi96 Apr 15, 2026
58fe788
fix: resolve all clippy warnings for clean --all-targets build
ConnorLi96 Apr 15, 2026
7f21625
fix(reasoning): run reasoning parser before JSON/tool post-processing…
ConnorLi96 Apr 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions bindings/python/src/smg/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
import logging
import os
import random
import shutil
import signal
import tempfile
import socket
import subprocess
import sys
Expand Down Expand Up @@ -550,6 +552,7 @@ def __init__(self, backend: str, args: argparse.Namespace, backend_args: list[st
self.launcher: WorkerLauncher = BACKEND_LAUNCHERS[backend]()
self.workers: list[tuple[subprocess.Popen, int]] = []
self._shutting_down = False
self._prometheus_dir: str | None = None

# -- public API ---------------------------------------------------------

Expand All @@ -571,6 +574,15 @@ def run(self) -> None:
def _launch_workers(self) -> None:
ports = _find_available_ports(self.args.worker_base_port, self.args.data_parallel_size)
host = self.args.worker_host

if getattr(self.args, "connection_mode", "grpc") == "grpc":
self._prometheus_dir = tempfile.mkdtemp(prefix="smg_prometheus_")
os.environ["PROMETHEUS_MULTIPROC_DIR"] = self._prometheus_dir
logger.info(
"Set PROMETHEUS_MULTIPROC_DIR=%s for gRPC metrics collection",
self._prometheus_dir,
)

for dp_rank, port in enumerate(ports):
env = self.launcher.gpu_env(self.args, dp_rank)
proc = self.launcher.launch(self.args, self.backend_args, host, port, env)
Expand Down Expand Up @@ -641,6 +653,23 @@ def _cleanup_workers(self) -> None:
except (ProcessLookupError, OSError):
pass

self._cleanup_prometheus_dir()

def _cleanup_prometheus_dir(self) -> None:
    """Delete the temporary PROMETHEUS_MULTIPROC_DIR created for gRPC metrics.

    Safe to call multiple times: a no-op when no directory was created,
    and the stored path is cleared after the first attempt. Removal
    failures are logged, never raised, so shutdown keeps going.
    """
    prom_dir = self._prometheus_dir
    if prom_dir is None:
        return
    try:
        shutil.rmtree(prom_dir)
        logger.info("Cleaned up PROMETHEUS_MULTIPROC_DIR=%s", prom_dir)
    except OSError as e:
        # Best-effort cleanup: a leftover temp dir is not worth failing shutdown.
        logger.warning(
            "Failed to clean up PROMETHEUS_MULTIPROC_DIR=%s: %s", prom_dir, e
        )
    self._prometheus_dir = None


# ---------------------------------------------------------------------------
# Entry point
Expand Down
12 changes: 11 additions & 1 deletion crates/grpc_client/src/trtllm_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ impl TrtllmServiceClient {
clippy::unused_self,
reason = "method receiver kept for consistent public API across gRPC backends"
)]
#[expect(
clippy::too_many_arguments,
reason = "mirrors vLLM backend API; refactoring to a params struct is tracked separately"
)]
pub fn build_generate_request_from_chat(
&self,
request_id: String,
Expand All @@ -273,6 +277,7 @@ impl TrtllmServiceClient {
token_ids: Vec<u32>,
multimodal_input: Option<proto::MultimodalInput>,
tool_call_constraint: Option<(String, String)>, // (constraint_type, constraint_value)
eos_token_ids: &[u32],
) -> Result<proto::GenerateRequest, String> {
// Build sampling config
let sampling_config = Self::build_sampling_config_from_chat(body);
Expand All @@ -287,6 +292,11 @@ impl TrtllmServiceClient {

let max_tokens = body.max_completion_tokens.unwrap_or(2048);

// Pass merged EOS token IDs from config.json + generation_config.json.
// TRT-LLM's gRPC path does not reliably merge these internally,
// so we provide them explicitly via the standard stop_token_ids field.
let stop_token_ids: Vec<u32> = eos_token_ids.to_vec();
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Respect ignore_eos when injecting EOS stop token IDs

This unconditionally copies merged EOS IDs into stop_token_ids, but stop_token_ids are hard stop conditions while ignore_eos is meant to continue generation past EOS. As written, requests with ignore_eos=true will still terminate when an EOS token appears, which breaks the documented ignore_eos behavior for TRT-LLM chat/harmony requests.

Useful? React with 👍 / 👎.


let grpc_request = proto::GenerateRequest {
request_id,
tokenized: Some(proto::TokenizedInput {
Expand All @@ -299,7 +309,7 @@ impl TrtllmServiceClient {
max_tokens,
streaming: body.stream,
stop,
stop_token_ids: vec![],
stop_token_ids,
ignore_eos: body.ignore_eos,
bad: vec![],
bad_token_ids: vec![],
Expand Down
167 changes: 167 additions & 0 deletions crates/multimodal/src/registry/kimi_k25.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
use std::collections::HashMap;

use serde_json::{json, Value};

use crate::{
registry::{ModelMetadata, ModelProcessorSpec, ModelRegistryError, RegistryResult},
types::{FieldLayout, Modality, PromptReplacement, TokenId},
vision::image_processor::PreprocessedImages,
};

pub(super) struct KimiK25Spec;

impl KimiK25Spec {
    /// Read `media_placeholder_token_id` from the model's config.
    ///
    /// Returns `ModelRegistryError::MissingConfigField` when the config
    /// does not provide the field — Kimi-K2.5 checkpoints are expected
    /// to declare it.
    fn media_placeholder_token_id(metadata: &ModelMetadata) -> RegistryResult<TokenId> {
        metadata
            .config_u32(&["media_placeholder_token_id"])
            .ok_or_else(|| ModelRegistryError::MissingConfigField {
                field: "media_placeholder_token_id".to_string(),
            })
            .map(|id| id as TokenId)
    }
}

impl ModelProcessorSpec for KimiK25Spec {
    /// Registry name for this spec.
    fn name(&self) -> &'static str {
        "kimi_k25"
    }

    /// Match on the explicit `model_type` ("kimi_k25") from config.json,
    /// falling back to a case-insensitive model-id heuristic ("kimi" plus
    /// "k2.5"/"k25") for checkpoints whose config lacks the type.
    fn matches(&self, metadata: &ModelMetadata) -> bool {
        metadata
            .config_model_type()
            .is_some_and(|mt| mt == "kimi_k25")
            || {
                let id = metadata.model_id.to_ascii_lowercase();
                id.contains("kimi") && (id.contains("k2.5") || id.contains("k25"))
            }
    }

    /// Textual placeholder inserted into the prompt for each image.
    fn placeholder_token(&self, _metadata: &ModelMetadata) -> RegistryResult<String> {
        Ok("<|media_pad|>".to_string())
    }

    /// Token id backing the placeholder, taken from config.json.
    fn placeholder_token_id(&self, metadata: &ModelMetadata) -> RegistryResult<TokenId> {
        Self::media_placeholder_token_id(metadata)
    }

    /// Per-request modality caps.
    fn modality_limits(
        &self,
        _metadata: &ModelMetadata,
    ) -> RegistryResult<HashMap<Modality, usize>> {
        // NOTE(review): 10 images per request is a hard-coded cap here —
        // confirm against the model's serving limits.
        Ok(HashMap::from([(Modality::Image, 10)]))
    }

    /// No extra processor kwargs are needed for Kimi-K2.5.
    fn processor_kwargs(&self, _metadata: &ModelMetadata) -> RegistryResult<Value> {
        Ok(json!({}))
    }

    /// One single-token replacement per preprocessed image.
    fn prompt_replacements(
        &self,
        metadata: &ModelMetadata,
        preprocessed: &PreprocessedImages,
    ) -> RegistryResult<Vec<PromptReplacement>> {
        let pad_token_id = Self::media_placeholder_token_id(metadata)?;
        let placeholder_token = self.placeholder_token(metadata)?;
        // Keep 1 placeholder per image — TRT-LLM's KimiK25InputProcessor
        // handles expansion to N vision tokens server-side based on grid_thws.
        // SMG must NOT pre-expand or TRT-LLM will see N placeholders and
        // attempt to expand each one again.
        Ok(preprocessed
            .num_img_tokens
            .iter()
            .map(|_| {
                // Exactly one pad token per image (see note above).
                let tokens = vec![pad_token_id];
                PromptReplacement::sequence(Modality::Image, &placeholder_token, tokens)
            })
            .collect())
    }

    /// How each tensor field is laid out across the image batch.
    fn field_layouts(&self) -> HashMap<String, FieldLayout> {
        // pixel_values is patchified: [total_patches, C, patch_h, patch_w].
        // grid_thws is [num_images, 3] with (t, h, w) per image.
        HashMap::from([
            (
                "pixel_values".to_string(),
                FieldLayout::flat("patches_per_image"),
            ),
            ("grid_thws".to_string(), FieldLayout::Batched),
            ("patches_per_image".to_string(), FieldLayout::Batched),
        ])
    }

    /// grid_thws stays on CPU (consumed as metadata, not as model input).
    fn keep_on_cpu_keys(&self) -> Vec<String> {
        vec!["grid_thws".to_string()]
    }
}

#[cfg(test)]
mod tests {
    use serde_json::json;

    use crate::{
        registry::{test_helpers::*, ModelMetadata, ModelRegistry},
        types::ImageSize,
    };

    // The spec should be selected purely from config.json's model_type,
    // even when the model id gives no hint.
    #[test]
    fn kimi_k25_matches_by_model_type() {
        let tokenizer = TestTokenizer::new(&[("<|media_pad|>", 163605)]);
        let config = json!({
            "model_type": "kimi_k25",
            "media_placeholder_token_id": 163605,
        });
        let metadata = ModelMetadata {
            model_id: "some-custom-name",
            tokenizer: &tokenizer,
            config: &config,
        };
        let registry = ModelRegistry::new();
        let spec = registry.lookup(&metadata).expect("should match kimi_k25");
        assert_eq!(spec.name(), "kimi_k25");
    }

    // Fallback path: an unknown model_type but a "Kimi ... K2.5" model id
    // should still resolve to the kimi_k25 spec.
    #[test]
    fn kimi_k25_matches_by_model_id() {
        let tokenizer = TestTokenizer::new(&[("<|media_pad|>", 163605)]);
        let config = json!({
            "model_type": "unknown",
            "media_placeholder_token_id": 163605,
        });
        let metadata = ModelMetadata {
            model_id: "nvidia/Kimi-K2.5-NVFP4",
            tokenizer: &tokenizer,
            config: &config,
        };
        let registry = ModelRegistry::new();
        let spec = registry
            .lookup(&metadata)
            .expect("should match kimi by name");
        assert_eq!(spec.name(), "kimi_k25");
    }

    // Each image must map to exactly ONE placeholder token (the configured
    // media_placeholder_token_id) — TRT-LLM expands it server-side.
    #[test]
    fn kimi_k25_prompt_replacements() {
        let tokenizer = TestTokenizer::new(&[("<|media_pad|>", 163605)]);
        let config = json!({
            "model_type": "kimi_k25",
            "media_placeholder_token_id": 163605,
        });
        let metadata = ModelMetadata {
            model_id: "nvidia/Kimi-K2.5-NVFP4",
            tokenizer: &tokenizer,
            config: &config,
        };
        let registry = ModelRegistry::new();
        let spec = registry.lookup(&metadata).expect("kimi spec");
        // 1 placeholder per image (TRT-LLM expands server-side)
        let replacements = spec
            .prompt_replacements(
                &metadata,
                &test_preprocessed_with_tokens(&[ImageSize::new(448, 448)], &[256]),
            )
            .unwrap();
        assert_eq!(replacements.len(), 1);
        assert_eq!(replacements[0].tokens.len(), 1);
        assert_eq!(replacements[0].tokens[0], 163605);
    }
}
4 changes: 4 additions & 0 deletions crates/multimodal/src/registry/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
mod kimi_k25;
mod llama4;
mod llava;
mod phi3_v;
mod qwen3_vl;
mod qwen_vl;
mod traits;

use kimi_k25::KimiK25Spec;
use llama4::Llama4Spec;
use llava::{LlavaNextSpec, LlavaSpec};
use once_cell::sync::Lazy;
Expand All @@ -22,6 +24,7 @@ impl ModelRegistry {
pub fn new() -> Self {
Self {
specs: vec![
LazySpec::new("kimi_k25", || Box::new(KimiK25Spec)),
LazySpec::new("llama4", || Box::new(Llama4Spec)),
// LlavaNext must be registered before Llava so "llava_next" model_type matches first.
LazySpec::new("llava_next", || Box::new(LlavaNextSpec)),
Expand Down Expand Up @@ -131,6 +134,7 @@ pub(super) mod test_helpers {
cls_token: None,
mask_token: None,
additional_special_tokens: vec![],
eos_token_ids: vec![],
});
&TOKENS
}
Expand Down
3 changes: 3 additions & 0 deletions crates/multimodal/src/vision/image_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,9 @@ impl ImageProcessorRegistry {
Box::new(super::processors::Llama4VisionProcessor::new()),
);

// Register Kimi-K2.5 (MoonViT3d, NaViT-style)
registry.register("kimi", Box::new(super::processors::KimiK25Processor::new()));

registry
}
}
Expand Down
67 changes: 67 additions & 0 deletions crates/multimodal/src/vision/processors/kimi_k25.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
//! Kimi-K2.5 image processor.
//!
//! Kimi-K2.5 uses MoonViT3d with a NaViT-style architecture very similar
//! to Qwen VL: patch_size=14, merge_kernel_size=(2,2), dynamic resolution.
//! We reuse the QwenVLProcessorBase with Kimi-specific defaults.

use image::DynamicImage;

use super::qwen_vl_base::{QwenVLConfig, QwenVLProcessorBase};
use crate::vision::{
image_processor::{ImagePreProcessor, PreprocessedImages},
preprocessor_config::PreProcessorConfig,
transforms::TransformError,
};

/// Image processor for Kimi-K2.5; delegates all work to the shared
/// Qwen-VL NaViT-style base configured with Kimi defaults (see `new`).
pub struct KimiK25Processor {
    // Configured Qwen-VL base that performs the actual preprocessing.
    inner: QwenVLProcessorBase,
}

impl Default for KimiK25Processor {
    /// Equivalent to [`KimiK25Processor::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl KimiK25Processor {
    /// Build a processor with Kimi-K2.5 defaults on top of the Qwen-VL base:
    /// 14px patches, 2x2 merge kernel, and a symmetric 0.5 mean/std
    /// normalization. Pixel bounds are expressed in units of one patch
    /// (14*14 pixels).
    pub fn new() -> Self {
        let config = QwenVLConfig {
            patch_size: 14,
            merge_size: 2,
            min_pixels: 14 * 14 * 4,
            max_pixels: 14 * 14 * 16384,
            temporal_patch_size: 1,
            mean: [0.5, 0.5, 0.5],
            std: [0.5, 0.5, 0.5],
            model_name: "kimi_k25",
        };
        Self {
            inner: QwenVLProcessorBase::new(config),
        }
    }
}

impl ImagePreProcessor for KimiK25Processor {
    /// Normalization mean when the preprocessor config does not override it.
    fn default_mean(&self) -> [f64; 3] {
        [0.5, 0.5, 0.5]
    }

    /// Normalization std when the preprocessor config does not override it.
    fn default_std(&self) -> [f64; 3] {
        [0.5, 0.5, 0.5]
    }

    /// Preprocess a batch of images by delegating to the Qwen-VL base
    /// configured in `new` (patch_size=14, merge_size=2 — see `new`).
    fn preprocess(
        &self,
        images: &[DynamicImage],
        config: &PreProcessorConfig,
    ) -> Result<PreprocessedImages, TransformError> {
        self.inner.preprocess(images, config)
    }

    /// Token count for a width x height image, delegated to the base.
    fn calculate_num_tokens(&self, width: u32, height: u32, config: &PreProcessorConfig) -> usize {
        self.inner.calculate_num_tokens(width, height, config)
    }

    /// Registry identifier; matches the registry spec's name.
    fn model_name(&self) -> &'static str {
        "kimi_k25"
    }
}
2 changes: 2 additions & 0 deletions crates/multimodal/src/vision/processors/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
//! - **LLaMA 4 Vision** (`llama4_vision`): Tile-based processing with 336x336 tiles and global tile
//! - **Pixtral/Mistral3** (`pixtral`): CLIP-based preprocessing with dynamic resolution

pub mod kimi_k25;
pub mod llama4_vision;
pub mod llava;
pub mod phi3_vision;
Expand All @@ -24,6 +25,7 @@ pub mod qwen2_vl;
pub mod qwen3_vl;
pub mod qwen_vl_base;

pub use kimi_k25::KimiK25Processor;
pub use llama4_vision::Llama4VisionProcessor;
pub use llava::{ImageAspectRatio, LlavaNextProcessor, LlavaProcessor};
pub use phi3_vision::Phi3VisionProcessor;
Expand Down
Loading