lai3d · lai3d · May 19, 2026 · May 19, 2026
diff --git a/docs/ai-triage.en.md b/docs/ai-triage.en.md
@@ -209,6 +209,17 @@ Unit tests in `sigma-api/src/routes/ai_triage.rs` cover the marker placement, sa
 
 The endpoint sits behind the API's standard `auth` middleware (JWT or `X-Api-Key`) **and requires `admin` or `operator` role**. `readonly` consumers (dashboards, monitoring) and per-VPS `agent` keys receive a `403 Forbidden` before any LLM call is made — they can't spend tokens. The global rate limit still applies on top, and provider-side quota remains the second line of defence.
 
+### Per-user rate limit
+
+A second limiter applies after the RBAC check: a fixed-window counter keyed on the authenticated user's UUID (stable across both JWT and API-key auth). The window opens on the user's first request and lasts `LLM_RATE_LIMIT_WINDOW` seconds. This is independent of the global per-IP limiter: ten operators behind one NAT don't share a token budget, and one operator hammering from many IPs can't bypass it.
+
+| Env var | Default | Meaning |
+|---------|---------|---------|
+| `LLM_RATE_LIMIT_REQUESTS` | `20` | Max triages per window, per user |
+| `LLM_RATE_LIMIT_WINDOW` | `3600` | Window length in seconds (default 1 h) |
+
+When exhausted, the endpoint returns `429 Too Many Requests` with a `Retry-After` header (seconds). Fails open if Redis is unreachable — availability over perfect accounting when our own infra hiccups.
+
 ### OpenAPI
 
 The full schema is published at `/swagger-ui` under the **AI Triage** tag (`/api-docs/openapi.json` for machine consumption).

diff --git a/docs/ai-triage.zh.md b/docs/ai-triage.zh.md
@@ -209,6 +209,17 @@ Content-Type: application/json
 
 端点位于 API 的标准 `auth` 中间件之后(JWT 或 `X-Api-Key`),**并要求 `admin` 或 `operator` 角色**。`readonly` 消费者(仪表盘、监控)和每个 VPS 的 `agent` key 在到达 LLM 调用之前就会收到 `403 Forbidden` —— 它们无法消费 token。全局速率限制仍然叠加生效,provider 侧的 quota 是第二道防线。
 
+### 每用户速率限制
+
+RBAC 检查之后还有一道固定窗口限制,key 用认证用户的 UUID(JWT 与 API key 两条路径下都稳定);窗口从该用户的首次请求开始计时,持续 `LLM_RATE_LIMIT_WINDOW` 秒。它与全局按 IP 限流彼此独立:NAT 后面的十个运维不会共享一份 token 预算,一个运维从多个 IP 频繁调用也无法绕过限制。
+
+| 环境变量 | 默认值 | 含义 |
+|---------|--------|------|
+| `LLM_RATE_LIMIT_REQUESTS` | `20` | 每用户、每窗口的最大诊断次数 |
+| `LLM_RATE_LIMIT_WINDOW` | `3600` | 窗口长度(秒,默认 1 小时) |
+
+超过限额后,端点返回 `429 Too Many Requests`,并带上 `Retry-After` 响应头(秒)。Redis 不可达时**放行**(fail open) —— 自家基础设施抖动时,可用性优先于完美的计数。
+
 ### OpenAPI
 
 完整 schema 发布在 `/swagger-ui`,标签为 **AI Triage**(机器消费走 `/api-docs/openapi.json`)。

diff --git a/sigma-api/src/config.rs b/sigma-api/src/config.rs
@@ -27,6 +27,12 @@ pub struct Config {
     /// `LLM_PROVIDER` is unset or `anthropic`, so existing deployments
     /// keep working without rotating env vars.
     pub llm_api_key: Option<String>,
+
+    /// Per-user rate limit on LLM-spending endpoints. Distinct from the
+    /// global per-IP rate limit because the cost shape is different —
+    /// tokens, not connection pressure.
+    pub llm_rate_limit_requests: u32,
+    pub llm_rate_limit_window: u64,
 }
 
 impl Config {
@@ -105,6 +111,14 @@ impl Config {
                         None
                     }
                 }),
+            llm_rate_limit_requests: std::env::var("LLM_RATE_LIMIT_REQUESTS")
+                .ok()
+                .and_then(|p| p.parse().ok())
+                .unwrap_or(20),
+            llm_rate_limit_window: std::env::var("LLM_RATE_LIMIT_WINDOW")
+                .ok()
+                .and_then(|p| p.parse().ok())
+                .unwrap_or(3600),
         }
     }
 }
diff --git a/sigma-api/src/errors.rs b/sigma-api/src/errors.rs
@@ -24,6 +24,14 @@ pub enum AppError {
     #[error("Forbidden: {0}")]
     Forbidden(String),
 
+    /// Handler-level rate limit (e.g. LLM token spend). The optional
+    /// `retry_after_secs` is surfaced as a `Retry-After` HTTP header.
+    #[error("Too many requests: {message}")]
+    TooManyRequests {
+        message: String,
+        retry_after_secs: Option<u64>,
+    },
+
     #[error("Database error: {0}")]
     Sqlx(#[from] sqlx::Error),
 
@@ -36,11 +44,29 @@ pub enum AppError {
 
 impl IntoResponse for AppError {
     fn into_response(self) -> Response {
+        // TooManyRequests carries an optional Retry-After value; handle it
+        // separately so we can attach the header before serializing.
+        if let AppError::TooManyRequests {
+            ref message,
+            retry_after_secs,
+        } = self
+        {
+            let body = Json(json!({ "error": message.clone() }));
+            let mut response = (StatusCode::TOO_MANY_REQUESTS, body).into_response();
+            if let Some(secs) = retry_after_secs {
+                if let Ok(v) = axum::http::HeaderValue::from_str(&secs.to_string()) {
+                    response.headers_mut().insert("Retry-After", v);
+                }
+            }
+            return response;
+        }
+
         let (status, msg) = match &self {
             AppError::NotFound => (StatusCode::NOT_FOUND, self.to_string()),
             AppError::BadRequest(_) => (StatusCode::BAD_REQUEST, self.to_string()),
             AppError::Unauthorized => (StatusCode::UNAUTHORIZED, self.to_string()),
             AppError::Forbidden(_) => (StatusCode::FORBIDDEN, self.to_string()),
+            AppError::TooManyRequests { .. } => unreachable!("handled above"),
             AppError::Sqlx(e) => {
                 tracing::error!("Database error: {:?}", e);
                 (StatusCode::INTERNAL_SERVER_ERROR, "Database error".into())
@@ -96,4 +122,31 @@ mod tests {
             StatusCode::INTERNAL_SERVER_ERROR
         );
     }
+
+    /// A `TooManyRequests` carrying `retry_after_secs` must render as HTTP 429
+    /// and expose that value, in seconds, through the `Retry-After` header.
+    #[test]
+    fn test_too_many_requests_is_429_with_retry_after() {
+        let err = AppError::TooManyRequests {
+            message: "slow down".into(),
+            retry_after_secs: Some(42),
+        };
+        let rendered = err.into_response();
+        assert_eq!(rendered.status(), StatusCode::TOO_MANY_REQUESTS);
+
+        let header = rendered.headers().get("Retry-After");
+        let seconds = header.and_then(|value| value.to_str().ok());
+        assert_eq!(seconds.unwrap(), "42");
+    }
+
+    #[test]
+    fn test_too_many_requests_without_retry_after_still_429() {
+        let err = AppError::TooManyRequests {
+            message: "slow down".into(),
+            retry_after_secs: None,
+        };
+        let rendered = err.into_response();
+        assert_eq!(rendered.status(), StatusCode::TOO_MANY_REQUESTS);
+        assert!(!rendered.headers().contains_key("Retry-After")); // no hint, no header
+    }
 }
diff --git a/sigma-api/src/main.rs b/sigma-api/src/main.rs
@@ -87,6 +87,8 @@ async fn main() -> anyhow::Result<()> {
         jwt_expiry_hours: cfg.jwt_expiry_hours,
         llm_provider,
         llm_api_key: cfg.llm_api_key.clone(),
+        llm_rate_limit_requests: cfg.llm_rate_limit_requests,
+        llm_rate_limit_window: cfg.llm_rate_limit_window,
     };
 
     // Capture before cfg is moved into notification worker

diff --git a/sigma-api/src/routes/ai_triage.rs b/sigma-api/src/routes/ai_triage.rs
@@ -22,11 +22,13 @@
 //!   human-in-the-loop is the design.
 
 use axum::{extract::State, routing::post, Extension, Json, Router};
+use redis::AsyncCommands;
 use serde::{Deserialize, Serialize};
 use serde_json::{json, Value};
 use std::time::Duration;
 use tracing::{info, warn};
 use utoipa::ToSchema;
+use uuid::Uuid;
 
 use crate::auth::{require_role, CurrentUser};
 use crate::errors::AppError;
@@ -192,6 +194,7 @@ pub struct TriageResponse {
     responses(
         (status = 200, description = "Triage suggestion (degrades gracefully when LLM unavailable)", body = TriageResponse),
         (status = 403, description = "Caller's role is not permitted to spend LLM tokens (requires admin or operator)"),
+        (status = 429, description = "Per-user LLM rate limit exceeded. Retry after the `Retry-After` header's value (seconds)."),
     )
 )]
 pub async fn triage(
@@ -205,6 +208,11 @@ pub async fn triage(
     // middleware still applies on top.
     require_role(&user, &["admin", "operator"])?;
 
+    // Per-user LLM rate limit. Distinct from the global per-IP limit: ten
+    // operators behind one NAT shouldn't share a token budget, and one
+    // operator hammering from many IPs shouldn't bypass it.
+    check_llm_rate_limit(&state, user.id).await?;
+
     let provider = state.llm_provider;
     let provider_str = provider.as_str();
 
@@ -275,6 +283,75 @@ pub async fn triage(
     Ok(Json(parse_llm_response(&llm_text, &model, provider_str)))
 }
 
+// ---------- Per-user rate limit ----------
+
+/// Fixed-window per-user limit on LLM-spending requests.
+///
+/// Requests are counted in Redis under `llm-rate:{user_id}`. `INCR` bumps the
+/// counter and the first hit of a window arms an `EXPIRE` of
+/// `llm_rate_limit_window` seconds, so the window starts at the user's first
+/// request rather than sliding.
+///
+/// Returns `AppError::TooManyRequests` once the count exceeds
+/// `llm_rate_limit_requests`; its `retry_after_secs` is the key's remaining
+/// TTL, or the full window when no positive TTL is available. Fails open if
+/// the `INCR` itself fails: availability beats perfect accounting when our
+/// own infra has a hiccup.
+async fn check_llm_rate_limit(state: &AppState, user_id: Uuid) -> Result<(), AppError> {
+    let key = format!("llm-rate:{}", user_id);
+    let limit = state.llm_rate_limit_requests;
+    let window = state.llm_rate_limit_window;
+    // Redis takes the expiry as signed seconds; saturate rather than let an
+    // `as` cast wrap an oversized window into a negative value.
+    let window_secs = i64::try_from(window).unwrap_or(i64::MAX);
+
+    let mut conn = state.redis.clone();
+    let count: u32 = match redis::cmd("INCR").arg(&key).query_async(&mut conn).await {
+        Ok(c) => c,
+        Err(e) => {
+            warn!("LLM rate-limit Redis INCR failed (failing open): {e}");
+            return Ok(());
+        }
+    };
+
+    // First hit of a fresh window: start the countdown.
+    if count == 1 {
+        if let Err(e) = conn.expire::<_, ()>(&key, window_secs).await {
+            warn!("LLM rate-limit Redis EXPIRE failed: {e}");
+        }
+    }
+
+    if count <= limit {
+        return Ok(());
+    }
+
+    let ttl: i64 = conn.ttl(&key).await.unwrap_or(window_secs);
+    // TTL reports -1 for a key that exists without an expiry, i.e. the EXPIRE
+    // above failed or never ran. Re-arm it here; otherwise the counter never
+    // resets and this user stays rate-limited forever.
+    if ttl == -1 {
+        if let Err(e) = conn.expire::<_, ()>(&key, window_secs).await {
+            warn!("LLM rate-limit Redis EXPIRE re-arm failed: {e}");
+        }
+    }
+    let retry_after_secs = Some(if ttl > 0 { ttl.unsigned_abs() } else { window });
+
+    info!(
+        user_id = %user_id,
+        count,
+        limit,
+        window,
+        "LLM rate limit exceeded"
+    );
+    Err(AppError::TooManyRequests {
+        message: format!(
+            "LLM rate limit exceeded ({} requests per {}s per user)",
+            limit, window
+        ),
+        retry_after_secs,
+    })
+}
+
 // ---------- Prompt construction ----------
 
 fn build_system_prompt() -> String {

diff --git a/sigma-api/src/routes/mod.rs b/sigma-api/src/routes/mod.rs
@@ -45,6 +45,8 @@ pub struct AppState {
     pub jwt_expiry_hours: u64,
     pub llm_provider: ai_triage::LlmProvider,
     pub llm_api_key: Option<String>,
+    pub llm_rate_limit_requests: u32,
+    pub llm_rate_limit_window: u64,
 }
 
 /// Auth middleware: try Bearer JWT → try X-Api-Key → allow if no API_KEY set → 401.

diff --git a/sigma-api/tests/ai_triage_test.rs b/sigma-api/tests/ai_triage_test.rs
@@ -108,3 +108,53 @@ async fn test_agent_cannot_triage() {
 
     common::cleanup(&pool).await;
 }
+
+#[tokio::test]
+async fn test_per_user_rate_limit_triggers_429() {
+    use http_body_util::BodyExt;
+    use tower::ServiceExt;
+
+    // A budget of 2 requests per 60s keeps the 429 path cheap to reach.
+    let (router, pool) = common::setup_with_llm_limit(2, 60).await;
+    let admin_token = common::login_admin(&router).await;
+    let token =
+        login_as(&router, &admin_token, "ratelimit@test.local", "operator").await;
+
+    // Requests 0 and 1 fit inside the budget.
+    for i in 0..2 {
+        let (status, _) = common::request_with_token(
+            &router,
+            "POST",
+            "/api/ai/triage",
+            &token,
+            Some(alert_body()),
+        )
+        .await;
+        assert_eq!(status, 200, "request {i} should succeed (within limit)");
+    }
+
+    // The next request is built by hand so its headers can be inspected; the
+    // same user must now get a 429 carrying a Retry-After value.
+    let payload = serde_json::to_string(&alert_body()).unwrap();
+    let req = axum::http::Request::builder()
+        .method("POST")
+        .uri("/api/ai/triage")
+        .header("authorization", format!("Bearer {token}"))
+        .header("content-type", "application/json")
+        .body(Body::from(payload))
+        .unwrap();
+    let response = router.clone().oneshot(req).await.unwrap();
+    assert_eq!(response.status().as_u16(), 429);
+    let header = response.headers().get("Retry-After");
+    let raw = header
+        .and_then(|v| v.to_str().ok())
+        .expect("Retry-After header should be set on 429")
+        .to_string();
+    let secs: u64 = raw.parse().expect("Retry-After should parse as u64");
+    let in_window = (1..=60).contains(&secs);
+    assert!(in_window, "Retry-After should be within the 60s window, got {secs}");
+    // Consume the body before the response is dropped.
+    let _ = response.into_body().collect().await;
+
+    common::cleanup(&pool).await;
+}
diff --git a/sigma-api/tests/common/mod.rs b/sigma-api/tests/common/mod.rs
@@ -12,6 +12,15 @@ const ADMIN_EMAIL: &str = "admin@test.local";
 const ADMIN_PASSWORD: &str = "testpass123";
 
 pub async fn setup() -> (Router, PgPool) {
+    setup_with_llm_limit(100, 60).await // plain setup: LLM limit of 100 requests per 60 s window
+}
+
+/// Same as `setup` but lets a test pin the per-user LLM rate-limit window.
+/// Useful for verifying the 429 path without making 100+ requests.
+pub async fn setup_with_llm_limit(
+    llm_requests: u32,
+    llm_window: u64,
+) -> (Router, PgPool) {
     let database_url =
         std::env::var("DATABASE_URL").expect("DATABASE_URL must be set for integration tests");
     let redis_url = std::env::var("REDIS_URL").unwrap_or_else(|_| "redis://localhost:6379".into());
@@ -65,6 +74,8 @@ pub async fn setup() -> (Router, PgPool) {
         jwt_expiry_hours: 24,
         llm_provider: routes::ai_triage::LlmProvider::default(),
         llm_api_key: None,
+        llm_rate_limit_requests: llm_requests,
+        llm_rate_limit_window: llm_window,
     };
 
     // Build router matching main.rs structure