diff --git a/docs/ai-triage.en.md b/docs/ai-triage.en.md index b1e555d..3e62db2 100644 --- a/docs/ai-triage.en.md +++ b/docs/ai-triage.en.md @@ -209,6 +209,17 @@ Unit tests in `sigma-api/src/routes/ai_triage.rs` cover the marker placement, sa The endpoint sits behind the API's standard `auth` middleware (JWT or `X-Api-Key`) **and requires `admin` or `operator` role**. `readonly` consumers (dashboards, monitoring) and per-VPS `agent` keys receive a `403 Forbidden` before any LLM call is made — they can't spend tokens. The global rate limit still applies on top, and provider-side quota remains the second line of defence. +### Per-user rate limit + +A second limit applies after the RBAC check: a fixed-window counter keyed on the authenticated user's UUID (stable across both JWT and API-key auth). The window opens on the user's first request and the count resets when it expires. This is independent of the global per-IP limiter: ten operators behind one NAT don't share a token budget, and one operator hammering from many IPs can't bypass it. + +| Env var | Default | Meaning | +|---------|---------|---------| +| `LLM_RATE_LIMIT_REQUESTS` | `20` | Max triages per window, per user | +| `LLM_RATE_LIMIT_WINDOW` | `3600` | Window length in seconds (default 1 h) | + +When the budget is exhausted, the endpoint returns `429 Too Many Requests` with a `Retry-After` header (seconds until the window resets). The limiter fails open if Redis is unreachable — availability over perfect accounting when our own infra hiccups. + ### OpenAPI The full schema is published at `/swagger-ui` under the **AI Triage** tag (`/api-docs/openapi.json` for machine consumption). 
diff --git a/docs/ai-triage.zh.md b/docs/ai-triage.zh.md index a3c46ae..b9e6fb5 100644 --- a/docs/ai-triage.zh.md +++ b/docs/ai-triage.zh.md @@ -209,6 +209,17 @@ Content-Type: application/json 端点位于 API 的标准 `auth` 中间件之后(JWT 或 `X-Api-Key`),**并要求 `admin` 或 `operator` 角色**。`readonly` 消费者(仪表盘、监控)和每个 VPS 的 `agent` key 在到达 LLM 调用之前就会收到 `403 Forbidden` —— 它们无法消费 token。全局速率限制仍然叠加生效,provider 侧的 quota 是第二道防线。 +### 每用户速率限制 + +RBAC 检查之后还有一道固定窗口限制(窗口从该用户的首次请求开始计时,到期后计数清零),key 用认证用户的 UUID(JWT 与 API key 两条路径下都稳定)。它与全局按 IP 限流彼此独立:NAT 后面的十个运维不会共享一份 token 预算,一个运维从多个 IP 频繁调用也无法绕过限制。 + +| 环境变量 | 默认值 | 含义 | +|---------|--------|------| +| `LLM_RATE_LIMIT_REQUESTS` | `20` | 每用户、每窗口的最大诊断次数 | +| `LLM_RATE_LIMIT_WINDOW` | `3600` | 窗口长度(秒,默认 1 小时) | + +超过限额后,端点返回 `429 Too Many Requests`,并带上 `Retry-After` 响应头(秒)。Redis 不可达时**放行**(fail open) —— 自家基础设施抖动时,可用性优先于完美的计数。 + ### OpenAPI 完整 schema 发布在 `/swagger-ui`,标签为 **AI Triage**(机器消费走 `/api-docs/openapi.json`)。 diff --git a/sigma-api/src/config.rs b/sigma-api/src/config.rs index b2960df..51edb40 100644 --- a/sigma-api/src/config.rs +++ b/sigma-api/src/config.rs @@ -27,6 +27,12 @@ pub struct Config { /// `LLM_PROVIDER` is unset or `anthropic`, so existing deployments /// keep working without rotating env vars. pub llm_api_key: Option, + + /// Per-user rate limit on LLM-spending endpoints. Distinct from the + /// global per-IP rate limit because the cost shape is different — + /// tokens, not connection pressure. 
+ pub llm_rate_limit_requests: u32, + pub llm_rate_limit_window: u64, } impl Config { @@ -105,6 +111,14 @@ impl Config { None } }), + llm_rate_limit_requests: std::env::var("LLM_RATE_LIMIT_REQUESTS") + .ok() + .and_then(|p| p.parse().ok()) + .unwrap_or(20), + llm_rate_limit_window: std::env::var("LLM_RATE_LIMIT_WINDOW") + .ok() + .and_then(|p| p.parse().ok()) + .unwrap_or(3600), } } } diff --git a/sigma-api/src/errors.rs b/sigma-api/src/errors.rs index 451a8ff..2944246 100644 --- a/sigma-api/src/errors.rs +++ b/sigma-api/src/errors.rs @@ -24,6 +24,14 @@ pub enum AppError { #[error("Forbidden: {0}")] Forbidden(String), + /// Handler-level rate limit (e.g. LLM token spend). The optional + /// `retry_after_secs` is surfaced as a `Retry-After` HTTP header. + #[error("Too many requests: {message}")] + TooManyRequests { + message: String, + retry_after_secs: Option, + }, + #[error("Database error: {0}")] Sqlx(#[from] sqlx::Error), @@ -36,11 +44,29 @@ pub enum AppError { impl IntoResponse for AppError { fn into_response(self) -> Response { + // TooManyRequests carries an optional Retry-After value; handle it + // separately so we can attach the header before serializing. + if let AppError::TooManyRequests { + ref message, + retry_after_secs, + } = self + { + let body = Json(json!({ "error": message.clone() })); + let mut response = (StatusCode::TOO_MANY_REQUESTS, body).into_response(); + if let Some(secs) = retry_after_secs { + if let Ok(v) = axum::http::HeaderValue::from_str(&secs.to_string()) { + response.headers_mut().insert("Retry-After", v); + } + } + return response; + } + let (status, msg) = match &self { AppError::NotFound => (StatusCode::NOT_FOUND, self.to_string()), AppError::BadRequest(_) => (StatusCode::BAD_REQUEST, self.to_string()), AppError::Unauthorized => (StatusCode::UNAUTHORIZED, self.to_string()), AppError::Forbidden(_) => (StatusCode::FORBIDDEN, self.to_string()), + AppError::TooManyRequests { .. 
} => unreachable!("handled above"), AppError::Sqlx(e) => { tracing::error!("Database error: {:?}", e); (StatusCode::INTERNAL_SERVER_ERROR, "Database error".into()) @@ -96,4 +122,31 @@ mod tests { StatusCode::INTERNAL_SERVER_ERROR ); } + + #[test] + fn test_too_many_requests_is_429_with_retry_after() { + let response = AppError::TooManyRequests { + message: "slow down".into(), + retry_after_secs: Some(42), + } + .into_response(); + assert_eq!(response.status(), StatusCode::TOO_MANY_REQUESTS); + let retry = response + .headers() + .get("Retry-After") + .and_then(|v| v.to_str().ok()) + .unwrap(); + assert_eq!(retry, "42"); + } + + #[test] + fn test_too_many_requests_without_retry_after_still_429() { + let response = AppError::TooManyRequests { + message: "slow down".into(), + retry_after_secs: None, + } + .into_response(); + assert_eq!(response.status(), StatusCode::TOO_MANY_REQUESTS); + assert!(response.headers().get("Retry-After").is_none()); + } } diff --git a/sigma-api/src/main.rs b/sigma-api/src/main.rs index 3fd8a1a..ce08ec1 100644 --- a/sigma-api/src/main.rs +++ b/sigma-api/src/main.rs @@ -87,6 +87,8 @@ async fn main() -> anyhow::Result<()> { jwt_expiry_hours: cfg.jwt_expiry_hours, llm_provider, llm_api_key: cfg.llm_api_key.clone(), + llm_rate_limit_requests: cfg.llm_rate_limit_requests, + llm_rate_limit_window: cfg.llm_rate_limit_window, }; // Capture before cfg is moved into notification worker diff --git a/sigma-api/src/routes/ai_triage.rs b/sigma-api/src/routes/ai_triage.rs index d9ed307..a6d8d81 100644 --- a/sigma-api/src/routes/ai_triage.rs +++ b/sigma-api/src/routes/ai_triage.rs @@ -22,11 +22,13 @@ //! human-in-the-loop is the design. 
use axum::{extract::State, routing::post, Extension, Json, Router};
+use redis::AsyncCommands;
 use serde::{Deserialize, Serialize};
 use serde_json::{json, Value};
 use std::time::Duration;
 use tracing::{info, warn};
 use utoipa::ToSchema;
+use uuid::Uuid;
 
 use crate::auth::{require_role, CurrentUser};
 use crate::errors::AppError;
@@ -192,6 +194,7 @@ pub struct TriageResponse {
     responses(
         (status = 200, description = "Triage suggestion (degrades gracefully when LLM unavailable)", body = TriageResponse),
         (status = 403, description = "Caller's role is not permitted to spend LLM tokens (requires admin or operator)"),
+        (status = 429, description = "Per-user LLM rate limit exceeded. Retry after the `Retry-After` header's value (seconds)."),
     )
 )]
 pub async fn triage(
@@ -205,6 +208,11 @@ pub async fn triage(
     // middleware still applies on top.
     require_role(&user, &["admin", "operator"])?;
 
+    // Per-user LLM rate limit. Distinct from the global per-IP limit: ten
+    // operators behind one NAT shouldn't share a token budget, and one
+    // operator hammering from many IPs shouldn't bypass it.
+    check_llm_rate_limit(&state, user.id).await?;
+
     let provider = state.llm_provider;
     let provider_str = provider.as_str();
 
@@ -275,6 +283,93 @@ pub async fn triage(
     Ok(Json(parse_llm_response(&llm_text, &model, provider_str)))
 }
 
+// ---------- Per-user rate limit ----------
+
+/// Fixed-window per-user limit on LLM-spending requests.
+///
+/// Every user gets a Redis counter at `llm-rate:<user_id>`. The request that
+/// creates the counter also arms its TTL (`llm_rate_limit_window` seconds);
+/// once the counter passes `llm_rate_limit_requests`, further calls are
+/// rejected until the key expires. Mirrors the shape of `routes::rate_limit`
+/// but keys on `user.id` (stable across JWT + API-key auth) and uses a
+/// separate Redis prefix + window so the two limiters don't interfere.
+///
+/// Fails open if the Redis `INCR` fails — same posture as the global
+/// limiter: availability beats perfect accounting when our own infra has a
+/// hiccup.
+///
+/// # Errors
+///
+/// Returns [`AppError::TooManyRequests`] once the caller is over the limit;
+/// `retry_after_secs` tells the client how long to wait before retrying.
+async fn check_llm_rate_limit(state: &AppState, user_id: Uuid) -> Result<(), AppError> {
+    let key = format!("llm-rate:{user_id}");
+    let limit = state.llm_rate_limit_requests;
+    let window = state.llm_rate_limit_window;
+    // Redis takes the TTL as a signed integer. Saturate rather than cast: a
+    // wrapped-negative EXPIRE would delete the counter instead of arming it.
+    let window_secs = i64::try_from(window).unwrap_or(i64::MAX);
+
+    let mut conn = state.redis.clone();
+    let count: u32 = match redis::cmd("INCR").arg(&key).query_async(&mut conn).await {
+        Ok(c) => c,
+        Err(e) => {
+            warn!("LLM rate-limit Redis INCR failed (failing open): {e}");
+            return Ok(());
+        }
+    };
+
+    // First hit in a window: start the clock.
+    if count == 1 {
+        if let Err(e) = conn.expire::<_, ()>(&key, window_secs).await {
+            warn!("LLM rate-limit Redis EXPIRE failed: {e}");
+        }
+    }
+
+    if count <= limit {
+        return Ok(());
+    }
+
+    // Over the limit. TTL replies with the seconds left, -1 when the key has
+    // no expiry, or -2 when the key no longer exists.
+    let ttl: Result<i64, redis::RedisError> = conn.ttl(&key).await;
+    let retry_after = match ttl {
+        Ok(secs) if secs > 0 => secs.unsigned_abs(),
+        Ok(-1) => {
+            // INCR and EXPIRE are separate commands, so a failed EXPIRE (or a
+            // crash between the two) leaves a counter that never resets.
+            // Re-arm it here; otherwise this user would stay locked out
+            // forever while being told to retry in `window` seconds.
+            if let Err(e) = conn.expire::<_, ()>(&key, window_secs).await {
+                warn!("LLM rate-limit Redis EXPIRE (re-arm) failed: {e}");
+            }
+            window
+        }
+        // Window ran out between INCR and TTL (or has under a second left):
+        // the next request starts a fresh one.
+        Ok(_) => 1,
+        Err(e) => {
+            warn!("LLM rate-limit Redis TTL failed: {e}");
+            window
+        }
+    };
+
+    info!(
+        user_id = %user_id,
+        count,
+        limit,
+        window,
+        "LLM rate limit exceeded"
+    );
+    Err(AppError::TooManyRequests {
+        message: format!(
+            "LLM rate limit exceeded ({} requests per {}s per user)",
+            limit, window
+        ),
+        retry_after_secs: Some(retry_after),
+    })
+}
+
 // ---------- Prompt construction ----------
 
 fn build_system_prompt() -> String {
diff --git a/sigma-api/src/routes/mod.rs b/sigma-api/src/routes/mod.rs
index 290114f..39e85bc 100644
--- a/sigma-api/src/routes/mod.rs
+++ b/sigma-api/src/routes/mod.rs
@@ -45,6 +45,8 @@ pub struct AppState {
     pub jwt_expiry_hours: u64,
     pub llm_provider: ai_triage::LlmProvider,
     pub llm_api_key: Option,
+    pub llm_rate_limit_requests: u32,
+    pub llm_rate_limit_window: u64,
 }
 
 /// Auth middleware: try Bearer JWT → try X-Api-Key → allow if no API_KEY set → 401.
diff --git a/sigma-api/tests/ai_triage_test.rs b/sigma-api/tests/ai_triage_test.rs index 031a56f..8a47cd8 100644 --- a/sigma-api/tests/ai_triage_test.rs +++ b/sigma-api/tests/ai_triage_test.rs @@ -108,3 +108,53 @@ async fn test_agent_cannot_triage() { common::cleanup(&pool).await; } + +#[tokio::test] +async fn test_per_user_rate_limit_triggers_429() { + // Pin the limit low so we don't need 100 requests to verify it. + let (router, pool) = common::setup_with_llm_limit(2, 60).await; + let admin_token = common::login_admin(&router).await; + let token = + login_as(&router, &admin_token, "ratelimit@test.local", "operator").await; + + // First two requests should pass. + for i in 0..2 { + let (status, _) = common::request_with_token( + &router, + "POST", + "/api/ai/triage", + &token, + Some(alert_body()), + ) + .await; + assert_eq!(status, 200, "request {i} should succeed (within limit)"); + } + + // Third request — same user — must be 429 with a Retry-After header. + use http_body_util::BodyExt; + use tower::ServiceExt; + let req = axum::http::Request::builder() + .method("POST") + .uri("/api/ai/triage") + .header("authorization", format!("Bearer {token}")) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_string(&alert_body()).unwrap())) + .unwrap(); + let response = router.clone().oneshot(req).await.unwrap(); + assert_eq!(response.status().as_u16(), 429); + let retry_after = response + .headers() + .get("Retry-After") + .and_then(|v| v.to_str().ok()) + .expect("Retry-After header should be set on 429") + .to_string(); + let secs: u64 = retry_after.parse().expect("Retry-After should parse as u64"); + assert!( + (1..=60).contains(&secs), + "Retry-After should be within the 60s window, got {secs}" + ); + // Drain the body so the response can be dropped cleanly. 
+ let _ = response.into_body().collect().await; + + common::cleanup(&pool).await; +} diff --git a/sigma-api/tests/common/mod.rs b/sigma-api/tests/common/mod.rs index 8e3d4b7..a9ac6e1 100644 --- a/sigma-api/tests/common/mod.rs +++ b/sigma-api/tests/common/mod.rs @@ -12,6 +12,15 @@ const ADMIN_EMAIL: &str = "admin@test.local"; const ADMIN_PASSWORD: &str = "testpass123"; pub async fn setup() -> (Router, PgPool) { + setup_with_llm_limit(100, 60).await +} + +/// Same as `setup` but lets a test pin the per-user LLM rate-limit window. +/// Useful for verifying the 429 path without making 100+ requests. +pub async fn setup_with_llm_limit( + llm_requests: u32, + llm_window: u64, +) -> (Router, PgPool) { let database_url = std::env::var("DATABASE_URL").expect("DATABASE_URL must be set for integration tests"); let redis_url = std::env::var("REDIS_URL").unwrap_or_else(|_| "redis://localhost:6379".into()); @@ -65,6 +74,8 @@ pub async fn setup() -> (Router, PgPool) { jwt_expiry_hours: 24, llm_provider: routes::ai_triage::LlmProvider::default(), llm_api_key: None, + llm_rate_limit_requests: llm_requests, + llm_rate_limit_window: llm_window, }; // Build router matching main.rs structure