Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/ai-triage.en.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,17 @@ Unit tests in `sigma-api/src/routes/ai_triage.rs` cover the marker placement, sa

The endpoint sits behind the API's standard `auth` middleware (JWT or `X-Api-Key`) **and requires `admin` or `operator` role**. `readonly` consumers (dashboards, monitoring) and per-VPS `agent` keys receive a `403 Forbidden` before any LLM call is made — they can't spend tokens. The global rate limit still applies on top, and provider-side quota remains the second line of defence.

### Per-user rate limit

A second rate limit applies after the RBAC check, keyed on the authenticated user's UUID (stable across both JWT and API-key auth). It is a fixed-window counter: the window opens on the user's first request and the count resets when the window expires. This is independent of the global per-IP limiter: ten operators behind one NAT don't share a token budget, and one operator hammering from many IPs can't bypass it.

| Env var | Default | Meaning |
|---------|---------|---------|
| `LLM_RATE_LIMIT_REQUESTS` | `20` | Max triages per window, per user |
| `LLM_RATE_LIMIT_WINDOW` | `3600` | Window length in seconds (default 1 h) |

When exhausted, the endpoint returns `429 Too Many Requests` with a `Retry-After` header (seconds). Fails open if Redis is unreachable — availability over perfect accounting when our own infra hiccups.

### OpenAPI

The full schema is published at `/swagger-ui` under the **AI Triage** tag (`/api-docs/openapi.json` for machine consumption).
Expand Down
11 changes: 11 additions & 0 deletions docs/ai-triage.zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,17 @@ Content-Type: application/json

端点位于 API 的标准 `auth` 中间件之后(JWT 或 `X-Api-Key`),**并要求 `admin` 或 `operator` 角色**。`readonly` 消费者(仪表盘、监控)和每个 VPS 的 `agent` key 在到达 LLM 调用之前就会收到 `403 Forbidden` —— 它们无法消费 token。全局速率限制仍然叠加生效,provider 侧的 quota 是第二道防线。

### 每用户速率限制

RBAC 检查之后还有一道按用户计数的固定窗口限制,key 用认证用户的 UUID(JWT 与 API key 两条路径下都稳定):窗口从该用户的第一次请求开始计时,到期后计数清零。它与全局按 IP 限流彼此独立:NAT 后面的十个运维不会共享一份 token 预算,一个运维从多个 IP 频繁调用也无法绕过限制。

| 环境变量 | 默认值 | 含义 |
|---------|--------|------|
| `LLM_RATE_LIMIT_REQUESTS` | `20` | 每用户、每窗口的最大诊断次数 |
| `LLM_RATE_LIMIT_WINDOW` | `3600` | 窗口长度(秒,默认 1 小时) |

超过限额后,端点返回 `429 Too Many Requests`,并带上 `Retry-After` 响应头(秒)。Redis 不可达时**放行**(fail open) —— 自家基础设施抖动时,可用性优先于完美的计数。

### OpenAPI

完整 schema 发布在 `/swagger-ui`,标签为 **AI Triage**(机器消费走 `/api-docs/openapi.json`)。
Expand Down
14 changes: 14 additions & 0 deletions sigma-api/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ pub struct Config {
/// `LLM_PROVIDER` is unset or `anthropic`, so existing deployments
/// keep working without rotating env vars.
pub llm_api_key: Option<String>,

/// Per-user rate limit on LLM-spending endpoints. Distinct from the
/// global per-IP rate limit because the cost shape is different —
/// tokens, not connection pressure.
pub llm_rate_limit_requests: u32,
pub llm_rate_limit_window: u64,
}

impl Config {
Expand Down Expand Up @@ -105,6 +111,14 @@ impl Config {
None
}
}),
llm_rate_limit_requests: std::env::var("LLM_RATE_LIMIT_REQUESTS")
.ok()
.and_then(|p| p.parse().ok())
.unwrap_or(20),
llm_rate_limit_window: std::env::var("LLM_RATE_LIMIT_WINDOW")
.ok()
.and_then(|p| p.parse().ok())
.unwrap_or(3600),
}
}
}
53 changes: 53 additions & 0 deletions sigma-api/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@ pub enum AppError {
#[error("Forbidden: {0}")]
Forbidden(String),

/// Handler-level rate limit (e.g. LLM token spend). The optional
/// `retry_after_secs` is surfaced as a `Retry-After` HTTP header.
#[error("Too many requests: {message}")]
TooManyRequests {
message: String,
retry_after_secs: Option<u64>,
},

#[error("Database error: {0}")]
Sqlx(#[from] sqlx::Error),

Expand All @@ -36,11 +44,29 @@ pub enum AppError {

impl IntoResponse for AppError {
fn into_response(self) -> Response {
// TooManyRequests carries an optional Retry-After value; handle it
// separately so we can attach the header before serializing.
if let AppError::TooManyRequests {
ref message,
retry_after_secs,
} = self
{
let body = Json(json!({ "error": message.clone() }));
let mut response = (StatusCode::TOO_MANY_REQUESTS, body).into_response();
if let Some(secs) = retry_after_secs {
if let Ok(v) = axum::http::HeaderValue::from_str(&secs.to_string()) {
response.headers_mut().insert("Retry-After", v);
}
}
return response;
}

let (status, msg) = match &self {
AppError::NotFound => (StatusCode::NOT_FOUND, self.to_string()),
AppError::BadRequest(_) => (StatusCode::BAD_REQUEST, self.to_string()),
AppError::Unauthorized => (StatusCode::UNAUTHORIZED, self.to_string()),
AppError::Forbidden(_) => (StatusCode::FORBIDDEN, self.to_string()),
AppError::TooManyRequests { .. } => unreachable!("handled above"),
AppError::Sqlx(e) => {
tracing::error!("Database error: {:?}", e);
(StatusCode::INTERNAL_SERVER_ERROR, "Database error".into())
Expand Down Expand Up @@ -96,4 +122,31 @@ mod tests {
StatusCode::INTERNAL_SERVER_ERROR
);
}

/// A `TooManyRequests` error carrying `Some(42)` must render as HTTP 429
/// with a `Retry-After: 42` header.
#[test]
fn test_too_many_requests_is_429_with_retry_after() {
    let err = AppError::TooManyRequests {
        message: String::from("slow down"),
        retry_after_secs: Some(42),
    };
    let resp = err.into_response();
    assert_eq!(resp.status(), StatusCode::TOO_MANY_REQUESTS);

    // The header must be present and valid ASCII, otherwise the unwrap panics.
    let retry_header = resp.headers().get("Retry-After");
    let retry = retry_header.and_then(|value| value.to_str().ok()).unwrap();
    assert_eq!(retry, "42");
}

/// Without a retry hint the error still maps to HTTP 429, and no
/// `Retry-After` header is attached.
#[test]
fn test_too_many_requests_without_retry_after_still_429() {
    let err = AppError::TooManyRequests {
        message: String::from("slow down"),
        retry_after_secs: None,
    };
    let resp = err.into_response();

    assert_eq!(resp.status(), StatusCode::TOO_MANY_REQUESTS);
    let retry_header = resp.headers().get("Retry-After");
    assert!(retry_header.is_none());
}
}
2 changes: 2 additions & 0 deletions sigma-api/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ async fn main() -> anyhow::Result<()> {
jwt_expiry_hours: cfg.jwt_expiry_hours,
llm_provider,
llm_api_key: cfg.llm_api_key.clone(),
llm_rate_limit_requests: cfg.llm_rate_limit_requests,
llm_rate_limit_window: cfg.llm_rate_limit_window,
};

// Capture before cfg is moved into notification worker
Expand Down
58 changes: 58 additions & 0 deletions sigma-api/src/routes/ai_triage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@
//! human-in-the-loop is the design.

use axum::{extract::State, routing::post, Extension, Json, Router};
use redis::AsyncCommands;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::time::Duration;
use tracing::{info, warn};
use utoipa::ToSchema;
use uuid::Uuid;

use crate::auth::{require_role, CurrentUser};
use crate::errors::AppError;
Expand Down Expand Up @@ -192,6 +194,7 @@ pub struct TriageResponse {
responses(
(status = 200, description = "Triage suggestion (degrades gracefully when LLM unavailable)", body = TriageResponse),
(status = 403, description = "Caller's role is not permitted to spend LLM tokens (requires admin or operator)"),
(status = 429, description = "Per-user LLM rate limit exceeded. Retry after the `Retry-After` header's value (seconds)."),
)
)]
pub async fn triage(
Expand All @@ -205,6 +208,11 @@ pub async fn triage(
// middleware still applies on top.
require_role(&user, &["admin", "operator"])?;

// Per-user LLM rate limit. Distinct from the global per-IP limit: ten
// operators behind one NAT shouldn't share a token budget, and one
// operator hammering from many IPs shouldn't bypass it.
check_llm_rate_limit(&state, user.id).await?;

let provider = state.llm_provider;
let provider_str = provider.as_str();

Expand Down Expand Up @@ -275,6 +283,56 @@ pub async fn triage(
Ok(Json(parse_llm_response(&llm_text, &model, provider_str)))
}

// ---------- Per-user rate limit ----------

/// Fixed-window per-user limit on LLM-spending requests.
///
/// Every call increments the Redis counter `llm-rate:{user_id}`. The first
/// increment of a window arms an expiry of `llm_rate_limit_window` seconds,
/// so the window opens on the user's first request and the count resets when
/// the key expires. Keying on `user_id` (stable across JWT + API-key auth)
/// and using a dedicated Redis prefix keeps this limiter from interfering
/// with the global per-IP one (`routes::rate_limit`, not shown here).
///
/// # Errors
///
/// Returns [`AppError::TooManyRequests`] once the counter exceeds
/// `llm_rate_limit_requests`. `retry_after_secs` carries the key's remaining
/// TTL, or the full window length when no positive TTL is available.
///
/// Redis failures never surface as errors: a failed `INCR` lets the request
/// through (fail open — availability beats perfect accounting when our own
/// infra has a hiccup), and failed `EXPIRE`/`TTL` calls are only logged or
/// defaulted.
async fn check_llm_rate_limit(state: &AppState, user_id: Uuid) -> Result<(), AppError> {
    let key = format!("llm-rate:{}", user_id);
    let limit = state.llm_rate_limit_requests;
    let window = state.llm_rate_limit_window;
    // The expiry is passed to Redis as a signed integer. Clamp rather than
    // cast blindly so an oversized configured window cannot wrap negative.
    let window_secs = window.min(i64::MAX as u64) as i64;

    let mut conn = state.redis.clone();
    let count: u32 = match redis::cmd("INCR").arg(&key).query_async(&mut conn).await {
        Ok(c) => c,
        Err(e) => {
            warn!("LLM rate-limit Redis INCR failed (failing open): {e}");
            return Ok(());
        }
    };

    // First hit of a window: start the clock.
    if count == 1 {
        if let Err(e) = conn.expire::<_, ()>(&key, window_secs).await {
            warn!("LLM rate-limit Redis EXPIRE failed: {e}");
        }
    }

    if count > limit {
        let ttl: i64 = conn.ttl(&key).await.unwrap_or(window_secs);

        // TTL reports -1 for a key that exists without an expiry. INCR and
        // EXPIRE are separate commands, so that state is reachable when the
        // EXPIRE above failed or the process stopped between the two calls.
        // Left alone, the counter would never reset and this user would be
        // rejected forever, so re-arm the expiry here.
        if ttl == -1 {
            if let Err(e) = conn.expire::<_, ()>(&key, window_secs).await {
                warn!("LLM rate-limit Redis EXPIRE (re-arm) failed: {e}");
            }
        }

        // No positive TTL available: advertise the full window instead.
        let retry = if ttl > 0 { ttl as u64 } else { window };
        info!(
            user_id = %user_id,
            count,
            limit,
            window,
            "LLM rate limit exceeded"
        );
        return Err(AppError::TooManyRequests {
            message: format!(
                "LLM rate limit exceeded ({} requests per {}s per user)",
                limit, window
            ),
            retry_after_secs: Some(retry),
        });
    }

    Ok(())
}

// ---------- Prompt construction ----------

fn build_system_prompt() -> String {
Expand Down
2 changes: 2 additions & 0 deletions sigma-api/src/routes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ pub struct AppState {
pub jwt_expiry_hours: u64,
pub llm_provider: ai_triage::LlmProvider,
pub llm_api_key: Option<String>,
pub llm_rate_limit_requests: u32,
pub llm_rate_limit_window: u64,
}

/// Auth middleware: try Bearer JWT → try X-Api-Key → allow if no API_KEY set → 401.
Expand Down
50 changes: 50 additions & 0 deletions sigma-api/tests/ai_triage_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,53 @@ async fn test_agent_cannot_triage() {

common::cleanup(&pool).await;
}

/// Exhausts a two-request budget as a single operator and checks that the
/// third call is rejected with `429` plus a parseable `Retry-After` header.
#[tokio::test]
async fn test_per_user_rate_limit_triggers_429() {
    use http_body_util::BodyExt;
    use tower::ServiceExt;

    // A tiny budget (2 requests per 60 s) keeps the test short.
    let (router, pool) = common::setup_with_llm_limit(2, 60).await;
    let admin_token = common::login_admin(&router).await;
    let token = login_as(&router, &admin_token, "ratelimit@test.local", "operator").await;

    // Requests 0 and 1 fit inside the budget.
    for i in 0..2 {
        let outcome = common::request_with_token(
            &router,
            "POST",
            "/api/ai/triage",
            &token,
            Some(alert_body()),
        )
        .await;
        let (status, _) = outcome;
        assert_eq!(status, 200, "request {i} should succeed (within limit)");
    }

    // The third request from the same user is built by hand so the response
    // headers can be inspected.
    let payload = serde_json::to_string(&alert_body()).unwrap();
    let bearer = format!("Bearer {token}");
    let req = axum::http::Request::builder()
        .method("POST")
        .uri("/api/ai/triage")
        .header("authorization", bearer)
        .header("content-type", "application/json")
        .body(Body::from(payload))
        .unwrap();
    let response = router.clone().oneshot(req).await.unwrap();
    assert_eq!(response.status().as_u16(), 429);

    let header_value = response.headers().get("Retry-After");
    let retry_after = header_value
        .and_then(|v| v.to_str().ok())
        .expect("Retry-After header should be set on 429")
        .to_string();
    let secs: u64 = retry_after.parse().expect("Retry-After should parse as u64");
    assert!(
        (1..=60).contains(&secs),
        "Retry-After should be within the 60s window, got {secs}"
    );

    // Consume the body so the response can be dropped cleanly.
    let _ = response.into_body().collect().await;

    common::cleanup(&pool).await;
}
11 changes: 11 additions & 0 deletions sigma-api/tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@ const ADMIN_EMAIL: &str = "admin@test.local";
const ADMIN_PASSWORD: &str = "testpass123";

/// Builds the default integration-test router and database pool.
///
/// Delegates to `setup_with_llm_limit` with a per-user LLM budget of 100
/// requests per window of 60.
pub async fn setup() -> (Router, PgPool) {
    const DEFAULT_LLM_REQUESTS: u32 = 100;
    const DEFAULT_LLM_WINDOW: u64 = 60;

    setup_with_llm_limit(DEFAULT_LLM_REQUESTS, DEFAULT_LLM_WINDOW).await
}

/// Same as `setup` but lets a test pin the per-user LLM rate-limit window.
/// Useful for verifying the 429 path without making 100+ requests.
pub async fn setup_with_llm_limit(
llm_requests: u32,
llm_window: u64,
) -> (Router, PgPool) {
let database_url =
std::env::var("DATABASE_URL").expect("DATABASE_URL must be set for integration tests");
let redis_url = std::env::var("REDIS_URL").unwrap_or_else(|_| "redis://localhost:6379".into());
Expand Down Expand Up @@ -65,6 +74,8 @@ pub async fn setup() -> (Router, PgPool) {
jwt_expiry_hours: 24,
llm_provider: routes::ai_triage::LlmProvider::default(),
llm_api_key: None,
llm_rate_limit_requests: llm_requests,
llm_rate_limit_window: llm_window,
};

// Build router matching main.rs structure
Expand Down
Loading