From 3a1600ec392c64483873c87ca27d50055be1686b Mon Sep 17 00:00:00 2001 From: lai3d Date: Wed, 20 May 2026 15:40:42 +0800 Subject: [PATCH] Audit-log /api/ai/triage requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every triage that passes RBAC + rate-limit now writes an audit_logs row (action=triage, resource=ai_triage, resource_id=vps_id when supplied). The details payload captures alert metadata, the resolved provider/model, and the LLM outcome — but NOT the raw description, recent_logs, or ebpf_metrics, which can carry sensitive content. Audit logs answer "who triaged what, when, with what outcome", not "what was in the logs". 403 (RBAC denied) and 429 (rate-limit denied) are intentionally NOT audit-logged — they're security events visible in the warn!/info! logs, not operator actions. Refactored the handler to build the response in a single binding so all three exit paths (no API key, LLM failed, LLM succeeded) flow through the same log_audit call. New integration test triages a fake VPS as an operator, then GETs /api/audit-logs as admin and verifies action/resource/resource_id/ user_email/details all match. Docs: docs/ai-triage.{en,zh}.md gain an "Audit logging" subsection documenting the schema and the deliberate exclusions. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/ai-triage.en.md | 18 +++++ docs/ai-triage.zh.md | 18 +++++ sigma-api/src/routes/ai_triage.rs | 116 +++++++++++++++++++----------- sigma-api/tests/ai_triage_test.rs | 51 +++++++++++++ 4 files changed, 162 insertions(+), 41 deletions(-) diff --git a/docs/ai-triage.en.md b/docs/ai-triage.en.md index 3e62db2..2d4047a 100644 --- a/docs/ai-triage.en.md +++ b/docs/ai-triage.en.md @@ -220,6 +220,24 @@ A second sliding window applies after the RBAC check, keyed on the authenticated When exhausted, the endpoint returns `429 Too Many Requests` with a `Retry-After` header (seconds). 
Fails open if Redis is unreachable — availability over perfect accounting when our own infra hiccups. +### Audit logging + +Every triage request that passes RBAC + rate-limit is recorded in `audit_logs` (admin-readable via `GET /api/audit-logs?resource=ai_triage`). RBAC denials (403) and rate-limit denials (429) are intentionally **not** audit-logged — they're security events visible in the structured `warn!` / `info!` logs. + +Logged shape: + +| Column | Value | +|--------|-------| +| `action` | `"triage"` | +| `resource` | `"ai_triage"` | +| `resource_id` | The targeted `vps_id` (UUID string) when supplied, else `null`. Lets admins filter with `?resource_id=<vps_id>` to see who has triaged a given host. | +| `details` | `{alert_name, alert_severity, vps_hostname, fired_at, provider, model, available, confidence}` | +| `user_id` / `user_email` | From `CurrentUser` | + +**What's deliberately not captured**: the raw `description`, `recent_logs`, `system_info`, `ebpf_metrics` payload, or the full LLM response. Audit logs answer "who triaged what, when, with what outcome" — they're not the place for sensitive log content or large response bodies. + +Failure to write the audit row never fails the triage request — `log_audit` is best-effort with a `warn!` on error, same as every other audited route. + ### OpenAPI The full schema is published at `/swagger-ui` under the **AI Triage** tag (`/api-docs/openapi.json` for machine consumption). 
diff --git a/docs/ai-triage.zh.md b/docs/ai-triage.zh.md index b9e6fb5..54b3bdc 100644 --- a/docs/ai-triage.zh.md +++ b/docs/ai-triage.zh.md @@ -220,6 +220,24 @@ RBAC 检查之后还有一道滑动窗口限制,key 用认证用户的 UUID(JWT 超过限额后,端点返回 `429 Too Many Requests`,并带上 `Retry-After` 响应头(秒)。Redis 不可达时**放行**(fail open) —— 自家基础设施抖动时,可用性优先于完美的计数。 +### 审计日志 + +每个通过 RBAC + 速率限制的诊断请求都会写入 `audit_logs`(管理员可通过 `GET /api/audit-logs?resource=ai_triage` 查询)。RBAC 拒绝(403)和速率限制拒绝(429)**不**写入审计日志 —— 它们属于安全事件,在结构化 `warn!` / `info!` 日志中可见。 + +记录的字段: + +| 列 | 取值 | +|----|------| +| `action` | `"triage"` | +| `resource` | `"ai_triage"` | +| `resource_id` | 提供了 `vps_id` 时记录目标 VPS 的 UUID 字符串,否则为 `null`。管理员可以用 `?resource_id=<vps_id>` 过滤,查询某台机器被谁诊断过。 | +| `details` | `{alert_name, alert_severity, vps_hostname, fired_at, provider, model, available, confidence}` | +| `user_id` / `user_email` | 来自 `CurrentUser` | + +**有意不记录**的内容:原始 `description`、`recent_logs`、`system_info`、`ebpf_metrics` 内容,以及完整的 LLM 响应。审计日志回答的是"谁在何时诊断了什么,结果如何",不是用来存放敏感日志内容或大响应体的地方。 + +写入审计行失败不会让诊断请求失败 —— `log_audit` 是 best-effort 的,失败时只 `warn!`,与其他被审计的路由一致。 + ### OpenAPI 完整 schema 发布在 `/swagger-ui`,标签为 **AI Triage**(机器消费走 `/api-docs/openapi.json`)。 diff --git a/sigma-api/src/routes/ai_triage.rs b/sigma-api/src/routes/ai_triage.rs index a6d8d81..db52887 100644 --- a/sigma-api/src/routes/ai_triage.rs +++ b/sigma-api/src/routes/ai_triage.rs @@ -32,6 +32,7 @@ use uuid::Uuid; use crate::auth::{require_role, CurrentUser}; use crate::errors::AppError; +use crate::routes::audit_logs::log_audit; use crate::routes::AppState; // Provider endpoints. @@ -216,10 +217,12 @@ pub async fn triage( let provider = state.llm_provider; let provider_str = provider.as_str(); - // If no API key configured, degrade gracefully — return the alert - // back with `available: false` so the UI can still render something. 
- let Some(api_key) = state.llm_api_key.clone() else { - return Ok(Json(TriageResponse { + // Build the response in a single binding so every exit path (degraded + // or success) flows through the audit-log call below. + let response = match state.llm_api_key.clone() { + // No API key configured → degrade gracefully so the UI still + // renders something useful. + None => TriageResponse { available: false, diagnosis: Some(format!( "AI triage not configured (no LLM_API_KEY for provider={}). Alert: {}", @@ -240,47 +243,78 @@ pub async fn triage( "API key not configured for provider={}", provider_str )), - })); + }, + Some(api_key) => { + let model = req + .model + .clone() + .unwrap_or_else(|| provider.default_model().to_string()); + + let system_prompt = build_system_prompt(); + let user_prompt = build_user_prompt(&req); + + match call_llm( + &state.http_client, + provider, + &api_key, + &model, + &system_prompt, + &user_prompt, + DEFAULT_MAX_TOKENS, + ) + .await + { + Ok(text) => parse_llm_response(&text, &model, provider_str), + Err(e) => { + warn!( + provider = %provider_str, + error = %e, + "LLM call failed; degrading to alert-only response" + ); + TriageResponse { + available: false, + diagnosis: Some(format!("Raw alert: {}", req.alert.name)), + likely_causes: vec![], + remediation_steps: vec![ + "Inspect alert manually — AI triage was unavailable".into(), + ], + confidence: Some("low".into()), + model: Some(model), + provider: Some(provider_str.to_string()), + note: Some(format!("LLM unreachable: {:#}", e)), + } + } + } + } }; - let model = req - .model - .clone() - .unwrap_or_else(|| provider.default_model().to_string()); - - let system_prompt = build_system_prompt(); - let user_prompt = build_user_prompt(&req); - - let llm_text = match call_llm( - &state.http_client, - provider, - &api_key, - &model, - &system_prompt, - &user_prompt, - DEFAULT_MAX_TOKENS, + // Audit log every triage request that passed RBAC + rate-limit. 
We + // record alert metadata (NOT the raw description or logs, which can + // carry sensitive content) plus the resolved provider/model and the + // LLM outcome. resource_id is the targeted VPS UUID when supplied so + // operators can filter `/api/audit-logs?resource_id=` to see who + // has triaged a given host. + let vps_id_str = req.alert.vps_id.map(|id| id.to_string()); + log_audit( + &state.db, + &user, + "triage", + "ai_triage", + vps_id_str.as_deref(), + json!({ + "alert_name": req.alert.name, + "alert_severity": req.alert.severity, + "vps_hostname": req.alert.vps_hostname, + "fired_at": req.alert.fired_at, + "provider": response.provider, + "model": response.model, + "available": response.available, + "confidence": response.confidence, + }), ) - .await - { - Ok(text) => text, - Err(e) => { - warn!(provider = %provider_str, error = %e, "LLM call failed; degrading to alert-only response"); - return Ok(Json(TriageResponse { - available: false, - diagnosis: Some(format!("Raw alert: {}", req.alert.name)), - likely_causes: vec![], - remediation_steps: vec![ - "Inspect alert manually — AI triage was unavailable".into(), - ], - confidence: Some("low".into()), - model: Some(model), - provider: Some(provider_str.to_string()), - note: Some(format!("LLM unreachable: {:#}", e)), - })); - } - }; + .await; - Ok(Json(parse_llm_response(&llm_text, &model, provider_str))) + Ok(Json(response)) } // ---------- Per-user rate limit ---------- diff --git a/sigma-api/tests/ai_triage_test.rs b/sigma-api/tests/ai_triage_test.rs index 8a47cd8..c32d3ec 100644 --- a/sigma-api/tests/ai_triage_test.rs +++ b/sigma-api/tests/ai_triage_test.rs @@ -158,3 +158,54 @@ async fn test_per_user_rate_limit_triggers_429() { common::cleanup(&pool).await; } + +#[tokio::test] +async fn test_triage_writes_audit_log() { + let (router, pool) = common::setup().await; + let admin_token = common::login_admin(&router).await; + let token = + login_as(&router, &admin_token, "auditop@test.local", 
"operator").await; + + // Triage a synthetic alert pinned to a fake VPS so we can assert + // resource_id was captured. + let vps_id = uuid::Uuid::new_v4(); + let body = json!({ + "alert": { + "name": "audit-test alert", + "severity": "high", + "vps_hostname": "relay-99", + "vps_id": vps_id.to_string(), + } + }); + let (status, _) = + common::request_with_token(&router, "POST", "/api/ai/triage", &token, Some(body)) + .await; + assert_eq!(status, 200); + + // Admin (only role allowed to read audit logs) lists them filtered + // to the ai_triage resource and verifies the row landed. + let (status, logs) = common::request_with_token( + &router, + "GET", + "/api/audit-logs?resource=ai_triage", + &admin_token, + None, + ) + .await; + assert_eq!(status, 200); + assert_eq!(logs["total"], 1); + let entry = &logs["data"][0]; + assert_eq!(entry["action"], "triage"); + assert_eq!(entry["resource"], "ai_triage"); + assert_eq!(entry["resource_id"], vps_id.to_string()); + assert_eq!(entry["user_email"], "auditop@test.local"); + assert_eq!(entry["details"]["alert_name"], "audit-test alert"); + assert_eq!(entry["details"]["alert_severity"], "high"); + assert_eq!(entry["details"]["vps_hostname"], "relay-99"); + // No LLM_API_KEY in test env → available=false. We still log the + // attempt, with the resolved provider name. + assert_eq!(entry["details"]["available"], false); + assert_eq!(entry["details"]["provider"], "anthropic"); + + common::cleanup(&pool).await; +}