From 3a1600ec392c64483873c87ca27d50055be1686b Mon Sep 17 00:00:00 2001
From: lai3d <cgpanda.sg@gmail.com>
Date: Wed, 20 May 2026 15:40:42 +0800
Subject: [PATCH] Audit-log /api/ai/triage requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every triage that passes RBAC + rate-limit now writes an audit_logs row
(action=triage, resource=ai_triage, resource_id=vps_id when supplied).
The details payload captures alert metadata, the resolved provider/model,
and the LLM outcome — but NOT the raw description, recent_logs,
system_info, or ebpf_metrics, which can carry sensitive content, nor the
full LLM response. Audit logs answer "who triaged what, when, with what
outcome", not "what was in the logs".

403 (RBAC denied) and 429 (rate-limit denied) are intentionally NOT
audit-logged — they're security events visible in the warn!/info!
logs, not operator actions.

Refactored the handler to build the response in a single binding so
all three exit paths (no API key, LLM failed, LLM succeeded) flow
through the same log_audit call.

New integration test triages a fake VPS as an operator, then GETs
/api/audit-logs as admin and verifies action/resource/resource_id/
user_email/details all match.

Docs: docs/ai-triage.{en,zh}.md gain an "Audit logging" subsection
documenting the schema and the deliberate exclusions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/ai-triage.en.md              |  18 +++++
 docs/ai-triage.zh.md              |  18 +++++
 sigma-api/src/routes/ai_triage.rs | 116 +++++++++++++++++++-----------
 sigma-api/tests/ai_triage_test.rs |  51 +++++++++++++
 4 files changed, 162 insertions(+), 41 deletions(-)
diff --git a/docs/ai-triage.en.md b/docs/ai-triage.en.md
index 3e62db2..2d4047a 100644
--- a/docs/ai-triage.en.md
+++ b/docs/ai-triage.en.md
@@ -220,6 +220,24 @@ A second sliding window applies after the RBAC check, keyed on the authenticated
 
 When exhausted, the endpoint returns `429 Too Many Requests` with a `Retry-After` header (seconds). Fails open if Redis is unreachable — availability over perfect accounting when our own infra hiccups.
 
+### Audit logging
+
+Every triage request that passes RBAC + rate-limit is recorded in `audit_logs` (admin-readable via `GET /api/audit-logs?resource=ai_triage`). RBAC denials (403) and rate-limit denials (429) are intentionally **not** audit-logged — they're security events visible in the structured `warn!` / `info!` logs.
+
+Logged shape:
+
+| Column | Value |
+|--------|-------|
+| `action` | `"triage"` |
+| `resource` | `"ai_triage"` |
+| `resource_id` | The targeted `vps_id` (UUID string) when supplied, else `null`. Lets admins filter with `?resource_id=<vps>` to see who has triaged a given host. |
+| `details` | `{alert_name, alert_severity, vps_hostname, fired_at, provider, model, available, confidence}` |
+| `user_id` / `user_email` | From `CurrentUser` |
+
+**What's deliberately not captured**: the raw `description`, `recent_logs`, `system_info`, `ebpf_metrics` payload, or the full LLM response. Audit logs answer "who triaged what, when, with what outcome" — they're not the place for sensitive log content or large response bodies.
+
+Failure to write the audit row never fails the triage request — `log_audit` is best-effort with a `warn!` on error, same as every other audited route.
+
 ### OpenAPI
 
 The full schema is published at `/swagger-ui` under the **AI Triage** tag (`/api-docs/openapi.json` for machine consumption).
diff --git a/docs/ai-triage.zh.md b/docs/ai-triage.zh.md
index b9e6fb5..54b3bdc 100644
--- a/docs/ai-triage.zh.md
+++ b/docs/ai-triage.zh.md
@@ -220,6 +220,24 @@ RBAC 检查之后还有一道滑动窗口限制,key 用认证用户的 UUID(JWT
 
 超过限额后,端点返回 `429 Too Many Requests`,并带上 `Retry-After` 响应头(秒)。Redis 不可达时**放行**(fail open) —— 自家基础设施抖动时,可用性优先于完美的计数。
 
+### 审计日志
+
+每个通过 RBAC + 速率限制的诊断请求都会写入 `audit_logs`(管理员可通过 `GET /api/audit-logs?resource=ai_triage` 查询)。RBAC 拒绝(403)和速率限制拒绝(429)**不**写入审计日志 —— 它们属于安全事件,在结构化 `warn!` / `info!` 日志中可见。
+
+记录的字段:
+
+| 列 | 取值 |
+|----|------|
+| `action` | `"triage"` |
+| `resource` | `"ai_triage"` |
+| `resource_id` | 提供了 `vps_id` 时记录目标 VPS 的 UUID 字符串,否则为 `null`。管理员可以用 `?resource_id=<vps>` 查询某台机器被谁诊断过。 |
+| `details` | `{alert_name, alert_severity, vps_hostname, fired_at, provider, model, available, confidence}` |
+| `user_id` / `user_email` | 来自 `CurrentUser` |
+
+**有意不记录**的内容:原始 `description`、`recent_logs`、`system_info`、`ebpf_metrics` 内容,以及完整的 LLM 响应。审计日志回答的是"谁在何时诊断了什么,结果如何",不是用来存放敏感日志内容或大响应体的地方。
+
+写入审计行失败不会让诊断请求失败 —— `log_audit` 是 best-effort 的,失败时只 `warn!`,与其他被审计的路由一致。
+
 ### OpenAPI
 
 完整 schema 发布在 `/swagger-ui`,标签为 **AI Triage**(机器消费走 `/api-docs/openapi.json`)。
diff --git a/sigma-api/src/routes/ai_triage.rs b/sigma-api/src/routes/ai_triage.rs
index a6d8d81..db52887 100644
--- a/sigma-api/src/routes/ai_triage.rs
+++ b/sigma-api/src/routes/ai_triage.rs
@@ -32,6 +32,7 @@ use uuid::Uuid;
 
 use crate::auth::{require_role, CurrentUser};
 use crate::errors::AppError;
+use crate::routes::audit_logs::log_audit;
 use crate::routes::AppState;
 
 // Provider endpoints.
@@ -216,10 +217,12 @@ pub async fn triage(
     let provider = state.llm_provider;
     let provider_str = provider.as_str();
 
-    // If no API key configured, degrade gracefully — return the alert
-    // back with `available: false` so the UI can still render something.
-    let Some(api_key) = state.llm_api_key.clone() else {
-        return Ok(Json(TriageResponse {
+    // Build the response in a single binding so every exit path (degraded
+    // or success) flows through the audit-log call below.
+    let response = match state.llm_api_key.clone() {
+        // No API key configured → degrade gracefully so the UI still
+        // renders something useful.
+        None => TriageResponse {
             available: false,
             diagnosis: Some(format!(
                 "AI triage not configured (no LLM_API_KEY for provider={}). Alert: {}",
@@ -240,47 +243,78 @@ pub async fn triage(
                 "API key not configured for provider={}",
                 provider_str
             )),
-        }));
+        },
+        Some(api_key) => {
+            let model = req
+                .model
+                .clone()
+                .unwrap_or_else(|| provider.default_model().to_string());
+
+            let system_prompt = build_system_prompt();
+            let user_prompt = build_user_prompt(&req);
+
+            match call_llm(
+                &state.http_client,
+                provider,
+                &api_key,
+                &model,
+                &system_prompt,
+                &user_prompt,
+                DEFAULT_MAX_TOKENS,
+            )
+            .await
+            {
+                Ok(text) => parse_llm_response(&text, &model, provider_str),
+                Err(e) => {
+                    warn!(
+                        provider = %provider_str,
+                        error = %e,
+                        "LLM call failed; degrading to alert-only response"
+                    );
+                    TriageResponse {
+                        available: false,
+                        diagnosis: Some(format!("Raw alert: {}", req.alert.name)),
+                        likely_causes: vec![],
+                        remediation_steps: vec![
+                            "Inspect alert manually — AI triage was unavailable".into(),
+                        ],
+                        confidence: Some("low".into()),
+                        model: Some(model),
+                        provider: Some(provider_str.to_string()),
+                        note: Some(format!("LLM unreachable: {:#}", e)),
+                    }
+                }
+            }
+        }
     };
 
-    let model = req
-        .model
-        .clone()
-        .unwrap_or_else(|| provider.default_model().to_string());
-
-    let system_prompt = build_system_prompt();
-    let user_prompt = build_user_prompt(&req);
-
-    let llm_text = match call_llm(
-        &state.http_client,
-        provider,
-        &api_key,
-        &model,
-        &system_prompt,
-        &user_prompt,
-        DEFAULT_MAX_TOKENS,
+    // Audit log every triage request that passed RBAC + rate-limit. We
+    // record alert metadata (NOT the raw description or logs, which can
+    // carry sensitive content) plus the resolved provider/model and the
+    // LLM outcome. resource_id is the targeted VPS UUID when supplied so
+    // operators can filter `/api/audit-logs?resource_id=<vps>` to see who
+    // has triaged a given host.
+    let vps_id_str = req.alert.vps_id.map(|id| id.to_string());
+    log_audit(
+        &state.db,
+        &user,
+        "triage",
+        "ai_triage",
+        vps_id_str.as_deref(),
+        json!({
+            "alert_name": req.alert.name,
+            "alert_severity": req.alert.severity,
+            "vps_hostname": req.alert.vps_hostname,
+            "fired_at": req.alert.fired_at,
+            "provider": response.provider,
+            "model": response.model,
+            "available": response.available,
+            "confidence": response.confidence,
+        }),
     )
-    .await
-    {
-        Ok(text) => text,
-        Err(e) => {
-            warn!(provider = %provider_str, error = %e, "LLM call failed; degrading to alert-only response");
-            return Ok(Json(TriageResponse {
-                available: false,
-                diagnosis: Some(format!("Raw alert: {}", req.alert.name)),
-                likely_causes: vec![],
-                remediation_steps: vec![
-                    "Inspect alert manually — AI triage was unavailable".into(),
-                ],
-                confidence: Some("low".into()),
-                model: Some(model),
-                provider: Some(provider_str.to_string()),
-                note: Some(format!("LLM unreachable: {:#}", e)),
-            }));
-        }
-    };
+    .await;
 
-    Ok(Json(parse_llm_response(&llm_text, &model, provider_str)))
+    Ok(Json(response))
 }
 
 // ---------- Per-user rate limit ----------
diff --git a/sigma-api/tests/ai_triage_test.rs b/sigma-api/tests/ai_triage_test.rs
index 8a47cd8..c32d3ec 100644
--- a/sigma-api/tests/ai_triage_test.rs
+++ b/sigma-api/tests/ai_triage_test.rs
@@ -158,3 +158,54 @@ async fn test_per_user_rate_limit_triggers_429() {
 
     common::cleanup(&pool).await;
 }
+
+#[tokio::test]
+async fn test_triage_writes_audit_log() {
+    let (router, pool) = common::setup().await;
+    let admin_token = common::login_admin(&router).await;
+    let token =
+        login_as(&router, &admin_token, "auditop@test.local", "operator").await;
+
+    // Triage a synthetic alert pinned to a fake VPS so we can assert
+    // resource_id was captured.
+    let vps_id = uuid::Uuid::new_v4();
+    let body = json!({
+        "alert": {
+            "name": "audit-test alert",
+            "severity": "high",
+            "vps_hostname": "relay-99",
+            "vps_id": vps_id.to_string(),
+        }
+    });
+    let (status, _) =
+        common::request_with_token(&router, "POST", "/api/ai/triage", &token, Some(body))
+            .await;
+    assert_eq!(status, 200);
+
+    // Admin (only role allowed to read audit logs) lists them filtered
+    // to the ai_triage resource and verifies the row landed.
+    let (status, logs) = common::request_with_token(
+        &router,
+        "GET",
+        "/api/audit-logs?resource=ai_triage",
+        &admin_token,
+        None,
+    )
+    .await;
+    assert_eq!(status, 200);
+    assert_eq!(logs["total"], 1);
+    let entry = &logs["data"][0];
+    assert_eq!(entry["action"], "triage");
+    assert_eq!(entry["resource"], "ai_triage");
+    assert_eq!(entry["resource_id"], vps_id.to_string());
+    assert_eq!(entry["user_email"], "auditop@test.local");
+    assert_eq!(entry["details"]["alert_name"], "audit-test alert");
+    assert_eq!(entry["details"]["alert_severity"], "high");
+    assert_eq!(entry["details"]["vps_hostname"], "relay-99");
+    // No LLM_API_KEY in test env → available=false. We still log the
+    // attempt, with the resolved provider name.
+    assert_eq!(entry["details"]["available"], false);
+    assert_eq!(entry["details"]["provider"], "anthropic");
+
+    common::cleanup(&pool).await;
+}