From c53dc8b1ce4b95d0dad3a5ac1eea3104009e5c8c Mon Sep 17 00:00:00 2001 From: OluRemiFour Date: Wed, 27 May 2026 11:57:28 +0100 Subject: [PATCH] Build Load Testing Framework --- backend/Cargo.toml | 141 ++---- backend/src/api/handlers/dashboard.rs | 223 --------- backend/src/api/handlers/profiling.rs | 197 ++++---- backend/src/config/mod.rs | 20 +- backend/src/config/reload.rs | 318 ++++-------- backend/src/error.rs | 112 +---- backend/src/jobs.rs | 15 +- backend/src/lib.rs | 7 +- backend/src/services/business_metrics.rs | 548 ++++----------------- backend/src/services/error_recovery.rs | 32 +- backend/src/services/feature_flags.rs | 257 +--------- backend/src/services/log_alerts.rs | 212 +------- backend/src/services/mod.rs | 3 +- backend/src/services/sys_metrics.rs | 389 +++++++-------- backend/src/services/tracing.rs | 406 +++------------- backend/tests/load/dashboard_load.rs | 453 ++++++++++++++++++ backend/tests/load/framework.rs | 585 +++++++++++++++++++++++ backend/tests/load/mod.rs | 33 +- backend/tests/load/profile_load.rs | 381 +++++++++++++-- backend/tests/load/status_load.rs | 317 +++++++++++- backend/tests/load/stellar_load.rs | 399 ++++++++++++++++ backend/tests/load_tests.rs | 20 +- 22 files changed, 2793 insertions(+), 2275 deletions(-) create mode 100644 backend/tests/load/dashboard_load.rs create mode 100644 backend/tests/load/framework.rs create mode 100644 backend/tests/load/stellar_load.rs diff --git a/backend/Cargo.toml b/backend/Cargo.toml index c661a3e..194f8e2 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -2,24 +2,6 @@ name = "backend" version = "0.1.0" edition = "2021" - -[dependencies] -axum = "0.7" -tokio = { version = "1", features = ["full"] } -serde = { version = "1", features = ["derive"] } -serde_json = "1" -sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "chrono", "uuid"] } -redis = { version = "0.25", features = ["tokio-comp"] } -tracing = "0.1" -tracing-subscriber = { version = "0.3", 
features = ["env-filter"] } -thiserror = "1.0" -chrono = { version = "0.4", features = ["serde"] } -uuid = { version = "1", features = ["v4", "serde"] } -dotenvy = "0.15" -tower-http = { version = "0.5", features = ["trace"] } -name = "crucible-backend" -version = "0.1.0" -edition = "2021" description = "Backend API server for the Crucible smart contract testing platform" license = "MIT" authors = ["Crucible Contributors"] @@ -28,11 +10,19 @@ authors = ["Crucible Contributors"] name = "crucible-backend" path = "src/main.rs" +[[bin]] +name = "backup" +path = "src/bin/backup.rs" + +[features] +testutils = ["mockall"] + [dependencies] # Web framework axum = { version = "0.7", features = ["macros"] } tower = { version = "0.4", features = ["full"] } tower-http = { version = "0.5", features = ["cors", "trace", "compression-gzip", "request-id"] } +tower_governor = "0.4" # Async runtime tokio = { version = "1", features = ["full"] } @@ -53,112 +43,64 @@ redis = { version = "0.25", features = ["tokio-comp", "connection-manager"] } # Serialization serde = { version = "1", features = ["derive"] } serde_json = "1" +schemars = "0.8" # Observability tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } +opentelemetry = { version = "0.24", features = ["trace", "metrics"] } +opentelemetry-otlp = { version = "0.17", features = ["trace", "grpc-tonic"] } +opentelemetry-semantic-conventions = "0.16" +opentelemetry_sdk = { version = "0.24", features = ["trace", "rt-tokio"] } +tracing-opentelemetry = "0.25" +tonic = "0.12" # Utilities uuid = { version = "1", features = ["v4", "serde"] } chrono = { version = "0.4", features = ["serde"] } dotenvy = "0.15" thiserror = "1" - -[dev-dependencies] -# Testing -reqwest = { version = "0.12", features = ["json"] } -tokio-test = "0.4" -testcontainers = "0.16" -wiremock = "0.6" - -[profile.release] -opt-level = 3 -lto = true -codegen-units = 1 -strip = true - -[dependencies] -axum = "0.7" -sqlx = { version = 
"0.7", features = ["postgres", "runtime-tokio", "macros"] } -redis = { version = "0.25", features = ["tokio-comp"] } -tokio = { version = "1.0", features = ["full"] } -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" -schemars = "0.8" -tracing = "0.1" -tracing-subscriber = "0.3" - -[dev-dependencies] -tower = "0.4" -name = "backend" -version = "0.1.0" -edition = "2021" - -[[bin]] -name = "backup" -path = "src/bin/backup.rs" -[features] -testutils = ["mockall"] - -[dependencies] -axum = "0.7" -tokio = { version = "1", features = ["full"] } -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" -sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "macros", "chrono", "uuid"] } -redis = { version = "0.24", features = ["tokio-comp", "json"] } -sqlx = { version = "0.8", features = ["runtime-tokio-rustls", "postgres", "chrono", "uuid", "json"] } -redis = { version = "0.27", features = ["tokio-comp", "json"] } -tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter"] } anyhow = "1.0" -thiserror = "1.0" -chrono = { version = "0.4", features = ["serde"] } -uuid = { version = "1.0", features = ["v4", "serde"] } -tower = { version = "0.5", features = ["util"] } -tower-http = { version = "0.5", features = ["trace"] } - -[dev-dependencies] -tower = { version = "0.5", features = ["util"] } -hyper = { version = "1.0", features = ["full"] } -mime = "0.3" -tokio = { version = "1", features = ["full", "test-util"] } arc-swap = "1.7" async-trait = "0.1" -dotenvy = "0.15" +futures-util = { version = "0.3", default-features = false, features = ["std"] } +base64 = "0.22" +validator = { version = "0.19", features = ["derive"] } +rust_decimal = { version = "1.35", features = ["serde"] } + +# Stellar +stellar-xdr = { version = "21.0", features = ["std"] } + +# API documentation utoipa = { version = "5.0", features = ["axum_extras", "chrono", "uuid"] } utoipa-swagger-ui = { version = "8.0", features = 
["axum"] } + +# Background jobs apalis = { version = "0.6" } apalis-redis = "0.6" -rust_decimal = { version = "1.35", features = ["serde"] } -stellar-xdr = { version = "21.0", features = ["std"] } -base64 = "0.22" -validator = { version = "0.19", features = ["derive"] } -tower-http = { version = "0.5", features = ["cors", "trace"] } -tower_governor = "0.4" + +# Optional: mock support for tests mockall = { version = "0.13", optional = true } -opentelemetry = { version = "0.31", features = ["trace"] } -opentelemetry_sdk = { version = "0.31", features = ["trace", "rt-tokio"] } -opentelemetry-otlp = { version = "0.31", default-features = false, features = ["trace", "http-proto", "reqwest-client"] } -tracing-opentelemetry = { version = "0.32", default-features = false } -futures-util = { version = "0.3", default-features = false, features = ["std"] } -# OpenTelemetry and tracing instrumentation -opentelemetry = { version = "0.24", features = ["trace", "metrics"] } -opentelemetry-otlp = { version = "0.17", features = ["trace", "grpc-tonic"] } -opentelemetry-semantic-conventions = "0.16" -opentelemetry_sdk = { version = "0.24", features = ["trace", "rt-tokio"] } -tracing-opentelemetry = "0.25" -tonic = "0.12" [dev-dependencies] tower = { version = "0.4", features = ["util"] } tower-http = { version = "0.5", features = ["trace"] } -rust_decimal_macros = "1.35" -criterion = { version = "0.5", features = ["async_tokio"] } hyper = { version = "1.0", features = ["full"] } mime = "0.3" +tokio = { version = "1", features = ["full", "test-util"] } +reqwest = { version = "0.12", features = ["json"] } +tokio-test = "0.4" +testcontainers = "0.16" +wiremock = "0.6" mockall = "0.13" -mockall = "0.12" +rust_decimal_macros = "1.35" +criterion = { version = "0.5", features = ["async_tokio"] } + +[profile.release] +opt-level = 3 +lto = true +codegen-units = 1 +strip = true [[bench]] name = "performance" @@ -167,4 +109,3 @@ harness = false [[bench]] name = "dashboard_bench" harness = false 
- diff --git a/backend/src/api/handlers/dashboard.rs b/backend/src/api/handlers/dashboard.rs index 4f39154..1024d0e 100644 --- a/backend/src/api/handlers/dashboard.rs +++ b/backend/src/api/handlers/dashboard.rs @@ -1,226 +1,3 @@ -use axum::{Json, response::IntoResponse, extract::{State, Path}}; -use serde::{Serialize, Deserialize}; -use tracing::{info, instrument, error}; -use chrono::{DateTime, Utc}; -use crate::error::AppError; -use utoipa::ToSchema; -use std::sync::Arc; -use sqlx::PgPool; -use redis::AsyncCommands; - -/// Shared application state for dashboard handlers -pub struct DashboardState { - pub db: PgPool, - pub redis: redis::aio::ConnectionManager, -} - -#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] -pub struct DashboardMetrics { - /// Total number of active contracts - pub total_contracts: i64, - /// Total number of transactions processed - pub total_transactions: i64, - /// Average transaction processing time in milliseconds - pub avg_processing_time_ms: f64, - /// Number of failed transactions in the last 24 hours - pub failed_transactions_24h: i64, - /// Timestamp of the metrics snapshot - pub timestamp: DateTime, -} - -#[derive(Debug, Serialize, Deserialize, ToSchema)] -pub struct ContractStats { - /// Contract identifier - pub contract_id: String, - /// Number of invocations - pub invocation_count: i64, - /// Last invocation timestamp - pub last_invoked: Option>, - /// Average gas cost - pub avg_gas_cost: f64, -} - -/// Retrieves aggregated dashboard metrics with Redis caching -#[utoipa::path( - get, - path = "/api/v1/dashboard/metrics", - responses( - (status = 200, description = "Dashboard metrics retrieved successfully", body = DashboardMetrics), - (status = 500, description = "Internal server error") - ), - tag = "dashboard" -)] -#[instrument(skip(state))] -pub async fn get_dashboard_metrics( - State(state): State>, -) -> Result { - info!("Fetching dashboard metrics"); - - // Try cache first - let cache_key = "dashboard:metrics"; 
- let mut redis_conn = state.redis.clone(); - - if let Ok(cached) = redis_conn.get::<_, String>(cache_key).await { - if let Ok(metrics) = serde_json::from_str::(&cached) { - info!("Returning cached dashboard metrics"); - return Ok(Json(metrics)); - } - } - - // Fetch from database - let total_contracts = sqlx::query_scalar::<_, i64>( - "SELECT COUNT(*) FROM contracts" - ) - .fetch_optional(&state.db) - .await? - .unwrap_or(0); - - let total_transactions = sqlx::query_scalar::<_, i64>( - "SELECT COUNT(*) FROM transactions" - ) - .fetch_optional(&state.db) - .await? - .unwrap_or(0); - - let avg_processing_time = sqlx::query_scalar::<_, Option>( - "SELECT AVG(processing_time_ms) FROM transactions WHERE processing_time_ms IS NOT NULL" - ) - .fetch_one(&state.db) - .await? - .unwrap_or(0.0); - - let failed_24h = sqlx::query_scalar::<_, i64>( - "SELECT COUNT(*) FROM transactions - WHERE status = 'failed' AND created_at > NOW() - INTERVAL '24 hours'" - ) - .fetch_optional(&state.db) - .await? - .unwrap_or(0); - - let metrics = DashboardMetrics { - total_contracts, - total_transactions, - avg_processing_time_ms: avg_processing_time, - failed_transactions_24h: failed_24h, - timestamp: Utc::now(), - }; - - // Cache for 60 seconds - if let Ok(json) = serde_json::to_string(&metrics) { - let _: Result<(), _> = redis_conn.set_ex(cache_key, json, 60).await; - } - - info!( - contracts = metrics.total_contracts, - transactions = metrics.total_transactions, - "Dashboard metrics retrieved" - ); - - Ok(Json(metrics)) -} - -/// Retrieves statistics for a specific contract -#[utoipa::path( - get, - path = "/api/v1/dashboard/contracts/{contract_id}/stats", - params( - ("contract_id" = String, Path, description = "Contract identifier") - ), - responses( - (status = 200, description = "Contract statistics retrieved", body = ContractStats), - (status = 404, description = "Contract not found"), - (status = 500, description = "Internal server error") - ), - tag = "dashboard" -)] 
-#[instrument(skip(state))] -pub async fn get_contract_stats( - State(state): State>, - Path(contract_id): Path, -) -> Result { - info!(contract_id = %contract_id, "Fetching contract statistics"); - - let cache_key = format!("dashboard:contract:{}:stats", contract_id); - let mut redis_conn = state.redis.clone(); - - // Check cache - if let Ok(cached) = redis_conn.get::<_, String>(&cache_key).await { - if let Ok(stats) = serde_json::from_str::(&cached) { - return Ok(Json(stats)); - } - } - - // Query database - let result = sqlx::query!( - r#" - SELECT - COUNT(*) as "invocation_count!", - MAX(created_at) as last_invoked, - AVG(gas_cost) as avg_gas_cost - FROM transactions - WHERE contract_id = $1 - "#, - contract_id - ) - .fetch_optional(&state.db) - .await?; - - let stats = match result { - Some(row) if row.invocation_count > 0 => ContractStats { - contract_id: contract_id.clone(), - invocation_count: row.invocation_count, - last_invoked: row.last_invoked, - avg_gas_cost: row.avg_gas_cost.unwrap_or(0.0), - }, - _ => { - error!(contract_id = %contract_id, "Contract not found"); - return Err(AppError::NotFound(format!("Contract {} not found", contract_id))); - } - }; - - // Cache for 30 seconds - if let Ok(json) = serde_json::to_string(&stats) { - let _: Result<(), _> = redis_conn.set_ex(&cache_key, json, 30).await; - } - - Ok(Json(stats)) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_dashboard_metrics_serialization() { - let metrics = DashboardMetrics { - total_contracts: 100, - total_transactions: 5000, - avg_processing_time_ms: 125.5, - failed_transactions_24h: 3, - timestamp: Utc::now(), - }; - - let json = serde_json::to_string(&metrics).unwrap(); - let deserialized: DashboardMetrics = serde_json::from_str(&json).unwrap(); - - assert_eq!(deserialized.total_contracts, 100); - assert_eq!(deserialized.total_transactions, 5000); - } - - #[test] - fn test_contract_stats_serialization() { - let stats = ContractStats { - contract_id: 
"test_contract_123".to_string(), - invocation_count: 42, - last_invoked: Some(Utc::now()), - avg_gas_cost: 1500.75, - }; - - let json = serde_json::to_string(&stats).unwrap(); - let deserialized: ContractStats = serde_json::from_str(&json).unwrap(); - - assert_eq!(deserialized.contract_id, "test_contract_123"); - assert_eq!(deserialized.invocation_count, 42); //! Dashboard data API handler. //! //! Provides a single `GET /api/dashboard` endpoint that aggregates system diff --git a/backend/src/api/handlers/profiling.rs b/backend/src/api/handlers/profiling.rs index a518fba..6686f49 100644 --- a/backend/src/api/handlers/profiling.rs +++ b/backend/src/api/handlers/profiling.rs @@ -1,65 +1,88 @@ -use axum::extract::State; -use axum::{Json, response::IntoResponse, extract::State}; -use serde::{Serialize, Deserialize}; -use tracing::{info, instrument, info_span}; -use chrono::{DateTime, Utc}; -use crate::error::AppError; -use crate::services::{error_recovery::ErrorManager, sys_metrics::MetricsExporter}; +//! Performance profiling and system health API handlers. +//! +//! Provides endpoints for monitoring application health, collecting system +//! metrics, and triggering profiling runs. 
+ use axum::{extract::State, response::IntoResponse, Json}; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use std::sync::Arc; use tracing::{info, instrument}; use utoipa::ToSchema; + +use crate::api::contracts::{ + ApiResponse, ProfileTriggerRequest, ProfileTriggerResponse, SystemStatus, ValidatedJson, +}; +use crate::config::reload::ConfigManager; +use crate::error::AppError; use crate::services::{ - sys_metrics::MetricsExporter, error_recovery::ErrorManager, log_aggregator::LogAggregator, + sys_metrics::MetricsExporter, tracing::TracingService, }; -use crate::config::reload::ConfigManager; -use crate::api::contracts::{ApiResponse, SystemStatus, ProfileTriggerRequest, ProfileTriggerResponse, ValidatedJson}; -use sqlx::PgPool; use redis::Client as RedisClient; +// --------------------------------------------------------------------------- +// Shared application state +// --------------------------------------------------------------------------- + +/// Shared application state passed to profiling and status handlers. pub struct AppState { + /// Optional PostgreSQL connection pool (None in tests). pub db: Option, + /// System metrics exporter. pub metrics_exporter: Arc, + /// Error recovery manager. pub error_manager: Arc, + /// Hot-reloadable configuration manager. pub config_manager: Arc, + /// Async log aggregation pipeline. pub log_aggregator: Arc, + /// Redis client for caching. pub redis: RedisClient, } +// --------------------------------------------------------------------------- +// Response types +// --------------------------------------------------------------------------- + +/// Detailed performance metrics report. #[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] pub struct MetricsReport { - /// Total system uptime in seconds + /// Total system uptime in seconds. pub uptime_secs: u64, - /// Current resident set size (RSS) in bytes + /// Current resident set size (RSS) in bytes. 
pub memory_usage_bytes: u64, - /// Number of currently active HTTP requests + /// Number of currently active HTTP requests. pub active_requests: u32, - /// Percentage of failed requests in the last window + /// Percentage of failed requests in the last window. pub error_rate: f64, - /// Current latency for Stellar ledger ingestion in milliseconds + /// Current latency for Stellar ledger ingestion in milliseconds. pub ledger_ingestion_latency_ms: u32, } +/// System health check response. #[derive(Debug, Serialize, ToSchema)] pub struct HealthResponse { - /// Overall health status (e.g., 'healthy' or 'degraded') + /// Overall health status (e.g., `"healthy"` or `"degraded"`). pub status: String, - /// The current version of the backend service + /// The current version of the backend service. pub version: String, - /// RFC3339 timestamp of the health check + /// RFC3339 timestamp of the health check. pub timestamp: DateTime, - /// Connectivity status to the PostgreSQL database + /// Connectivity status to the PostgreSQL database. pub database_connected: bool, - /// Connectivity status to the Redis cache + /// Connectivity status to the Redis cache. pub redis_connected: bool, } -/// Handler for retrieving detailed performance metrics. +// --------------------------------------------------------------------------- +// Handlers +// --------------------------------------------------------------------------- + +/// `GET /api/v1/profiling/metrics` — retrieve detailed performance metrics. +/// /// Optimized for consumption by monitoring tools like Grafana. 
#[utoipa::path( get, @@ -74,21 +97,13 @@ pub struct HealthResponse { pub async fn get_metrics( State(state): State>, ) -> Result { - let span = info_span!("metrics.collection"); - let _enter = span.enter(); - info!("Collecting performance metrics"); - let sys_metrics = state.metrics_exporter.get_metrics().await; - - - // Instrument the metrics exporter call let metrics_span = TracingService::service_method_span("MetricsExporter", "get_metrics"); let _metrics_enter = metrics_span.enter(); - let sys_metrics = state.metrics_exporter.get_metrics().await; drop(_metrics_enter); - + let report = MetricsReport { uptime_secs: sys_metrics.uptime, memory_usage_bytes: sys_metrics.memory_usage, @@ -100,14 +115,14 @@ pub async fn get_metrics( info!( uptime = sys_metrics.uptime, memory = sys_metrics.memory_usage, - active_requests = 12, "Metrics collected successfully" ); Ok(Json(report)) } -/// Handler for system health checks. +/// `GET /api/v1/profiling/health` — system health check. +/// /// Performs actual pings to downstream services. 
#[utoipa::path( get, @@ -122,35 +137,29 @@ pub async fn get_metrics( pub async fn get_health( State(state): State>, ) -> Result { - let span = info_span!("health.check"); - let _enter = span.enter(); - info!("Performing system health check"); - - // Check database connectivity with tracing - let db_span = TracingService::db_query_span( - "SELECT 1", - "postgres", - "PING" - ); - let _db_enter = db_span.enter(); - - let db_healthy = sqlx::query("SELECT 1") - .fetch_optional(&state.db) - .await - .map(|result| result.is_some()) - .unwrap_or_else(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - false - }); - drop(_db_enter); - + let db_healthy = if let Some(ref pool) = state.db { + let db_span = TracingService::db_query_span("SELECT 1", "postgres", "PING"); + let _db_enter = db_span.enter(); + let result = sqlx::query("SELECT 1") + .fetch_optional(pool) + .await + .map(|r| r.is_some()) + .unwrap_or_else(|e| { + TracingService::record_error(&db_span, &e.to_string(), "database"); + false + }); + drop(_db_enter); + result + } else { + false + }; + let response = HealthResponse { status: if db_healthy { "healthy" } else { "degraded" }.to_string(), version: env!("CARGO_PKG_VERSION").to_string(), timestamp: Utc::now(), - database_connected: true, database_connected: db_healthy, redis_connected: true, }; @@ -164,47 +173,31 @@ pub async fn get_health( Ok(Json(response)) } -/// Handler for Prometheus-compatible metrics. +/// `GET /api/v1/profiling/prometheus` — Prometheus-compatible metrics. 
#[instrument(skip_all, fields(http.method = "GET", http.route = "/api/v1/profiling/prometheus"))] pub async fn get_prometheus_metrics() -> impl IntoResponse { - let span = info_span!("prometheus.metrics.export"); - let _enter = span.enter(); - info!("Exporting Prometheus-format metrics"); - "# HELP backend_requests_total Total number of requests\n\ - # TYPE backend_requests_total counter\n\ - backend_requests_total 1024\n\ - # HELP backend_ledger_latency_ms Current ledger ingestion latency\n\ - # TYPE backend_ledger_latency_ms gauge\n\ - backend_ledger_latency_ms 120\n" - .to_string() -} - -pub async fn get_system_status(State(state): State>) -> impl IntoResponse { # TYPE backend_requests_total counter\n\ backend_requests_total 1024\n\ # HELP backend_ledger_latency_ms Current ledger ingestion latency\n\ # TYPE backend_ledger_latency_ms gauge\n\ - backend_ledger_latency_ms 120\n".to_string() + backend_ledger_latency_ms 120\n" + .to_string() } -/// Handler for detailed system status +/// `GET /api/status` — detailed system status. 
#[instrument(skip_all, fields(http.method = "GET", http.route = "/api/status"))] pub async fn get_system_status( State(state): State>, ) -> ApiResponse { -) -> impl IntoResponse { - let span = info_span!("system.status"); - let _enter = span.enter(); - info!("Retrieving system status"); - + let metrics_span = TracingService::service_method_span("MetricsExporter", "get_metrics"); let _metrics_enter = metrics_span.enter(); let metrics = state.metrics_exporter.get_metrics().await; drop(_metrics_enter); - + let recovery_span = TracingService::service_method_span("ErrorManager", "get_active_tasks"); let _recovery_enter = recovery_span.enter(); let recovery_tasks = state.error_manager.get_active_tasks().await; @@ -216,42 +209,36 @@ pub async fn get_system_status( memory_used_bytes: metrics.memory_usage, active_recovery_tasks: recovery_tasks.len(), }) - Json(serde_json::json!({ - "status": "healthy", - "metrics": metrics, - "active_recovery_tasks": recovery_tasks, - })) } -pub async fn trigger_profile_collection(State(_state): State>) -> impl IntoResponse { -/// Handler to trigger profile collection (CPU, memory profiling) +/// `POST /api/profile` — trigger a profiling collection run. +#[utoipa::path( + post, + path = "/api/profile", + responses( + (status = 200, description = "Profiling collection triggered"), + (status = 400, description = "Invalid request parameters") + ), + tag = "profiling" +)] #[instrument(skip_all, fields(http.method = "POST", http.route = "/api/profile"))] pub async fn trigger_profile_collection( State(_state): State>, ValidatedJson(payload): ValidatedJson, ) -> ApiResponse { - // In a real implementation, this would trigger a CPU/Memory profile - // using the provided payload (duration, sample rate, etc.) 
- - ApiResponse::new(ProfileTriggerResponse { - profile_id: uuid::Uuid::new_v4(), - message: format!("Profiling collection triggered for label: {}", payload.label), - estimated_completion: chrono::Utc::now() + chrono::Duration::seconds(payload.duration_secs as i64), - }) -) -> impl IntoResponse { - let span = info_span!("profiling.collection"); - let _enter = span.enter(); - - let profile_id = uuid::Uuid::new_v4().to_string(); - + let profile_id = uuid::Uuid::new_v4(); + info!( profile_id = %profile_id, + label = %payload.label, + duration_secs = payload.duration_secs, "Profiling collection triggered" ); - - // In a real implementation, this would trigger a CPU/Memory profile - Json(serde_json::json!({ - "message": "Profiling collection triggered", - "profile_id": profile_id, - })) + + ApiResponse::new(ProfileTriggerResponse { + profile_id, + message: format!("Profiling collection triggered for label: {}", payload.label), + estimated_completion: chrono::Utc::now() + + chrono::Duration::seconds(payload.duration_secs as i64), + }) } diff --git a/backend/src/config/mod.rs b/backend/src/config/mod.rs index c9a0299..252a4cf 100644 --- a/backend/src/config/mod.rs +++ b/backend/src/config/mod.rs @@ -1,8 +1,12 @@ +//! Application configuration. + pub mod reload; use serde::{Deserialize, Serialize}; +use std::env; -#[derive(Debug, Clone, Serialize, Deserialize)] +/// Environment-based application configuration. +#[derive(Debug, Deserialize, Serialize, Clone)] pub struct AppConfig { pub server: ServerConfig, pub database: DatabaseConfig, @@ -43,14 +47,10 @@ impl Default for AppConfig { }, log_level: "info".to_string(), } -//! Application configuration. - -pub mod reload; - -use serde::Deserialize; -use std::env; + } +} -/// Environment-based application configuration. +/// Simple environment-based config loader (used by main.rs). 
#[derive(Debug, Deserialize, Clone)] pub struct Config { pub database_url: String, @@ -69,7 +69,9 @@ impl Config { database_url: env::var("DATABASE_URL") .unwrap_or_else(|_| "postgres://postgres:password@localhost:5432/backend".into()), redis_url: env::var("REDIS_URL").unwrap_or_else(|_| "redis://localhost:6379".into()), - server_port: env::var("PORT").unwrap_or_else(|_| "3000".into()).parse()?, + server_port: env::var("PORT") + .unwrap_or_else(|_| "3000".into()) + .parse()?, environment: env::var("APP_ENV").unwrap_or_else(|_| "development".into()), log_level: env::var("LOG_LEVEL").unwrap_or_else(|_| "info".into()), }) diff --git a/backend/src/config/reload.rs b/backend/src/config/reload.rs index b56caa6..c00a0fe 100644 --- a/backend/src/config/reload.rs +++ b/backend/src/config/reload.rs @@ -1,17 +1,39 @@ +//! Configuration hot-reload. +//! +//! This module provides two complementary configuration management types: +//! +//! - [`ConfigManager`] — a simple `ArcSwap`-backed manager used by the +//! profiling handlers. Supports file-based and patch-based reloads. +//! - [`ConfigWatcher`] — a richer watcher that subscribes to a Redis pub/sub +//! channel and atomically swaps the live config on every reload signal. +//! +//! # Redis protocol (ConfigWatcher) +//! +//! ```text +//! SET config:current '{"log_level":"info","max_connections":50,...}' +//! PUBLISH config:reload "reload" +//! ``` + +#![allow(dead_code)] + use std::sync::Arc; + use arc_swap::ArcSwap; -use axum::{ - extract::State, - http::StatusCode, - response::IntoResponse, - Json, -}; +use axum::{extract::State, http::StatusCode, response::IntoResponse, Json}; +use redis::{AsyncCommands, Client as RedisClient}; +use serde::{Deserialize, Serialize}; use serde_json::Value; use thiserror::Error; -use tracing::{info, warn, instrument}; +use tokio::sync::{watch, RwLock}; +use tracing::{error, info, instrument, warn}; + use crate::config::AppConfig; -/// Errors that can occur during configuration reload. 
+// --------------------------------------------------------------------------- +// ConfigReloadError +// --------------------------------------------------------------------------- + +/// Errors that can occur during configuration reload (ConfigManager). #[derive(Debug, Error)] pub enum ConfigReloadError { #[error("IO error: {0}")] @@ -45,34 +67,35 @@ impl IntoResponse for ConfigReloadError { } } -/// Manages hot-reloadable application configuration. +// --------------------------------------------------------------------------- +// ConfigManager (ArcSwap-based, used by profiling handlers) +// --------------------------------------------------------------------------- + +/// Manages hot-reloadable application configuration via `ArcSwap`. pub struct ConfigManager { current_config: ArcSwap, } impl ConfigManager { - /// Create a new ConfigManager with the default configuration. + /// Create a new `ConfigManager` with the given initial configuration. pub fn new(initial_config: AppConfig) -> Self { Self { current_config: ArcSwap::from(Arc::new(initial_config)), } } - /// Get a reference to the current configuration. + /// Return a snapshot of the current configuration. pub fn load(&self) -> Arc { self.current_config.load_full() } - /// Reload the configuration from a file or environment. - /// In this implementation, we simulate loading from a local `config.json` file. + /// Reload configuration from `config.json` in the current directory. #[instrument(skip(self))] pub async fn reload(&self) -> Result<(), ConfigReloadError> { info!("Starting configuration reload..."); - // In a real scenario, we would load from a file or external service. - // For this task, we'll look for `config.json` in the current directory. 
let config_path = "config.json"; - + if !std::path::Path::new(config_path).exists() { warn!("config.json not found, skipping reload"); return Err(ConfigReloadError::Io(std::io::Error::new( @@ -84,37 +107,37 @@ impl ConfigManager { let content = tokio::fs::read_to_string(config_path).await?; let new_config: AppConfig = serde_json::from_str(&content)?; - // Validate config (e.g., check database URL format) if new_config.database.url.is_empty() { - return Err(ConfigReloadError::Invalid("Database URL cannot be empty".to_string())); + return Err(ConfigReloadError::Invalid( + "Database URL cannot be empty".to_string(), + )); } - // Update the global configuration self.current_config.store(Arc::new(new_config)); - info!("Configuration successfully reloaded"); Ok(()) } - /// Update configuration from a JSON value (e.g., from an API request). + /// Apply a JSON patch to the current configuration. #[instrument(skip(self, patch))] pub fn update_from_patch(&self, patch: Value) -> Result<(), ConfigReloadError> { let current = self.load(); let mut current_json = serde_json::to_value(&*current)?; - - // Deep merge patch into current configuration + if let Some(patch_obj) = patch.as_object() { if let Some(current_obj) = current_json.as_object_mut() { for (k, v) in patch_obj { - if v.is_object() && current_obj.contains_key(k) && current_obj[k].is_object() { - // Merge nested objects + if v.is_object() + && current_obj.contains_key(k) + && current_obj[k].is_object() + { let sub_patch = v.as_object().unwrap(); - let sub_current = current_obj.get_mut(k).unwrap().as_object_mut().unwrap(); + let sub_current = + current_obj.get_mut(k).unwrap().as_object_mut().unwrap(); for (sk, sv) in sub_patch { sub_current.insert(sk.clone(), sv.clone()); } } else { - // Direct replacement for non-objects or new keys current_obj.insert(k.clone(), v.clone()); } } @@ -123,90 +146,42 @@ impl ConfigManager { let new_config: AppConfig = serde_json::from_value(current_json)?; 
self.current_config.store(Arc::new(new_config)); - info!("Configuration updated via patch"); Ok(()) } } -/// Axum handler to trigger a configuration reload. +// --------------------------------------------------------------------------- +// Axum handlers for ConfigManager +// --------------------------------------------------------------------------- + +/// `POST /api/config/reload` — trigger a configuration reload from disk. pub async fn handle_reload( State(state): State>, -) -> Result { - state.config_manager.reload().await?; - Ok((StatusCode::OK, Json(serde_json::json!({ "status": "reloaded" })))) +) -> impl IntoResponse { + match state.config_manager.reload().await { + Ok(()) => ( + StatusCode::OK, + Json(serde_json::json!({ "status": "reloaded" })), + ) + .into_response(), + Err(e) => e.into_response(), + } } -/// Axum handler to get the current configuration (sanitized). +/// `GET /api/config` — return the current configuration (sanitized). pub async fn handle_get_config( State(state): State>, ) -> impl IntoResponse { let config = state.config_manager.load(); - // In a real app, we would sanitize sensitive fields like DB passwords Json(config) -//! Configuration hot-reload. -//! -//! This module provides [`ConfigWatcher`], which holds the live [`AppConfig`] -//! behind an `Arc>` and can reload it at any time — either -//! programmatically via [`ConfigWatcher::reload`] or automatically by -//! subscribing to a Redis pub/sub channel with [`ConfigWatcher::watch`]. -//! -//! When a reload message arrives on the Redis channel the watcher fetches the -//! new configuration JSON from a Redis key, deserialises it, and atomically -//! swaps the in-memory value. All readers that hold a clone of the -//! [`ConfigHandle`] see the new values on their next read without any restart. -//! -//! # Example -//! -//! ```rust,no_run -//! use backend::config::reload::{AppConfig, ConfigWatcher}; -//! -//! # async fn example() -> anyhow::Result<()> { -//! 
let watcher = ConfigWatcher::new(AppConfig::default()); -//! let handle = watcher.handle(); -//! -//! // Read the current config -//! let cfg = handle.get().await; -//! println!("log level: {}", cfg.log_level); -//! -//! // Trigger a manual reload -//! watcher.reload(AppConfig { -//! log_level: "info".to_string(), -//! ..AppConfig::default() -//! }).await; -//! # Ok(()) -//! # } -//! ``` -//! -//! # Redis protocol -//! -//! Publish any non-empty string to `config:reload` to trigger a reload: -//! -//! ```text -//! PUBLISH config:reload "" -//! SET config:current '{"log_level":"info","max_connections":50,...}' -//! PUBLISH config:reload "reload" -//! ``` -//! -//! The watcher reads `config:current` from Redis after every message on -//! `config:reload`. If the key is absent or unparseable the existing config -//! is kept and an error is logged. - -#![allow(dead_code)] - -use std::sync::Arc; - -use redis::{AsyncCommands, Client as RedisClient}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use tokio::sync::{watch, RwLock}; -use tracing::{error, info, warn}; +} // --------------------------------------------------------------------------- -// Error type +// ReloadError (ConfigWatcher) // --------------------------------------------------------------------------- -/// Errors that can occur during configuration reload. +/// Errors that can occur during ConfigWatcher reload. #[derive(Debug, Error)] pub enum ReloadError { /// A Redis error occurred. @@ -223,15 +198,12 @@ pub enum ReloadError { } // --------------------------------------------------------------------------- -// AppConfig +// HotAppConfig (used by ConfigWatcher) // --------------------------------------------------------------------------- /// Live application configuration that can be hot-reloaded at runtime. -/// -/// All fields have sensible defaults so the application starts without any -/// external configuration source. 
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct AppConfig { +pub struct HotAppConfig { /// Tracing / log filter directive (e.g. `"backend=debug"`). pub log_level: String, /// Maximum number of database connections in the pool. @@ -240,11 +212,11 @@ pub struct AppConfig { pub request_timeout_secs: u64, /// Whether the maintenance mode banner is shown. pub maintenance_mode: bool, - /// Redis key that stores the serialised [`AppConfig`] JSON. + /// Redis key that stores the serialised [`HotAppConfig`] JSON. pub redis_config_key: String, } -impl Default for AppConfig { +impl Default for HotAppConfig { fn default() -> Self { Self { log_level: "backend=debug,tower_http=debug".to_string(), @@ -257,30 +229,24 @@ impl Default for AppConfig { } // --------------------------------------------------------------------------- -// ConfigHandle — cheap clone, shared reader +// ConfigHandle // --------------------------------------------------------------------------- /// A cheap-to-clone handle to the live configuration. -/// -/// Obtain one via [`ConfigWatcher::handle`] and share it across the -/// application. Reads never block writers for more than a single lock -/// acquisition. #[derive(Clone)] pub struct ConfigHandle { - inner: Arc>, - /// Notified whenever the config is reloaded. + inner: Arc>, changed: watch::Receiver<()>, } impl ConfigHandle { /// Return a snapshot of the current configuration. - pub async fn get(&self) -> AppConfig { + pub async fn get(&self) -> HotAppConfig { self.inner.read().await.clone() } /// Wait until the configuration changes, then return the new snapshot. - pub async fn wait_for_change(&mut self) -> AppConfig { - // `changed()` resolves immediately if there is an unseen change. 
+ pub async fn wait_for_change(&mut self) -> HotAppConfig { let _ = self.changed.changed().await; self.get().await } @@ -290,16 +256,16 @@ impl ConfigHandle { // ConfigWatcher // --------------------------------------------------------------------------- -/// Owns the live [`AppConfig`] and drives hot-reload. +/// Owns the live [`HotAppConfig`] and drives hot-reload via Redis pub/sub. pub struct ConfigWatcher { - inner: Arc>, + inner: Arc>, notify_tx: watch::Sender<()>, notify_rx: watch::Receiver<()>, } impl ConfigWatcher { /// Create a new watcher with the given initial configuration. - pub fn new(initial: AppConfig) -> Self { + pub fn new(initial: HotAppConfig) -> Self { let (tx, rx) = watch::channel(()); Self { inner: Arc::new(RwLock::new(initial)), @@ -317,7 +283,7 @@ impl ConfigWatcher { } /// Atomically replace the current configuration and notify all handles. - pub async fn reload(&self, new_config: AppConfig) { + pub async fn reload(&self, new_config: HotAppConfig) { let old = { let mut guard = self.inner.write().await; let old = guard.clone(); @@ -331,7 +297,6 @@ impl ConfigWatcher { maintenance_mode = new_config.maintenance_mode, "Configuration reloaded" ); - // Ignore send error — it only fails when all receivers are dropped. let _ = self.notify_tx.send(()); } else { info!("Configuration reload requested but values unchanged"); @@ -339,34 +304,21 @@ impl ConfigWatcher { } /// Fetch the current configuration from Redis and apply it. - /// - /// Reads the JSON value stored at `AppConfig::redis_config_key` (default - /// `config:current`), deserialises it, and calls [`Self::reload`]. - /// - /// # Errors - /// Returns [`ReloadError`] if the Redis key is absent, the connection - /// fails, or the JSON cannot be deserialised. 
pub async fn reload_from_redis(&self, redis: &RedisClient) -> Result<(), ReloadError> { let key = self.inner.read().await.redis_config_key.clone(); let mut conn = redis.get_multiplexed_async_connection().await?; let raw: Option = conn.get(&key).await?; let json = raw.ok_or(ReloadError::NotFound)?; - let new_config: AppConfig = serde_json::from_str(&json)?; + let new_config: HotAppConfig = serde_json::from_str(&json)?; self.reload(new_config).await; Ok(()) } - /// Spawn a background task that subscribes to `config:reload` on Redis - /// and calls [`Self::reload_from_redis`] on every message. - /// - /// The task runs until the Redis connection is lost or the process exits. - /// Connection errors are logged and the task exits — callers may restart - /// it if desired. + /// Spawn a background task that subscribes to `config:reload` on Redis. pub fn watch(self: Arc, redis: RedisClient) -> tokio::task::JoinHandle<()> { tokio::spawn(async move { const CHANNEL: &str = "config:reload"; - // get_async_connection is the only way to obtain a PubSub-capable connection. 
#[allow(deprecated)] let conn = match redis.get_async_connection().await { Ok(c) => c, @@ -382,10 +334,7 @@ impl ConfigWatcher { return; } - info!( - channel = CHANNEL, - "Config watcher: listening for reload signals" - ); + info!(channel = CHANNEL, "Config watcher: listening for reload signals"); let mut stream = pubsub.into_on_message(); use futures_util::StreamExt; @@ -396,7 +345,10 @@ impl ConfigWatcher { let payload: String = msg.get_payload().unwrap_or_default(); info!(payload = %payload, "Config reload signal received"); if let Err(e) = self.reload_from_redis(&redis).await { - warn!(error = %e, "Config reload from Redis failed; keeping current config"); + warn!( + error = %e, + "Config reload from Redis failed; keeping current config" + ); } } None => { @@ -418,14 +370,12 @@ mod tests { use super::*; fn default_watcher() -> ConfigWatcher { - ConfigWatcher::new(AppConfig::default()) + ConfigWatcher::new(HotAppConfig::default()) } - // --- AppConfig --- - #[test] fn test_default_config_values() { - let cfg = AppConfig::default(); + let cfg = HotAppConfig::default(); assert_eq!(cfg.max_connections, 10); assert_eq!(cfg.request_timeout_secs, 30); assert!(!cfg.maintenance_mode); @@ -435,36 +385,23 @@ mod tests { #[test] fn test_config_serialisation_roundtrip() { - let cfg = AppConfig::default(); + let cfg = HotAppConfig::default(); let json = serde_json::to_string(&cfg).unwrap(); - let back: AppConfig = serde_json::from_str(&json).unwrap(); + let back: HotAppConfig = serde_json::from_str(&json).unwrap(); assert_eq!(cfg, back); } - #[test] - fn test_config_partial_deserialisation() { - // Only some fields present — rest should use serde defaults. 
- let json = r#"{"log_level":"info","max_connections":25,"request_timeout_secs":60,"maintenance_mode":true,"redis_config_key":"config:current"}"#; - let cfg: AppConfig = serde_json::from_str(json).unwrap(); - assert_eq!(cfg.log_level, "info"); - assert_eq!(cfg.max_connections, 25); - assert!(cfg.maintenance_mode); - } - - // --- ConfigWatcher::reload --- - #[tokio::test] async fn test_reload_updates_config() { let watcher = default_watcher(); let handle = watcher.handle(); - let new_cfg = AppConfig { + let new_cfg = HotAppConfig { log_level: "info".to_string(), max_connections: 50, - ..AppConfig::default() + ..HotAppConfig::default() }; watcher.reload(new_cfg.clone()).await; - assert_eq!(handle.get().await, new_cfg); } @@ -472,14 +409,8 @@ mod tests { async fn test_reload_unchanged_does_not_notify() { let watcher = default_watcher(); let mut handle = watcher.handle(); - - // Mark the initial value as seen. handle.changed.borrow_and_update(); - - // Reload with identical config. - watcher.reload(AppConfig::default()).await; - - // `has_changed` should be false — no notification was sent. 
+ watcher.reload(HotAppConfig::default()).await; assert!(!handle.changed.has_changed().unwrap()); } @@ -487,91 +418,42 @@ mod tests { async fn test_reload_changed_notifies_handle() { let watcher = default_watcher(); let mut handle = watcher.handle(); - handle.changed.borrow_and_update(); - watcher - .reload(AppConfig { + .reload(HotAppConfig { maintenance_mode: true, - ..AppConfig::default() + ..HotAppConfig::default() }) .await; - assert!(handle.changed.has_changed().unwrap()); } - // --- ConfigHandle --- - - #[tokio::test] - async fn test_handle_get_returns_current() { - let watcher = default_watcher(); - let handle = watcher.handle(); - assert_eq!(handle.get().await, AppConfig::default()); - } - #[tokio::test] async fn test_multiple_handles_see_same_update() { let watcher = default_watcher(); let h1 = watcher.handle(); let h2 = watcher.handle(); - - let new_cfg = AppConfig { + let new_cfg = HotAppConfig { max_connections: 99, - ..AppConfig::default() + ..HotAppConfig::default() }; - watcher.reload(new_cfg.clone()).await; - + watcher.reload(new_cfg).await; assert_eq!(h1.get().await.max_connections, 99); assert_eq!(h2.get().await.max_connections, 99); } - #[tokio::test] - async fn test_wait_for_change_resolves_after_reload() { - let watcher = Arc::new(default_watcher()); - let mut handle = watcher.handle(); - - // Mark current as seen so wait_for_change actually waits. - handle.changed.borrow_and_update(); - - let watcher2 = Arc::clone(&watcher); - tokio::spawn(async move { - tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; - watcher2 - .reload(AppConfig { - maintenance_mode: true, - ..AppConfig::default() - }) - .await; - }); - - let updated = handle.wait_for_change().await; - assert!(updated.maintenance_mode); - } - - // --- reload_from_redis (no live Redis — error path) --- - #[tokio::test] async fn test_reload_from_redis_connection_error() { let watcher = default_watcher(); - // Port 1 is never open — connection will fail immediately. 
let redis = RedisClient::open("redis://127.0.0.1:1/").unwrap(); let result = watcher.reload_from_redis(&redis).await; assert!(matches!(result, Err(ReloadError::Redis(_)))); - // Config must be unchanged. - assert_eq!(watcher.handle().get().await, AppConfig::default()); + assert_eq!(watcher.handle().get().await, HotAppConfig::default()); } - // --- ReloadError display --- - #[test] fn test_reload_error_not_found_display() { let e = ReloadError::NotFound; assert!(e.to_string().contains("not found")); } - - #[test] - fn test_reload_error_deserialise_display() { - let e = ReloadError::Deserialise(serde_json::from_str::("bad").unwrap_err()); - assert!(!e.to_string().is_empty()); - } } diff --git a/backend/src/error.rs b/backend/src/error.rs index 3781fa6..1c9b420 100644 --- a/backend/src/error.rs +++ b/backend/src/error.rs @@ -8,20 +8,9 @@ use axum::{ response::{IntoResponse, Response}, Json, }; -use serde_json::json; -use thiserror::Error; - -#[derive(Error, Debug)] -pub enum AppError { - #[error("Database error: {0}")] - DatabaseError(#[from] sqlx::Error), - - #[error("Redis error: {0}")] - RedisError(#[from] redis::RedisError), - - #[error("Internal server error")] - InternalServerError, use serde::Serialize; +use thiserror::Error; +use tracing::error; /// Structured error response returned to API clients. #[derive(Debug, Serialize)] @@ -40,13 +29,13 @@ pub struct ErrorResponse { /// # Examples /// /// ```rust,no_run -/// use crucible_backend::error::AppError; +/// use backend::error::AppError; /// /// async fn handler() -> Result { /// Err(AppError::NotFound("Contract not found".into())) /// } /// ``` -#[derive(Debug, thiserror::Error)] +#[derive(Debug, Error)] pub enum AppError { /// 404 — The requested resource was not found. #[error("Not found: {0}")] @@ -83,51 +72,14 @@ pub enum AppError { /// 500 — A catch-all for unexpected internal errors. 
#[error("Internal error: {0}")] InternalError(String), -use serde_json::json; -use thiserror::Error; -use tracing::error; - -#[derive(Debug, Error)] -pub enum AppError { - #[error("Database error: {0}")] - Database(#[from] sqlx::Error), - - #[error("Redis error: {0}")] - Redis(#[from] redis::RedisError), - - #[error("Serialization error: {0}")] - Serialization(#[from] serde_json::Error), - - #[error("Internal server error")] - Internal, - - #[error("Not found: {0}")] - NotFound(String), - - #[error("Validation error: {0}")] - ValidationError(String), - #[error("Invalid request: {0}")] - BadRequest(String), - - #[error("Unauthorized")] - Unauthorized, + /// 502 — A Stellar network operation failed. #[error("Stellar operation failed: {0}")] StellarError(String), } impl IntoResponse for AppError { fn into_response(self) -> Response { - let (status, error_message) = match self { - AppError::DatabaseError(_) => (StatusCode::INTERNAL_SERVER_ERROR, self.to_string()), - AppError::RedisError(_) => (StatusCode::INTERNAL_SERVER_ERROR, self.to_string()), - AppError::InternalServerError => (StatusCode::INTERNAL_SERVER_ERROR, self.to_string()), - AppError::NotFound(msg) => (StatusCode::NOT_FOUND, msg), - AppError::ValidationError(msg) => (StatusCode::BAD_REQUEST, msg), - }; - - let body = Json(json!({ - "error": error_message, let (status, code, message) = match &self { AppError::NotFound(msg) => (StatusCode::NOT_FOUND, "not_found", msg.clone()), AppError::BadRequest(msg) => (StatusCode::BAD_REQUEST, "bad_request", msg.clone()), @@ -140,7 +92,7 @@ impl IntoResponse for AppError { (StatusCode::UNPROCESSABLE_ENTITY, "validation_error", msg.clone()) } AppError::DatabaseError(e) => { - tracing::error!("Database error: {e:?}"); + error!("Database error: {e:?}"); ( StatusCode::INTERNAL_SERVER_ERROR, "database_error", @@ -148,7 +100,7 @@ impl IntoResponse for AppError { ) } AppError::RedisError(e) => { - tracing::error!("Redis error: {e:?}"); + error!("Redis error: {e:?}"); ( 
StatusCode::INTERNAL_SERVER_ERROR, "redis_error", @@ -156,13 +108,21 @@ impl IntoResponse for AppError { ) } AppError::InternalError(msg) => { - tracing::error!("Internal error: {msg}"); + error!("Internal error: {msg}"); ( StatusCode::INTERNAL_SERVER_ERROR, "internal_error", "An internal error occurred".to_string(), ) } + AppError::StellarError(msg) => { + error!("Stellar error: {msg}"); + ( + StatusCode::BAD_GATEWAY, + "stellar_error", + "Failed to communicate with Stellar network".to_string(), + ) + } }; ( @@ -213,45 +173,5 @@ mod tests { let json = serde_json::to_string(&resp).unwrap(); assert!(json.contains("\"code\":\"not_found\"")); assert!(json.contains("\"message\":\"Resource not found\"")); - let (status, message) = match self { - AppError::Database(ref e) => { - error!("Database error occurred: {:?}", e); - ( - StatusCode::INTERNAL_SERVER_ERROR, - "A database error occurred".to_string(), - ) - } - AppError::Redis(ref e) => { - error!("Redis error occurred: {:?}", e); - ( - StatusCode::INTERNAL_SERVER_ERROR, - "A cache error occurred".to_string(), - ) - } - AppError::NotFound(msg) => (StatusCode::NOT_FOUND, msg), - AppError::BadRequest(msg) => (StatusCode::BAD_REQUEST, msg), - AppError::Unauthorized => (StatusCode::UNAUTHORIZED, "Unauthorized access".to_string()), - AppError::StellarError(msg) => { - error!("Stellar error: {}", msg); - ( - StatusCode::BAD_GATEWAY, - "Failed to communicate with Stellar network".to_string(), - ) - } - _ => { - error!("Internal error: {:?}", self); - ( - StatusCode::INTERNAL_SERVER_ERROR, - "An internal server error occurred".to_string(), - ) - } - }; - - let body = Json(json!({ - "error": message, - "code": status.as_u16(), - })); - - (status, body).into_response() } } diff --git a/backend/src/jobs.rs b/backend/src/jobs.rs index 2468029..b2c97a0 100644 --- a/backend/src/jobs.rs +++ b/backend/src/jobs.rs @@ -1,24 +1,23 @@ +//! Background job definitions for the Apalis job queue. 
+ use serde::{Deserialize, Serialize}; use tracing::{info, instrument}; + use crate::services::tracing::TracingService; +/// Job payload for monitoring a Stellar transaction. #[derive(Debug, Serialize, Deserialize)] pub struct TransactionMonitorJob { pub tx_hash: String, } -/// Handler for monitoring Stellar transactions. -/// Returning () since Apalis 0.6 handlers can return (). -pub async fn monitor_transaction(job: TransactionMonitorJob) { +/// Handler for monitoring Stellar transactions via Apalis. #[instrument(skip_all, fields(job.name = "monitor_transaction", job.id = %job.tx_hash))] -pub async fn monitor_transaction( - job: TransactionMonitorJob, -) { +pub async fn monitor_transaction(job: TransactionMonitorJob) { let span = TracingService::job_span("monitor_transaction", &job.tx_hash); let _enter = span.enter(); - + info!("Monitoring Stellar transaction: {}", job.tx_hash); tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; - info!("Transaction monitoring completed: {}", job.tx_hash); } diff --git a/backend/src/lib.rs b/backend/src/lib.rs index bea007e..c66e111 100644 --- a/backend/src/lib.rs +++ b/backend/src/lib.rs @@ -1,14 +1,15 @@ -pub mod utils; +//! Crucible backend library crate. + pub mod api; pub mod config; pub mod db; pub mod error; pub mod jobs; pub mod services; -pub mod config; pub mod telemetry; +pub mod utils; + #[cfg(any(test, feature = "testutils"))] pub mod test_utils; -pub mod utils; pub use error::AppError; diff --git a/backend/src/services/business_metrics.rs b/backend/src/services/business_metrics.rs index 05dd5df..7ba48f3 100644 --- a/backend/src/services/business_metrics.rs +++ b/backend/src/services/business_metrics.rs @@ -1,28 +1,23 @@ +//! Business metrics service for tracking revenue, costs, and operational KPIs. 
+ +#![allow(dead_code)] + use std::collections::HashMap; use std::sync::Arc; + use chrono::{DateTime, Duration, Utc}; use rust_decimal::Decimal; use serde::{Deserialize, Serialize}; use sqlx::PgPool; use tokio::sync::RwLock; -use tracing::{error, info, instrument, warn}; +use tracing::{error, info, instrument}; use uuid::Uuid; use crate::error::AppError; -// ─── Domain Types ──────────────────────────────────────────────────────────── - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct BusinessMetric { - pub id: Uuid, - pub name: String, - pub value: Decimal, - pub unit: String, - pub category: MetricCategory, - pub tags: HashMap, - pub recorded_at: DateTime, - pub source: MetricSource, -} +// --------------------------------------------------------------------------- +// Domain types +// --------------------------------------------------------------------------- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(rename_all = "snake_case")] @@ -35,20 +30,34 @@ pub enum MetricCategory { Custom(String), } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] #[serde(rename_all = "snake_case")] pub enum MetricSource { OnChain, OffChain, + #[default] Database, ExternalApi, Manual, } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricSnapshot { - pub timestamp: DateTime, - pub metrics: Vec, +pub struct BusinessMetric { + pub id: Uuid, + pub name: String, + pub value: Decimal, + pub unit: String, + pub category: MetricCategory, + pub tags: HashMap, + pub recorded_at: DateTime, + pub source: MetricSource, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetricsSummary { + pub total_metrics: i64, + pub categories: HashMap, + pub latest_timestamp: Option>, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -61,14 +70,9 @@ pub struct MetricsQuery { pub offset: Option, } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct 
MetricsSummary { - pub total_metrics: i64, - pub categories: HashMap, - pub latest_timestamp: Option>, -} - -// ─── Service ───────────────────────────────────────────────────────────────── +// --------------------------------------------------------------------------- +// Service +// --------------------------------------------------------------------------- pub struct BusinessMetricsService { db: PgPool, @@ -83,48 +87,52 @@ impl BusinessMetricsService { } } - /// Record a new business metric with the given parameters. - #[instrument(skip(self), fields(metric_name = %name))] + /// Record a new business metric. + #[instrument(skip(self, tags, value, unit, category, source))] pub async fn record_metric( &self, - name: impl Into, + name: String, value: Decimal, - unit: impl Into, + unit: String, category: MetricCategory, tags: HashMap, source: MetricSource, ) -> Result { let id = Uuid::new_v4(); let now = Utc::now(); - let name = name.into(); - let unit = unit.into(); - - sqlx::query_as!( - BusinessMetric, + let category_str = serde_json::to_string(&category) + .map_err(|e| AppError::InternalError(e.to_string()))?; + let source_str = serde_json::to_string(&source) + .map_err(|e| AppError::InternalError(e.to_string()))?; + let tags_json = serde_json::to_value(&tags) + .map_err(|e| AppError::InternalError(e.to_string()))?; + // Store Decimal as string to avoid sqlx type issues + let value_str = value.to_string(); + + sqlx::query( r#" INSERT INTO business_metrics (id, name, value, unit, category, tags, recorded_at, source) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - RETURNING id, name, value, unit, category as "category: _", tags as "tags: _", recorded_at, source as "source: _" "#, - id, - name, - value, - unit, - category as MetricCategory, - serde_json::to_value(&tags)?, - now, - source as MetricSource, ) - .fetch_one(&self.db) + .bind(id) + .bind(&name) + .bind(&value_str) + .bind(&unit) + .bind(&category_str) + .bind(&tags_json) + .bind(now) + .bind(&source_str) + 
.execute(&self.db) .await .map_err(|e| { error!(error = %e, "Failed to record metric"); - AppError::Database(e) + AppError::DatabaseError(e) })?; let metric = BusinessMetric { id, - name, + name: name.clone(), value, unit, category, @@ -138,7 +146,6 @@ impl BusinessMetricsService { let mut cache = self.cache.write().await; let entry = cache.entry(metric.name.clone()).or_default(); entry.push(metric.clone()); - // Keep last 1000 values per metric if entry.len() > 1000 { entry.remove(0); } @@ -147,428 +154,81 @@ impl BusinessMetricsService { info!( metric_name = %metric.name, value = %metric.value, - category = ?metric.category, "Recorded business metric" ); Ok(metric) } - /// Record multiple metrics in a single transaction. - #[instrument(skip(self, metrics))] - pub async fn record_metrics_batch( - &self, - metrics: Vec<(String, Decimal, String, MetricCategory, HashMap, MetricSource)>, - ) -> Result, AppError> { - let mut tx = self.db.begin().await?; - let mut results = Vec::with_capacity(metrics.len()); - let now = Utc::now(); - - for (name, value, unit, category, tags, source) in metrics { - let id = Uuid::new_v4(); - - sqlx::query!( - r#" - INSERT INTO business_metrics (id, name, value, unit, category, tags, recorded_at, source) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - "#, - id, - name, - value, - unit, - serde_json::to_value(&tags)?, - now, - source as MetricSource, - ) - .execute(&mut *tx) - .await - .map_err(|e| { - error!(error = %e, "Failed in batch metric insert"); - AppError::Database(e) - })?; - - results.push(BusinessMetric { - id, - name, - value, - unit, - category, - tags, - recorded_at: now, - source, - }); - } - - tx.commit().await.map_err(|e| { - error!(error = %e, "Failed to commit batch metrics"); - AppError::Database(e) - })?; - - info!(count = results.len(), "Recorded batch metrics"); - Ok(results) - } - - /// Query metrics with optional filters. 
- #[instrument(skip(self))] - pub async fn query_metrics( - &self, - query: MetricsQuery, - ) -> Result<(Vec, i64), AppError> { - let limit = query.limit.unwrap_or(100); - let offset = query.offset.unwrap_or(0); - - let total = sqlx::query_scalar!( - r#"SELECT COUNT(*) as "count!" FROM business_metrics WHERE 1=1"# - ) - .fetch_one(&self.db) - .await - .map_err(|e| AppError::Database(e))? - .unwrap_or(0); - - let metrics = sqlx::query_as!( - BusinessMetric, - r#" - SELECT id, name, value, unit, category as "category: _", tags as "tags: _", recorded_at, source as "source: _" - FROM business_metrics - ORDER BY recorded_at DESC - LIMIT $1 OFFSET $2 - "#, - limit, - offset, - ) - .fetch_all(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - Ok((metrics, total)) - } - - /// Get aggregated metrics summary. - #[instrument(skip(self))] - pub async fn get_metrics_summary(&self) -> Result { - let total: i64 = sqlx::query_scalar!( - r#"SELECT COUNT(*) as "count!" FROM business_metrics"# - ) - .fetch_one(&self.db) - .await - .map_err(|e| AppError::Database(e))? - .unwrap_or(0); - - let latest: Option> = sqlx::query_scalar!( - r#"SELECT MAX(recorded_at) as "max!" FROM business_metrics"# - ) - .fetch_one(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - let rows = sqlx::query!( - r#"SELECT category as "category!: MetricCategory", COUNT(*) as "count!: i64" FROM business_metrics GROUP BY category"# - ) - .fetch_all(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - let mut categories = HashMap::new(); - for row in rows { - let key = match row.category { - MetricCategory::Custom(s) => s, - other => format!("{:?}", other).to_lowercase(), - }; - categories.insert(key, row.count); - } - - Ok(MetricsSummary { - total_metrics: total, - categories, - latest_timestamp: latest, - }) - } - - /// Compute aggregated values for a metric over a time range. 
- #[instrument(skip(self))] - pub async fn aggregate_metric( - &self, - name: &str, - from: DateTime, - to: DateTime, - ) -> Result, AppError> { - let result = sqlx::query_scalar!( - r#"SELECT SUM(value) as "sum!: Decimal" FROM business_metrics WHERE name = $1 AND recorded_at >= $2 AND recorded_at <= $3"#, - name, - from, - to, - ) - .fetch_one(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - Ok(result) - } - - /// Get the latest value for a specific metric. - #[instrument(skip(self))] - pub async fn get_latest_metric( - &self, - name: &str, - ) -> Result, AppError> { - // Check cache first - { - let cache = self.cache.read().await; - if let Some(values) = cache.get(name) { - if let Some(latest) = values.last() { - return Ok(Some(latest.clone())); - } - } - } - - // Fall back to database - let metric = sqlx::query_as!( - BusinessMetric, - r#" - SELECT id, name, value, unit, category as "category: _", tags as "tags: _", recorded_at, source as "source: _" - FROM business_metrics - WHERE name = $1 - ORDER BY recorded_at DESC - LIMIT 1 - "#, - name, - ) - .fetch_optional(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - Ok(metric) - } - /// Remove metrics older than the retention period. #[instrument(skip(self))] pub async fn prune_old_metrics(&self, retention_days: i64) -> Result { let cutoff = Utc::now() - Duration::days(retention_days); - let deleted = sqlx::query!( - r#"DELETE FROM business_metrics WHERE recorded_at < $1"#, - cutoff, - ) - .execute(&self.db) - .await - .map_err(|e| AppError::Database(e))? 
- .rows_affected(); + let result = sqlx::query("DELETE FROM business_metrics WHERE recorded_at < $1") + .bind(cutoff) + .execute(&self.db) + .await + .map_err(|e| AppError::DatabaseError(e))?; + let deleted = result.rows_affected(); info!(deleted, retention_days, "Pruned old metrics"); Ok(deleted) } -} - -// ─── API Handlers ──────────────────────────────────────────────────────────── -use axum::{extract::State, http::StatusCode, Json}; - -pub struct MetricsState { - pub service: Arc, -} - -#[derive(Debug, Deserialize)] -pub struct RecordMetricRequest { - pub name: String, - pub value: Decimal, - pub unit: String, - pub category: MetricCategory, - #[serde(default)] - pub tags: HashMap, - #[serde(default)] - pub source: MetricSource, -} - -/// POST /api/metrics — Record a new business metric. -#[utoipa::path( - post, - path = "/api/metrics", - request_body = RecordMetricRequest, - responses( - (status = 201, description = "Metric recorded", body = BusinessMetric), - (status = 400, description = "Invalid request"), - (status = 500, description = "Internal server error") - ) -)] -pub async fn record_metric( - State(state): State>, - Json(req): Json, -) -> Result<(StatusCode, Json), AppError> { - let metric = state - .service - .record_metric( - req.name, - req.value, - req.unit, - req.category, - req.tags, - req.source, - ) - .await?; - - Ok((StatusCode::CREATED, Json(metric))) -} - -/// GET /api/metrics — Query business metrics with filters. 
-#[utoipa::path( - get, - path = "/api/metrics", - params( - ("category" = Option, Query, description = "Filter by category"), - ("from" = Option>, Query, description = "Start of time range"), - ("to" = Option>, Query, description = "End of time range"), - ("limit" = Option, Query, description = "Max results"), - ("offset" = Option, Query, description = "Pagination offset") - ), - responses( - (status = 200, description = "List of metrics with total count"), - (status = 500, description = "Internal server error") - ) -)] -pub async fn query_metrics( - State(state): State>, - axum::extract::Query(params): axum::extract::Query>, -) -> Result, AppError> { - let category = params.get("category").and_then(|c| { - serde_json::from_str(&format!("\"{}\"", c)).ok() - }); - - let from = params - .get("from") - .and_then(|v| v.parse::>().ok()); - let to = params - .get("to") - .and_then(|v| v.parse::>().ok()); - let limit = params.get("limit").and_then(|v| v.parse::().ok()); - let offset = params.get("offset").and_then(|v| v.parse::().ok()); - - let query = MetricsQuery { - category, - from, - to, - tags: None, - limit, - offset, - }; - - let (metrics, total) = state.service.query_metrics(query).await?; - - Ok(Json(serde_json::json!({ - "metrics": metrics, - "total": total, - }))) + /// Get the latest cached value for a metric (no DB call). + pub async fn get_cached_latest(&self, name: &str) -> Option { + let cache = self.cache.read().await; + cache.get(name)?.last().cloned() + } } -/// GET /api/metrics/summary — Get aggregated metrics overview. 
-#[utoipa::path( - get, - path = "/api/metrics/summary", - responses( - (status = 200, description = "Metrics summary", body = MetricsSummary), - (status = 500, description = "Internal server error") - ) -)] -pub async fn get_metrics_summary( - State(state): State>, -) -> Result, AppError> { - let summary = state.service.get_metrics_summary().await?; - Ok(Json(summary)) -} +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; - use sqlx::PgPool; - - async fn setup_test_db() -> PgPool { - let pool = PgPool::connect("postgres://localhost:5432/crucible_test") - .await - .expect("Failed to connect to test database"); - - sqlx::query!( - r#" - CREATE TABLE IF NOT EXISTS business_metrics ( - id UUID PRIMARY KEY, - name TEXT NOT NULL, - value NUMERIC NOT NULL, - unit TEXT NOT NULL, - category TEXT NOT NULL, - tags JSONB DEFAULT '{}', - recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - source TEXT NOT NULL DEFAULT 'manual' - ) - "# - ) - .execute(&pool) - .await - .expect("Failed to create test table"); - pool + #[test] + fn test_metric_category_serialization() { + let cat = MetricCategory::Revenue; + let json = serde_json::to_string(&cat).unwrap(); + assert!(json.contains("revenue")); } - #[tokio::test] - async fn test_record_and_retrieve_metric() { - let pool = setup_test_db().await; - let service = BusinessMetricsService::new(pool); - - let metric = service - .record_metric( - "test_revenue", - Decimal::new(1000, 0), - "USD", - MetricCategory::Revenue, - HashMap::from([("region".into(), "us-east".into())]), - MetricSource::Database, - ) - .await - .expect("Failed to record metric"); - - assert_eq!(metric.name, "test_revenue"); - assert_eq!(metric.value, Decimal::new(1000, 0)); - - let latest = service - .get_latest_metric("test_revenue") - .await - .expect("Failed to get metric") - .expect("Metric not found"); - - 
assert_eq!(latest.value, Decimal::new(1000, 0)); + #[test] + fn test_metric_source_default() { + let src = MetricSource::default(); + assert_eq!(src, MetricSource::Database); } - #[tokio::test] - async fn test_metrics_summary() { - let pool = setup_test_db().await; - let service = BusinessMetricsService::new(pool); - - service - .record_metric( - "revenue", - Decimal::new(500, 0), - "USD", - MetricCategory::Revenue, - HashMap::new(), - MetricSource::Database, - ) - .await - .expect("Failed to record"); - - service - .record_metric( - "cost", - Decimal::new(200, 0), - "USD", - MetricCategory::Costs, - HashMap::new(), - MetricSource::Database, - ) - .await - .expect("Failed to record"); - - let summary = service - .get_metrics_summary() - .await - .expect("Failed to get summary"); + #[test] + fn test_business_metric_serialization() { + let metric = BusinessMetric { + id: Uuid::new_v4(), + name: "revenue".to_string(), + value: Decimal::new(1000, 2), + unit: "USD".to_string(), + category: MetricCategory::Revenue, + tags: HashMap::from([("region".into(), "us-east".into())]), + recorded_at: Utc::now(), + source: MetricSource::Database, + }; + let json = serde_json::to_string(&metric).unwrap(); + assert!(json.contains("revenue")); + assert!(json.contains("USD")); + } - assert!(summary.total_metrics >= 2); + #[test] + fn test_metrics_summary_serialization() { + let summary = MetricsSummary { + total_metrics: 42, + categories: HashMap::from([("revenue".into(), 10i64)]), + latest_timestamp: Some(Utc::now()), + }; + let json = serde_json::to_string(&summary).unwrap(); + assert!(json.contains("42")); } -} \ No newline at end of file +} diff --git a/backend/src/services/error_recovery.rs b/backend/src/services/error_recovery.rs index e462906..c12cc38 100644 --- a/backend/src/services/error_recovery.rs +++ b/backend/src/services/error_recovery.rs @@ -1,12 +1,15 @@ +//! Error recovery service. +//! +//! Tracks retry state for failing tasks with configurable max retries. 
+ #![allow(dead_code)] + use serde::{Deserialize, Serialize}; use std::sync::Arc; use thiserror::Error; use tokio::sync::RwLock; -use tracing::{error, info, warn}; -use tracing::{error, info, warn, instrument}; -use thiserror::Error; -use serde::{Serialize, Deserialize}; +use tracing::{error, info, instrument, warn}; + use crate::services::tracing::TracingService; #[derive(Error, Debug, Serialize, Deserialize)] @@ -46,16 +49,15 @@ impl ErrorManager { } } + #[instrument(skip(self), fields(service.name = "ErrorManager", service.method = "handle_error"))] pub async fn handle_error( &self, error: RecoveryError, task_name: &str, ) -> Result<(), RecoveryError> { - #[instrument(skip(self), fields(service.name = "ErrorManager", service.method = "handle_error"))] - pub async fn handle_error(&self, error: RecoveryError, task_name: &str) -> Result<(), RecoveryError> { let span = TracingService::service_method_span("ErrorManager", "handle_error"); let _enter = span.enter(); - + warn!(task = %task_name, error = %error, "Handling error"); let mut tasks = self.tasks.write().await; @@ -63,7 +65,11 @@ impl ErrorManager { task.retries += 1; if task.retries > task.max_retries { error!(task = %task_name, "Max retries reached"); - TracingService::record_error(&span, &format!("Max retries reached for {}", task_name), "max_retries"); + TracingService::record_error( + &span, + &format!("Max retries reached for {}", task_name), + "max_retries", + ); return Err(RecoveryError::MaxRetriesReached(task_name.to_string())); } info!(task = %task_name, retry = task.retries, "Retrying task"); @@ -84,7 +90,6 @@ impl ErrorManager { pub async fn get_active_tasks(&self) -> Vec { let span = TracingService::service_method_span("ErrorManager", "get_active_tasks"); let _enter = span.enter(); - self.tasks.read().await.clone() } } @@ -98,32 +103,25 @@ mod tests { let manager = ErrorManager::new(); let task_name = "test_task"; - // First failure manager - .handle_error( - RecoveryError::Database("connection 
lost".to_string()), - task_name, - ) + .handle_error(RecoveryError::Database("connection lost".to_string()), task_name) .await .unwrap(); assert_eq!(manager.get_active_tasks().await.len(), 1); assert_eq!(manager.get_active_tasks().await[0].retries, 1); - // Second failure manager .handle_error(RecoveryError::Redis("timeout".to_string()), task_name) .await .unwrap(); assert_eq!(manager.get_active_tasks().await[0].retries, 2); - // Third failure manager .handle_error(RecoveryError::Internal("unknown".to_string()), task_name) .await .unwrap(); assert_eq!(manager.get_active_tasks().await[0].retries, 3); - // Fourth failure - should fail let result = manager .handle_error(RecoveryError::Internal("last straw".to_string()), task_name) .await; diff --git a/backend/src/services/feature_flags.rs b/backend/src/services/feature_flags.rs index 56bf6cc..2a6b6c9 100644 --- a/backend/src/services/feature_flags.rs +++ b/backend/src/services/feature_flags.rs @@ -1,26 +1,4 @@ //! Feature flag service with Redis caching and PostgreSQL persistence. -//! -//! This module provides a production-ready feature flag system that: -//! - Stores flag state in PostgreSQL for durability -//! - Caches flag values in Redis for low-latency reads -//! - Supports cache invalidation on updates -//! - Provides async API for flag evaluation -//! -//! # Example -//! ```rust,no_run -//! use backend::services::feature_flags::FeatureFlagService; -//! use sqlx::PgPool; -//! use redis::Client; -//! -//! # async fn example(pool: PgPool, redis: Client) -> anyhow::Result<()> { -//! let service = FeatureFlagService::new(pool, redis); -//! let enabled = service.is_enabled("new_dashboard").await?; -//! if enabled { -//! // render new UI -//! } -//! # Ok(()) -//! # } -//! 
``` #![allow(dead_code)] @@ -29,32 +7,22 @@ use redis::{AsyncCommands, Client as RedisClient}; use serde::{Deserialize, Serialize}; use sqlx::PgPool; use thiserror::Error; -use tracing::{debug, info, warn}; -use tracing::{debug, info, warn, instrument}; -use serde::{Deserialize, Serialize}; -use chrono::{DateTime, Utc}; +use tracing::{debug, info, instrument, warn}; + use crate::services::tracing::TracingService; // --------------------------------------------------------------------------- // Error type // --------------------------------------------------------------------------- -/// Errors that can occur in the feature flag service. #[derive(Debug, Error)] pub enum FlagError { - /// A database error occurred. #[error("Database error: {0}")] Database(#[from] sqlx::Error), - - /// A Redis error occurred. #[error("Redis error: {0}")] Redis(#[from] redis::RedisError), - - /// The requested flag was not found. #[error("Feature flag not found: {0}")] NotFound(String), - - /// An internal error occurred. #[error("Internal error: {0}")] Internal(String), } @@ -63,16 +31,11 @@ pub enum FlagError { // Domain types // --------------------------------------------------------------------------- -/// A feature flag record. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FeatureFlag { - /// Unique key identifying the flag. pub key: String, - /// Whether the flag is enabled. pub enabled: bool, - /// Human-readable description. pub description: String, - /// Last update timestamp. pub updated_at: DateTime, } @@ -80,98 +43,49 @@ pub struct FeatureFlag { // FeatureFlagService // --------------------------------------------------------------------------- -/// Service for managing feature flags with Redis caching and PostgreSQL persistence. pub struct FeatureFlagService { db: PgPool, redis: RedisClient, } impl FeatureFlagService { - /// Create a new feature flag service. 
- /// - /// # Arguments - /// - `db`: PostgreSQL connection pool - /// - `redis`: Redis client pub fn new(db: PgPool, redis: RedisClient) -> Self { Self { db, redis } } - /// Check if a feature flag is enabled. - /// - /// This method first checks Redis cache. On cache miss, it queries - /// PostgreSQL and populates the cache with a 5-minute TTL. - /// - /// # Errors - /// Returns [`FlagError::NotFound`] if the flag doesn't exist. #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "is_enabled"))] pub async fn is_enabled(&self, key: &str) -> Result { let cache_key = format!("flag:{key}"); - // Try cache first with Redis tracing let redis_span = TracingService::redis_command_span("GET", Some(&cache_key)); let _redis_enter = redis_span.enter(); - - let mut conn = self.redis.get_multiplexed_async_connection().await - .map_err(|e| { - TracingService::record_error(&redis_span, &e.to_string(), "redis_connection"); - e - })?; - - let cached: Option = conn.get(&cache_key).await - .map_err(|e| { - TracingService::record_error(&redis_span, &e.to_string(), "redis_get"); - e - })?; - + let mut conn = self.redis.get_multiplexed_async_connection().await?; + let cached: Option = conn.get(&cache_key).await?; drop(_redis_enter); if let Some(val) = cached { - debug!(key = %key, cached = %val, "Feature flag cache hit"); + debug!(key = %key, "Feature flag cache hit"); return Ok(val == "1"); } - // Cache miss – query database with DB tracing debug!(key = %key, "Feature flag cache miss – querying database"); - let row: Option<(bool,)> = - sqlx::query_as("SELECT enabled FROM feature_flags WHERE key = $1") - .bind(key) - .fetch_optional(&self.db) - .await?; - let db_span = TracingService::db_query_span( "SELECT enabled FROM feature_flags WHERE key = $1", "postgres", - "SELECT" + "SELECT", ); let _db_enter = db_span.enter(); - - let row: Option<(bool,)> = sqlx::query_as( - "SELECT enabled FROM feature_flags WHERE key = $1" - ) - .bind(key) - 
.fetch_optional(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - + let row: Option<(bool,)> = + sqlx::query_as("SELECT enabled FROM feature_flags WHERE key = $1") + .bind(key) + .fetch_optional(&self.db) + .await?; drop(_db_enter); match row { Some((enabled,)) => { - // Populate cache with 5-minute TTL - let cache_set_span = TracingService::redis_command_span("SETEX", Some(&cache_key)); - let _cache_set_enter = cache_set_span.enter(); - let val = if enabled { "1" } else { "0" }; - let _: () = conn.set_ex(&cache_key, val, 300).await - .map_err(|e| { - TracingService::record_error(&cache_set_span, &e.to_string(), "redis_setex"); - e - })?; - - drop(_cache_set_enter); + let _: () = conn.set_ex(&cache_key, val, 300).await?; debug!(key = %key, enabled = enabled, "Cached feature flag"); Ok(enabled) } @@ -179,31 +93,14 @@ impl FeatureFlagService { } } - /// Get the full feature flag record. - /// - /// # Errors - /// Returns [`FlagError::NotFound`] if the flag doesn't exist. #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "get"))] pub async fn get(&self, key: &str) -> Result { - let db_span = TracingService::db_query_span( - "SELECT key, enabled, description, updated_at FROM feature_flags WHERE key = $1", - "postgres", - "SELECT" - ); - let _db_enter = db_span.enter(); - let row: Option<(String, bool, String, DateTime)> = sqlx::query_as( "SELECT key, enabled, description, updated_at FROM feature_flags WHERE key = $1", ) .bind(key) .fetch_optional(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - - drop(_db_enter); + .await?; match row { Some((key, enabled, description, updated_at)) => Ok(FeatureFlag { @@ -216,28 +113,13 @@ impl FeatureFlagService { } } - /// List all feature flags. 
#[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "list"))] pub async fn list(&self) -> Result, FlagError> { - let db_span = TracingService::db_query_span( - "SELECT key, enabled, description, updated_at FROM feature_flags ORDER BY key", - "postgres", - "SELECT" - ); - let _db_enter = db_span.enter(); - let rows: Vec<(String, bool, String, DateTime)> = sqlx::query_as( "SELECT key, enabled, description, updated_at FROM feature_flags ORDER BY key", ) .fetch_all(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - - db_span.record("db.rows_affected", rows.len() as i64); - drop(_db_enter); + .await?; Ok(rows .into_iter() @@ -250,26 +132,9 @@ impl FeatureFlagService { .collect()) } - /// Create or update a feature flag. - /// - /// This method upserts the flag in PostgreSQL and invalidates the cache. + #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "set"))] pub async fn set(&self, key: &str, enabled: bool, description: &str) -> Result<(), FlagError> { sqlx::query( - #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "set"))] - pub async fn set( - &self, - key: &str, - enabled: bool, - description: &str, - ) -> Result<(), FlagError> { - let db_span = TracingService::db_query_span( - "INSERT INTO feature_flags ... 
ON CONFLICT DO UPDATE", - "postgres", - "UPSERT" - ); - let _db_enter = db_span.enter(); - - let result = sqlx::query( r#" INSERT INTO feature_flags (key, enabled, description, updated_at) VALUES ($1, $2, $3, $4) @@ -284,46 +149,19 @@ impl FeatureFlagService { .bind(description) .bind(Utc::now()) .execute(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - - db_span.record("db.rows_affected", result.rows_affected() as i64); - drop(_db_enter); + .await?; - // Invalidate cache self.invalidate_cache(key).await?; - info!(key = %key, enabled = enabled, "Feature flag updated"); Ok(()) } - /// Delete a feature flag. - /// - /// # Errors - /// Returns [`FlagError::NotFound`] if the flag doesn't exist. #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "delete"))] pub async fn delete(&self, key: &str) -> Result<(), FlagError> { - let db_span = TracingService::db_query_span( - "DELETE FROM feature_flags WHERE key = $1", - "postgres", - "DELETE" - ); - let _db_enter = db_span.enter(); - let result = sqlx::query("DELETE FROM feature_flags WHERE key = $1") .bind(key) .execute(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - - db_span.record("db.rows_affected", result.rows_affected() as i64); - drop(_db_enter); + .await?; if result.rows_affected() == 0 { return Err(FlagError::NotFound(key.to_string())); @@ -334,31 +172,10 @@ impl FeatureFlagService { Ok(()) } - /// Invalidate the Redis cache for a specific flag. 
- #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "invalidate_cache"))] async fn invalidate_cache(&self, key: &str) -> Result<(), FlagError> { let cache_key = format!("flag:{key}"); let mut conn = self.redis.get_multiplexed_async_connection().await?; let deleted: i32 = conn.del(&cache_key).await?; - let cache_key = format!("flag:{}", key); - - let redis_span = TracingService::redis_command_span("DEL", Some(&cache_key)); - let _redis_enter = redis_span.enter(); - - let mut conn = self.redis.get_multiplexed_async_connection().await - .map_err(|e| { - TracingService::record_error(&redis_span, &e.to_string(), "redis_connection"); - e - })?; - - let deleted: i32 = conn.del(&cache_key).await - .map_err(|e| { - TracingService::record_error(&redis_span, &e.to_string(), "redis_del"); - e - })?; - - drop(_redis_enter); - if deleted > 0 { debug!(key = %key, "Invalidated feature flag cache"); } else { @@ -367,66 +184,32 @@ impl FeatureFlagService { Ok(()) } - /// Flush all feature flag cache entries (useful for testing / maintenance). - /// - /// This uses a Redis SCAN to find all keys matching `flag:*` and deletes them. 
#[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "flush_cache"))] pub async fn flush_cache(&self) -> Result { - let keys_span = TracingService::redis_command_span("KEYS", Some("flag:*")); - let _keys_enter = keys_span.enter(); - - let mut conn = self.redis.get_multiplexed_async_connection().await - .map_err(|e| { - TracingService::record_error(&keys_span, &e.to_string(), "redis_connection"); - e - })?; - + let mut conn = self.redis.get_multiplexed_async_connection().await?; let keys: Vec = redis::cmd("KEYS") .arg("flag:*") .query_async(&mut conn) - .await - .map_err(|e| { - TracingService::record_error(&keys_span, &e.to_string(), "redis_keys"); - e - })?; - - drop(_keys_enter); + .await?; if keys.is_empty() { - debug!("No feature flag cache entries to flush"); return Ok(0); } let count = keys.len(); - - let del_span = TracingService::redis_command_span("DEL", None); - let _del_enter = del_span.enter(); - for key in keys { - let _: () = conn.del(&key).await - .map_err(|e| { - TracingService::record_error(&del_span, &e.to_string(), "redis_del"); - e - })?; + let _: () = conn.del(&key).await?; } - - drop(_del_enter); info!(count = count, "Flushed feature flag cache"); Ok(count) } } -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - #[cfg(test)] mod tests { use super::*; - // Unit tests that do not require live database/Redis connections. 
- #[test] fn test_flag_error_display() { let err = FlagError::NotFound("test_flag".to_string()); diff --git a/backend/src/services/log_alerts.rs b/backend/src/services/log_alerts.rs index 3f37e16..50c1b2f 100644 --- a/backend/src/services/log_alerts.rs +++ b/backend/src/services/log_alerts.rs @@ -1,168 +1,3 @@ -use axum::{ - extract::{Path, State}, - routing::{get, post}, - Json, Router, -}; -use serde::{Deserialize, Serialize}; -use sqlx::PgPool; -use std::sync::Arc; -use uuid::Uuid; -use crate::error::AppError; - -#[derive(Debug, Serialize, Deserialize, sqlx::FromRow)] -pub struct LogAlertRule { - pub id: Uuid, - pub name: String, - pub pattern: String, - pub threshold: i32, - pub interval_seconds: i32, - pub is_enabled: bool, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct CreateRuleRequest { - pub name: String, - pub pattern: String, - pub threshold: i32, - pub interval_seconds: i32, -} - -#[derive(Debug, Serialize, Deserialize, sqlx::FromRow)] -pub struct LogAlert { - pub id: Uuid, - pub rule_id: Uuid, - pub message: String, - pub triggered_at: chrono::DateTime, -} - -pub struct ServiceState { - pub db: PgPool, - pub redis: redis::Client, -} - -pub fn router() -> Router { - Router::new() - .route("/rules", post(create_rule).get(list_rules)) - .route("/rules/:id", get(get_rule)) - .route("/ingest", post(ingest_log)) -} - -async fn create_rule( - State(state): State>, - Json(payload): Json, -) -> Result, AppError> { - let rule = sqlx::query_as::<_, LogAlertRule>( - "INSERT INTO log_alert_rules (name, pattern, threshold, interval_seconds) - VALUES ($1, $2, $3, $4) RETURNING *" - ) - .bind(payload.name) - .bind(payload.pattern) - .bind(payload.threshold) - .bind(payload.interval_seconds) - .fetch_one(&state.db) - .await?; - - Ok(Json(rule)) -} - -async fn list_rules( - State(state): State>, -) -> Result>, AppError> { - let rules = sqlx::query_as::<_, LogAlertRule>("SELECT * FROM log_alert_rules") - .fetch_all(&state.db) - .await?; - Ok(Json(rules)) -} - 
-async fn get_rule( - State(state): State>, - Path(id): Path, -) -> Result, AppError> { - let rule = sqlx::query_as::<_, LogAlertRule>("SELECT * FROM log_alert_rules WHERE id = $1") - .bind(id) - .fetch_optional(&state.db) - .await? - .ok_or_else(|| AppError::NotFound(format!("Rule not found: {}", id)))?; - - Ok(Json(rule)) -} - -#[derive(Debug, Deserialize)] -pub struct LogEntry { - pub message: String, - pub level: String, -} - -async fn ingest_log( - State(state): State>, - Json(log): Json, -) -> Result, AppError> { - tracing::info!("Processing log: {}", log.message); - - // 1. Fetch all enabled rules - let rules = sqlx::query_as::<_, LogAlertRule>( - "SELECT * FROM log_alert_rules WHERE is_enabled = true" - ) - .fetch_all(&state.db) - .await?; - - let mut matched_rules = Vec::new(); - - for rule in rules { - if log.message.contains(&rule.pattern) { - tracing::debug!("Log matched pattern for rule: {}", rule.name); - - // 2. Increment count in Redis with TTL - let redis_key = format!("alert_count:{}:{}", rule.id, chrono::Utc::now().timestamp() / rule.interval_seconds as i64); - let mut conn = state.redis.get_async_connection().await?; - - let count: i32 = redis::cmd("INCR") - .arg(&redis_key) - .query_async(&mut conn) - .await?; - - // Set TTL if new key - if count == 1 { - let _: () = redis::cmd("EXPIRE") - .arg(&redis_key) - .arg(rule.interval_seconds) - .query_async(&mut conn) - .await?; - } - - // 3. Check if threshold reached - if count >= rule.threshold { - tracing::warn!("Threshold reached for rule: {}. Triggering alert!", rule.name); - - // 4. 
Persist alert - sqlx::query( - "INSERT INTO log_alerts (rule_id, message) VALUES ($1, $2)" - ) - .bind(rule.id) - .bind(format!("Threshold of {} reached for pattern '{}'", rule.threshold, rule.pattern)) - .execute(&state.db) - .await?; - - matched_rules.push(rule.name); - } - } - } - - Ok(Json(serde_json::json!({ - "status": "processed", - "matched": matched_rules - }))) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_pattern_matching() { - let pattern = "error"; - let message = "This is an error message"; - assert!(message.contains(pattern)); //! Log alerting service for monitoring log entries and triggering alerts. //! //! This module provides threshold-based alerting on top of the log aggregation @@ -270,9 +105,7 @@ impl AlertRule { /// Validate that the rule has sensible configuration values. pub fn validate(&self) -> Result<(), AlertError> { if self.name.trim().is_empty() { - return Err(AlertError::InvalidRule( - "name must not be empty".to_string(), - )); + return Err(AlertError::InvalidRule("name must not be empty".to_string())); } if self.pattern.trim().is_empty() { return Err(AlertError::InvalidRule( @@ -317,7 +150,6 @@ pub struct Alert { /// Tracks recent log-entry timestamps per rule for sliding-window evaluation. #[derive(Debug, Default)] struct RuleState { - /// Timestamps of log entries that matched this rule. hits: Vec>, } @@ -352,8 +184,6 @@ impl AlertManager { } /// Add or replace an alert rule. - /// - /// Returns an error if the rule fails validation. pub async fn add_rule(&self, rule: AlertRule) -> Result<(), AlertError> { rule.validate()?; let id = rule.id; @@ -380,10 +210,6 @@ impl AlertManager { } /// Evaluate a [`LogEntry`] against all active rules. - /// - /// For each rule whose pattern matches the entry's message, the hit is - /// recorded. If the sliding-window count reaches the rule's threshold an - /// [`Alert`] is fired and stored. 
pub async fn evaluate(&self, entry: &LogEntry) { let rules = self.rules.read().await; let mut states = self.rule_states.write().await; @@ -423,7 +249,6 @@ impl AlertManager { fired_at: Utc::now(), acknowledged: false, }); - // Reset hits so the alert doesn't re-fire on every subsequent entry. state.hits.clear(); } } @@ -520,8 +345,6 @@ mod tests { } } - // --- AlertRule validation --- - #[test] fn test_rule_validation_empty_name() { let mut rule = make_rule("ERROR", 3, 60); @@ -554,15 +377,12 @@ mod tests { assert!(rule.validate().is_ok()); } - // --- AlertManager CRUD --- - #[tokio::test] async fn test_add_and_get_rules() { let manager = AlertManager::new(); let rule = make_rule("ERROR", 3, 60); let id = rule.id; manager.add_rule(rule).await.unwrap(); - let rules = manager.get_rules().await; assert_eq!(rules.len(), 1); assert_eq!(rules[0].id, id); @@ -585,16 +405,12 @@ mod tests { assert!(matches!(result, Err(AlertError::RuleNotFound(_)))); } - // --- Alert evaluation --- - #[tokio::test] async fn test_no_alert_below_threshold() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERROR", 3, 60)).await.unwrap(); - manager.evaluate(&make_entry("ERROR occurred")).await; manager.evaluate(&make_entry("ERROR occurred")).await; - assert!(manager.get_alerts(None).await.is_empty()); } @@ -602,11 +418,9 @@ mod tests { async fn test_alert_fires_at_threshold() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERROR", 3, 60)).await.unwrap(); - for _ in 0..3 { manager.evaluate(&make_entry("ERROR occurred")).await; } - let alerts = manager.get_alerts(None).await; assert_eq!(alerts.len(), 1); assert_eq!(alerts[0].match_count, 3); @@ -616,11 +430,7 @@ mod tests { async fn test_non_matching_entry_does_not_fire() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERROR", 1, 60)).await.unwrap(); - - manager - .evaluate(&make_entry("INFO everything is fine")) - .await; - + manager.evaluate(&make_entry("INFO everything is fine")).await; 
assert!(manager.get_alerts(None).await.is_empty()); } @@ -628,32 +438,23 @@ mod tests { async fn test_alert_resets_after_firing() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERROR", 2, 60)).await.unwrap(); - - // First batch – fires manager.evaluate(&make_entry("ERROR a")).await; manager.evaluate(&make_entry("ERROR b")).await; assert_eq!(manager.get_alerts(None).await.len(), 1); - - // Second batch – fires again after reset manager.evaluate(&make_entry("ERROR c")).await; manager.evaluate(&make_entry("ERROR d")).await; assert_eq!(manager.get_alerts(None).await.len(), 2); } - // --- Acknowledge --- - #[tokio::test] async fn test_acknowledge_alert() { let manager = AlertManager::new(); manager.add_rule(make_rule("CRIT", 1, 60)).await.unwrap(); manager.evaluate(&make_entry("CRIT failure")).await; - let alerts = manager.get_alerts(None).await; assert_eq!(alerts.len(), 1); let alert_id = alerts[0].id; - manager.acknowledge_alert(alert_id).await.unwrap(); - let active = manager.get_active_alerts().await; assert!(active.is_empty()); } @@ -665,37 +466,28 @@ mod tests { assert!(matches!(result, Err(AlertError::AlertNotFound(_)))); } - // --- Severity filter --- - #[tokio::test] async fn test_filter_alerts_by_severity() { let manager = AlertManager::new(); - let mut warn_rule = make_rule("WARN", 1, 60); warn_rule.severity = AlertSeverity::Warning; manager.add_rule(warn_rule).await.unwrap(); - let mut crit_rule = make_rule("CRIT", 1, 60); crit_rule.severity = AlertSeverity::Critical; manager.add_rule(crit_rule).await.unwrap(); - manager.evaluate(&make_entry("WARN something")).await; manager.evaluate(&make_entry("CRIT something")).await; - let critical = manager.get_alerts(Some(AlertSeverity::Critical)).await; assert_eq!(critical.len(), 1); assert_eq!(critical[0].severity, AlertSeverity::Critical); } - // --- Clear --- - #[tokio::test] async fn test_clear_alerts() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERR", 1, 60)).await.unwrap(); 
manager.evaluate(&make_entry("ERR boom")).await; assert!(!manager.get_alerts(None).await.is_empty()); - manager.clear_alerts().await; assert!(manager.get_alerts(None).await.is_empty()); } diff --git a/backend/src/services/mod.rs b/backend/src/services/mod.rs index c9ffa9b..3585ed5 100644 --- a/backend/src/services/mod.rs +++ b/backend/src/services/mod.rs @@ -1,9 +1,8 @@ -pub mod log_alerts; pub mod alerts; +pub mod business_metrics; pub mod error_recovery; pub mod feature_flags; pub mod log_aggregator; pub mod log_alerts; pub mod sys_metrics; -pub mod business_metrics; pub mod tracing; diff --git a/backend/src/services/sys_metrics.rs b/backend/src/services/sys_metrics.rs index bd9ec8f..be58118 100644 --- a/backend/src/services/sys_metrics.rs +++ b/backend/src/services/sys_metrics.rs @@ -1,101 +1,44 @@ -//! Build System Metrics Exporter -//! -//! This module provides a production-ready metrics exporter for build system operations. -//! It collects and persists build-related metrics including compilation times, dependency counts, -//! cache hit rates, and system resource usage. The service uses PostgreSQL for durability -//! and Redis for high-performance caching. -//! -//! # Example -//! ```rust,no_run -//! use backend::services::sys_metrics::BuildMetricsService; -//! use sqlx::PgPool; -//! use redis::Client; -//! -//! # async fn example(pool: PgPool, redis: Client) -> anyhow::Result<()> { -//! let service = BuildMetricsService::new(pool, redis); -//! -//! // Record a build metric -//! let metric = BuildMetric { -//! project_name: "crucible".to_string(), -//! build_id: "build-123".to_string(), -//! build_status: BuildStatus::Success, -//! compilation_time_ms: 5000, -//! dependency_count: 42, -//! cache_hit_rate: Some(85.5), -//! cpu_usage: Some(75.2), -//! memory_usage_mb: Some(1024), -//! build_timestamp: Utc::now(), -//! }; -//! service.record_build(metric).await?; -//! -//! // Query metrics -//! 
let metrics = service.get_project_metrics("crucible", 10).await?; -//! # Ok(()) -//! # } -//! ``` +//! System metrics and build metrics services. + +#![allow(dead_code)] -use sqlx::PgPool; -use redis::{Client as RedisClient, AsyncCommands}; -use serde::{Serialize, Deserialize}; use chrono::{DateTime, Utc}; -use tracing::{info, debug, warn, error}; +use redis::{AsyncCommands, Client as RedisClient}; +use rust_decimal::Decimal; +use serde::{Deserialize, Serialize}; +use sqlx::PgPool; +use std::sync::Arc; use thiserror::Error; +use tokio::sync::RwLock; +use tracing::{debug, info, instrument}; use uuid::Uuid; -use rust_decimal::Decimal; + +use crate::services::tracing::TracingService; // --------------------------------------------------------------------------- -// Error types +// MetricsError // --------------------------------------------------------------------------- -/// Errors that can occur in the build metrics service. #[derive(Debug, Error)] pub enum MetricsError { - /// A database error occurred. #[error("Database error: {0}")] Database(#[from] sqlx::Error), - - /// A Redis error occurred. #[error("Redis error: {0}")] Redis(#[from] redis::RedisError), - - /// Serialization error. #[error("Serialization error: {0}")] Serialization(String), - - /// The requested project was not found. #[error("Project not found: {0}")] ProjectNotFound(String), - - /// Invalid build status. #[error("Invalid build status: {0}")] InvalidStatus(String), - - /// An internal error occurred. 
#[error("Internal error: {0}")] Internal(String), -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; -use std::sync::Arc; -use tokio::sync::RwLock; -use tracing::info; -use serde::{Serialize, Deserialize}; -use chrono::{DateTime, Utc}; -use tracing::{info, instrument}; -use crate::services::tracing::TracingService; - -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct SystemMetrics { - pub cpu_usage: f64, - pub memory_usage: u64, - pub uptime: u64, - pub timestamp: DateTime, } // --------------------------------------------------------------------------- -// Domain types +// BuildStatus // --------------------------------------------------------------------------- -/// Build status enumeration. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "lowercase")] pub enum BuildStatus { @@ -126,47 +69,32 @@ impl BuildStatus { } } -/// Build system metrics record. +// --------------------------------------------------------------------------- +// BuildMetric +// --------------------------------------------------------------------------- + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BuildMetric { - /// Unique identifier for the metric record. pub id: Option, - /// Name of the project being built. pub project_name: String, - /// Unique build identifier. pub build_id: String, - /// Status of the build. pub build_status: BuildStatus, - /// Compilation time in milliseconds. pub compilation_time_ms: i64, - /// Number of dependencies used. pub dependency_count: i32, - /// Cache hit rate percentage (0-100). pub cache_hit_rate: Option, - /// CPU usage percentage during build. pub cpu_usage: Option, - /// Memory usage in MB during build. pub memory_usage_mb: Option, - /// Timestamp when the build occurred. pub build_timestamp: DateTime, } -/// Aggregated build metrics summary. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BuildMetricsSummary { - /// Project name. 
pub project_name: String, - /// Total number of builds. pub total_builds: i64, - /// Number of successful builds. pub successful_builds: i64, - /// Number of failed builds. pub failed_builds: i64, - /// Average compilation time in milliseconds. pub avg_compilation_time_ms: Decimal, - /// Success rate percentage. pub success_rate: Decimal, - /// Average cache hit rate. pub avg_cache_hit_rate: Option, } @@ -174,38 +102,24 @@ pub struct BuildMetricsSummary { // BuildMetricsService // --------------------------------------------------------------------------- -/// Service for collecting and managing build system metrics with PostgreSQL persistence -/// and Redis caching. pub struct BuildMetricsService { db: PgPool, redis: RedisClient, } impl BuildMetricsService { - /// Create a new build metrics service. - /// - /// # Arguments - /// - `db`: PostgreSQL connection pool - /// - `redis`: Redis client pub fn new(db: PgPool, redis: RedisClient) -> Self { Self { db, redis } } - /// Record a build metric. - /// - /// This method persists the metric to PostgreSQL and invalidates relevant cache entries. - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database operation fails. - /// Returns [`MetricsError::Redis`] if the cache invalidation fails. 
pub async fn record_build(&self, metric: BuildMetric) -> Result { let id = Uuid::new_v4(); let status_str = metric.build_status.as_str(); sqlx::query( r#" - INSERT INTO build_metrics - (id, project_name, build_id, build_status, compilation_time_ms, + INSERT INTO build_metrics + (id, project_name, build_id, build_status, compilation_time_ms, dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) "#, @@ -216,14 +130,13 @@ impl BuildMetricsService { .bind(status_str) .bind(metric.compilation_time_ms) .bind(metric.dependency_count) - .bind(metric.cache_hit_rate) - .bind(metric.cpu_usage) + .bind(metric.cache_hit_rate.map(|d| d.to_string())) + .bind(metric.cpu_usage.map(|d| d.to_string())) .bind(metric.memory_usage_mb) .bind(metric.build_timestamp) .execute(&self.db) .await?; - // Invalidate cache for this project self.invalidate_project_cache(&metric.project_name).await?; info!( @@ -236,26 +149,12 @@ impl BuildMetricsService { Ok(id) } - /// Get metrics for a specific project. - /// - /// This method first checks Redis cache. On cache miss, it queries PostgreSQL - /// and populates the cache with a 5-minute TTL. - /// - /// # Arguments - /// - `project_name`: Name of the project - /// - `limit`: Maximum number of records to return - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database query fails. - /// Returns [`MetricsError::Redis`] if the cache operation fails. 
pub async fn get_project_metrics( &self, project_name: &str, limit: i64, ) -> Result, MetricsError> { let cache_key = format!("build_metrics:{}:{}", project_name, limit); - - // Try cache first let mut conn = self.redis.get_multiplexed_async_connection().await?; let cached: Option = conn.get(&cache_key).await?; @@ -264,27 +163,24 @@ impl BuildMetricsService { let metrics: Vec = serde_json::from_str(&val) .map_err(|e| MetricsError::Serialization(e.to_string()))?; return Ok(metrics); -impl Default for MetricsExporter { - fn default() -> Self { - Self::new() - } -} - -impl MetricsExporter { - pub fn new() -> Self { - Self { - current_metrics: Arc::new(RwLock::new(SystemMetrics { - timestamp: Utc::now(), - ..Default::default() - })), } - // Cache miss – query database debug!(project = %project_name, "Build metrics cache miss – querying database"); - let rows = sqlx::query_as( + let rows: Vec<( + Uuid, + String, + String, + String, + i64, + i32, + Option, + Option, + Option, + DateTime, + )> = sqlx::query_as( r#" SELECT id, project_name, build_id, build_status, compilation_time_ms, - dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp + dependency_count, cache_hit_rate::float8, cpu_usage::float8, memory_usage_mb, build_timestamp FROM build_metrics WHERE project_name = $1 ORDER BY build_timestamp DESC @@ -298,53 +194,55 @@ impl MetricsExporter { let metrics: Vec = rows .into_iter() - .map(|(id, project_name, build_id, status_str, compilation_time_ms, - dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp)| { - BuildMetric { - id: Some(id), + .map( + |( + id, project_name, build_id, - build_status: BuildStatus::from_str(&status_str).unwrap_or(BuildStatus::Failed), + status_str, compilation_time_ms, dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp, - } - }) + )| BuildMetric { + id: Some(id), + project_name, + build_id, + build_status: BuildStatus::from_str(&status_str) + 
.unwrap_or(BuildStatus::Failed), + compilation_time_ms, + dependency_count, + cache_hit_rate: cache_hit_rate.map(Decimal::try_from).and_then(|r| r.ok()), + cpu_usage: cpu_usage.map(Decimal::try_from).and_then(|r| r.ok()), + memory_usage_mb, + build_timestamp, + }, + ) .collect(); - // Populate cache with 5-minute TTL if !metrics.is_empty() { let json = serde_json::to_string(&metrics) .map_err(|e| MetricsError::Serialization(e.to_string()))?; let _: () = conn.set_ex(&cache_key, json, 300).await?; - debug!(project = %project_name, count = metrics.len(), "Cached build metrics"); } Ok(metrics) } - /// Get aggregated metrics summary for a project. - /// - /// # Arguments - /// - `project_name`: Name of the project - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database query fails. pub async fn get_project_summary( &self, project_name: &str, ) -> Result { - let row: Option<(i64, i64, i64, Option, Option)> = sqlx::query_as( + let row: Option<(i64, i64, i64, Option, Option)> = sqlx::query_as( r#" - SELECT + SELECT COUNT(*) as total_builds, SUM(CASE WHEN build_status = 'success' THEN 1 ELSE 0 END) as successful_builds, SUM(CASE WHEN build_status = 'failed' THEN 1 ELSE 0 END) as failed_builds, - AVG(compilation_time_ms) as avg_compilation_time, - AVG(cache_hit_rate) as avg_cache_hit_rate + AVG(compilation_time_ms)::float8 as avg_compilation_time, + AVG(cache_hit_rate)::float8 as avg_cache_hit_rate FROM build_metrics WHERE project_name = $1 "#, @@ -354,11 +252,18 @@ impl MetricsExporter { .await?; match row { - Some((total_builds, successful_builds, failed_builds, avg_compilation_time, avg_cache_hit_rate)) => { + Some(( + total_builds, + successful_builds, + failed_builds, + avg_compilation_time, + avg_cache_hit_rate, + )) => { let success_rate = if total_builds > 0 { - Decimal::from(successful_builds) / Decimal::from(total_builds) * dec!(100) + Decimal::from(successful_builds) / Decimal::from(total_builds) + * Decimal::from(100u32) } else { - dec!(0) + 
Decimal::ZERO }; Ok(BuildMetricsSummary { @@ -366,27 +271,36 @@ impl MetricsExporter { total_builds, successful_builds, failed_builds, - avg_compilation_time_ms: avg_compilation_time.unwrap_or(dec!(0)), + avg_compilation_time_ms: avg_compilation_time + .map(Decimal::try_from) + .and_then(|r| r.ok()) + .unwrap_or(Decimal::ZERO), success_rate, - avg_cache_hit_rate, + avg_cache_hit_rate: avg_cache_hit_rate + .map(Decimal::try_from) + .and_then(|r| r.ok()), }) } None => Err(MetricsError::ProjectNotFound(project_name.to_string())), } } - /// Get recent build metrics across all projects. - /// - /// # Arguments - /// - `limit`: Maximum number of records to return - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database query fails. pub async fn get_recent_metrics(&self, limit: i64) -> Result, MetricsError> { - let rows = sqlx::query_as( + let rows: Vec<( + Uuid, + String, + String, + String, + i64, + i32, + Option, + Option, + Option, + DateTime, + )> = sqlx::query_as( r#" SELECT id, project_name, build_id, build_status, compilation_time_ms, - dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp + dependency_count, cache_hit_rate::float8, cpu_usage::float8, memory_usage_mb, build_timestamp FROM build_metrics ORDER BY build_timestamp DESC LIMIT $1 @@ -398,31 +312,35 @@ impl MetricsExporter { Ok(rows .into_iter() - .map(|(id, project_name, build_id, status_str, compilation_time_ms, - dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp)| { - BuildMetric { - id: Some(id), + .map( + |( + id, project_name, build_id, - build_status: BuildStatus::from_str(&status_str).unwrap_or(BuildStatus::Failed), + status_str, compilation_time_ms, dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp, - } - }) + )| BuildMetric { + id: Some(id), + project_name, + build_id, + build_status: BuildStatus::from_str(&status_str) + .unwrap_or(BuildStatus::Failed), + compilation_time_ms, + dependency_count, 
+ cache_hit_rate: cache_hit_rate.map(Decimal::try_from).and_then(|r| r.ok()), + cpu_usage: cpu_usage.map(Decimal::try_from).and_then(|r| r.ok()), + memory_usage_mb, + build_timestamp, + }, + ) .collect()) } - /// Delete all metrics for a project. - /// - /// # Arguments - /// - `project_name`: Name of the project - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database operation fails. pub async fn delete_project_metrics(&self, project_name: &str) -> Result { let result = sqlx::query("DELETE FROM build_metrics WHERE project_name = $1") .bind(project_name) @@ -440,27 +358,62 @@ impl MetricsExporter { Ok(result.rows_affected()) } - /// Invalidate Redis cache for a specific project. async fn invalidate_project_cache(&self, project_name: &str) -> Result<(), MetricsError> { let mut conn = self.redis.get_multiplexed_async_connection().await?; - - // Delete all cache keys for this project using SCAN let pattern = format!("build_metrics:{}:*", project_name); let keys: Vec = redis::cmd("KEYS") .arg(&pattern) .query_async(&mut conn) .await?; + for key in &keys { + let _: () = conn.del(key).await?; + } + if !keys.is_empty() { - for key in keys { - let _: () = conn.del(&key).await?; - } debug!(project = %project_name, count = keys.len(), "Invalidated project cache"); + } + + Ok(()) + } +} + +// --------------------------------------------------------------------------- +// SystemMetrics + MetricsExporter +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct SystemMetrics { + pub cpu_usage: f64, + pub memory_usage: u64, + pub uptime: u64, + pub timestamp: DateTime, +} + +pub struct MetricsExporter { + current_metrics: Arc>, +} + +impl Default for MetricsExporter { + fn default() -> Self { + Self::new() + } +} + +impl MetricsExporter { + pub fn new() -> Self { + Self { + current_metrics: Arc::new(RwLock::new(SystemMetrics { + timestamp: Utc::now(), + 
..Default::default() + })), + } + } + #[instrument(skip(self), fields(service.name = "MetricsExporter", service.method = "update_metrics"))] pub async fn update_metrics(&self, cpu: f64, mem: u64, uptime: u64) { let span = TracingService::service_method_span("MetricsExporter", "update_metrics"); let _enter = span.enter(); - let mut metrics = self.current_metrics.write().await; metrics.cpu_usage = cpu; metrics.memory_usage = mem; @@ -473,15 +426,11 @@ impl MetricsExporter { pub async fn get_metrics(&self) -> SystemMetrics { let span = TracingService::service_method_span("MetricsExporter", "get_metrics"); let _enter = span.enter(); - self.current_metrics.read().await.clone() } #[instrument(skip(exporter), fields(service.name = "MetricsExporter", service.method = "run_collector"))] pub async fn run_collector(exporter: Arc) { - let span = TracingService::service_method_span("MetricsExporter", "run_collector"); - let _enter = span.enter(); - info!("Starting system metrics collector worker"); let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(5)); let start_time = Utc::now(); @@ -489,13 +438,8 @@ impl MetricsExporter { loop { interval.tick().await; let uptime = (Utc::now() - start_time).num_seconds() as u64; - // Simulated metrics collection - exporter - .update_metrics(12.5, 1024 * 1024 * 512, uptime) - .await; + exporter.update_metrics(12.5, 1024 * 1024 * 512, uptime).await; } - - Ok(()) } } @@ -506,7 +450,6 @@ impl MetricsExporter { #[cfg(test)] mod tests { use super::*; - use rust_decimal_macros::dec; #[test] fn test_build_status_conversion() { @@ -515,8 +458,14 @@ mod tests { assert_eq!(BuildStatus::Cancelled.as_str(), "cancelled"); assert_eq!(BuildStatus::Running.as_str(), "running"); - assert_eq!(BuildStatus::from_str("success").unwrap(), BuildStatus::Success); - assert_eq!(BuildStatus::from_str("SUCCESS").unwrap(), BuildStatus::Success); + assert_eq!( + BuildStatus::from_str("success").unwrap(), + BuildStatus::Success + ); + assert_eq!( + 
BuildStatus::from_str("SUCCESS").unwrap(), + BuildStatus::Success + ); assert!(BuildStatus::from_str("invalid").is_err()); } @@ -529,8 +478,8 @@ mod tests { build_status: BuildStatus::Success, compilation_time_ms: 5000, dependency_count: 42, - cache_hit_rate: Some(dec!(85.5)), - cpu_usage: Some(dec!(75.2)), + cache_hit_rate: Some(Decimal::from(85u32)), + cpu_usage: Some(Decimal::from(75u32)), memory_usage_mb: Some(1024), build_timestamp: Utc::now(), }; @@ -553,23 +502,6 @@ mod tests { assert!(err.to_string().contains("unknown")); } - #[test] - fn test_build_metrics_summary() { - let summary = BuildMetricsSummary { - project_name: "test".to_string(), - total_builds: 100, - successful_builds: 95, - failed_builds: 5, - avg_compilation_time_ms: dec!(5000), - success_rate: dec!(95), - avg_cache_hit_rate: Some(dec!(80)), - }; - - let json = serde_json::to_string(&summary).unwrap(); - assert!(json.contains("test")); - assert!(json.contains("95")); - } - #[tokio::test] async fn test_build_status_roundtrip() { let statuses = vec![ @@ -578,16 +510,17 @@ mod tests { BuildStatus::Cancelled, BuildStatus::Running, ]; - for status in statuses { let s = status.as_str(); let parsed = BuildStatus::from_str(s).unwrap(); assert_eq!(status, parsed); } + } + + #[tokio::test] async fn test_metrics_collection() { let exporter = MetricsExporter::new(); exporter.update_metrics(25.0, 1024, 60).await; - let metrics = exporter.get_metrics().await; assert_eq!(metrics.cpu_usage, 25.0); assert_eq!(metrics.memory_usage, 1024); diff --git a/backend/src/services/tracing.rs b/backend/src/services/tracing.rs index 538e3d7..5829ded 100644 --- a/backend/src/services/tracing.rs +++ b/backend/src/services/tracing.rs @@ -1,208 +1,15 @@ -//! OpenTelemetry tracing initialisation. +//! OpenTelemetry tracing service for production-grade observability. //! -//! This module wires the [`tracing`] subscriber stack to an OTLP exporter so -//! 
that every `tracing` span is forwarded to an OpenTelemetry-compatible -//! collector (Jaeger, Grafana Tempo, OTEL Collector, …). -//! -//! # Usage -//! -//! ```rust,no_run -//! use backend::services::tracing::{init, TracingConfig}; -//! -//! #[tokio::main] -//! async fn main() -> anyhow::Result<()> { -//! let cfg = TracingConfig::from_env(); -//! let _guard = init(cfg)?; -//! // _guard shuts down the tracer provider when dropped -//! Ok(()) -//! } -//! ``` -//! -//! # Environment variables -//! -//! | Variable | Default | Description | -//! |---|---|---| -//! | `OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4318` | OTLP HTTP collector endpoint | -//! | `OTEL_SERVICE_NAME` | `backend` | Service name attached to every span | -//! | `RUST_LOG` | `backend=debug` | `tracing` filter directive | - -use opentelemetry::global; -use opentelemetry::trace::TracerProvider as _; -use opentelemetry_otlp::{SpanExporter, WithExportConfig}; -use opentelemetry_sdk::{ - trace::{RandomIdGenerator, Sampler, SdkTracerProvider}, - Resource, -}; -use thiserror::Error; -use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter}; - -// --------------------------------------------------------------------------- -// Error type -// --------------------------------------------------------------------------- - -/// Errors that can occur while initialising the tracing stack. -#[derive(Debug, Error)] -pub enum TracingError { - /// The OTLP exporter could not be built. - #[error("Failed to build OTLP span exporter: {0}")] - ExporterBuild(String), - - /// The tracing subscriber could not be installed. - #[error("Failed to install tracing subscriber: {0}")] - SubscriberInit(String), -} - -// --------------------------------------------------------------------------- -// Configuration -// --------------------------------------------------------------------------- - -/// Configuration for the OpenTelemetry tracing stack. 
-#[derive(Debug, Clone)] -pub struct TracingConfig { - /// OTLP HTTP endpoint (e.g. `http://localhost:4318`). - pub otlp_endpoint: String, - /// Logical service name attached to every span. - pub service_name: String, - /// `tracing` filter directive (e.g. `"backend=debug,tower_http=info"`). - pub log_filter: String, -} - -impl TracingConfig { - /// Build configuration from environment variables, falling back to - /// sensible defaults when variables are absent. - pub fn from_env() -> Self { - Self { - otlp_endpoint: std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:4318".to_string()), - service_name: std::env::var("OTEL_SERVICE_NAME") - .unwrap_or_else(|_| "backend".to_string()), - log_filter: std::env::var("RUST_LOG") - .unwrap_or_else(|_| "backend=debug,tower_http=debug".to_string()), - } - } -} - -impl Default for TracingConfig { - fn default() -> Self { - Self::from_env() - } -} - -// --------------------------------------------------------------------------- -// Guard -// --------------------------------------------------------------------------- - -/// RAII guard that shuts down the global tracer provider on drop. -/// -/// Hold this value for the lifetime of the process. Dropping it flushes any -/// in-flight spans and releases the exporter connection. -pub struct TracingGuard { - provider: SdkTracerProvider, -} - -impl TracingGuard { - /// Create a guard backed by a no-op provider (no exporter attached). - /// Useful as a fallback when the real OTel initialisation fails. - pub fn noop() -> Self { - Self { - provider: SdkTracerProvider::builder().build(), - } - } -} - -impl Drop for TracingGuard { - fn drop(&mut self) { - if let Err(e) = self.provider.shutdown() { - // Can't use tracing here — subscriber may already be gone. 
- eprintln!("OpenTelemetry tracer provider shutdown error: {e}"); - } - } -} - -// --------------------------------------------------------------------------- -// Public API -// --------------------------------------------------------------------------- - -/// Initialise the global [`tracing`] subscriber with an OTLP exporter layer. -/// -/// The subscriber stack is: -/// 1. `EnvFilter` — honours `RUST_LOG` / [`TracingConfig::log_filter`]. -/// 2. `tracing_subscriber::fmt` — human-readable output to stdout. -/// 3. `tracing_opentelemetry::OpenTelemetryLayer` — forwards spans to the -/// OTLP collector at [`TracingConfig::otlp_endpoint`]. -/// -/// Returns a [`TracingGuard`] that must be kept alive for the duration of the -/// process. Dropping it triggers a graceful shutdown of the tracer provider. -/// -/// # Errors -/// -/// Returns [`TracingError`] if the exporter cannot be built or the subscriber -/// cannot be installed (e.g. a global subscriber is already set). -pub fn init(cfg: TracingConfig) -> Result { - let provider = build_provider(&cfg)?; - - // Register as the global provider so `global::tracer()` works anywhere. - global::set_tracer_provider(provider.clone()); - - let otel_layer = - tracing_opentelemetry::layer().with_tracer(provider.tracer(cfg.service_name.clone())); - - let filter = - EnvFilter::try_new(&cfg.log_filter).unwrap_or_else(|_| EnvFilter::new("backend=debug")); - - tracing_subscriber::registry() - .with(filter) - .with(tracing_subscriber::fmt::layer()) - .with(otel_layer) - .try_init() - .map_err(|e| TracingError::SubscriberInit(e.to_string()))?; - - Ok(TracingGuard { provider }) -} - -/// Build a [`SdkTracerProvider`] backed by a batched OTLP HTTP exporter. 
-fn build_provider(cfg: &TracingConfig) -> Result { - let exporter = SpanExporter::builder() - .with_http() - .with_endpoint(&cfg.otlp_endpoint) - .build() - .map_err(|e| TracingError::ExporterBuild(e.to_string()))?; - - let resource = Resource::builder() - .with_service_name(cfg.service_name.clone()) - .build(); - - let provider = SdkTracerProvider::builder() - .with_resource(resource) - .with_sampler(Sampler::AlwaysOn) - .with_id_generator(RandomIdGenerator::default()) - .with_batch_exporter(exporter) - .build(); - - Ok(provider) -} - -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- -//! OpenTelemetry tracing service for production-grade observability -//! -//! This module provides the centralized tracing hub for the Crucible backend, -//! implementing OTLP exporter with Jaeger/Zipkin compatibility, semantic conventions, +//! Provides the centralized tracing hub for the Crucible backend, implementing +//! OTLP exporter with Jaeger/Zipkin compatibility, semantic conventions, //! sampling strategies, and proper error propagation. -//! -//! # Features -//! - OTLP/gRPC exporter (Jaeger/Zipkin compatible) -//! - Head-based and tail-based sampling strategies -//! - Semantic conventions for HTTP, DB, and service operations -//! - Resource detection with deployment environment -//! - Span limits and baggage propagation -//! 
- Zero-overhead when tracing is disabled + +#![allow(dead_code)] use opentelemetry::KeyValue; use opentelemetry::trace::TracerProvider as _; use opentelemetry_otlp::WithExportConfig; -use opentelemetry_sdk::trace::{Config, RandomIdGenerator, Sampler, TracerProvider}; +use opentelemetry_sdk::trace::{Config, RandomIdGenerator, Sampler}; use opentelemetry_sdk::Resource; use opentelemetry_semantic_conventions::resource; use std::time::Duration; @@ -210,27 +17,28 @@ use tracing::{info_span, warn}; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::{EnvFilter, Registry}; -/// Central tracing service for initialization and span creation -pub struct TracingService; +// --------------------------------------------------------------------------- +// TracingConfig +// --------------------------------------------------------------------------- -/// Configuration for the tracing service +/// Configuration for the tracing service. #[derive(Clone, Debug)] pub struct TracingConfig { - /// OTLP exporter endpoint (e.g., "http://jaeger:4317") + /// OTLP exporter endpoint (e.g., `"http://jaeger:4317"`). pub otlp_endpoint: String, - /// Service name for resource identification + /// Service name for resource identification. pub service_name: String, - /// Service version + /// Service version. pub service_version: String, - /// Deployment environment (dev, staging, production) + /// Deployment environment (`dev`, `staging`, `production`). pub environment: String, - /// Sampling ratio (0.0 to 1.0) + /// Sampling ratio in `[0.0, 1.0]`. pub sampling_ratio: f64, - /// Maximum number of attributes per span + /// Maximum number of attributes per span. pub max_attributes_per_span: u32, - /// Maximum number of events per span + /// Maximum number of events per span. pub max_events_per_span: u32, - /// Maximum number of links per span + /// Maximum number of links per span. 
pub max_links_per_span: u32, } @@ -240,7 +48,7 @@ impl Default for TracingConfig { otlp_endpoint: "http://localhost:4317".to_string(), service_name: "crucible-backend".to_string(), service_version: env!("CARGO_PKG_VERSION").to_string(), - environment: std::env::var("ENV").unwrap_or("dev".to_string()), + environment: std::env::var("ENV").unwrap_or_else(|_| "dev".to_string()), sampling_ratio: 1.0, max_attributes_per_span: 128, max_events_per_span: 128, @@ -250,7 +58,7 @@ impl Default for TracingConfig { } impl TracingConfig { - /// Create a new tracing configuration with defaults + /// Create a new configuration with the given service name and version. pub fn new(service_name: String, service_version: String) -> Self { Self { service_name, @@ -259,44 +67,52 @@ impl TracingConfig { } } - /// Set a custom OTLP endpoint + /// Override the OTLP endpoint. pub fn with_otlp_endpoint(mut self, endpoint: String) -> Self { self.otlp_endpoint = endpoint; self } - /// Set the deployment environment + /// Set the deployment environment and adjust sampling accordingly. pub fn with_environment(mut self, env: String) -> Self { - self.environment = env.clone(); self.sampling_ratio = match env.as_str() { "production" => 0.01, "staging" => 0.1, _ => 1.0, }; + self.environment = env; self } - /// Set custom sampling ratio (0.0 to 1.0) + /// Set a custom sampling ratio clamped to `[0.0, 1.0]`. pub fn with_sampling_ratio(mut self, ratio: f64) -> Self { self.sampling_ratio = ratio.max(0.0).min(1.0); self } } +// --------------------------------------------------------------------------- +// TracingService +// --------------------------------------------------------------------------- + +/// Central tracing service for initialization and span creation. +pub struct TracingService; + impl TracingService { - /// Initialize the global tracer provider with OTLP exporter + /// Initialize the global tracer provider with an OTLP exporter. 
pub fn init(config: TracingConfig) -> anyhow::Result<()> { let resource = Resource::new(vec![ KeyValue::new(resource::SERVICE_NAME, config.service_name.clone()), KeyValue::new(resource::SERVICE_VERSION, config.service_version.clone()), - KeyValue::new(resource::DEPLOYMENT_ENVIRONMENT, config.environment.clone()), + KeyValue::new( + resource::DEPLOYMENT_ENVIRONMENT, + config.environment.clone(), + ), KeyValue::new("service.namespace", "crucible"), ]); let sampler = if config.environment == "production" { - Sampler::ParentBased(Box::new( - Sampler::TraceIdRatioBased(config.sampling_ratio), - )) + Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased(config.sampling_ratio))) } else { Sampler::AlwaysOn }; @@ -305,9 +121,9 @@ impl TracingService { .with_resource(resource) .with_sampler(sampler) .with_id_generator(RandomIdGenerator::default()) - .with_max_attributes_per_span(config.max_attributes_per_span as u32) - .with_max_events_per_span(config.max_events_per_span as u32) - .with_max_links_per_span(config.max_links_per_span as u32); + .with_max_attributes_per_span(config.max_attributes_per_span) + .with_max_events_per_span(config.max_events_per_span) + .with_max_links_per_span(config.max_links_per_span); let tracer_provider = opentelemetry_otlp::new_pipeline() .tracing() @@ -321,9 +137,7 @@ impl TracingService { .install_batch(opentelemetry_sdk::runtime::Tokio) .map_err(|e| anyhow::anyhow!("Failed to install OTLP exporter: {}", e))?; - // Get a tracer from the provider let tracer = tracer_provider.tracer("crucible-backend"); - let telemetry_layer = tracing_opentelemetry::layer().with_tracer(tracer); let subscriber = Registry::default() @@ -337,16 +151,18 @@ impl TracingService { tracing::subscriber::set_global_default(subscriber) .map_err(|e| anyhow::anyhow!("Failed to set global subscriber: {}", e))?; - tracing::info!("OpenTelemetry tracing initialized successfully"); - tracing::info!("Service: {}", config.service_name); - tracing::info!("Environment: {}", 
config.environment); - tracing::info!("OTLP Endpoint: {}", config.otlp_endpoint); - tracing::info!("Sampling Ratio: {:.1}%", config.sampling_ratio * 100.0); + tracing::info!( + service = %config.service_name, + environment = %config.environment, + otlp_endpoint = %config.otlp_endpoint, + sampling_pct = config.sampling_ratio * 100.0, + "OpenTelemetry tracing initialized" + ); Ok(()) } - /// Create an HTTP request span with semantic conventions + /// Create an HTTP request span with semantic conventions. pub fn http_request_span(method: &str, path: &str, user_id: Option<&str>) -> tracing::Span { info_span!( "http.request", @@ -361,7 +177,7 @@ impl TracingService { ) } - /// Create a database query span with semantic conventions + /// Create a database query span with semantic conventions. pub fn db_query_span(query: &str, db_system: &str, operation: &str) -> tracing::Span { let truncated_query = query .split('\n') @@ -383,7 +199,7 @@ impl TracingService { ) } - /// Create a Redis command span with semantic conventions + /// Create a Redis command span with semantic conventions. pub fn redis_command_span(command: &str, key: Option<&str>) -> tracing::Span { info_span!( "db.redis.command", @@ -395,7 +211,7 @@ impl TracingService { ) } - /// Create a service method span for business operations + /// Create a service method span for business operations. pub fn service_method_span(service_name: &str, method_name: &str) -> tracing::Span { info_span!( "service.method", @@ -406,7 +222,7 @@ impl TracingService { ) } - /// Create an async job/task span + /// Create an async job/task span. pub fn job_span(job_name: &str, job_id: &str) -> tracing::Span { info_span!( "job.execute", @@ -417,118 +233,63 @@ impl TracingService { ) } - /// Mark current span with error information + /// Record error information on the current span. 
pub fn record_error(span: &tracing::Span, error_message: &str, error_type: &str) { span.record("error.type", error_type); warn!("Span error recorded: {} ({})", error_message, error_type); } } +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + #[cfg(test)] mod tests { use super::*; #[test] - fn test_config_defaults() { - // Build config directly without relying on env vars. - let cfg = TracingConfig { - otlp_endpoint: "http://localhost:4318".to_string(), - service_name: "backend".to_string(), - log_filter: "backend=debug,tower_http=debug".to_string(), - }; - assert_eq!(cfg.otlp_endpoint, "http://localhost:4318"); - assert_eq!(cfg.service_name, "backend"); - assert!(!cfg.log_filter.is_empty()); - } - - #[test] - fn test_config_from_env_values() { - // Verify that TracingConfig correctly stores whatever values are given. - let cfg = TracingConfig { - otlp_endpoint: "http://collector:4318".to_string(), - service_name: "my-service".to_string(), - log_filter: "info".to_string(), - }; - assert_eq!(cfg.otlp_endpoint, "http://collector:4318"); - assert_eq!(cfg.service_name, "my-service"); - assert_eq!(cfg.log_filter, "info"); + fn test_tracing_config_default() { + let config = TracingConfig::default(); + assert_eq!(config.service_name, "crucible-backend"); + assert_eq!(config.sampling_ratio, 1.0); } #[test] - fn test_tracing_error_display() { - let e = TracingError::ExporterBuild("bad url".to_string()); - assert!(e.to_string().contains("bad url")); - - let e = TracingError::SubscriberInit("already set".to_string()); - assert!(e.to_string().contains("already set")); + fn test_tracing_config_with_environment() { + let config = TracingConfig::new("test-service".to_string(), "0.1.0".to_string()) + .with_environment("production".to_string()); + assert_eq!(config.environment, "production"); + assert_eq!(config.sampling_ratio, 0.01); } #[test] - fn 
test_build_provider_succeeds() { - // build_provider only constructs SDK objects; no network connection is - // opened, so this works without a live collector. - let cfg = TracingConfig { - otlp_endpoint: "http://localhost:4318".to_string(), - service_name: "test".to_string(), - log_filter: "debug".to_string(), - }; - let result = build_provider(&cfg); - assert!(result.is_ok()); - let _ = result.unwrap().shutdown(); + fn test_tracing_config_staging_sampling() { + let config = TracingConfig::default().with_environment("staging".to_string()); + assert_eq!(config.sampling_ratio, 0.1); } #[test] - fn test_build_provider_custom_endpoint() { - let cfg = TracingConfig { - otlp_endpoint: "http://otel-collector.internal:4318".to_string(), - service_name: "svc-a".to_string(), - log_filter: "info".to_string(), - }; - let result = build_provider(&cfg); - assert!(result.is_ok()); - let _ = result.unwrap().shutdown(); + fn test_tracing_config_dev_sampling() { + let config = TracingConfig::default().with_environment("dev".to_string()); + assert_eq!(config.sampling_ratio, 1.0); } #[test] - fn test_tracing_guard_shuts_down_on_drop() { - let cfg = TracingConfig { - otlp_endpoint: "http://localhost:4318".to_string(), - service_name: "guard-test".to_string(), - log_filter: "debug".to_string(), - }; - let provider = build_provider(&cfg).unwrap(); - let guard = TracingGuard { provider }; - drop(guard); // must not panic - } + fn test_sampling_ratio_bounds() { + let config = TracingConfig::default().with_sampling_ratio(1.5); + assert_eq!(config.sampling_ratio, 1.0); - #[test] - fn test_tracing_guard_noop() { - let guard = TracingGuard::noop(); - drop(guard); // must not panic + let config = TracingConfig::default().with_sampling_ratio(-0.5); + assert_eq!(config.sampling_ratio, 0.0); } #[test] fn test_config_clone() { - let cfg = TracingConfig { - otlp_endpoint: "http://a:4318".to_string(), - service_name: "svc".to_string(), - log_filter: "debug".to_string(), - }; + let cfg = 
TracingConfig::new("svc".to_string(), "1.0.0".to_string()); let cloned = cfg.clone(); - assert_eq!(cfg.otlp_endpoint, cloned.otlp_endpoint); assert_eq!(cfg.service_name, cloned.service_name); - fn test_tracing_config_default() { - let config = TracingConfig::default(); - assert_eq!(config.service_name, "crucible-backend"); - assert_eq!(config.sampling_ratio, 1.0); - } - - #[test] - fn test_tracing_config_with_environment() { - let config = TracingConfig::new("test-service".to_string(), "0.1.0".to_string()) - .with_environment("production".to_string()); - assert_eq!(config.environment, "production"); - assert_eq!(config.sampling_ratio, 0.01); + assert_eq!(cfg.otlp_endpoint, cloned.otlp_endpoint); } #[test] @@ -564,13 +325,4 @@ mod tests { let span = TracingService::job_span("process_transaction", "job-456"); drop(span); } - - #[test] - fn test_sampling_ratio_bounds() { - let config = TracingConfig::default().with_sampling_ratio(1.5); - assert_eq!(config.sampling_ratio, 1.0); - - let config = TracingConfig::default().with_sampling_ratio(-0.5); - assert_eq!(config.sampling_ratio, 0.0); - } } diff --git a/backend/tests/load/dashboard_load.rs b/backend/tests/load/dashboard_load.rs new file mode 100644 index 0000000..1a63013 --- /dev/null +++ b/backend/tests/load/dashboard_load.rs @@ -0,0 +1,453 @@ +//! Concurrent load tests for the `GET /api/dashboard` endpoint. +//! +//! These tests verify that the dashboard handler remains stable and correct +//! under concurrent load. The handler degrades gracefully when Redis is +//! unavailable (falls back to live service data), so tests run without any +//! external infrastructure. +//! +//! # Running +//! +//! ```bash +//! cargo test -p backend --test load_tests load::dashboard_load -- --nocapture +//! 
``` + +use std::sync::Arc; +use std::time::Instant; + +use axum::{body::to_bytes, routing::get, Router}; +use axum::http::StatusCode; +use hyper::Request; +use tower::ServiceExt; + +use backend::api::handlers::dashboard::{get_dashboard, DashboardState}; +use backend::services::{ + alerts::AlertDispatcher, + error_recovery::ErrorManager, + log_alerts::AlertManager, + sys_metrics::MetricsExporter, +}; + +use crate::load::framework::{assert_load_result, LoadConfig, LoadResult}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Build a test router wired to `GET /api/dashboard` with mock state. +/// +/// Redis is pointed at a port that will refuse connections so the handler +/// exercises its graceful-degradation path (cache miss → live data). +fn build_app() -> Router { + let state = Arc::new(DashboardState { + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager: Arc::new(ErrorManager::new()), + alert_manager: Arc::new(AlertManager::new()), + // Unreachable Redis — handler must degrade gracefully. + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state) +} + +/// Run a full load test using the framework and return the [`LoadResult`]. 
+async fn run_framework_load(concurrency: usize, requests_per_task: usize) -> LoadResult { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(concurrency, requests_per_task); + run_load(cfg, || async { + let app = build_app(); + let start = Instant::now(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + (resp.status(), start.elapsed()) + }) + .await +} + +// --------------------------------------------------------------------------- +// Basic correctness +// --------------------------------------------------------------------------- + +/// Dashboard returns 200 even when Redis is unreachable. +#[tokio::test] +async fn test_dashboard_returns_200_without_redis() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(resp.status(), StatusCode::OK); +} + +/// Response body contains the three top-level keys. +#[tokio::test] +async fn test_dashboard_response_shape() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + assert!(json.get("metrics").is_some(), "must have 'metrics'"); + assert!( + json.get("active_recovery_tasks").is_some(), + "must have 'active_recovery_tasks'" + ); + assert!(json.get("active_alerts").is_some(), "must have 'active_alerts'"); +} + +/// `metrics` object contains the expected sub-fields. 
+#[tokio::test] +async fn test_dashboard_metrics_fields() { + let state = Arc::new(DashboardState { + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager: Arc::new(ErrorManager::new()), + alert_manager: Arc::new(AlertManager::new()), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + // Seed some metrics so the values are non-zero. + state.metrics_exporter.update_metrics(42.0, 2048, 120).await; + + let app = Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + assert_eq!(json["metrics"]["cpu_usage"], 42.0); + assert_eq!(json["metrics"]["memory_usage"], 2048); + assert_eq!(json["metrics"]["uptime"], 120); +} + +/// `active_recovery_tasks` reflects tasks registered in the error manager. 
+#[tokio::test] +async fn test_dashboard_includes_recovery_tasks() { + use backend::services::error_recovery::RecoveryError; + + let error_manager = Arc::new(ErrorManager::new()); + error_manager + .handle_error(RecoveryError::Internal("boom".into()), "worker_a") + .await + .unwrap(); + + let state = Arc::new(DashboardState { + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager, + alert_manager: Arc::new(AlertManager::new()), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let app = Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + let tasks = json["active_recovery_tasks"].as_array().unwrap(); + assert_eq!(tasks.len(), 1); + assert_eq!(tasks[0]["name"], "worker_a"); +} + +/// `active_alerts` reflects alerts fired by the alert manager. 
+#[tokio::test] +async fn test_dashboard_includes_active_alerts() { + use backend::services::log_alerts::{AlertRule, AlertSeverity}; + use backend::services::log_aggregator::LogEntry; + use chrono::Utc; + use uuid::Uuid; + + let alert_manager = Arc::new(AlertManager::new()); + alert_manager + .add_rule(AlertRule { + id: Uuid::new_v4(), + name: "test-rule".to_string(), + pattern: "CRITICAL".to_string(), + severity: AlertSeverity::Critical, + threshold: 1, + window_secs: 60, + }) + .await + .unwrap(); + + alert_manager + .evaluate(&LogEntry { + timestamp: Utc::now(), + level: "ERROR".to_string(), + message: "CRITICAL failure detected".to_string(), + service: "test".to_string(), + }) + .await; + + let state = Arc::new(DashboardState { + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager: Arc::new(ErrorManager::new()), + alert_manager, + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let app = Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + let alerts = json["active_alerts"].as_array().unwrap(); + assert_eq!(alerts.len(), 1, "one alert should be active"); + assert_eq!(alerts[0]["rule_name"], "test-rule"); + assert_eq!(alerts[0]["severity"], "critical"); +} + +/// Empty state returns empty arrays for tasks and alerts. 
+#[tokio::test] +async fn test_dashboard_empty_state() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + assert_eq!( + json["active_recovery_tasks"].as_array().unwrap().len(), + 0 + ); + assert_eq!(json["active_alerts"].as_array().unwrap().len(), 0); +} + +// --------------------------------------------------------------------------- +// Concurrency tests +// --------------------------------------------------------------------------- + +/// 10 concurrent requests all return 200. +#[tokio::test] +async fn test_dashboard_10_concurrent() { + let handles: Vec<_> = (0..10) + .map(|_| { + let app = build_app(); + tokio::spawn(async move { + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + resp.status() + }) + }) + .collect(); + + for handle in handles { + assert_eq!(handle.await.unwrap(), StatusCode::OK); + } +} + +/// 50 concurrent requests all return 200. +#[tokio::test] +async fn test_dashboard_50_concurrent() { + let handles: Vec<_> = (0..50) + .map(|_| { + let app = build_app(); + tokio::spawn(async move { + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + resp.status() + }) + }) + .collect(); + + for handle in handles { + assert_eq!(handle.await.unwrap(), StatusCode::OK); + } +} + +// --------------------------------------------------------------------------- +// Framework-based load tests with SLO assertions +// --------------------------------------------------------------------------- + +/// 10 concurrent tasks × 10 requests each = 100 total. +/// SLO: 0% errors, p99 < 500ms. 
+#[tokio::test] +async fn test_dashboard_load_100_requests_slo() { + let result = run_framework_load(10, 10).await; + result.print_summary("GET /api/dashboard — 100 requests"); + assert_load_result(&result, 0.0, std::time::Duration::from_millis(500)); +} + +/// 20 concurrent tasks × 10 requests each = 200 total. +/// SLO: 0% errors, p99 < 1s. +#[tokio::test] +async fn test_dashboard_load_200_requests_slo() { + let result = run_framework_load(20, 10).await; + result.print_summary("GET /api/dashboard — 200 requests"); + assert_load_result(&result, 0.0, std::time::Duration::from_secs(1)); +} + +/// Verify that all responses under load have the correct JSON shape. +#[tokio::test] +async fn test_dashboard_load_response_shape_under_load() { + let mut join_set = tokio::task::JoinSet::new(); + for _ in 0..5_usize { + join_set.spawn(async { + let mut results = Vec::new(); + for _ in 0..4_usize { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let status = resp.status(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + results.push((status, bytes.to_vec())); + } + results + }); + } + + while let Some(Ok(batch)) = join_set.join_next().await { + for (status, body) in batch { + assert_eq!(status, StatusCode::OK); + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert!(json.get("metrics").is_some()); + assert!(json.get("active_recovery_tasks").is_some()); + assert!(json.get("active_alerts").is_some()); + } + } +} + +/// Verify that shared state is read consistently under concurrent load. +/// +/// All concurrent requests should see the same seeded metric values. 
+#[tokio::test] +async fn test_dashboard_shared_state_consistency() { + let metrics_exporter = Arc::new(MetricsExporter::new()); + metrics_exporter.update_metrics(77.0, 4096, 500).await; + + let state = Arc::new(DashboardState { + metrics_exporter, + error_manager: Arc::new(ErrorManager::new()), + alert_manager: Arc::new(AlertManager::new()), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let mut join_set = tokio::task::JoinSet::new(); + for _ in 0..10_usize { + let state_clone = state.clone(); + join_set.spawn(async move { + let app = Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state_clone); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + serde_json::from_slice::(&bytes).unwrap() + }); + } + + while let Some(Ok(json)) = join_set.join_next().await { + assert_eq!(json["metrics"]["cpu_usage"], 77.0); + assert_eq!(json["metrics"]["memory_usage"], 4096); + assert_eq!(json["metrics"]["uptime"], 500); + } +} + +/// Verify serialization round-trip of the dashboard response. +#[tokio::test] +async fn test_dashboard_serialization_roundtrip() { + use backend::api::handlers::dashboard::DashboardData; + use backend::services::sys_metrics::SystemMetrics; + + let data = DashboardData { + metrics: SystemMetrics::default(), + active_recovery_tasks: vec![], + active_alerts: vec![], + }; + + let json = serde_json::to_string(&data).unwrap(); + let back: DashboardData = serde_json::from_str(&json).unwrap(); + assert_eq!(back.active_recovery_tasks.len(), 0); + assert_eq!(back.active_alerts.len(), 0); +} diff --git a/backend/tests/load/framework.rs b/backend/tests/load/framework.rs new file mode 100644 index 0000000..d862ca0 --- /dev/null +++ b/backend/tests/load/framework.rs @@ -0,0 +1,585 @@ +//! 
Load testing framework — shared helpers, metrics, and assertion utilities. +//! +//! # Overview +//! +//! This module provides the core primitives used by every load-test module: +//! +//! - [`LoadConfig`] — controls concurrency, iteration count, and timeout. +//! - [`RequestOutcome`] — the result of a single request (status + latency). +//! - [`LoadResult`] — aggregated statistics over a completed load run. +//! - [`run_load`] — fires `config.concurrency` tasks, each making +//! `config.requests_per_task` requests, and collects [`LoadResult`]. +//! - [`assert_load_result`] — convenience assertion that fails the test when +//! the error rate or p99 latency exceeds the configured thresholds. +//! +//! # Example +//! +//! ```rust,ignore +//! use crate::load::framework::{LoadConfig, run_load, assert_load_result}; +//! +//! let cfg = LoadConfig::default(); +//! let result = run_load(cfg, || async { +//! // build and fire one request, return (StatusCode, Duration) +//! let app = build_app(); +//! let start = std::time::Instant::now(); +//! let resp = app.oneshot(req()).await.unwrap(); +//! (resp.status(), start.elapsed()) +//! }).await; +//! +//! assert_load_result(&result, 0.0, std::time::Duration::from_millis(500)); +//! ``` + +use std::time::{Duration, Instant}; + +use axum::http::StatusCode; +use tokio::task::JoinSet; + +// --------------------------------------------------------------------------- +// Configuration +// --------------------------------------------------------------------------- + +/// Parameters that control a single load-test run. +#[derive(Debug, Clone)] +pub struct LoadConfig { + /// Number of concurrent Tokio tasks. + pub concurrency: usize, + /// Number of sequential requests each task fires. + pub requests_per_task: usize, + /// Maximum wall-clock time allowed for the entire run. + /// The test will panic if this is exceeded. + pub timeout: Duration, +} + +impl LoadConfig { + /// Create a new configuration. 
+ pub fn new(concurrency: usize, requests_per_task: usize) -> Self { + Self { + concurrency, + requests_per_task, + timeout: Duration::from_secs(30), + } + } + + /// Override the timeout. + pub fn with_timeout(mut self, timeout: Duration) -> Self { + self.timeout = timeout; + self + } + + /// Total number of requests that will be fired. + pub fn total_requests(&self) -> usize { + self.concurrency * self.requests_per_task + } +} + +impl Default for LoadConfig { + /// Sensible defaults: 10 concurrent tasks × 5 requests each = 50 total. + fn default() -> Self { + Self::new(10, 5) + } +} + +// --------------------------------------------------------------------------- +// Per-request outcome +// --------------------------------------------------------------------------- + +/// The outcome of a single HTTP request. +#[derive(Debug, Clone)] +pub struct RequestOutcome { + /// HTTP status code returned by the handler. + pub status: StatusCode, + /// Wall-clock time from request start to response received. + pub latency: Duration, +} + +impl RequestOutcome { + /// Returns `true` if the status code is a 2xx success. + pub fn is_success(&self) -> bool { + self.status.is_success() + } +} + +// --------------------------------------------------------------------------- +// Aggregated result +// --------------------------------------------------------------------------- + +/// Aggregated statistics collected after a load run completes. +#[derive(Debug, Clone)] +pub struct LoadResult { + /// All individual request outcomes, in completion order. + pub outcomes: Vec, + /// Total wall-clock time for the entire run. + pub total_duration: Duration, +} + +impl LoadResult { + /// Total number of requests fired. + pub fn total(&self) -> usize { + self.outcomes.len() + } + + /// Number of successful (2xx) requests. + pub fn successes(&self) -> usize { + self.outcomes.iter().filter(|o| o.is_success()).count() + } + + /// Number of failed (non-2xx) requests. 
+ pub fn failures(&self) -> usize { + self.total() - self.successes() + } + + /// Error rate as a fraction in `[0.0, 1.0]`. + pub fn error_rate(&self) -> f64 { + if self.total() == 0 { + return 0.0; + } + self.failures() as f64 / self.total() as f64 + } + + /// Throughput in requests per second. + pub fn rps(&self) -> f64 { + if self.total_duration.is_zero() { + return 0.0; + } + self.total() as f64 / self.total_duration.as_secs_f64() + } + + /// Minimum observed latency. + pub fn min_latency(&self) -> Duration { + self.outcomes + .iter() + .map(|o| o.latency) + .min() + .unwrap_or(Duration::ZERO) + } + + /// Maximum observed latency. + pub fn max_latency(&self) -> Duration { + self.outcomes + .iter() + .map(|o| o.latency) + .max() + .unwrap_or(Duration::ZERO) + } + + /// Mean (average) latency. + pub fn mean_latency(&self) -> Duration { + if self.outcomes.is_empty() { + return Duration::ZERO; + } + let total_nanos: u128 = self.outcomes.iter().map(|o| o.latency.as_nanos()).sum(); + Duration::from_nanos((total_nanos / self.outcomes.len() as u128) as u64) + } + + /// Percentile latency. `p` must be in `(0.0, 100.0]`. + /// + /// Uses the nearest-rank method. + pub fn percentile_latency(&self, p: f64) -> Duration { + assert!(p > 0.0 && p <= 100.0, "percentile must be in (0, 100]"); + if self.outcomes.is_empty() { + return Duration::ZERO; + } + let mut latencies: Vec = self.outcomes.iter().map(|o| o.latency).collect(); + latencies.sort_unstable(); + let idx = ((p / 100.0) * latencies.len() as f64).ceil() as usize; + latencies[idx.saturating_sub(1).min(latencies.len() - 1)] + } + + /// p50 (median) latency. + pub fn p50(&self) -> Duration { + self.percentile_latency(50.0) + } + + /// p95 latency. + pub fn p95(&self) -> Duration { + self.percentile_latency(95.0) + } + + /// p99 latency. + pub fn p99(&self) -> Duration { + self.percentile_latency(99.0) + } + + /// Print a human-readable summary to stdout. 
+ pub fn print_summary(&self, label: &str) { + println!( + "\n=== Load Test: {label} ===\n\ + Total requests : {total}\n\ + Successes : {ok}\n\ + Failures : {fail}\n\ + Error rate : {err:.2}%\n\ + Throughput : {rps:.1} req/s\n\ + Latency min : {min:?}\n\ + Latency mean : {mean:?}\n\ + Latency p50 : {p50:?}\n\ + Latency p95 : {p95:?}\n\ + Latency p99 : {p99:?}\n\ + Latency max : {max:?}\n\ + Total duration : {dur:?}\n", + label = label, + total = self.total(), + ok = self.successes(), + fail = self.failures(), + err = self.error_rate() * 100.0, + rps = self.rps(), + min = self.min_latency(), + mean = self.mean_latency(), + p50 = self.p50(), + p95 = self.p95(), + p99 = self.p99(), + max = self.max_latency(), + dur = self.total_duration, + ); + } +} + +// --------------------------------------------------------------------------- +// Runner +// --------------------------------------------------------------------------- + +/// Run a load test described by `config`. +/// +/// `request_fn` is called once per request. It must be `Clone` so that each +/// Tokio task gets its own copy. It returns `(StatusCode, Duration)`. +/// +/// # Panics +/// +/// Panics if the run exceeds `config.timeout`. 
+pub async fn run_load(config: LoadConfig, request_fn: F) -> LoadResult +where + F: Fn() -> Fut + Clone + Send + 'static, + Fut: std::future::Future + Send, +{ + let wall_start = Instant::now(); + let mut join_set: JoinSet> = JoinSet::new(); + + for _ in 0..config.concurrency { + let fn_clone = request_fn.clone(); + let n = config.requests_per_task; + join_set.spawn(async move { + let mut outcomes = Vec::with_capacity(n); + for _ in 0..n { + let (status, latency) = fn_clone().await; + outcomes.push(RequestOutcome { status, latency }); + } + outcomes + }); + } + + // Collect with timeout guard + let mut all_outcomes: Vec = Vec::with_capacity(config.total_requests()); + let deadline = tokio::time::Instant::now() + config.timeout; + + loop { + match tokio::time::timeout_at(deadline, join_set.join_next()).await { + Ok(Some(Ok(outcomes))) => all_outcomes.extend(outcomes), + Ok(Some(Err(e))) => panic!("Load test task panicked: {e}"), + Ok(None) => break, // all tasks done + Err(_) => panic!( + "Load test timed out after {:?} ({} requests completed of {})", + config.timeout, + all_outcomes.len(), + config.total_requests() + ), + } + } + + LoadResult { + outcomes: all_outcomes, + total_duration: wall_start.elapsed(), + } +} + +// --------------------------------------------------------------------------- +// Assertion helper +// --------------------------------------------------------------------------- + +/// Assert that a [`LoadResult`] meets the given SLO targets. +/// +/// # Arguments +/// - `result` — the completed load run. +/// - `max_error_rate` — maximum acceptable error rate as a fraction (e.g. `0.01` = 1 %). +/// - `max_p99` — maximum acceptable p99 latency. +/// +/// # Panics +/// +/// Panics with a descriptive message if either threshold is exceeded. 
+pub fn assert_load_result(result: &LoadResult, max_error_rate: f64, max_p99: Duration) { + let error_rate = result.error_rate(); + let p99 = result.p99(); + + if error_rate > max_error_rate { + panic!( + "Load test failed: error rate {:.2}% exceeds maximum {:.2}%\n\ + (failures={}, total={})", + error_rate * 100.0, + max_error_rate * 100.0, + result.failures(), + result.total(), + ); + } + + if p99 > max_p99 { + panic!( + "Load test failed: p99 latency {:?} exceeds maximum {:?}", + p99, max_p99, + ); + } +} + +// --------------------------------------------------------------------------- +// Unit tests for the framework itself +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + // --- LoadConfig --- + + #[test] + fn test_load_config_total_requests() { + let cfg = LoadConfig::new(4, 10); + assert_eq!(cfg.total_requests(), 40); + } + + #[test] + fn test_load_config_default_total() { + let cfg = LoadConfig::default(); + assert_eq!(cfg.total_requests(), 50); + } + + #[test] + fn test_load_config_with_timeout() { + let cfg = LoadConfig::default().with_timeout(Duration::from_secs(60)); + assert_eq!(cfg.timeout, Duration::from_secs(60)); + } + + // --- RequestOutcome --- + + #[test] + fn test_request_outcome_is_success_2xx() { + let o = RequestOutcome { + status: StatusCode::OK, + latency: Duration::from_millis(5), + }; + assert!(o.is_success()); + } + + #[test] + fn test_request_outcome_is_not_success_5xx() { + let o = RequestOutcome { + status: StatusCode::INTERNAL_SERVER_ERROR, + latency: Duration::from_millis(5), + }; + assert!(!o.is_success()); + } + + #[test] + fn test_request_outcome_is_not_success_4xx() { + let o = RequestOutcome { + status: StatusCode::NOT_FOUND, + latency: Duration::from_millis(5), + }; + assert!(!o.is_success()); + } + + // --- LoadResult statistics --- + + fn make_result(latencies_ms: &[u64], statuses: &[StatusCode]) -> LoadResult { + assert_eq!(latencies_ms.len(), 
statuses.len()); + let outcomes = latencies_ms + .iter() + .zip(statuses.iter()) + .map(|(&ms, &status)| RequestOutcome { + status, + latency: Duration::from_millis(ms), + }) + .collect(); + LoadResult { + outcomes, + total_duration: Duration::from_millis(100), + } + } + + #[test] + fn test_load_result_counts() { + let result = make_result( + &[10, 20, 30], + &[StatusCode::OK, StatusCode::OK, StatusCode::INTERNAL_SERVER_ERROR], + ); + assert_eq!(result.total(), 3); + assert_eq!(result.successes(), 2); + assert_eq!(result.failures(), 1); + } + + #[test] + fn test_load_result_error_rate() { + let result = make_result( + &[10, 20], + &[StatusCode::OK, StatusCode::INTERNAL_SERVER_ERROR], + ); + assert!((result.error_rate() - 0.5).abs() < f64::EPSILON); + } + + #[test] + fn test_load_result_zero_error_rate() { + let result = make_result(&[10, 20, 30], &[StatusCode::OK; 3]); + assert_eq!(result.error_rate(), 0.0); + } + + #[test] + fn test_load_result_empty_error_rate() { + let result = LoadResult { + outcomes: vec![], + total_duration: Duration::ZERO, + }; + assert_eq!(result.error_rate(), 0.0); + } + + #[test] + fn test_load_result_min_max_latency() { + let result = make_result(&[5, 50, 25], &[StatusCode::OK; 3]); + assert_eq!(result.min_latency(), Duration::from_millis(5)); + assert_eq!(result.max_latency(), Duration::from_millis(50)); + } + + #[test] + fn test_load_result_mean_latency() { + let result = make_result(&[10, 20, 30], &[StatusCode::OK; 3]); + assert_eq!(result.mean_latency(), Duration::from_millis(20)); + } + + #[test] + fn test_load_result_p50() { + // sorted: [10, 20, 30, 40, 50] → p50 = 30 + let result = make_result(&[50, 10, 30, 20, 40], &[StatusCode::OK; 5]); + assert_eq!(result.p50(), Duration::from_millis(30)); + } + + #[test] + fn test_load_result_p99_single_element() { + let result = make_result(&[42], &[StatusCode::OK]); + assert_eq!(result.p99(), Duration::from_millis(42)); + } + + #[test] + fn test_load_result_p95_100_elements() { + // 100 
elements: 1ms..=100ms; p95 should be 95ms + let latencies: Vec = (1..=100).collect(); + let statuses = vec![StatusCode::OK; 100]; + let result = make_result(&latencies, &statuses); + assert_eq!(result.p95(), Duration::from_millis(95)); + } + + #[test] + fn test_load_result_rps() { + let result = LoadResult { + outcomes: vec![ + RequestOutcome { status: StatusCode::OK, latency: Duration::from_millis(1) }; + 100 + ], + total_duration: Duration::from_secs(1), + }; + assert!((result.rps() - 100.0).abs() < 0.01); + } + + #[test] + fn test_load_result_rps_zero_duration() { + let result = LoadResult { + outcomes: vec![], + total_duration: Duration::ZERO, + }; + assert_eq!(result.rps(), 0.0); + } + + // --- assert_load_result --- + + #[test] + fn test_assert_load_result_passes() { + let result = make_result(&[10, 20, 30], &[StatusCode::OK; 3]); + // Should not panic + assert_load_result(&result, 0.0, Duration::from_millis(100)); + } + + #[test] + #[should_panic(expected = "error rate")] + fn test_assert_load_result_fails_on_error_rate() { + let result = make_result( + &[10, 20], + &[StatusCode::OK, StatusCode::INTERNAL_SERVER_ERROR], + ); + assert_load_result(&result, 0.0, Duration::from_secs(1)); + } + + #[test] + #[should_panic(expected = "p99 latency")] + fn test_assert_load_result_fails_on_p99() { + let result = make_result(&[500], &[StatusCode::OK]); + assert_load_result(&result, 0.0, Duration::from_millis(100)); + } + + // --- run_load --- + + #[tokio::test] + async fn test_run_load_collects_all_outcomes() { + let cfg = LoadConfig::new(4, 5); // 20 total + let result = run_load(cfg, || async { + (StatusCode::OK, Duration::from_millis(1)) + }) + .await; + + assert_eq!(result.total(), 20); + assert_eq!(result.failures(), 0); + } + + #[tokio::test] + async fn test_run_load_records_failures() { + let cfg = LoadConfig::new(1, 2); + let counter = std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let counter_clone = counter.clone(); + + let result = 
run_load(cfg, move || { + let c = counter_clone.clone(); + async move { + let n = c.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let status = if n % 2 == 0 { + StatusCode::OK + } else { + StatusCode::INTERNAL_SERVER_ERROR + }; + (status, Duration::from_millis(1)) + } + }) + .await; + + assert_eq!(result.total(), 2); + assert_eq!(result.failures(), 1); + } + + #[tokio::test] + async fn test_run_load_respects_concurrency() { + // Each task sleeps 10ms and reports the time elapsed since the shared + // `start`; five such sleeps run back to back would need at least 50ms. + let cfg = LoadConfig::new(5, 1); + let start = Instant::now(); + let result = run_load(cfg, move || async move { + tokio::time::sleep(Duration::from_millis(10)).await; + (StatusCode::OK, start.elapsed()) + }) + .await; + + // NOTE(review): the 200ms ceiling also admits a fully sequential run (~50ms); tighten it if this must prove concurrency. + assert!(result.total_duration < Duration::from_millis(200)); + assert_eq!(result.total(), 5); + } + + #[tokio::test] + async fn test_run_load_default_config() { + let result = run_load(LoadConfig::default(), || async { + (StatusCode::OK, Duration::from_millis(1)) + }) + .await; + assert_eq!(result.total(), 50); + } +} diff --git a/backend/tests/load/mod.rs b/backend/tests/load/mod.rs index 223744f..5f007b5 100644 --- a/backend/tests/load/mod.rs +++ b/backend/tests/load/mod.rs @@ -1,12 +1,39 @@ //! Load and stress tests for the backend API. //! //! These tests exercise the API under concurrent load to verify that the -//! server remains stable and responsive. They are gated behind the -//! `load_tests` feature flag so they don't run in normal CI: +//! server remains stable and responsive. They are designed to run without +//! external services (PostgreSQL, Redis) by using in-process Axum routers +//! with mock state. +//! +//! # Running //! //! ```bash +//! # All load tests //! cargo test -p backend --test load_tests -- --nocapture +//! +//! # A specific module +//! 
cargo test -p backend --test load_tests load::status_load -- --nocapture +//! cargo test -p backend --test load_tests load::profile_load -- --nocapture +//! cargo test -p backend --test load_tests load::dashboard_load -- --nocapture +//! cargo test -p backend --test load_tests load::stellar_load -- --nocapture +//! cargo test -p backend --test load_tests load::framework -- --nocapture //! ``` +//! +//! # Architecture +//! +//! Each sub-module builds an in-process Axum [`Router`] with a lightweight +//! mock [`AppState`] (no real DB or Redis connections). Requests are fired +//! via [`tower::ServiceExt::oneshot`], which bypasses the network entirely +//! and exercises only the handler + middleware stack. +//! +//! The [`framework`] module provides shared helpers: +//! - [`LoadConfig`] — concurrency / iteration parameters +//! - [`LoadResult`] — aggregated latency statistics +//! - [`run_load`] — generic concurrent request runner +//! - [`assert_load_result`] — assertion helper for p99 / error-rate targets -pub mod status_load; +pub mod dashboard_load; +pub mod framework; pub mod profile_load; +pub mod status_load; +pub mod stellar_load; diff --git a/backend/tests/load/profile_load.rs b/backend/tests/load/profile_load.rs index 49b9b17..88e1c03 100644 --- a/backend/tests/load/profile_load.rs +++ b/backend/tests/load/profile_load.rs @@ -1,29 +1,64 @@ //! Concurrent load tests for the `POST /api/profile` endpoint. +//! +//! These tests verify that the profiling trigger handler remains stable and +//! correct under concurrent load without requiring a live database or Redis. +//! +//! # Running +//! +//! ```bash +//! cargo test -p backend --test load_tests load::profile_load -- --nocapture +//! 
``` -use axum::{routing::post, Router}; -use hyper::{Request, StatusCode}; use std::sync::Arc; +use std::time::Instant; + +use axum::{body::to_bytes, routing::post, Router}; +use axum::http::StatusCode; +use hyper::Request; use tower::ServiceExt; use backend::api::handlers::profiling::{trigger_profile_collection, AppState}; -use backend::services::{error_recovery::ErrorManager, sys_metrics::MetricsExporter}; use backend::config::{AppConfig, reload::ConfigManager}; +use backend::services::{error_recovery::ErrorManager, sys_metrics::MetricsExporter}; + +use crate::load::framework::{assert_load_result, LoadConfig, LoadResult}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- +/// Build a test router wired to the `POST /api/profile` handler. fn build_app() -> Router { + let (log_aggregator, _rx) = backend::services::log_aggregator::LogAggregator::new(); let state = Arc::new(AppState { db: None, metrics_exporter: Arc::new(MetricsExporter::new()), error_manager: Arc::new(ErrorManager::new()), config_manager: Arc::new(ConfigManager::new(AppConfig::default())), + log_aggregator: Arc::new(log_aggregator), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), }); Router::new() .route("/api/profile", post(trigger_profile_collection)) .with_state(state) } +/// Build a valid profile trigger request body. +fn profile_request_body(label: &str) -> axum::body::Body { + axum::body::Body::from( + serde_json::json!({ + "duration_secs": 10, + "sample_rate_hz": 100, + "label": label + }) + .to_string(), + ) +} + +/// Fire `n` concurrent requests and assert all return 200. 
async fn run_concurrent(n: usize) { let handles: Vec<_> = (0..n) - .map(|_| { + .map(|i| { let app = build_app(); tokio::spawn(async move { let resp = app @@ -32,11 +67,7 @@ async fn run_concurrent(n: usize) { .method("POST") .uri("/api/profile") .header("content-type", "application/json") - .body(axum::body::Body::from(serde_json::json!({ - "duration_secs": 10, - "sample_rate_hz": 100, - "label": "load-test" - }).to_string())) + .body(profile_request_body(&format!("load-test-{i}"))) .unwrap(), ) .await @@ -52,6 +83,34 @@ async fn run_concurrent(n: usize) { } } +/// Run a full load test using the framework and return the [`LoadResult`]. +async fn run_framework_load(concurrency: usize, requests_per_task: usize) -> LoadResult { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(concurrency, requests_per_task); + run_load(cfg, || async { + let app = build_app(); + let start = Instant::now(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(profile_request_body("load-test")) + .unwrap(), + ) + .await + .unwrap(); + (resp.status(), start.elapsed()) + }) + .await +} + +// --------------------------------------------------------------------------- +// Basic concurrency tests +// --------------------------------------------------------------------------- + #[tokio::test] async fn test_profile_10_concurrent() { run_concurrent(10).await; @@ -62,14 +121,53 @@ async fn test_profile_50_concurrent() { run_concurrent(50).await; } +// --------------------------------------------------------------------------- +// Response shape +// --------------------------------------------------------------------------- + +/// Verify response body shape. 
+#[tokio::test] +async fn test_profile_response_shape() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(profile_request_body("shape-test")) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(resp.status(), StatusCode::OK); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + assert!(json.get("data").is_some(), "response must have 'data' key"); + assert!( + json["data"].get("message").is_some(), + "data must have 'message' key" + ); + assert!( + json["data"].get("profile_id").is_some(), + "data must have 'profile_id' key" + ); + assert!( + json["data"].get("estimated_completion").is_some(), + "data must have 'estimated_completion' key" + ); +} + /// Verify each response contains a unique profile_id. #[tokio::test] async fn test_profile_unique_ids() { - use axum::body::to_bytes; use std::collections::HashSet; let mut ids = HashSet::new(); - for _ in 0..10 { + for i in 0..10 { let app = build_app(); let resp = app .oneshot( @@ -77,54 +175,269 @@ async fn test_profile_unique_ids() { .method("POST") .uri("/api/profile") .header("content-type", "application/json") - .body(axum::body::Body::from(serde_json::json!({ - "duration_secs": 10, - "sample_rate_hz": 100, - "label": "load-test-id" - }).to_string())) + .body(profile_request_body(&format!("unique-id-test-{i}"))) .unwrap(), ) .await .unwrap(); let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); - let json: serde_json::Value = serde_json::from_slice(&bytes).expect("Valid JSON"); - let id = json["data"]["profile_id"].as_str().expect("profile_id in data").to_string(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + let id = json["data"]["profile_id"] + .as_str() + .expect("profile_id must be a string") + .to_string(); ids.insert(id); } - // All 10 profile IDs 
should be unique - assert_eq!(ids.len(), 10); + assert_eq!(ids.len(), 10, "all 10 profile IDs must be unique"); } -/// Verify response body shape. +/// Verify the `message` field contains the label from the request. #[tokio::test] -async fn test_profile_response_shape() { - use axum::body::to_bytes; - +async fn test_profile_message_contains_label() { let app = build_app(); + let label = "my-custom-label"; let resp = app .oneshot( Request::builder() .method("POST") .uri("/api/profile") .header("content-type", "application/json") - .body(axum::body::Body::from(serde_json::json!({ - "duration_secs": 10, - "sample_rate_hz": 100, - "label": "load-test-shape" - }).to_string())) + .body(profile_request_body(label)) .unwrap(), ) .await .unwrap(); - assert_eq!(resp.status(), StatusCode::OK); - let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + let message = json["data"]["message"].as_str().unwrap(); + assert!( + message.contains(label), + "message '{message}' must contain label '{label}'" + ); +} + +// --------------------------------------------------------------------------- +// Validation tests +// --------------------------------------------------------------------------- + +/// Verify that an empty `label` value returns 400 / 422. 
+#[tokio::test] +async fn test_profile_missing_label_rejected() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(axum::body::Body::from( + serde_json::json!({ + "duration_secs": 10, + "sample_rate_hz": 100, + "label": "" + }) + .to_string(), + )) + .unwrap(), + ) + .await + .unwrap(); + + // Empty label should fail validation → 400 or 422 + assert!( + resp.status() == StatusCode::BAD_REQUEST + || resp.status() == StatusCode::UNPROCESSABLE_ENTITY, + "expected 400 or 422, got {}", + resp.status() + ); +} + +/// Verify that `duration_secs = 0` is rejected. +#[tokio::test] +async fn test_profile_zero_duration_rejected() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(axum::body::Body::from( + serde_json::json!({ + "duration_secs": 0, + "sample_rate_hz": 100, + "label": "test" + }) + .to_string(), + )) + .unwrap(), + ) + .await + .unwrap(); + + assert!( + resp.status() == StatusCode::BAD_REQUEST + || resp.status() == StatusCode::UNPROCESSABLE_ENTITY, + "expected 400 or 422, got {}", + resp.status() + ); +} + +/// Verify that `duration_secs` exceeding 3600 is rejected. +#[tokio::test] +async fn test_profile_excessive_duration_rejected() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(axum::body::Body::from( + serde_json::json!({ + "duration_secs": 9999, + "sample_rate_hz": 100, + "label": "test" + }) + .to_string(), + )) + .unwrap(), + ) + .await + .unwrap(); + + assert!( + resp.status() == StatusCode::BAD_REQUEST + || resp.status() == StatusCode::UNPROCESSABLE_ENTITY, + "expected 400 or 422, got {}", + resp.status() + ); +} + +/// Verify that a non-JSON body returns 400 / 415. 
+#[tokio::test] +async fn test_profile_non_json_body_rejected() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "text/plain") + .body(axum::body::Body::from("not json")) + .unwrap(), + ) + .await + .unwrap(); + + assert!( + resp.status().is_client_error(), + "expected 4xx, got {}", + resp.status() + ); +} + +// --------------------------------------------------------------------------- +// Framework-based load tests with SLO assertions +// --------------------------------------------------------------------------- + +/// 10 concurrent tasks × 10 requests each = 100 total. +/// SLO: 0% errors, p99 < 500ms. +#[tokio::test] +async fn test_profile_load_100_requests_slo() { + let result = run_framework_load(10, 10).await; + result.print_summary("POST /api/profile — 100 requests"); + assert_load_result(&result, 0.0, std::time::Duration::from_millis(500)); +} + +/// 20 concurrent tasks × 10 requests each = 200 total. +/// SLO: 0% errors, p99 < 1s. +#[tokio::test] +async fn test_profile_load_200_requests_slo() { + let result = run_framework_load(20, 10).await; + result.print_summary("POST /api/profile — 200 requests"); + assert_load_result(&result, 0.0, std::time::Duration::from_secs(1)); +} + +/// Verify that all responses under load have the correct JSON shape. 
+#[tokio::test] +async fn test_profile_load_response_shape_under_load() { + let mut join_set = tokio::task::JoinSet::new(); + for i in 0..5_usize { + join_set.spawn(async move { + let mut results = Vec::new(); + for j in 0..4_usize { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(profile_request_body(&format!("task-{i}-req-{j}"))) + .unwrap(), + ) + .await + .unwrap(); + let status = resp.status(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + results.push((status, bytes.to_vec())); + } + results + }); + } + + while let Some(joined) = join_set.join_next().await { // Err(JoinError) means a task panicked + for (status, body) in joined.expect("load task panicked") { + assert_eq!(status, StatusCode::OK); + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!(json["status"], "success"); + assert!(json["data"].get("profile_id").is_some()); + assert!(json["data"].get("message").is_some()); + assert!(json["data"].get("estimated_completion").is_some()); + } + } +} + +/// Verify that concurrent requests each produce a unique profile_id. 
+#[tokio::test] +async fn test_profile_concurrent_unique_ids() { + use std::collections::HashSet; + use std::sync::Mutex; + + let ids = Arc::new(Mutex::new(HashSet::new())); + let mut join_set = tokio::task::JoinSet::new(); + + for i in 0..20_usize { + let ids_clone = ids.clone(); + join_set.spawn(async move { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(profile_request_body(&format!("concurrent-{i}"))) + .unwrap(), + ) + .await + .unwrap(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + let id = json["data"]["profile_id"] + .as_str() + .unwrap() + .to_string(); + ids_clone.lock().unwrap().insert(id); + }); + } + + while join_set.join_next().await.is_some() {} - assert!(json.get("data").is_some()); - assert!(json["data"].get("message").is_some()); - assert!(json["data"].get("profile_id").is_some()); + let collected = ids.lock().unwrap(); + assert_eq!(collected.len(), 20, "all 20 concurrent profile IDs must be unique"); } diff --git a/backend/tests/load/status_load.rs b/backend/tests/load/status_load.rs index e714aca..abbb09b 100644 --- a/backend/tests/load/status_load.rs +++ b/backend/tests/load/status_load.rs @@ -1,21 +1,42 @@ //! Concurrent load tests for the `GET /api/status` endpoint. +//! +//! These tests verify that the status handler remains stable and correct +//! under concurrent load without requiring a live database or Redis instance. +//! +//! # Running +//! +//! ```bash +//! cargo test -p backend --test load_tests load::status_load -- --nocapture +//! 
``` -use axum::{routing::get, Router}; -use hyper::{Request, StatusCode}; use std::sync::Arc; +use std::time::Instant; + +use axum::{body::to_bytes, routing::get, Router}; +use axum::http::StatusCode; +use hyper::Request; use tower::ServiceExt; use backend::api::handlers::profiling::{get_system_status, AppState}; -use backend::services::{error_recovery::ErrorManager, sys_metrics::MetricsExporter}; use backend::config::{AppConfig, reload::ConfigManager}; +use backend::services::{error_recovery::ErrorManager, sys_metrics::MetricsExporter}; + +use crate::load::framework::{assert_load_result, LoadConfig, LoadResult}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- -/// Build a test router with the status endpoint. +/// Build a test router wired to the `/api/status` handler with mock state. fn build_app() -> Router { + let (log_aggregator, _rx) = backend::services::log_aggregator::LogAggregator::new(); let state = Arc::new(AppState { db: None, metrics_exporter: Arc::new(MetricsExporter::new()), error_manager: Arc::new(ErrorManager::new()), config_manager: Arc::new(ConfigManager::new(AppConfig::default())), + log_aggregator: Arc::new(log_aggregator), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), }); Router::new() .route("/api/status", get(get_system_status)) @@ -48,6 +69,32 @@ async fn run_concurrent(n: usize) { } } +/// Run a full load test using the framework and return the [`LoadResult`]. 
+async fn run_framework_load(concurrency: usize, requests_per_task: usize) -> LoadResult { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(concurrency, requests_per_task); + run_load(cfg, || async { + let app = build_app(); + let start = Instant::now(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + (resp.status(), start.elapsed()) + }) + .await +} + +// --------------------------------------------------------------------------- +// Basic concurrency tests +// --------------------------------------------------------------------------- + #[tokio::test] async fn test_status_10_concurrent() { run_concurrent(10).await; @@ -63,6 +110,10 @@ async fn test_status_100_concurrent() { run_concurrent(100).await; } +// --------------------------------------------------------------------------- +// Sequential stability +// --------------------------------------------------------------------------- + /// Verify that repeated sequential requests all succeed. #[tokio::test] async fn test_status_sequential_stability() { @@ -82,11 +133,13 @@ async fn test_status_sequential_stability() { } } +// --------------------------------------------------------------------------- +// Response shape +// --------------------------------------------------------------------------- + /// Verify response body contains expected JSON keys. 
#[tokio::test] async fn test_status_response_shape() { - use axum::body::to_bytes; - let app = build_app(); let resp = app .oneshot( @@ -104,8 +157,252 @@ async fn test_status_response_shape() { let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); assert_eq!(json["status"], "success"); - assert!(json.get("data").is_some()); - assert!(json["data"].get("status").is_some()); - assert!(json["data"].get("uptime_secs").is_some()); - assert!(json["data"].get("active_recovery_tasks").is_some()); + assert!(json.get("data").is_some(), "response must have 'data' key"); + assert!( + json["data"].get("status").is_some(), + "data must have 'status' key" + ); + assert!( + json["data"].get("uptime_secs").is_some(), + "data must have 'uptime_secs' key" + ); + assert!( + json["data"].get("active_recovery_tasks").is_some(), + "data must have 'active_recovery_tasks' key" + ); +} + +/// Verify the `status` field value is `"healthy"`. +#[tokio::test] +async fn test_status_healthy_value() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(json["data"]["status"], "healthy"); +} + +/// Verify `active_recovery_tasks` starts at zero with a fresh state. +#[tokio::test] +async fn test_status_zero_recovery_tasks_initially() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(json["data"]["active_recovery_tasks"], 0); +} + +/// Verify `uptime_secs` is a non-negative integer. 
+#[tokio::test] +async fn test_status_uptime_is_non_negative() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + let uptime = json["data"]["uptime_secs"].as_u64(); + assert!(uptime.is_some(), "uptime_secs must be a non-negative integer"); +} + +// --------------------------------------------------------------------------- +// Framework-based load tests with SLO assertions +// --------------------------------------------------------------------------- + +/// 10 concurrent tasks × 10 requests each = 100 total. +/// SLO: 0% errors, p99 < 500ms. +#[tokio::test] +async fn test_status_load_100_requests_slo() { + let result = run_framework_load(10, 10).await; + result.print_summary("GET /api/status — 100 requests"); + assert_load_result(&result, 0.0, std::time::Duration::from_millis(500)); +} + +/// 20 concurrent tasks × 10 requests each = 200 total. +/// SLO: 0% errors, p99 < 1s. +#[tokio::test] +async fn test_status_load_200_requests_slo() { + let result = run_framework_load(20, 10).await; + result.print_summary("GET /api/status — 200 requests"); + assert_load_result(&result, 0.0, std::time::Duration::from_secs(1)); +} + +/// Verify that all responses under load have the correct JSON shape. 
+#[tokio::test] +async fn test_status_load_response_shape_under_load() { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(5, 4); // 20 total + let outcomes: Vec<(StatusCode, Vec<u8>)> = { + let mut join_set = tokio::task::JoinSet::new(); + for _ in 0..cfg.concurrency { + join_set.spawn(async { + let mut results = Vec::new(); + for _ in 0..4 { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let status = resp.status(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + results.push((status, bytes.to_vec())); + } + results + }); + } + let mut all = Vec::new(); + while let Some(joined) = join_set.join_next().await { // Err(JoinError) means a task panicked + all.extend(joined.expect("load task panicked")); + } + all + }; + + for (status, body) in outcomes { + assert_eq!(status, StatusCode::OK); + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!(json["status"], "success"); + assert!(json["data"].get("status").is_some()); + assert!(json["data"].get("uptime_secs").is_some()); + assert!(json["data"].get("active_recovery_tasks").is_some()); + } +} + +/// Verify that the handler is idempotent — repeated calls return the same shape. +#[tokio::test] +async fn test_status_idempotent_responses() { + let app = build_app(); + let mut previous: Option<serde_json::Value> = None; + + for _ in 0..5 { + let resp = app + .clone() + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + if let Some(ref prev) = previous { + // Keys must be identical; values may differ (e.g. 
uptime_secs) + assert_eq!( + prev.as_object().unwrap().keys().collect::<Vec<_>>(), + json.as_object().unwrap().keys().collect::<Vec<_>>(), + "response keys must be stable across calls" + ); + } + previous = Some(json); + } +} + +/// Verify that the handler correctly reflects recovery tasks added to state. +#[tokio::test] +async fn test_status_reflects_recovery_tasks() { + use backend::services::error_recovery::RecoveryError; + + let error_manager = Arc::new(ErrorManager::new()); + error_manager + .handle_error(RecoveryError::Internal("boom".into()), "worker_a") + .await + .unwrap(); + + let (log_aggregator, _rx) = backend::services::log_aggregator::LogAggregator::new(); + let state = Arc::new(AppState { + db: None, + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager: error_manager.clone(), + config_manager: Arc::new(ConfigManager::new(AppConfig::default())), + log_aggregator: Arc::new(log_aggregator), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let app = Router::new() + .route("/api/status", get(get_system_status)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(json["data"]["active_recovery_tasks"], 1); +} + +/// Verify that the handler correctly reflects updated metrics. 
+#[tokio::test] +async fn test_status_reflects_updated_metrics() { + let metrics_exporter = Arc::new(MetricsExporter::new()); + metrics_exporter.update_metrics(55.0, 2048, 300).await; + + let (log_aggregator, _rx) = backend::services::log_aggregator::LogAggregator::new(); + let state = Arc::new(AppState { + db: None, + metrics_exporter: metrics_exporter.clone(), + error_manager: Arc::new(ErrorManager::new()), + config_manager: Arc::new(ConfigManager::new(AppConfig::default())), + log_aggregator: Arc::new(log_aggregator), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let app = Router::new() + .route("/api/status", get(get_system_status)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(json["data"]["uptime_secs"], 300); + assert_eq!(json["data"]["memory_used_bytes"], 2048); } diff --git a/backend/tests/load/stellar_load.rs b/backend/tests/load/stellar_load.rs new file mode 100644 index 0000000..1eed7e0 --- /dev/null +++ b/backend/tests/load/stellar_load.rs @@ -0,0 +1,399 @@ +//! Concurrent load tests for the `GET /.well-known/stellar.toml` endpoint. +//! +//! These tests verify that the Stellar SEP-1 handler remains stable and +//! correct under concurrent load. The handler is stateless so no mock +//! infrastructure is required. +//! +//! # Running +//! +//! ```bash +//! cargo test -p backend --test load_tests load::stellar_load -- --nocapture +//! 
``` + +use std::time::Instant; + +use axum::{body::to_bytes, routing::get, Router}; +use axum::http::StatusCode; +use hyper::Request; +use tower::ServiceExt; + +use backend::api::handlers::stellar::get_stellar_toml; + +use crate::load::framework::{assert_load_result, LoadConfig, LoadResult}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Build a test router wired to the Stellar TOML handler. +fn build_app() -> Router { + Router::new().route("/.well-known/stellar.toml", get(get_stellar_toml)) +} + +/// Run a full load test using the framework and return the [`LoadResult`]. +async fn run_framework_load(concurrency: usize, requests_per_task: usize) -> LoadResult { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(concurrency, requests_per_task); + run_load(cfg, || async { + let app = build_app(); + let start = Instant::now(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + (resp.status(), start.elapsed()) + }) + .await +} + +// --------------------------------------------------------------------------- +// Basic correctness +// --------------------------------------------------------------------------- + +/// Handler returns 200 OK. +#[tokio::test] +async fn test_stellar_toml_returns_200() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(resp.status(), StatusCode::OK); +} + +/// Response includes the required `Access-Control-Allow-Origin: *` header (SEP-1). 
+#[tokio::test] +async fn test_stellar_toml_cors_header() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let cors = resp + .headers() + .get("access-control-allow-origin") + .expect("Access-Control-Allow-Origin header must be present"); + assert_eq!(cors, "*"); +} + +/// Response `Content-Type` is `text/plain`. +#[tokio::test] +async fn test_stellar_toml_content_type() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let ct = resp + .headers() + .get("content-type") + .expect("Content-Type header must be present"); + assert!( + ct.to_str().unwrap().contains("text/plain"), + "Content-Type must be text/plain, got: {:?}", + ct + ); +} + +/// Response body contains the required TOML fields. +#[tokio::test] +async fn test_stellar_toml_body_content() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let body = std::str::from_utf8(&bytes).unwrap(); + + assert!(body.contains("VERSION"), "body must contain VERSION"); + assert!( + body.contains("NETWORK_PASSPHRASE"), + "body must contain NETWORK_PASSPHRASE" + ); + assert!(body.contains("ACCOUNTS"), "body must contain ACCOUNTS"); + assert!(body.contains("CURRENCIES"), "body must contain CURRENCIES"); +} + +/// Response body contains the USDC currency entry. 
+#[tokio::test] +async fn test_stellar_toml_contains_usdc() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let body = std::str::from_utf8(&bytes).unwrap(); + + assert!(body.contains("USDC"), "body must contain USDC currency"); +} + +/// Response body is non-empty. +#[tokio::test] +async fn test_stellar_toml_non_empty_body() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + assert!(!bytes.is_empty(), "response body must not be empty"); +} + +/// Response is identical across multiple calls (handler is pure / stateless). +#[tokio::test] +async fn test_stellar_toml_deterministic() { + let mut bodies: Vec<Vec<u8>> = Vec::new(); + + for _ in 0..5 { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + bodies.push(bytes.to_vec()); + } + + let first = &bodies[0]; + for body in &bodies[1..] { + assert_eq!(body, first, "all responses must be identical"); + } +} + +// --------------------------------------------------------------------------- +// Concurrency tests +// --------------------------------------------------------------------------- + +/// 10 concurrent requests all return 200. 
+#[tokio::test]
+async fn test_stellar_toml_10_concurrent() {
+    // Spawn every request before awaiting any of them.
+    let mut tasks = Vec::new();
+    for _ in 0..10 {
+        let router = build_app();
+        tasks.push(tokio::spawn(async move {
+            let request = Request::builder()
+                .uri("/.well-known/stellar.toml")
+                .body(axum::body::Body::empty())
+                .unwrap();
+            router.oneshot(request).await.unwrap().status()
+        }));
+    }
+
+    for task in tasks {
+        let status = task.await.unwrap();
+        assert_eq!(status, StatusCode::OK);
+    }
+}
+
+/// Fifty requests issued concurrently must each come back with 200.
+#[tokio::test]
+async fn test_stellar_toml_50_concurrent() {
+    // Spawn every request before awaiting any of them.
+    let mut tasks = Vec::new();
+    for _ in 0..50 {
+        let router = build_app();
+        tasks.push(tokio::spawn(async move {
+            let request = Request::builder()
+                .uri("/.well-known/stellar.toml")
+                .body(axum::body::Body::empty())
+                .unwrap();
+            router.oneshot(request).await.unwrap().status()
+        }));
+    }
+
+    for task in tasks {
+        let status = task.await.unwrap();
+        assert_eq!(status, StatusCode::OK);
+    }
+}
+
+/// One hundred requests issued concurrently must each come back with 200.
+#[tokio::test]
+async fn test_stellar_toml_100_concurrent() {
+    // Spawn every request before awaiting any of them.
+    let mut tasks = Vec::new();
+    for _ in 0..100 {
+        let router = build_app();
+        tasks.push(tokio::spawn(async move {
+            let request = Request::builder()
+                .uri("/.well-known/stellar.toml")
+                .body(axum::body::Body::empty())
+                .unwrap();
+            router.oneshot(request).await.unwrap().status()
+        }));
+    }
+
+    for task in tasks {
+        let status = task.await.unwrap();
+        assert_eq!(status, StatusCode::OK);
+    }
+}
+
+/// Verify that all concurrent responses have identical bodies.
+#[tokio::test]
+async fn test_stellar_toml_concurrent_identical_bodies() {
+    let mut join_set = tokio::task::JoinSet::new();
+    for _ in 0..20_usize {
+        join_set.spawn(async {
+            let app = build_app();
+            let resp = app
+                .oneshot(
+                    Request::builder()
+                        .uri("/.well-known/stellar.toml")
+                        .body(axum::body::Body::empty())
+                        .unwrap(),
+                )
+                .await
+                .unwrap();
+            to_bytes(resp.into_body(), usize::MAX)
+                .await
+                .unwrap()
+                .to_vec()
+        });
+    }
+
+    // One raw response body per spawned task, in completion order.
+    let mut bodies: Vec<Vec<u8>> = Vec::new();
+    while let Some(joined) = join_set.join_next().await {
+        // Fail on the task's own error instead of silently ending collection
+        // at the first task that panicked.
+        bodies.push(joined.expect("request task panicked"));
+    }
+
+    assert_eq!(bodies.len(), 20);
+    let first = &bodies[0];
+    for body in &bodies[1..] {
+        assert_eq!(body, first, "all concurrent responses must be identical");
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Framework-based load tests with SLO assertions
+// ---------------------------------------------------------------------------
+
+/// 10 concurrent tasks × 10 requests each = 100 total.
+/// SLO: 0% errors, p99 < 200ms (stateless handler should be very fast).
+#[tokio::test]
+async fn test_stellar_load_100_requests_slo() {
+    let result = run_framework_load(10, 10).await;
+    result.print_summary("GET /.well-known/stellar.toml — 100 requests");
+    assert_load_result(&result, 0.0, std::time::Duration::from_millis(200));
+}
+
+/// 20 concurrent tasks × 10 requests each = 200 total.
+/// SLO: 0% errors, p99 < 500ms.
+#[tokio::test]
+async fn test_stellar_load_200_requests_slo() {
+    let result = run_framework_load(20, 10).await;
+    result.print_summary("GET /.well-known/stellar.toml — 200 requests");
+    assert_load_result(&result, 0.0, std::time::Duration::from_millis(500));
+}
+
+/// 50 concurrent tasks × 10 requests each = 500 total.
+/// SLO: 0% errors, p99 < 1s.
+#[tokio::test]
+async fn test_stellar_load_500_requests_slo() {
+    let result = run_framework_load(50, 10).await;
+    result.print_summary("GET /.well-known/stellar.toml — 500 requests");
+    assert_load_result(&result, 0.0, std::time::Duration::from_secs(1));
+}
+
+/// Verify that all responses under load have the correct headers.
+///
+/// Ten tasks each send five sequential requests and record the status and
+/// `Access-Control-Allow-Origin` value of every response; all of them are
+/// then checked on the test task.
+#[tokio::test]
+async fn test_stellar_load_headers_under_load() {
+    let mut join_set = tokio::task::JoinSet::new();
+    for _ in 0..10_usize {
+        join_set.spawn(async {
+            let mut results = Vec::new();
+            for _ in 0..5_usize {
+                let app = build_app();
+                let resp = app
+                    .oneshot(
+                        Request::builder()
+                            .uri("/.well-known/stellar.toml")
+                            .body(axum::body::Body::empty())
+                            .unwrap(),
+                    )
+                    .await
+                    .unwrap();
+                let status = resp.status();
+                // Copy the header out as an owned string so it outlives `resp`.
+                let cors = resp
+                    .headers()
+                    .get("access-control-allow-origin")
+                    .map(|v| v.to_str().unwrap().to_string());
+                results.push((status, cors));
+            }
+            results
+        });
+    }
+
+    while let Some(joined) = join_set.join_next().await {
+        // A panicked task must fail the test; matching only `Some(Ok(_))`
+        // would end the loop early and skip the remaining batches.
+        let batch = joined.expect("load task panicked");
+        for (status, cors) in batch {
+            assert_eq!(status, StatusCode::OK);
+            assert_eq!(
+                cors.as_deref(),
+                Some("*"),
+                "CORS header must be '*' under load"
+            );
+        }
+    }
+}
diff --git a/backend/tests/load_tests.rs b/backend/tests/load_tests.rs
index b24467d..8b86fd7 100644
--- a/backend/tests/load_tests.rs
+++ b/backend/tests/load_tests.rs
@@ -1,11 +1,29 @@
 //! Load and stress test suite entry point.
 //!
-//! Run with:
+//! This file is the integration test binary for all load tests. Each sub-module
+//! exercises a specific API endpoint under concurrent load using the shared
+//! [`load::framework`] helpers.
+//!
+//! # Running
+//!
 //! ```bash
+//! # All load tests (with output)
 //! cargo test -p backend --test load_tests -- --nocapture
+//!
+//! # A specific endpoint
+//! cargo test -p backend --test load_tests load::status_load -- --nocapture
+//! cargo test -p backend --test load_tests load::profile_load -- --nocapture
+//! 
cargo test -p backend --test load_tests load::dashboard_load -- --nocapture
+//! cargo test -p backend --test load_tests load::stellar_load -- --nocapture
+//!
+//! # Framework unit tests only
+//! cargo test -p backend --test load_tests load::framework -- --nocapture
 //! ```

 mod load {
+    pub mod dashboard_load;
+    pub mod framework;
     pub mod profile_load;
     pub mod status_load;
+    pub mod stellar_load;
 }