diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 6467746..29b3022 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -23,6 +23,7 @@ url = { version = "2", features = ["serde"] } axum = { version = "0.7", features = ["macros"] } tower = { version = "0.5", features = ["full", "util"] } tower-http = { version = "0.5", features = ["cors", "trace", "compression-gzip", "request-id"] } +tower_governor = "0.4" # Async runtime tokio = { version = "1", features = ["full"] } @@ -36,11 +37,17 @@ redis = { version = "0.27", features = ["tokio-comp", "json", "connection-manage # Serialization serde = { version = "1", features = ["derive"] } serde_json = "1" -sha2 = "0.10" +schemars = "0.8" # Observability tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } +opentelemetry = { version = "0.24", features = ["trace", "metrics"] } +opentelemetry-otlp = { version = "0.17", features = ["trace", "grpc-tonic"] } +opentelemetry-semantic-conventions = "0.16" +opentelemetry_sdk = { version = "0.24", features = ["trace", "rt-tokio"] } +tracing-opentelemetry = "0.25" +tonic = "0.12" # OpenTelemetry (Upgrade to 0.31+ from main branch) opentelemetry = { version = "0.31", features = ["trace"] } @@ -53,23 +60,28 @@ opentelemetry-semantic-conventions = "0.16" uuid = { version = "1.0", features = ["v4", "serde"] } chrono = { version = "0.4", features = ["serde"] } dotenvy = "0.15" -thiserror = "1.0" +thiserror = "1" anyhow = "1.0" -config = "0.14.0" +arc-swap = "1.7" +async-trait = "0.1" futures-util = { version = "0.3", default-features = false, features = ["std"] } +base64 = "0.22" +validator = { version = "0.19", features = ["derive"] } +rust_decimal = { version = "1.35", features = ["serde"] } + +# Stellar +stellar-xdr = { version = "21.0", features = ["std"] } -# External Integrations +# API documentation utoipa = { version = "5.0", features = ["axum_extras", "chrono", "uuid"] } utoipa-swagger-ui = { version = "8.0", features = 
["axum"] } + +# Background jobs apalis = { version = "0.6" } apalis-redis = "0.6" -rust_decimal = { version = "1.35", features = ["serde"] } -stellar-xdr = { version = "21.0", features = ["std"] } -base64 = "0.22" -validator = { version = "0.19", features = ["derive"] } -tower_governor = "0.4" + +# Optional: mock support for tests mockall = { version = "0.13", optional = true } -tonic = "0.12" # Scheduler tokio-util = "0.7" @@ -78,22 +90,29 @@ async-trait = "0.1" arc-swap = "1.7" [dev-dependencies] +tower = { version = "0.4", features = ["util"] } +tower-http = { version = "0.5", features = ["trace"] } +hyper = { version = "1.0", features = ["full"] } +mime = "0.3" +tokio = { version = "1", features = ["full", "test-util"] } reqwest = { version = "0.12", features = ["json"] } tokio-test = "0.4" testcontainers = "0.16" wiremock = "0.6" -hyper = { version = "1.0", features = ["full"] } -mime = "0.3" -arc-swap = "1.7" -async-trait = "0.1" +mockall = "0.13" rust_decimal_macros = "1.35" criterion = { version = "0.5", features = ["async_tokio"] } -temp-env = "0.3.6" -toml = "0.8.12" -http-body-util = "0.1" [profile.release] opt-level = 3 lto = true codegen-units = 1 strip = true + +[[bench]] +name = "performance" +harness = false + +[[bench]] +name = "dashboard_bench" +harness = false diff --git a/backend/src/api/handlers/profiling.rs b/backend/src/api/handlers/profiling.rs index 63448a0..c77c74a 100644 --- a/backend/src/api/handlers/profiling.rs +++ b/backend/src/api/handlers/profiling.rs @@ -1,11 +1,8 @@ -use crate::api::contracts::{ - ApiResponse, ProfileTriggerRequest, ProfileTriggerResponse, SystemStatus, ValidatedJson, -}; -use crate::config::reload::ConfigManager; -use crate::services::{ - error_recovery::ErrorManager, log_aggregator::LogAggregator, sys_metrics::MetricsExporter, - tracing::TracingService, -}; +//! Performance profiling and system health API handlers. +//! +//! Provides endpoints for monitoring application health, collecting system +//! 
metrics, and triggering profiling runs. + use axum::{extract::State, response::IntoResponse, Json}; use chrono::{DateTime, Utc}; use redis::Client as RedisClient; @@ -15,44 +12,79 @@ use std::sync::Arc; use tracing::{info, info_span, instrument}; use utoipa::ToSchema; +use crate::api::contracts::{ + ApiResponse, ProfileTriggerRequest, ProfileTriggerResponse, SystemStatus, ValidatedJson, +}; +use crate::config::reload::ConfigManager; +use crate::error::AppError; +use crate::services::{ + error_recovery::ErrorManager, + log_aggregator::LogAggregator, + sys_metrics::MetricsExporter, + tracing::TracingService, +}; +use redis::Client as RedisClient; + +// --------------------------------------------------------------------------- +// Shared application state +// --------------------------------------------------------------------------- + +/// Shared application state passed to profiling and status handlers. pub struct AppState { + /// Optional PostgreSQL connection pool (None in tests). pub db: Option, + /// System metrics exporter. pub metrics_exporter: Arc, + /// Error recovery manager. pub error_manager: Arc, + /// Hot-reloadable configuration manager. pub config_manager: Arc, + /// Async log aggregation pipeline. pub log_aggregator: Arc, + /// Redis client for caching. pub redis: RedisClient, } +// --------------------------------------------------------------------------- +// Response types +// --------------------------------------------------------------------------- + +/// Detailed performance metrics report. #[derive(Debug, Serialize, Deserialize, Clone, ToSchema)] pub struct MetricsReport { - /// Total system uptime in seconds + /// Total system uptime in seconds. pub uptime_secs: u64, - /// Current resident set size (RSS) in bytes + /// Current resident set size (RSS) in bytes. pub memory_usage_bytes: u64, - /// Number of currently active HTTP requests + /// Number of currently active HTTP requests. 
pub active_requests: u32, - /// Percentage of failed requests in the last window + /// Percentage of failed requests in the last window. pub error_rate: f64, - /// Current latency for Stellar ledger ingestion in milliseconds + /// Current latency for Stellar ledger ingestion in milliseconds. pub ledger_ingestion_latency_ms: u32, } +/// System health check response. #[derive(Debug, Serialize, ToSchema)] pub struct HealthResponse { - /// Overall health status (e.g., 'healthy' or 'degraded') + /// Overall health status (e.g., `"healthy"` or `"degraded"`). pub status: String, - /// The current version of the backend service + /// The current version of the backend service. pub version: String, - /// RFC3339 timestamp of the health check + /// RFC3339 timestamp of the health check. pub timestamp: DateTime, - /// Connectivity status to the PostgreSQL database + /// Connectivity status to the PostgreSQL database. pub database_connected: bool, - /// Connectivity status to the Redis cache + /// Connectivity status to the Redis cache. pub redis_connected: bool, } -/// Handler for retrieving detailed performance metrics. +// --------------------------------------------------------------------------- +// Handlers +// --------------------------------------------------------------------------- + +/// `GET /api/v1/profiling/metrics` — retrieve detailed performance metrics. +/// /// Optimized for consumption by monitoring tools like Grafana. 
#[utoipa::path( get, @@ -67,16 +99,10 @@ pub struct HealthResponse { pub async fn get_metrics( State(state): State>, ) -> Result { - let span = info_span!("metrics.collection"); - let _enter = span.enter(); - info!("Collecting performance metrics"); - - // Instrument the metrics exporter call let metrics_span = TracingService::service_method_span("MetricsExporter", "get_metrics"); let _metrics_enter = metrics_span.enter(); - let sys_metrics = state.metrics_exporter.get_metrics().await; drop(_metrics_enter); @@ -91,14 +117,14 @@ pub async fn get_metrics( info!( uptime = sys_metrics.uptime, memory = sys_metrics.memory_usage, - active_requests = 12, "Metrics collected successfully" ); Ok(Json(report)) } -/// Handler for system health checks. +/// `GET /api/v1/profiling/health` — system health check. +/// /// Performs actual pings to downstream services. #[utoipa::path( get, @@ -110,25 +136,27 @@ pub async fn get_metrics( tag = "profiling" )] #[instrument(skip_all, fields(http.method = "GET", http.route = "/api/v1/profiling/health"))] -pub async fn get_health(State(state): State>) -> Result { - let span = info_span!("health.check"); - let _enter = span.enter(); - +pub async fn get_health( + State(state): State>, +) -> Result { info!("Performing system health check"); - // Check database connectivity with tracing - let db_span = TracingService::db_query_span("SELECT 1", "postgres", "PING"); - let _db_enter = db_span.enter(); - - let db_healthy = sqlx::query("SELECT 1") - .fetch_optional(&state.db) - .await - .map(|result| result.is_some()) - .unwrap_or_else(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - false - }); - drop(_db_enter); + let db_healthy = if let Some(ref pool) = state.db { + let db_span = TracingService::db_query_span("SELECT 1", "postgres", "PING"); + let _db_enter = db_span.enter(); + let result = sqlx::query("SELECT 1") + .fetch_optional(pool) + .await + .map(|r| r.is_some()) + .unwrap_or_else(|e| { + 
TracingService::record_error(&db_span, &e.to_string(), "database"); + false + }); + drop(_db_enter); + result + } else { + false + }; let response = HealthResponse { status: if db_healthy { "healthy" } else { "degraded" }.to_string(), @@ -147,28 +175,24 @@ pub async fn get_health(State(state): State>) -> Result impl IntoResponse { - let span = info_span!("prometheus.metrics.export"); - let _enter = span.enter(); - info!("Exporting Prometheus-format metrics"); - let metrics = "# HELP backend_requests_total Total number of requests\n\ -# TYPE backend_requests_total counter\n\ -backend_requests_total 1024\n\ -# HELP backend_ledger_latency_ms Current ledger ingestion latency\n\ -# TYPE backend_ledger_latency_ms gauge\n\ -backend_ledger_latency_ms 120\n"; - metrics.to_string() + "# HELP backend_requests_total Total number of requests\n\ + # TYPE backend_requests_total counter\n\ + backend_requests_total 1024\n\ + # HELP backend_ledger_latency_ms Current ledger ingestion latency\n\ + # TYPE backend_ledger_latency_ms gauge\n\ + backend_ledger_latency_ms 120\n" + .to_string() } -/// Handler for detailed system status +/// `GET /api/status` — detailed system status. #[instrument(skip_all, fields(http.method = "GET", http.route = "/api/status"))] -pub async fn get_system_status(State(state): State>) -> ApiResponse { - let span = info_span!("system.status"); - let _enter = span.enter(); - +pub async fn get_system_status( + State(state): State>, +) -> ApiResponse { info!("Retrieving system status"); let metrics_span = TracingService::service_method_span("MetricsExporter", "get_metrics"); @@ -189,36 +213,34 @@ pub async fn get_system_status(State(state): State>) -> ApiRespons }) } -/// Handler to trigger profile collection (CPU, memory profiling) +/// `POST /api/profile` — trigger a profiling collection run. 
+#[utoipa::path( + post, + path = "/api/profile", + responses( + (status = 200, description = "Profiling collection triggered"), + (status = 400, description = "Invalid request parameters") + ), + tag = "profiling" +)] #[instrument(skip_all, fields(http.method = "POST", http.route = "/api/profile"))] pub async fn trigger_profile_collection( State(_state): State>, ValidatedJson(payload): ValidatedJson, -) -> Result, AppError> { - // In a real implementation, this would trigger a CPU/Memory profile - // using the provided payload (duration, sample rate, etc.) - - // Validate duration doesn't cause overflow in chrono::Duration (Issue #208) - // chrono::Duration::seconds() accepts i64, so we need to ensure payload.duration_secs <= i64::MAX - if payload.duration_secs > i64::MAX as u32 { - return Err(AppError::BadRequest(format!("Invalid duration_secs (Issue #208): too large for time calculation, maximum {}", i64::MAX))); - } - // Additional safety check for chrono::Duration::seconds() bounds - if payload.duration_secs > 2_147_483_647 { - return Err(AppError::BadRequest(format!("Invalid duration_secs (Issue #208): exceeds safe bounds for chrono::Duration::seconds(), maximum 2,147,483,647, got {}", payload.duration_secs))); - } - +) -> ApiResponse { let profile_id = uuid::Uuid::new_v4(); - let message = format!( - "Profiling collection triggered for label: {}", - payload.label + + info!( + profile_id = %profile_id, + label = %payload.label, + duration_secs = payload.duration_secs, + "Profiling collection triggered" ); - let estimated_completion = chrono::Utc::now() - + chrono::Duration::seconds(payload.duration_secs as i64); - Ok(ApiResponse::new(ProfileTriggerResponse { + ApiResponse::new(ProfileTriggerResponse { profile_id, - message, - estimated_completion, - })) + message: format!("Profiling collection triggered for label: {}", payload.label), + estimated_completion: chrono::Utc::now() + + chrono::Duration::seconds(payload.duration_secs as i64), + }) } diff --git 
a/backend/src/config/mod.rs b/backend/src/config/mod.rs index 9ffa1cb..24d8027 100644 --- a/backend/src/config/mod.rs +++ b/backend/src/config/mod.rs @@ -1,36 +1,18 @@ -//! CONFIG APPROACH: Option A — layered config crate -//! Rationale: Using the `config` crate provides a robust, layered approach where environment-specific -//! defaults are cleanly defined in TOML files, while sensitive secrets and infrastructure-specific -//! overrides are passed securely via environment variables. This prevents environment variable sprawl, -//! ensures typed nested structures, and makes local development frictionless without compromising -//! production security. +//! Application configuration. + +pub mod reload; use config::{Config, Environment as ConfigEnvironment, File, FileFormat}; use serde::{Deserialize, Serialize}; -use std::str::FromStr; +use std::env; -pub mod database; -pub mod error; -pub mod observability; -pub mod redis; -pub mod reload; -pub mod server; - -#[cfg(test)] -mod tests; - -pub use database::DatabaseConfig; -pub use error::ConfigError; -pub use observability::ObservabilityConfig; -pub use redis::RedisConfig; -pub use server::ServerConfig; - -/// The execution environment of the application. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize)] -pub enum Environment { - Development, - Staging, - Production, +/// Environment-based application configuration. 
+#[derive(Debug, Deserialize, Serialize, Clone)] +pub struct AppConfig { + pub server: ServerConfig, + pub database: DatabaseConfig, + pub redis: RedisConfig, + pub log_level: String, } impl FromStr for Environment { @@ -143,27 +125,33 @@ impl AppConfig { "TLS configuration is strictly required in the Production environment.".to_string(), ); } + } +} - if self.database.url.is_empty() { - errors.push("Database URL cannot be empty.".to_string()); - } - - if self.redis.url.is_empty() { - errors.push("Redis URL cannot be empty.".to_string()); - } - - if self.database.max_connections == 0 { - errors.push("Database max_connections must be greater than 0.".to_string()); - } - - if self.redis.pool_size == 0 { - errors.push("Redis pool_size must be greater than 0.".to_string()); - } - - if !errors.is_empty() { - return Err(ConfigError::ValidationError(errors)); - } +/// Simple environment-based config loader (used by main.rs). +#[derive(Debug, Deserialize, Clone)] +pub struct Config { + pub database_url: String, + pub redis_url: String, + pub server_port: u16, + pub environment: String, + pub log_level: String, +} - Ok(()) +impl Config { + /// Loads configuration from environment variables. + pub fn from_env() -> Result { + dotenvy::dotenv().ok(); + + Ok(Config { + database_url: env::var("DATABASE_URL") + .unwrap_or_else(|_| "postgres://postgres:password@localhost:5432/backend".into()), + redis_url: env::var("REDIS_URL").unwrap_or_else(|_| "redis://localhost:6379".into()), + server_port: env::var("PORT") + .unwrap_or_else(|_| "3000".into()) + .parse()?, + environment: env::var("APP_ENV").unwrap_or_else(|_| "development".into()), + log_level: env::var("LOG_LEVEL").unwrap_or_else(|_| "info".into()), + }) } } diff --git a/backend/src/config/reload.rs b/backend/src/config/reload.rs index 6a9274a..e630293 100644 --- a/backend/src/config/reload.rs +++ b/backend/src/config/reload.rs @@ -1,11 +1,39 @@ -use crate::config::{AppConfig as BaseAppConfig, ConfigError, Environment}; +//! 
Configuration hot-reload. +//! +//! This module provides two complementary configuration management types: +//! +//! - [`ConfigManager`] — a simple `ArcSwap`-backed manager used by the +//! profiling handlers. Supports file-based and patch-based reloads. +//! - [`ConfigWatcher`] — a richer watcher that subscribes to a Redis pub/sub +//! channel and atomically swaps the live config on every reload signal. +//! +//! # Redis protocol (ConfigWatcher) +//! +//! ```text +//! SET config:current '{"log_level":"info","max_connections":50,...}' +//! PUBLISH config:reload "reload" +//! ``` + +#![allow(dead_code)] + +use std::sync::Arc; + use arc_swap::ArcSwap; use axum::{extract::State, http::StatusCode, response::IntoResponse, Json}; -use std::sync::Arc; +use redis::{AsyncCommands, Client as RedisClient}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; use thiserror::Error; -use tracing::{info, instrument}; +use tokio::sync::{watch, RwLock}; +use tracing::{error, info, instrument, warn}; + +use crate::config::AppConfig; + +// --------------------------------------------------------------------------- +// ConfigReloadError +// --------------------------------------------------------------------------- -/// Errors that can occur during configuration reload. +/// Errors that can occur during configuration reload (ConfigManager). #[derive(Debug, Error)] pub enum ConfigReloadError { #[error("Configuration load error: {0}")] @@ -24,121 +52,121 @@ impl IntoResponse for ConfigReloadError { } } -/// Manages hot-reloadable application configuration. +// --------------------------------------------------------------------------- +// ConfigManager (ArcSwap-based, used by profiling handlers) +// --------------------------------------------------------------------------- + +/// Manages hot-reloadable application configuration via `ArcSwap`. 
pub struct ConfigManager { current_config: ArcSwap, } impl ConfigManager { - /// Create a new ConfigManager with the given initial configuration. - pub fn new(initial_config: BaseAppConfig) -> Self { + /// Create a new `ConfigManager` with the given initial configuration. + pub fn new(initial_config: AppConfig) -> Self { Self { current_config: ArcSwap::from(Arc::new(initial_config)), } } - /// Get a reference to the current configuration. - pub fn load(&self) -> Arc { + /// Return a snapshot of the current configuration. + pub fn load(&self) -> Arc { self.current_config.load_full() } - /// Reload the configuration from environment variables and TOML files. + /// Reload configuration from `config.json` in the current directory. #[instrument(skip(self))] pub async fn reload(&self) -> Result<(), ConfigReloadError> { info!("Starting configuration reload..."); - // Reload the layered config from the environment - let env = Environment::from_env(); - let new_config = BaseAppConfig::load(env)?; + let config_path = "config.json"; - // Update the global configuration atomically - self.current_config.store(Arc::new(new_config)); + if !std::path::Path::new(config_path).exists() { + warn!("config.json not found, skipping reload"); + return Err(ConfigReloadError::Io(std::io::Error::new( + std::io::ErrorKind::NotFound, + "config.json not found", + ))); + } + + let content = tokio::fs::read_to_string(config_path).await?; + let new_config: AppConfig = serde_json::from_str(&content)?; + + if new_config.database.url.is_empty() { + return Err(ConfigReloadError::Invalid( + "Database URL cannot be empty".to_string(), + )); + } + self.current_config.store(Arc::new(new_config)); info!("Configuration successfully reloaded"); Ok(()) } + + /// Apply a JSON patch to the current configuration. 
+ #[instrument(skip(self, patch))] + pub fn update_from_patch(&self, patch: Value) -> Result<(), ConfigReloadError> { + let current = self.load(); + let mut current_json = serde_json::to_value(&*current)?; + + if let Some(patch_obj) = patch.as_object() { + if let Some(current_obj) = current_json.as_object_mut() { + for (k, v) in patch_obj { + if v.is_object() + && current_obj.contains_key(k) + && current_obj[k].is_object() + { + let sub_patch = v.as_object().unwrap(); + let sub_current = + current_obj.get_mut(k).unwrap().as_object_mut().unwrap(); + for (sk, sv) in sub_patch { + sub_current.insert(sk.clone(), sv.clone()); + } + } else { + current_obj.insert(k.clone(), v.clone()); + } + } + } + } + + let new_config: AppConfig = serde_json::from_value(current_json)?; + self.current_config.store(Arc::new(new_config)); + info!("Configuration updated via patch"); + Ok(()) + } } -// In a real application, State type would be strongly typed for the app. -// We use a generic representation here or rely on the actual AppState type. -// Since the state definition was in `main.rs` and might be redefined, we'll keep it simple. +// --------------------------------------------------------------------------- +// Axum handlers for ConfigManager +// --------------------------------------------------------------------------- -/// Axum handler to trigger a configuration reload. +/// `POST /api/config/reload` — trigger a configuration reload from disk. pub async fn handle_reload( - State(manager): State>, -) -> Result { - manager.reload().await?; - Ok(( - StatusCode::OK, - Json(serde_json::json!({ "status": "reloaded" })), - )) + State(state): State>, +) -> impl IntoResponse { + match state.config_manager.reload().await { + Ok(()) => ( + StatusCode::OK, + Json(serde_json::json!({ "status": "reloaded" })), + ) + .into_response(), + Err(e) => e.into_response(), + } } -/// Axum handler to get the current configuration (sanitized). 
-pub async fn handle_get_config(State(manager): State>) -> impl IntoResponse { - let config = manager.load(); - // Sensitive fields are already skipped or redacted by `serde(skip_serializing)` and custom `Debug`. - // In this case, `AppConfig` derives Serialize, and sensitive fields have `#[serde(skip_serializing)]`. - Json(config.as_ref().clone()) +/// `GET /api/config` — return the current configuration (sanitized). +pub async fn handle_get_config( + State(manager): State>, +) -> impl IntoResponse { + let config = state.config_manager.load(); + Json(config) } -// -// This module provides [`ConfigWatcher`], which holds the live [`AppConfig`] -// behind an `Arc>` and can reload it at any time — either -// programmatically via [`ConfigWatcher::reload`] or automatically by -// subscribing to a Redis pub/sub channel with [`ConfigWatcher::watch`]. -// -// When a reload message arrives on the Redis channel the watcher fetches the -// new configuration JSON from a Redis key, deserialises it, and atomically -// swaps the in-memory value. All readers that hold a clone of the -// [`ConfigHandle`] see the new values on their next read without any restart. 
-// -// # Example -// -// ```rust,no_run -// use backend::config::reload::{AppConfig, ConfigWatcher}; -// -// # async fn example() -> anyhow::Result<()> { -// let watcher = ConfigWatcher::new(AppConfig::default()); -// let handle = watcher.handle(); -// -// // Read the current config -// let cfg = handle.get().await; -// println!("log level: {}", cfg.log_level); -// -// // Trigger a manual reload -// watcher.reload(AppConfig { -// log_level: "info".to_string(), -// ..AppConfig::default() -// }).await; -// # Ok(()) -// # } -// ``` -// -// # Redis protocol -// -// Publish any non-empty string to `config:reload` to trigger a reload: -// -// ```text -// PUBLISH config:reload "" -// SET config:current '{"log_level":"info","max_connections":50,...}' -// PUBLISH config:reload "reload" -// ``` -// -// The watcher reads `config:current` from Redis after every message on -// `config:reload`. If the key is absent or unparseable the existing config -// is kept and an error is logged. - -use redis::{AsyncCommands, Client as RedisClient}; -use serde::{Deserialize, Serialize}; -use tokio::sync::{watch, RwLock}; -use tracing::{error, warn}; - // --------------------------------------------------------------------------- -// Error type +// ReloadError (ConfigWatcher) // --------------------------------------------------------------------------- -/// Errors that can occur during configuration reload. +/// Errors that can occur during ConfigWatcher reload. #[derive(Debug, Error)] pub enum ReloadError { /// A Redis error occurred. @@ -155,15 +183,12 @@ pub enum ReloadError { } // --------------------------------------------------------------------------- -// AppConfig +// HotAppConfig (used by ConfigWatcher) // --------------------------------------------------------------------------- /// Live application configuration that can be hot-reloaded at runtime. -/// -/// All fields have sensible defaults so the application starts without any -/// external configuration source. 
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct AppConfig { +pub struct HotAppConfig { /// Tracing / log filter directive (e.g. `"backend=debug"`). pub log_level: String, /// Maximum number of database connections in the pool. @@ -172,11 +197,11 @@ pub struct AppConfig { pub request_timeout_secs: u64, /// Whether the maintenance mode banner is shown. pub maintenance_mode: bool, - /// Redis key that stores the serialised [`AppConfig`] JSON. + /// Redis key that stores the serialised [`HotAppConfig`] JSON. pub redis_config_key: String, } -impl Default for AppConfig { +impl Default for HotAppConfig { fn default() -> Self { Self { log_level: "backend=debug,tower_http=debug".to_string(), @@ -189,30 +214,24 @@ impl Default for AppConfig { } // --------------------------------------------------------------------------- -// ConfigHandle — cheap clone, shared reader +// ConfigHandle // --------------------------------------------------------------------------- /// A cheap-to-clone handle to the live configuration. -/// -/// Obtain one via [`ConfigWatcher::handle`] and share it across the -/// application. Reads never block writers for more than a single lock -/// acquisition. #[derive(Clone)] pub struct ConfigHandle { - inner: Arc>, - /// Notified whenever the config is reloaded. + inner: Arc>, changed: watch::Receiver<()>, } impl ConfigHandle { /// Return a snapshot of the current configuration. - pub async fn get(&self) -> AppConfig { + pub async fn get(&self) -> HotAppConfig { self.inner.read().await.clone() } /// Wait until the configuration changes, then return the new snapshot. - pub async fn wait_for_change(&mut self) -> AppConfig { - // `changed()` resolves immediately if there is an unseen change. 
+ pub async fn wait_for_change(&mut self) -> HotAppConfig { let _ = self.changed.changed().await; self.get().await } @@ -222,16 +241,16 @@ impl ConfigHandle { // ConfigWatcher // --------------------------------------------------------------------------- -/// Owns the live [`AppConfig`] and drives hot-reload. +/// Owns the live [`HotAppConfig`] and drives hot-reload via Redis pub/sub. pub struct ConfigWatcher { - inner: Arc>, + inner: Arc>, notify_tx: watch::Sender<()>, notify_rx: watch::Receiver<()>, } impl ConfigWatcher { /// Create a new watcher with the given initial configuration. - pub fn new(initial: AppConfig) -> Self { + pub fn new(initial: HotAppConfig) -> Self { let (tx, rx) = watch::channel(()); Self { inner: Arc::new(RwLock::new(initial)), @@ -249,7 +268,7 @@ impl ConfigWatcher { } /// Atomically replace the current configuration and notify all handles. - pub async fn reload(&self, new_config: AppConfig) { + pub async fn reload(&self, new_config: HotAppConfig) { let old = { let mut guard = self.inner.write().await; let old = guard.clone(); @@ -263,7 +282,6 @@ impl ConfigWatcher { maintenance_mode = new_config.maintenance_mode, "Configuration reloaded" ); - // Ignore send error — it only fails when all receivers are dropped. let _ = self.notify_tx.send(()); } else { info!("Configuration reload requested but values unchanged"); @@ -271,34 +289,21 @@ impl ConfigWatcher { } /// Fetch the current configuration from Redis and apply it. - /// - /// Reads the JSON value stored at `AppConfig::redis_config_key` (default - /// `config:current`), deserialises it, and calls [`Self::reload`]. - /// - /// # Errors - /// Returns [`ReloadError`] if the Redis key is absent, the connection - /// fails, or the JSON cannot be deserialised. 
pub async fn reload_from_redis(&self, redis: &RedisClient) -> Result<(), ReloadError> { let key = self.inner.read().await.redis_config_key.clone(); let mut conn = redis.get_multiplexed_async_connection().await?; let raw: Option = conn.get(&key).await?; let json = raw.ok_or(ReloadError::NotFound)?; - let new_config: AppConfig = serde_json::from_str(&json)?; + let new_config: HotAppConfig = serde_json::from_str(&json)?; self.reload(new_config).await; Ok(()) } - /// Spawn a background task that subscribes to `config:reload` on Redis - /// and calls [`Self::reload_from_redis`] on every message. - /// - /// The task runs until the Redis connection is lost or the process exits. - /// Connection errors are logged and the task exits — callers may restart - /// it if desired. + /// Spawn a background task that subscribes to `config:reload` on Redis. pub fn watch(self: Arc, redis: RedisClient) -> tokio::task::JoinHandle<()> { tokio::spawn(async move { const CHANNEL: &str = "config:reload"; - // get_async_connection is the only way to obtain a PubSub-capable connection. 
#[allow(deprecated)] let conn = match redis.get_async_connection().await { Ok(c) => c, @@ -314,10 +319,7 @@ impl ConfigWatcher { return; } - info!( - channel = CHANNEL, - "Config watcher: listening for reload signals" - ); + info!(channel = CHANNEL, "Config watcher: listening for reload signals"); let mut stream = pubsub.into_on_message(); use futures_util::StreamExt; @@ -328,7 +330,10 @@ impl ConfigWatcher { let payload: String = msg.get_payload().unwrap_or_default(); info!(payload = %payload, "Config reload signal received"); if let Err(e) = self.reload_from_redis(&redis).await { - warn!(error = %e, "Config reload from Redis failed; keeping current config"); + warn!( + error = %e, + "Config reload from Redis failed; keeping current config" + ); } } None => { @@ -350,14 +355,12 @@ mod tests { use super::*; fn default_watcher() -> ConfigWatcher { - ConfigWatcher::new(AppConfig::default()) + ConfigWatcher::new(HotAppConfig::default()) } - // --- AppConfig --- - #[test] fn test_default_config_values() { - let cfg = AppConfig::default(); + let cfg = HotAppConfig::default(); assert_eq!(cfg.max_connections, 10); assert_eq!(cfg.request_timeout_secs, 30); assert!(!cfg.maintenance_mode); @@ -367,36 +370,23 @@ mod tests { #[test] fn test_config_serialisation_roundtrip() { - let cfg = AppConfig::default(); + let cfg = HotAppConfig::default(); let json = serde_json::to_string(&cfg).unwrap(); - let back: AppConfig = serde_json::from_str(&json).unwrap(); + let back: HotAppConfig = serde_json::from_str(&json).unwrap(); assert_eq!(cfg, back); } - #[test] - fn test_config_partial_deserialisation() { - // Only some fields present — rest should use serde defaults. 
- let json = r#"{"log_level":"info","max_connections":25,"request_timeout_secs":60,"maintenance_mode":true,"redis_config_key":"config:current"}"#; - let cfg: AppConfig = serde_json::from_str(json).unwrap(); - assert_eq!(cfg.log_level, "info"); - assert_eq!(cfg.max_connections, 25); - assert!(cfg.maintenance_mode); - } - - // --- ConfigWatcher::reload --- - #[tokio::test] async fn test_reload_updates_config() { let watcher = default_watcher(); let handle = watcher.handle(); - let new_cfg = AppConfig { + let new_cfg = HotAppConfig { log_level: "info".to_string(), max_connections: 50, - ..AppConfig::default() + ..HotAppConfig::default() }; watcher.reload(new_cfg.clone()).await; - assert_eq!(handle.get().await, new_cfg); } @@ -404,14 +394,8 @@ mod tests { async fn test_reload_unchanged_does_not_notify() { let watcher = default_watcher(); let mut handle = watcher.handle(); - - // Mark the initial value as seen. handle.changed.borrow_and_update(); - - // Reload with identical config. - watcher.reload(AppConfig::default()).await; - - // `has_changed` should be false — no notification was sent. 
+ watcher.reload(HotAppConfig::default()).await; assert!(!handle.changed.has_changed().unwrap()); } @@ -419,91 +403,42 @@ mod tests { async fn test_reload_changed_notifies_handle() { let watcher = default_watcher(); let mut handle = watcher.handle(); - handle.changed.borrow_and_update(); - watcher - .reload(AppConfig { + .reload(HotAppConfig { maintenance_mode: true, - ..AppConfig::default() + ..HotAppConfig::default() }) .await; - assert!(handle.changed.has_changed().unwrap()); } - // --- ConfigHandle --- - - #[tokio::test] - async fn test_handle_get_returns_current() { - let watcher = default_watcher(); - let handle = watcher.handle(); - assert_eq!(handle.get().await, AppConfig::default()); - } - #[tokio::test] async fn test_multiple_handles_see_same_update() { let watcher = default_watcher(); let h1 = watcher.handle(); let h2 = watcher.handle(); - - let new_cfg = AppConfig { + let new_cfg = HotAppConfig { max_connections: 99, - ..AppConfig::default() + ..HotAppConfig::default() }; - watcher.reload(new_cfg.clone()).await; - + watcher.reload(new_cfg).await; assert_eq!(h1.get().await.max_connections, 99); assert_eq!(h2.get().await.max_connections, 99); } - #[tokio::test] - async fn test_wait_for_change_resolves_after_reload() { - let watcher = Arc::new(default_watcher()); - let mut handle = watcher.handle(); - - // Mark current as seen so wait_for_change actually waits. - handle.changed.borrow_and_update(); - - let watcher2 = Arc::clone(&watcher); - tokio::spawn(async move { - tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; - watcher2 - .reload(AppConfig { - maintenance_mode: true, - ..AppConfig::default() - }) - .await; - }); - - let updated = handle.wait_for_change().await; - assert!(updated.maintenance_mode); - } - - // --- reload_from_redis (no live Redis — error path) --- - #[tokio::test] async fn test_reload_from_redis_connection_error() { let watcher = default_watcher(); - // Port 1 is never open — connection will fail immediately. 
let redis = RedisClient::open("redis://127.0.0.1:1/").unwrap(); let result = watcher.reload_from_redis(&redis).await; assert!(matches!(result, Err(ReloadError::Redis(_)))); - // Config must be unchanged. - assert_eq!(watcher.handle().get().await, AppConfig::default()); + assert_eq!(watcher.handle().get().await, HotAppConfig::default()); } - // --- ReloadError display --- - #[test] fn test_reload_error_not_found_display() { let e = ReloadError::NotFound; assert!(e.to_string().contains("not found")); } - - #[test] - fn test_reload_error_deserialise_display() { - let e = ReloadError::Deserialise(serde_json::from_str::("bad").unwrap_err()); - assert!(!e.to_string().is_empty()); - } } diff --git a/backend/src/error.rs b/backend/src/error.rs index 72bdbfe..b9ce3e7 100644 --- a/backend/src/error.rs +++ b/backend/src/error.rs @@ -9,7 +9,6 @@ use axum::{ Json, }; use serde::Serialize; -use serde_json::json; use thiserror::Error; use tracing::error; @@ -30,7 +29,7 @@ pub struct ErrorResponse { /// # Examples /// /// ```rust,no_run -/// use crucible_backend::error::AppError; +/// use backend::error::AppError; /// /// async fn handler() -> Result { /// Err(AppError::NotFound("Contract not found".into())) @@ -68,47 +67,21 @@ pub enum AppError { /// 500 — An internal database error occurred. #[error("Database error: {0}")] - Database(#[from] sqlx::Error), + DatabaseError(#[from] sqlx::Error), /// 500 — An internal Redis error occurred. #[error("Redis error: {0}")] - Redis(#[from] redis::RedisError), - - /// 500 — A serialization error occurred. - #[error("Serialization error: {0}")] - Serialization(#[from] serde_json::Error), + RedisError(#[from] redis::RedisError), /// 500 — A catch-all for unexpected internal errors. #[error("Internal error: {0}")] InternalError(String), - /// 500 — Internal server error (no message). - #[error("Internal server error")] - Internal, - - /// 502 — Stellar network communication failure. + /// 502 — A Stellar network operation failed. 
#[error("Stellar operation failed: {0}")] StellarError(String), } -// Convenience constructors used by services. -impl AppError { - /// Wrap a database error. - pub fn db(e: sqlx::Error) -> Self { - AppError::Database(e) - } - - /// Wrap a Redis error. - pub fn redis(e: redis::RedisError) -> Self { - AppError::Redis(e) - } - - /// Wrap a serialization error. - pub fn serialization(e: serde_json::Error) -> Self { - AppError::Serialization(e) - } -} - impl IntoResponse for AppError { fn into_response(self) -> Response { let (status, code, message) = match &self { @@ -117,12 +90,10 @@ impl IntoResponse for AppError { AppError::Unauthorized(msg) => (StatusCode::UNAUTHORIZED, "unauthorized", msg.clone()), AppError::Forbidden(msg) => (StatusCode::FORBIDDEN, "forbidden", msg.clone()), AppError::Conflict(msg) => (StatusCode::CONFLICT, "conflict", msg.clone()), - AppError::ValidationError(msg) => ( - StatusCode::UNPROCESSABLE_ENTITY, - "validation_error", - msg.clone(), - ), - AppError::Database(e) => { + AppError::ValidationError(msg) => { + (StatusCode::UNPROCESSABLE_ENTITY, "validation_error", msg.clone()) + } + AppError::DatabaseError(e) => { error!("Database error: {e:?}"); ( StatusCode::INTERNAL_SERVER_ERROR, @@ -130,7 +101,7 @@ impl IntoResponse for AppError { "An internal database error occurred".to_string(), ) } - AppError::Redis(e) => { + AppError::RedisError(e) => { error!("Redis error: {e:?}"); ( StatusCode::INTERNAL_SERVER_ERROR, @@ -154,14 +125,6 @@ impl IntoResponse for AppError { "An internal error occurred".to_string(), ) } - AppError::Internal => { - error!("Internal server error"); - ( - StatusCode::INTERNAL_SERVER_ERROR, - "internal_error", - "An internal server error occurred".to_string(), - ) - } AppError::StellarError(msg) => { error!("Stellar error: {msg}"); ( @@ -170,7 +133,6 @@ impl IntoResponse for AppError { "Failed to communicate with Stellar network".to_string(), ) } - AppError::LengthRequired(msg) => (StatusCode::LENGTH_REQUIRED, 
"length_required", msg.clone()), }; ( diff --git a/backend/src/jobs.rs b/backend/src/jobs.rs index bfd1380..b2c97a0 100644 --- a/backend/src/jobs.rs +++ b/backend/src/jobs.rs @@ -1,14 +1,17 @@ -use crate::services::tracing::TracingService; +//! Background job definitions for the Apalis job queue. + use serde::{Deserialize, Serialize}; use tracing::{info, instrument}; +use crate::services::tracing::TracingService; + +/// Job payload for monitoring a Stellar transaction. #[derive(Debug, Serialize, Deserialize)] pub struct TransactionMonitorJob { pub tx_hash: String, } -/// Handler for monitoring Stellar transactions. -/// Returning () since Apalis 0.6 handlers can return (). +/// Handler for monitoring Stellar transactions via Apalis. #[instrument(skip_all, fields(job.name = "monitor_transaction", job.id = %job.tx_hash))] pub async fn monitor_transaction(job: TransactionMonitorJob) { let span = TracingService::job_span("monitor_transaction", &job.tx_hash); @@ -16,6 +19,5 @@ pub async fn monitor_transaction(job: TransactionMonitorJob) { info!("Monitoring Stellar transaction: {}", job.tx_hash); tokio::time::sleep(tokio::time::Duration::from_secs(2)).await; - info!("Transaction monitoring completed: {}", job.tx_hash); } diff --git a/backend/src/lib.rs b/backend/src/lib.rs index 1c91475..2d41232 100644 --- a/backend/src/lib.rs +++ b/backend/src/lib.rs @@ -1,3 +1,5 @@ +//! Crucible backend library crate. + pub mod api; pub mod config; pub mod db; diff --git a/backend/src/services/business_metrics.rs b/backend/src/services/business_metrics.rs index 6b1184f..43da365 100644 --- a/backend/src/services/business_metrics.rs +++ b/backend/src/services/business_metrics.rs @@ -1,3 +1,10 @@ +//! Business metrics service for tracking revenue, costs, and operational KPIs. 
+ +#![allow(dead_code)] + +use std::collections::HashMap; +use std::sync::Arc; + use chrono::{DateTime, Duration, Utc}; use rust_decimal::Decimal; use serde::{Deserialize, Serialize}; @@ -5,24 +12,14 @@ use sqlx::PgPool; use std::collections::HashMap; use std::sync::Arc; use tokio::sync::RwLock; -use tracing::{error, info, instrument, warn}; +use tracing::{error, info, instrument}; use uuid::Uuid; use crate::error::AppError; -// ─── Domain Types ──────────────────────────────────────────────────────────── - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct BusinessMetric { - pub id: Uuid, - pub name: String, - pub value: Decimal, - pub unit: String, - pub category: MetricCategory, - pub tags: HashMap, - pub recorded_at: DateTime, - pub source: MetricSource, -} +// --------------------------------------------------------------------------- +// Domain types +// --------------------------------------------------------------------------- #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(rename_all = "snake_case")] @@ -35,20 +32,34 @@ pub enum MetricCategory { Custom(String), } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] #[serde(rename_all = "snake_case")] pub enum MetricSource { OnChain, OffChain, + #[default] Database, ExternalApi, Manual, } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricSnapshot { - pub timestamp: DateTime, - pub metrics: Vec, +pub struct BusinessMetric { + pub id: Uuid, + pub name: String, + pub value: Decimal, + pub unit: String, + pub category: MetricCategory, + pub tags: HashMap, + pub recorded_at: DateTime, + pub source: MetricSource, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetricsSummary { + pub total_metrics: i64, + pub categories: HashMap, + pub latest_timestamp: Option>, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -61,14 +72,9 @@ pub struct MetricsQuery { pub offset: Option, } 
-#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricsSummary { - pub total_metrics: i64, - pub categories: HashMap, - pub latest_timestamp: Option>, -} - -// ─── Service ───────────────────────────────────────────────────────────────── +// --------------------------------------------------------------------------- +// Service +// --------------------------------------------------------------------------- pub struct BusinessMetricsService { db: PgPool, @@ -83,48 +89,52 @@ impl BusinessMetricsService { } } - /// Record a new business metric with the given parameters. - #[instrument(skip(self), fields(metric_name = %name))] + /// Record a new business metric. + #[instrument(skip(self, tags, value, unit, category, source))] pub async fn record_metric( &self, - name: impl Into, + name: String, value: Decimal, - unit: impl Into, + unit: String, category: MetricCategory, tags: HashMap, source: MetricSource, ) -> Result { let id = Uuid::new_v4(); let now = Utc::now(); - let name = name.into(); - let unit = unit.into(); - - sqlx::query_as!( - BusinessMetric, + let category_str = serde_json::to_string(&category) + .map_err(|e| AppError::InternalError(e.to_string()))?; + let source_str = serde_json::to_string(&source) + .map_err(|e| AppError::InternalError(e.to_string()))?; + let tags_json = serde_json::to_value(&tags) + .map_err(|e| AppError::InternalError(e.to_string()))?; + // Store Decimal as string to avoid sqlx type issues + let value_str = value.to_string(); + + sqlx::query( r#" INSERT INTO business_metrics (id, name, value, unit, category, tags, recorded_at, source) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - RETURNING id, name, value, unit, category as "category: _", tags as "tags: _", recorded_at, source as "source: _" "#, - id, - name, - value, - unit, - category as MetricCategory, - serde_json::to_value(&tags)?, - now, - source as MetricSource, ) - .fetch_one(&self.db) + .bind(id) + .bind(&name) + .bind(&value_str) + .bind(&unit) + 
.bind(&category_str) + .bind(&tags_json) + .bind(now) + .bind(&source_str) + .execute(&self.db) .await .map_err(|e| { error!(error = %e, "Failed to record metric"); - AppError::Database(e) + AppError::DatabaseError(e) })?; let metric = BusinessMetric { id, - name, + name: name.clone(), value, unit, category, @@ -138,7 +148,6 @@ impl BusinessMetricsService { let mut cache = self.cache.write().await; let entry = cache.entry(metric.name.clone()).or_default(); entry.push(metric.clone()); - // Keep last 1000 values per metric if entry.len() > 1000 { entry.remove(0); } @@ -147,429 +156,81 @@ impl BusinessMetricsService { info!( metric_name = %metric.name, value = %metric.value, - category = ?metric.category, "Recorded business metric" ); Ok(metric) } - /// Record multiple metrics in a single transaction. - #[instrument(skip(self, metrics))] - pub async fn record_metrics_batch( - &self, - metrics: Vec<( - String, - Decimal, - String, - MetricCategory, - HashMap, - MetricSource, - )>, - ) -> Result, AppError> { - let mut tx = self.db.begin().await?; - let mut results = Vec::with_capacity(metrics.len()); - let now = Utc::now(); - - for (name, value, unit, category, tags, source) in metrics { - let id = Uuid::new_v4(); - - sqlx::query!( - r#" - INSERT INTO business_metrics (id, name, value, unit, category, tags, recorded_at, source) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - "#, - id, - name, - value, - unit, - serde_json::to_value(&tags)?, - now, - source as MetricSource, - ) - .execute(&mut *tx) - .await - .map_err(|e| { - error!(error = %e, "Failed in batch metric insert"); - AppError::Database(e) - })?; - - results.push(BusinessMetric { - id, - name, - value, - unit, - category, - tags, - recorded_at: now, - source, - }); - } - - tx.commit().await.map_err(|e| { - error!(error = %e, "Failed to commit batch metrics"); - AppError::Database(e) - })?; - - info!(count = results.len(), "Recorded batch metrics"); - Ok(results) - } - - /// Query metrics with optional filters. 
- #[instrument(skip(self))] - pub async fn query_metrics( - &self, - query: MetricsQuery, - ) -> Result<(Vec, i64), AppError> { - let limit = query.limit.unwrap_or(100); - let offset = query.offset.unwrap_or(0); - - let total = - sqlx::query_scalar!(r#"SELECT COUNT(*) as "count!" FROM business_metrics WHERE 1=1"#) - .fetch_one(&self.db) - .await - .map_err(|e| AppError::Database(e))? - .unwrap_or(0); - - let metrics = sqlx::query_as!( - BusinessMetric, - r#" - SELECT id, name, value, unit, category as "category: _", tags as "tags: _", recorded_at, source as "source: _" - FROM business_metrics - ORDER BY recorded_at DESC - LIMIT $1 OFFSET $2 - "#, - limit, - offset, - ) - .fetch_all(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - Ok((metrics, total)) - } - - /// Get aggregated metrics summary. - #[instrument(skip(self))] - pub async fn get_metrics_summary(&self) -> Result { - let total: i64 = - sqlx::query_scalar!(r#"SELECT COUNT(*) as "count!" FROM business_metrics"#) - .fetch_one(&self.db) - .await - .map_err(|e| AppError::Database(e))? - .unwrap_or(0); - - let latest: Option> = - sqlx::query_scalar!(r#"SELECT MAX(recorded_at) as "max!" FROM business_metrics"#) - .fetch_one(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - let rows = sqlx::query!( - r#"SELECT category as "category!: MetricCategory", COUNT(*) as "count!: i64" FROM business_metrics GROUP BY category"# - ) - .fetch_all(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - let mut categories = HashMap::new(); - for row in rows { - let key = match row.category { - MetricCategory::Custom(s) => s, - other => format!("{:?}", other).to_lowercase(), - }; - categories.insert(key, row.count); - } - - Ok(MetricsSummary { - total_metrics: total, - categories, - latest_timestamp: latest, - }) - } - - /// Compute aggregated values for a metric over a time range. 
- #[instrument(skip(self))] - pub async fn aggregate_metric( - &self, - name: &str, - from: DateTime, - to: DateTime, - ) -> Result, AppError> { - let result = sqlx::query_scalar!( - r#"SELECT SUM(value) as "sum!: Decimal" FROM business_metrics WHERE name = $1 AND recorded_at >= $2 AND recorded_at <= $3"#, - name, - from, - to, - ) - .fetch_one(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - Ok(result) - } - - /// Get the latest value for a specific metric. - #[instrument(skip(self))] - pub async fn get_latest_metric(&self, name: &str) -> Result, AppError> { - // Check cache first - { - let cache = self.cache.read().await; - if let Some(values) = cache.get(name) { - if let Some(latest) = values.last() { - return Ok(Some(latest.clone())); - } - } - } - - // Fall back to database - let metric = sqlx::query_as!( - BusinessMetric, - r#" - SELECT id, name, value, unit, category as "category: _", tags as "tags: _", recorded_at, source as "source: _" - FROM business_metrics - WHERE name = $1 - ORDER BY recorded_at DESC - LIMIT 1 - "#, - name, - ) - .fetch_optional(&self.db) - .await - .map_err(|e| AppError::Database(e))?; - - Ok(metric) - } - /// Remove metrics older than the retention period. #[instrument(skip(self))] pub async fn prune_old_metrics(&self, retention_days: i64) -> Result { let cutoff = Utc::now() - Duration::days(retention_days); - let deleted = sqlx::query!( - r#"DELETE FROM business_metrics WHERE recorded_at < $1"#, - cutoff, - ) - .execute(&self.db) - .await - .map_err(|e| AppError::Database(e))? 
- .rows_affected(); + let result = sqlx::query("DELETE FROM business_metrics WHERE recorded_at < $1") + .bind(cutoff) + .execute(&self.db) + .await + .map_err(|e| AppError::DatabaseError(e))?; + let deleted = result.rows_affected(); info!(deleted, retention_days, "Pruned old metrics"); Ok(deleted) } -} - -// ─── API Handlers ──────────────────────────────────────────────────────────── - -use axum::{extract::State, http::StatusCode, Json}; - -pub struct MetricsState { - pub service: Arc, -} - -#[derive(Debug, Deserialize)] -pub struct RecordMetricRequest { - pub name: String, - pub value: Decimal, - pub unit: String, - pub category: MetricCategory, - #[serde(default)] - pub tags: HashMap, - #[serde(default)] - pub source: MetricSource, -} - -/// POST /api/metrics — Record a new business metric. -#[utoipa::path( - post, - path = "/api/metrics", - request_body = RecordMetricRequest, - responses( - (status = 201, description = "Metric recorded", body = BusinessMetric), - (status = 400, description = "Invalid request"), - (status = 500, description = "Internal server error") - ) -)] -pub async fn record_metric( - State(state): State>, - Json(req): Json, -) -> Result<(StatusCode, Json), AppError> { - let metric = state - .service - .record_metric( - req.name, - req.value, - req.unit, - req.category, - req.tags, - req.source, - ) - .await?; - - Ok((StatusCode::CREATED, Json(metric))) -} - -/// GET /api/metrics — Query business metrics with filters. 
-#[utoipa::path( - get, - path = "/api/metrics", - params( - ("category" = Option, Query, description = "Filter by category"), - ("from" = Option>, Query, description = "Start of time range"), - ("to" = Option>, Query, description = "End of time range"), - ("limit" = Option, Query, description = "Max results"), - ("offset" = Option, Query, description = "Pagination offset") - ), - responses( - (status = 200, description = "List of metrics with total count"), - (status = 500, description = "Internal server error") - ) -)] -pub async fn query_metrics( - State(state): State>, - axum::extract::Query(params): axum::extract::Query>, -) -> Result, AppError> { - let category = params - .get("category") - .and_then(|c| serde_json::from_str(&format!("\"{}\"", c)).ok()); - let from = params - .get("from") - .and_then(|v| v.parse::>().ok()); - let to = params - .get("to") - .and_then(|v| v.parse::>().ok()); - let limit = params.get("limit").and_then(|v| v.parse::().ok()); - let offset = params.get("offset").and_then(|v| v.parse::().ok()); - - let query = MetricsQuery { - category, - from, - to, - tags: None, - limit, - offset, - }; - - let (metrics, total) = state.service.query_metrics(query).await?; - - Ok(Json(serde_json::json!({ - "metrics": metrics, - "total": total, - }))) + /// Get the latest cached value for a metric (no DB call). + pub async fn get_cached_latest(&self, name: &str) -> Option { + let cache = self.cache.read().await; + cache.get(name)?.last().cloned() + } } -/// GET /api/metrics/summary — Get aggregated metrics overview. 
-#[utoipa::path( - get, - path = "/api/metrics/summary", - responses( - (status = 200, description = "Metrics summary", body = MetricsSummary), - (status = 500, description = "Internal server error") - ) -)] -pub async fn get_metrics_summary( - State(state): State>, -) -> Result, AppError> { - let summary = state.service.get_metrics_summary().await?; - Ok(Json(summary)) -} +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; - use sqlx::PgPool; - async fn setup_test_db() -> PgPool { - let pool = PgPool::connect("postgres://localhost:5432/crucible_test") - .await - .expect("Failed to connect to test database"); - - sqlx::query!( - r#" - CREATE TABLE IF NOT EXISTS business_metrics ( - id UUID PRIMARY KEY, - name TEXT NOT NULL, - value NUMERIC NOT NULL, - unit TEXT NOT NULL, - category TEXT NOT NULL, - tags JSONB DEFAULT '{}', - recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - source TEXT NOT NULL DEFAULT 'manual' - ) - "# - ) - .execute(&pool) - .await - .expect("Failed to create test table"); - - pool + #[test] + fn test_metric_category_serialization() { + let cat = MetricCategory::Revenue; + let json = serde_json::to_string(&cat).unwrap(); + assert!(json.contains("revenue")); } - #[tokio::test] - async fn test_record_and_retrieve_metric() { - let pool = setup_test_db().await; - let service = BusinessMetricsService::new(pool); - - let metric = service - .record_metric( - "test_revenue", - Decimal::new(1000, 0), - "USD", - MetricCategory::Revenue, - HashMap::from([("region".into(), "us-east".into())]), - MetricSource::Database, - ) - .await - .expect("Failed to record metric"); - - assert_eq!(metric.name, "test_revenue"); - assert_eq!(metric.value, Decimal::new(1000, 0)); - - let latest = service - .get_latest_metric("test_revenue") - .await - .expect("Failed to get metric") - .expect("Metric not found"); - - 
assert_eq!(latest.value, Decimal::new(1000, 0)); + #[test] + fn test_metric_source_default() { + let src = MetricSource::default(); + assert_eq!(src, MetricSource::Database); } - #[tokio::test] - async fn test_metrics_summary() { - let pool = setup_test_db().await; - let service = BusinessMetricsService::new(pool); - - service - .record_metric( - "revenue", - Decimal::new(500, 0), - "USD", - MetricCategory::Revenue, - HashMap::new(), - MetricSource::Database, - ) - .await - .expect("Failed to record"); - - service - .record_metric( - "cost", - Decimal::new(200, 0), - "USD", - MetricCategory::Costs, - HashMap::new(), - MetricSource::Database, - ) - .await - .expect("Failed to record"); - - let summary = service - .get_metrics_summary() - .await - .expect("Failed to get summary"); + #[test] + fn test_business_metric_serialization() { + let metric = BusinessMetric { + id: Uuid::new_v4(), + name: "revenue".to_string(), + value: Decimal::new(1000, 2), + unit: "USD".to_string(), + category: MetricCategory::Revenue, + tags: HashMap::from([("region".into(), "us-east".into())]), + recorded_at: Utc::now(), + source: MetricSource::Database, + }; + let json = serde_json::to_string(&metric).unwrap(); + assert!(json.contains("revenue")); + assert!(json.contains("USD")); + } - assert!(summary.total_metrics >= 2); + #[test] + fn test_metrics_summary_serialization() { + let summary = MetricsSummary { + total_metrics: 42, + categories: HashMap::from([("revenue".into(), 10i64)]), + latest_timestamp: Some(Utc::now()), + }; + let json = serde_json::to_string(&summary).unwrap(); + assert!(json.contains("42")); } } diff --git a/backend/src/services/error_recovery.rs b/backend/src/services/error_recovery.rs index 2e6ee25..c12cc38 100644 --- a/backend/src/services/error_recovery.rs +++ b/backend/src/services/error_recovery.rs @@ -1,11 +1,17 @@ +//! Error recovery service. +//! +//! Tracks retry state for failing tasks with configurable max retries. 
+ #![allow(dead_code)] -use crate::services::tracing::TracingService; + use serde::{Deserialize, Serialize}; use std::sync::Arc; use thiserror::Error; use tokio::sync::RwLock; use tracing::{error, info, instrument, warn}; +use crate::services::tracing::TracingService; + #[derive(Error, Debug, Serialize, Deserialize)] pub enum RecoveryError { #[error("Database error: {0}")] @@ -84,7 +90,6 @@ impl ErrorManager { pub async fn get_active_tasks(&self) -> Vec { let span = TracingService::service_method_span("ErrorManager", "get_active_tasks"); let _enter = span.enter(); - self.tasks.read().await.clone() } } @@ -98,32 +103,25 @@ mod tests { let manager = ErrorManager::new(); let task_name = "test_task"; - // First failure manager - .handle_error( - RecoveryError::Database("connection lost".to_string()), - task_name, - ) + .handle_error(RecoveryError::Database("connection lost".to_string()), task_name) .await .unwrap(); assert_eq!(manager.get_active_tasks().await.len(), 1); assert_eq!(manager.get_active_tasks().await[0].retries, 1); - // Second failure manager .handle_error(RecoveryError::Redis("timeout".to_string()), task_name) .await .unwrap(); assert_eq!(manager.get_active_tasks().await[0].retries, 2); - // Third failure manager .handle_error(RecoveryError::Internal("unknown".to_string()), task_name) .await .unwrap(); assert_eq!(manager.get_active_tasks().await[0].retries, 3); - // Fourth failure - should fail let result = manager .handle_error(RecoveryError::Internal("last straw".to_string()), task_name) .await; diff --git a/backend/src/services/feature_flags.rs b/backend/src/services/feature_flags.rs index a9346b3..805184c 100644 --- a/backend/src/services/feature_flags.rs +++ b/backend/src/services/feature_flags.rs @@ -1,26 +1,4 @@ //! Feature flag service with Redis caching and PostgreSQL persistence. -//! -//! This module provides a production-ready feature flag system that: -//! - Stores flag state in PostgreSQL for durability -//! 
- Caches flag values in Redis for low-latency reads -//! - Supports cache invalidation on updates -//! - Provides async API for flag evaluation -//! -//! # Example -//! ```rust,no_run -//! use backend::services::feature_flags::FeatureFlagService; -//! use sqlx::PgPool; -//! use redis::Client; -//! -//! # async fn example(pool: PgPool, redis: Client) -> anyhow::Result<()> { -//! let service = FeatureFlagService::new(pool, redis); -//! let enabled = service.is_enabled("new_dashboard").await?; -//! if enabled { -//! // render new UI -//! } -//! # Ok(()) -//! # } -//! ``` #![allow(dead_code)] @@ -32,26 +10,20 @@ use sqlx::PgPool; use thiserror::Error; use tracing::{debug, info, instrument, warn}; +use crate::services::tracing::TracingService; + // --------------------------------------------------------------------------- // Error type // --------------------------------------------------------------------------- -/// Errors that can occur in the feature flag service. #[derive(Debug, Error)] pub enum FlagError { - /// A database error occurred. #[error("Database error: {0}")] Database(#[from] sqlx::Error), - - /// A Redis error occurred. #[error("Redis error: {0}")] Redis(#[from] redis::RedisError), - - /// The requested flag was not found. #[error("Feature flag not found: {0}")] NotFound(String), - - /// An internal error occurred. #[error("Internal error: {0}")] Internal(String), } @@ -60,16 +32,11 @@ pub enum FlagError { // Domain types // --------------------------------------------------------------------------- -/// A feature flag record. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FeatureFlag { - /// Unique key identifying the flag. pub key: String, - /// Whether the flag is enabled. pub enabled: bool, - /// Human-readable description. pub description: String, - /// Last update timestamp. 
pub updated_at: DateTime, } @@ -77,98 +44,49 @@ pub struct FeatureFlag { // FeatureFlagService // --------------------------------------------------------------------------- -/// Service for managing feature flags with Redis caching and PostgreSQL persistence. pub struct FeatureFlagService { db: PgPool, redis: RedisClient, } impl FeatureFlagService { - /// Create a new feature flag service. - /// - /// # Arguments - /// - `db`: PostgreSQL connection pool - /// - `redis`: Redis client pub fn new(db: PgPool, redis: RedisClient) -> Self { Self { db, redis } } - /// Check if a feature flag is enabled. - /// - /// This method first checks Redis cache. On cache miss, it queries - /// PostgreSQL and populates the cache with a 5-minute TTL. - /// - /// # Errors - /// Returns [`FlagError::NotFound`] if the flag doesn't exist. #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "is_enabled"))] pub async fn is_enabled(&self, key: &str) -> Result { let cache_key = format!("flag:{key}"); - // Try cache first with Redis tracing let redis_span = TracingService::redis_command_span("GET", Some(&cache_key)); let _redis_enter = redis_span.enter(); - - let mut conn = self - .redis - .get_multiplexed_async_connection() - .await - .map_err(|e| { - TracingService::record_error(&redis_span, &e.to_string(), "redis_connection"); - e - })?; - - let cached: Option = conn.get(&cache_key).await.map_err(|e| { - TracingService::record_error(&redis_span, &e.to_string(), "redis_get"); - e - })?; - + let mut conn = self.redis.get_multiplexed_async_connection().await?; + let cached: Option = conn.get(&cache_key).await?; drop(_redis_enter); if let Some(val) = cached { - debug!(key = %key, cached = %val, "Feature flag cache hit"); + debug!(key = %key, "Feature flag cache hit"); return Ok(val == "1"); } - // Cache miss – query database with DB tracing debug!(key = %key, "Feature flag cache miss – querying database"); - let row: Option<(bool,)> = - sqlx::query_as("SELECT 
enabled FROM feature_flags WHERE key = $1") - .bind(key) - .fetch_optional(&self.db) - .await?; - let db_span = TracingService::db_query_span( "SELECT enabled FROM feature_flags WHERE key = $1", "postgres", "SELECT", ); let _db_enter = db_span.enter(); - let row: Option<(bool,)> = sqlx::query_as("SELECT enabled FROM feature_flags WHERE key = $1") .bind(key) .fetch_optional(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - + .await?; drop(_db_enter); match row { Some((enabled,)) => { - // Populate cache with 5-minute TTL - let cache_set_span = TracingService::redis_command_span("SETEX", Some(&cache_key)); - let _cache_set_enter = cache_set_span.enter(); - let val = if enabled { "1" } else { "0" }; - let _: () = conn.set_ex(&cache_key, val, 300).await.map_err(|e| { - TracingService::record_error(&cache_set_span, &e.to_string(), "redis_setex"); - e - })?; - - drop(_cache_set_enter); + let _: () = conn.set_ex(&cache_key, val, 300).await?; debug!(key = %key, enabled = enabled, "Cached feature flag"); Ok(enabled) } @@ -176,31 +94,14 @@ impl FeatureFlagService { } } - /// Get the full feature flag record. - /// - /// # Errors - /// Returns [`FlagError::NotFound`] if the flag doesn't exist. 
#[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "get"))] pub async fn get(&self, key: &str) -> Result { - let db_span = TracingService::db_query_span( - "SELECT key, enabled, description, updated_at FROM feature_flags WHERE key = $1", - "postgres", - "SELECT", - ); - let _db_enter = db_span.enter(); - let row: Option<(String, bool, String, DateTime)> = sqlx::query_as( "SELECT key, enabled, description, updated_at FROM feature_flags WHERE key = $1", ) .bind(key) .fetch_optional(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - - drop(_db_enter); + .await?; match row { Some((key, enabled, description, updated_at)) => Ok(FeatureFlag { @@ -213,28 +114,13 @@ impl FeatureFlagService { } } - /// List all feature flags. #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "list"))] pub async fn list(&self) -> Result, FlagError> { - let db_span = TracingService::db_query_span( - "SELECT key, enabled, description, updated_at FROM feature_flags ORDER BY key", - "postgres", - "SELECT", - ); - let _db_enter = db_span.enter(); - let rows: Vec<(String, bool, String, DateTime)> = sqlx::query_as( "SELECT key, enabled, description, updated_at FROM feature_flags ORDER BY key", ) .fetch_all(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - - db_span.record("db.rows_affected", rows.len() as i64); - drop(_db_enter); + .await?; Ok(rows .into_iter() @@ -247,19 +133,9 @@ impl FeatureFlagService { .collect()) } - /// Create or update a feature flag. - /// - /// This method upserts the flag in PostgreSQL and invalidates the cache. 
#[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "set"))] pub async fn set(&self, key: &str, enabled: bool, description: &str) -> Result<(), FlagError> { - let db_span = TracingService::db_query_span( - "INSERT INTO feature_flags ... ON CONFLICT DO UPDATE", - "postgres", - "UPSERT", - ); - let _db_enter = db_span.enter(); - - let result = sqlx::query( + sqlx::query( r#" INSERT INTO feature_flags (key, enabled, description, updated_at) VALUES ($1, $2, $3, $4) @@ -274,46 +150,19 @@ impl FeatureFlagService { .bind(description) .bind(Utc::now()) .execute(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - - db_span.record("db.rows_affected", result.rows_affected() as i64); - drop(_db_enter); + .await?; - // Invalidate cache self.invalidate_cache(key).await?; - info!(key = %key, enabled = enabled, "Feature flag updated"); Ok(()) } - /// Delete a feature flag. - /// - /// # Errors - /// Returns [`FlagError::NotFound`] if the flag doesn't exist. #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "delete"))] pub async fn delete(&self, key: &str) -> Result<(), FlagError> { - let db_span = TracingService::db_query_span( - "DELETE FROM feature_flags WHERE key = $1", - "postgres", - "DELETE", - ); - let _db_enter = db_span.enter(); - let result = sqlx::query("DELETE FROM feature_flags WHERE key = $1") .bind(key) .execute(&self.db) - .await - .map_err(|e| { - TracingService::record_error(&db_span, &e.to_string(), "database"); - e - })?; - - db_span.record("db.rows_affected", result.rows_affected() as i64); - drop(_db_enter); + .await?; if result.rows_affected() == 0 { return Err(FlagError::NotFound(key.to_string())); @@ -324,30 +173,10 @@ impl FeatureFlagService { Ok(()) } - /// Invalidate the Redis cache for a specific flag. 
- #[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "invalidate_cache"))] async fn invalidate_cache(&self, key: &str) -> Result<(), FlagError> { - let cache_key = format!("flag:{}", key); - - let redis_span = TracingService::redis_command_span("DEL", Some(&cache_key)); - let _redis_enter = redis_span.enter(); - - let mut conn = self - .redis - .get_multiplexed_async_connection() - .await - .map_err(|e| { - TracingService::record_error(&redis_span, &e.to_string(), "redis_connection"); - e - })?; - - let deleted: i32 = conn.del(&cache_key).await.map_err(|e| { - TracingService::record_error(&redis_span, &e.to_string(), "redis_del"); - e - })?; - - drop(_redis_enter); - + let cache_key = format!("flag:{key}"); + let mut conn = self.redis.get_multiplexed_async_connection().await?; + let deleted: i32 = conn.del(&cache_key).await?; if deleted > 0 { debug!(key = %key, "Invalidated feature flag cache"); } else { @@ -356,68 +185,32 @@ impl FeatureFlagService { Ok(()) } - /// Flush all feature flag cache entries (useful for testing / maintenance). - /// - /// This uses a Redis SCAN to find all keys matching `flag:*` and deletes them. 
#[instrument(skip(self), fields(service.name = "FeatureFlagService", service.method = "flush_cache"))] pub async fn flush_cache(&self) -> Result { - let keys_span = TracingService::redis_command_span("KEYS", Some("flag:*")); - let _keys_enter = keys_span.enter(); - - let mut conn = self - .redis - .get_multiplexed_async_connection() - .await - .map_err(|e| { - TracingService::record_error(&keys_span, &e.to_string(), "redis_connection"); - e - })?; - + let mut conn = self.redis.get_multiplexed_async_connection().await?; let keys: Vec = redis::cmd("KEYS") .arg("flag:*") .query_async(&mut conn) - .await - .map_err(|e| { - TracingService::record_error(&keys_span, &e.to_string(), "redis_keys"); - e - })?; - - drop(_keys_enter); + .await?; if keys.is_empty() { - debug!("No feature flag cache entries to flush"); return Ok(0); } let count = keys.len(); - - let del_span = TracingService::redis_command_span("DEL", None); - let _del_enter = del_span.enter(); - for key in keys { - let _: () = conn.del(&key).await.map_err(|e| { - TracingService::record_error(&del_span, &e.to_string(), "redis_del"); - e - })?; + let _: () = conn.del(&key).await?; } - drop(_del_enter); - info!(count = count, "Flushed feature flag cache"); Ok(count) } } -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - #[cfg(test)] mod tests { use super::*; - // Unit tests that do not require live database/Redis connections. 
- #[test] fn test_flag_error_display() { let err = FlagError::NotFound("test_flag".to_string()); diff --git a/backend/src/services/log_alerts.rs b/backend/src/services/log_alerts.rs index c08e169..7fbbe6c 100644 --- a/backend/src/services/log_alerts.rs +++ b/backend/src/services/log_alerts.rs @@ -1,200 +1,27 @@ -use crate::error::AppError; -use axum::{ - extract::{Path, State}, - routing::{get, post}, - Json, Router, -}; -use serde::{Deserialize, Serialize}; -use sqlx::PgPool; -use std::sync::Arc; -use uuid::Uuid; - -#[derive(Debug, Serialize, Deserialize, sqlx::FromRow)] -pub struct LogAlertRule { - pub id: Uuid, - pub name: String, - pub pattern: String, - pub threshold: i32, - pub interval_seconds: i32, - pub is_enabled: bool, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct CreateRuleRequest { - pub name: String, - pub pattern: String, - pub threshold: i32, - pub interval_seconds: i32, -} - -#[derive(Debug, Serialize, Deserialize, sqlx::FromRow)] -pub struct LogAlert { - pub id: Uuid, - pub rule_id: Uuid, - pub message: String, - pub triggered_at: chrono::DateTime, -} - -pub struct ServiceState { - pub db: PgPool, - pub redis: redis::Client, -} - -pub fn router() -> Router { - Router::new() - .route("/rules", post(create_rule).get(list_rules)) - .route("/rules/:id", get(get_rule)) - .route("/ingest", post(ingest_log)) -} - -async fn create_rule( - State(state): State>, - Json(payload): Json, -) -> Result, AppError> { - let rule = sqlx::query_as::<_, LogAlertRule>( - "INSERT INTO log_alert_rules (name, pattern, threshold, interval_seconds) - VALUES ($1, $2, $3, $4) RETURNING *", - ) - .bind(payload.name) - .bind(payload.pattern) - .bind(payload.threshold) - .bind(payload.interval_seconds) - .fetch_one(&state.db) - .await?; - - Ok(Json(rule)) -} - -async fn list_rules( - State(state): State>, -) -> Result>, AppError> { - let rules = sqlx::query_as::<_, LogAlertRule>("SELECT * FROM log_alert_rules") - .fetch_all(&state.db) - .await?; - Ok(Json(rules)) -} 
- -async fn get_rule( - State(state): State>, - Path(id): Path, -) -> Result, AppError> { - let rule = sqlx::query_as::<_, LogAlertRule>("SELECT * FROM log_alert_rules WHERE id = $1") - .bind(id) - .fetch_optional(&state.db) - .await? - .ok_or_else(|| AppError::NotFound(format!("Rule not found: {}", id)))?; - - Ok(Json(rule)) -} - -#[derive(Debug, Deserialize)] -pub struct LogEntry { - pub message: String, - pub level: String, -} - -async fn ingest_log( - State(state): State>, - Json(log): Json, -) -> Result, AppError> { - tracing::info!("Processing log: {}", log.message); - - // 1. Fetch all enabled rules - let rules = - sqlx::query_as::<_, LogAlertRule>("SELECT * FROM log_alert_rules WHERE is_enabled = true") - .fetch_all(&state.db) - .await?; - - let mut matched_rules = Vec::new(); - - for rule in rules { - if log.message.contains(&rule.pattern) { - tracing::debug!("Log matched pattern for rule: {}", rule.name); - - // 2. Increment count in Redis with TTL - let redis_key = format!( - "alert_count:{}:{}", - rule.id, - chrono::Utc::now().timestamp() / rule.interval_seconds as i64 - ); - let mut conn = state.redis.get_async_connection().await?; - - let count: i32 = redis::cmd("INCR") - .arg(&redis_key) - .query_async(&mut conn) - .await?; - - // Set TTL if new key - if count == 1 { - let _: () = redis::cmd("EXPIRE") - .arg(&redis_key) - .arg(rule.interval_seconds) - .query_async(&mut conn) - .await?; - } - - // 3. Check if threshold reached - if count >= rule.threshold { - tracing::warn!( - "Threshold reached for rule: {}. Triggering alert!", - rule.name - ); - - // 4. 
Persist alert - sqlx::query("INSERT INTO log_alerts (rule_id, message) VALUES ($1, $2)") - .bind(rule.id) - .bind(format!( - "Threshold of {} reached for pattern '{}'", - rule.threshold, rule.pattern - )) - .execute(&state.db) - .await?; - - matched_rules.push(rule.name); - } - } - } - - Ok(Json(serde_json::json!({ - "status": "processed", - "matched": matched_rules - }))) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_pattern_matching() { - let pattern = "error"; - let message = "This is an error message"; - assert!(message.contains(pattern)); - } -} - -// Log alerting service for monitoring log entries and triggering alerts. -// -// This module provides threshold-based alerting on top of the log aggregation -// pipeline. Alerts are evaluated against configurable rules and can be -// dispatched to multiple channels (in-memory queue, Redis pub/sub). -// -// # Example -// ```rust,no_run -// use backend::services::log_alerts::{AlertManager, AlertRule, AlertSeverity}; -// -// # async fn example() { -// let manager = AlertManager::new(); -// manager.add_rule(AlertRule { -// id: uuid::Uuid::new_v4(), -// name: "High error rate".to_string(), -// pattern: "ERROR".to_string(), -// severity: AlertSeverity::Critical, -// threshold: 5, -// window_secs: 60, -// }).await; -// # } -// ``` +//! Log alerting service for monitoring log entries and triggering alerts. +//! +//! This module provides threshold-based alerting on top of the log aggregation +//! pipeline. Alerts are evaluated against configurable rules and can be +//! dispatched to multiple channels (in-memory queue, Redis pub/sub). +//! +//! # Example +//! ```rust,no_run +//! use backend::services::log_alerts::{AlertManager, AlertRule, AlertSeverity}; +//! +//! # async fn example() { +//! let manager = AlertManager::new(); +//! manager.add_rule(AlertRule { +//! id: uuid::Uuid::new_v4(), +//! name: "High error rate".to_string(), +//! pattern: "ERROR".to_string(), +//! 
severity: AlertSeverity::Critical, +//! threshold: 5, +//! window_secs: 60, +//! }).await; +//! # } +//! ``` + +#![allow(dead_code)] use chrono::{DateTime, Utc}; use std::collections::HashMap; @@ -275,9 +102,7 @@ impl AlertRule { /// Validate that the rule has sensible configuration values. pub fn validate(&self) -> Result<(), AlertError> { if self.name.trim().is_empty() { - return Err(AlertError::InvalidRule( - "name must not be empty".to_string(), - )); + return Err(AlertError::InvalidRule("name must not be empty".to_string())); } if self.pattern.trim().is_empty() { return Err(AlertError::InvalidRule( @@ -322,7 +147,6 @@ pub struct Alert { /// Tracks recent log-entry timestamps per rule for sliding-window evaluation. #[derive(Debug, Default)] struct RuleState { - /// Timestamps of log entries that matched this rule. hits: Vec>, } @@ -357,8 +181,6 @@ impl AlertManager { } /// Add or replace an alert rule. - /// - /// Returns an error if the rule fails validation. pub async fn add_rule(&self, rule: AlertRule) -> Result<(), AlertError> { rule.validate()?; let id = rule.id; @@ -385,10 +207,6 @@ impl AlertManager { } /// Evaluate a [`LogEntry`] against all active rules. - /// - /// For each rule whose pattern matches the entry's message, the hit is - /// recorded. If the sliding-window count reaches the rule's threshold an - /// [`Alert`] is fired and stored. pub async fn evaluate(&self, entry: &LogEntry) { let rules = self.rules.read().await; let mut states = self.rule_states.write().await; @@ -428,7 +246,6 @@ impl AlertManager { fired_at: Utc::now(), acknowledged: false, }); - // Reset hits so the alert doesn't re-fire on every subsequent entry. 
state.hits.clear(); } } @@ -525,8 +342,6 @@ mod tests { } } - // --- AlertRule validation --- - #[test] fn test_rule_validation_empty_name() { let mut rule = make_rule("ERROR", 3, 60); @@ -559,15 +374,12 @@ mod tests { assert!(rule.validate().is_ok()); } - // --- AlertManager CRUD --- - #[tokio::test] async fn test_add_and_get_rules() { let manager = AlertManager::new(); let rule = make_rule("ERROR", 3, 60); let id = rule.id; manager.add_rule(rule).await.unwrap(); - let rules = manager.get_rules().await; assert_eq!(rules.len(), 1); assert_eq!(rules[0].id, id); @@ -590,16 +402,12 @@ mod tests { assert!(matches!(result, Err(AlertError::RuleNotFound(_)))); } - // --- Alert evaluation --- - #[tokio::test] async fn test_no_alert_below_threshold() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERROR", 3, 60)).await.unwrap(); - manager.evaluate(&make_entry("ERROR occurred")).await; manager.evaluate(&make_entry("ERROR occurred")).await; - assert!(manager.get_alerts(None).await.is_empty()); } @@ -607,11 +415,9 @@ mod tests { async fn test_alert_fires_at_threshold() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERROR", 3, 60)).await.unwrap(); - for _ in 0..3 { manager.evaluate(&make_entry("ERROR occurred")).await; } - let alerts = manager.get_alerts(None).await; assert_eq!(alerts.len(), 1); assert_eq!(alerts[0].match_count, 3); @@ -621,11 +427,7 @@ mod tests { async fn test_non_matching_entry_does_not_fire() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERROR", 1, 60)).await.unwrap(); - - manager - .evaluate(&make_entry("INFO everything is fine")) - .await; - + manager.evaluate(&make_entry("INFO everything is fine")).await; assert!(manager.get_alerts(None).await.is_empty()); } @@ -633,32 +435,23 @@ mod tests { async fn test_alert_resets_after_firing() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERROR", 2, 60)).await.unwrap(); - - // First batch – fires manager.evaluate(&make_entry("ERROR a")).await; 
manager.evaluate(&make_entry("ERROR b")).await; assert_eq!(manager.get_alerts(None).await.len(), 1); - - // Second batch – fires again after reset manager.evaluate(&make_entry("ERROR c")).await; manager.evaluate(&make_entry("ERROR d")).await; assert_eq!(manager.get_alerts(None).await.len(), 2); } - // --- Acknowledge --- - #[tokio::test] async fn test_acknowledge_alert() { let manager = AlertManager::new(); manager.add_rule(make_rule("CRIT", 1, 60)).await.unwrap(); manager.evaluate(&make_entry("CRIT failure")).await; - let alerts = manager.get_alerts(None).await; assert_eq!(alerts.len(), 1); let alert_id = alerts[0].id; - manager.acknowledge_alert(alert_id).await.unwrap(); - let active = manager.get_active_alerts().await; assert!(active.is_empty()); } @@ -670,37 +463,28 @@ mod tests { assert!(matches!(result, Err(AlertError::AlertNotFound(_)))); } - // --- Severity filter --- - #[tokio::test] async fn test_filter_alerts_by_severity() { let manager = AlertManager::new(); - let mut warn_rule = make_rule("WARN", 1, 60); warn_rule.severity = AlertSeverity::Warning; manager.add_rule(warn_rule).await.unwrap(); - let mut crit_rule = make_rule("CRIT", 1, 60); crit_rule.severity = AlertSeverity::Critical; manager.add_rule(crit_rule).await.unwrap(); - manager.evaluate(&make_entry("WARN something")).await; manager.evaluate(&make_entry("CRIT something")).await; - let critical = manager.get_alerts(Some(AlertSeverity::Critical)).await; assert_eq!(critical.len(), 1); assert_eq!(critical[0].severity, AlertSeverity::Critical); } - // --- Clear --- - #[tokio::test] async fn test_clear_alerts() { let manager = AlertManager::new(); manager.add_rule(make_rule("ERR", 1, 60)).await.unwrap(); manager.evaluate(&make_entry("ERR boom")).await; assert!(!manager.get_alerts(None).await.is_empty()); - manager.clear_alerts().await; assert!(manager.get_alerts(None).await.is_empty()); } diff --git a/backend/src/services/mod.rs b/backend/src/services/mod.rs index d90e2b2..5db99b5 100644 --- 
a/backend/src/services/mod.rs +++ b/backend/src/services/mod.rs @@ -1,8 +1,5 @@ pub mod alerts; pub mod business_metrics; -pub mod log_alerts; -pub mod dedup; -pub mod cache_metrics; pub mod error_recovery; pub mod feature_flags; pub mod log_aggregator; diff --git a/backend/src/services/sys_metrics.rs b/backend/src/services/sys_metrics.rs index 6b533d0..8865d18 100644 --- a/backend/src/services/sys_metrics.rs +++ b/backend/src/services/sys_metrics.rs @@ -1,101 +1,44 @@ -//! Build System Metrics Exporter -//! -//! This module provides a production-ready metrics exporter for build system operations. -//! It collects and persists build-related metrics including compilation times, dependency counts, -//! cache hit rates, and system resource usage. The service uses PostgreSQL for durability -//! and Redis for high-performance caching. -//! -//! # Example -//! ```rust,no_run -//! use backend::services::sys_metrics::BuildMetricsService; -//! use sqlx::PgPool; -//! use redis::Client; -//! -//! # async fn example(pool: PgPool, redis: Client) -> anyhow::Result<()> { -//! let service = BuildMetricsService::new(pool, redis); -//! -//! // Record a build metric -//! let metric = BuildMetric { -//! project_name: "crucible".to_string(), -//! build_id: "build-123".to_string(), -//! build_status: BuildStatus::Success, -//! compilation_time_ms: 5000, -//! dependency_count: 42, -//! cache_hit_rate: Some(85.5), -//! cpu_usage: Some(75.2), -//! memory_usage_mb: Some(1024), -//! build_timestamp: Utc::now(), -//! }; -//! service.record_build(metric).await?; -//! -//! // Query metrics -//! let metrics = service.get_project_metrics("crucible", 10).await?; -//! # Ok(()) -//! # } -//! ``` +//! System metrics and build metrics services. 
+ +#![allow(dead_code)] use chrono::{DateTime, Utc}; use redis::{AsyncCommands, Client as RedisClient}; use rust_decimal::Decimal; use serde::{Deserialize, Serialize}; use sqlx::PgPool; +use std::sync::Arc; use thiserror::Error; -use tracing::{debug, error, info, warn}; +use tokio::sync::RwLock; +use tracing::{debug, info, instrument}; use uuid::Uuid; +use crate::services::tracing::TracingService; + // --------------------------------------------------------------------------- -// Error types +// MetricsError // --------------------------------------------------------------------------- -/// Errors that can occur in the build metrics service. #[derive(Debug, Error)] pub enum MetricsError { - /// A database error occurred. #[error("Database error: {0}")] Database(#[from] sqlx::Error), - - /// A Redis error occurred. #[error("Redis error: {0}")] Redis(#[from] redis::RedisError), - - /// Serialization error. #[error("Serialization error: {0}")] Serialization(String), - - /// The requested project was not found. #[error("Project not found: {0}")] ProjectNotFound(String), - - /// Invalid build status. #[error("Invalid build status: {0}")] InvalidStatus(String), - - /// An internal error occurred. #[error("Internal error: {0}")] Internal(String), } -use crate::services::tracing::TracingService; -use std::sync::Arc; -use tokio::sync::RwLock; - -pub struct MetricsExporter { - current_metrics: Arc>, -} - -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct SystemMetrics { - pub cpu_usage: f64, - pub memory_usage: u64, - pub uptime: u64, - pub timestamp: DateTime, -} - // --------------------------------------------------------------------------- -// Domain types +// BuildStatus // --------------------------------------------------------------------------- -/// Build status enumeration. 
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "lowercase")] pub enum BuildStatus { @@ -126,47 +69,32 @@ impl BuildStatus { } } -/// Build system metrics record. +// --------------------------------------------------------------------------- +// BuildMetric +// --------------------------------------------------------------------------- + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BuildMetric { - /// Unique identifier for the metric record. pub id: Option, - /// Name of the project being built. pub project_name: String, - /// Unique build identifier. pub build_id: String, - /// Status of the build. pub build_status: BuildStatus, - /// Compilation time in milliseconds. pub compilation_time_ms: i64, - /// Number of dependencies used. pub dependency_count: i32, - /// Cache hit rate percentage (0-100). pub cache_hit_rate: Option, - /// CPU usage percentage during build. pub cpu_usage: Option, - /// Memory usage in MB during build. pub memory_usage_mb: Option, - /// Timestamp when the build occurred. pub build_timestamp: DateTime, } -/// Aggregated build metrics summary. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BuildMetricsSummary { - /// Project name. pub project_name: String, - /// Total number of builds. pub total_builds: i64, - /// Number of successful builds. pub successful_builds: i64, - /// Number of failed builds. pub failed_builds: i64, - /// Average compilation time in milliseconds. pub avg_compilation_time_ms: Decimal, - /// Success rate percentage. pub success_rate: Decimal, - /// Average cache hit rate. pub avg_cache_hit_rate: Option, } @@ -174,38 +102,24 @@ pub struct BuildMetricsSummary { // BuildMetricsService // --------------------------------------------------------------------------- -/// Service for collecting and managing build system metrics with PostgreSQL persistence -/// and Redis caching. 
pub struct BuildMetricsService { db: PgPool, redis: RedisClient, } impl BuildMetricsService { - /// Create a new build metrics service. - /// - /// # Arguments - /// - `db`: PostgreSQL connection pool - /// - `redis`: Redis client pub fn new(db: PgPool, redis: RedisClient) -> Self { Self { db, redis } } - /// Record a build metric. - /// - /// This method persists the metric to PostgreSQL and invalidates relevant cache entries. - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database operation fails. - /// Returns [`MetricsError::Redis`] if the cache invalidation fails. pub async fn record_build(&self, metric: BuildMetric) -> Result { let id = Uuid::new_v4(); let status_str = metric.build_status.as_str(); sqlx::query( r#" - INSERT INTO build_metrics - (id, project_name, build_id, build_status, compilation_time_ms, + INSERT INTO build_metrics + (id, project_name, build_id, build_status, compilation_time_ms, dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) "#, @@ -216,14 +130,13 @@ impl BuildMetricsService { .bind(status_str) .bind(metric.compilation_time_ms) .bind(metric.dependency_count) - .bind(metric.cache_hit_rate) - .bind(metric.cpu_usage) + .bind(metric.cache_hit_rate.map(|d| d.to_string())) + .bind(metric.cpu_usage.map(|d| d.to_string())) .bind(metric.memory_usage_mb) .bind(metric.build_timestamp) .execute(&self.db) .await?; - // Invalidate cache for this project self.invalidate_project_cache(&metric.project_name).await?; info!( @@ -236,26 +149,12 @@ impl BuildMetricsService { Ok(id) } - /// Get metrics for a specific project. - /// - /// This method first checks Redis cache. On cache miss, it queries PostgreSQL - /// and populates the cache with a 5-minute TTL. 
- /// - /// # Arguments - /// - `project_name`: Name of the project - /// - `limit`: Maximum number of records to return - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database query fails. - /// Returns [`MetricsError::Redis`] if the cache operation fails. pub async fn get_project_metrics( &self, project_name: &str, limit: i64, ) -> Result, MetricsError> { let cache_key = format!("build_metrics:{}:{}", project_name, limit); - - // Try cache first let mut conn = self.redis.get_multiplexed_async_connection().await?; let cached: Option = conn.get(&cache_key).await?; @@ -266,12 +165,22 @@ impl BuildMetricsService { return Ok(metrics); } - // Cache miss – query database debug!(project = %project_name, "Build metrics cache miss – querying database"); - let rows = sqlx::query_as( + let rows: Vec<( + Uuid, + String, + String, + String, + i64, + i32, + Option, + Option, + Option, + DateTime, + )> = sqlx::query_as( r#" SELECT id, project_name, build_id, build_status, compilation_time_ms, - dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp + dependency_count, cache_hit_rate::float8, cpu_usage::float8, memory_usage_mb, build_timestamp FROM build_metrics WHERE project_name = $1 ORDER BY build_timestamp DESC @@ -297,54 +206,43 @@ impl BuildMetricsService { cpu_usage, memory_usage_mb, build_timestamp, - )| { - BuildMetric { - id: Some(id), - project_name, - build_id, - build_status: BuildStatus::from_str(&status_str) - .unwrap_or(BuildStatus::Failed), - compilation_time_ms, - dependency_count, - cache_hit_rate, - cpu_usage, - memory_usage_mb, - build_timestamp, - } + )| BuildMetric { + id: Some(id), + project_name, + build_id, + build_status: BuildStatus::from_str(&status_str) + .unwrap_or(BuildStatus::Failed), + compilation_time_ms, + dependency_count, + cache_hit_rate: cache_hit_rate.map(Decimal::try_from).and_then(|r| r.ok()), + cpu_usage: cpu_usage.map(Decimal::try_from).and_then(|r| r.ok()), + memory_usage_mb, + 
build_timestamp, }, ) .collect(); - // Populate cache with 5-minute TTL if !metrics.is_empty() { let json = serde_json::to_string(&metrics) .map_err(|e| MetricsError::Serialization(e.to_string()))?; let _: () = conn.set_ex(&cache_key, json, 300).await?; - debug!(project = %project_name, count = metrics.len(), "Cached build metrics"); } Ok(metrics) } - /// Get aggregated metrics summary for a project. - /// - /// # Arguments - /// - `project_name`: Name of the project - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database query fails. pub async fn get_project_summary( &self, project_name: &str, ) -> Result { - let row: Option<(i64, i64, i64, Option, Option)> = sqlx::query_as( + let row: Option<(i64, i64, i64, Option, Option)> = sqlx::query_as( r#" - SELECT + SELECT COUNT(*) as total_builds, SUM(CASE WHEN build_status = 'success' THEN 1 ELSE 0 END) as successful_builds, SUM(CASE WHEN build_status = 'failed' THEN 1 ELSE 0 END) as failed_builds, - AVG(compilation_time_ms) as avg_compilation_time, - AVG(cache_hit_rate) as avg_cache_hit_rate + AVG(compilation_time_ms)::float8 as avg_compilation_time, + AVG(cache_hit_rate)::float8 as avg_cache_hit_rate FROM build_metrics WHERE project_name = $1 "#, @@ -363,7 +261,7 @@ impl BuildMetricsService { )) => { let success_rate = if total_builds > 0 { Decimal::from(successful_builds) / Decimal::from(total_builds) - * Decimal::from(100) + * Decimal::from(100u32) } else { Decimal::ZERO }; @@ -373,27 +271,36 @@ impl BuildMetricsService { total_builds, successful_builds, failed_builds, - avg_compilation_time_ms: avg_compilation_time.unwrap_or(Decimal::ZERO), + avg_compilation_time_ms: avg_compilation_time + .map(Decimal::try_from) + .and_then(|r| r.ok()) + .unwrap_or(Decimal::ZERO), success_rate, - avg_cache_hit_rate, + avg_cache_hit_rate: avg_cache_hit_rate + .map(Decimal::try_from) + .and_then(|r| r.ok()), }) } None => Err(MetricsError::ProjectNotFound(project_name.to_string())), } } - /// Get recent build 
metrics across all projects. - /// - /// # Arguments - /// - `limit`: Maximum number of records to return - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database query fails. pub async fn get_recent_metrics(&self, limit: i64) -> Result, MetricsError> { - let rows = sqlx::query_as( + let rows: Vec<( + Uuid, + String, + String, + String, + i64, + i32, + Option, + Option, + Option, + DateTime, + )> = sqlx::query_as( r#" SELECT id, project_name, build_id, build_status, compilation_time_ms, - dependency_count, cache_hit_rate, cpu_usage, memory_usage_mb, build_timestamp + dependency_count, cache_hit_rate::float8, cpu_usage::float8, memory_usage_mb, build_timestamp FROM build_metrics ORDER BY build_timestamp DESC LIMIT $1 @@ -417,32 +324,23 @@ impl BuildMetricsService { cpu_usage, memory_usage_mb, build_timestamp, - )| { - BuildMetric { - id: Some(id), - project_name, - build_id, - build_status: BuildStatus::from_str(&status_str) - .unwrap_or(BuildStatus::Failed), - compilation_time_ms, - dependency_count, - cache_hit_rate, - cpu_usage, - memory_usage_mb, - build_timestamp, - } + )| BuildMetric { + id: Some(id), + project_name, + build_id, + build_status: BuildStatus::from_str(&status_str) + .unwrap_or(BuildStatus::Failed), + compilation_time_ms, + dependency_count, + cache_hit_rate: cache_hit_rate.map(Decimal::try_from).and_then(|r| r.ok()), + cpu_usage: cpu_usage.map(Decimal::try_from).and_then(|r| r.ok()), + memory_usage_mb, + build_timestamp, }, ) .collect()) } - /// Delete all metrics for a project. - /// - /// # Arguments - /// - `project_name`: Name of the project - /// - /// # Errors - /// Returns [`MetricsError::Database`] if the database operation fails. 
pub async fn delete_project_metrics(&self, project_name: &str) -> Result { let result = sqlx::query("DELETE FROM build_metrics WHERE project_name = $1") .bind(project_name) @@ -460,27 +358,42 @@ impl BuildMetricsService { Ok(result.rows_affected()) } - /// Invalidate Redis cache for a specific project. async fn invalidate_project_cache(&self, project_name: &str) -> Result<(), MetricsError> { let mut conn = self.redis.get_multiplexed_async_connection().await?; - - // Delete all cache keys for this project using SCAN let pattern = format!("build_metrics:{}:*", project_name); let keys: Vec = redis::cmd("KEYS") .arg(&pattern) .query_async(&mut conn) .await?; + for key in &keys { + let _: () = conn.del(key).await?; + } + if !keys.is_empty() { - for key in keys { - let _: () = conn.del(&key).await?; - } debug!(project = %project_name, count = keys.len(), "Invalidated project cache"); } + Ok(()) } } +// --------------------------------------------------------------------------- +// SystemMetrics + MetricsExporter +// --------------------------------------------------------------------------- + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct SystemMetrics { + pub cpu_usage: f64, + pub memory_usage: u64, + pub uptime: u64, + pub timestamp: DateTime, +} + +pub struct MetricsExporter { + current_metrics: Arc>, +} + impl Default for MetricsExporter { fn default() -> Self { Self::new() @@ -497,10 +410,10 @@ impl MetricsExporter { } } + #[instrument(skip(self), fields(service.name = "MetricsExporter", service.method = "update_metrics"))] pub async fn update_metrics(&self, cpu: f64, mem: u64, uptime: u64) { let span = TracingService::service_method_span("MetricsExporter", "update_metrics"); let _enter = span.enter(); - let mut metrics = self.current_metrics.write().await; metrics.cpu_usage = cpu; metrics.memory_usage = mem; @@ -512,14 +425,10 @@ impl MetricsExporter { pub async fn get_metrics(&self) -> SystemMetrics { let span = 
TracingService::service_method_span("MetricsExporter", "get_metrics"); let _enter = span.enter(); - self.current_metrics.read().await.clone() } pub async fn run_collector(exporter: Arc) { - let span = TracingService::service_method_span("MetricsExporter", "run_collector"); - let _enter = span.enter(); - info!("Starting system metrics collector worker"); let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(5)); let start_time = Utc::now(); @@ -527,10 +436,7 @@ impl MetricsExporter { loop { interval.tick().await; let uptime = (Utc::now() - start_time).num_seconds() as u64; - // Simulated metrics collection - exporter - .update_metrics(12.5, 1024 * 1024 * 512, uptime) - .await; + exporter.update_metrics(12.5, 1024 * 1024 * 512, uptime).await; } } } @@ -542,7 +448,6 @@ impl MetricsExporter { #[cfg(test)] mod tests { use super::*; - use rust_decimal_macros::dec; #[test] fn test_build_status_conversion() { @@ -571,8 +476,8 @@ mod tests { build_status: BuildStatus::Success, compilation_time_ms: 5000, dependency_count: 42, - cache_hit_rate: Some(dec!(85.5)), - cpu_usage: Some(dec!(75.2)), + cache_hit_rate: Some(Decimal::from(85u32)), + cpu_usage: Some(Decimal::from(75u32)), memory_usage_mb: Some(1024), build_timestamp: Utc::now(), }; @@ -595,23 +500,6 @@ mod tests { assert!(err.to_string().contains("unknown")); } - #[test] - fn test_build_metrics_summary() { - let summary = BuildMetricsSummary { - project_name: "test".to_string(), - total_builds: 100, - successful_builds: 95, - failed_builds: 5, - avg_compilation_time_ms: dec!(5000), - success_rate: dec!(95), - avg_cache_hit_rate: Some(dec!(80)), - }; - - let json = serde_json::to_string(&summary).unwrap(); - assert!(json.contains("test")); - assert!(json.contains("95")); - } - #[tokio::test] async fn test_build_status_roundtrip() { let statuses = vec![ @@ -620,7 +508,6 @@ mod tests { BuildStatus::Cancelled, BuildStatus::Running, ]; - for status in statuses { let s = status.as_str(); let parsed = 
BuildStatus::from_str(s).unwrap(); @@ -632,7 +519,6 @@ mod tests { async fn test_metrics_collection() { let exporter = MetricsExporter::new(); exporter.update_metrics(25.0, 1024, 60).await; - let metrics = exporter.get_metrics().await; assert_eq!(metrics.cpu_usage, 25.0); assert_eq!(metrics.memory_usage, 1024); diff --git a/backend/src/services/tracing.rs b/backend/src/services/tracing.rs index 0bcdd83..a23f818 100644 --- a/backend/src/services/tracing.rs +++ b/backend/src/services/tracing.rs @@ -1,21 +1,15 @@ -//! OpenTelemetry tracing service for production-grade observability +//! OpenTelemetry tracing service for production-grade observability. //! -//! This module provides the centralized tracing hub for the Crucible backend, -//! implementing OTLP exporter with Jaeger/Zipkin compatibility, semantic conventions, +//! Provides the centralized tracing hub for the Crucible backend, implementing +//! OTLP exporter with Jaeger/Zipkin compatibility, semantic conventions, //! sampling strategies, and proper error propagation. -//! -//! # Features -//! - OTLP/gRPC exporter (Jaeger/Zipkin compatible) -//! - Head-based and tail-based sampling strategies -//! - Semantic conventions for HTTP, DB, and service operations -//! - Resource detection with deployment environment -//! - Span limits and baggage propagation -//! 
- Zero-overhead when tracing is disabled + +#![allow(dead_code)] use opentelemetry::trace::TracerProvider as _; use opentelemetry::KeyValue; use opentelemetry_otlp::WithExportConfig; -use opentelemetry_sdk::trace::{Config, RandomIdGenerator, Sampler, TracerProvider}; +use opentelemetry_sdk::trace::{Config, RandomIdGenerator, Sampler}; use opentelemetry_sdk::Resource; use opentelemetry_semantic_conventions::resource; use std::time::Duration; @@ -23,27 +17,28 @@ use tracing::{info_span, warn}; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::{EnvFilter, Registry}; -/// Central tracing service for initialization and span creation -pub struct TracingService; +// --------------------------------------------------------------------------- +// TracingConfig +// --------------------------------------------------------------------------- -/// Configuration for the tracing service +/// Configuration for the tracing service. #[derive(Clone, Debug)] pub struct TracingConfig { - /// OTLP exporter endpoint (e.g., "http://jaeger:4317") + /// OTLP exporter endpoint (e.g., `"http://jaeger:4317"`). pub otlp_endpoint: String, - /// Service name for resource identification + /// Service name for resource identification. pub service_name: String, - /// Service version + /// Service version. pub service_version: String, - /// Deployment environment (dev, staging, production) + /// Deployment environment (`dev`, `staging`, `production`). pub environment: String, - /// Sampling ratio (0.0 to 1.0) + /// Sampling ratio in `[0.0, 1.0]`. pub sampling_ratio: f64, - /// Maximum number of attributes per span + /// Maximum number of attributes per span. pub max_attributes_per_span: u32, - /// Maximum number of events per span + /// Maximum number of events per span. pub max_events_per_span: u32, - /// Maximum number of links per span + /// Maximum number of links per span. 
pub max_links_per_span: u32, } @@ -53,7 +48,7 @@ impl Default for TracingConfig { otlp_endpoint: "http://localhost:4317".to_string(), service_name: "crucible-backend".to_string(), service_version: env!("CARGO_PKG_VERSION").to_string(), - environment: std::env::var("ENV").unwrap_or("dev".to_string()), + environment: std::env::var("ENV").unwrap_or_else(|_| "dev".to_string()), sampling_ratio: 1.0, max_attributes_per_span: 128, max_events_per_span: 128, @@ -63,7 +58,7 @@ impl Default for TracingConfig { } impl TracingConfig { - /// Create a new tracing configuration with defaults + /// Create a new configuration with the given service name and version. pub fn new(service_name: String, service_version: String) -> Self { Self { service_name, @@ -72,41 +67,49 @@ impl TracingConfig { } } - /// Set a custom OTLP endpoint + /// Override the OTLP endpoint. pub fn with_otlp_endpoint(mut self, endpoint: String) -> Self { self.otlp_endpoint = endpoint; self } - /// Set the deployment environment + /// Set the deployment environment and adjust sampling accordingly. pub fn with_environment(mut self, env: String) -> Self { - self.environment = env.clone(); self.sampling_ratio = match env.as_str() { "production" => 0.01, "staging" => 0.1, _ => 1.0, }; + self.environment = env; self } - /// Set custom sampling ratio (0.0 to 1.0) + /// Set a custom sampling ratio clamped to `[0.0, 1.0]`. pub fn with_sampling_ratio(mut self, ratio: f64) -> Self { self.sampling_ratio = ratio.max(0.0).min(1.0); self } } +// --------------------------------------------------------------------------- +// TracingService +// --------------------------------------------------------------------------- + +/// Central tracing service for initialization and span creation. +pub struct TracingService; + impl TracingService { - /// Initialize the global tracer provider with OTLP exporter + /// Initialize the global tracer provider with an OTLP exporter. 
pub fn init(config: TracingConfig) -> anyhow::Result<()> { - let resource = Resource::builder() - .with_attributes(vec![ - KeyValue::new(resource::SERVICE_NAME, config.service_name.clone()), - KeyValue::new(resource::SERVICE_VERSION, config.service_version.clone()), - KeyValue::new(resource::DEPLOYMENT_ENVIRONMENT, config.environment.clone()), - KeyValue::new("service.namespace", "crucible"), - ]) - .build(); + let resource = Resource::new(vec![ + KeyValue::new(resource::SERVICE_NAME, config.service_name.clone()), + KeyValue::new(resource::SERVICE_VERSION, config.service_version.clone()), + KeyValue::new( + resource::DEPLOYMENT_ENVIRONMENT, + config.environment.clone(), + ), + KeyValue::new("service.namespace", "crucible"), + ]); let sampler = if config.environment == "production" { Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased(config.sampling_ratio))) @@ -118,9 +121,9 @@ impl TracingService { .with_resource(resource) .with_sampler(sampler) .with_id_generator(RandomIdGenerator::default()) - .with_max_attributes_per_span(config.max_attributes_per_span as u32) - .with_max_events_per_span(config.max_events_per_span as u32) - .with_max_links_per_span(config.max_links_per_span as u32); + .with_max_attributes_per_span(config.max_attributes_per_span) + .with_max_events_per_span(config.max_events_per_span) + .with_max_links_per_span(config.max_links_per_span); let tracer_provider = opentelemetry_otlp::new_pipeline() .tracing() @@ -134,9 +137,7 @@ impl TracingService { .install_batch(opentelemetry_sdk::runtime::Tokio) .map_err(|e| anyhow::anyhow!("Failed to install OTLP exporter: {}", e))?; - // Get a tracer from the provider let tracer = tracer_provider.tracer("crucible-backend"); - let telemetry_layer = tracing_opentelemetry::layer().with_tracer(tracer); let subscriber = Registry::default() @@ -150,16 +151,18 @@ impl TracingService { tracing::subscriber::set_global_default(subscriber) .map_err(|e| anyhow::anyhow!("Failed to set global subscriber: {}", e))?; - 
tracing::info!("OpenTelemetry tracing initialized successfully"); - tracing::info!("Service: {}", config.service_name); - tracing::info!("Environment: {}", config.environment); - tracing::info!("OTLP Endpoint: {}", config.otlp_endpoint); - tracing::info!("Sampling Ratio: {:.1}%", config.sampling_ratio * 100.0); + tracing::info!( + service = %config.service_name, + environment = %config.environment, + otlp_endpoint = %config.otlp_endpoint, + sampling_pct = config.sampling_ratio * 100.0, + "OpenTelemetry tracing initialized" + ); Ok(()) } - /// Create an HTTP request span with semantic conventions + /// Create an HTTP request span with semantic conventions. pub fn http_request_span(method: &str, path: &str, user_id: Option<&str>) -> tracing::Span { info_span!( "http.request", @@ -174,7 +177,7 @@ impl TracingService { ) } - /// Create a database query span with semantic conventions + /// Create a database query span with semantic conventions. pub fn db_query_span(query: &str, db_system: &str, operation: &str) -> tracing::Span { let truncated_query = query .split('\n') @@ -196,7 +199,7 @@ impl TracingService { ) } - /// Create a Redis command span with semantic conventions + /// Create a Redis command span with semantic conventions. pub fn redis_command_span(command: &str, key: Option<&str>) -> tracing::Span { info_span!( "db.redis.command", @@ -208,7 +211,7 @@ impl TracingService { ) } - /// Create a service method span for business operations + /// Create a service method span for business operations. pub fn service_method_span(service_name: &str, method_name: &str) -> tracing::Span { info_span!( "service.method", @@ -219,7 +222,7 @@ impl TracingService { ) } - /// Create an async job/task span + /// Create an async job/task span. pub fn job_span(job_name: &str, job_id: &str) -> tracing::Span { info_span!( "job.execute", @@ -230,13 +233,17 @@ impl TracingService { ) } - /// Mark current span with error information + /// Record error information on the current span. 
pub fn record_error(span: &tracing::Span, error_message: &str, error_type: &str) { span.record("error.type", error_type); warn!("Span error recorded: {} ({})", error_message, error_type); } } +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + #[cfg(test)] mod tests { use super::*; @@ -256,6 +263,35 @@ mod tests { assert_eq!(config.sampling_ratio, 0.01); } + #[test] + fn test_tracing_config_staging_sampling() { + let config = TracingConfig::default().with_environment("staging".to_string()); + assert_eq!(config.sampling_ratio, 0.1); + } + + #[test] + fn test_tracing_config_dev_sampling() { + let config = TracingConfig::default().with_environment("dev".to_string()); + assert_eq!(config.sampling_ratio, 1.0); + } + + #[test] + fn test_sampling_ratio_bounds() { + let config = TracingConfig::default().with_sampling_ratio(1.5); + assert_eq!(config.sampling_ratio, 1.0); + + let config = TracingConfig::default().with_sampling_ratio(-0.5); + assert_eq!(config.sampling_ratio, 0.0); + } + + #[test] + fn test_config_clone() { + let cfg = TracingConfig::new("svc".to_string(), "1.0.0".to_string()); + let cloned = cfg.clone(); + assert_eq!(cfg.service_name, cloned.service_name); + assert_eq!(cfg.otlp_endpoint, cloned.otlp_endpoint); + } + #[test] fn test_http_span_creation() { let span = TracingService::http_request_span("GET", "/api/users", Some("user123")); @@ -289,13 +325,4 @@ mod tests { let span = TracingService::job_span("process_transaction", "job-456"); drop(span); } - - #[test] - fn test_sampling_ratio_bounds() { - let config = TracingConfig::default().with_sampling_ratio(1.5); - assert_eq!(config.sampling_ratio, 1.0); - - let config = TracingConfig::default().with_sampling_ratio(-0.5); - assert_eq!(config.sampling_ratio, 0.0); - } } diff --git a/backend/tests/load/dashboard_load.rs b/backend/tests/load/dashboard_load.rs new file mode 100644 index 
0000000..1a63013 --- /dev/null +++ b/backend/tests/load/dashboard_load.rs @@ -0,0 +1,453 @@ +//! Concurrent load tests for the `GET /api/dashboard` endpoint. +//! +//! These tests verify that the dashboard handler remains stable and correct +//! under concurrent load. The handler degrades gracefully when Redis is +//! unavailable (falls back to live service data), so tests run without any +//! external infrastructure. +//! +//! # Running +//! +//! ```bash +//! cargo test -p backend --test load_tests load::dashboard_load -- --nocapture +//! ``` + +use std::sync::Arc; +use std::time::Instant; + +use axum::{body::to_bytes, routing::get, Router}; +use axum::http::StatusCode; +use hyper::Request; +use tower::ServiceExt; + +use backend::api::handlers::dashboard::{get_dashboard, DashboardState}; +use backend::services::{ + alerts::AlertDispatcher, + error_recovery::ErrorManager, + log_alerts::AlertManager, + sys_metrics::MetricsExporter, +}; + +use crate::load::framework::{assert_load_result, LoadConfig, LoadResult}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Build a test router wired to `GET /api/dashboard` with mock state. +/// +/// Redis is pointed at a port that will refuse connections so the handler +/// exercises its graceful-degradation path (cache miss → live data). +fn build_app() -> Router { + let state = Arc::new(DashboardState { + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager: Arc::new(ErrorManager::new()), + alert_manager: Arc::new(AlertManager::new()), + // Unreachable Redis — handler must degrade gracefully. + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state) +} + +/// Run a full load test using the framework and return the [`LoadResult`]. 
+async fn run_framework_load(concurrency: usize, requests_per_task: usize) -> LoadResult { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(concurrency, requests_per_task); + run_load(cfg, || async { + let app = build_app(); + let start = Instant::now(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + (resp.status(), start.elapsed()) + }) + .await +} + +// --------------------------------------------------------------------------- +// Basic correctness +// --------------------------------------------------------------------------- + +/// Dashboard returns 200 even when Redis is unreachable. +#[tokio::test] +async fn test_dashboard_returns_200_without_redis() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(resp.status(), StatusCode::OK); +} + +/// Response body contains the three top-level keys. +#[tokio::test] +async fn test_dashboard_response_shape() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + assert!(json.get("metrics").is_some(), "must have 'metrics'"); + assert!( + json.get("active_recovery_tasks").is_some(), + "must have 'active_recovery_tasks'" + ); + assert!(json.get("active_alerts").is_some(), "must have 'active_alerts'"); +} + +/// `metrics` object contains the expected sub-fields. 
+#[tokio::test] +async fn test_dashboard_metrics_fields() { + let state = Arc::new(DashboardState { + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager: Arc::new(ErrorManager::new()), + alert_manager: Arc::new(AlertManager::new()), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + // Seed some metrics so the values are non-zero. + state.metrics_exporter.update_metrics(42.0, 2048, 120).await; + + let app = Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + assert_eq!(json["metrics"]["cpu_usage"], 42.0); + assert_eq!(json["metrics"]["memory_usage"], 2048); + assert_eq!(json["metrics"]["uptime"], 120); +} + +/// `active_recovery_tasks` reflects tasks registered in the error manager. 
+#[tokio::test] +async fn test_dashboard_includes_recovery_tasks() { + use backend::services::error_recovery::RecoveryError; + + let error_manager = Arc::new(ErrorManager::new()); + error_manager + .handle_error(RecoveryError::Internal("boom".into()), "worker_a") + .await + .unwrap(); + + let state = Arc::new(DashboardState { + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager, + alert_manager: Arc::new(AlertManager::new()), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let app = Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + let tasks = json["active_recovery_tasks"].as_array().unwrap(); + assert_eq!(tasks.len(), 1); + assert_eq!(tasks[0]["name"], "worker_a"); +} + +/// `active_alerts` reflects alerts fired by the alert manager. 
+#[tokio::test] +async fn test_dashboard_includes_active_alerts() { + use backend::services::log_alerts::{AlertRule, AlertSeverity}; + use backend::services::log_aggregator::LogEntry; + use chrono::Utc; + use uuid::Uuid; + + let alert_manager = Arc::new(AlertManager::new()); + alert_manager + .add_rule(AlertRule { + id: Uuid::new_v4(), + name: "test-rule".to_string(), + pattern: "CRITICAL".to_string(), + severity: AlertSeverity::Critical, + threshold: 1, + window_secs: 60, + }) + .await + .unwrap(); + + alert_manager + .evaluate(&LogEntry { + timestamp: Utc::now(), + level: "ERROR".to_string(), + message: "CRITICAL failure detected".to_string(), + service: "test".to_string(), + }) + .await; + + let state = Arc::new(DashboardState { + metrics_exporter: Arc::new(MetricsExporter::new()), + error_manager: Arc::new(ErrorManager::new()), + alert_manager, + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let app = Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + let alerts = json["active_alerts"].as_array().unwrap(); + assert_eq!(alerts.len(), 1, "one alert should be active"); + assert_eq!(alerts[0]["rule_name"], "test-rule"); + assert_eq!(alerts[0]["severity"], "critical"); +} + +/// Empty state returns empty arrays for tasks and alerts. 
+#[tokio::test] +async fn test_dashboard_empty_state() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + assert_eq!( + json["active_recovery_tasks"].as_array().unwrap().len(), + 0 + ); + assert_eq!(json["active_alerts"].as_array().unwrap().len(), 0); +} + +// --------------------------------------------------------------------------- +// Concurrency tests +// --------------------------------------------------------------------------- + +/// 10 concurrent requests all return 200. +#[tokio::test] +async fn test_dashboard_10_concurrent() { + let handles: Vec<_> = (0..10) + .map(|_| { + let app = build_app(); + tokio::spawn(async move { + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + resp.status() + }) + }) + .collect(); + + for handle in handles { + assert_eq!(handle.await.unwrap(), StatusCode::OK); + } +} + +/// 50 concurrent requests all return 200. +#[tokio::test] +async fn test_dashboard_50_concurrent() { + let handles: Vec<_> = (0..50) + .map(|_| { + let app = build_app(); + tokio::spawn(async move { + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + resp.status() + }) + }) + .collect(); + + for handle in handles { + assert_eq!(handle.await.unwrap(), StatusCode::OK); + } +} + +// --------------------------------------------------------------------------- +// Framework-based load tests with SLO assertions +// --------------------------------------------------------------------------- + +/// 10 concurrent tasks × 10 requests each = 100 total. +/// SLO: 0% errors, p99 < 500ms. 
+#[tokio::test] +async fn test_dashboard_load_100_requests_slo() { + let result = run_framework_load(10, 10).await; + result.print_summary("GET /api/dashboard — 100 requests"); + assert_load_result(&result, 0.0, std::time::Duration::from_millis(500)); +} + +/// 20 concurrent tasks × 10 requests each = 200 total. +/// SLO: 0% errors, p99 < 1s. +#[tokio::test] +async fn test_dashboard_load_200_requests_slo() { + let result = run_framework_load(20, 10).await; + result.print_summary("GET /api/dashboard — 200 requests"); + assert_load_result(&result, 0.0, std::time::Duration::from_secs(1)); +} + +/// Verify that all responses under load have the correct JSON shape. +#[tokio::test] +async fn test_dashboard_load_response_shape_under_load() { + let mut join_set = tokio::task::JoinSet::new(); + for _ in 0..5_usize { + join_set.spawn(async { + let mut results = Vec::new(); + for _ in 0..4_usize { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let status = resp.status(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + results.push((status, bytes.to_vec())); + } + results + }); + } + + while let Some(Ok(batch)) = join_set.join_next().await { + for (status, body) in batch { + assert_eq!(status, StatusCode::OK); + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert!(json.get("metrics").is_some()); + assert!(json.get("active_recovery_tasks").is_some()); + assert!(json.get("active_alerts").is_some()); + } + } +} + +/// Verify that shared state is read consistently under concurrent load. +/// +/// All concurrent requests should see the same seeded metric values. 
+#[tokio::test] +async fn test_dashboard_shared_state_consistency() { + let metrics_exporter = Arc::new(MetricsExporter::new()); + metrics_exporter.update_metrics(77.0, 4096, 500).await; + + let state = Arc::new(DashboardState { + metrics_exporter, + error_manager: Arc::new(ErrorManager::new()), + alert_manager: Arc::new(AlertManager::new()), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let mut join_set = tokio::task::JoinSet::new(); + for _ in 0..10_usize { + let state_clone = state.clone(); + join_set.spawn(async move { + let app = Router::new() + .route("/api/dashboard", get(get_dashboard)) + .with_state(state_clone); + let resp = app + .oneshot( + Request::builder() + .uri("/api/dashboard") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + serde_json::from_slice::(&bytes).unwrap() + }); + } + + while let Some(Ok(json)) = join_set.join_next().await { + assert_eq!(json["metrics"]["cpu_usage"], 77.0); + assert_eq!(json["metrics"]["memory_usage"], 4096); + assert_eq!(json["metrics"]["uptime"], 500); + } +} + +/// Verify serialization round-trip of the dashboard response. +#[tokio::test] +async fn test_dashboard_serialization_roundtrip() { + use backend::api::handlers::dashboard::DashboardData; + use backend::services::sys_metrics::SystemMetrics; + + let data = DashboardData { + metrics: SystemMetrics::default(), + active_recovery_tasks: vec![], + active_alerts: vec![], + }; + + let json = serde_json::to_string(&data).unwrap(); + let back: DashboardData = serde_json::from_str(&json).unwrap(); + assert_eq!(back.active_recovery_tasks.len(), 0); + assert_eq!(back.active_alerts.len(), 0); +} diff --git a/backend/tests/load/framework.rs b/backend/tests/load/framework.rs new file mode 100644 index 0000000..d862ca0 --- /dev/null +++ b/backend/tests/load/framework.rs @@ -0,0 +1,585 @@ +//! 
Load testing framework — shared helpers, metrics, and assertion utilities. +//! +//! # Overview +//! +//! This module provides the core primitives used by every load-test module: +//! +//! - [`LoadConfig`] — controls concurrency, iteration count, and timeout. +//! - [`RequestOutcome`] — the result of a single request (status + latency). +//! - [`LoadResult`] — aggregated statistics over a completed load run. +//! - [`run_load`] — fires `config.concurrency` tasks, each making +//! `config.requests_per_task` requests, and collects [`LoadResult`]. +//! - [`assert_load_result`] — convenience assertion that fails the test when +//! the error rate or p99 latency exceeds the configured thresholds. +//! +//! # Example +//! +//! ```rust,ignore +//! use crate::load::framework::{LoadConfig, run_load, assert_load_result}; +//! +//! let cfg = LoadConfig::default(); +//! let result = run_load(cfg, || async { +//! // build and fire one request, return (StatusCode, Duration) +//! let app = build_app(); +//! let start = std::time::Instant::now(); +//! let resp = app.oneshot(req()).await.unwrap(); +//! (resp.status(), start.elapsed()) +//! }).await; +//! +//! assert_load_result(&result, 0.0, std::time::Duration::from_millis(500)); +//! ``` + +use std::time::{Duration, Instant}; + +use axum::http::StatusCode; +use tokio::task::JoinSet; + +// --------------------------------------------------------------------------- +// Configuration +// --------------------------------------------------------------------------- + +/// Parameters that control a single load-test run. +#[derive(Debug, Clone)] +pub struct LoadConfig { + /// Number of concurrent Tokio tasks. + pub concurrency: usize, + /// Number of sequential requests each task fires. + pub requests_per_task: usize, + /// Maximum wall-clock time allowed for the entire run. + /// The test will panic if this is exceeded. + pub timeout: Duration, +} + +impl LoadConfig { + /// Create a new configuration. 
+ pub fn new(concurrency: usize, requests_per_task: usize) -> Self { + Self { + concurrency, + requests_per_task, + timeout: Duration::from_secs(30), + } + } + + /// Override the timeout. + pub fn with_timeout(mut self, timeout: Duration) -> Self { + self.timeout = timeout; + self + } + + /// Total number of requests that will be fired. + pub fn total_requests(&self) -> usize { + self.concurrency * self.requests_per_task + } +} + +impl Default for LoadConfig { + /// Sensible defaults: 10 concurrent tasks × 5 requests each = 50 total. + fn default() -> Self { + Self::new(10, 5) + } +} + +// --------------------------------------------------------------------------- +// Per-request outcome +// --------------------------------------------------------------------------- + +/// The outcome of a single HTTP request. +#[derive(Debug, Clone)] +pub struct RequestOutcome { + /// HTTP status code returned by the handler. + pub status: StatusCode, + /// Wall-clock time from request start to response received. + pub latency: Duration, +} + +impl RequestOutcome { + /// Returns `true` if the status code is a 2xx success. + pub fn is_success(&self) -> bool { + self.status.is_success() + } +} + +// --------------------------------------------------------------------------- +// Aggregated result +// --------------------------------------------------------------------------- + +/// Aggregated statistics collected after a load run completes. +#[derive(Debug, Clone)] +pub struct LoadResult { + /// All individual request outcomes, in completion order. + pub outcomes: Vec, + /// Total wall-clock time for the entire run. + pub total_duration: Duration, +} + +impl LoadResult { + /// Total number of requests fired. + pub fn total(&self) -> usize { + self.outcomes.len() + } + + /// Number of successful (2xx) requests. + pub fn successes(&self) -> usize { + self.outcomes.iter().filter(|o| o.is_success()).count() + } + + /// Number of failed (non-2xx) requests. 
+ pub fn failures(&self) -> usize { + self.total() - self.successes() + } + + /// Error rate as a fraction in `[0.0, 1.0]`. + pub fn error_rate(&self) -> f64 { + if self.total() == 0 { + return 0.0; + } + self.failures() as f64 / self.total() as f64 + } + + /// Throughput in requests per second. + pub fn rps(&self) -> f64 { + if self.total_duration.is_zero() { + return 0.0; + } + self.total() as f64 / self.total_duration.as_secs_f64() + } + + /// Minimum observed latency. + pub fn min_latency(&self) -> Duration { + self.outcomes + .iter() + .map(|o| o.latency) + .min() + .unwrap_or(Duration::ZERO) + } + + /// Maximum observed latency. + pub fn max_latency(&self) -> Duration { + self.outcomes + .iter() + .map(|o| o.latency) + .max() + .unwrap_or(Duration::ZERO) + } + + /// Mean (average) latency. + pub fn mean_latency(&self) -> Duration { + if self.outcomes.is_empty() { + return Duration::ZERO; + } + let total_nanos: u128 = self.outcomes.iter().map(|o| o.latency.as_nanos()).sum(); + Duration::from_nanos((total_nanos / self.outcomes.len() as u128) as u64) + } + + /// Percentile latency. `p` must be in `(0.0, 100.0]`. + /// + /// Uses the nearest-rank method. + pub fn percentile_latency(&self, p: f64) -> Duration { + assert!(p > 0.0 && p <= 100.0, "percentile must be in (0, 100]"); + if self.outcomes.is_empty() { + return Duration::ZERO; + } + let mut latencies: Vec = self.outcomes.iter().map(|o| o.latency).collect(); + latencies.sort_unstable(); + let idx = ((p / 100.0) * latencies.len() as f64).ceil() as usize; + latencies[idx.saturating_sub(1).min(latencies.len() - 1)] + } + + /// p50 (median) latency. + pub fn p50(&self) -> Duration { + self.percentile_latency(50.0) + } + + /// p95 latency. + pub fn p95(&self) -> Duration { + self.percentile_latency(95.0) + } + + /// p99 latency. + pub fn p99(&self) -> Duration { + self.percentile_latency(99.0) + } + + /// Print a human-readable summary to stdout. 
+ pub fn print_summary(&self, label: &str) { + println!( + "\n=== Load Test: {label} ===\n\ + Total requests : {total}\n\ + Successes : {ok}\n\ + Failures : {fail}\n\ + Error rate : {err:.2}%\n\ + Throughput : {rps:.1} req/s\n\ + Latency min : {min:?}\n\ + Latency mean : {mean:?}\n\ + Latency p50 : {p50:?}\n\ + Latency p95 : {p95:?}\n\ + Latency p99 : {p99:?}\n\ + Latency max : {max:?}\n\ + Total duration : {dur:?}\n", + label = label, + total = self.total(), + ok = self.successes(), + fail = self.failures(), + err = self.error_rate() * 100.0, + rps = self.rps(), + min = self.min_latency(), + mean = self.mean_latency(), + p50 = self.p50(), + p95 = self.p95(), + p99 = self.p99(), + max = self.max_latency(), + dur = self.total_duration, + ); + } +} + +// --------------------------------------------------------------------------- +// Runner +// --------------------------------------------------------------------------- + +/// Run a load test described by `config`. +/// +/// `request_fn` is called once per request. It must be `Clone` so that each +/// Tokio task gets its own copy. It returns `(StatusCode, Duration)`. +/// +/// # Panics +/// +/// Panics if the run exceeds `config.timeout`. 
+pub async fn run_load(config: LoadConfig, request_fn: F) -> LoadResult +where + F: Fn() -> Fut + Clone + Send + 'static, + Fut: std::future::Future + Send, +{ + let wall_start = Instant::now(); + let mut join_set: JoinSet> = JoinSet::new(); + + for _ in 0..config.concurrency { + let fn_clone = request_fn.clone(); + let n = config.requests_per_task; + join_set.spawn(async move { + let mut outcomes = Vec::with_capacity(n); + for _ in 0..n { + let (status, latency) = fn_clone().await; + outcomes.push(RequestOutcome { status, latency }); + } + outcomes + }); + } + + // Collect with timeout guard + let mut all_outcomes: Vec = Vec::with_capacity(config.total_requests()); + let deadline = tokio::time::Instant::now() + config.timeout; + + loop { + match tokio::time::timeout_at(deadline, join_set.join_next()).await { + Ok(Some(Ok(outcomes))) => all_outcomes.extend(outcomes), + Ok(Some(Err(e))) => panic!("Load test task panicked: {e}"), + Ok(None) => break, // all tasks done + Err(_) => panic!( + "Load test timed out after {:?} ({} requests completed of {})", + config.timeout, + all_outcomes.len(), + config.total_requests() + ), + } + } + + LoadResult { + outcomes: all_outcomes, + total_duration: wall_start.elapsed(), + } +} + +// --------------------------------------------------------------------------- +// Assertion helper +// --------------------------------------------------------------------------- + +/// Assert that a [`LoadResult`] meets the given SLO targets. +/// +/// # Arguments +/// - `result` — the completed load run. +/// - `max_error_rate` — maximum acceptable error rate as a fraction (e.g. `0.01` = 1 %). +/// - `max_p99` — maximum acceptable p99 latency. +/// +/// # Panics +/// +/// Panics with a descriptive message if either threshold is exceeded. 
+pub fn assert_load_result(result: &LoadResult, max_error_rate: f64, max_p99: Duration) { + let error_rate = result.error_rate(); + let p99 = result.p99(); + + if error_rate > max_error_rate { + panic!( + "Load test failed: error rate {:.2}% exceeds maximum {:.2}%\n\ + (failures={}, total={})", + error_rate * 100.0, + max_error_rate * 100.0, + result.failures(), + result.total(), + ); + } + + if p99 > max_p99 { + panic!( + "Load test failed: p99 latency {:?} exceeds maximum {:?}", + p99, max_p99, + ); + } +} + +// --------------------------------------------------------------------------- +// Unit tests for the framework itself +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + // --- LoadConfig --- + + #[test] + fn test_load_config_total_requests() { + let cfg = LoadConfig::new(4, 10); + assert_eq!(cfg.total_requests(), 40); + } + + #[test] + fn test_load_config_default_total() { + let cfg = LoadConfig::default(); + assert_eq!(cfg.total_requests(), 50); + } + + #[test] + fn test_load_config_with_timeout() { + let cfg = LoadConfig::default().with_timeout(Duration::from_secs(60)); + assert_eq!(cfg.timeout, Duration::from_secs(60)); + } + + // --- RequestOutcome --- + + #[test] + fn test_request_outcome_is_success_2xx() { + let o = RequestOutcome { + status: StatusCode::OK, + latency: Duration::from_millis(5), + }; + assert!(o.is_success()); + } + + #[test] + fn test_request_outcome_is_not_success_5xx() { + let o = RequestOutcome { + status: StatusCode::INTERNAL_SERVER_ERROR, + latency: Duration::from_millis(5), + }; + assert!(!o.is_success()); + } + + #[test] + fn test_request_outcome_is_not_success_4xx() { + let o = RequestOutcome { + status: StatusCode::NOT_FOUND, + latency: Duration::from_millis(5), + }; + assert!(!o.is_success()); + } + + // --- LoadResult statistics --- + + fn make_result(latencies_ms: &[u64], statuses: &[StatusCode]) -> LoadResult { + assert_eq!(latencies_ms.len(), 
statuses.len()); + let outcomes = latencies_ms + .iter() + .zip(statuses.iter()) + .map(|(&ms, &status)| RequestOutcome { + status, + latency: Duration::from_millis(ms), + }) + .collect(); + LoadResult { + outcomes, + total_duration: Duration::from_millis(100), + } + } + + #[test] + fn test_load_result_counts() { + let result = make_result( + &[10, 20, 30], + &[StatusCode::OK, StatusCode::OK, StatusCode::INTERNAL_SERVER_ERROR], + ); + assert_eq!(result.total(), 3); + assert_eq!(result.successes(), 2); + assert_eq!(result.failures(), 1); + } + + #[test] + fn test_load_result_error_rate() { + let result = make_result( + &[10, 20], + &[StatusCode::OK, StatusCode::INTERNAL_SERVER_ERROR], + ); + assert!((result.error_rate() - 0.5).abs() < f64::EPSILON); + } + + #[test] + fn test_load_result_zero_error_rate() { + let result = make_result(&[10, 20, 30], &[StatusCode::OK; 3]); + assert_eq!(result.error_rate(), 0.0); + } + + #[test] + fn test_load_result_empty_error_rate() { + let result = LoadResult { + outcomes: vec![], + total_duration: Duration::ZERO, + }; + assert_eq!(result.error_rate(), 0.0); + } + + #[test] + fn test_load_result_min_max_latency() { + let result = make_result(&[5, 50, 25], &[StatusCode::OK; 3]); + assert_eq!(result.min_latency(), Duration::from_millis(5)); + assert_eq!(result.max_latency(), Duration::from_millis(50)); + } + + #[test] + fn test_load_result_mean_latency() { + let result = make_result(&[10, 20, 30], &[StatusCode::OK; 3]); + assert_eq!(result.mean_latency(), Duration::from_millis(20)); + } + + #[test] + fn test_load_result_p50() { + // sorted: [10, 20, 30, 40, 50] → p50 = 30 + let result = make_result(&[50, 10, 30, 20, 40], &[StatusCode::OK; 5]); + assert_eq!(result.p50(), Duration::from_millis(30)); + } + + #[test] + fn test_load_result_p99_single_element() { + let result = make_result(&[42], &[StatusCode::OK]); + assert_eq!(result.p99(), Duration::from_millis(42)); + } + + #[test] + fn test_load_result_p95_100_elements() { + // 100 
elements: 1ms..=100ms; p95 should be 95ms + let latencies: Vec = (1..=100).collect(); + let statuses = vec![StatusCode::OK; 100]; + let result = make_result(&latencies, &statuses); + assert_eq!(result.p95(), Duration::from_millis(95)); + } + + #[test] + fn test_load_result_rps() { + let result = LoadResult { + outcomes: vec![ + RequestOutcome { status: StatusCode::OK, latency: Duration::from_millis(1) }; + 100 + ], + total_duration: Duration::from_secs(1), + }; + assert!((result.rps() - 100.0).abs() < 0.01); + } + + #[test] + fn test_load_result_rps_zero_duration() { + let result = LoadResult { + outcomes: vec![], + total_duration: Duration::ZERO, + }; + assert_eq!(result.rps(), 0.0); + } + + // --- assert_load_result --- + + #[test] + fn test_assert_load_result_passes() { + let result = make_result(&[10, 20, 30], &[StatusCode::OK; 3]); + // Should not panic + assert_load_result(&result, 0.0, Duration::from_millis(100)); + } + + #[test] + #[should_panic(expected = "error rate")] + fn test_assert_load_result_fails_on_error_rate() { + let result = make_result( + &[10, 20], + &[StatusCode::OK, StatusCode::INTERNAL_SERVER_ERROR], + ); + assert_load_result(&result, 0.0, Duration::from_secs(1)); + } + + #[test] + #[should_panic(expected = "p99 latency")] + fn test_assert_load_result_fails_on_p99() { + let result = make_result(&[500], &[StatusCode::OK]); + assert_load_result(&result, 0.0, Duration::from_millis(100)); + } + + // --- run_load --- + + #[tokio::test] + async fn test_run_load_collects_all_outcomes() { + let cfg = LoadConfig::new(4, 5); // 20 total + let result = run_load(cfg, || async { + (StatusCode::OK, Duration::from_millis(1)) + }) + .await; + + assert_eq!(result.total(), 20); + assert_eq!(result.failures(), 0); + } + + #[tokio::test] + async fn test_run_load_records_failures() { + let cfg = LoadConfig::new(1, 2); + let counter = std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let counter_clone = counter.clone(); + + let result = 
run_load(cfg, move || { + let c = counter_clone.clone(); + async move { + let n = c.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let status = if n % 2 == 0 { + StatusCode::OK + } else { + StatusCode::INTERNAL_SERVER_ERROR + }; + (status, Duration::from_millis(1)) + } + }) + .await; + + assert_eq!(result.total(), 2); + assert_eq!(result.failures(), 1); + } + + #[tokio::test] + async fn test_run_load_respects_concurrency() { + // Each task records its start time; with concurrency=5 they should + // all start within a short window (not sequentially). + let cfg = LoadConfig::new(5, 1); + let start = Instant::now(); + let result = run_load(cfg, move || async move { + tokio::time::sleep(Duration::from_millis(10)).await; + (StatusCode::OK, start.elapsed()) + }) + .await; + + // All 5 tasks ran concurrently so total wall time should be << 50ms + assert!(result.total_duration < Duration::from_millis(200)); + assert_eq!(result.total(), 5); + } + + #[tokio::test] + async fn test_run_load_default_config() { + let result = run_load(LoadConfig::default(), || async { + (StatusCode::OK, Duration::from_millis(1)) + }) + .await; + assert_eq!(result.total(), 50); + } +} diff --git a/backend/tests/load/mod.rs b/backend/tests/load/mod.rs index 223744f..5f007b5 100644 --- a/backend/tests/load/mod.rs +++ b/backend/tests/load/mod.rs @@ -1,12 +1,39 @@ //! Load and stress tests for the backend API. //! //! These tests exercise the API under concurrent load to verify that the -//! server remains stable and responsive. They are gated behind the -//! `load_tests` feature flag so they don't run in normal CI: +//! server remains stable and responsive. They are designed to run without +//! external services (PostgreSQL, Redis) by using in-process Axum routers +//! with mock state. +//! +//! # Running //! //! ```bash +//! # All load tests //! cargo test -p backend --test load_tests -- --nocapture +//! +//! # A specific module +//! 
cargo test -p backend --test load_tests load::status_load -- --nocapture +//! cargo test -p backend --test load_tests load::profile_load -- --nocapture +//! cargo test -p backend --test load_tests load::dashboard_load -- --nocapture +//! cargo test -p backend --test load_tests load::stellar_load -- --nocapture +//! cargo test -p backend --test load_tests load::framework -- --nocapture //! ``` +//! +//! # Architecture +//! +//! Each sub-module builds an in-process Axum [`Router`] with a lightweight +//! mock [`AppState`] (no real DB or Redis connections). Requests are fired +//! via [`tower::ServiceExt::oneshot`], which bypasses the network entirely +//! and exercises only the handler + middleware stack. +//! +//! The [`framework`] module provides shared helpers: +//! - [`LoadConfig`] — concurrency / iteration parameters +//! - [`LoadResult`] — aggregated latency statistics +//! - [`run_load`] — generic concurrent request runner +//! - [`assert_load_result`] — assertion helper for p99 / error-rate targets -pub mod status_load; +pub mod dashboard_load; +pub mod framework; pub mod profile_load; +pub mod status_load; +pub mod stellar_load; diff --git a/backend/tests/load/profile_load.rs b/backend/tests/load/profile_load.rs index ebcb132..88e1c03 100644 --- a/backend/tests/load/profile_load.rs +++ b/backend/tests/load/profile_load.rs @@ -1,29 +1,64 @@ //! Concurrent load tests for the `POST /api/profile` endpoint. +//! +//! These tests verify that the profiling trigger handler remains stable and +//! correct under concurrent load without requiring a live database or Redis. +//! +//! # Running +//! +//! ```bash +//! cargo test -p backend --test load_tests load::profile_load -- --nocapture +//! 
``` -use axum::{routing::post, Router}; -use hyper::{Request, StatusCode}; use std::sync::Arc; +use std::time::Instant; + +use axum::{body::to_bytes, routing::post, Router}; +use axum::http::StatusCode; +use hyper::Request; use tower::ServiceExt; use backend::api::handlers::profiling::{trigger_profile_collection, AppState}; -use backend::config::{reload::ConfigManager, AppConfig}; +use backend::config::{AppConfig, reload::ConfigManager}; use backend::services::{error_recovery::ErrorManager, sys_metrics::MetricsExporter}; +use crate::load::framework::{assert_load_result, LoadConfig, LoadResult}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Build a test router wired to the `POST /api/profile` handler. fn build_app() -> Router { + let (log_aggregator, _rx) = backend::services::log_aggregator::LogAggregator::new(); let state = Arc::new(AppState { db: None, metrics_exporter: Arc::new(MetricsExporter::new()), error_manager: Arc::new(ErrorManager::new()), config_manager: Arc::new(ConfigManager::new(AppConfig::default())), + log_aggregator: Arc::new(log_aggregator), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), }); Router::new() .route("/api/profile", post(trigger_profile_collection)) .with_state(state) } +/// Build a valid profile trigger request body. +fn profile_request_body(label: &str) -> axum::body::Body { + axum::body::Body::from( + serde_json::json!({ + "duration_secs": 10, + "sample_rate_hz": 100, + "label": label + }) + .to_string(), + ) +} + +/// Fire `n` concurrent requests and assert all return 200. 
async fn run_concurrent(n: usize) { let handles: Vec<_> = (0..n) - .map(|_| { + .map(|i| { let app = build_app(); tokio::spawn(async move { let resp = app @@ -32,14 +67,7 @@ async fn run_concurrent(n: usize) { .method("POST") .uri("/api/profile") .header("content-type", "application/json") - .body(axum::body::Body::from( - serde_json::json!({ - "duration_secs": 10, - "sample_rate_hz": 100, - "label": "load-test" - }) - .to_string(), - )) + .body(profile_request_body(&format!("load-test-{i}"))) .unwrap(), ) .await @@ -55,6 +83,34 @@ async fn run_concurrent(n: usize) { } } +/// Run a full load test using the framework and return the [`LoadResult`]. +async fn run_framework_load(concurrency: usize, requests_per_task: usize) -> LoadResult { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(concurrency, requests_per_task); + run_load(cfg, || async { + let app = build_app(); + let start = Instant::now(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(profile_request_body("load-test")) + .unwrap(), + ) + .await + .unwrap(); + (resp.status(), start.elapsed()) + }) + .await +} + +// --------------------------------------------------------------------------- +// Basic concurrency tests +// --------------------------------------------------------------------------- + #[tokio::test] async fn test_profile_10_concurrent() { run_concurrent(10).await; @@ -65,14 +121,53 @@ async fn test_profile_50_concurrent() { run_concurrent(50).await; } +// --------------------------------------------------------------------------- +// Response shape +// --------------------------------------------------------------------------- + +/// Verify response body shape. 
+#[tokio::test] +async fn test_profile_response_shape() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(profile_request_body("shape-test")) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(resp.status(), StatusCode::OK); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + + assert!(json.get("data").is_some(), "response must have 'data' key"); + assert!( + json["data"].get("message").is_some(), + "data must have 'message' key" + ); + assert!( + json["data"].get("profile_id").is_some(), + "data must have 'profile_id' key" + ); + assert!( + json["data"].get("estimated_completion").is_some(), + "data must have 'estimated_completion' key" + ); +} + /// Verify each response contains a unique profile_id. #[tokio::test] async fn test_profile_unique_ids() { - use axum::body::to_bytes; use std::collections::HashSet; let mut ids = HashSet::new(); - for _ in 0..10 { + for i in 0..10 { let app = build_app(); let resp = app .oneshot( @@ -80,37 +175,57 @@ async fn test_profile_unique_ids() { .method("POST") .uri("/api/profile") .header("content-type", "application/json") - .body(axum::body::Body::from( - serde_json::json!({ - "duration_secs": 10, - "sample_rate_hz": 100, - "label": "load-test-id" - }) - .to_string(), - )) + .body(profile_request_body(&format!("unique-id-test-{i}"))) .unwrap(), ) .await .unwrap(); let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); - let json: serde_json::Value = serde_json::from_slice(&bytes).expect("Valid JSON"); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); let id = json["data"]["profile_id"] .as_str() - .expect("profile_id in data") + .expect("profile_id must be a string") .to_string(); ids.insert(id); } - // All 10 profile IDs should be unique - assert_eq!(ids.len(), 10); + 
assert_eq!(ids.len(), 10, "all 10 profile IDs must be unique"); } -/// Verify response body shape. +/// Verify the `message` field contains the label from the request. #[tokio::test] -async fn test_profile_response_shape() { - use axum::body::to_bytes; +async fn test_profile_message_contains_label() { + let app = build_app(); + let label = "my-custom-label"; + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(profile_request_body(label)) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + let message = json["data"]["message"].as_str().unwrap(); + assert!( + message.contains(label), + "message '{message}' must contain label '{label}'" + ); +} +// --------------------------------------------------------------------------- +// Validation tests +// --------------------------------------------------------------------------- + +/// Verify that an empty `label` (present but `""`) is rejected with 400 / 422. +#[tokio::test] +async fn test_profile_missing_label_rejected() { let app = build_app(); let resp = app .oneshot( @@ -122,7 +237,7 @@ async fn test_profile_response_shape() { serde_json::json!({ "duration_secs": 10, "sample_rate_hz": 100, - "label": "load-test-shape" + "label": "" }) .to_string(), )) @@ -131,12 +246,198 @@ async fn test_profile_response_shape() { .await .unwrap(); - assert_eq!(resp.status(), StatusCode::OK); + // Empty label should fail validation → 400 or 422 + assert!( + resp.status() == StatusCode::BAD_REQUEST + || resp.status() == StatusCode::UNPROCESSABLE_ENTITY, + "expected 400 or 422, got {}", + resp.status() + ); +} - let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); - let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); +/// Verify that `duration_secs = 0` is rejected.
+#[tokio::test]
+async fn test_profile_zero_duration_rejected() {
+    let router = build_app();
+    let response = router
+        .oneshot(
+            Request::builder()
+                .method("POST")
+                .uri("/api/profile")
+                .header("content-type", "application/json")
+                .body(axum::body::Body::from(
+                    serde_json::json!({
+                        "duration_secs": 0,
+                        "sample_rate_hz": 100,
+                        "label": "test"
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+
+    let status = response.status();
+    assert!(
+        matches!(status, StatusCode::BAD_REQUEST | StatusCode::UNPROCESSABLE_ENTITY),
+        "expected 400 or 422, got {}",
+        status
+    );
+}
+
+/// A `duration_secs` of 9999 must be refused with 400 or 422 (NOTE(review): the 3600 s ceiling lives in the handler — confirm).
+#[tokio::test]
+async fn test_profile_excessive_duration_rejected() {
+    let router = build_app();
+    let response = router
+        .oneshot(
+            Request::builder()
+                .method("POST")
+                .uri("/api/profile")
+                .header("content-type", "application/json")
+                .body(axum::body::Body::from(
+                    serde_json::json!({
+                        "duration_secs": 9999,
+                        "sample_rate_hz": 100,
+                        "label": "test"
+                    })
+                    .to_string(),
+                ))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+
+    let status = response.status();
+    assert!(
+        matches!(status, StatusCode::BAD_REQUEST | StatusCode::UNPROCESSABLE_ENTITY),
+        "expected 400 or 422, got {}",
+        status
+    );
+}
+
+/// A `text/plain` body that is not JSON must be refused with some 4xx status.
+#[tokio::test]
+async fn test_profile_non_json_body_rejected() {
+    let router = build_app();
+    let response = router
+        .oneshot(
+            Request::builder()
+                .method("POST")
+                .uri("/api/profile")
+                .header("content-type", "text/plain")
+                .body(axum::body::Body::from("not json"))
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+
+    assert!(
+        response.status().is_client_error(),
+        "expected 4xx, got {}",
+        response.status()
+    );
+}
+
+// ---------------------------------------------------------------------------
+// Framework-based load tests with SLO assertions
+// ---------------------------------------------------------------------------
+
+/// 10 concurrent tasks × 10 requests each = 100 total.
+/// SLO: 0% errors, p99 < 500ms.
+#[tokio::test]
+async fn test_profile_load_100_requests_slo() {
+    let result = run_framework_load(10, 10).await;
+    result.print_summary("POST /api/profile — 100 requests");
+    assert_load_result(&result, 0.0, std::time::Duration::from_millis(500));
+}
+
+/// 20 concurrent tasks × 10 requests each = 200 total.
+/// SLO: 0% errors, p99 < 1s.
+#[tokio::test]
+async fn test_profile_load_200_requests_slo() {
+    let result = run_framework_load(20, 10).await;
+    result.print_summary("POST /api/profile — 200 requests");
+    assert_load_result(&result, 0.0, std::time::Duration::from_secs(1));
+}
+
+/// Verify that all responses under load have the correct JSON shape (5 tasks × 4 requests; a panicking task fails the test).
+#[tokio::test]
+async fn test_profile_load_response_shape_under_load() {
+    let mut join_set = tokio::task::JoinSet::new();
+    for i in 0..5_usize {
+        join_set.spawn(async move {
+            let mut results = Vec::new();
+            for j in 0..4_usize {
+                let app = build_app();
+                let resp = app
+                    .oneshot(
+                        Request::builder()
+                            .method("POST")
+                            .uri("/api/profile")
+                            .header("content-type", "application/json")
+                            .body(profile_request_body(&format!("task-{i}-req-{j}")))
+                            .unwrap(),
+                    )
+                    .await
+                    .unwrap();
+                let status = resp.status();
+                let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap();
+                results.push((status, bytes.to_vec()));
+            }
+            results
+        });
+    }
+
+    while let Some(joined) = join_set.join_next().await {
+        for (status, body) in joined.expect("load task panicked") { // do not swallow JoinError: a skipped batch would pass vacuously
+            assert_eq!(status, StatusCode::OK);
+            let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+            assert_eq!(json["status"], "success");
+            assert!(json["data"].get("profile_id").is_some());
+            assert!(json["data"].get("message").is_some());
+            assert!(json["data"].get("estimated_completion").is_some());
+        }
+    }
+}
+
+/// Verify that concurrent requests each produce a unique profile_id.
+#[tokio::test] +async fn test_profile_concurrent_unique_ids() { + use std::collections::HashSet; + use std::sync::Mutex; + + let ids = Arc::new(Mutex::new(HashSet::new())); + let mut join_set = tokio::task::JoinSet::new(); + + for i in 0..20_usize { + let ids_clone = ids.clone(); + join_set.spawn(async move { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/profile") + .header("content-type", "application/json") + .body(profile_request_body(&format!("concurrent-{i}"))) + .unwrap(), + ) + .await + .unwrap(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + let id = json["data"]["profile_id"] + .as_str() + .unwrap() + .to_string(); + ids_clone.lock().unwrap().insert(id); + }); + } + + while join_set.join_next().await.is_some() {} - assert!(json.get("data").is_some()); - assert!(json["data"].get("message").is_some()); - assert!(json["data"].get("profile_id").is_some()); + let collected = ids.lock().unwrap(); + assert_eq!(collected.len(), 20, "all 20 concurrent profile IDs must be unique"); } diff --git a/backend/tests/load/status_load.rs b/backend/tests/load/status_load.rs index 7508b01..abbb09b 100644 --- a/backend/tests/load/status_load.rs +++ b/backend/tests/load/status_load.rs @@ -1,21 +1,42 @@ //! Concurrent load tests for the `GET /api/status` endpoint. +//! +//! These tests verify that the status handler remains stable and correct +//! under concurrent load without requiring a live database or Redis instance. +//! +//! # Running +//! +//! ```bash +//! cargo test -p backend --test load_tests load::status_load -- --nocapture +//! 
``` -use axum::{routing::get, Router}; -use hyper::{Request, StatusCode}; use std::sync::Arc; +use std::time::Instant; + +use axum::{body::to_bytes, routing::get, Router}; +use axum::http::StatusCode; +use hyper::Request; use tower::ServiceExt; use backend::api::handlers::profiling::{get_system_status, AppState}; -use backend::config::{reload::ConfigManager, AppConfig}; +use backend::config::{AppConfig, reload::ConfigManager}; use backend::services::{error_recovery::ErrorManager, sys_metrics::MetricsExporter}; -/// Build a test router with the status endpoint. +use crate::load::framework::{assert_load_result, LoadConfig, LoadResult}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Build a test router wired to the `/api/status` handler with mock state. fn build_app() -> Router { + let (log_aggregator, _rx) = backend::services::log_aggregator::LogAggregator::new(); let state = Arc::new(AppState { db: None, metrics_exporter: Arc::new(MetricsExporter::new()), error_manager: Arc::new(ErrorManager::new()), config_manager: Arc::new(ConfigManager::new(AppConfig::default())), + log_aggregator: Arc::new(log_aggregator), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), }); Router::new() .route("/api/status", get(get_system_status)) @@ -48,6 +69,32 @@ async fn run_concurrent(n: usize) { } } +/// Run a full load test using the framework and return the [`LoadResult`]. 
+async fn run_framework_load(concurrency: usize, requests_per_task: usize) -> LoadResult { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(concurrency, requests_per_task); + run_load(cfg, || async { + let app = build_app(); + let start = Instant::now(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + (resp.status(), start.elapsed()) + }) + .await +} + +// --------------------------------------------------------------------------- +// Basic concurrency tests +// --------------------------------------------------------------------------- + #[tokio::test] async fn test_status_10_concurrent() { run_concurrent(10).await; @@ -63,6 +110,10 @@ async fn test_status_100_concurrent() { run_concurrent(100).await; } +// --------------------------------------------------------------------------- +// Sequential stability +// --------------------------------------------------------------------------- + /// Verify that repeated sequential requests all succeed. #[tokio::test] async fn test_status_sequential_stability() { @@ -82,11 +133,13 @@ async fn test_status_sequential_stability() { } } +// --------------------------------------------------------------------------- +// Response shape +// --------------------------------------------------------------------------- + /// Verify response body contains expected JSON keys. 
#[tokio::test] async fn test_status_response_shape() { - use axum::body::to_bytes; - let app = build_app(); let resp = app .oneshot( @@ -104,8 +157,252 @@ async fn test_status_response_shape() { let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); assert_eq!(json["status"], "success"); - assert!(json.get("data").is_some()); - assert!(json["data"].get("status").is_some()); - assert!(json["data"].get("uptime_secs").is_some()); - assert!(json["data"].get("active_recovery_tasks").is_some()); + assert!(json.get("data").is_some(), "response must have 'data' key"); + assert!( + json["data"].get("status").is_some(), + "data must have 'status' key" + ); + assert!( + json["data"].get("uptime_secs").is_some(), + "data must have 'uptime_secs' key" + ); + assert!( + json["data"].get("active_recovery_tasks").is_some(), + "data must have 'active_recovery_tasks' key" + ); +} + +/// Verify the `status` field value is `"healthy"`. +#[tokio::test] +async fn test_status_healthy_value() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(json["data"]["status"], "healthy"); +} + +/// Verify `active_recovery_tasks` starts at zero with a fresh state. +#[tokio::test] +async fn test_status_zero_recovery_tasks_initially() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(json["data"]["active_recovery_tasks"], 0); +} + +/// Verify `uptime_secs` is a non-negative integer. 
+#[tokio::test]
+async fn test_status_uptime_is_non_negative() {
+    let router = build_app();
+    let response = router
+        .oneshot(
+            Request::builder()
+                .uri("/api/status")
+                .body(axum::body::Body::empty())
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+
+    let raw = to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    let payload: serde_json::Value = serde_json::from_slice(&raw).unwrap();
+    let uptime_field = &payload["data"]["uptime_secs"];
+    assert!(uptime_field.is_u64(), "uptime_secs must be a non-negative integer");
+}
+
+// ---------------------------------------------------------------------------
+// Framework-based load tests with SLO assertions
+// ---------------------------------------------------------------------------
+
+/// 100 requests in total: 10 tasks running concurrently, 10 requests apiece.
+/// Targets: zero failed requests and a p99 latency no greater than 500 ms.
+#[tokio::test]
+async fn test_status_load_100_requests_slo() {
+    let report = run_framework_load(10, 10).await;
+    report.print_summary("GET /api/status — 100 requests");
+    assert_load_result(&report, 0.0, std::time::Duration::from_millis(500));
+}
+
+/// 200 requests in total: 20 tasks running concurrently, 10 requests apiece.
+/// Targets: zero failed requests and a p99 latency no greater than 1 s.
+#[tokio::test]
+async fn test_status_load_200_requests_slo() {
+    let report = run_framework_load(20, 10).await;
+    report.print_summary("GET /api/status — 200 requests");
+    assert_load_result(&report, 0.0, std::time::Duration::from_secs(1));
+}
+
+/// Verify that all responses under load have the correct JSON shape.
+#[tokio::test]
+async fn test_status_load_response_shape_under_load() {
+    const REQUESTS_PER_TASK: usize = 4; // shared by the config and the per-task loop so they cannot drift
+
+    let cfg = LoadConfig::new(5, REQUESTS_PER_TASK); // 20 total
+    let outcomes: Vec<(StatusCode, Vec<u8>)> = {
+        let mut join_set = tokio::task::JoinSet::new();
+        for _ in 0..cfg.concurrency {
+            join_set.spawn(async {
+                let mut results = Vec::new();
+                for _ in 0..REQUESTS_PER_TASK {
+                    let app = build_app();
+                    let resp = app
+                        .oneshot(
+                            Request::builder()
+                                .uri("/api/status")
+                                .body(axum::body::Body::empty())
+                                .unwrap(),
+                        )
+                        .await
+                        .unwrap();
+                    let status = resp.status();
+                    let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap();
+                    results.push((status, bytes.to_vec()));
+                }
+                results
+            });
+        }
+        let mut all = Vec::new();
+        while let Some(joined) = join_set.join_next().await {
+            all.extend(joined.expect("load task panicked")); // surface task panics instead of silently ending the loop
+        }
+        all
+    };
+
+    for (status, body) in outcomes {
+        assert_eq!(status, StatusCode::OK);
+        let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        assert_eq!(json["status"], "success");
+        assert!(json["data"].get("status").is_some());
+        assert!(json["data"].get("uptime_secs").is_some());
+        assert!(json["data"].get("active_recovery_tasks").is_some());
+    }
+}
+
+/// Verify that repeated calls on one router return the same top-level JSON keys.
+#[tokio::test]
+async fn test_status_idempotent_responses() {
+    let app = build_app();
+    let mut previous: Option<serde_json::Value> = None;
+
+    for _ in 0..5 {
+        let resp = app
+            .clone()
+            .oneshot(
+                Request::builder()
+                    .uri("/api/status")
+                    .body(axum::body::Body::empty())
+                    .unwrap(),
+            )
+            .await
+            .unwrap();
+
+        let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap();
+        let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
+
+        if let Some(ref prev) = previous {
+            // Keys must be identical; values may differ (e.g. uptime_secs)
+            assert_eq!(
+                prev.as_object().unwrap().keys().collect::<Vec<_>>(),
+                json.as_object().unwrap().keys().collect::<Vec<_>>(),
+                "response keys must be stable across calls"
+            );
+        }
+        previous = Some(json);
+    }
+}
+
+/// Verify that the handler correctly reflects recovery tasks added to state.
+#[tokio::test]
+async fn test_status_reflects_recovery_tasks() {
+    use backend::services::error_recovery::RecoveryError;
+
+    let error_manager = Arc::new(ErrorManager::new());
+    error_manager
+        .handle_error(RecoveryError::Internal("boom".into()), "worker_a")
+        .await
+        .unwrap();
+
+    let (log_aggregator, _rx) = backend::services::log_aggregator::LogAggregator::new();
+    let state = Arc::new(AppState {
+        db: None,
+        metrics_exporter: Arc::new(MetricsExporter::new()),
+        error_manager: error_manager.clone(),
+        config_manager: Arc::new(ConfigManager::new(AppConfig::default())),
+        log_aggregator: Arc::new(log_aggregator),
+        redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(),
+    });
+
+    let app = Router::new()
+        .route("/api/status", get(get_system_status))
+        .with_state(state);
+
+    let resp = app
+        .oneshot(
+            Request::builder()
+                .uri("/api/status")
+                .body(axum::body::Body::empty())
+                .unwrap(),
+        )
+        .await
+        .unwrap();
+
+    let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap();
+    let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
+    assert_eq!(json["data"]["active_recovery_tasks"], 1);
+}
+
+/// Verify that the handler correctly reflects updated metrics.
+#[tokio::test] +async fn test_status_reflects_updated_metrics() { + let metrics_exporter = Arc::new(MetricsExporter::new()); + metrics_exporter.update_metrics(55.0, 2048, 300).await; + + let (log_aggregator, _rx) = backend::services::log_aggregator::LogAggregator::new(); + let state = Arc::new(AppState { + db: None, + metrics_exporter: metrics_exporter.clone(), + error_manager: Arc::new(ErrorManager::new()), + config_manager: Arc::new(ConfigManager::new(AppConfig::default())), + log_aggregator: Arc::new(log_aggregator), + redis: redis::Client::open("redis://127.0.0.1:1/").unwrap(), + }); + + let app = Router::new() + .route("/api/status", get(get_system_status)) + .with_state(state); + + let resp = app + .oneshot( + Request::builder() + .uri("/api/status") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let json: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(json["data"]["uptime_secs"], 300); + assert_eq!(json["data"]["memory_used_bytes"], 2048); } diff --git a/backend/tests/load/stellar_load.rs b/backend/tests/load/stellar_load.rs new file mode 100644 index 0000000..1eed7e0 --- /dev/null +++ b/backend/tests/load/stellar_load.rs @@ -0,0 +1,399 @@ +//! Concurrent load tests for the `GET /.well-known/stellar.toml` endpoint. +//! +//! These tests verify that the Stellar SEP-1 handler remains stable and +//! correct under concurrent load. The handler is stateless so no mock +//! infrastructure is required. +//! +//! # Running +//! +//! ```bash +//! cargo test -p backend --test load_tests load::stellar_load -- --nocapture +//! 
``` + +use std::time::Instant; + +use axum::{body::to_bytes, routing::get, Router}; +use axum::http::StatusCode; +use hyper::Request; +use tower::ServiceExt; + +use backend::api::handlers::stellar::get_stellar_toml; + +use crate::load::framework::{assert_load_result, LoadConfig, LoadResult}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Build a test router wired to the Stellar TOML handler. +fn build_app() -> Router { + Router::new().route("/.well-known/stellar.toml", get(get_stellar_toml)) +} + +/// Run a full load test using the framework and return the [`LoadResult`]. +async fn run_framework_load(concurrency: usize, requests_per_task: usize) -> LoadResult { + use crate::load::framework::run_load; + + let cfg = LoadConfig::new(concurrency, requests_per_task); + run_load(cfg, || async { + let app = build_app(); + let start = Instant::now(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + (resp.status(), start.elapsed()) + }) + .await +} + +// --------------------------------------------------------------------------- +// Basic correctness +// --------------------------------------------------------------------------- + +/// Handler returns 200 OK. +#[tokio::test] +async fn test_stellar_toml_returns_200() { + let app = build_app(); + let resp = app + .oneshot( + Request::builder() + .uri("/.well-known/stellar.toml") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(resp.status(), StatusCode::OK); +} + +/// Response includes the required `Access-Control-Allow-Origin: *` header (SEP-1). 
+#[tokio::test]
+async fn test_stellar_toml_cors_header() {
+    let request = Request::builder()
+        .uri("/.well-known/stellar.toml")
+        .body(axum::body::Body::empty())
+        .unwrap();
+    let response = build_app().oneshot(request).await.unwrap();
+
+    let allow_origin = response
+        .headers()
+        .get("access-control-allow-origin")
+        .expect("Access-Control-Allow-Origin header must be present");
+    assert_eq!(allow_origin, "*");
+}
+
+/// Response `Content-Type` is `text/plain`.
+#[tokio::test]
+async fn test_stellar_toml_content_type() {
+    let request = Request::builder()
+        .uri("/.well-known/stellar.toml")
+        .body(axum::body::Body::empty())
+        .unwrap();
+    let response = build_app().oneshot(request).await.unwrap();
+
+    let content_type = response
+        .headers()
+        .get("content-type")
+        .expect("Content-Type header must be present");
+    // A substring match is used, so extra parameters in the header value are tolerated.
+    let is_plain_text = content_type.to_str().unwrap().contains("text/plain");
+    assert!(
+        is_plain_text,
+        "Content-Type must be text/plain, got: {:?}",
+        content_type
+    );
+}
+
+/// Response body contains the required TOML fields.
+#[tokio::test]
+async fn test_stellar_toml_body_content() {
+    let request = Request::builder()
+        .uri("/.well-known/stellar.toml")
+        .body(axum::body::Body::empty())
+        .unwrap();
+    let response = build_app().oneshot(request).await.unwrap();
+
+    let raw = to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    let text = std::str::from_utf8(&raw).unwrap();
+
+    // Checked in this order so the first missing key is the one reported.
+    for key in ["VERSION", "NETWORK_PASSPHRASE", "ACCOUNTS", "CURRENCIES"] {
+        assert!(text.contains(key), "body must contain {}", key);
+    }
+}
+
+/// Response body contains the USDC currency entry.
+#[tokio::test]
+async fn test_stellar_toml_contains_usdc() {
+    let request = Request::builder()
+        .uri("/.well-known/stellar.toml")
+        .body(axum::body::Body::empty())
+        .unwrap();
+    let response = build_app().oneshot(request).await.unwrap();
+
+    let raw = to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    let text = std::str::from_utf8(&raw).unwrap();
+
+    assert!(text.contains("USDC"), "body must contain USDC currency");
+}
+
+/// Response body is non-empty.
+#[tokio::test]
+async fn test_stellar_toml_non_empty_body() {
+    let request = Request::builder()
+        .uri("/.well-known/stellar.toml")
+        .body(axum::body::Body::empty())
+        .unwrap();
+    let response = build_app().oneshot(request).await.unwrap();
+
+    let raw = to_bytes(response.into_body(), usize::MAX).await.unwrap();
+    assert!(!raw.is_empty(), "response body must not be empty");
+}
+
+/// Response is identical across multiple calls (handler is pure / stateless).
+///
+/// Issues five sequential requests, each against a freshly built router, and
+/// compares every body byte-for-byte with the first one.
+#[tokio::test]
+async fn test_stellar_toml_deterministic() {
+    const CALLS: usize = 5;
+
+    // Bodies are copied into owned buffers so they outlive their responses.
+    let mut bodies: Vec<Vec<u8>> = Vec::with_capacity(CALLS);
+
+    for _ in 0..CALLS {
+        let request = Request::builder()
+            .uri("/.well-known/stellar.toml")
+            .body(axum::body::Body::empty())
+            .unwrap();
+        let response = build_app().oneshot(request).await.unwrap();
+        let raw = to_bytes(response.into_body(), usize::MAX).await.unwrap();
+        bodies.push(raw.to_vec());
+    }
+
+    let first = &bodies[0];
+    for body in &bodies[1..] {
+        assert_eq!(body, first, "all responses must be identical");
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Concurrency tests
+// ---------------------------------------------------------------------------
+
+/// 10 concurrent requests all return 200.
+#[tokio::test]
+async fn test_stellar_toml_10_concurrent() {
+    assert_concurrent_requests_ok(10).await;
+}
+
+/// Spawn `count` tasks, each sending one `GET /.well-known/stellar.toml` to its own
+/// router, then await the tasks in spawn order and assert every status is `200 OK`.
+///
+/// # Panics
+///
+/// Panics if a task cannot be joined or if any response status is not `200 OK`.
+async fn assert_concurrent_requests_ok(count: usize) {
+    let mut tasks = Vec::with_capacity(count);
+    for _ in 0..count {
+        // The router is built before spawning and moved into the task.
+        let router = build_app();
+        tasks.push(tokio::spawn(async move {
+            let request = Request::builder()
+                .uri("/.well-known/stellar.toml")
+                .body(axum::body::Body::empty())
+                .unwrap();
+            router.oneshot(request).await.unwrap().status()
+        }));
+    }
+
+    for task in tasks {
+        assert_eq!(task.await.unwrap(), StatusCode::OK);
+    }
+}
+
+/// 50 concurrent requests all return 200.
+#[tokio::test]
+async fn test_stellar_toml_50_concurrent() {
+    assert_concurrent_requests_ok(50).await;
+}
+
+/// 100 concurrent requests all return 200.
+#[tokio::test]
+async fn test_stellar_toml_100_concurrent() {
+    assert_concurrent_requests_ok(100).await;
+}
+
+/// Verify that all concurrent responses have identical bodies.
+#[tokio::test]
+async fn test_stellar_toml_concurrent_identical_bodies() {
+    const TASKS: usize = 20;
+
+    let mut join_set = tokio::task::JoinSet::new();
+    for _ in 0..TASKS {
+        join_set.spawn(async {
+            let request = Request::builder()
+                .uri("/.well-known/stellar.toml")
+                .body(axum::body::Body::empty())
+                .unwrap();
+            let response = build_app().oneshot(request).await.unwrap();
+            to_bytes(response.into_body(), usize::MAX)
+                .await
+                .unwrap()
+                .to_vec()
+        });
+    }
+
+    let mut bodies: Vec<Vec<u8>> = Vec::with_capacity(TASKS);
+    while let Some(joined) = join_set.join_next().await {
+        // Unwrap the join result so a panicked task is reported directly instead of
+        // silently ending the loop and surfacing later as a length mismatch.
+        bodies.push(joined.expect("request task panicked or was cancelled"));
+    }
+
+    assert_eq!(bodies.len(), TASKS);
+    let first = &bodies[0];
+    for body in &bodies[1..] {
+        assert_eq!(body, first, "all concurrent responses must be identical");
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Framework-based load tests with SLO assertions
+// ---------------------------------------------------------------------------
+
+/// 10 concurrent tasks × 10 requests each = 100 total.
+/// SLO: 0% errors, p99 < 200ms (stateless handler should be very fast).
+#[tokio::test]
+async fn test_stellar_load_100_requests_slo() {
+    let latency_budget = std::time::Duration::from_millis(200);
+    let result = run_framework_load(10, 10).await;
+    result.print_summary("GET /.well-known/stellar.toml — 100 requests");
+    assert_load_result(&result, 0.0, latency_budget);
+}
+
+/// 20 concurrent tasks × 10 requests each = 200 total.
+/// SLO: 0% errors, p99 < 500ms.
+#[tokio::test]
+async fn test_stellar_load_200_requests_slo() {
+    let latency_budget = std::time::Duration::from_millis(500);
+    let result = run_framework_load(20, 10).await;
+    result.print_summary("GET /.well-known/stellar.toml — 200 requests");
+    assert_load_result(&result, 0.0, latency_budget);
+}
+
+/// 50 concurrent tasks × 10 requests each = 500 total.
+/// SLO: 0% errors, p99 < 1s.
+#[tokio::test]
+async fn test_stellar_load_500_requests_slo() {
+    let latency_budget = std::time::Duration::from_secs(1);
+    let result = run_framework_load(50, 10).await;
+    result.print_summary("GET /.well-known/stellar.toml — 500 requests");
+    assert_load_result(&result, 0.0, latency_budget);
+}
+
+/// Verify that all responses under load have the correct headers.
+///
+/// Ten tasks each send five sequential requests and record the status plus the
+/// `access-control-allow-origin` value; every recorded pair is then asserted.
+///
+/// # Panics
+///
+/// Panics if any task fails to join, any response is not `200 OK` with a `*`
+/// CORS header, or fewer responses than expected were verified.
+#[tokio::test]
+async fn test_stellar_load_headers_under_load() {
+    const TASKS: usize = 10;
+    const REQUESTS_PER_TASK: usize = 5;
+
+    let mut join_set = tokio::task::JoinSet::new();
+    for _ in 0..TASKS {
+        join_set.spawn(async {
+            let mut observed = Vec::with_capacity(REQUESTS_PER_TASK);
+            for _ in 0..REQUESTS_PER_TASK {
+                let request = Request::builder()
+                    .uri("/.well-known/stellar.toml")
+                    .body(axum::body::Body::empty())
+                    .unwrap();
+                let response = build_app().oneshot(request).await.unwrap();
+                let cors = response
+                    .headers()
+                    .get("access-control-allow-origin")
+                    .map(|v| v.to_str().unwrap().to_string());
+                observed.push((response.status(), cors));
+            }
+            observed
+        });
+    }
+
+    let mut verified = 0_usize;
+    while let Some(joined) = join_set.join_next().await {
+        // Matching only `Some(Ok(_))` would leave the loop at the first failed task and
+        // let the test pass with responses unchecked, so a join error must fail the test.
+        let batch = joined.expect("load task panicked or was cancelled");
+        for (status, cors) in batch {
+            assert_eq!(status, StatusCode::OK);
+            assert_eq!(
+                cors.as_deref(),
+                Some("*"),
+                "CORS header must be '*' under load"
+            );
+            verified += 1;
+        }
+    }
+
+    assert_eq!(
+        verified,
+        TASKS * REQUESTS_PER_TASK,
+        "every spawned request must be verified"
+    );
+}
diff --git a/backend/tests/load_tests.rs b/backend/tests/load_tests.rs
index b24467d..8b86fd7 100644
--- a/backend/tests/load_tests.rs
+++ b/backend/tests/load_tests.rs
@@ -1,11 +1,29 @@
 //! Load and stress test suite entry point.
 //!
-//! Run with:
+//! This file is the integration test binary for all load tests. Each sub-module
+//! exercises a specific API endpoint under concurrent load using the shared
+//! [`load::framework`] helpers.
+//!
+//! # Running
+//!
 //! ```bash
+//! # All load tests (with output)
 //! cargo test -p backend --test load_tests -- --nocapture
+//!
+//! # A specific endpoint
+//! cargo test -p backend --test load_tests load::status_load -- --nocapture
+//! cargo test -p backend --test load_tests load::profile_load -- --nocapture
+//! 
cargo test -p backend --test load_tests load::dashboard_load -- --nocapture
+//! cargo test -p backend --test load_tests load::stellar_load -- --nocapture
+//!
+//! # Framework unit tests only
+//! cargo test -p backend --test load_tests load::framework -- --nocapture
 //! ```
 
 mod load {
+    pub mod dashboard_load;
+    pub mod framework;
     pub mod profile_load;
     pub mod status_load;
+    pub mod stellar_load;
 }