From 9d784f2f4884176130f5bf2594a7d21ee218a760 Mon Sep 17 00:00:00 2001 From: codex-kramer Date: Wed, 17 Jun 2026 19:24:16 -0700 Subject: [PATCH] Treat deploy cancellations and 4xx health noise correctly --- crates/temps-core/src/workflow_executor.rs | 42 ++++++++++++++++--- .../services/workflow_execution_service.rs | 5 ++- .../src/service/proxy_log_service.rs | 4 +- crates/temps-proxy/src/storage/clickhouse.rs | 2 +- 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/crates/temps-core/src/workflow_executor.rs b/crates/temps-core/src/workflow_executor.rs index e7a21d145..18ffbbb8e 100644 --- a/crates/temps-core/src/workflow_executor.rs +++ b/crates/temps-core/src/workflow_executor.rs @@ -81,6 +81,13 @@ impl WorkflowExecutor { Self { job_tracker } } + fn is_cancellation_error(error: &WorkflowError) -> bool { + matches!( + error, + WorkflowError::WorkflowCancelled | WorkflowError::BuildCancelled + ) + } + /// Execute a workflow with proper dependency resolution and parallel execution pub async fn execute_workflow( &self, @@ -554,8 +561,17 @@ impl WorkflowExecutor { (job_id_clone, job_result) } Err(e) => { - let error_msg = format!("โŒ Job failed: {}", e); - error!("โŒ Job '{}' failed: {}", job_id_clone, e); + let is_cancellation = Self::is_cancellation_error(&e); + let error_msg = if is_cancellation { + format!("Job cancelled: {}", e) + } else { + format!("Job failed: {}", e) + }; + if is_cancellation { + warn!("Job '{}' cancelled: {}", job_id_clone, e); + } else { + error!("Job '{}' failed: {}", job_id_clone, e); + } // Log the error to the job's context so it appears in the job logs if let Err(log_err) = error_context.log(&error_msg).await { @@ -565,15 +581,19 @@ impl WorkflowExecutor { ); } - // Call cleanup on job failure - warn!("๐Ÿงน Calling cleanup for failed job '{}'", job_id_clone); + // Call cleanup on job failure or cancellation + warn!("Calling cleanup for stopped job '{}'", job_id_clone); if let Err(cleanup_err) = job.cleanup(&error_context).await { error!("Failed to cleanup job '{}': {}", job_id_clone, cleanup_err); } ( job_id_clone, - JobResult::failure(error_context, e.to_string()), + if is_cancellation { + JobResult::cancelled(error_context) + } else { + JobResult::failure(error_context, e.to_string()) + }, ) } } @@ -679,6 +699,18 @@ mod tests { cancelled: bool, } + #[test] + fn test_cancellation_errors_are_not_failures() { + assert!(WorkflowExecutor::is_cancellation_error( + &WorkflowError::WorkflowCancelled + )); + assert!(WorkflowExecutor::is_cancellation_error( + &WorkflowError::BuildCancelled + )); + assert!(!WorkflowExecutor::is_cancellation_error( + &WorkflowError::JobExecutionFailed("Build failed".to_string()) + )); + } #[async_trait::async_trait] impl WorkflowCancellationProvider for TestCancellationProvider { async fn is_cancelled(&self, _workflow_run_id: &str) -> Result { diff --git a/crates/temps-deployments/src/services/workflow_execution_service.rs b/crates/temps-deployments/src/services/workflow_execution_service.rs index 89a892615..85915ee0d 100644 --- a/crates/temps-deployments/src/services/workflow_execution_service.rs +++ b/crates/temps-deployments/src/services/workflow_execution_service.rs @@ -330,8 +330,9 @@ impl WorkflowExecutionService { Err(e) => { // Check if this is a cancellation error let error_message = format!("{}", e); - let is_cancellation = - error_message.contains("cancelled") || error_message.contains("Cancelled"); + let lower_error_message = error_message.to_lowercase(); + let is_cancellation = lower_error_message.contains("cancelled") + || lower_error_message.contains("canceled"); if is_cancellation { info!( diff --git a/crates/temps-proxy/src/service/proxy_log_service.rs b/crates/temps-proxy/src/service/proxy_log_service.rs index 616e46cf4..a03a0d455 100644 --- a/crates/temps-proxy/src/service/proxy_log_service.rs +++ b/crates/temps-proxy/src/service/proxy_log_service.rs @@ -814,7 +814,7 @@ impl ProxyLogService { SELECT project_id, COALESCE(COUNT(*), 0) as total_requests, - COALESCE(SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END), 0) as total_errors, + COALESCE(SUM(CASE WHEN status_code >= 500 THEN 1 ELSE 0 END), 0) as total_errors, COALESCE(AVG(response_time_ms)::float8, 0) as avg_response_time_ms FROM proxy_logs WHERE timestamp >= $1 @@ -1859,7 +1859,7 @@ pub struct ProjectHealthSummary { pub project_id: i32, /// Total requests in the period pub total_requests: i64, - /// Total errors (status >= 400) in the period + /// Total server errors (status >= 500) in the period pub total_errors: i64, /// Average response time in ms pub avg_response_time_ms: f64, diff --git a/crates/temps-proxy/src/storage/clickhouse.rs b/crates/temps-proxy/src/storage/clickhouse.rs index 749ada4d6..e990cfb60 100644 --- a/crates/temps-proxy/src/storage/clickhouse.rs +++ b/crates/temps-proxy/src/storage/clickhouse.rs @@ -1194,7 +1194,7 @@ impl ProxyLogStorage for ClickHouseProxyLogStore { "SELECT \ assumeNotNull(project_id) AS project_id, \ count() AS total_requests, \ - countIf(status_code >= 400) AS total_errors, \ + countIf(status_code >= 500) AS total_errors, \ ifNull(avg(response_time_ms), 0) AS avg_response_time_ms \ FROM proxy_logs FINAL \ WHERE {} \