diff --git a/rock/admin/metrics/constants.py b/rock/admin/metrics/constants.py index 48c69b8c04..66c2d8c580 100644 --- a/rock/admin/metrics/constants.py +++ b/rock/admin/metrics/constants.py @@ -4,6 +4,7 @@ class MetricsConstants: SANDBOX_REQUEST_TOTAL = "request.total" SANDBOX_REQUEST_SUCCESS = "request.success" SANDBOX_REQUEST_FAILURE = "request.failure" + SANDBOX_REQUEST_CLIENT_ERROR = "request.client_error" SANDBOX_REQUEST_RT = "request.rt" diff --git a/rock/admin/metrics/decorator.py b/rock/admin/metrics/decorator.py index 39bdceb932..c647dc9f88 100644 --- a/rock/admin/metrics/decorator.py +++ b/rock/admin/metrics/decorator.py @@ -9,6 +9,7 @@ from rock.admin.metrics.constants import MetricsConstants from rock.admin.metrics.monitor import MetricsMonitor +from rock.sdk.common.exceptions import BadRequestRockError if TYPE_CHECKING: from rock.sandbox.sandbox_meta_store import SandboxMetaStore @@ -130,7 +131,10 @@ def _record_metrics(metrics_monitor: MetricsMonitor, result, attributes: dict, s is_exception = isinstance(result, Exception) if is_exception: metric_attrs = {**attributes, "error_type": type(result).__name__} - metrics_monitor.record_counter_by_name(f"{metric_prefix}.failure", 1, metric_attrs) + if isinstance(result, BadRequestRockError): + metrics_monitor.record_counter_by_name(f"{metric_prefix}.client_error", 1, metric_attrs) + else: + metrics_monitor.record_counter_by_name(f"{metric_prefix}.failure", 1, metric_attrs) else: metric_attrs = attributes metrics_monitor.record_counter_by_name(f"{metric_prefix}.success", 1, metric_attrs) diff --git a/rock/admin/metrics/monitor.py b/rock/admin/metrics/monitor.py index 44c8d077bb..86a9ed7755 100644 --- a/rock/admin/metrics/monitor.py +++ b/rock/admin/metrics/monitor.py @@ -72,6 +72,7 @@ def _register_metrics(self): # Request count metrics self._register_counter(MetricsConstants.SANDBOX_REQUEST_SUCCESS, "Number of successful requests") self._register_counter(MetricsConstants.SANDBOX_REQUEST_FAILURE, "Number of failed requests") + self._register_counter(MetricsConstants.SANDBOX_REQUEST_CLIENT_ERROR, "Number of client error requests") self._register_counter(MetricsConstants.SANDBOX_REQUEST_TOTAL, "Number of total requests") # Sandbox metrics self._register_gauge(MetricsConstants.SANDBOX_TOTAL_COUNT, "Number of sandbox") diff --git a/tests/unit/admin/metrics/test_decorator.py b/tests/unit/admin/metrics/test_decorator.py index 08ff686353..dd078bb2bb 100644 --- a/tests/unit/admin/metrics/test_decorator.py +++ b/tests/unit/admin/metrics/test_decorator.py @@ -189,9 +189,9 @@ async def do_something(self, sandbox_id): assert attrs["namespace"] == "n1" -def test_record_metrics_bad_request_goes_to_failure(): - """BadRequestRockError lands in the generic `.failure` bucket along with - every other exception type — there is no separate client-fault bucket.""" +def test_record_metrics_bad_request_goes_to_client_error(): + """BadRequestRockError lands in the `.client_error` bucket, not `.failure`, + so client faults don't pollute server failure metrics.""" mock_metrics_monitor = Mock(spec=MetricsMonitor) attributes = {"operation": "test_op"} start_time = 0 @@ -202,6 +202,32 @@ def test_record_metrics_bad_request_goes_to_failure(): _record_metrics(mock_metrics_monitor, exception, attributes, start_time, "test") error_attrs = {**attributes, "error_type": "BadRequestRockError"} - mock_metrics_monitor.record_counter_by_name.assert_any_call("test.failure", 1, error_attrs) + mock_metrics_monitor.record_counter_by_name.assert_any_call("test.client_error", 1, error_attrs) mock_metrics_monitor.record_gauge_by_name.assert_called_once_with("test.rt", 1000.0, error_attrs) mock_metrics_monitor.record_counter_by_name.assert_any_call("test.total", 1, error_attrs) + # Verify it does NOT go to .failure + failure_calls = [c for c in mock_metrics_monitor.record_counter_by_name.call_args_list if c[0][0] == "test.failure"] + assert len(failure_calls) == 0 + + +def test_record_metrics_server_error_goes_to_failure(): + """Non-client exceptions (e.g. InternalServerRockError, generic Exception) + still land in the `.failure` bucket.""" + from rock.sdk.common.exceptions import InternalServerRockError + + mock_metrics_monitor = Mock(spec=MetricsMonitor) + attributes = {"operation": "test_op"} + start_time = 0 + exception = InternalServerRockError("internal error") + + with patch("rock.admin.metrics.decorator.time.perf_counter", return_value=1.0): + with pytest.raises(InternalServerRockError): + _record_metrics(mock_metrics_monitor, exception, attributes, start_time, "test") + + error_attrs = {**attributes, "error_type": "InternalServerRockError"} + mock_metrics_monitor.record_counter_by_name.assert_any_call("test.failure", 1, error_attrs) + # Verify it does NOT go to .client_error + client_error_calls = [ + c for c in mock_metrics_monitor.record_counter_by_name.call_args_list if c[0][0] == "test.client_error" + ] + assert len(client_error_calls) == 0