Add per-model latency histograms for request, queue and compute durations.

Five histograms are introduced alongside the existing first-response
histogram: request, queue, compute-input, compute-infer and compute-output
duration. Values are observed as nanoseconds / 1000 (microseconds) and the
exported metric names carry a `_us` suffix; first_response_histogram keeps its
`_ms` name. The families are attached to a model reporter only when
latency_histograms_enabled_ is set, and the request-duration observation is
skipped when cache_enabled_ is set (FIXME DLIS-4762).

NOTE(review): this patch was recovered from a copy whose line breaks were
collapsed and whose template argument lists (`<...>`) were stripped. Line
structure was rebuilt to satisfy the hunk counts; template arguments and
leading whitespace on context lines were reconstructed and should be checked
against the target tree before applying.

diff --git a/src/constants.h b/src/constants.h
index ed7cf980e..87d660e31 100644
--- a/src/constants.h
+++ b/src/constants.h
@@ -88,6 +88,14 @@ constexpr char kInitialStateFolder[] = "initial_state";
 constexpr char kPendingRequestMetric[] = "inf_pending_request_count";
 constexpr char kModelLoadTimeMetric[] = "model_load_time";
 constexpr char kFirstResponseHistogram[] = "first_response_histogram";
+constexpr char kRequestDurationHistogram[] = "request_duration_histogram";
+constexpr char kQueueDurationHistogram[] = "queue_duration_histogram";
+constexpr char kComputeInputDurationHistogram[] =
+    "compute_input_duration_histogram";
+constexpr char kComputeInferDurationHistogram[] =
+    "compute_infer_duration_histogram";
+constexpr char kComputeOutputDurationHistogram[] =
+    "compute_output_duration_histogram";
 
 constexpr uint64_t NANOS_PER_SECOND = 1000000000;
 constexpr uint64_t NANOS_PER_MILLIS = 1000000;
diff --git a/src/infer_stats.cc b/src/infer_stats.cc
index 47ab309cb..938098363 100644
--- a/src/infer_stats.cc
+++ b/src/infer_stats.cc
@@ -144,6 +144,20 @@ InferenceStatsAggregator::UpdateSuccessWithDuration(
         "compute_infer_duration", compute_infer_duration_ns / 1000);
     metric_reporter->ObserveSummary(
         "compute_output_duration", compute_output_duration_ns / 1000);
+    // Histogram Latencies
+    // FIXME [DLIS-4762]: request histogram is disabled when cache is enabled.
+    if (!reporter_config.cache_enabled_) {
+      metric_reporter->ObserveHistogram(
+          kRequestDurationHistogram, request_duration_ns / 1000);
+    }
+    metric_reporter->ObserveHistogram(
+        kQueueDurationHistogram, queue_duration_ns / 1000);
+    metric_reporter->ObserveHistogram(
+        kComputeInputDurationHistogram, compute_input_duration_ns / 1000);
+    metric_reporter->ObserveHistogram(
+        kComputeInferDurationHistogram, compute_infer_duration_ns / 1000);
+    metric_reporter->ObserveHistogram(
+        kComputeOutputDurationHistogram, compute_output_duration_ns / 1000);
   }
 #endif  // TRITON_ENABLE_METRICS
 }
diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc
index f13dc3056..496758f79 100644
--- a/src/metric_model_reporter.cc
+++ b/src/metric_model_reporter.cc
@@ -307,6 +307,16 @@ MetricModelReporter::InitializeHistograms(
   if (config_.latency_histograms_enabled_) {
     histogram_families_[kFirstResponseHistogram] =
         &Metrics::FamilyFirstResponseDuration();
+    histogram_families_[kRequestDurationHistogram] =
+        &Metrics::FamilyRequestDurationHistogram();
+    histogram_families_[kQueueDurationHistogram] =
+        &Metrics::FamilyQueueDurationHistogram();
+    histogram_families_[kComputeInputDurationHistogram] =
+        &Metrics::FamilyComputeInputDurationHistogram();
+    histogram_families_[kComputeInferDurationHistogram] =
+        &Metrics::FamilyComputeInferDurationHistogram();
+    histogram_families_[kComputeOutputDurationHistogram] =
+        &Metrics::FamilyComputeOutputDurationHistogram();
   }
 
   for (auto& iter : histogram_families_) {
diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h
index 1b3f48da3..438cbafa3 100644
--- a/src/metric_model_reporter.h
+++ b/src/metric_model_reporter.h
@@ -60,10 +60,17 @@ struct MetricReporterConfig {
   // Create and use Summaries for per-model latency related metrics
   bool latency_summaries_enabled_ = false;
   // Default bucket boundaries used for each histogram metric. Each value
-  // represents a boundary. For example, {100, 500, 2000, 5000} are latencies.
-  // in milliseconds in first_response_histogram.
+  // represents a boundary. For example, {100, 500, 2000, 5000} are latencies
+  // in milliseconds for first_response_histogram. Other duration histograms
+  // use microseconds as their unit.
   std::unordered_map<std::string, prometheus::Histogram::BucketBoundaries>
-      histogram_options_ = {{kFirstResponseHistogram, {100, 500, 2000, 5000}}};
+      histogram_options_ = {
+          {kFirstResponseHistogram, {100, 500, 2000, 5000}},
+          {kRequestDurationHistogram, {1000, 5000, 25000, 50000, 100000}},
+          {kQueueDurationHistogram, {100, 1000, 5000, 10000, 50000}},
+          {kComputeInputDurationHistogram, {100, 500, 1000, 5000, 10000}},
+          {kComputeInferDurationHistogram, {1000, 5000, 25000, 50000, 100000}},
+          {kComputeOutputDurationHistogram, {100, 500, 1000, 5000, 10000}}};
 
   // Quantiles used for any summary metrics. Each pair of values represents
   // { quantile, error }. For example, {0.90, 0.01} means to compute the
@@ -82,7 +89,15 @@ struct MetricReporterConfig {
   // "ModelMetrics" with the full name displayed from metrics reporting while a
   // different name is used internally. All new histograms must update the map.
   const std::unordered_map<std::string, std::string> metric_map_ = {
-      {"nv_inference_first_response_histogram_ms", kFirstResponseHistogram}};
+      {"nv_inference_first_response_histogram_ms", kFirstResponseHistogram},
+      {"nv_inference_request_duration_histogram_us", kRequestDurationHistogram},
+      {"nv_inference_queue_duration_histogram_us", kQueueDurationHistogram},
+      {"nv_inference_compute_input_duration_histogram_us",
+       kComputeInputDurationHistogram},
+      {"nv_inference_compute_infer_duration_histogram_us",
+       kComputeInferDurationHistogram},
+      {"nv_inference_compute_output_duration_histogram_us",
+       kComputeOutputDurationHistogram}};
 #endif  // TRITON_ENABLE_METRICS
 };
 
diff --git a/src/metrics.cc b/src/metrics.cc
index 21894bb6e..b2b8b09ce 100644
--- a/src/metrics.cc
+++ b/src/metrics.cc
@@ -155,6 +155,32 @@ Metrics::Metrics()
               .Name("nv_inference_first_response_histogram_ms")
               .Help("Duration from request to first response in milliseconds")
               .Register(*registry_)),
+      inf_request_duration_histogram_us_family_(
+          prometheus::BuildHistogram()
+              .Name("nv_inference_request_duration_histogram_us")
+              .Help("Histogram of end-to-end inference request duration, "
+                    "in microseconds")
+              .Register(*registry_)),
+      inf_queue_duration_histogram_us_family_(
+          prometheus::BuildHistogram()
+              .Name("nv_inference_queue_duration_histogram_us")
+              .Help("Histogram of inference queuing duration, in microseconds")
+              .Register(*registry_)),
+      inf_compute_input_duration_histogram_us_family_(
+          prometheus::BuildHistogram()
+              .Name("nv_inference_compute_input_duration_histogram_us")
+              .Help("Histogram of compute input duration, in microseconds")
+              .Register(*registry_)),
+      inf_compute_infer_duration_histogram_us_family_(
+          prometheus::BuildHistogram()
+              .Name("nv_inference_compute_infer_duration_histogram_us")
+              .Help("Histogram of compute inference duration, in microseconds")
+              .Register(*registry_)),
+      inf_compute_output_duration_histogram_us_family_(
+          prometheus::BuildHistogram()
+              .Name("nv_inference_compute_output_duration_histogram_us")
+              .Help("Histogram of compute output duration, in microseconds")
+              .Register(*registry_)),
 
       // Summaries
       inf_request_summary_us_family_(
diff --git a/src/metrics.h b/src/metrics.h
index ac04ebebc..3b9d0de81 100644
--- a/src/metrics.h
+++ b/src/metrics.h
@@ -221,6 +221,32 @@ class Metrics {
     return GetSingleton()->inf_first_response_histogram_ms_family_;
   }
 
+  static prometheus::Family<prometheus::Histogram>&
+  FamilyRequestDurationHistogram()
+  {
+    return GetSingleton()->inf_request_duration_histogram_us_family_;
+  }
+  static prometheus::Family<prometheus::Histogram>&
+  FamilyQueueDurationHistogram()
+  {
+    return GetSingleton()->inf_queue_duration_histogram_us_family_;
+  }
+  static prometheus::Family<prometheus::Histogram>&
+  FamilyComputeInputDurationHistogram()
+  {
+    return GetSingleton()->inf_compute_input_duration_histogram_us_family_;
+  }
+  static prometheus::Family<prometheus::Histogram>&
+  FamilyComputeInferDurationHistogram()
+  {
+    return GetSingleton()->inf_compute_infer_duration_histogram_us_family_;
+  }
+  static prometheus::Family<prometheus::Histogram>&
+  FamilyComputeOutputDurationHistogram()
+  {
+    return GetSingleton()->inf_compute_output_duration_histogram_us_family_;
+  }
+
   // Metric family of load time per model
   static prometheus::Family<prometheus::Gauge>& FamilyModelLoadTime()
   {
@@ -331,6 +357,16 @@ class Metrics {
   // Histograms
   prometheus::Family<prometheus::Histogram>&
       inf_first_response_histogram_ms_family_;
+  prometheus::Family<prometheus::Histogram>&
+      inf_request_duration_histogram_us_family_;
+  prometheus::Family<prometheus::Histogram>&
+      inf_queue_duration_histogram_us_family_;
+  prometheus::Family<prometheus::Histogram>&
+      inf_compute_input_duration_histogram_us_family_;
+  prometheus::Family<prometheus::Histogram>&
+      inf_compute_infer_duration_histogram_us_family_;
+  prometheus::Family<prometheus::Histogram>&
+      inf_compute_output_duration_histogram_us_family_;
 
   // Summaries
   prometheus::Family<prometheus::Summary>& inf_request_summary_us_family_;