Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 4 additions & 88 deletions src/metric_family.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ namespace triton { namespace core {
//
MetricFamily::MetricFamily(
TRITONSERVER_MetricKind kind, const char* name, const char* description)
: kind_(kind), storage_(Metrics::GetMetricsStorage())
{
auto registry = Metrics::GetRegistry();

Expand All @@ -65,62 +66,18 @@ MetricFamily::MetricFamily(
throw std::invalid_argument(
"Unsupported kind passed to MetricFamily constructor.");
}

kind_ = kind;
}

void*
MetricFamily::Add(
std::map<std::string, std::string> label_map, Metric* metric,
const TritonServerMetricArgs* args)
{
void* prom_metric = nullptr;
switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
if (args != nullptr) {
throw std::invalid_argument(
"Unexpected args found in counter Metric constructor.");
}
auto counter_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Counter>*>(family_);
auto counter_ptr = &counter_family_ptr->Add(label_map);
prom_metric = reinterpret_cast<void*>(counter_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
if (args != nullptr) {
throw std::invalid_argument(
"Unexpected args found in gauge Metric constructor.");
}
auto gauge_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Gauge>*>(family_);
auto gauge_ptr = &gauge_family_ptr->Add(label_map);
prom_metric = reinterpret_cast<void*>(gauge_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
if (args == nullptr) {
throw std::invalid_argument(
"Bucket boundaries not found in Metric args.");
}
if (args->kind() != TRITONSERVER_METRIC_KIND_HISTOGRAM) {
throw std::invalid_argument("Metric args not set to histogram kind.");
}
auto histogram_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Histogram>*>(family_);
auto histogram_ptr =
&histogram_family_ptr->Add(label_map, args->buckets());
prom_metric = reinterpret_cast<void*>(histogram_ptr);
break;
}
default:
throw std::invalid_argument(
"Unsupported family kind passed to Metric constructor.");
}
void* prom_metric = storage_->Add(kind_, family_, std::move(label_map), args);

std::lock_guard<std::mutex> lk(metric_mtx_);
++prom_metric_ref_cnt_[prom_metric];
child_metrics_.insert(metric);

return prom_metric;
}

Expand All @@ -137,48 +94,7 @@ MetricFamily::Remove(void* prom_metric, Metric* metric)
return;
}

{
std::lock_guard<std::mutex> lk(metric_mtx_);
const auto it = prom_metric_ref_cnt_.find(prom_metric);
if (it != prom_metric_ref_cnt_.end()) {
--it->second;
if (it->second == 0) {
prom_metric_ref_cnt_.erase(it);
} else {
// Done as it is not the last reference
return;
}
}
}

switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
auto counter_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Counter>*>(family_);
auto counter_ptr = reinterpret_cast<prometheus::Counter*>(prom_metric);
counter_family_ptr->Remove(counter_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
auto gauge_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Gauge>*>(family_);
auto gauge_ptr = reinterpret_cast<prometheus::Gauge*>(prom_metric);
gauge_family_ptr->Remove(gauge_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
auto histogram_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Histogram>*>(family_);
auto histogram_ptr =
reinterpret_cast<prometheus::Histogram*>(prom_metric);
histogram_family_ptr->Remove(histogram_ptr);
break;
}
default:
// Invalid kind should be caught in constructor
LOG_ERROR << "Unsupported kind in Metric destructor.";
break;
}
storage_->Remove(kind_, family_, prom_metric);
}

void
Expand Down
31 changes: 2 additions & 29 deletions src/metric_family.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,34 +34,12 @@
#include <unordered_map>

#include "infer_parameter.h"
#include "metrics.h"
#include "prometheus/registry.h"
#include "tritonserver_apis.h"

namespace triton { namespace core {

//
// TritonServerMetricArgs
//
// Implementation for TRITONSERVER_MetricArgs.
//
// Holds the optional, kind-specific arguments used when a metric is created.
// Only histogram arguments (the bucket boundaries) can be set.
//
class TritonServerMetricArgs {
 public:
  TritonServerMetricArgs() = default;

  // Marks these args as histogram args and stores a copy of the
  // 'bucket_count' boundaries starting at 'buckets'. Any previously stored
  // boundaries are replaced. Always returns nullptr.
  void* SetHistogramArgs(const double* buckets, uint64_t bucket_count)
  {
    kind_ = TRITONSERVER_METRIC_KIND_HISTOGRAM;
    buckets_.assign(buckets, buckets + bucket_count);
    return nullptr;
  }

  // Kind these args were configured for. Until SetHistogramArgs() is called
  // this is a non-histogram kind.
  TRITONSERVER_MetricKind kind() const { return kind_; }

  // Histogram bucket boundaries; empty until SetHistogramArgs() is called.
  const std::vector<double>& buckets() const { return buckets_; }

 private:
  // Explicitly initialized so that kind() never reads an indeterminate value
  // on args that were default-constructed and never configured. Any
  // non-histogram kind works as the "not configured" state because callers
  // only test for TRITONSERVER_METRIC_KIND_HISTOGRAM.
  TRITONSERVER_MetricKind kind_{TRITONSERVER_METRIC_KIND_COUNTER};
  std::vector<double> buckets_;
};

//
// Implementation for TRITONSERVER_MetricFamily.
//
Expand Down Expand Up @@ -93,14 +71,9 @@ class MetricFamily {

void* family_;
TRITONSERVER_MetricKind kind_;
std::shared_ptr<MetricsStorage> storage_;
// Synchronize access of related metric objects
std::mutex metric_mtx_;
// Prometheus returns the existing metric pointer if a metric with the same
// set of labels is requested. As a result, different Metric objects may
// refer to the same prometheus metric, so we must track the reference count
// of the metric and request prometheus to remove it only when all references
// are released.
std::unordered_map<void*, size_t> prom_metric_ref_cnt_;
// Maintain references to metrics created from this metric family to
// invalidate their references if a family is deleted before its metric
std::set<Metric*> child_metrics_;
Expand Down
112 changes: 112 additions & 0 deletions src/metrics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,114 @@

namespace triton { namespace core {

// Obtains the prometheus metric identified by 'label_map' from 'family' and
// takes one reference on it.
//
//   kind       Kind of the family; selects the prometheus::Family<T> type that
//              'family' is cast to.
//   family     Type-erased pointer to the prometheus family.
//   label_map  Labels identifying the metric within the family.
//   args       Must be nullptr for counters and gauges. Must be non-null and
//              of histogram kind for histograms, where it supplies the bucket
//              boundaries.
//
// Returns the prometheus metric as a type-erased pointer.
// Throws std::invalid_argument if 'args' does not match 'kind' or if 'kind'
// is not supported.
void*
MetricsStorage::Add(
    TRITONSERVER_MetricKind kind, void* family,
    std::map<std::string, std::string> label_map,
    const TritonServerMetricArgs* args)
{
  // The lock covers both the prometheus call and the reference-count update.
  // If they were separate critical sections, a concurrent Remove() could drop
  // the last reference and destroy the metric between the moment prometheus
  // hands back the (shared) pointer and the moment it is counted here.
  std::lock_guard<std::mutex> lk(metric_mtx_);

  void* prom_metric = nullptr;
  switch (kind) {
    case TRITONSERVER_METRIC_KIND_COUNTER: {
      if (args != nullptr) {
        throw std::invalid_argument(
            "Unexpected args found in counter Metric constructor.");
      }
      auto* counter_family =
          static_cast<prometheus::Family<prometheus::Counter>*>(family);
      prom_metric = &counter_family->Add(label_map);
      break;
    }
    case TRITONSERVER_METRIC_KIND_GAUGE: {
      if (args != nullptr) {
        throw std::invalid_argument(
            "Unexpected args found in gauge Metric constructor.");
      }
      auto* gauge_family =
          static_cast<prometheus::Family<prometheus::Gauge>*>(family);
      prom_metric = &gauge_family->Add(label_map);
      break;
    }
    case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
      if (args == nullptr) {
        throw std::invalid_argument(
            "Bucket boundaries not found in Metric args.");
      }
      if (args->kind() != TRITONSERVER_METRIC_KIND_HISTOGRAM) {
        throw std::invalid_argument("Metric args not set to histogram kind.");
      }
      auto* histogram_family =
          static_cast<prometheus::Family<prometheus::Histogram>*>(family);
      prom_metric = &histogram_family->Add(label_map, args->buckets());
      break;
    }
    default:
      throw std::invalid_argument(
          "Unsupported family kind passed to Metric constructor.");
  }

  ++prom_metric_ref_cnt_[prom_metric];
  return prom_metric;
}

// Releases one reference on 'prom_metric' and asks prometheus to remove it
// from 'family' once no references remain.
//
//   kind         Kind of the family; selects the prometheus types that
//                'family' and 'prom_metric' are cast to.
//   family       Type-erased pointer to the prometheus family.
//   prom_metric  Pointer previously returned by Add().
//
// A pointer with no recorded reference count is still forwarded to
// prometheus for removal. An unsupported 'kind' is logged and otherwise
// ignored.
void
MetricsStorage::Remove(
    TRITONSERVER_MetricKind kind, void* family, void* prom_metric)
{
  // Held until the prometheus removal has finished so that a concurrent Add()
  // for the same labels cannot be handed this pointer, and count it, while it
  // is being destroyed.
  std::lock_guard<std::mutex> lk(metric_mtx_);

  const auto it = prom_metric_ref_cnt_.find(prom_metric);
  if (it != prom_metric_ref_cnt_.end()) {
    if (--it->second != 0) {
      // Done as it is not the last reference
      return;
    }
    prom_metric_ref_cnt_.erase(it);
  }

  switch (kind) {
    case TRITONSERVER_METRIC_KIND_COUNTER: {
      auto* counter_family =
          static_cast<prometheus::Family<prometheus::Counter>*>(family);
      counter_family->Remove(static_cast<prometheus::Counter*>(prom_metric));
      break;
    }
    case TRITONSERVER_METRIC_KIND_GAUGE: {
      auto* gauge_family =
          static_cast<prometheus::Family<prometheus::Gauge>*>(family);
      gauge_family->Remove(static_cast<prometheus::Gauge*>(prom_metric));
      break;
    }
    case TRITONSERVER_METRIC_KIND_HISTOGRAM: {
      auto* histogram_family =
          static_cast<prometheus::Family<prometheus::Histogram>*>(family);
      histogram_family->Remove(
          static_cast<prometheus::Histogram*>(prom_metric));
      break;
    }
    default:
      // Invalid kind should be caught in constructor
      LOG_ERROR << "Unsupported kind in Metric destructor.";
      break;
  }
}

Metrics::Metrics()
: registry_(std::make_shared<prometheus::Registry>()),
serializer_(new prometheus::TextSerializer()),
metrics_storage_(std::make_shared<MetricsStorage>()),
inf_success_family_(
prometheus::BuildCounter()
.Name("nv_inference_request_success")
Expand Down Expand Up @@ -1040,6 +1145,13 @@ Metrics::GetRegistry()
return singleton->registry_;
}

// Returns the MetricsStorage instance held by the Metrics singleton.
std::shared_ptr<MetricsStorage>
Metrics::GetMetricsStorage()
{
  return Metrics::GetSingleton()->metrics_storage_;
}

const std::string
Metrics::SerializedMetrics()
{
Expand Down
47 changes: 47 additions & 0 deletions src/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,49 @@ struct DcgmMetadata {
};
#endif // TRITON_ENABLE_METRICS_GPU

//
// TritonServerMetricArgs
//
// Implementation for TRITONSERVER_MetricArgs.
//
// Holds the optional, kind-specific arguments used when a metric is created.
// Only histogram arguments (the bucket boundaries) can be set.
//
class TritonServerMetricArgs {
 public:
  TritonServerMetricArgs() = default;

  // Marks these args as histogram args and stores a copy of the
  // 'bucket_count' boundaries starting at 'buckets'. Any previously stored
  // boundaries are replaced. Always returns nullptr.
  void* SetHistogramArgs(const double* buckets, uint64_t bucket_count)
  {
    kind_ = TRITONSERVER_METRIC_KIND_HISTOGRAM;
    buckets_.assign(buckets, buckets + bucket_count);
    return nullptr;
  }

  // Kind these args were configured for. Until SetHistogramArgs() is called
  // this is a non-histogram kind.
  TRITONSERVER_MetricKind kind() const { return kind_; }

  // Histogram bucket boundaries; empty until SetHistogramArgs() is called.
  const std::vector<double>& buckets() const { return buckets_; }

 private:
  // Explicitly initialized so that kind() never reads an indeterminate value
  // on args that were default-constructed and never configured. Any
  // non-histogram kind works as the "not configured" state because callers
  // only test for TRITONSERVER_METRIC_KIND_HISTOGRAM.
  TRITONSERVER_MetricKind kind_{TRITONSERVER_METRIC_KIND_COUNTER};
  std::vector<double> buckets_;
};

//
// MetricsStorage
//
// Hands out type-erased prometheus metric pointers and keeps a reference
// count for each pointer it has handed out.
//
class MetricsStorage {
 public:
  // Obtain the metric identified by 'label_map' from the prometheus 'family'
  // of the given 'kind' and take one reference on it. 'args' carries the
  // kind-specific construction arguments. Returns the metric as an opaque
  // pointer.
  void* Add(
      TRITONSERVER_MetricKind kind, void* family,
      std::map<std::string, std::string> label_map,
      const TritonServerMetricArgs* args);

  // Release one reference on 'prom_metric'. The metric is removed from
  // 'family' when no references remain.
  void Remove(TRITONSERVER_MetricKind kind, void* family, void* prom_metric);

 private:
  // Guards the reference-count map below.
  std::mutex metric_mtx_;

  // Prometheus returns the existing metric pointer when a metric with the
  // same set of labels is requested, so different Metric objects may refer to
  // the same prometheus metric. The number of references to each metric is
  // tracked here, and prometheus is asked to remove a metric only once all of
  // its references have been released.
  std::unordered_map<void*, size_t> prom_metric_ref_cnt_;
};

class Metrics {
public:
// Return the hash value of the labels
Expand Down Expand Up @@ -142,6 +185,9 @@ class Metrics {
// Get the prometheus registry
static std::shared_ptr<prometheus::Registry> GetRegistry();

// Get the storage that holds prometheus metrics with reference count
static std::shared_ptr<MetricsStorage> GetMetricsStorage();

// Get serialized metrics
static const std::string SerializedMetrics();

Expand Down Expand Up @@ -297,6 +343,7 @@ class Metrics {

std::shared_ptr<prometheus::Registry> registry_;
std::unique_ptr<prometheus::Serializer> serializer_;
std::shared_ptr<MetricsStorage> metrics_storage_;

// DLIS-4761: Refactor into groups of families
prometheus::Family<prometheus::Counter>& inf_success_family_;
Expand Down
Loading