Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions workloads/kube-burner-ocp-wrapper/metrics-endpoint.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
- endpoint: {{.MC_OBO}}
{{if ne .HC_PLATFORM "aws"}}token: {{.MC_PROMETHEUS_TOKEN}}{{end}}
metrics:
- metrics-profiles/{{.HC_PRODUCT}}/hosted-cp-metrics.yml
alerts:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,36 @@

# OVN service sync latency

- query: histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket{namespace=~".+{{.HCP_NAMESPACE}}", kind="service"}[2m])) by (le))
- query: histogram_quantile(0.99, sum(rate(ovnkube_master_network_programming_duration_seconds_bucket{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}", kind="service"}[2m])) by (le))
metricName: serviceSyncLatency

# Etcd metrics

- query: sum(rate(etcd_server_leader_changes_seen_total{namespace=~".+{{.HCP_NAMESPACE}}"}[2m]))
- query: sum(rate(etcd_server_leader_changes_seen_total{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}"}[2m]))
metricName: etcdLeaderChangesRate

- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{namespace=~".+{{.HCP_NAMESPACE}}"}[2m]))
- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}"}[2m]))
metricName: 99thEtcdDiskBackendCommitDurationSeconds

- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{namespace=~".+{{.HCP_NAMESPACE}}"}[2m]))
- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}"}[2m]))
metricName: 99thEtcdDiskWalFsyncDurationSeconds

- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{namespace=~".+{{.HCP_NAMESPACE}}"}[5m]))
- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{cluster="{{.MC_NAME}}",namespace=~".+{{.HCP_NAMESPACE}}"}[5m]))
metricName: 99thEtcdRoundTripTimeSeconds

- query: sum by (cluster_version)(etcd_cluster_version)
- query: sum by (cluster_version)(etcd_cluster_version{cluster="{{.MC_NAME}}"})
metricName: etcdVersion
instant: true

# Pod resource requests and cluster version
- query: sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{cluster="{{.MC_NAME}}",namespace=~".*{{.HCP_NAMESPACE}}"}) by (pod,container,namespace)
metricName: podCPUReq
instant: true

- query: sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{cluster="{{.MC_NAME}}",namespace=~".*{{.HCP_NAMESPACE}}"}) by (pod,container,namespace)
metricName: podMemoryReq
instant: true

- query: cluster_version{type="completed", namespace=~".+{{.HCP_NAMESPACE}}"}
- query: cluster_version{cluster="{{.MC_NAME}}",type="completed", namespace=~".+{{.HCP_NAMESPACE}}"}
metricName: clusterVersion
instant: true
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# Management Node metrics: CPU & Memory

- query: kube_node_role{}
- query: kube_node_info{cluster="{{.MC_NAME}}"}
metricName: mgmtNodeRoles

- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) unless on (instance) label_replace(kube_node_role{cluster="{{.MC_NAME}}",role="infra"}, "instance", "$1", "node", "(.+)")) > 0
Expand Down Expand Up @@ -89,14 +89,6 @@
- query: sum(container_memory_cache{cluster="{{.MC_NAME}}",name!="",container!="POD",namespace=~".+{{.HCP_NAMESPACE}}"}) by (pod, container, namespace, node)
metricName: podMemoryCache-Controlplane

- query: sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=~".+{{.HCP_NAMESPACE}}"}) by (pod,container,namespace)
metricName: podCPUReq
instant: true

- query: sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{namespace=~".+{{.HCP_NAMESPACE}}"}) by (pod,container,namespace)
metricName: podMemoryReq
instant: true

- query: kubernetes_build_info{cluster="{{.MC_NAME}}"}
metricName: mgmtClusterVersion
instant: true
Expand Down
13 changes: 8 additions & 5 deletions workloads/kube-burner-ocp-wrapper/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ hypershift(){
else
echo "Detected ${HC_PLATFORM} environment..."

MC_NAME=$(kubectl config view -o jsonpath='{.clusters[].name}' --kubeconfig=${MC_KUBECONFIG})
HC_NAME=$(oc get infrastructure cluster -o go-template --template='{{.status.etcdDiscoveryDomain}}' | awk -F. '{print$1}')
if [ -z "${MC_NAME}" ]; then
MC_NAME=$(kubectl config view -o jsonpath='{.clusters[].name}' --kubeconfig="${MC_KUBECONFIG}")
fi
HC_NAME=$(oc get infrastructure cluster -o go-template --template='{{.status.etcdDiscoveryDomain}}' | awk -F. '{print$2}')
HCP_NAMESPACE=${HC_NAME}
QUERY="sum(kube_node_role{cluster=\"$MC_NAME\",role=\"worker\"})by(node)"
QUERY="sum(node_memory_MemTotal_bytes{cluster=\"$MC_NAME\",instance=~\".*user.*\"})by(instance)"

if [[ -z ${AKS_PROM} ]] || [[ -z ${AZURE_PROM} ]] ; then
echo "Azure/AKS prometheus inputs are missing, exiting.."
Expand Down Expand Up @@ -94,7 +96,7 @@ EOF
echo "Get all management worker nodes, excludes infra, obo, workload"
Q_NODES=""
Q_STDOUT=$(curl -H "Authorization: Bearer ${MC_PROMETHEUS_TOKEN}" -k --silent --globoff ${MC_PROMETHEUS}/api/v1/query?query=${QUERY}&time='$(date +"%s")')
for n in $(echo $Q_STDOUT | jq -r '.data.result[].metric.node'); do
for n in $(echo "$Q_STDOUT" | jq -r ".data.result[].metric.$([ \"$HC_PLATFORM\" = \"aws\" ] && echo node || echo instance)"); do
if [[ ${Q_NODES} == "" ]]; then
Q_NODES=${n}
else
Expand All @@ -114,13 +116,14 @@ HOSTED_PROMETHEUS_TOKEN: <truncated>
HCP_NAMESPACE: ${HCP_NAMESPACE}
MGMT_WORKER_NODES: ${MGMT_WORKER_NODES}
HC_PRODUCT: ${HC_PRODUCT}
HC_PLATFORM: ${HC_PLATFORM}
EOF

if [[ ${WORKLOAD} =~ "index" ]]; then
export elapsed=${ELAPSED:-20m}
fi

export MC_OBO MC_PROMETHEUS MC_PROMETHEUS_TOKEN HOSTED_PROMETHEUS HOSTED_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES HC_PRODUCT MC_NAME
export MC_OBO MC_PROMETHEUS MC_PROMETHEUS_TOKEN HOSTED_PROMETHEUS HOSTED_PROMETHEUS_TOKEN HCP_NAMESPACE MGMT_WORKER_NODES HC_PRODUCT MC_NAME HC_PLATFORM

}

Expand Down