diff --git a/charts/otel-collector/Chart.lock b/charts/otel-collector/Chart.lock index 7621da74a..5c3f6b9de 100644 --- a/charts/otel-collector/Chart.lock +++ b/charts/otel-collector/Chart.lock @@ -2,5 +2,8 @@ dependencies: - name: opentelemetry-collector repository: https://open-telemetry.github.io/opentelemetry-helm-charts version: 0.114.0 -digest: sha256:f16aaab229e47fe11246ae1df02285c7ab672952af5b2ebab675c492aa65c63d -generated: "2025-08-04T15:28:37.439059528+01:00" +- name: opentelemetry-collector + repository: https://open-telemetry.github.io/opentelemetry-helm-charts + version: 0.114.0 +digest: sha256:ea718a2fe7e745fe1bc5523687d0b78f54e13d6036411cb8828916f690a10485 +generated: "2026-02-26T16:04:05.616035792Z" diff --git a/charts/otel-collector/Chart.yaml b/charts/otel-collector/Chart.yaml index f6f863c5e..c13c3da34 100644 --- a/charts/otel-collector/Chart.yaml +++ b/charts/otel-collector/Chart.yaml @@ -2,8 +2,12 @@ apiVersion: v2 name: otel-collector description: Workflows otel-collector type: application -version: 0.1.2 +version: 0.1.3 dependencies: - name: opentelemetry-collector repository: https://open-telemetry.github.io/opentelemetry-helm-charts version: 0.114.0 + - name: opentelemetry-collector + alias: opentelemetry-collector-agent # second instance of the same chart, configured below to run as a per-node agent + repository: https://open-telemetry.github.io/opentelemetry-helm-charts + version: 0.114.0 diff --git a/charts/otel-collector/templates/role.yaml b/charts/otel-collector/templates/role.yaml index a63489c4a..d8fc06649 100644 --- a/charts/otel-collector/templates/role.yaml +++ b/charts/otel-collector/templates/role.yaml @@ -7,3 +7,15 @@ rules: - apiGroups: [""] resources: ["pods", "services", "endpoints"] verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["nodes", "namespaces", "replicationcontrollers", "resourcequotas"] + verbs: ["list", "watch"] # cluster-state objects read by the k8s_cluster receiver + - apiGroups: ["apps"] + resources: ["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + 
resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["nodes/stats", "nodes/proxy", "nodes/metrics"] + verbs: ["get"] # node-level kubelet endpoints used by the agent's kubeletstats receiver diff --git a/charts/otel-collector/templates/rolebinding.yaml b/charts/otel-collector/templates/rolebinding.yaml index 6d7fc65e3..442492325 100644 --- a/charts/otel-collector/templates/rolebinding.yaml +++ b/charts/otel-collector/templates/rolebinding.yaml @@ -7,6 +7,9 @@ subjects: - kind: ServiceAccount name: "{{ .Release.Name }}-opentelemetry-collector" namespace: {{ .Release.Namespace }} + - kind: ServiceAccount + name: "{{ .Release.Name }}-opentelemetry-collector-agent" + namespace: {{ .Release.Namespace }} # NOTE(review): assumes the aliased agent subchart generates its ServiceAccount with this suffix — confirm after install roleRef: kind: ClusterRole name: opentelemetry-collector diff --git a/charts/otel-collector/values.yaml b/charts/otel-collector/values.yaml index 51bc953d8..09f559c6f 100644 --- a/charts/otel-collector/values.yaml +++ b/charts/otel-collector/values.yaml @@ -6,6 +6,8 @@ opentelemetry-collector: presets: kubernetesAttributes: enabled: true + clusterMetrics: + enabled: true # NOTE(review): this preset may also inject k8s_cluster into the pipeline — verify it does not duplicate the explicit receiver config below ports: prometheus: enabled: true @@ -70,9 +72,21 @@ opentelemetry-collector: - sources: - from: connection receivers: + k8s_cluster: + node_conditions_to_report: [Ready, MemoryPressure, DiskPressure] + allocatable_types_to_report: [cpu, memory, ephemeral-storage, pods] + collection_interval: 30s + metrics: + k8s.node.condition: + enabled: true + k8s.pod.status_reason: + enabled: true prometheus: config: scrape_configs: + # - job_name: 'kube-state-metrics' + # static_configs: + # - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080'] - job_name: 'kubernetes-pods' # scheme: https # tls_config: @@ -145,6 +159,7 @@ opentelemetry-collector: pipelines: metrics: receivers: + - k8s_cluster - prometheus - otlp processors: @@ -162,3 +177,86 @@ opentelemetry-collector: - k8sattributes exporters: - otlphttp + +opentelemetry-collector-agent: + enabled: true + image: + repository: otel/opentelemetry-collector-contrib + mode: 
daemonset + presets: + kubeletMetrics: + enabled: true + kubernetesAttributes: + enabled: true + resources: + requests: + cpu: '200m' + memory: 256Mi + limits: + cpu: '500m' + memory: 512Mi + ports: + otlp: + enabled: false + otlp-http: + enabled: false + prometheus: + enabled: true + containerPort: 9090 + servicePort: 9090 + protocol: TCP + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + config: + processors: + batch: + send_batch_size: 256 + k8sattributes: + auth_type: 'serviceAccount' + extract: + metadata: + - k8s.namespace.name + - k8s.pod.name + - k8s.pod.uid + - k8s.node.name + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + receivers: + kubeletstats: + collection_interval: 20s + auth_type: serviceAccount + endpoint: "https://${env:K8S_NODE_IP}:10250" + insecure_skip_verify: true # NOTE(review): kubelet serving cert is not validated — consider verifying against the cluster CA + metric_groups: + - node + - pod + - container + exporters: + prometheus: + endpoint: 0.0.0.0:9090 + resource_to_telemetry_conversion: + enabled: true + service: + pipelines: + metrics: + receivers: + - kubeletstats + processors: + - memory_limiter # run first so memory protection applies before any other processing + - k8sattributes + - batch + exporters: + - prometheus diff --git a/charts/workflows-cluster/staging-values.yaml b/charts/workflows-cluster/staging-values.yaml index c48fa0463..81ffe84a3 100644 --- a/charts/workflows-cluster/staging-values.yaml +++ b/charts/workflows-cluster/staging-values.yaml @@ -58,6 +58,8 @@ vcluster: selfHeal: true sync: fromHost: + nodes: + enabled: true secrets: mappings: byName: @@ -73,6 +75,12 @@ vcluster: "/postgres-application-passwords": "workflows/postgres-application-passwords" "/postgres-initdb-script": "workflows/postgres-initdb-script" + integrations: + metricsServer: + enabled: true + nodes: true + pods: true + ingress: secretName: letsencrypt-kubernetes-staging-workflows-diamond-ac-uk host: 
kubernetes.staging.workflows.diamond.ac.uk diff --git a/charts/workflows-cluster/values.yaml b/charts/workflows-cluster/values.yaml index 67f43c791..8baa1e56a 100644 --- a/charts/workflows-cluster/values.yaml +++ b/charts/workflows-cluster/values.yaml @@ -17,6 +17,8 @@ vcluster: deploy: enabled: true statefulSet: + image: + tag: "3.6.4-0" # quoted so the version tag is always parsed as a string extraArgs: - --quota-backend-bytes=8589934592 # 8Gi resources: