diff --git a/azure/specs/service-spec.json.tpl b/azure/specs/service-spec.json.tpl index 4458a423..830a0bf3 100644 --- a/azure/specs/service-spec.json.tpl +++ b/azure/specs/service-spec.json.tpl @@ -100,6 +100,23 @@ { "type":"Control", "scope":"#/properties/autoscaling/properties/target_cpu_utilization" + }, + { + "type": "Control", + "scope": "#/properties/autoscaling/properties/target_memory_enabled" + }, + { + "rule": { + "effect": "SHOW", + "condition": { + "scope": "#/properties/autoscaling/properties/target_memory_enabled", + "schema": { + "const": true + } + } + }, + "type": "Control", + "scope": "#/properties/autoscaling/properties/target_memory_utilization" } ] } @@ -363,6 +380,19 @@ "maximum":90, "minimum":50, "description":"CPU utilization threshold that triggers scaling" + }, + "target_memory_enabled": { + "type": "boolean", + "title": "Scale by memory", + "default": false + }, + "target_memory_utilization": { + "type": "integer", + "title": "Target memory utilization (%)", + "default": 70, + "maximum": 90, + "minimum": 30, + "description": "Memory utilization threshold that triggers scaling" } } }, diff --git a/datadog/metric/build_context b/datadog/metric/build_context new file mode 100755 index 00000000..33d7be39 --- /dev/null +++ b/datadog/metric/build_context @@ -0,0 +1,47 @@ +#!/bin/bash + +SCOPE_ID=$(echo "$CONTEXT" | jq -r '.arguments.scope_id // empty') + +if [[ -z "$DATADOG_API_KEY" ]] || [[ -z "$DATADOG_APP_KEY" ]]; then + NRN=$(np scope read --id "$SCOPE_ID" --format json | jq -r .nrn) + + PROVIDERS=$(np provider list --categories metrics --nrn "$NRN" --format json) + + + DATADOG_API_KEY=${DATADOG_API_KEY:-$(echo "$PROVIDERS" | jq -r '.results[0].attributes.credentials.api_key // empty')} + DATADOG_APP_KEY=${DATADOG_APP_KEY:-$(echo "$PROVIDERS" | jq -r '.results[0].attributes.credentials.app_key // empty')} + DATADOG_SITE=${DATADOG_SITE:-$(echo "$PROVIDERS" | jq -r '.results[0].attributes.settings.site // "datadoghq.com"')} + +else + 
DATADOG_SITE=${DATADOG_SITE:-"datadoghq.com"} +fi + +if [[ -z "$DATADOG_API_KEY" ]]; then + echo "There is no Datadog provider configured. Please configure Datadog in Platform settings and try again." >&2 + exit 1 +fi + +if [[ -z "$DATADOG_APP_KEY" ]]; then + echo "Datadog App Key is required. Please configure App Key in Platform settings and try again." >&2 + exit 1 +fi + +while read -r line; do + eval "$line" +done < <(echo "$CONTEXT" | jq -r '.arguments | to_entries[] | + if (.value | type) == "array" then + "export \(.key | ascii_upcase)=\(.value | join(",") | @sh)" + else + "export \(.key | ascii_upcase)=\(.value | tostring | @sh)" + end') + +if [[ -n "$METRIC" ]]; then + export METRIC_NAME="$METRIC" +fi + +export DATADOG_API_KEY +export DATADOG_APP_KEY +export DATADOG_SITE +export SCOPE_ID +export SERVICE_NAME +export ENVIRONMENT \ No newline at end of file diff --git a/datadog/metric/list b/datadog/metric/list new file mode 100755 index 00000000..5a3b1a21 --- /dev/null +++ b/datadog/metric/list @@ -0,0 +1,55 @@ +#!/bin/bash + +echo '{ + "results": [ + { + "name": "http.rpm", + "title": "Throughput", + "unit": "rpm", + "available_filters": ["scope_id", "instance_id"], + "available_group_by": ["instance_id"] + }, + { + "name": "http.response_time", + "title": "Response time", + "unit": "ms", + "available_filters": ["scope_id", "instance_id"], + "available_group_by": ["instance_id"] + }, + { + "name": "http.error_rate", + "title": "Error rate", + "unit": "%", + "available_filters": ["scope_id", "instance_id"], + "available_group_by": ["instance_id"] + }, + { + "name": "system.cpu_usage_percentage", + "title": "Cpu usage", + "unit": "%", + "available_filters": ["scope_id", "instance_id"], + "available_group_by": ["instance_id"] + }, + { + "name": "system.memory_usage_percentage", + "title": "Memory usage", + "unit": "%", + "available_filters": ["scope_id", "instance_id"], + "available_group_by": ["scope_id", "instance_id"] + }, + { + "name": "http.healthcheck_count", + "title": 
"Healthcheck", + "unit": "check", + "available_filters": ["scope_id", "instance_id"], + "available_group_by": ["instance_id"] + }, + { + "name": "http.healthcheck_fail", + "title": "Healthcheck failures", + "unit": "count", + "available_filters": ["scope_id", "instance_id"], + "available_group_by": ["instance_id"] + } + ] +}' \ No newline at end of file diff --git a/datadog/metric/metric b/datadog/metric/metric new file mode 100755 index 00000000..946f3adf --- /dev/null +++ b/datadog/metric/metric @@ -0,0 +1,318 @@ +#!/bin/bash + +GROUP_BY=${GROUP_BY:-""} + + +# Validate required parameters +if [[ -z "$METRIC_NAME" ]]; then + echo '{"metric":"","type":"","period_in_seconds":0,"unit":"","results":[]}' + exit 1 +fi + +if [[ -z "$APPLICATION_ID" ]]; then + echo '{"metric":"","type":"","period_in_seconds":0,"unit":"","results":[]}' + exit 1 +fi + +if [[ -z "$DATADOG_API_KEY" ]]; then + echo '{"error":"DATADOG_API_KEY is required. Please specify with DATADOG_API_KEY environment variable"}' + exit 1 +fi + +if [[ -z "$DATADOG_APP_KEY" ]]; then + echo '{"error":"DATADOG_APP_KEY is required. 
Please specify with DATADOG_APP_KEY environment variable"}' + exit 1 +fi + +get_metric_config() { + case "$METRIC_NAME" in + "http.error_rate") + echo "gauge percent" + ;; + "http.response_time") + echo "gauge seconds" + ;; + "http.rpm") + echo "gauge count_per_minute" + ;; + "http.healthcheck_count") + echo "gauge count" + ;; + "http.healthcheck_fail") + echo "gauge count" + ;; + "trace.http.request.p99") + echo "gauge milliseconds" + ;; + "system.cpu_usage_percentage") + echo "gauge percent" + ;; + "system.memory_usage_percentage") + echo "gauge percent" + ;; + "system.used_memory_kb") + echo "gauge kilobytes" + ;; + *) + echo "gauge unknown" + ;; + esac +} + +build_filters() { + local filters="" + + if [[ -n "$APPLICATION_ID" ]]; then + filters="application_id:$APPLICATION_ID" + fi + + if [[ -n "$SCOPE_ID" ]]; then + if [[ -n "$filters" ]]; then + filters="$filters,scope_id:$SCOPE_ID" + else + filters="scope_id:$SCOPE_ID" + fi + fi + + if [[ -n "$DEPLOYMENT_ID" && "$DEPLOYMENT_ID" != "null" ]]; then + if [[ -n "$filters" ]]; then + filters="$filters,deployment_id:$DEPLOYMENT_ID" + else + filters="deployment_id:$DEPLOYMENT_ID" + fi + fi + + echo "$filters" +} + +build_datadog_query() { + local metric="$1" + local filters="$2" + local rollup_step="$3" + local start_time="$4" + local end_time="$5" + local groupBy="$GROUP_BY" + + if [[ "$groupBy" == "[]" || "$groupBy" == "" ]]; then + groupBy="" + fi + + local group_clause="" + if [[ -n "$groupBy" ]]; then + group_clause=" by {$(echo "$groupBy" | tr ',' ' ' | xargs | tr ' ' ',')}" + fi + + case "$metric" in + # Custom nullplatform metrics with specific logic + "http.healthcheck_count") + echo "sum:nullplatform.scope.request_count{$filters,is_healthcheck:yes} by {instance_id}.rollup(sum, $rollup_step)" + ;; + "http.healthcheck_fail") + echo "sum:nullplatform.scope.request_count{$filters,is_healthcheck:yes} by {instance_id}.rollup(sum, $rollup_step) - 
sum:nullplatform.scope.request_count{$filters,is_healthcheck:yes,quality:ok_2xx_3xx} by {instance_id}.rollup(sum, $rollup_step)" + ;; + "system.memory_usage_percentage") + echo "avg:nullplatform.scope.memory_usage_percentage{$filters}$group_clause" + ;; + "system.cpu_usage_percentage") + echo "avg:nullplatform.scope.cpu_usage_percentage{$filters}$group_clause" + ;; + "system.used_memory_kb") + echo "avg:nullplatform.scope.memory_usage_kb{$filters}$group_clause" + ;; + "http.response_time") + echo "sum:nullplatform.scope.response_time{$filters}$group_clause.rollup(sum, 60) / sum:nullplatform.scope.request_count{$filters}$group_clause.rollup(sum, 60)" + ;; + "http.rpm") + echo "sum:nullplatform.scope.request_count{$filters}$group_clause.rollup(sum, 60)" + ;; + "http.error_rate") + echo "((sum:nullplatform.scope.request_count{$filters}$group_clause.rollup(sum, 60) - sum:nullplatform.scope.request_count{$filters,quality:ok_2xx_3xx}$group_clause.rollup(sum, 60)) / sum:nullplatform.scope.request_count{$filters}$group_clause.rollup(sum, 60)) * 100" + ;; + "trace.http.request.p99") + local env_name=$(echo "$CONTEXT" | jq -r '.service.dimensions.environment') + local service_name=$(basename $(echo "$CONTEXT" | jq -r '.tags.repository_url'))-kubernetes + + local p99_interval=$((end_time - start_time)) + echo "p99:trace.http.request{service:$service_name,env:$env_name}.rollup(avg, $p99_interval) * 1000" + ;; + + # Generic handler for any other Datadog metric + *) + echo "avg:$metric{$filters}$group_clause" + ;; + esac +} + +# Query Datadog API +query_datadog() { + local query="$1" + local start_time="$2" + local end_time="$3" + + local base_url="https://api.${DATADOG_SITE:-datadoghq.com}" + local url="${base_url}/api/v2/query/timeseries" + + # Build JSON payload for v2 API + local payload=$(jq -n \ + --arg query "$query" \ + --arg from "$start_time" \ + --arg to "$end_time" \ + '{ + data: { + type: "timeseries_request", + attributes: { + formulas: [{ + formula: "a" + }], + 
queries: [{ + name: "a", + query: $query, + data_source: "metrics" + }], + from: ($from | tonumber * 1000), + to: ($to | tonumber * 1000), + interval: 60000 + } + } + }') + + curl -s -X POST "$url" \ + -H "DD-API-KEY: $DATADOG_API_KEY" \ + -H "DD-APPLICATION-KEY: $DATADOG_APP_KEY" \ + -H "Content-Type: application/json" \ + -d "$payload" +} + +# Handle START_TIME/END_TIME +if [[ -n "$START_TIME" && -n "$END_TIME" ]]; then + # For macOS compatibility, use a different date parsing approach + if [[ "$OSTYPE" == "darwin"* ]]; then + # macOS date command + start_time=$(echo "$START_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -j -f "%Y-%m-%d %H:%M:%S" "{}" +%s 2>/dev/null || echo "0") + end_time=$(echo "$END_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -j -f "%Y-%m-%d %H:%M:%S" "{}" +%s 2>/dev/null || echo "0") + else + # Linux date command + start_time=$(echo "$START_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -d "{}" +%s 2>/dev/null || echo "0") + end_time=$(echo "$END_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -d "{}" +%s 2>/dev/null || echo "0") + fi + # Use 30 second intervals for queries 6 minutes or less + time_diff=$((end_time - start_time)) + if [[ $time_diff -le 360 ]]; then + step=${PERIOD:-30} + else + step=${PERIOD:-60} + fi +else + # Fallback to TIME_RANGE logic + end_time=$(date +%s) + case "$TIME_RANGE" in + *h) + hours=${TIME_RANGE%h} + start_time=$((end_time - hours * 3600)) + ;; + *m) + minutes=${TIME_RANGE%m} + start_time=$((end_time - minutes * 60)) + ;; + *d) + days=${TIME_RANGE%d} + start_time=$((end_time - days * 86400)) + ;; + *) + start_time=$((end_time - 3600)) + ;; + esac + # Use 30 second intervals for queries 6 minutes or less + time_diff=$((end_time - start_time)) + if [[ $time_diff -le 360 ]]; then + step=${PERIOD:-30} + else + step=${PERIOD:-60} + fi +fi + +config=$(get_metric_config) +metric_type=$(echo $config | cut -d' ' -f1) +unit=$(echo $config | 
cut -d' ' -f2) + +filters=$(build_filters) +query=$(build_datadog_query "$METRIC_NAME" "$filters" "$step" "$start_time" "$end_time") + +response=$(query_datadog "$query" "$start_time" "$end_time") + + +transform_response() { + local response="$1" + + # Check if response contains error + local error=$(echo "$response" | jq -r '.errors // empty') + if [[ -n "$error" ]]; then + echo "[]" + return + fi + + # Extract timeseries data from v2 API response + local series=$(echo "$response" | jq -r '.data.attributes.series // []') + + if [[ "$series" == "[]" || "$series" == "null" ]]; then + echo "[]" + return + fi + + # Transform v2 API response - combine times and values arrays into data points + echo "$response" | jq ' + (.data.attributes.times // []) as $times | + (.data.attributes.values // [[]]) as $values | + (.data.attributes.series // []) | to_entries | map(.key as $idx | .value | { + selector: ( + if .group_tags then + if (.group_tags | type) == "array" then + if (.group_tags | length) == 0 then + {} + else + # Convert array of "key:value" strings to object + .group_tags | map(split(":") | {(.[0]): .[1]}) | add + end + else + .group_tags + end + elif .scope then + if (.scope | type) == "string" then + .scope | split(",") | map(split(":") | {(.[0]): .[1]}) | add + else + .scope + end + else + {} + end + ) + } + { + data: ( + if ($values | length) > 0 and ($times | length) > 0 then + [range($times | length)] | map({ + timestamp: ($times[.] / 1000 | todate), + value: $values[$idx][.] 
+ }) + else + [] + end + ) + })' +} + +transformed_results=$(transform_response "$response") + +# Output compact JSON without formatting +jq -c -n \ + --arg metric "$METRIC_NAME" \ + --arg type "$metric_type" \ + --arg period "$step" \ + --arg unit "$unit" \ + --argjson results "$transformed_results" \ + '{ + metric: $metric, + type: $type, + period_in_seconds: ($period | tonumber), + unit: $unit, + results: $results + }' \ No newline at end of file diff --git a/datadog/metric/workflows/list.yaml b/datadog/metric/workflows/list.yaml new file mode 100644 index 00000000..051f871d --- /dev/null +++ b/datadog/metric/workflows/list.yaml @@ -0,0 +1,4 @@ +steps: + - name: metrics + type: script + file: "$OVERRIDES_PATH/metric/list" \ No newline at end of file diff --git a/datadog/metric/workflows/metric.yaml b/datadog/metric/workflows/metric.yaml new file mode 100644 index 00000000..e71469d5 --- /dev/null +++ b/datadog/metric/workflows/metric.yaml @@ -0,0 +1,7 @@ +steps: + - name: build context + type: script + file: "$OVERRIDES_PATH/metric/build_context" + - name: metric + type: script + file: "$OVERRIDES_PATH/metric/metric" \ No newline at end of file diff --git a/k8s/deployment/templates/blue-green-ingress.yaml.tpl b/k8s/deployment/templates/blue-green-ingress.yaml.tpl index 323fa4dc..314de1fe 100644 --- a/k8s/deployment/templates/blue-green-ingress.yaml.tpl +++ b/k8s/deployment/templates/blue-green-ingress.yaml.tpl @@ -68,3 +68,15 @@ spec: name: bg-deployment port: name: use-annotation +{{- range .scope.domains }} + - host: {{ .name }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: bg-deployment + port: + name: use-annotation +{{- end }} diff --git a/k8s/deployment/templates/initial-ingress.yaml.tpl b/k8s/deployment/templates/initial-ingress.yaml.tpl index 287a17fb..dbcdaa12 100644 --- a/k8s/deployment/templates/initial-ingress.yaml.tpl +++ b/k8s/deployment/templates/initial-ingress.yaml.tpl @@ -62,4 +62,16 @@ spec: service: name: d-{{ 
.scope.id }}-{{ .deployment.id }} port: - number: 8080 \ No newline at end of file + number: 8080 +{{- range .scope.domains }} + - host: {{ .name }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: d-{{ $.scope.id }}-{{ $.deployment.id }} + port: + number: 8080 +{{- end }} \ No newline at end of file diff --git a/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl b/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl index cad75173..5f45ad58 100644 --- a/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl +++ b/k8s/deployment/templates/istio/blue-green-httproute.yaml.tpl @@ -45,6 +45,9 @@ metadata: spec: hostnames: - {{ .scope.domain }} +{{- range .scope.domains }} + - {{ .name }} +{{- end }} parentRefs: - group: gateway.networking.k8s.io kind: Gateway diff --git a/k8s/deployment/templates/istio/initial-httproute.yaml.tpl b/k8s/deployment/templates/istio/initial-httproute.yaml.tpl index d01162e4..f300a5d3 100644 --- a/k8s/deployment/templates/istio/initial-httproute.yaml.tpl +++ b/k8s/deployment/templates/istio/initial-httproute.yaml.tpl @@ -45,6 +45,9 @@ metadata: spec: hostnames: - {{ .scope.domain }} +{{- range .scope.domains }} + - {{ .name }} +{{- end }} parentRefs: - group: gateway.networking.k8s.io kind: Gateway diff --git a/k8s/deployment/templates/scaling.yaml.tpl b/k8s/deployment/templates/scaling.yaml.tpl index 6c9a9a39..10187b80 100644 --- a/k8s/deployment/templates/scaling.yaml.tpl +++ b/k8s/deployment/templates/scaling.yaml.tpl @@ -18,4 +18,12 @@ spec: target: type: Utilization averageUtilization: {{ .scope.capabilities.autoscaling.target_cpu_utilization }} + {{- if and (has .scope.capabilities.autoscaling "target_memory_enabled") (eq .scope.capabilities.autoscaling.target_memory_enabled true) }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .scope.capabilities.autoscaling.target_memory_utilization }} + {{- end }} {{- end }} \ No newline at 
end of file diff --git a/k8s/metric/metric b/k8s/metric/metric index 49fb97b9..244c7bcb 100755 --- a/k8s/metric/metric +++ b/k8s/metric/metric @@ -2,6 +2,7 @@ GROUP_BY=${GROUP_BY:-""} + # Validate required parameters if [[ -z "$METRIC_NAME" ]]; then echo '{"metric":"","type":"","period_in_seconds":0,"unit":"","results":[]}' @@ -13,8 +14,13 @@ if [[ -z "$APPLICATION_ID" ]]; then exit 1 fi -if [[ -z "$PROM_URL" ]]; then - echo '{"error":"PROM_URL is required. Please specify with PROM_URL environment variable"}' +if [[ -z "$DATADOG_API_KEY" ]]; then + echo '{"error":"DATADOG_API_KEY is required. Please specify with DATADOG_API_KEY environment variable"}' + exit 1 +fi + +if [[ -z "$DATADOG_APP_KEY" ]]; then + echo '{"error":"DATADOG_APP_KEY is required. Please specify with DATADOG_APP_KEY environment variable"}' exit 1 fi @@ -35,10 +41,10 @@ get_metric_config() { "http.healthcheck_fail") echo "gauge count" ;; - "system.cpu_usage_percentage") - echo "gauge percent" + "trace.http.request.p99") + echo "gauge milliseconds" ;; - "system.cpu_usage_percentage_by_instance") + "system.cpu_usage_percentage") echo "gauge percent" ;; "system.memory_usage_percentage") @@ -47,24 +53,6 @@ get_metric_config() { "system.used_memory_kb") echo "gauge kilobytes" ;; - "cronjob.execution_count") - echo "gauge count" - ;; - "cronjob.success_count") - echo "gauge count" - ;; - "cronjob.failure_count") - echo "gauge count" - ;; - "cronjob.last_execution_start") - echo "gauge timestamp" - ;; - "cronjob.cpu_usage") - echo "gauge percent" - ;; - "cronjob.memory_usage") - echo "gauge bytes" - ;; *) echo "gauge unknown" ;; @@ -74,217 +62,173 @@ get_metric_config() { build_filters() { local filters="" - # Add application_id filter if [[ -n "$APPLICATION_ID" ]]; then - filters="application_id=\"$APPLICATION_ID\"" + filters="application_id:$APPLICATION_ID" fi - # Add scope_id filter if [[ -n "$SCOPE_ID" ]]; then if [[ -n "$filters" ]]; then - filters="$filters," + filters="$filters,scope_id:$SCOPE_ID" 
+ else + filters="scope_id:$SCOPE_ID" fi - filters="${filters}scope_id=\"$SCOPE_ID\"" fi if [[ -n "$DEPLOYMENT_ID" && "$DEPLOYMENT_ID" != "null" ]]; then if [[ -n "$filters" ]]; then - filters="$filters," + filters="$filters,deployment_id:$DEPLOYMENT_ID" + else + filters="deployment_id:$DEPLOYMENT_ID" fi - filters="${filters}deployment_id=\"$DEPLOYMENT_ID\"" fi echo "$filters" } -# Build Prometheus query based on metric type -build_query() { +build_datadog_query() { local metric="$1" local filters="$2" - local interval="$3" + local rollup_step="$3" + local start_time="$4" + local end_time="$5" local groupBy="$GROUP_BY" if [[ "$groupBy" == "[]" || "$groupBy" == "" ]]; then groupBy="" fi + local group_clause="" + if [[ -n "$groupBy" ]]; then + group_clause=" by {$(echo "$groupBy" | tr ',' ' ' | xargs | tr ' ' ',')}" + fi + case "$metric" in + # Custom nullplatform metrics with specific logic "http.healthcheck_count") - local healthcheck_filters="${filters},is_healthcheck=\"yes\"" - if [[ -n "$groupBy" ]]; then - echo "sum(rate(nullplatform_http_response_time_count{$healthcheck_filters}[$interval])) by ($groupBy)" - else - echo "sum(rate(nullplatform_http_response_time_count{$healthcheck_filters}[$interval]))" - fi + echo "sum:nullplatform.scope.request_count{$filters,is_healthcheck:yes} by {instance_id}.rollup(sum, $rollup_step)" ;; "http.healthcheck_fail") - local healthcheck_filters="${filters},is_healthcheck=\"yes\"" - local ok_filters="${filters},is_healthcheck=\"yes\",quality=\"OK (2XX, 3XX)\"" - if [[ -n "$groupBy" ]]; then - echo "sum(rate(nullplatform_http_response_time_count{$healthcheck_filters}[$interval])) by ($groupBy) - sum(rate(nullplatform_http_response_time_count{$ok_filters}[$interval])) by ($groupBy)" - else - echo "sum(rate(nullplatform_http_response_time_count{$healthcheck_filters}[$interval])) - sum(rate(nullplatform_http_response_time_count{$ok_filters}[$interval]))" - fi - ;; - "system.cpu_usage_percentage_by_instance") - echo 
"avg(nullplatform_system_cpu_usage_percentage{$filters}) by (instance_id)" + echo "sum:nullplatform.scope.request_count{$filters,is_healthcheck:yes} by {instance_id}.rollup(sum, $rollup_step) - sum:nullplatform.scope.request_count{$filters,is_healthcheck:yes,quality:ok_2xx_3xx} by {instance_id}.rollup(sum, $rollup_step)" ;; "system.memory_usage_percentage") - if [[ -n "$groupBy" ]]; then - echo "avg(nullplatform_system_memory_usage_percentage{$filters}) by ($groupBy)" - else - echo "avg(nullplatform_system_memory_usage_percentage{$filters})" - fi + echo "avg:nullplatform.scope.memory_usage_percentage{$filters}$group_clause" ;; "system.cpu_usage_percentage") - if [[ -n "$groupBy" ]]; then - echo "avg(nullplatform_system_cpu_usage_percentage{$filters}) by ($groupBy)" - else - echo "avg(nullplatform_system_cpu_usage_percentage{$filters})" - fi + echo "avg:nullplatform.scope.cpu_usage_percentage{$filters}$group_clause" ;; "system.used_memory_kb") - if [[ -n "$groupBy" ]]; then - echo "avg(nullplatform_system_used_memory_kb{$filters}) by ($groupBy)" - else - echo "avg(nullplatform_system_used_memory_kb{$filters})" - fi + echo "avg:nullplatform.scope.memory_usage_kb{$filters}$group_clause" ;; "http.response_time") - if [[ -n "$groupBy" ]]; then - echo "sum(idelta(nullplatform_http_response_time{$filters}[$interval])) by ($groupBy)/sum(idelta(nullplatform_http_response_time_count{$filters}[$interval])) by ($groupBy)" - else - echo "sum(idelta(nullplatform_http_response_time{$filters}[$interval]))/sum(idelta(nullplatform_http_response_time_count{$filters}[$interval]))" - fi + echo "sum:nullplatform.scope.response_time{$filters}$group_clause.rollup(sum, 60) / sum:nullplatform.scope.request_count{$filters}$group_clause.rollup(sum, 60)" ;; "http.rpm") - if [[ -n "$groupBy" ]]; then - echo "sum(rate(nullplatform_http_response_time_count{$filters}[$interval])) by ($groupBy) * 60" - else - echo "sum(rate(nullplatform_http_response_time_count{$filters}[$interval])) * 60" - fi + 
echo "sum:nullplatform.scope.request_count{$filters}$group_clause.rollup(sum, 60)" ;; "http.error_rate") - local base_filters="$filters" - local ok_filters="${filters},quality=\"OK (2XX, 3XX)\"" - if [[ -n "$groupBy" ]]; then - echo "((sum(rate(nullplatform_http_response_time_count{${base_filters}}[$interval])) by ($groupBy) * 60 - sum(rate(nullplatform_http_response_time_count{${ok_filters}}[$interval])) by ($groupBy) * 60) / (sum(rate(nullplatform_http_response_time_count{${base_filters}}[$interval])) by ($groupBy) * 60 )) *100" - else - echo "((sum(rate(nullplatform_http_response_time_count{${base_filters}}[$interval])) * 60 - sum(rate(nullplatform_http_response_time_count{${ok_filters}}[$interval])) * 60) / (sum(rate(nullplatform_http_response_time_count{${base_filters}}[$interval])) * 60 )) *100" - fi - ;; - "cronjob.execution_count") - echo "sum by (scope_id) (label_replace(increase(kube_job_status_succeeded{job_name=~\"job-${SCOPE_ID}-.*\"}[$interval]) +increase(kube_job_status_failed{job_name=~\"job-${SCOPE_ID}-.*\"}[$interval]),\"scope_id\", \"\$1\", \"job_name\", \"job-([0-9]+)-.*\"))" - ;; - "cronjob.success_count") - echo "sum by (scope_id) (label_replace(increase(kube_job_status_succeeded{job_name=~\"job-${SCOPE_ID}-.*\"}[$interval]), \"scope_id\", \"\$1\", \"job_name\", \"job-([0-9]+)-.*\"))" - ;; - "cronjob.failure_count") - echo "sum by (scope_id) (label_replace(increase(kube_job_status_failed{job_name=~\"job-${SCOPE_ID}-.*\"}[$interval]), \"scope_id\", \"\$1\", \"job_name\", \"job-([0-9]+)-.*\"))" + echo "((sum:nullplatform.scope.request_count{$filters}$group_clause.rollup(sum, 60) - sum:nullplatform.scope.request_count{$filters,quality:ok_2xx_3xx}$group_clause.rollup(sum, 60)) / sum:nullplatform.scope.request_count{$filters}$group_clause.rollup(sum, 60)) * 100" ;; - "cronjob.cpu_usage") - echo "avg(avg_over_time(container_cpu_usage_seconds_total{pod=~\"job-${SCOPE_ID}-.*\", container!=\"\", container!=\"POD\"}[$interval])) * 100" - ;; - 
"cronjob.memory_usage") - echo "avg by (scope_id) (label_replace(container_memory_usage_bytes{pod=~\"job-${SCOPE_ID}-.*\", container!=\"\", container!=\"POD\"} / on(pod, container) kube_pod_container_resource_limits{resource=\"memory\", unit=\"byte\", pod=~\"job-${SCOPE_ID}-.*\"}, \"scope_id\", \"\$1\", \"pod\", \"job-([0-9]+)-.*\")) * 100" + "trace.http.request.p99") + local env_name=$(echo "$CONTEXT" | jq -r '.service.dimensions.environment') + local service_name=$(basename $(echo "$CONTEXT" | jq -r '.tags.repository_url'))-kubernetes + + local p99_interval=$((end_time - start_time)) + echo "p99:trace.http.request{service:$service_name,env:$env_name}.rollup(avg, $p99_interval) * 1000" ;; + + # Generic handler for any other Datadog metric *) - echo "up{$filters}" # Default query if metric not recognized + echo "avg:$metric{$filters}$group_clause" ;; esac } -# Query Prometheus and return the result -query_prometheus() { +# Query Datadog API +query_datadog() { local query="$1" local start_time="$2" local end_time="$3" - local step="$4" - - local url="${PROM_URL}/api/v1/query_range" - local params="query=$(urlencode "$query")&start=$start_time&end=$end_time&step=${step}s" - curl -s -G "$url" --data-urlencode "query=$query" --data-urlencode "start=$start_time" --data-urlencode "end=$end_time" --data-urlencode "step=${step}s" -} - -urlencode() { - local string="${1}" - local strlen=${#string} - local encoded="" - local pos c o + local base_url="https://api.${DATADOG_SITE:-datadoghq.com}" + local url="${base_url}/api/v2/query/timeseries" + + # Build JSON payload for v2 API + local payload=$(jq -n \ + --arg query "$query" \ + --arg from "$start_time" \ + --arg to "$end_time" \ + '{ + data: { + type: "timeseries_request", + attributes: { + formulas: [{ + formula: "a" + }], + queries: [{ + name: "a", + query: $query, + data_source: "metrics" + }], + from: ($from | tonumber * 1000), + to: ($to | tonumber * 1000), + interval: 60000 + } + } + }') - for (( pos=0 ; pos/dev/null 
|| echo "0") - now=$(echo "$END_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -d "{}" +%s 2>/dev/null || echo "0") - step=${PERIOD:-60} - # Calculate interval like JavaScript service: period/60 + "m" - if [[ -n "$PERIOD" && "$PERIOD" -gt 0 ]]; then - interval_minutes=$((PERIOD / 60)) - if [[ $interval_minutes -lt 1 ]]; then - interval_minutes=1 - fi - # Use minimum 5m interval for HTTP metrics to ensure sufficient data points - case "$METRIC_NAME" in - "http.error_rate"|"http.response_time"|"http.rpm"|"http.healthcheck_count"|"http.healthcheck_fail") - if [[ $interval_minutes -lt 5 ]]; then - interval_minutes=5 - fi - ;; - esac - INTERVAL="${interval_minutes}m" + # For macOS compatibility, use a different date parsing approach + if [[ "$OSTYPE" == "darwin"* ]]; then + # macOS date command + start_time=$(echo "$START_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -j -f "%Y-%m-%d %H:%M:%S" "{}" +%s 2>/dev/null || echo "0") + end_time=$(echo "$END_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -j -f "%Y-%m-%d %H:%M:%S" "{}" +%s 2>/dev/null || echo "0") else - INTERVAL="5m" + # Linux date command + start_time=$(echo "$START_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -d "{}" +%s 2>/dev/null || echo "0") + end_time=$(echo "$END_TIME" | sed 's/T/ /' | sed 's/\.[0-9]*Z$//' | xargs -I {} date -u -d "{}" +%s 2>/dev/null || echo "0") + fi + # Use 30 second intervals for queries 6 minutes or less + time_diff=$((end_time - start_time)) + if [[ $time_diff -le 360 ]]; then + step=${PERIOD:-30} + else + step=${PERIOD:-60} fi else # Fallback to TIME_RANGE logic - now=$(date +%s) + end_time=$(date +%s) case "$TIME_RANGE" in *h) hours=${TIME_RANGE%h} - start_time=$((now - hours * 3600)) + start_time=$((end_time - hours * 3600)) ;; *m) minutes=${TIME_RANGE%m} - start_time=$((now - minutes * 60)) + start_time=$((end_time - minutes * 60)) ;; *d) days=${TIME_RANGE%d} - start_time=$((now - days * 86400)) - ;; 
- *) - start_time=$((now - 3600)) - ;; - esac - - case "$INTERVAL" in - *h) - hours=${INTERVAL%h} - step=$((hours * 3600)) - ;; - *m) - minutes=${INTERVAL%m} - step=$((minutes * 60)) - ;; - *s) - step=${INTERVAL%s} + start_time=$((end_time - days * 86400)) ;; *) - step=60 + start_time=$((end_time - 3600)) ;; esac + # Use 30 second intervals for queries 6 minutes or less + time_diff=$((end_time - start_time)) + if [[ $time_diff -le 360 ]]; then + step=${PERIOD:-30} + else + step=${PERIOD:-60} + fi fi config=$(get_metric_config) @@ -292,35 +236,85 @@ metric_type=$(echo $config | cut -d' ' -f1) unit=$(echo $config | cut -d' ' -f2) filters=$(build_filters) -query=$(build_query "$METRIC_NAME" "$filters" "$INTERVAL") +query=$(build_datadog_query "$METRIC_NAME" "$filters" "$step" "$start_time" "$end_time") + +response=$(query_datadog "$query" "$start_time" "$end_time") -response=$(query_prometheus "$query" "$start_time" "$now" "$step") transform_response() { local response="$1" - local status=$(echo "$response" | jq -r '.status') - - if [[ "$status" != "success" ]]; then + + # Check if response contains error + local error=$(echo "$response" | jq -r '.errors // empty') + if [[ -n "$error" ]]; then echo "[]" return fi - local results=$(echo "$response" | jq '.data.result') - - if [[ "$results" == "[]" || "$results" == "null" ]]; then + # Extract timeseries data from v2 API response + local series=$(echo "$response" | jq -r '.data.attributes.series // []') + + if [[ "$series" == "[]" || "$series" == "null" ]]; then echo "[]" return fi - echo "$results" | jq 'map({ - selector: .metric, - data: .values | map({ - timestamp: (.[0] | tonumber | todate), - value: (.[1] | tonumber) - }) - })' + # Transform v2 API response - combine times and values arrays into data points + echo "$response" | jq ' + (.data.attributes.times // []) as $times | + (.data.attributes.values // [[]]) as $values | + (.data.attributes.series // []) | to_entries | map( + .key as $idx | + .value | { + 
selector: ( + if .group_tags then + if (.group_tags | type) == "array" then + if (.group_tags | length) == 0 then + {} + else + # Convert array of "key:value" strings to object + .group_tags | map(split(":") | {(.[0]): .[1]}) | add + end + else + .group_tags + end + elif .scope then + if (.scope | type) == "string" then + .scope | split(",") | map(split(":") | {(.[0]): .[1]}) | add + else + .scope + end + else + {} + end + ), + data: ( + if ($values | length) > 0 and ($times | length) > 0 then + [range($times | length)] | map({ + timestamp: ($times[.] / 1000 | todate), + value: ($values[$idx][.] // 0) + }) + else + [] + end + ) + } + )' } transformed_results=$(transform_response "$response") -echo "{\"metric\":\"$METRIC_NAME\",\"type\":\"$metric_type\",\"period_in_seconds\":$step,\"unit\":\"$unit\",\"results\":$transformed_results}" +# Output compact JSON without formatting +jq -c -n \ + --arg metric "$METRIC_NAME" \ + --arg type "$metric_type" \ + --arg period "$step" \ + --arg unit "$unit" \ + --argjson results "$transformed_results" \ + '{ + metric: $metric, + type: $type, + period_in_seconds: ($period | tonumber), + unit: $unit, + results: $results + }' \ No newline at end of file diff --git a/k8s/metric/workflows/metric.yaml b/k8s/metric/workflows/metric.yaml index d5a94430..d7b668c2 100644 --- a/k8s/metric/workflows/metric.yaml +++ b/k8s/metric/workflows/metric.yaml @@ -4,6 +4,6 @@ steps: file: "$SERVICE_PATH/metric/build_context" configuration: K8S_NAMESPACE: nullplatform - - name: logs + - name: metric type: script file: "$SERVICE_PATH/metric/metric" \ No newline at end of file diff --git a/k8s/scope/iam/build_service_account b/k8s/scope/iam/build_service_account index 15c820ed..b3f52676 100644 --- a/k8s/scope/iam/build_service_account +++ b/k8s/scope/iam/build_service_account @@ -23,6 +23,11 @@ SERVICE_ACCOUNT_NAME=$(echo "$IAM" | jq -r .PREFIX)-"$SCOPE_ID" echo "Looking for IAM role: $SERVICE_ACCOUNT_NAME" ROLE_ARN=$(aws iam get-role --role-name 
"$SERVICE_ACCOUNT_NAME" --query 'Role.Arn' --output text 2>&1) || { + if [[ "${ACTION:-}" == "delete" ]] && [[ "$ROLE_ARN" == *"NoSuchEntity"* ]] && [[ "$ROLE_ARN" == *"cannot be found"* ]]; then + echo "IAM role '$SERVICE_ACCOUNT_NAME' does not exist, skipping service account deletion" + return 0 + fi + echo "ERROR: Failed to find IAM role '$SERVICE_ACCOUNT_NAME'" echo "AWS Error: $ROLE_ARN" echo "Make sure the role exists and you have IAM permissions" diff --git a/k8s/scope/iam/delete_role b/k8s/scope/iam/delete_role index 08bfc678..3a9eb826 100755 --- a/k8s/scope/iam/delete_role +++ b/k8s/scope/iam/delete_role @@ -11,6 +11,18 @@ if [[ "$IAM_ENABLED" == "false" || "$IAM_ENABLED" == "null" ]]; then return fi +ROLE_ARN=$(aws iam get-role --role-name "$SERVICE_ACCOUNT_NAME" --query 'Role.Arn' --output text 2>&1) || { + if [[ "$ROLE_ARN" == *"NoSuchEntity"* ]] && [[ "$ROLE_ARN" == *"cannot be found"* ]]; then + echo "IAM role '$SERVICE_ACCOUNT_NAME' does not exist, skipping role deletion" + return 0 + fi + + echo "ERROR: Failed to find IAM role '$SERVICE_ACCOUNT_NAME'" + echo "AWS Error: $ROLE_ARN" + echo "Make sure the role exists and you have IAM permissions" + exit 1 +} + ROLE_NAME=$(echo "$IAM" | jq -r .PREFIX)-"$SCOPE_ID" echo "Detaching managed policies..." 
diff --git a/k8s/scope/networking/dns/manage_dns b/k8s/scope/networking/dns/manage_dns
index a2ab62e3..eed162f9 100755
--- a/k8s/scope/networking/dns/manage_dns
+++ b/k8s/scope/networking/dns/manage_dns
@@ -5,6 +5,14 @@ set -euo pipefail
 echo "Managing DNS records"
 echo "DNS Type: $DNS_TYPE"
 echo "Action: $ACTION"
+# SCOPE_DOMAIN may be unset; default it so 'set -u' does not abort before the guard below runs.
+echo "Scope Domain: ${SCOPE_DOMAIN:-}"
+
+# Nothing to remove from DNS when the scope never got a real domain.
+if [[ "$ACTION" == "DELETE" ]] && [[ -z "${SCOPE_DOMAIN:-}" || "${SCOPE_DOMAIN:-}" == "To be defined" ]]; then
+  echo "Skipping route53 action as the scope has no domain"
+  return 0
+fi
 
 case "$DNS_TYPE" in
   route53)
diff --git a/k8s/scope/networking/dns/route53/manage_route b/k8s/scope/networking/dns/route53/manage_route
index c9ecd9e1..5a415b65 100644
--- a/k8s/scope/networking/dns/route53/manage_route
+++ b/k8s/scope/networking/dns/route53/manage_route
@@ -74,12 +74,19 @@ for ZONE_ID in "${HOSTED_ZONES[@]}"; do
         }
       ]
     }" 2>&1) || {
-    echo "ERROR: Failed to create Route53 record"
+
+    # On DELETE, a record that is already absent is skipped instead of failing the whole run.
+    if [[ "$ACTION" == "DELETE" ]] && [[ "$ROUTE53_OUTPUT" == *"InvalidChangeBatch"* ]] && [[ "$ROUTE53_OUTPUT" == *"but it was not found"* ]]; then
+      echo "Route53 record for $SCOPE_DOMAIN does not exist in zone $ZONE_ID, skipping deletion"
+      continue
+    fi
+
+    echo "ERROR: Failed to $ACTION Route53 record"
     echo "Zone ID: $ZONE_ID"
     echo "AWS Error: $ROUTE53_OUTPUT"
     echo "This often happens when the agent lacks Route53 permissions"
     exit 1
   }
 
-  echo "Successfully created Route53 record"
+  echo "Successfully $ACTION Route53 record"
 done
diff --git a/k8s/scope/workflows/create.yaml b/k8s/scope/workflows/create.yaml
index e8f19841..c57caa2a 100644
--- a/k8s/scope/workflows/create.yaml
+++ b/k8s/scope/workflows/create.yaml
@@ -22,6 +22,8 @@ steps:
   - name: build service account
     type: script
     file: "$SERVICE_PATH/scope/iam/build_service_account"
+    configuration:
+      ACTION: create
     output:
       - name: SERVICE_ACCOUNT_TEMPLATE_PATH
         type: file
diff --git a/k8s/scope/workflows/delete.yaml b/k8s/scope/workflows/delete.yaml
index 6022c44f..541f53ad 100644
--- a/k8s/scope/workflows/delete.yaml
+++ b/k8s/scope/workflows/delete.yaml
@@ -36,6 +36,8 @@ steps:
   - name: build service account
     type: script
     file: "$SERVICE_PATH/scope/iam/build_service_account"
+    configuration:
+      ACTION: delete
     output:
       - name: SERVICE_ACCOUNT_TEMPLATE_PATH
         type: file
diff --git a/k8s/specs/service-spec.json.tpl b/k8s/specs/service-spec.json.tpl
index 4458a423..3a6d4efc 100644
--- a/k8s/specs/service-spec.json.tpl
+++ b/k8s/specs/service-spec.json.tpl
@@ -100,6 +100,23 @@
             {
               "type":"Control",
               "scope":"#/properties/autoscaling/properties/target_cpu_utilization"
+            },
+            {
+              "type": "Control",
+              "scope": "#/properties/autoscaling/properties/target_memory_enabled"
+            },
+            {
+              "rule": {
+                "effect": "SHOW",
+                "condition": {
+                  "scope": "#/properties/autoscaling/properties/target_memory_enabled",
+                  "schema": {
+                    "const": true
+                  }
+                }
+              },
+              "type": "Control",
+              "scope": "#/properties/autoscaling/properties/target_memory_utilization"
             }
           ]
         }
@@ -363,6 +380,19 @@
           "maximum":90,
           "minimum":50,
           "description":"CPU utilization threshold that triggers scaling"
+        },
+        "target_memory_enabled": {
+          "type": "boolean",
+          "title": "Scale by memory",
+          "default": false
+        },
+        "target_memory_utilization": {
+          "type": "integer",
+          "title": "Target memory utilization (%)",
+          "default": 70,
+          "maximum": 90,
+          "minimum": 50,
+          "description": "Memory utilization threshold that triggers scaling"
         }
       }
     },
@@ -546,6 +576,18 @@
             }
           },
           "description":"Configure automatic deployment from Git branches"
+        },
+        "custom_domains": {
+          "type": "object",
+          "required": [
+            "enabled"
+          ],
+          "properties": {
+            "enabled": {
+              "type": "boolean",
+              "default": true
+            }
+          }
         }
       }
     }