From 326a02777c9dea3facf866a7dcaa4357348c7b60 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 15:41:25 +1100 Subject: [PATCH 01/20] refactor autoscale to fit ms complex test requirement --- .../clusterloader2/autoscale/autoscale.py | 237 +++++++++++------- .../autoscale/config/ms_complex_config.yaml | 165 ++++++++++++ ...de-auto-provisioning-benchmark-complex.yml | 5 +- .../nap/terraform-inputs/azure-complex.tfvars | 188 +++++++------- .../clusterloader2/autoscale/collect.yml | 2 +- .../clusterloader2/autoscale/execute.yml | 7 +- 6 files changed, 415 insertions(+), 189 deletions(-) create mode 100644 modules/python/clusterloader2/autoscale/config/ms_complex_config.yaml diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index 188ca5e4c3..8ebb3934a7 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -5,7 +5,7 @@ import subprocess from datetime import datetime, timezone -from clusterloader2.utils import parse_xml_to_json, run_cl2_command +from clusterloader2.utils import parse_xml_to_json, run_cl2_command, process_cl2_reports from clients.kubernetes_client import KubernetesClient from utils.logger_config import get_logger, setup_logging @@ -60,7 +60,7 @@ def calculate_cpu_request_for_clusterloader2(node_label_selector, node_count, po cpu_request = int(cpu_request * 0.95) return cpu_request -def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up_timeout, scale_down_timeout, loop_count, node_label_selector, node_selector, override_file, warmup_deployment, cl2_config_dir, os_type="linux", warmup_deployment_template="", deployment_template=""): +def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up_timeout, scale_down_timeout, loop_count, node_label_selector, node_selector, override_file, warmup_deployment, cl2_config_dir, os_type="linux", 
warmup_deployment_template="", deployment_template="", pod_cpu_request=0, pod_memory_request=""): logger.info(f"CPU per node: {cpu_per_node}") desired_node_count = 1 if warmup_deployment in ["true", "True"]: @@ -72,26 +72,113 @@ def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up logger.info(f"Total number of nodes: {node_count}, total number of pods: {pod_count}") logger.info(f"CPU request for each pod: {cpu_request}m") - # assuming the number of surge nodes is no more than 10 + is_complex = override_file == "ms_complex_config.yaml" + with open(override_file, 'w', encoding='utf-8') as file: file.write(f"CL2_DEPLOYMENT_CPU: {cpu_request}m\n") - file.write(f"CL2_MIN_NODE_COUNT: {node_count}\n") - file.write(f"CL2_MAX_NODE_COUNT: {node_count + 10}\n") - file.write(f"CL2_DESIRED_NODE_COUNT: {desired_node_count}\n") + file.write(f"CL2_DEPLOYMENT_MEMORY: {pod_memory_request}\n") file.write(f"CL2_DEPLOYMENT_SIZE: {pod_count}\n") file.write(f"CL2_SCALE_UP_TIMEOUT: {scale_up_timeout}\n") file.write(f"CL2_SCALE_DOWN_TIMEOUT: {scale_down_timeout}\n") file.write(f"CL2_LOOP_COUNT: {loop_count}\n") - file.write(f"CL2_NODE_LABEL_SELECTOR: {node_label_selector}\n") + + if not is_complex: + file.write(f"CL2_MIN_NODE_COUNT: {node_count}\n") + file.write(f"CL2_MAX_NODE_COUNT: {node_count + 10}\n") + file.write(f"CL2_DESIRED_NODE_COUNT: {desired_node_count}\n") + file.write(f"CL2_NODE_LABEL_SELECTOR: {node_label_selector}\n") + file.write(f"CL2_NODE_SELECTOR: \"{node_selector}\"\n") file.write(f"CL2_OS_TYPE: {os_type}\n") - if deployment_template !='': + + if deployment_template: file.write(f"CL2_DEPLOYMENT_TEMPLATE_PATH: {deployment_template}\n") - file.close() -def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider): - run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, overrides=True) +def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider, 
cl2_config_file="config.yaml"): + run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file, overrides=True) + +def _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url, autoscale_type="up", cpu_per_node=None, node_count=None, data=None, is_complex=False): + """Build CL2 measurement template""" + result = { + "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), + "autoscale_type": autoscale_type, + "capacity_type": capacity_type, + "pod_count": pod_count, + "data": data if data is not None else {}, + "cloud_info": cloud_info, + "run_id": run_id, + "run_url": run_url + } + + # Only add node-specific info for non-complex configs + if not is_complex: + result["cpu_per_node"] = cpu_per_node + result["node_count"] = node_count + if is_complex: # cl2 measurement + result["group"] = None + result["measurement"] = None + result["result"] = None + + return result + +def _process_test_results(testsuites, index_pattern, metric_mappings, cpu_per_node, capacity_type, node_count, pod_count, cloud_info, run_id, run_url, is_complex_config=False): + """Process test results and generate JSON content""" + summary = {} + + # Define which metrics to include in data based on config type + if is_complex_config: + data_metrics = ["wait_for_pods_seconds"] + else: + data_metrics = [ + "wait_for_nodes_seconds", + "wait_for_50Perc_nodes_seconds", + "wait_for_70Perc_nodes_seconds", + "wait_for_90Perc_nodes_seconds", + "wait_for_99Perc_nodes_seconds", + "wait_for_pods_seconds" + ] + + # Process each loop + for testcase in testsuites[0]["testcases"]: + name = testcase["name"] + index = -1 + match = index_pattern.search(name) + if match: + index = match.group() + if index not in summary: + summary[index] = { + "up": { "failures": 0 }, + "down": { "failures": 0 } + } + else: + continue + + failure = testcase["failure"] + for test_key, (category, summary_key) in metric_mappings.items(): + if test_key in name: + 
summary[index][category][summary_key] = -1 if failure else testcase["time"] + summary[index][category]["failures"] += 1 if failure else 0 + break # Exit loop once matched + + content = "" + for index, inner_dict in summary.items(): + for key, value in inner_dict.items(): + # Build data dict dynamically based on available metrics + data = {metric: value.get(metric) for metric in data_metrics} + data["autoscale_result"] = "success" if value["failures"] == 0 else "failure" + + # For complex configs, don't include cpu_per_node and node_count + result = _build_report_template( + capacity_type, pod_count, cloud_info, run_id, run_url, + autoscale_type=key, + cpu_per_node=cpu_per_node, + node_count=node_count, + data=data, is_complex=is_complex_config + ) + content += json.dumps(result) + "\n" + + return content def collect_clusterloader2( cpu_per_node, @@ -102,83 +189,51 @@ def collect_clusterloader2( cloud_info, run_id, run_url, - result_file + result_file, + cl2_config_file ): index_pattern = re.compile(r'(\d+)$') raw_data = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent = 2) json_data = json.loads(raw_data) testsuites = json_data["testsuites"] - summary = {} - metric_mappings = { - "WaitForRunningPodsUp": ("up", "wait_for_pods_seconds"), - "WaitForNodesUpPerc50": ("up", "wait_for_50Perc_nodes_seconds"), - "WaitForNodesUpPerc70": ("up", "wait_for_70Perc_nodes_seconds"), - "WaitForNodesUpPerc90": ("up", "wait_for_90Perc_nodes_seconds"), - "WaitForNodesUpPerc99": ("up", "wait_for_99Perc_nodes_seconds"), - "WaitForNodesUpPerc100": ("up", "wait_for_nodes_seconds"), - "WaitForRunningPodsDown": ("down", "wait_for_pods_seconds"), - "WaitForNodesDownPerc50": ("down", "wait_for_50Perc_nodes_seconds"), - "WaitForNodesDownPerc70": ("down", "wait_for_70Perc_nodes_seconds"), - "WaitForNodesDownPerc90": ("down", "wait_for_90Perc_nodes_seconds"), - "WaitForNodesDownPerc99": ("down", "wait_for_99Perc_nodes_seconds"), - "WaitForNodesDownPerc100": ("down", 
"wait_for_nodes_seconds"), - } + + # Different metric mappings based on config file type + is_complex_config = "ms_complex_config" in cl2_config_file + + if is_complex_config: + # Metric mappings for complex config + metric_mappings = { + "WaitForRunningPodsUp": ("up", "wait_for_pods_seconds"), + "WaitForRunningPodsDown": ("down", "wait_for_pods_seconds"), + } + else: + # Metric mappings for standard config + metric_mappings = { + "WaitForRunningPodsUp": ("up", "wait_for_pods_seconds"), + "WaitForNodesUpPerc50": ("up", "wait_for_50Perc_nodes_seconds"), + "WaitForNodesUpPerc70": ("up", "wait_for_70Perc_nodes_seconds"), + "WaitForNodesUpPerc90": ("up", "wait_for_90Perc_nodes_seconds"), + "WaitForNodesUpPerc99": ("up", "wait_for_99Perc_nodes_seconds"), + "WaitForNodesUpPerc100": ("up", "wait_for_nodes_seconds"), + "WaitForRunningPodsDown": ("down", "wait_for_pods_seconds"), + "WaitForNodesDownPerc50": ("down", "wait_for_50Perc_nodes_seconds"), + "WaitForNodesDownPerc70": ("down", "wait_for_70Perc_nodes_seconds"), + "WaitForNodesDownPerc90": ("down", "wait_for_90Perc_nodes_seconds"), + "WaitForNodesDownPerc99": ("down", "wait_for_99Perc_nodes_seconds"), + "WaitForNodesDownPerc100": ("down", "wait_for_nodes_seconds"), + } if testsuites: - # Process each loop - for testcase in testsuites[0]["testcases"]: - name = testcase["name"] - index = -1 - match = index_pattern.search(name) - if match: - index = match.group() - if index not in summary: - summary[index] = { - "up": { "failures": 0 }, - "down": { "failures": 0 } - } - else: - continue - - failure = testcase["failure"] - for test_key, (category, summary_key) in metric_mappings.items(): - if test_key in name: - summary[index][category][summary_key] = -1 if failure else testcase["time"] - summary[index][category]["failures"] += 1 if failure else 0 - break # Exit loop once matched - - content = "" - for index, inner_dict in summary.items(): - for key, value in inner_dict.items(): - data = { - "wait_for_nodes_seconds": 
value["wait_for_nodes_seconds"], - "wait_for_50Perc_nodes_seconds": value["wait_for_50Perc_nodes_seconds"], - "wait_for_70Perc_nodes_seconds": value["wait_for_70Perc_nodes_seconds"], - "wait_for_90Perc_nodes_seconds": value["wait_for_90Perc_nodes_seconds"], - "wait_for_99Perc_nodes_seconds": value["wait_for_99Perc_nodes_seconds"], - "wait_for_pods_seconds": value["wait_for_pods_seconds"], - "autoscale_result": "success" if value["failures"] == 0 else "failure" - } - # TODO: Expose optional parameter to include test details - result = { - "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), - "autoscale_type": key, - "cpu_per_node": cpu_per_node, - "capacity_type": capacity_type, - "node_count": node_count, - "pod_count": pod_count, - "data": data, - # "raw_data": raw_data, - "cloud_info": cloud_info, - "run_id": run_id, - "run_url": run_url - } - content += json.dumps(result) + "\n" + content = _process_test_results(testsuites, index_pattern, metric_mappings, cpu_per_node, capacity_type, node_count, pod_count, cloud_info, run_id, run_url, is_complex_config) else: raise Exception(f"No testsuites found in the report! 
Raw data: {raw_data}") - + if is_complex_config: + cl2_measurement = _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url, data={}, is_complex=is_complex_config) + cl2_result = process_cl2_reports(cl2_report_dir, cl2_measurement) + logger.info(f"Result, category up: {cl2_result}") + content += cl2_result os.makedirs(os.path.dirname(result_file), exist_ok=True) with open(result_file, 'w', encoding='utf-8') as file: file.write(content) @@ -189,13 +244,13 @@ def main(): # Sub-command for override_config_clusterloader2 parser_override = subparsers.add_parser("override", help="Override CL2 config file") - parser_override.add_argument("cpu_per_node", type=int, help="Name of cpu cores per node") - parser_override.add_argument("node_count", type=int, help="Number of nodes") - parser_override.add_argument("pod_count", type=int, help="Number of pods") + parser_override.add_argument("cpu_per_node", type=int,default=0, help="Name of cpu cores per node") + parser_override.add_argument("node_count", type=int,default=0, help="Number of nodes") + parser_override.add_argument("pod_count", type=int,default=10, help="Number of pods") parser_override.add_argument("scale_up_timeout", type=str, help="Timeout before failing the scale up test") parser_override.add_argument("scale_down_timeout", type=str, help="Timeout before failing the scale down test") parser_override.add_argument("loop_count", type=int, help="Number of times to repeat the test") - parser_override.add_argument("node_label_selector", type=str, help="Node label selector") + parser_override.add_argument("node_label_selector", type=str, default="", help="Node label selector") parser_override.add_argument("node_selector", type=str, help="Node selector for the test pods") parser_override.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file") parser_override.add_argument("warmup_deployment", type=str, help="Warmup deployment to get the cpu request") @@ -203,7 +258,8 
@@ def main(): parser_override.add_argument("--os_type", type=str, choices=["linux", "windows"], default="linux", help="Operating system type for the node pools") parser_override.add_argument("--warmup_deployment_template", type=str, default="", help="Path to the CL2 warm up deployment file") parser_override.add_argument("--deployment_template", type=str, default="", help="Path to the CL2 deployment file") - + parser_override.add_argument("--pod_cpu_request", type=int, default=0, help="CPU request for each pod") + parser_override.add_argument("--pod_memory_request", type=str, default="60Gi", help="Memory request for each pod") # Sub-command for execute_clusterloader2 parser_execute = subparsers.add_parser("execute", help="Execute scale up operation") parser_execute.add_argument("cl2_image", type=str, help="Name of the CL2 image") @@ -211,27 +267,28 @@ def main(): parser_execute.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory") parser_execute.add_argument("kubeconfig", type=str, help="Path to the kubeconfig file") parser_execute.add_argument("provider", type=str, help="Cloud provider name") - + parser_execute.add_argument("--cl2_config_file", type=str, default="config.yaml", help="Path to the CL2 config file") # Sub-command for collect_clusterloader2 parser_collect = subparsers.add_parser("collect", help="Collect scale up data") - parser_collect.add_argument("cpu_per_node", type=int, help="Name of cpu cores per node") + parser_collect.add_argument("cpu_per_node", type=int,default=0, help="Name of cpu cores per node") parser_collect.add_argument("capacity_type", type=str, help="Capacity type", choices=["on-demand", "spot"], default="on-demand") - parser_collect.add_argument("node_count", type=int, help="Number of nodes") - parser_collect.add_argument("pod_count", type=int, help="Number of pods") + parser_collect.add_argument("node_count", type=int, default=0, help="Number of nodes") + parser_collect.add_argument("pod_count", type=int, 
default=0, help="Number of pods") parser_collect.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory") parser_collect.add_argument("cloud_info", type=str, help="Cloud information") parser_collect.add_argument("run_id", type=str, help="Run ID") parser_collect.add_argument("run_url", type=str, help="Run URL") parser_collect.add_argument("result_file", type=str, help="Path to the result file") - + parser_collect.add_argument("--cl2_config_file", type=str, default="config.yaml", help="Path to the CL2 config file") + args = parser.parse_args() if args.command == "override": - override_config_clusterloader2(args.cpu_per_node, args.node_count, args.pod_count, args.scale_up_timeout, args.scale_down_timeout, args.loop_count, args.node_label_selector, args.node_selector, args.cl2_override_file, args.warmup_deployment, args.cl2_config_dir, args.os_type, args.warmup_deployment_template, args.deployment_template) + override_config_clusterloader2(args.cpu_per_node, args.node_count, args.pod_count, args.scale_up_timeout, args.scale_down_timeout, args.loop_count, args.node_label_selector, args.node_selector, args.cl2_override_file, args.warmup_deployment, args.cl2_config_dir, args.os_type, args.warmup_deployment_template, args.deployment_template,pod_cpu_request=args.pod_cpu_request, pod_memory_request=args.pod_memory_request) elif args.command == "execute": - execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider) + execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider, args.cl2_config_file) elif args.command == "collect": - collect_clusterloader2(args.cpu_per_node, args.capacity_type, args.node_count, args.pod_count, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file) + collect_clusterloader2(args.cpu_per_node, args.capacity_type, args.node_count, args.pod_count, args.cl2_report_dir, args.cloud_info, 
args.run_id, args.run_url, args.result_file, args.cl2_config_file) if __name__ == "__main__": main() diff --git a/modules/python/clusterloader2/autoscale/config/ms_complex_config.yaml b/modules/python/clusterloader2/autoscale/config/ms_complex_config.yaml new file mode 100644 index 0000000000..56e6d15dcf --- /dev/null +++ b/modules/python/clusterloader2/autoscale/config/ms_complex_config.yaml @@ -0,0 +1,165 @@ +{{$deploymentTemplatePath := DefaultParam .CL2_DEPLOYMENT_TEMPLATE_PATH "deployment_template.yaml"}} +{{$deploymentSize := DefaultParam .CL2_DEPLOYMENT_SIZE 100}} +{{$deploymentCpu := DefaultParam .CL2_DEPLOYMENT_CPU "346m"}} +{{$deploymentMemory := DefaultParam .CL2_DEPLOYMENT_MEMORY "100Mi"}} +{{$nodeSelector := DefaultParam .CL2_NODE_SELECTOR "{karpenter.sh/nodepool: default}"}} +{{$podLabelSelector := DefaultParam .CL2_POD_LABEL_SELECTOR "app = inflate"}} +{{$scaleUpTimeout := DefaultParam .CL2_SCALE_UP_TIMEOUT "30m"}} +{{$scaleDownTimeout := DefaultParam .CL2_SCALE_DOWN_TIMEOUT "10m"}} +{{$refreshInterval := DefaultParam .CL2_REFRESH_INTERVAL "5s"}} +{{$loopCount := DefaultParam .CL2_LOOP_COUNT 1}} +{{$coolDownTime := DefaultParam .CL2_COOLDOWN_TIME "120s"}} +{{$osType := DefaultParam .CL2_OS_TYPE "linux"}} +{{$countErrorMargin := MultiplyInt .CL2_DEPLOYMENT_SIZE 0.01}} + +name: autoscale +namespace: + number: 1 + prefix: autoscale + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: true + +tuningSets: +- name: Uniform1qps + qpsLoad: + qps: 20 + +steps: +{{range $i := Loop $loopCount}} +- name: Start Measurements {{$i}} + measurements: + - Identifier: ResourceUsageSummary + Method: ResourceUsageSummary + Params: + action: start + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: {{$podLabelSelector}} + threshold: {{$scaleUpTimeout}} + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: start + labelSelector: 
{{$podLabelSelector}} +- name: Create deployment {{$i}} + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: Uniform1qps + objectBundle: + - basename: inflate + objectTemplatePath: {{$deploymentTemplatePath}} + templateFillMap: + Replicas: {{$deploymentSize}} + CPUperJob: {{$deploymentCpu}} + MemoryRequest: {{$deploymentMemory}} + NodeSelector: {{ (StructuralData $nodeSelector) }} + OSType: {{$osType}} +- name: Measure nodes and pods scale up {{$i}} + measurements: + - Identifier: WaitForRunningPodsUp {{$i}} + Method: WaitForRunningPods + Params: + action: start + desiredPodCount: {{$deploymentSize}} + countErrorMargin: {{$countErrorMargin}} + labelSelector: {{$podLabelSelector}} + timeout: {{$scaleUpTimeout}} + refreshInterval: {{$refreshInterval}} +- name: Capture Metrics After Scale Up {{$i}} + measurements: + - Identifier: ResourceMetrics{{$i}} + Method: GenericPrometheusQuery + Params: + action: start + metricName: Resource Metrics Summary + metricVersion: v1 + unit: mixed + queries: + # Node Level Summary + - name: TotalNodes + query: count(kube_node_status_allocatable{resource="cpu"}) + - name: NodeCPUAllocatable + query: sum(kube_node_status_allocatable{resource="cpu"}) + - name: NodeMemoryAllocatable + query: sum(kube_node_status_allocatable{resource="memory"}) + # Node CPU Usage Stats (from kubelet/cAdvisor - container metrics aggregated by node) + - name: NodeCPUUsageAvg + query: avg(sum by (instance) (rate(container_cpu_usage_seconds_total{id="/"}[2m]))) + - name: NodeCPUUsageMax + query: max(sum by (instance) (rate(container_cpu_usage_seconds_total{id="/"}[2m]))) + - name: NodeCPUUsageMin + query: min(sum by (instance) (rate(container_cpu_usage_seconds_total{id="/"}[2m]))) + # Node Memory Usage Stats (from kubelet/cAdvisor - container metrics aggregated by node) + - name: NodeMemoryUsageAvg + query: avg(sum by (instance) (container_memory_working_set_bytes{id="/"})) + - name: NodeMemoryUsageMax + query: max(sum by 
(instance) (container_memory_working_set_bytes{id="/"})) + - name: NodeMemoryUsageMin + query: min(sum by (instance) (container_memory_working_set_bytes{id="/"})) + # Pod Level Summary + - name: TotalPods + query: count(kube_pod_status_phase{phase="Running"}) + # Pod Distribution Summary + - name: PodsPerNodeAvg + query: avg(count by (node) (kube_pod_info{node!=""})) + - name: PodsPerNodeMax + query: max(count by (node) (kube_pod_info{node!=""})) + - name: PodsPerNodeMin + query: min(count by (node) (kube_pod_info{node!=""})) +- name: Gather Measurements {{$i}} + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: gather + - Identifier: ResourceUsageSummary + Method: ResourceUsageSummary + Params: + action: gather + - Identifier: ResourceMetrics{{$i}} + Method: GenericPrometheusQuery + Params: + action: gather +- name: WaitBeforeDelete + measurements: + - Identifier: WaitBeforeDelete + Method: Sleep + Params: + action: start + duration: {{$coolDownTime}} +- name: Delete deployment {{$i}} + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 0 + tuningSet: Uniform1qps + objectBundle: + - basename: inflate + objectTemplatePath: {{$deploymentTemplatePath}} + templateFillMap: + Replicas: {{$deploymentSize}} + CPUperJob: {{$deploymentCpu}} + MemoryRequest: {{$deploymentMemory}} + OSType: {{$osType}} +- name: Measure nodes and pods scale down {{$i}} + measurements: + - Identifier: WaitForRunningPodsDown {{$i}} + Method: WaitForRunningPods + Params: + action: start + desiredPodCount: 0 + labelSelector: {{$podLabelSelector}} + timeout: {{$scaleDownTimeout}} + refreshInterval: {{$refreshInterval}} +{{end}} \ No newline at end of file diff --git a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml 
index 75286b09ba..73f1a10cf5 100644 --- a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml @@ -31,8 +31,9 @@ stages: matrix: complex-nap: cpu_per_node: 2 - node_count: 5 pod_count: 5 + pod_cpu_request: 2 + pod_memory_request: "4Gi" scale_up_timeout: "15m" scale_down_timeout: "15m" node_label_selector: "karpenter.sh/nodepool = default" @@ -42,6 +43,8 @@ stages: warmup_deployment_template: warmup_deployment.yaml vm_size: Standard_D2s_v4 capacity_type: on-demand + cl2_override_file: "ms_complex_config.yaml" + max_parallel: 1 timeout_in_minutes: 60 credential_type: service_connection diff --git a/scenarios/perf-eval/nap/terraform-inputs/azure-complex.tfvars b/scenarios/perf-eval/nap/terraform-inputs/azure-complex.tfvars index 17e75e82ae..610e8766fe 100644 --- a/scenarios/perf-eval/nap/terraform-inputs/azure-complex.tfvars +++ b/scenarios/perf-eval/nap/terraform-inputs/azure-complex.tfvars @@ -4,12 +4,12 @@ scenario_name = "nap" deletion_delay = "2h" owner = "aks" -public_ip_config_list = [ - { - name = "firewall-pip" - count = 1 - } -] +# public_ip_config_list = [ +# { +# name = "firewall-pip" +# count = 1 +# } +# ] network_config_list = [ @@ -33,90 +33,90 @@ network_config_list = [ } ] -firewall_config_list = [ - { - name = "nap-firewall" - network_role = "crud" - sku_tier = "Standard" - subnet_name = "AzureFirewallSubnet" - public_ip_name = "firewall-pip" - threat_intel_mode = "Alert" - dns_proxy_enabled = true - ip_configuration_name = "nap-fw-ipconfig" - application_rule_collections = [ - { - name = "allow-egress" - priority = 100 - action = "Allow" - rules = [ - { - name = "required-services" - source_addresses = ["*"] - target_fqdns = ["*.azure.com", "*.azure.net", - "*.windows.net", "*.azurecr.io", "*.ubuntu.com", "AzureKubernetesService", - "mcr-0001.mcr-msedge.net", "*.microsoft.com", - "*.microsoftonline.com", "*.microsoftonline.co", 
"*.azureedge.net", - "packages.aks.azure.com"] - protocols = [ - { port = "80", type = "Http" }, - { port = "443", type = "Https" } - ] - } - ] - } - ] - network_rule_collections = [ - { - name = "network-rules" - priority = 100 - action = "Allow" - rules = [ - { - name = "imds" - source_addresses = ["*"] - destination_addresses = ["169.254.169.254"] - destination_ports = ["80"] - protocols = ["Any"] - }, - { - name = "dns" - source_addresses = ["*"] - destination_addresses = ["*"] - destination_ports = ["53"] - protocols = ["UDP", "TCP"] - }, - { - name = "azure-and-web" - source_addresses = ["*"] - destination_addresses = ["*"] - destination_ports = ["443"] - protocols = ["TCP", "UDP"] - } - ] - } - ] - } -] -route_table_config_list = [ - { - name = "nap-rt" - bgp_route_propagation_enabled = false - routes = [ - { - name = "default-route" - address_prefix = "0.0.0.0/0" - next_hop_type = "VirtualAppliance" - next_hop_firewall_name = "nap-firewall" - }, - { - name = "firewall-internet" - address_prefix_publicip_name = "firewall-pip" - next_hop_type = "Internet" - } - ] - subnet_associations = [{ subnet_name = "nap-subnet-ms" }] - } -] +# firewall_config_list = [ +# { +# name = "nap-firewall" +# network_role = "crud" +# sku_tier = "Standard" +# subnet_name = "AzureFirewallSubnet" +# public_ip_name = "firewall-pip" +# threat_intel_mode = "Alert" +# dns_proxy_enabled = true +# ip_configuration_name = "nap-fw-ipconfig" +# application_rule_collections = [ +# { +# name = "allow-egress" +# priority = 100 +# action = "Allow" +# rules = [ +# { +# name = "required-services" +# source_addresses = ["*"] +# target_fqdns = ["*.azure.com", "*.azure.net", +# "*.windows.net", "*.azurecr.io", "*.ubuntu.com", "AzureKubernetesService", +# "mcr-0001.mcr-msedge.net", "*.microsoft.com", +# "*.microsoftonline.com", "*.microsoftonline.co", "*.azureedge.net", +# "packages.aks.azure.com"] +# protocols = [ +# { port = "80", type = "Http" }, +# { port = "443", type = "Https" } +# ] +# } +# ] 
+# } +# ] +# network_rule_collections = [ +# { +# name = "network-rules" +# priority = 100 +# action = "Allow" +# rules = [ +# { +# name = "imds" +# source_addresses = ["*"] +# destination_addresses = ["169.254.169.254"] +# destination_ports = ["80"] +# protocols = ["Any"] +# }, +# { +# name = "dns" +# source_addresses = ["*"] +# destination_addresses = ["*"] +# destination_ports = ["53"] +# protocols = ["UDP", "TCP"] +# }, +# { +# name = "azure-and-web" +# source_addresses = ["*"] +# destination_addresses = ["*"] +# destination_ports = ["443"] +# protocols = ["TCP", "UDP"] +# } +# ] +# } +# ] +# } +# ] +# route_table_config_list = [ +# { +# name = "nap-rt" +# bgp_route_propagation_enabled = false +# routes = [ +# { +# name = "default-route" +# address_prefix = "0.0.0.0/0" +# next_hop_type = "VirtualAppliance" +# next_hop_firewall_name = "nap-firewall" +# }, +# { +# name = "firewall-internet" +# address_prefix_publicip_name = "firewall-pip" +# next_hop_type = "Internet" +# } +# ] +# subnet_associations = [{ subnet_name = "nap-subnet-ms" }] +# } +# ] aks_cli_config_list = [ @@ -162,10 +162,10 @@ aks_cli_config_list = [ name = "enable-workload-identity" value = "" }, - { - name = "outbound-type" - value = "userDefinedRouting" - }, + # { + # name = "outbound-type" + # value = "userDefinedRouting" + # }, { name = "enable-addons" value = "azure-keyvault-secrets-provider" diff --git a/steps/engine/clusterloader2/autoscale/collect.yml b/steps/engine/clusterloader2/autoscale/collect.yml index f5b99c4074..8524f652fd 100644 --- a/steps/engine/clusterloader2/autoscale/collect.yml +++ b/steps/engine/clusterloader2/autoscale/collect.yml @@ -16,7 +16,7 @@ steps: set -eo pipefail PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect $CPU_PER_NODE ${CAPACITY_TYPE:-on-demand} $NODE_COUNT $POD_COUNT \ - $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE + $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE --cl2_config_file 
${CL2_OVERRIDE_FILE} workingDirectory: modules/python env: CLOUD: ${{ parameters.cloud }} diff --git a/steps/engine/clusterloader2/autoscale/execute.yml b/steps/engine/clusterloader2/autoscale/execute.yml index 8e878f33a7..13fbb52e66 100644 --- a/steps/engine/clusterloader2/autoscale/execute.yml +++ b/steps/engine/clusterloader2/autoscale/execute.yml @@ -13,11 +13,12 @@ steps: set -eo pipefail PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE override \ - $CPU_PER_NODE $NODE_COUNT $POD_COUNT \ + ${CPU_PER_NODE:-0} ${NODE_COUNT:-0} ${POD_COUNT:-0} \ $SCALE_UP_TIMEOUT $SCALE_DOWN_TIMEOUT \ - $LOOP_COUNT "$NODE_LABEL_SELECTOR" "$NODE_SELECTOR" ${CL2_CONFIG_DIR}/overrides.yaml ${WARMUP_DEPLOYMENT:-false} ${CL2_CONFIG_DIR} --os_type ${OS_TYPE:-linux} --warmup_deployment_template ${WARMUP_DEPLOYMENT_TEMPLATE:-""} --deployment_template ${DEPLOYMENT_TEMPLATE:-""} + $LOOP_COUNT "${NODE_LABEL_SELECTOR:-""}" "$NODE_SELECTOR" ${CL2_CONFIG_DIR}/overrides.yaml ${WARMUP_DEPLOYMENT:-false} ${CL2_CONFIG_DIR} --os_type ${OS_TYPE:-linux} --warmup_deployment_template ${WARMUP_DEPLOYMENT_TEMPLATE:-""} --deployment_template ${DEPLOYMENT_TEMPLATE:-""} \ + --pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""} PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ - ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD + ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD --cl2_config_file ${CL2_OVERRIDE_FILE} workingDirectory: modules/python env: ${{ if eq(parameters.cloud, 'azure') }}: From a124cd88b1b390d10fe9b92975ef1b105a4c4bf8 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 15:52:44 +1100 Subject: [PATCH 02/20] fix cpu request --- modules/python/clusterloader2/autoscale/autoscale.py | 4 +++- .../node-auto-provisioning-benchmark-complex.yml | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py 
b/modules/python/clusterloader2/autoscale/autoscale.py index 8ebb3934a7..335817d372 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -73,9 +73,11 @@ def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up logger.info(f"CPU request for each pod: {cpu_request}m") is_complex = override_file == "ms_complex_config.yaml" + if not is_complex: + pod_cpu_request = calculate_cpu_request_for_clusterloader2(node_label_selector, node_count, pod_count, warmup_deployment, cl2_config_dir, warmup_deployment_template) with open(override_file, 'w', encoding='utf-8') as file: - file.write(f"CL2_DEPLOYMENT_CPU: {cpu_request}m\n") + file.write(f"CL2_DEPLOYMENT_CPU: {pod_cpu_request}m\n") file.write(f"CL2_DEPLOYMENT_MEMORY: {pod_memory_request}\n") file.write(f"CL2_DEPLOYMENT_SIZE: {pod_count}\n") file.write(f"CL2_SCALE_UP_TIMEOUT: {scale_up_timeout}\n") diff --git a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml index 73f1a10cf5..1fe1e72c55 100644 --- a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml @@ -36,7 +36,6 @@ stages: pod_memory_request: "4Gi" scale_up_timeout: "15m" scale_down_timeout: "15m" - node_label_selector: "karpenter.sh/nodepool = default" node_selector: "{karpenter.sh/nodepool: default}" loop_count: 1 warmup_deployment: true From e9aa7b488e73b0f39f30dcfaec0d2a0c2aa556de Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 16:36:10 +1100 Subject: [PATCH 03/20] fix pod cpu request --- modules/python/clusterloader2/autoscale/autoscale.py | 8 +++----- .../node-auto-provisioning-benchmark-complex.yml | 3 +-- steps/engine/clusterloader2/autoscale/collect.yml | 2 +- 
steps/engine/clusterloader2/autoscale/execute.yml | 4 ++-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index 335817d372..8d8d48919f 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -60,19 +60,16 @@ def calculate_cpu_request_for_clusterloader2(node_label_selector, node_count, po cpu_request = int(cpu_request * 0.95) return cpu_request -def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up_timeout, scale_down_timeout, loop_count, node_label_selector, node_selector, override_file, warmup_deployment, cl2_config_dir, os_type="linux", warmup_deployment_template="", deployment_template="", pod_cpu_request=0, pod_memory_request=""): - logger.info(f"CPU per node: {cpu_per_node}") +def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up_timeout, scale_down_timeout, loop_count, node_label_selector, node_selector, override_file, warmup_deployment, cl2_config_dir, os_type="linux", warmup_deployment_template="", deployment_template="", pod_cpu_request=0, pod_memory_request="", cl2_config_file="config.yaml"): desired_node_count = 1 if warmup_deployment in ["true", "True"]: warmup_deployment_for_karpeneter(cl2_config_dir, warmup_deployment_template) desired_node_count = 0 - cpu_request = calculate_cpu_request_for_clusterloader2(node_label_selector, node_count, pod_count, warmup_deployment, cl2_config_dir, warmup_deployment_template) - logger.info(f"Total number of nodes: {node_count}, total number of pods: {pod_count}") logger.info(f"CPU request for each pod: {cpu_request}m") - is_complex = override_file == "ms_complex_config.yaml" + is_complex = cl2_config_file == "ms_complex_config.yaml" if not is_complex: pod_cpu_request = calculate_cpu_request_for_clusterloader2(node_label_selector, node_count, pod_count, warmup_deployment, 
cl2_config_dir, warmup_deployment_template) @@ -262,6 +259,7 @@ def main(): parser_override.add_argument("--deployment_template", type=str, default="", help="Path to the CL2 deployment file") parser_override.add_argument("--pod_cpu_request", type=int, default=0, help="CPU request for each pod") parser_override.add_argument("--pod_memory_request", type=str, default="60Gi", help="Memory request for each pod") + parser_override.add_argument("--cl2_config_file", type=str, default="config.yaml", help="name of CL2 config file") # Sub-command for execute_clusterloader2 parser_execute = subparsers.add_parser("execute", help="Execute scale up operation") parser_execute.add_argument("cl2_image", type=str, help="Name of the CL2 image") diff --git a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml index 1fe1e72c55..563ff53678 100644 --- a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml @@ -42,8 +42,7 @@ stages: warmup_deployment_template: warmup_deployment.yaml vm_size: Standard_D2s_v4 capacity_type: on-demand - cl2_override_file: "ms_complex_config.yaml" - + cl2_config_file: "ms_complex_config.yaml" max_parallel: 1 timeout_in_minutes: 60 credential_type: service_connection diff --git a/steps/engine/clusterloader2/autoscale/collect.yml b/steps/engine/clusterloader2/autoscale/collect.yml index 8524f652fd..c2bf8effcd 100644 --- a/steps/engine/clusterloader2/autoscale/collect.yml +++ b/steps/engine/clusterloader2/autoscale/collect.yml @@ -16,7 +16,7 @@ steps: set -eo pipefail PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect $CPU_PER_NODE ${CAPACITY_TYPE:-on-demand} $NODE_COUNT $POD_COUNT \ - $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE --cl2_config_file ${CL2_OVERRIDE_FILE} + $CL2_REPORT_DIR 
"$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE --cl2_config_file ${CL2_CONFIG_FILE} workingDirectory: modules/python env: CLOUD: ${{ parameters.cloud }} diff --git a/steps/engine/clusterloader2/autoscale/execute.yml b/steps/engine/clusterloader2/autoscale/execute.yml index 13fbb52e66..39a3fba98a 100644 --- a/steps/engine/clusterloader2/autoscale/execute.yml +++ b/steps/engine/clusterloader2/autoscale/execute.yml @@ -16,9 +16,9 @@ steps: ${CPU_PER_NODE:-0} ${NODE_COUNT:-0} ${POD_COUNT:-0} \ $SCALE_UP_TIMEOUT $SCALE_DOWN_TIMEOUT \ $LOOP_COUNT "${NODE_LABEL_SELECTOR:-""}" "$NODE_SELECTOR" ${CL2_CONFIG_DIR}/overrides.yaml ${WARMUP_DEPLOYMENT:-false} ${CL2_CONFIG_DIR} --os_type ${OS_TYPE:-linux} --warmup_deployment_template ${WARMUP_DEPLOYMENT_TEMPLATE:-""} --deployment_template ${DEPLOYMENT_TEMPLATE:-""} \ - --pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""} + --pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""} --cl2_config_file ${CL2_CONFIG_FILE} PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ - ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD --cl2_config_file ${CL2_OVERRIDE_FILE} + ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD --cl2_config_file ${CL2_CONFIG_FILE} workingDirectory: modules/python env: ${{ if eq(parameters.cloud, 'azure') }}: From d6573a58b0482908bf60a56b33d8bd114d92929e Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 16:40:10 +1100 Subject: [PATCH 04/20] fix pod cpu request --- modules/python/clusterloader2/autoscale/autoscale.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index 8d8d48919f..cbc530b6b6 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -66,8 +66,8 @@ def 
override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up warmup_deployment_for_karpeneter(cl2_config_dir, warmup_deployment_template) desired_node_count = 0 - logger.info(f"Total number of nodes: {node_count}, total number of pods: {pod_count}") - logger.info(f"CPU request for each pod: {cpu_request}m") + # logger.info(f"Total number of nodes: {node_count}, total number of pods: {pod_count}") + # logger.info(f"CPU request for each pod: {cpu_request}m") is_complex = cl2_config_file == "ms_complex_config.yaml" if not is_complex: From b1f9aa01a761c6bc84616c9c226bbd5ea05aa572 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 16:55:54 +1100 Subject: [PATCH 05/20] fix cl2_config --- modules/python/clusterloader2/autoscale/autoscale.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index cbc530b6b6..8dd7af1c46 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -198,7 +198,7 @@ def collect_clusterloader2( testsuites = json_data["testsuites"] # Different metric mappings based on config file type - is_complex_config = "ms_complex_config" in cl2_config_file + is_complex_config = "ms_complex_config.yaml" == cl2_config_file if is_complex_config: # Metric mappings for complex config From 6f093806510a9ce06625b429e32fbd82987a8421 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 17:16:23 +1100 Subject: [PATCH 06/20] prin config --- modules/python/clusterloader2/autoscale/autoscale.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index 8dd7af1c46..3b7c9602fe 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -68,7 +68,7 @@ 
def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up # logger.info(f"Total number of nodes: {node_count}, total number of pods: {pod_count}") # logger.info(f"CPU request for each pod: {cpu_request}m") - + logger.info(f"Overriding CL2 config file at: {cl2_config_file}") is_complex = cl2_config_file == "ms_complex_config.yaml" if not is_complex: pod_cpu_request = calculate_cpu_request_for_clusterloader2(node_label_selector, node_count, pod_count, warmup_deployment, cl2_config_dir, warmup_deployment_template) From 06b59956f205cbb1667301bed9cb7268a5fb2bf6 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 17:47:33 +1100 Subject: [PATCH 07/20] customize cl2_config file --- modules/python/clusterloader2/autoscale/autoscale.py | 2 +- steps/engine/clusterloader2/autoscale/execute.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index 3b7c9602fe..aae4b5b2ba 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -284,7 +284,7 @@ def main(): args = parser.parse_args() if args.command == "override": - override_config_clusterloader2(args.cpu_per_node, args.node_count, args.pod_count, args.scale_up_timeout, args.scale_down_timeout, args.loop_count, args.node_label_selector, args.node_selector, args.cl2_override_file, args.warmup_deployment, args.cl2_config_dir, args.os_type, args.warmup_deployment_template, args.deployment_template,pod_cpu_request=args.pod_cpu_request, pod_memory_request=args.pod_memory_request) + override_config_clusterloader2(args.cpu_per_node, args.node_count, args.pod_count, args.scale_up_timeout, args.scale_down_timeout, args.loop_count, args.node_label_selector, args.node_selector, args.cl2_override_file, args.warmup_deployment, args.cl2_config_dir, args.os_type, args.warmup_deployment_template, 
args.deployment_template,pod_cpu_request=args.pod_cpu_request, pod_memory_request=args.pod_memory_request, cl2_config_file=args.cl2_config_file) elif args.command == "execute": execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider, args.cl2_config_file) elif args.command == "collect": diff --git a/steps/engine/clusterloader2/autoscale/execute.yml b/steps/engine/clusterloader2/autoscale/execute.yml index 39a3fba98a..4ecc9afd75 100644 --- a/steps/engine/clusterloader2/autoscale/execute.yml +++ b/steps/engine/clusterloader2/autoscale/execute.yml @@ -16,9 +16,9 @@ steps: ${CPU_PER_NODE:-0} ${NODE_COUNT:-0} ${POD_COUNT:-0} \ $SCALE_UP_TIMEOUT $SCALE_DOWN_TIMEOUT \ $LOOP_COUNT "${NODE_LABEL_SELECTOR:-""}" "$NODE_SELECTOR" ${CL2_CONFIG_DIR}/overrides.yaml ${WARMUP_DEPLOYMENT:-false} ${CL2_CONFIG_DIR} --os_type ${OS_TYPE:-linux} --warmup_deployment_template ${WARMUP_DEPLOYMENT_TEMPLATE:-""} --deployment_template ${DEPLOYMENT_TEMPLATE:-""} \ - --pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""} --cl2_config_file ${CL2_CONFIG_FILE} + --pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""} --cl2_config_file ${CL2_CONFIG_FILE:-config.yaml} PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ - ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD --cl2_config_file ${CL2_CONFIG_FILE} + ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD --cl2_config_file ${CL2_CONFIG_FILE:-config.yaml} workingDirectory: modules/python env: ${{ if eq(parameters.cloud, 'azure') }}: From 47bd7dc135c278a4d66914758586756c83b14731 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 18:15:11 +1100 Subject: [PATCH 08/20] use karpenter config --- ...de-auto-provisioning-benchmark-complex.yml | 9 +- .../karpenter_complex_nodepool.azure.yml | 137 ++++++++++++++++++ .../topology/karpenter/validate-resources.yml 
| 22 ++- 3 files changed, 158 insertions(+), 10 deletions(-) create mode 100644 scenarios/perf-eval/nap/kubernetes/karpenter_complex_nodepool.azure.yml diff --git a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml index 563ff53678..bdd7cecb9c 100644 --- a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml @@ -30,19 +30,18 @@ stages: topology: karpenter matrix: complex-nap: - cpu_per_node: 2 - pod_count: 5 - pod_cpu_request: 2 - pod_memory_request: "4Gi" + pod_count: 5000 + pod_cpu_request: 16 + pod_memory_request: "60Gi" scale_up_timeout: "15m" scale_down_timeout: "15m" node_selector: "{karpenter.sh/nodepool: default}" loop_count: 1 warmup_deployment: true warmup_deployment_template: warmup_deployment.yaml - vm_size: Standard_D2s_v4 capacity_type: on-demand cl2_config_file: "ms_complex_config.yaml" + karpenter_nodepool_file: "karpenter_complex_nodepool.azure.yaml" max_parallel: 1 timeout_in_minutes: 60 credential_type: service_connection diff --git a/scenarios/perf-eval/nap/kubernetes/karpenter_complex_nodepool.azure.yml b/scenarios/perf-eval/nap/kubernetes/karpenter_complex_nodepool.azure.yml new file mode 100644 index 0000000000..a2758a6b6b --- /dev/null +++ b/scenarios/perf-eval/nap/kubernetes/karpenter_complex_nodepool.azure.yml @@ -0,0 +1,137 @@ +# Shared AKSNodeClass (common for both Spot and On-Demand) +--- +apiVersion: karpenter.azure.com/v1alpha2 +kind: AKSNodeClass +metadata: + name: default + annotations: + kubernetes.io/description: "General purpose AKSNodeClass for running Ubuntu2204 nodes" +spec: + imageFamily: Ubuntu2204 + +# On-Demand NodePool (default) +--- +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: default + annotations: + kubernetes.io/description: "General purpose On-Demand 
NodePool" +spec: + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 1m + budgets: + - nodes: "100%" + template: + spec: + nodeClassRef: + group: karpenter.azure.com + kind: AKSNodeClass + name: default + expireAfter: Never + requirements: + - key: kubernetes.io/os + operator: In + values: ["linux"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: karpenter.azure.com/sku-name + operator: In + values: + - "Standard_D96ds_v5" # 55k DDSv5 + - "Standard_D96d_v5" + - "Standard_D96_v5" # 100k Dv5 + - "Standard_D96s_v5" + - key: topology.kubernetes.io/zone + operator: In + values: + - eastus2-1 + - eastus2-2 + - eastus2-3 + +# Spot NodePool +--- +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: spot + annotations: + kubernetes.io/description: "Spot NodePool for burstable cost-efficient workloads" +spec: + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 1s + budgets: + - nodes: "100%" + template: + spec: + nodeClassRef: + group: karpenter.azure.com + kind: AKSNodeClass + name: default + expireAfter: Never + requirements: + - key: kubernetes.io/os + operator: In + values: ["linux"] + - key: karpenter.sh/capacity-type + operator: In + values: ["spot"] + - key: karpenter.azure.com/sku-name + operator: In + values: [Standard_D2_v5] +# system-surge NodePool +--- +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: system-surge + annotations: + kubernetes.io/description: "Surge capacity pool for system pod pressure" +spec: + disruption: + budgets: + - nodes: "1" + consolidateAfter: 1m + consolidationPolicy: WhenEmpty + template: + metadata: + labels: + kubernetes.azure.com/ebpf-dataplane: "cilium" + kubernetes.azure.com/mode: "system" + spec: + expireAfter: Never + nodeClassRef: + group: karpenter.azure.com + kind: AKSNodeClass + name: default + requirements: + - key: kubernetes.io/arch + operator: In + values: ["amd64"] + - key: kubernetes.io/os + operator: In + values: ["linux"] + - 
key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: karpenter.azure.com/sku-name + operator: In + values: + - Standard_D16_v3 + - key: topology.kubernetes.io/zone + operator: In + values: + - eastus2-1 + - eastus2-2 + - eastus2-3 + startupTaints: + - effect: NoExecute + key: node.cilium.io/agent-not-ready + value: "true" + taints: + - effect: NoSchedule + key: CriticalAddonsOnly + value: "true" diff --git a/steps/topology/karpenter/validate-resources.yml b/steps/topology/karpenter/validate-resources.yml index 85b669aade..93c8501335 100644 --- a/steps/topology/karpenter/validate-resources.yml +++ b/steps/topology/karpenter/validate-resources.yml @@ -5,6 +5,9 @@ parameters: type: string - name: regions type: object +- name: karpenter_nodepool_file + type: string + default: '' steps: - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml @@ -23,15 +26,24 @@ steps: set -x kubectl apply -f $KARPENTER_NODEPOOL_FILE - # Patch the On-Demand NodePool - kubectl patch nodepool default --type='json' -p="[{'op': 'replace', 'path': '/spec/template/spec/requirements/2/values', 'value': ['$VM_SIZE']}]" + if [ -n "$VM_SIZE" ]; then + # Patch the On-Demand NodePool + kubectl patch nodepool default --type='json' -p="[{'op': 'replace', 'path': '/spec/template/spec/requirements/2/values', 'value': ['$VM_SIZE']}]" - # Patch the Spot NodePool - kubectl patch nodepool spot --type='json' -p="[{'op': 'replace', 'path': '/spec/template/spec/requirements/2/values', 'value': ['$VM_SIZE']}]" + # Patch the Spot NodePool + kubectl patch nodepool spot --type='json' -p="[{'op': 'replace', 'path': '/spec/template/spec/requirements/2/values', 'value': ['$VM_SIZE']}]" + fi kubectl get nodepool default -o yaml kubectl get nodepool spot -o yaml env: CLOUD: ${{ parameters.cloud }} - KARPENTER_NODEPOOL_FILE: $(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/karpenter_nodepool.${{ parameters.cloud }}.yml + KARPENTER_DIR: 
$(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/ + DEFAULT_KARPENTER_FILE: karpenter_nodepool.${{ parameters.cloud }}.yml + KARPENTER_NODEPOOL_FILE: | + ${{ if eq(parameters.karpenter_nodepool_file, '') }} + $(KARPENTER_DIR)$(DEFAULT_KARPENTER_FILE) + ${{ else }} + $(KARPENTER_DIR)${{ parameters.karpenter_nodepool_file }} + ${{ end }} displayName: "Validate Karpenter setup" From dbcb0cfee50e4d3d843d2ef8b595be2475127a8d Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 18:17:39 +1100 Subject: [PATCH 09/20] fix validation karpenter --- modules/python/clusterloader2/autoscale/autoscale.py | 3 +-- .../node-auto-provisioning-benchmark-complex.yml | 2 +- steps/topology/karpenter/validate-resources.yml | 7 +------ 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index aae4b5b2ba..16aae3652c 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -66,8 +66,7 @@ def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up warmup_deployment_for_karpeneter(cl2_config_dir, warmup_deployment_template) desired_node_count = 0 - # logger.info(f"Total number of nodes: {node_count}, total number of pods: {pod_count}") - # logger.info(f"CPU request for each pod: {cpu_request}m") + logger.info(f"Overriding CL2 config file at: {cl2_config_file}") is_complex = cl2_config_file == "ms_complex_config.yaml" if not is_complex: diff --git a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml index bdd7cecb9c..e200adb917 100644 --- a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml @@ -30,7 
+30,7 @@ stages: topology: karpenter matrix: complex-nap: - pod_count: 5000 + pod_count: 1000 pod_cpu_request: 16 pod_memory_request: "60Gi" scale_up_timeout: "15m" diff --git a/steps/topology/karpenter/validate-resources.yml b/steps/topology/karpenter/validate-resources.yml index 93c8501335..63268ceb3f 100644 --- a/steps/topology/karpenter/validate-resources.yml +++ b/steps/topology/karpenter/validate-resources.yml @@ -40,10 +40,5 @@ steps: CLOUD: ${{ parameters.cloud }} KARPENTER_DIR: $(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/ DEFAULT_KARPENTER_FILE: karpenter_nodepool.${{ parameters.cloud }}.yml - KARPENTER_NODEPOOL_FILE: | - ${{ if eq(parameters.karpenter_nodepool_file, '') }} - $(KARPENTER_DIR)$(DEFAULT_KARPENTER_FILE) - ${{ else }} - $(KARPENTER_DIR)${{ parameters.karpenter_nodepool_file }} - ${{ end }} + KARPENTER_NODEPOOL_FILE: ${{ if eq(parameters.karpenter_nodepool_file, '') }}$(KARPENTER_DIR)$(DEFAULT_KARPENTER_FILE)${{ else }}$(KARPENTER_DIR)${{ parameters.karpenter_nodepool_file }}${{ end }} displayName: "Validate Karpenter setup" From e2c94e9b63af71098a23c740a9a74c12700dfcb8 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 18:24:08 +1100 Subject: [PATCH 10/20] fix karpenter nodepool file --- steps/topology/karpenter/validate-resources.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/steps/topology/karpenter/validate-resources.yml b/steps/topology/karpenter/validate-resources.yml index 63268ceb3f..28982f7db0 100644 --- a/steps/topology/karpenter/validate-resources.yml +++ b/steps/topology/karpenter/validate-resources.yml @@ -40,5 +40,8 @@ steps: CLOUD: ${{ parameters.cloud }} KARPENTER_DIR: $(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/ DEFAULT_KARPENTER_FILE: karpenter_nodepool.${{ parameters.cloud }}.yml - KARPENTER_NODEPOOL_FILE: ${{ if eq(parameters.karpenter_nodepool_file, '') }}$(KARPENTER_DIR)$(DEFAULT_KARPENTER_FILE)${{ else 
}}$(KARPENTER_DIR)${{ parameters.karpenter_nodepool_file }}${{ end }} + ${{ if eq(parameters.karpenter_nodepool_file, '') }}: + KARPENTER_NODEPOOL_FILE: ${{KARPENTER_DIR}}${{ DEFAULT_KARPENTER_FILE }} + ${{ else }}: + KARPENTER_NODEPOOL_FILE: $(KARPENTER_DIR)${{ parameters.karpenter_nodepool_file }} displayName: "Validate Karpenter setup" From 603bd18640893949ad142f660081122038e7da65 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 18:29:16 +1100 Subject: [PATCH 11/20] fix karpenter syntax --- steps/topology/karpenter/validate-resources.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/steps/topology/karpenter/validate-resources.yml b/steps/topology/karpenter/validate-resources.yml index 28982f7db0..83b860c26f 100644 --- a/steps/topology/karpenter/validate-resources.yml +++ b/steps/topology/karpenter/validate-resources.yml @@ -38,10 +38,10 @@ steps: kubectl get nodepool spot -o yaml env: CLOUD: ${{ parameters.cloud }} - KARPENTER_DIR: $(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/ + KARPENTER_DIR: $(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes DEFAULT_KARPENTER_FILE: karpenter_nodepool.${{ parameters.cloud }}.yml ${{ if eq(parameters.karpenter_nodepool_file, '') }}: - KARPENTER_NODEPOOL_FILE: ${{KARPENTER_DIR}}${{ DEFAULT_KARPENTER_FILE }} + KARPENTER_NODEPOOL_FILE: $KARPENTER_DIR/$DEFAULT_KARPENTER_FILE ${{ else }}: - KARPENTER_NODEPOOL_FILE: $(KARPENTER_DIR)${{ parameters.karpenter_nodepool_file }} + KARPENTER_NODEPOOL_FILE: $KARPENTER_DIR/${{ parameters.karpenter_nodepool_file }} displayName: "Validate Karpenter setup" From 12c3283e0c97c87ab2261411ab4ecf272c68655e Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Mon, 5 Jan 2026 18:56:40 +1100 Subject: [PATCH 12/20] fix karpenter directory --- steps/topology/karpenter/validate-resources.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/steps/topology/karpenter/validate-resources.yml b/steps/topology/karpenter/validate-resources.yml index 83b860c26f..4dd73701da 100644 --- a/steps/topology/karpenter/validate-resources.yml +++ b/steps/topology/karpenter/validate-resources.yml @@ -24,7 +24,7 @@ steps: - bash: | set -euo pipefail set -x - kubectl apply -f $KARPENTER_NODEPOOL_FILE + kubectl apply -f $KARPENTER_DIR/$KARPENTER_NODEPOOL_FILE if [ -n "$VM_SIZE" ]; then # Patch the On-Demand NodePool @@ -41,7 +41,7 @@ steps: KARPENTER_DIR: $(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes DEFAULT_KARPENTER_FILE: karpenter_nodepool.${{ parameters.cloud }}.yml ${{ if eq(parameters.karpenter_nodepool_file, '') }}: - KARPENTER_NODEPOOL_FILE: $KARPENTER_DIR/$DEFAULT_KARPENTER_FILE + KARPENTER_NODEPOOL_FILE: $DEFAULT_KARPENTER_FILE ${{ else }}: - KARPENTER_NODEPOOL_FILE: $KARPENTER_DIR/${{ parameters.karpenter_nodepool_file }} + KARPENTER_NODEPOOL_FILE: ${{ parameters.karpenter_nodepool_file }} displayName: "Validate Karpenter setup" From 7cfd5b39f8b788843657f1f63f89e966f6a3b0dd Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Tue, 6 Jan 2026 10:53:22 +1100 Subject: [PATCH 13/20] karpenter nodepool file validation --- steps/topology/karpenter/validate-resources.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/steps/topology/karpenter/validate-resources.yml b/steps/topology/karpenter/validate-resources.yml index 4dd73701da..281c1791b8 100644 --- a/steps/topology/karpenter/validate-resources.yml +++ b/steps/topology/karpenter/validate-resources.yml @@ -7,7 +7,6 @@ parameters: type: object - name: karpenter_nodepool_file type: string - default: '' steps: - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml @@ -24,6 +23,13 @@ steps: - bash: | set -euo pipefail set -x + + if [ -z "$KARPENTER_NODEPOOL_FILE" ]; then + echo "karpenter_nodepool_file parameter is null or empty. Using default." 
+ KARPENTER_NODEPOOL_FILE=$DEFAULT_KARPENTER_FILE + fi + echo "Using karpenter_nodepool_file: $KARPENTER_NODEPOOL_FILE" + kubectl apply -f $KARPENTER_DIR/$KARPENTER_NODEPOOL_FILE if [ -n "$VM_SIZE" ]; then @@ -39,9 +45,6 @@ steps: env: CLOUD: ${{ parameters.cloud }} KARPENTER_DIR: $(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes + KARPENTER_NODEPOOL_FILE: ${{ parameters.karpenter_nodepool_file }} DEFAULT_KARPENTER_FILE: karpenter_nodepool.${{ parameters.cloud }}.yml - ${{ if eq(parameters.karpenter_nodepool_file, '') }}: - KARPENTER_NODEPOOL_FILE: $DEFAULT_KARPENTER_FILE - ${{ else }}: - KARPENTER_NODEPOOL_FILE: ${{ parameters.karpenter_nodepool_file }} displayName: "Validate Karpenter setup" From 466369b2d72654cb45df14fb678ef44ce06b7d84 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Tue, 6 Jan 2026 10:54:51 +1100 Subject: [PATCH 14/20] karpenter nodepool file validation --- steps/topology/karpenter/validate-resources.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/steps/topology/karpenter/validate-resources.yml b/steps/topology/karpenter/validate-resources.yml index 281c1791b8..8c1039cfe4 100644 --- a/steps/topology/karpenter/validate-resources.yml +++ b/steps/topology/karpenter/validate-resources.yml @@ -7,6 +7,7 @@ parameters: type: object - name: karpenter_nodepool_file type: string + default: '' steps: - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml From 2842944339434560d9e18925f288210b247372e4 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Tue, 6 Jan 2026 11:37:09 +1100 Subject: [PATCH 15/20] fix karpenter nodepool file --- .../node-auto-provisioning-benchmark-complex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml index e200adb917..f78062d2f5 100644 --- a/pipelines/perf-eval/Autoscale 
Benchmark/node-auto-provisioning-benchmark-complex.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml @@ -41,7 +41,7 @@ stages: warmup_deployment_template: warmup_deployment.yaml capacity_type: on-demand cl2_config_file: "ms_complex_config.yaml" - karpenter_nodepool_file: "karpenter_complex_nodepool.azure.yaml" + karpenter_nodepool_file: "karpenter_complex_nodepool.azure.yml" max_parallel: 1 timeout_in_minutes: 60 credential_type: service_connection From b326fb4046c426ba37d16fe7b5387569e626a24e Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Tue, 6 Jan 2026 13:46:58 +1100 Subject: [PATCH 16/20] make cpu per node optional --- steps/engine/clusterloader2/autoscale/collect.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/steps/engine/clusterloader2/autoscale/collect.yml b/steps/engine/clusterloader2/autoscale/collect.yml index c2bf8effcd..9f0c0fccec 100644 --- a/steps/engine/clusterloader2/autoscale/collect.yml +++ b/steps/engine/clusterloader2/autoscale/collect.yml @@ -15,7 +15,7 @@ steps: - script: | set -eo pipefail - PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect $CPU_PER_NODE ${CAPACITY_TYPE:-on-demand} $NODE_COUNT $POD_COUNT \ + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect ${CPU_PER_NODE:-0} ${CAPACITY_TYPE:-on-demand} $NODE_COUNT $POD_COUNT \ $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE --cl2_config_file ${CL2_CONFIG_FILE} workingDirectory: modules/python env: From aa770ed0f698a5d830593014ff049fe26b93f805 Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Tue, 6 Jan 2026 15:43:38 +1100 Subject: [PATCH 17/20] add pod cpu request and pod memory request --- .../clusterloader2/autoscale/autoscale.py | 22 ++++++++++++------- .../clusterloader2/autoscale/collect.yml | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py 
b/modules/python/clusterloader2/autoscale/autoscale.py index 16aae3652c..fb33eae288 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -96,7 +96,7 @@ def override_config_clusterloader2(cpu_per_node, node_count, pod_count, scale_up def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider, cl2_config_file="config.yaml"): run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file, overrides=True) -def _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url, autoscale_type="up", cpu_per_node=None, node_count=None, data=None, is_complex=False): +def _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url, autoscale_type="up", cpu_per_node=None, node_count=None, data=None, is_complex=False, pod_cpu_request=0, pod_memory_request=""): """Build CL2 measurement template""" result = { "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), @@ -117,10 +117,12 @@ def _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url result["group"] = None result["measurement"] = None result["result"] = None + result["pod_memory"] = pod_memory_request + result["pod_cpu"] = pod_cpu_request return result -def _process_test_results(testsuites, index_pattern, metric_mappings, cpu_per_node, capacity_type, node_count, pod_count, cloud_info, run_id, run_url, is_complex_config=False): +def _process_test_results(testsuites, index_pattern, metric_mappings, cpu_per_node, capacity_type, node_count, pod_count, cloud_info, run_id, run_url, is_complex_config=False, pod_cpu_request=0, pod_memory_request=""): """Process test results and generate JSON content""" summary = {} @@ -172,7 +174,7 @@ def _process_test_results(testsuites, index_pattern, metric_mappings, cpu_per_no autoscale_type=key, cpu_per_node=cpu_per_node, node_count=node_count, - data=data, is_complex=is_complex_config 
+ data=data, is_complex=is_complex_config, pod_cpu_request, pod_memory_request ) content += json.dumps(result) + "\n" @@ -188,7 +190,9 @@ def collect_clusterloader2( run_id, run_url, result_file, - cl2_config_file + cl2_config_file, + pod_cpu_request, + pod_memory_request ): index_pattern = re.compile(r'(\d+)$') raw_data = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent = 2) @@ -223,12 +227,12 @@ def collect_clusterloader2( } if testsuites: - content = _process_test_results(testsuites, index_pattern, metric_mappings, cpu_per_node, capacity_type, node_count, pod_count, cloud_info, run_id, run_url, is_complex_config) + content = _process_test_results(testsuites, index_pattern, metric_mappings, cpu_per_node, capacity_type, node_count, pod_count, cloud_info, run_id, run_url, is_complex_config, pod_cpu_request, pod_memory_request) else: raise Exception(f"No testsuites found in the report! Raw data: {raw_data}") if is_complex_config: - cl2_measurement = _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url, data={}, is_complex=is_complex_config) + cl2_measurement = _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url, data={}, is_complex=is_complex_config, pod_cpu_request, pod_memory_request) cl2_result = process_cl2_reports(cl2_report_dir, cl2_measurement) logger.info(f"Result, category up: {cl2_result}") content += cl2_result @@ -257,7 +261,7 @@ def main(): parser_override.add_argument("--warmup_deployment_template", type=str, default="", help="Path to the CL2 warm up deployment file") parser_override.add_argument("--deployment_template", type=str, default="", help="Path to the CL2 deployment file") parser_override.add_argument("--pod_cpu_request", type=int, default=0, help="CPU request for each pod") - parser_override.add_argument("--pod_memory_request", type=str, default="60Gi", help="Memory request for each pod") + parser_override.add_argument("--pod_memory_request", type=str, default="", 
help="Memory request for each pod") parser_override.add_argument("--cl2_config_file", type=str, default="config.yaml", help="name of CL2 config file") # Sub-command for execute_clusterloader2 parser_execute = subparsers.add_parser("execute", help="Execute scale up operation") @@ -279,6 +283,8 @@ def main(): parser_collect.add_argument("run_url", type=str, help="Run URL") parser_collect.add_argument("result_file", type=str, help="Path to the result file") parser_collect.add_argument("--cl2_config_file", type=str, default="config.yaml", help="Path to the CL2 config file") + parser_collect.add_argument("--pod_cpu_request", type=int, default=0, help="CPU request for each pod") + parser_collect.add_argument("--pod_memory_request", type=str, default="", help="Memory request for each pod") args = parser.parse_args() @@ -287,7 +293,7 @@ def main(): elif args.command == "execute": execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider, args.cl2_config_file) elif args.command == "collect": - collect_clusterloader2(args.cpu_per_node, args.capacity_type, args.node_count, args.pod_count, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file, args.cl2_config_file) + collect_clusterloader2(args.cpu_per_node, args.capacity_type, args.node_count, args.pod_count, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file, args.cl2_config_file, args.pod_cpu_request, args.pod_memory_request) if __name__ == "__main__": main() diff --git a/steps/engine/clusterloader2/autoscale/collect.yml b/steps/engine/clusterloader2/autoscale/collect.yml index 9f0c0fccec..58e30cdec8 100644 --- a/steps/engine/clusterloader2/autoscale/collect.yml +++ b/steps/engine/clusterloader2/autoscale/collect.yml @@ -16,7 +16,7 @@ steps: set -eo pipefail PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect ${CPU_PER_NODE:-0} ${CAPACITY_TYPE:-on-demand} $NODE_COUNT $POD_COUNT \ - $CL2_REPORT_DIR 
"$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE --cl2_config_file ${CL2_CONFIG_FILE} + $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE --cl2_config_file ${CL2_CONFIG_FILE} --pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""} workingDirectory: modules/python env: CLOUD: ${{ parameters.cloud }} From 4fcd4ea65ac24229678f39b0994d7aefa9d8ec3b Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Tue, 6 Jan 2026 15:44:57 +1100 Subject: [PATCH 18/20] collecting pod cpu and pod memory --- ...de-auto-provisioning-benchmark-complex.yml | 4 +- .../nap/terraform-inputs/azure-complex.tfvars | 190 +++++++++--------- 2 files changed, 96 insertions(+), 98 deletions(-) diff --git a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml index f78062d2f5..e9cbe5cf73 100644 --- a/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml +++ b/pipelines/perf-eval/Autoscale Benchmark/node-auto-provisioning-benchmark-complex.yml @@ -30,7 +30,7 @@ stages: topology: karpenter matrix: complex-nap: - pod_count: 1000 + pod_count: 5000 pod_cpu_request: 16 pod_memory_request: "60Gi" scale_up_timeout: "15m" @@ -43,6 +43,6 @@ stages: cl2_config_file: "ms_complex_config.yaml" karpenter_nodepool_file: "karpenter_complex_nodepool.azure.yml" max_parallel: 1 - timeout_in_minutes: 60 + timeout_in_minutes: 120 credential_type: service_connection ssh_key_enabled: false diff --git a/scenarios/perf-eval/nap/terraform-inputs/azure-complex.tfvars b/scenarios/perf-eval/nap/terraform-inputs/azure-complex.tfvars index 610e8766fe..d7594125df 100644 --- a/scenarios/perf-eval/nap/terraform-inputs/azure-complex.tfvars +++ b/scenarios/perf-eval/nap/terraform-inputs/azure-complex.tfvars @@ -4,13 +4,12 @@ scenario_name = "nap" deletion_delay = "2h" owner = "aks" -# public_ip_config_list = [ -# { -# name = "firewall-pip" -# 
count = 1 -# } -# ] - +public_ip_config_list = [ + { + name = "firewall-pip" + count = 1 + } +] network_config_list = [ { @@ -33,91 +32,90 @@ network_config_list = [ } ] -# firewall_config_list = [ -# { -# name = "nap-firewall" -# network_role = "crud" -# sku_tier = "Standard" -# subnet_name = "AzureFirewallSubnet" -# public_ip_name = "firewall-pip" -# threat_intel_mode = "Alert" -# dns_proxy_enabled = true -# ip_configuration_name = "nap-fw-ipconfig" -# application_rule_collections = [ -# { -# name = "allow-egress" -# priority = 100 -# action = "Allow" -# rules = [ -# { -# name = "required-services" -# source_addresses = ["*"] -# target_fqdns = ["*.azure.com", "*.azure.net", -# "*.windows.net", "*.azurecr.io", "*.ubuntu.com", "AzureKubernetesService", -# "mcr-0001.mcr-msedge.net", "*.microsoft.com", -# "*.microsoftonline.com", "*.microsoftonline.co", "*.azureedge.net", -# "packages.aks.azure.com"] -# protocols = [ -# { port = "80", type = "Http" }, -# { port = "443", type = "Https" } -# ] -# } -# ] -# } -# ] -# network_rule_collections = [ -# { -# name = "network-rules" -# priority = 100 -# action = "Allow" -# rules = [ -# { -# name = "imds" -# source_addresses = ["*"] -# destination_addresses = ["169.254.169.254"] -# destination_ports = ["80"] -# protocols = ["Any"] -# }, -# { -# name = "dns" -# source_addresses = ["*"] -# destination_addresses = ["*"] -# destination_ports = ["53"] -# protocols = ["UDP", "TCP"] -# }, -# { -# name = "azure-and-web" -# source_addresses = ["*"] -# destination_addresses = ["*"] -# destination_ports = ["443"] -# protocols = ["TCP", "UDP"] -# } -# ] -# } -# ] -# } -# ] -# route_table_config_list = [ -# { -# name = "nap-rt" -# bgp_route_propagation_enabled = false -# routes = [ -# { -# name = "default-route" -# address_prefix = "0.0.0.0/0" -# next_hop_type = "VirtualAppliance" -# next_hop_firewall_name = "nap-firewall" -# }, -# { -# name = "firewall-internet" -# address_prefix_publicip_name = "firewall-pip" -# next_hop_type = "Internet" 
-# } -# ] -# subnet_associations = [{ subnet_name = "nap-subnet-ms" }] -# } -# ] - +firewall_config_list = [ + { + name = "nap-firewall" + network_role = "crud" + sku_tier = "Standard" + subnet_name = "AzureFirewallSubnet" + public_ip_name = "firewall-pip" + threat_intel_mode = "Alert" + dns_proxy_enabled = true + ip_configuration_name = "nap-fw-ipconfig" + application_rule_collections = [ + { + name = "allow-egress" + priority = 100 + action = "Allow" + rules = [ + { + name = "required-services" + source_addresses = ["*"] + target_fqdns = ["*.azure.com", "*.azure.net", + "*.windows.net", "*.azurecr.io", "*.ubuntu.com", "AzureKubernetesService", + "mcr-0001.mcr-msedge.net", "*.microsoft.com", + "*.microsoftonline.com", "*.microsoftonline.co", "*.azureedge.net", + "packages.aks.azure.com"] + protocols = [ + { port = "80", type = "Http" }, + { port = "443", type = "Https" } + ] + } + ] + } + ] + network_rule_collections = [ + { + name = "network-rules" + priority = 100 + action = "Allow" + rules = [ + { + name = "imds" + source_addresses = ["*"] + destination_addresses = ["169.254.169.254"] + destination_ports = ["80"] + protocols = ["Any"] + }, + { + name = "dns" + source_addresses = ["*"] + destination_addresses = ["*"] + destination_ports = ["53"] + protocols = ["UDP", "TCP"] + }, + { + name = "azure-and-web" + source_addresses = ["*"] + destination_addresses = ["*"] + destination_ports = ["443"] + protocols = ["TCP", "UDP"] + } + ] + } + ] + } +] +route_table_config_list = [ + { + name = "nap-rt" + bgp_route_propagation_enabled = false + routes = [ + { + name = "default-route" + address_prefix = "0.0.0.0/0" + next_hop_type = "VirtualAppliance" + next_hop_firewall_name = "nap-firewall" + }, + { + name = "firewall-internet" + address_prefix_publicip_name = "firewall-pip" + next_hop_type = "Internet" + } + ] + subnet_associations = [{ subnet_name = "nap-subnet-ms" }] + } +] aks_cli_config_list = [ { @@ -162,10 +160,10 @@ aks_cli_config_list = [ name = 
"enable-workload-identity" value = "" }, - # { - # name = "outbound-type" - # value = "userDefinedRouting" - # }, + { + name = "outbound-type" + value = "userDefinedRouting" + }, { name = "enable-addons" value = "azure-keyvault-secrets-provider" From 6b5c0d215aa0efb607e8635633e4a1643c158bcd Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Tue, 6 Jan 2026 17:07:38 +1100 Subject: [PATCH 19/20] fix autoscale error --- modules/python/clusterloader2/autoscale/autoscale.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index fb33eae288..6d86968f6d 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -174,7 +174,7 @@ def _process_test_results(testsuites, index_pattern, metric_mappings, cpu_per_no autoscale_type=key, cpu_per_node=cpu_per_node, node_count=node_count, - data=data, is_complex=is_complex_config, pod_cpu_request, pod_memory_request + data=data, is_complex=is_complex_config, pod_cpu_request=pod_cpu_request, pod_memory_request=pod_memory_request ) content += json.dumps(result) + "\n" @@ -232,7 +232,7 @@ def collect_clusterloader2( else: raise Exception(f"No testsuites found in the report! 
Raw data: {raw_data}") if is_complex_config: - cl2_measurement = _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url, data={}, is_complex=is_complex_config, pod_cpu_request, pod_memory_request) + cl2_measurement = _build_report_template(capacity_type, pod_count, cloud_info, run_id, run_url, data={}, is_complex=is_complex_config, pod_cpu_request=pod_cpu_request, pod_memory_request=pod_memory_request) cl2_result = process_cl2_reports(cl2_report_dir, cl2_measurement) logger.info(f"Result, category up: {cl2_result}") content += cl2_result From 8942f2d251b575a671da2ada86137e7448d53dbb Mon Sep 17 00:00:00 2001 From: vittoria salim Date: Tue, 6 Jan 2026 17:38:15 +1100 Subject: [PATCH 20/20] reconfigure cl2 --- .../clusterloader2/autoscale/autoscale.py | 28 +++++++++---------- .../clusterloader2/autoscale/collect.yml | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index 6d86968f6d..9e91efe68d 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -246,13 +246,13 @@ def main(): # Sub-command for override_config_clusterloader2 parser_override = subparsers.add_parser("override", help="Override CL2 config file") - parser_override.add_argument("cpu_per_node", type=int,default=0, help="Name of cpu cores per node") - parser_override.add_argument("node_count", type=int,default=0, help="Number of nodes") - parser_override.add_argument("pod_count", type=int,default=10, help="Number of pods") + parser_override.add_argument("cpu_per_node", type=int, help="Name of cpu cores per node") + parser_override.add_argument("node_count", type=int, help="Number of nodes") + parser_override.add_argument("pod_count", type=int, help="Number of pods") parser_override.add_argument("scale_up_timeout", type=str, help="Timeout before failing the scale up test") 
parser_override.add_argument("scale_down_timeout", type=str, help="Timeout before failing the scale down test") parser_override.add_argument("loop_count", type=int, help="Number of times to repeat the test") - parser_override.add_argument("node_label_selector", type=str, default="", help="Node label selector") + parser_override.add_argument("node_label_selector", type=str, help="Node label selector") parser_override.add_argument("node_selector", type=str, help="Node selector for the test pods") parser_override.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file") parser_override.add_argument("warmup_deployment", type=str, help="Warmup deployment to get the cpu request") @@ -260,9 +260,9 @@ def main(): parser_override.add_argument("--os_type", type=str, choices=["linux", "windows"], default="linux", help="Operating system type for the node pools") parser_override.add_argument("--warmup_deployment_template", type=str, default="", help="Path to the CL2 warm up deployment file") parser_override.add_argument("--deployment_template", type=str, default="", help="Path to the CL2 deployment file") - parser_override.add_argument("--pod_cpu_request", type=int, default=0, help="CPU request for each pod") - parser_override.add_argument("--pod_memory_request", type=str, default="", help="Memory request for each pod") - parser_override.add_argument("--cl2_config_file", type=str, default="config.yaml", help="name of CL2 config file") + parser_override.add_argument("--pod_cpu_request", type=int, help="CPU request for each pod") + parser_override.add_argument("--pod_memory_request", type=str, help="Memory request for each pod") + parser_override.add_argument("--cl2_config_file", default="config.yaml", type=str, help="name of CL2 config file") # Sub-command for execute_clusterloader2 parser_execute = subparsers.add_parser("execute", help="Execute scale up operation") parser_execute.add_argument("cl2_image", type=str,
help="Name of the CL2 image") @@ -270,21 +270,21 @@ parser_execute.add_argument("cl2_config_dir", type=str, help="Path to the CL2 config directory") parser_execute.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory") parser_execute.add_argument("kubeconfig", type=str, help="Path to the kubeconfig file") parser_execute.add_argument("provider", type=str, help="Cloud provider name") - parser_execute.add_argument("--cl2_config_file", type=str, default="config.yaml", help="Path to the CL2 config file") + parser_execute.add_argument("--cl2_config_file", default="config.yaml", type=str, help="Path to the CL2 config file") # Sub-command for collect_clusterloader2 parser_collect = subparsers.add_parser("collect", help="Collect scale up data") - parser_collect.add_argument("cpu_per_node", type=int,default=0, help="Name of cpu cores per node") + parser_collect.add_argument("cpu_per_node", type=int, help="Name of cpu cores per node") parser_collect.add_argument("capacity_type", type=str, help="Capacity type", choices=["on-demand", "spot"], default="on-demand") - parser_collect.add_argument("node_count", type=int, default=0, help="Number of nodes") - parser_collect.add_argument("pod_count", type=int, default=0, help="Number of pods") + parser_collect.add_argument("node_count", type=int, help="Number of nodes") + parser_collect.add_argument("pod_count", type=int, help="Number of pods") parser_collect.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory") parser_collect.add_argument("cloud_info", type=str, help="Cloud information") parser_collect.add_argument("run_id", type=str, help="Run ID") parser_collect.add_argument("run_url", type=str, help="Run URL") parser_collect.add_argument("result_file", type=str, help="Path to the result file") - parser_collect.add_argument("--cl2_config_file", type=str, default="config.yaml", help="Path to the CL2 config file") - parser_collect.add_argument("--pod_cpu_request", type=int, default=0, help="CPU request for each pod") -
parser_collect.add_argument("--pod_memory_request", type=str, default="", help="Memory request for each pod") + parser_collect.add_argument("--cl2_config_file", default="config.yaml", type=str, help="Path to the CL2 config file") + parser_collect.add_argument("--pod_cpu_request", type=int, help="CPU request for each pod") + parser_collect.add_argument("--pod_memory_request", type=str, help="Memory request for each pod") args = parser.parse_args() diff --git a/steps/engine/clusterloader2/autoscale/collect.yml b/steps/engine/clusterloader2/autoscale/collect.yml index 58e30cdec8..4177f1ef4f 100644 --- a/steps/engine/clusterloader2/autoscale/collect.yml +++ b/steps/engine/clusterloader2/autoscale/collect.yml @@ -15,7 +15,7 @@ steps: - script: | set -eo pipefail - PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect ${CPU_PER_NODE:-0} ${CAPACITY_TYPE:-on-demand} $NODE_COUNT $POD_COUNT \ + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect ${CPU_PER_NODE:-0} ${CAPACITY_TYPE:-on-demand} ${NODE_COUNT:-0} ${POD_COUNT:-0} \ $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE --cl2_config_file ${CL2_CONFIG_FILE} --pod_cpu_request ${POD_CPU_REQUEST:-0} --pod_memory_request ${POD_MEMORY_REQUEST:-""} workingDirectory: modules/python env: