From 424abd91605f78cbe3ca5afa89505d8c8684c106 Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 08:23:40 -0600
Subject: [PATCH 01/10] Test catching k8s api exceptions

---
 src/stack/deploy/k8s/deploy_k8s.py | 83 ++++++++++++++++--------------
 1 file changed, 45 insertions(+), 38 deletions(-)

diff --git a/src/stack/deploy/k8s/deploy_k8s.py b/src/stack/deploy/k8s/deploy_k8s.py
index 7937d26..1cd2033 100644
--- a/src/stack/deploy/k8s/deploy_k8s.py
+++ b/src/stack/deploy/k8s/deploy_k8s.py
@@ -216,45 +216,52 @@ def _find_certificate_for_host_name(self, host_name):
         return None
 
     def up(self, detach, skip_cluster_management, services):
-        self.skip_cluster_management = skip_cluster_management
-        if not opts.o.dry_run:
-            if self.is_kind() and not self.skip_cluster_management:
-                # Create the kind cluster
-                create_cluster(
-                    self.kind_cluster_name,
-                    self.deployment_dir.joinpath(constants.kind_config_filename),
-                )
-                # Ensure the referenced containers are copied into kind
-                load_images_into_kind(self.kind_cluster_name, self.cluster_info.image_set)
-            self.connect_api()
-            if self.is_kind() and not self.skip_cluster_management:
-                # Now configure an ingress controller (not installed by default in kind)
-                install_ingress_for_kind()
-                # Wait for ingress to start (deployment provisioning will fail unless this is done)
-                wait_for_ingress_in_kind()
-
-        else:
-            log_info("Dry run mode enabled, skipping k8s API connect")
-
-        self._create_volume_data()
-        self._create_deployments()
-
-        http_proxy_info = self.cluster_info.spec.get_http_proxy()
-        # Note: at present we don't support tls for kind (and enabling tls causes errors)
-        use_tls = http_proxy_info and not self.is_kind()
-        certificate = self._find_certificate_for_host_name(http_proxy_info[0]["host-name"]) if use_tls else None
-        if certificate:
-            log_debug(f"Using existing certificate: {certificate}")
-
-        ingress: client.V1Ingress = self.cluster_info.get_ingress(use_tls=use_tls, certificate=certificate)
-        if ingress:
-            log_debug(f"Sending this ingress: {ingress}")
-            if not opts.o.dry_run:
-                ingress_resp = self.networking_api.create_namespaced_ingress(namespace=self.k8s_namespace, body=ingress)
-                log_debug("Ingress created:")
-                log_debug(f"{ingress_resp}")
-        else:
-            log_debug("No ingress configured")
+        try:
+            self.skip_cluster_management = skip_cluster_management
+            if not opts.o.dry_run:
+                if self.is_kind() and not self.skip_cluster_management:
+                    # Create the kind cluster
+                    create_cluster(
+                        self.kind_cluster_name,
+                        self.deployment_dir.joinpath(constants.kind_config_filename),
+                    )
+                    # Ensure the referenced containers are copied into kind
+                    load_images_into_kind(self.kind_cluster_name, self.cluster_info.image_set)
+                self.connect_api()
+                if self.is_kind() and not self.skip_cluster_management:
+                    # Now configure an ingress controller (not installed by default in kind)
+                    install_ingress_for_kind()
+                    # Wait for ingress to start (deployment provisioning will fail unless this is done)
+                    wait_for_ingress_in_kind()
+
+            else:
+                log_info("Dry run mode enabled, skipping k8s API connect")
+
+            self._create_volume_data()
+            self._create_deployments()
+
+            http_proxy_info = self.cluster_info.spec.get_http_proxy()
+            # Note: at present we don't support tls for kind (and enabling tls causes errors)
+            use_tls = http_proxy_info and not self.is_kind()
+            certificate = self._find_certificate_for_host_name(http_proxy_info[0]["host-name"]) if use_tls else None
+            if certificate:
+                log_debug(f"Using existing certificate: {certificate}")
+
+            ingress: client.V1Ingress = self.cluster_info.get_ingress(use_tls=use_tls, certificate=certificate)
+            if ingress:
+                log_debug(f"Sending this ingress: {ingress}")
+                if not opts.o.dry_run:
+                    # Exception thrown here
+                    # raise ApiException(http_resp=r)
+                    # kubernetes.client.exceptions.ApiException: (500)
+                    # Reason: Internal Server Error
+                    ingress_resp = self.networking_api.create_namespaced_ingress(namespace=self.k8s_namespace, body=ingress)
+                    log_debug("Ingress created:")
+                    log_debug(f"{ingress_resp}")
+            else:
+                log_debug("No ingress configured")
+        except Exception as e:
+            error_exit(f"Error from kubernetes API: {e}")
 
     def down(self, timeout, volumes, skip_cluster_management):  # noqa: C901
         self.skip_cluster_management = skip_cluster_management
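Context for this patch: the blanket `except Exception` around `up()` turns any failure, including the `kubernetes.client.exceptions.ApiException: (500)` seen in CI, into a call to `error_exit`. For comparison, a narrower handler can surface the HTTP status, reason, and response body that `ApiException` carries. The sketch below is illustrative only, not part of the patch; the helper name is hypothetical, and it assumes `networking_api` is a `client.NetworkingV1Api` constructed the same way as `self.networking_api` in this file:

```python
from kubernetes import client
from kubernetes.client.exceptions import ApiException


def create_ingress_or_exit(networking_api: client.NetworkingV1Api, namespace: str, ingress: client.V1Ingress):
    """Create an ingress, exiting with the API server's own diagnostics on failure."""
    try:
        return networking_api.create_namespaced_ingress(namespace=namespace, body=ingress)
    except ApiException as e:
        # ApiException exposes the HTTP status code, status text, and raw response
        # body, which is more useful in logs than the stringified exception alone.
        raise SystemExit(f"Error from kubernetes API: {e.status} {e.reason}: {e.body}")
```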
From 702cdc63c50aaa5c17fcbd22cd25cf1c75e07bcc Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 08:40:22 -0600
Subject: [PATCH 02/10] Trigger CI job

---
 src/stack/deploy/k8s/deploy_k8s.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/stack/deploy/k8s/deploy_k8s.py b/src/stack/deploy/k8s/deploy_k8s.py
index 1cd2033..608b94f 100644
--- a/src/stack/deploy/k8s/deploy_k8s.py
+++ b/src/stack/deploy/k8s/deploy_k8s.py
@@ -254,7 +254,6 @@ def up(self, detach, skip_cluster_management, services):
                     # Exception thrown here
                     # raise ApiException(http_resp=r)
                     # kubernetes.client.exceptions.ApiException: (500)
-                    # Reason: Internal Server Error
                     ingress_resp = self.networking_api.create_namespaced_ingress(namespace=self.k8s_namespace, body=ingress)
                     log_debug("Ingress created:")
                     log_debug(f"{ingress_resp}")

From 5d9b1b64e3fafd18825bb6905538cd0eaa095e51 Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 08:58:52 -0600
Subject: [PATCH 03/10] Trigger CI job

---
 src/stack/deploy/k8s/deploy_k8s.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/stack/deploy/k8s/deploy_k8s.py b/src/stack/deploy/k8s/deploy_k8s.py
index 608b94f..2fcd398 100644
--- a/src/stack/deploy/k8s/deploy_k8s.py
+++ b/src/stack/deploy/k8s/deploy_k8s.py
@@ -252,7 +252,6 @@ def up(self, detach, skip_cluster_management, services):
                 log_debug(f"Sending this ingress: {ingress}")
                 if not opts.o.dry_run:
                     # Exception thrown here
-                    # raise ApiException(http_resp=r)
                     # kubernetes.client.exceptions.ApiException: (500)
                     ingress_resp = self.networking_api.create_namespaced_ingress(namespace=self.k8s_namespace, body=ingress)
                     log_debug("Ingress created:")

From 8ae4e95785b4f9f687293e9d0aade2fa551fe118 Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 09:24:43 -0600
Subject: [PATCH 04/10] Trigger CI job

---
 src/stack/deploy/k8s/deploy_k8s.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/stack/deploy/k8s/deploy_k8s.py b/src/stack/deploy/k8s/deploy_k8s.py
index 2fcd398..fd727fb 100644
--- a/src/stack/deploy/k8s/deploy_k8s.py
+++ b/src/stack/deploy/k8s/deploy_k8s.py
@@ -251,7 +251,6 @@ def up(self, detach, skip_cluster_management, services):
             if ingress:
                 log_debug(f"Sending this ingress: {ingress}")
                 if not opts.o.dry_run:
-                    # Exception thrown here
                     # kubernetes.client.exceptions.ApiException: (500)
                     ingress_resp = self.networking_api.create_namespaced_ingress(namespace=self.k8s_namespace, body=ingress)
                     log_debug("Ingress created:")

From 431c246e89f92e7c485a365c1956ebba427b67ac Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 09:35:43 -0600
Subject: [PATCH 05/10] Trigger CI job

---
 src/stack/deploy/k8s/deploy_k8s.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/stack/deploy/k8s/deploy_k8s.py b/src/stack/deploy/k8s/deploy_k8s.py
index fd727fb..afa8d8b 100644
--- a/src/stack/deploy/k8s/deploy_k8s.py
+++ b/src/stack/deploy/k8s/deploy_k8s.py
@@ -251,6 +251,7 @@ def up(self, detach, skip_cluster_management, services):
             if ingress:
                 log_debug(f"Sending this ingress: {ingress}")
                 if not opts.o.dry_run:
+                    # We should catch this in a general way on every API call
                     # kubernetes.client.exceptions.ApiException: (500)
                     ingress_resp = self.networking_api.create_namespaced_ingress(namespace=self.k8s_namespace, body=ingress)
                     log_debug("Ingress created:")
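The comment added in this patch points at a general mechanism rather than per-call handlers. One sketch of what "in a general way" could mean is a decorator applied to each method that talks to the API server; `fatal_on_api_error` is a hypothetical name, not something in this codebase:

```python
import functools

from kubernetes.client.exceptions import ApiException


def fatal_on_api_error(fn):
    """Convert an ApiException raised anywhere inside `fn` into a clean exit."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except ApiException as e:
            raise SystemExit(f"Error from kubernetes API in {fn.__name__}: {e.status} {e.reason}")
    return wrapper


# Usage sketch: decorate the deployer entry points, e.g.
#
#     @fatal_on_api_error
#     def up(self, detach, skip_cluster_management, services):
#         ...
```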
From 268b0d1d57a8f9c6f25351015adae2c6097deb1d Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 10:17:50 -0600
Subject: [PATCH 06/10] Trigger CI job

---
 src/stack/deploy/k8s/deploy_k8s.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/stack/deploy/k8s/deploy_k8s.py b/src/stack/deploy/k8s/deploy_k8s.py
index afa8d8b..95cfa4f 100644
--- a/src/stack/deploy/k8s/deploy_k8s.py
+++ b/src/stack/deploy/k8s/deploy_k8s.py
@@ -252,6 +252,7 @@ def up(self, detach, skip_cluster_management, services):
                 log_debug(f"Sending this ingress: {ingress}")
                 if not opts.o.dry_run:
                     # We should catch this in a general way on every API call
+                    #
                     # kubernetes.client.exceptions.ApiException: (500)
                     ingress_resp = self.networking_api.create_namespaced_ingress(namespace=self.k8s_namespace, body=ingress)
                     log_debug("Ingress created:")

From 344b6e5bbef59ccf3714b42f7386cde799a7d4d9 Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 12:50:22 -0600
Subject: [PATCH 07/10] Fix yaml syntax in passing

---
 .github/workflows/test.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4764285..d59d190 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,9 +2,11 @@ name: Smoke Test
 
 on:
   pull_request:
-    branches: '*'
+    branches:
+      - '*'
   push:
-    branches: '*'
+    branches:
+      - '*'
 
 jobs:
   test:
From 5d1b27893f1ca5cd577f421eca8c6314b8c291fe Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 12:50:43 -0600
Subject: [PATCH 08/10] Add exception catch to down flow

---
 src/stack/deploy/k8s/deploy_k8s.py | 126 +++++++++++++++--------------
 1 file changed, 64 insertions(+), 62 deletions(-)

diff --git a/src/stack/deploy/k8s/deploy_k8s.py b/src/stack/deploy/k8s/deploy_k8s.py
index 95cfa4f..f4a1d67 100644
--- a/src/stack/deploy/k8s/deploy_k8s.py
+++ b/src/stack/deploy/k8s/deploy_k8s.py
@@ -251,87 +251,89 @@ def up(self, detach, skip_cluster_management, services):
             if ingress:
                 log_debug(f"Sending this ingress: {ingress}")
                 if not opts.o.dry_run:
-                    # We should catch this in a general way on every API call
-                    #
-                    # kubernetes.client.exceptions.ApiException: (500)
+                    # We've seen this exception thrown here: kubernetes.client.exceptions.ApiException: (500)
                     ingress_resp = self.networking_api.create_namespaced_ingress(namespace=self.k8s_namespace, body=ingress)
                     log_debug("Ingress created:")
                     log_debug(f"{ingress_resp}")
             else:
                 log_debug("No ingress configured")
         except Exception as e:
-            error_exit(f"Error from kubernetes API: {e}")
+            error_exit(f"Exception thrown bringing stack up: {e}")
 
     def down(self, timeout, volumes, skip_cluster_management):  # noqa: C901
-        self.skip_cluster_management = skip_cluster_management
-        self.connect_api()
-        # Delete the k8s objects
-
-        if volumes:
-            # Create the host-path-mounted PVs for this deployment
-            pvs = self.cluster_info.get_pvs()
-            for pv in pvs:
-                log_debug(f"Deleting this pv: {pv}")
-                try:
-                    pv_resp = self.core_api.delete_persistent_volume(name=pv.metadata.name)
-                    log_debug("PV deleted:")
-                    log_debug(f"{pv_resp}")
-                except client.exceptions.ApiException as e:
-                    _check_delete_exception(e)
-
-        # Figure out the PVCs for this deployment
-        pvcs = self.cluster_info.get_pvcs()
-        for pvc in pvcs:
-            log_debug(f"Deleting this pvc: {pvc}")
-            try:
-                pvc_resp = self.core_api.delete_namespaced_persistent_volume_claim(
-                    name=pvc.metadata.name, namespace=self.k8s_namespace
-                )
-                log_debug("PVCs deleted:")
-                log_debug(f"{pvc_resp}")
-            except client.exceptions.ApiException as e:
-                _check_delete_exception(e)
-
-        # Figure out the ConfigMaps for this deployment
-        cfg_maps = self.cluster_info.get_configmaps()
-        for cfg_map in cfg_maps:
-            log_debug(f"Deleting this ConfigMap: {cfg_map}")
-            try:
-                cfg_map_resp = self.core_api.delete_namespaced_config_map(name=cfg_map.metadata.name, namespace=self.k8s_namespace)
-                log_debug("ConfigMap deleted:")
-                log_debug(f"{cfg_map_resp}")
-            except client.exceptions.ApiException as e:
-                _check_delete_exception(e)
-
-        deployments = self.cluster_info.get_deployments()
-        for deployment in deployments:
-            log_debug(f"Deleting this deployment: {deployment}")
-            try:
-                self.apps_api.delete_namespaced_deployment(name=deployment.metadata.name, namespace=self.k8s_namespace)
-            except client.exceptions.ApiException as e:
-                _check_delete_exception(e)
-
-        services: client.V1Service = self.cluster_info.get_services()
-        for svc in services:
-            log_debug(f"Deleting service: {svc}")
-            try:
-                self.core_api.delete_namespaced_service(namespace=self.k8s_namespace, name=svc.metadata.name)
-            except client.exceptions.ApiException as e:
-                _check_delete_exception(e)
-
-        ingress: client.V1Ingress = self.cluster_info.get_ingress(use_tls=not self.is_kind())
-        if ingress:
-            log_debug(f"Deleting this ingress: {ingress}")
-            try:
-                self.networking_api.delete_namespaced_ingress(name=ingress.metadata.name, namespace=self.k8s_namespace)
-            except client.exceptions.ApiException as e:
-                _check_delete_exception(e)
-        else:
-            log_debug("No ingress to delete")
-
-        if self.is_kind() and not self.skip_cluster_management:
-            # Destroy the kind cluster
-            destroy_cluster(self.kind_cluster_name)
+        try:
+            self.skip_cluster_management = skip_cluster_management
+            self.connect_api()
+            # Delete the k8s objects
+
+            if volumes:
+                # Delete the host-path-mounted PVs for this deployment
+                pvs = self.cluster_info.get_pvs()
+                for pv in pvs:
+                    log_debug(f"Deleting this pv: {pv}")
+                    try:
+                        pv_resp = self.core_api.delete_persistent_volume(name=pv.metadata.name)
+                        log_debug("PV deleted:")
+                        log_debug(f"{pv_resp}")
+                    except client.exceptions.ApiException as e:
+                        _check_delete_exception(e)
+
+            # Figure out the PVCs for this deployment
+            pvcs = self.cluster_info.get_pvcs()
+            for pvc in pvcs:
+                log_debug(f"Deleting this pvc: {pvc}")
+                try:
+                    pvc_resp = self.core_api.delete_namespaced_persistent_volume_claim(
+                        name=pvc.metadata.name, namespace=self.k8s_namespace
+                    )
+                    log_debug("PVCs deleted:")
+                    log_debug(f"{pvc_resp}")
+                except client.exceptions.ApiException as e:
+                    _check_delete_exception(e)
+
+            # Figure out the ConfigMaps for this deployment
+            cfg_maps = self.cluster_info.get_configmaps()
+            for cfg_map in cfg_maps:
+                log_debug(f"Deleting this ConfigMap: {cfg_map}")
+                try:
+                    cfg_map_resp = self.core_api.delete_namespaced_config_map(name=cfg_map.metadata.name, namespace=self.k8s_namespace)
+                    log_debug("ConfigMap deleted:")
+                    log_debug(f"{cfg_map_resp}")
+                except client.exceptions.ApiException as e:
+                    _check_delete_exception(e)
+
+            deployments = self.cluster_info.get_deployments()
+            for deployment in deployments:
+                log_debug(f"Deleting this deployment: {deployment}")
+                try:
+                    self.apps_api.delete_namespaced_deployment(name=deployment.metadata.name, namespace=self.k8s_namespace)
+                except client.exceptions.ApiException as e:
+                    _check_delete_exception(e)
+
+            services: client.V1Service = self.cluster_info.get_services()
+            for svc in services:
+                log_debug(f"Deleting service: {svc}")
+                try:
+                    self.core_api.delete_namespaced_service(namespace=self.k8s_namespace, name=svc.metadata.name)
+                except client.exceptions.ApiException as e:
+                    _check_delete_exception(e)
+
+            ingress: client.V1Ingress = self.cluster_info.get_ingress(use_tls=not self.is_kind())
+            if ingress:
+                log_debug(f"Deleting this ingress: {ingress}")
+                try:
+                    self.networking_api.delete_namespaced_ingress(name=ingress.metadata.name, namespace=self.k8s_namespace)
+                except client.exceptions.ApiException as e:
+                    _check_delete_exception(e)
+            else:
+                log_debug("No ingress to delete")
+
+            if self.is_kind() and not self.skip_cluster_management:
+                # Destroy the kind cluster
+                destroy_cluster(self.kind_cluster_name)
+
+        except Exception as e:
+            error_exit(f"Exception thrown bringing stack down: {e}")
 
     def status(self):
         self.connect_api()
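This patch leans on `_check_delete_exception` for every delete call, but that helper's body lies outside the diff context. A conventional implementation — a guess at its behavior, not the actual code — treats a 404 as "already gone" so teardown stays idempotent, and re-raises everything else:

```python
from kubernetes.client.exceptions import ApiException


def _check_delete_exception(e: ApiException):
    # Hypothetical reconstruction: during teardown a 404 simply means the
    # object was already deleted (or never created), which is not an error.
    if e.status == 404:
        pass  # e.g. log_debug("Object not found, assuming already deleted")
    else:
        raise e
```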
From b43427786e0a167ad498929ddd2bf4fab89a9b30 Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 12:56:14 -0600
Subject: [PATCH 09/10] Fix lint errors

---
 src/stack/deploy/k8s/deploy_k8s.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/stack/deploy/k8s/deploy_k8s.py b/src/stack/deploy/k8s/deploy_k8s.py
index f4a1d67..5e576ad 100644
--- a/src/stack/deploy/k8s/deploy_k8s.py
+++ b/src/stack/deploy/k8s/deploy_k8s.py
@@ -296,7 +296,10 @@ def down(self, timeout, volumes, skip_cluster_management):  # noqa: C901
             for cfg_map in cfg_maps:
                 log_debug(f"Deleting this ConfigMap: {cfg_map}")
                 try:
-                    cfg_map_resp = self.core_api.delete_namespaced_config_map(name=cfg_map.metadata.name, namespace=self.k8s_namespace)
+                    cfg_map_resp = self.core_api.delete_namespaced_config_map(
+                        name=cfg_map.metadata.name,
+                        namespace=self.k8s_namespace
+                    )
                     log_debug("ConfigMap deleted:")
                     log_debug(f"{cfg_map_resp}")
                 except client.exceptions.ApiException as e:
@@ -331,7 +334,7 @@ def down(self, timeout, volumes, skip_cluster_management):  # noqa: C901
             if self.is_kind() and not self.skip_cluster_management:
                 # Destroy the kind cluster
                 destroy_cluster(self.kind_cluster_name)
- 
+
         except Exception as e:
             error_exit(f"Exception thrown bringing stack down: {e}")

From bf88e417cd4ecab16ca893fdc0ce5e7bfb5edb9f Mon Sep 17 00:00:00 2001
From: David Boreham
Date: Thu, 11 Sep 2025 13:16:22 -0600
Subject: [PATCH 10/10] Add a sleep after kind ingress ready to address
 intermittent CI failures

---
 src/stack/deploy/k8s/helpers.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/stack/deploy/k8s/helpers.py b/src/stack/deploy/k8s/helpers.py
index f28d579..afd335c 100644
--- a/src/stack/deploy/k8s/helpers.py
+++ b/src/stack/deploy/k8s/helpers.py
@@ -21,6 +21,7 @@ from kubernetes import client, utils, watch
 from pathlib import Path
 from ruamel.yaml.comments import CommentedSeq
+from time import sleep
 from typing import Set, Mapping, List
 
 from stack.build.build_util import container_exists_locally
@@ -58,6 +59,11 @@ def wait_for_ingress_in_kind():
             if event["object"].status.container_statuses[0].ready is True:
                 if warned_waiting:
                     log_info("Ingress controller is ready")
+                # Hack to work around https://github.com/bozemanpass/stack/issues/110
+                # The theory is that, depending on when in the 30-second polling cycle we hit ready,
+                # the controller may not actually be ready to serve ingress requests yet.
+                # So we wait a bit longer here.
+                sleep(10)
                 return
     log_info("Waiting for ingress controller to become ready...")
     warned_waiting = True
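The added `sleep(10)` is flagged as a hack in its own comment. If the underlying race is that the controller pod reports ready before its listener actually serves traffic, a more deterministic variant is to poll until a connection succeeds, bounded by a deadline. A sketch under that assumption; the host, port, and timings are illustrative, not values the stack tool is known to use:

```python
import socket
from time import monotonic, sleep


def wait_for_port(host: str, port: int, deadline_seconds: float = 60.0) -> None:
    """Poll until a TCP connect succeeds, instead of sleeping a fixed interval."""
    deadline = monotonic() + deadline_seconds
    while monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2):
                return  # the listener is accepting connections
        except OSError:
            sleep(1)  # not up yet; retry until the deadline
    raise TimeoutError(f"{host}:{port} not accepting connections after {deadline_seconds}s")


# e.g. wait_for_port("localhost", 80) after wait_for_ingress_in_kind(), assuming
# the kind cluster maps the ingress controller to port 80 on the host.
```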