Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
fd02f1c
fix: stackUpgradeHandler constructs OCI image ref from talosVersion
ontave May 7, 2026
fe0c6fb
fix: stackUpgradeHandler uses per-node rolling reboot, not staged-only
ontave May 7, 2026
095ccb9
fix: use docker.io/siderolabs/installer and kubelet refs for lab regi…
ontave May 7, 2026
c81114a
feat(phase-1): migrate runnerlib to conductor-sdk external module
ontave May 12, 2026
2225b58
feat(phase-3.3-3.7): update conductor consumers to seam.ontai.dev dis…
ontave May 12, 2026
de4a430
feat(migration-3.1): update conductor to seam.ontai.dev/TalosCluster GVK
ontave May 12, 2026
6325475
feat(migration-3.2): update conductor to platform ClusterLog GVK for …
ontave May 12, 2026
38ae38b
feat(migration-4.3): update conductor to seam + dispatcher module paths
ontave May 12, 2026
94bc8f4
chore: update replace directives to renamed seam and dispatcher direc…
ontave May 12, 2026
6e4698d
migration(phase-5): security.ontai.dev -> guardian.ontai.dev across c…
ontave May 12, 2026
2905a4a
migration(phase-6): fix Dockerfiles for seam/dispatcher renames + con…
ontave May 13, 2026
b245114
docs: session/25m -- Phase 8.5 conductor documentation rewrite
ontave May 13, 2026
312c437
fix: update CI workflow (seam-core->seam, wrapper->dispatcher, add se…
ontave May 17, 2026
08e17c3
feat(conductor): implement SeamOperator interface and startup SeamMem…
ontave May 18, 2026
b69c84a
feat(conductor/compiler): wire scaffold subcommand
ontave May 18, 2026
7e5876e
fix(conductor): force reboot after talos upgrade staging; fix kubecon…
ontave May 18, 2026
99d4f17
feat(watchdog): implement PackPodHealthLoop, RuntimeDriftHandler, lab…
ontave May 18, 2026
d116e30
fix(compiler): eliminate all pre-migration residue from enable bundle
ontave May 20, 2026
4acc55e
fix(compiler): honour mode: import from cluster-input.yaml
ontave May 20, 2026
94f4f6c
fix(conductor): post-migration API group seam.ontai.dev in agent inte…
ontave May 20, 2026
da078cd
fix: post-migration residue -- PackDelivery namespace, compile_enable…
ontave May 20, 2026
ebdac30
fix: SSA force ownership and compile_enable pack-reader RBAC
ontave May 20, 2026
471dbff
fix: revert PackDelivery namespace to seam-tenant; simplify RunnerCon…
ontave May 20, 2026
c148180
fix(conductor): TC-MC-5/6 -- packbuild doc separator, CRD generation,…
ontave May 21, 2026
4236b40
feat(conductor): OperatorContext watcher + autonomy gate for action d…
ontave May 21, 2026
3eec757
fix(conductor): complete infrastructure.ontai.dev -> seam.ontai.dev m…
ontave May 21, 2026
aa751bf
fix(compiler): revert CNPG_SECRET_NAME to guardian-db-app (CNPG auto-…
ontave May 21, 2026
9cc1d91
fix(agent): DriftSignalHandler skips RuntimeDrift signals to avoid ra…
ontave May 21, 2026
e319c61
feat(conductor): mismatchContext population + RemediationApproval gat…
ontave May 21, 2026
f212cf3
Merge remote-tracking branch 'origin/main' into feature/post-migratio…
ontave May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# Image tag default — override via environment: TAG=v1.9.3-r1 make docker-build
# Image tag default — override via environment: TAG=v0.1.0 make docker-build
# EXEC_TAG controls the conductor-exec image tag independently; defaults to TAG.
# conductor-exec tracks the Talos version (e.g. v1.9.3). Bump EXEC_TAG when
# a new Talos version is validated, not when the operator code changes.
# Example: make docker-build TAG=v0.1.0 EXEC_TAG=v1.9.3
IMAGE_REGISTRY ?= registry.ontai.dev/ontai-dev
TAG ?= dev
EXEC_TAG ?= $(TAG)

.PHONY: build test test-unit test-integration test-all e2e lint lint-docs lint-images install-hooks clean docker-build docker-push

Expand Down Expand Up @@ -67,21 +72,25 @@ docker-build:
..
docker build \
--platform linux/amd64 \
-f Dockerfile.execute \
-t $(IMAGE_REGISTRY)/conductor-execute:$(TAG) \
-f Dockerfile.agent \
-t $(IMAGE_REGISTRY)/conductor:$(TAG) \
..

docker-build-execute:
docker build \
--platform linux/amd64 \
-f Dockerfile.agent \
-t $(IMAGE_REGISTRY)/conductor:$(TAG) \
-f Dockerfile.execute \
-t $(IMAGE_REGISTRY)/conductor-exec:$(EXEC_TAG) \
..

# docker-push pushes all three already-built conductor images to the registry.
# docker-push pushes compiler and agent images to the registry.
docker-push:
docker push $(IMAGE_REGISTRY)/compiler:$(TAG)
docker push $(IMAGE_REGISTRY)/conductor-execute:$(TAG)
docker push $(IMAGE_REGISTRY)/conductor:$(TAG)

docker-push-execute:
docker push $(IMAGE_REGISTRY)/conductor-exec:$(EXEC_TAG)

# lint-images verifies all three conductor images exist in the local OCI registry.
lint-images:
@echo ">>> lint-images: checking conductor images in registry"
Expand Down
19 changes: 19 additions & 0 deletions api/conductor/v1alpha1/groupversion_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Package v1alpha1 contains API types for the conductor.ontai.dev group.
// CRDs in this package are Conductor-internal resources (RemediationPolicy,
// RemediationApproval) that govern the Conductor Watchdog remediation lifecycle.
// Group: conductor.ontai.dev.
//
// +groupName=conductor.ontai.dev
// +kubebuilder:object:generate=true
package v1alpha1

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
// GroupVersion is the group/version pair (conductor.ontai.dev, v1alpha1)
// under which the types in this package are registered.
GroupVersion = schema.GroupVersion{Group: "conductor.ontai.dev", Version: "v1alpha1"}
// SchemeBuilder is the scheme.Builder bound to GroupVersion. The init
// functions in this package call Register on it for each API type.
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}
// AddToScheme is the AddToScheme method value of SchemeBuilder.
AddToScheme = SchemeBuilder.AddToScheme
)
78 changes: 78 additions & 0 deletions api/conductor/v1alpha1/remediationapproval_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package v1alpha1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// RemediationApprovalSpec is authored by a human operator to grant permission
// for automatic redeployment of an exhausted PackInstalled. INV-007.
// None of the fields use omitempty, so all four keys are always present in
// the serialized spec.
type RemediationApprovalSpec struct {
// PackInstalledRef is the name+namespace of the PackInstalled that requires
// redeployment approval.
PackInstalledRef RemediationApprovalRef `json:"packInstalledRef"`

// FailureReason is the FailureReason enum value from the Exhausted DriftSignal
// that triggered this approval request.
// Accepted values are limited to those listed in the Enum marker below.
// +kubebuilder:validation:Enum=CrashLoopBackOff;OOMKilled;ImagePullBackOff;FailedMount;MultiAttachError
FailureReason string `json:"failureReason"`

// ApprovedBy is the identity of the human approver.
ApprovedBy string `json:"approvedBy"`

// ApprovedAt is the time this approval was granted.
ApprovedAt metav1.Time `json:"approvedAt"`
}

// RemediationApprovalRef is a name+namespace reference to a PackInstalled CR.
// Both fields are plain strings and are always serialized (no omitempty).
type RemediationApprovalRef struct {
// Name is the PackInstalled CR name.
Name string `json:"name"`
// Namespace is the namespace of the PackInstalled CR.
Namespace string `json:"namespace"`
}

// RemediationApprovalStatus is the observed state of a RemediationApproval.
// Every field is optional and is left out of the serialized status while it
// holds its zero value (or nil, for ActedAt).
type RemediationApprovalStatus struct {
// ObservedGeneration is the generation most recently reconciled.
// +optional
ObservedGeneration int64 `json:"observedGeneration,omitempty"`

// Acted is true when the management Conductor has consumed this approval
// and initiated redeployment.
// +optional
Acted bool `json:"acted,omitempty"`

// ActedAt is the time the approval was consumed.
// It is a pointer so that an unset timestamp is omitted rather than
// serialized as a zero time.
// +optional
ActedAt *metav1.Time `json:"actedAt,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:scope=Namespaced,shortName=ra

// RemediationApproval is a human-authored CR that grants permission for the
// Conductor Watchdog to initiate a full PackDelivery redeployment after exhausting
// automated remediation attempts. INV-007: destructive operations require an
// affirmative CR with a human approval gate.
// group: conductor.ontai.dev.
type RemediationApproval struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

// Spec is the human-authored approval.
Spec RemediationApprovalSpec `json:"spec,omitempty"`
// Status is the observed state of the approval.
Status RemediationApprovalStatus `json:"status,omitempty"`
}

// +kubebuilder:object:root=true

// RemediationApprovalList contains a list of RemediationApproval.
type RemediationApprovalList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
// Items holds the listed RemediationApproval objects; the key is always
// serialized (no omitempty).
Items []RemediationApproval `json:"items"`
}

// init registers RemediationApproval and RemediationApprovalList with the
// package-level SchemeBuilder when the package is loaded.
func init() {
SchemeBuilder.Register(&RemediationApproval{}, &RemediationApprovalList{})
}
125 changes: 125 additions & 0 deletions api/conductor/v1alpha1/remediationpolicy_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package v1alpha1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// ThresholdsSection maps each FailureReason to the consecutive failure count
// required to trigger the watchdog. Keys are seam-sdk FailureReason string values.
// When a FailureReason key is absent, the default threshold of 3 applies.
// RemediationPolicySpec.ThresholdFor performs the lookup and also falls back
// to the default when the stored count is zero or negative.
type ThresholdsSection struct {
// PerReason maps FailureReason string values to threshold counts.
// Absent keys use the default threshold of 3.
// +optional
PerReason map[string]int32 `json:"perReason,omitempty"`
}

// StrategySection maps each FailureReason to the RemediationStrategy to apply.
// Keys are seam-sdk FailureReason string values.
// When a FailureReason key is absent, the DefaultStrategy() from seam-sdk applies.
// Keys and values are stored as plain strings; this type carries no
// validation marker restricting them.
type StrategySection struct {
// PerReason maps FailureReason string values to RemediationStrategy string values.
// Absent keys use the seam-sdk DefaultStrategy for the given reason.
// +optional
PerReason map[string]string `json:"perReason,omitempty"`
}

// EscalationSection defines behaviour after the remediation attempt count is exhausted.
type EscalationSection struct {
// MaxAttempts is the maximum number of remediation Jobs to submit before
// marking the DriftSignal as Exhausted. Default: 3.
// RemediationPolicySpec.EffectiveMaxAttempts substitutes DefaultMaxAttempts
// whenever the stored value is not positive.
// +kubebuilder:default=3
// +optional
MaxAttempts int32 `json:"maxAttempts,omitempty"`

// TimeoutWindow is the duration the tenant Conductor waits for acknowledgement
// before re-emitting the DriftSignal. Default: 5m.
// NOTE(review): no default marker or helper in this file applies the 5m
// value when the field is nil; presumably the consumer does -- confirm.
// +optional
TimeoutWindow *metav1.Duration `json:"timeoutWindow,omitempty"`

// AutomaticRedeployment enables the Conductor to signal the Dispatcher for a
// full PackDelivery redeployment when Exhausted=true. Requires explicit Governor
// enablement. Default: false. INV-007.
// +kubebuilder:default=false
// +optional
AutomaticRedeployment bool `json:"automaticRedeployment,omitempty"`
}

// RemediationPolicySpec declares the remediation behaviour for packs referencing
// this policy. When a PackInstalled does not reference a policy, the platform
// defaults apply (threshold=3, per-reason default strategies, MaxAttempts=3, 5m window).
// The ThresholdFor and EffectiveMaxAttempts methods resolve the threshold and
// attempt-limit defaults for a spec whose sections are left empty.
type RemediationPolicySpec struct {
// Thresholds configures per-FailureReason consecutive failure counts.
// +optional
Thresholds ThresholdsSection `json:"thresholds,omitempty"`

// Strategy configures per-FailureReason remediation actions.
// +optional
Strategy StrategySection `json:"strategy,omitempty"`

// Escalation configures the post-exhaustion behaviour.
// +optional
Escalation EscalationSection `json:"escalation,omitempty"`
}

// RemediationPolicyStatus is the observed state of a RemediationPolicy.
// It currently carries only the reconciled generation.
type RemediationPolicyStatus struct {
// ObservedGeneration is the generation most recently reconciled.
// It is omitted from the serialized status while zero.
// +optional
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:scope=Namespaced,shortName=rp

// RemediationPolicy declares the automated remediation behaviour for packs
// on a target cluster. Referenced by PackInstalled.spec.remediationPolicyRef.
// group: conductor.ontai.dev.
type RemediationPolicy struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

// Spec is the declared remediation behaviour.
Spec RemediationPolicySpec `json:"spec,omitempty"`
// Status is the observed state of the policy.
Status RemediationPolicyStatus `json:"status,omitempty"`
}

// +kubebuilder:object:root=true

// RemediationPolicyList contains a list of RemediationPolicy.
type RemediationPolicyList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
// Items holds the listed RemediationPolicy objects; the key is always
// serialized (no omitempty).
Items []RemediationPolicy `json:"items"`
}

// Platform defaults used when a RemediationPolicy leaves a value unset.
const (
	// DefaultThreshold is the consecutive failure count used for any
	// FailureReason that has no explicit entry in ThresholdsSection.
	DefaultThreshold int32 = 3

	// DefaultMaxAttempts is the remediation Job limit used when
	// EscalationSection.MaxAttempts is zero.
	DefaultMaxAttempts int32 = 3
)

// ThresholdFor reports the consecutive-failure threshold that applies to the
// given FailureReason. An entry in Thresholds.PerReason is honoured only when
// it is positive; a missing, zero, or negative entry yields DefaultThreshold.
func (p *RemediationPolicySpec) ThresholdFor(reason string) int32 {
	// Indexing a nil map is safe in Go and simply reports "not found".
	configured, found := p.Thresholds.PerReason[reason]
	if !found || configured <= 0 {
		return DefaultThreshold
	}
	return configured
}

// EffectiveMaxAttempts returns the remediation attempt limit to enforce:
// Escalation.MaxAttempts when it is positive, DefaultMaxAttempts otherwise.
func (p *RemediationPolicySpec) EffectiveMaxAttempts() int32 {
	limit := p.Escalation.MaxAttempts
	if limit <= 0 {
		// Zero means the field was left unset; negative values are treated
		// the same way.
		limit = DefaultMaxAttempts
	}
	return limit
}

// init registers RemediationPolicy and RemediationPolicyList with the
// package-level SchemeBuilder when the package is loaded.
func init() {
SchemeBuilder.Register(&RemediationPolicy{}, &RemediationPolicyList{})
}
45 changes: 45 additions & 0 deletions api/conductor/v1alpha1/remediationpolicy_types_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package v1alpha1

import (
"testing"
)

// TestThresholdForDefault checks that a zero-value spec resolves every
// FailureReason to DefaultThreshold.
func TestThresholdForDefault(t *testing.T) {
	var policy RemediationPolicySpec
	if threshold := policy.ThresholdFor("CrashLoopBackOff"); threshold != DefaultThreshold {
		t.Errorf("ThresholdFor with empty policy = %d, want %d", threshold, DefaultThreshold)
	}
}

// TestThresholdForExplicit checks that a configured reason returns its own
// threshold while an unconfigured reason still falls back to DefaultThreshold.
func TestThresholdForExplicit(t *testing.T) {
	perReason := map[string]int32{"CrashLoopBackOff": 5}
	policy := RemediationPolicySpec{Thresholds: ThresholdsSection{PerReason: perReason}}

	if configured := policy.ThresholdFor("CrashLoopBackOff"); configured != 5 {
		t.Errorf("ThresholdFor explicit = %d, want 5", configured)
	}
	if fallback := policy.ThresholdFor("OOMKilled"); fallback != DefaultThreshold {
		t.Errorf("ThresholdFor absent key = %d, want %d", fallback, DefaultThreshold)
	}
}

// TestEffectiveMaxAttemptsDefault checks that a zero-value spec resolves to
// DefaultMaxAttempts.
func TestEffectiveMaxAttemptsDefault(t *testing.T) {
	var policy RemediationPolicySpec
	if limit := policy.EffectiveMaxAttempts(); limit != DefaultMaxAttempts {
		t.Errorf("EffectiveMaxAttempts empty = %d, want %d", limit, DefaultMaxAttempts)
	}
}

// TestEffectiveMaxAttemptsExplicit checks that a positive Escalation.MaxAttempts
// is returned unchanged.
func TestEffectiveMaxAttemptsExplicit(t *testing.T) {
	policy := RemediationPolicySpec{Escalation: EscalationSection{MaxAttempts: 7}}
	if limit := policy.EffectiveMaxAttempts(); limit != 7 {
		t.Errorf("EffectiveMaxAttempts explicit = %d, want 7", limit)
	}
}
Loading
Loading