From 850c618279ed776e599c42034f379d3639a16023 Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sat, 16 May 2026 18:53:34 -0700 Subject: [PATCH 01/20] plumb userspace policy scheduler inactive state --- .../plan-1378-policy-schedulers.md | 7 ++ pkg/dataplane/userspace/manager_test.go | 68 ++++++++++ pkg/dataplane/userspace/protocol.go | 3 + pkg/dataplane/userspace/snapshot.go | 24 ++++ userspace-dp/src/afxdp/test_fixtures.rs | 2 + userspace-dp/src/afxdp/tests.rs | 1 + userspace-dp/src/policy.rs | 22 ++++ userspace-dp/src/policy_tests.rs | 116 ++++++++++++++++++ userspace-dp/src/protocol.rs | 54 ++++++++ 9 files changed, 297 insertions(+) diff --git a/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md b/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md index 8d824ccb2..109d6c736 100644 --- a/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md +++ b/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md @@ -20,6 +20,13 @@ identity must not depend on transient array position alone; use a config-driven UUID if available or `(policy_set_id, policy_name, rule_name)`/equivalent compiled identity. +Safe #1378 slice status: this change wires `rule_id`, `scheduler_name`, and +`inactive` through userspace policy snapshots and Rust policy evaluation. The +Go snapshot builder can compute inactive bits when supplied scheduler active +state, but the normal daemon publish path still builds snapshots without that +state. Scheduler tick/restart/failover republish and missing-scheduler commit +errors remain gates before scheduled userspace policies are end-to-end enforced. + On scheduler state changes, publish one atomic userspace snapshot delta that contains the updated inactive bits for all affected rules. Do not issue per-rule fast-path toggles because first-match ordering requires same-instant diff --git a/pkg/dataplane/userspace/manager_test.go b/pkg/dataplane/userspace/manager_test.go index d6ac05ecd..2e993fdf8 100644 --- a/pkg/dataplane/userspace/manager_test.go +++ b/pkg/dataplane/userspace/manager_test.go @@ -2587,6 +2587,74 @@ func TestBuildPolicySnapshotsIncludesGlobalPolicies(t *testing.T) { } } +func TestBuildPolicySnapshotsRoundTripsSchedulerInactiveAndRuleID(t *testing.T) { + cfg := &config.Config{} + cfg.Security.Policies = []*config.ZonePairPolicies{{ + FromZone: "trust", + ToZone: "untrust", + Policies: []*config.Policy{{ + Name: "zone-allow", + SchedulerName: "workhours", + Match: config.PolicyMatch{ + SourceAddresses: []string{"any"}, + DestinationAddresses: []string{"any"}, + Applications: []string{"any"}, + }, + Action: config.PolicyPermit, + }}, + }} + cfg.Security.GlobalPolicies = []*config.Policy{{ + Name: "global-deny-all", + SchedulerName: "always", + Match: config.PolicyMatch{ + SourceAddresses: []string{"any"}, + DestinationAddresses: []string{"any"}, + Applications: []string{"any"}, + }, + Action: config.PolicyDeny, + }} + + snap := buildPolicySnapshotsWithSchedulerState(cfg, map[string]bool{ + "workhours": false, + "always": true, + }) + if len(snap) != 2 { + t.Fatalf("len(snap) = %d, want 2", len(snap)) + } + if got, want := snap[0].RuleID, "trust->untrust/zone-allow"; got != want { + t.Fatalf("snap[0].RuleID = %q, want %q", got, want) + } + if got, want := snap[0].SchedulerName, "workhours"; got != want { + t.Fatalf("snap[0].SchedulerName = %q, want %q", got, want) + } + if !snap[0].Inactive { + t.Fatalf("snap[0].Inactive = false, want true for inactive scheduler") + } + if got, want := snap[1].RuleID, "junos-global->junos-global/global-deny-all"; got != want { + t.Fatalf("snap[1].RuleID = %q, want %q", got, want) + } + if snap[1].Inactive { + t.Fatalf("snap[1].Inactive = true, want false for active scheduler") + } + + data, err := json.Marshal(snap) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + var roundTrip []PolicyRuleSnapshot + if err := json.Unmarshal(data, &roundTrip); err != nil { + t.Fatalf("Unmarshal: %v", err) + } + if len(roundTrip) != 2 { + t.Fatalf("len(roundTrip) = %d, want 2", len(roundTrip)) + } + if roundTrip[0].RuleID != snap[0].RuleID || + roundTrip[0].SchedulerName != snap[0].SchedulerName || + roundTrip[0].Inactive != snap[0].Inactive { + t.Fatalf("roundTrip[0] = %+v, want scheduler/inactive/rule_id from %+v", roundTrip[0], snap[0]) + } +} + func TestUserspaceSupportsScreenProfilesBasic(t *testing.T) { cfg := &config.Config{} cfg.Security.Screen = map[string]*config.ScreenProfile{ diff --git a/pkg/dataplane/userspace/protocol.go b/pkg/dataplane/userspace/protocol.go index 3024f50f1..4fbcaf042 100644 --- a/pkg/dataplane/userspace/protocol.go +++ b/pkg/dataplane/userspace/protocol.go @@ -365,9 +365,12 @@ type PolicyApplicationSnapshot struct { } type PolicyRuleSnapshot struct { + RuleID string `json:"rule_id,omitempty"` Name string `json:"name"` FromZone string `json:"from_zone,omitempty"` ToZone string `json:"to_zone,omitempty"` + SchedulerName string `json:"scheduler_name,omitempty"` + Inactive bool `json:"inactive,omitempty"` SourceAddresses []string `json:"source_addresses,omitempty"` DestinationAddresses []string `json:"destination_addresses,omitempty"` Applications []string `json:"applications,omitempty"` diff --git a/pkg/dataplane/userspace/snapshot.go b/pkg/dataplane/userspace/snapshot.go index 635ed2799..865fed44a 100644 --- a/pkg/dataplane/userspace/snapshot.go +++ b/pkg/dataplane/userspace/snapshot.go @@ -1968,6 +1968,10 @@ func buildClassOfServiceSnapshot(cfg *config.Config) *ClassOfServiceSnapshot { } func buildPolicySnapshots(cfg *config.Config) []PolicyRuleSnapshot { + return buildPolicySnapshotsWithSchedulerState(cfg, nil) +} + +func buildPolicySnapshotsWithSchedulerState(cfg *config.Config, activeState map[string]bool) []PolicyRuleSnapshot { if cfg == nil || (len(cfg.Security.Policies) == 0 && len(cfg.Security.GlobalPolicies) == 0) { return nil } @@ -1992,10 +1996,14 @@ func buildPolicySnapshots(cfg *config.Config) []PolicyRuleSnapshot { if !ok { applicationTerms = nil } + schedulerName := pol.SchedulerName out = append(out, PolicyRuleSnapshot{ + RuleID: stablePolicyRuleID(zpp.FromZone, zpp.ToZone, pol.Name), Name: pol.Name, FromZone: zpp.FromZone, ToZone: zpp.ToZone, + SchedulerName: schedulerName, + Inactive: policyRuleInactive(schedulerName, activeState), SourceAddresses: sourceAddresses, DestinationAddresses: destinationAddresses, Applications: append([]string(nil), pol.Match.Applications...), @@ -2021,10 +2029,14 @@ func buildPolicySnapshots(cfg *config.Config) []PolicyRuleSnapshot { if !ok { applicationTerms = nil } + schedulerName := pol.SchedulerName out = append(out, PolicyRuleSnapshot{ + RuleID: stablePolicyRuleID("junos-global", "junos-global", pol.Name), Name: pol.Name, FromZone: "junos-global", ToZone: "junos-global", + SchedulerName: schedulerName, + Inactive: policyRuleInactive(schedulerName, activeState), SourceAddresses: sourceAddresses, DestinationAddresses: destinationAddresses, Applications: append([]string(nil), pol.Match.Applications...), @@ -2035,6 +2047,18 @@ func buildPolicySnapshots(cfg *config.Config) []PolicyRuleSnapshot { return out } +func stablePolicyRuleID(fromZone, toZone, ruleName string) string { + return fmt.Sprintf("%s->%s/%s", fromZone, toZone, ruleName) +} + +func policyRuleInactive(schedulerName string, activeState map[string]bool) bool { + if schedulerName == "" || activeState == nil { + return false + } + active, ok := activeState[schedulerName] + return !ok || !active +} + func policyActionString(action config.PolicyAction) string { switch action { case config.PolicyPermit: diff --git a/userspace-dp/src/afxdp/test_fixtures.rs b/userspace-dp/src/afxdp/test_fixtures.rs index 6267ba57e..093ca1830 100644 --- a/userspace-dp/src/afxdp/test_fixtures.rs +++ b/userspace-dp/src/afxdp/test_fixtures.rs @@ -432,6 +432,7 @@ pub(super) fn nat_snapshot() -> ConfigSnapshot { applications: vec!["any".to_string()], application_terms: Vec::new(), action: "permit".to_string(), + ..Default::default() }], neighbors: vec![ NeighborSnapshot { @@ -529,6 +530,7 @@ pub(super) fn policy_deny_snapshot() -> ConfigSnapshot { applications: vec!["any".to_string()], application_terms: Vec::new(), action: "permit".to_string(), + ..Default::default() }], ..Default::default() } diff --git a/userspace-dp/src/afxdp/tests.rs b/userspace-dp/src/afxdp/tests.rs index ee677eb1a..815921823 100644 --- a/userspace-dp/src/afxdp/tests.rs +++ b/userspace-dp/src/afxdp/tests.rs @@ -939,6 +939,7 @@ fn post_dnat_source_nat_matches_translated_destination() { applications: vec!["any".to_string()], application_terms: Vec::new(), action: "permit".to_string(), + ..Default::default() }); let state = build_forwarding_state(&snapshot); diff --git a/userspace-dp/src/policy.rs b/userspace-dp/src/policy.rs index f443341ea..93c7faf86 100644 --- a/userspace-dp/src/policy.rs +++ b/userspace-dp/src/policy.rs @@ -45,8 +45,11 @@ impl Default for PolicyAction { #[derive(Debug)] pub(crate) struct PolicyRule { + pub(crate) rule_id: String, pub(crate) from_zone: String, pub(crate) to_zone: String, + pub(crate) scheduler_name: String, + pub(crate) inactive: bool, /// #923: adaptive prefix set (MatchAny / Linear ≤16 / Trie >16). /// Replaces the legacy `Vec` linear scan in /// `nets_match_v4/v6`. Empty input collapses to `MatchAny`, @@ -65,8 +68,11 @@ pub(crate) struct PolicyRule { impl Default for PolicyRule { fn default() -> Self { Self { + rule_id: String::new(), from_zone: String::new(), to_zone: String::new(), + scheduler_name: String::new(), + inactive: false, source_v4: PrefixSetV4::default(), source_v6: PrefixSetV6::default(), destination_v4: PrefixSetV4::default(), @@ -85,8 +91,11 @@ impl Default for PolicyRule { impl Clone for PolicyRule { fn clone(&self) -> Self { Self { + rule_id: self.rule_id.clone(), from_zone: self.from_zone.clone(), to_zone: self.to_zone.clone(), + scheduler_name: self.scheduler_name.clone(), + inactive: self.inactive, source_v4: self.source_v4.clone(), source_v6: self.source_v6.clone(), destination_v4: self.destination_v4.clone(), @@ -228,8 +237,11 @@ pub(crate) fn parse_policy_state( parse_address(prefix, &mut dst_v4, &mut dst_v6); } let mut rule = PolicyRule { + rule_id: stable_policy_rule_id(snap), from_zone: snap.from_zone.clone(), to_zone: snap.to_zone.clone(), + scheduler_name: snap.scheduler_name.clone(), + inactive: snap.inactive, action: parse_action(&snap.action), source_v4: PrefixSetV4::from_prefixes(src_v4), source_v6: PrefixSetV6::from_prefixes(src_v6), @@ -326,6 +338,9 @@ fn try_match_rule( src_port: u16, dst_port: u16, ) -> Option { + if rule.inactive { + return None; + } if !rule.compiled_apps.matches(protocol, src_port, dst_port) { return None; } @@ -346,6 +361,13 @@ fn try_match_rule( } } +fn stable_policy_rule_id(snap: &PolicyRuleSnapshot) -> String { + if !snap.rule_id.is_empty() { + return snap.rule_id.clone(); + } + format!("{}->{}/{}", snap.from_zone, snap.to_zone, snap.name) +} + fn parse_action(action: &str) -> PolicyAction { match action { "permit" => PolicyAction::Permit, diff --git a/userspace-dp/src/policy_tests.rs b/userspace-dp/src/policy_tests.rs index 45344f2c5..5bd3bfab5 100644 --- a/userspace-dp/src/policy_tests.rs +++ b/userspace-dp/src/policy_tests.rs @@ -29,6 +29,7 @@ fn allow_all_matches_zone_pair() { applications: vec!["any".to_string()], application_terms: Vec::new(), action: "permit".to_string(), + ..Default::default() }], &test_zone_name_to_id(), ); @@ -65,6 +66,113 @@ fn default_deny_applies_without_match() { ); } +#[test] +fn evaluate_policy_skips_inactive_rules() { + let state = parse_policy_state( + "deny", + &[PolicyRuleSnapshot { + rule_id: "security-policy:lan:wan:inactive-allow".to_string(), + name: "inactive-allow".to_string(), + from_zone: "lan".to_string(), + to_zone: "wan".to_string(), + scheduler_name: "workhours".to_string(), + inactive: true, + source_addresses: vec!["any".to_string()], + destination_addresses: vec!["any".to_string()], + applications: vec!["any".to_string()], + action: "permit".to_string(), + ..Default::default() + }], + &test_zone_name_to_id(), + ); + + assert_eq!( + state.rules[0].rule_id, + "security-policy:lan:wan:inactive-allow" + ); + assert_eq!(state.rules[0].scheduler_name, "workhours"); + assert!(state.rules[0].inactive); + assert_eq!( + evaluate_policy( + &state, + TEST_LAN_ZONE_ID, + TEST_WAN_ZONE_ID, + "10.0.61.100".parse().expect("src"), + "172.16.80.200".parse().expect("dst"), + PROTO_TCP, + 12345, + 5201, + ), + PolicyAction::Deny + ); + assert_eq!( + state.rules[0] + .hit_count + .load(std::sync::atomic::Ordering::Relaxed), + 0 + ); +} + +#[test] +fn inactive_rule_falls_through_to_next_match() { + let state = parse_policy_state( + "deny", + &[ + PolicyRuleSnapshot { + name: "inactive-deny".to_string(), + from_zone: "lan".to_string(), + to_zone: "wan".to_string(), + scheduler_name: "offhours".to_string(), + inactive: true, + source_addresses: vec!["any".to_string()], + destination_addresses: vec!["any".to_string()], + applications: vec!["any".to_string()], + action: "deny".to_string(), + ..Default::default() + }, + PolicyRuleSnapshot { + rule_id: "security-policy:lan:wan:active-allow".to_string(), + name: "active-allow".to_string(), + from_zone: "lan".to_string(), + to_zone: "wan".to_string(), + source_addresses: vec!["any".to_string()], + destination_addresses: vec!["any".to_string()], + applications: vec!["any".to_string()], + action: "permit".to_string(), + ..Default::default() + }, + ], + &test_zone_name_to_id(), + ); + + assert_eq!(state.rules[0].rule_id, "lan->wan/inactive-deny"); + assert_eq!( + evaluate_policy( + &state, + TEST_LAN_ZONE_ID, + TEST_WAN_ZONE_ID, + "10.0.61.100".parse().expect("src"), + "172.16.80.200".parse().expect("dst"), + PROTO_TCP, + 12345, + 5201, + ), + PolicyAction::Permit + ); + assert_eq!( + state.rules[0] + .hit_count + .load(std::sync::atomic::Ordering::Relaxed), + 0 + ); + assert_eq!( + state.rules[1] + .hit_count + .load(std::sync::atomic::Ordering::Relaxed), + 1 + ); +} + #[test] fn cidr_matches_ipv6() { let state = parse_policy_state( @@ -78,6 +186,7 @@ fn cidr_matches_ipv6() { applications: vec!["any".to_string()], application_terms: Vec::new(), action: "permit".to_string(), + ..Default::default() }], &test_zone_name_to_id(), ); @@ -114,6 +223,7 @@ fn named_application_matches_protocol_and_port() { destination_port: "80".to_string(), }], action: "permit".to_string(), + ..Default::default() }], &test_zone_name_to_id(), ); @@ -171,6 +281,7 @@ fn application_set_matches_any_expanded_term() { }, ], action: "permit".to_string(), + ..Default::default() }], &test_zone_name_to_id(), ); @@ -202,6 +313,7 @@ fn global_policy_matches_any_zone_pair() { applications: vec!["any".to_string()], application_terms: Vec::new(), action: "permit".to_string(), + ..Default::default() }], &test_zone_name_to_id(), ); @@ -248,6 +360,7 @@ fn global_policy_evaluated_after_zone_specific() { applications: vec!["any".to_string()], application_terms: Vec::new(), action: "deny".to_string(), + ..Default::default() }, PolicyRuleSnapshot { name: "global-allow".to_string(), @@ -258,6 +371,7 @@ fn global_policy_evaluated_after_zone_specific() { applications: vec!["any".to_string()], application_terms: Vec::new(), action: "permit".to_string(), + ..Default::default() }, ], &test_zone_name_to_id(), @@ -310,6 +424,7 @@ fn evaluate_policy_unknown_zone_pair_returns_default_action() { applications: vec!["any".into()], application_terms: Vec::new(), action: "permit".into(), + ..Default::default() }], &zones, ); @@ -353,6 +468,7 @@ fn malformed_only_input_yields_match_all_via_evaluate_policy() { applications: vec!["any".into()], application_terms: Vec::new(), action: "permit".into(), + ..Default::default() }], &zones, ); diff --git a/userspace-dp/src/protocol.rs b/userspace-dp/src/protocol.rs index 0e70fdbc2..c057cd873 100644 --- a/userspace-dp/src/protocol.rs +++ b/userspace-dp/src/protocol.rs @@ -593,11 +593,17 @@ pub(crate) struct FlowExportSnapshot { #[derive(Clone, Debug, Serialize, Deserialize, Default)] pub(crate) struct PolicyRuleSnapshot { + #[serde(rename = "rule_id", default)] + pub rule_id: String, pub name: String, #[serde(rename = "from_zone", default)] pub from_zone: String, #[serde(rename = "to_zone", default)] pub to_zone: String, + #[serde(rename = "scheduler_name", default)] + pub scheduler_name: String, + #[serde(default)] + pub inactive: bool, #[serde(rename = "source_addresses", default)] pub source_addresses: Vec, #[serde(rename = "destination_addresses", default)] @@ -2278,6 +2284,54 @@ mod tests { assert_eq!(back.v_min_throttles, status.v_min_throttles); } + #[test] + fn policy_rule_snapshot_scheduler_fields_round_trip() { + let snap = PolicyRuleSnapshot { + rule_id: "security-policy:trust:untrust:allow-web".into(), + name: "allow-web".into(), + from_zone: "trust".into(), + to_zone: "untrust".into(), + scheduler_name: "workhours".into(), + inactive: true, + source_addresses: vec!["any".into()], + destination_addresses: vec!["any".into()], + applications: vec!["junos-http".into()], + action: "permit".into(), + ..Default::default() + }; + + let value: serde_json::Value = + serde_json::to_value(&snap).expect("serialize PolicyRuleSnapshot to Value"); + assert_eq!(value["rule_id"], "security-policy:trust:untrust:allow-web"); + assert_eq!(value["scheduler_name"], "workhours"); + assert_eq!(value["inactive"], true); + + let back: PolicyRuleSnapshot = + serde_json::from_value(value).expect("deserialize PolicyRuleSnapshot"); + assert_eq!(back.rule_id, snap.rule_id); + assert_eq!(back.scheduler_name, snap.scheduler_name); + assert_eq!(back.inactive, snap.inactive); + } + + #[test] + fn policy_rule_snapshot_legacy_scheduler_fields_default() { + let legacy_json = r#"{ + "name": "allow-web", + "from_zone": "trust", + "to_zone": "untrust", + "source_addresses": ["any"], + "destination_addresses": ["any"], + "applications": ["junos-http"], + "action": "permit" + }"#; + + let snap: PolicyRuleSnapshot = + serde_json::from_str(legacy_json).expect("pre-#1378 PolicyRuleSnapshot decodes"); + assert_eq!(snap.rule_id, ""); + assert_eq!(snap.scheduler_name, ""); + assert_eq!(snap.inactive, false); + } + // #915 forward-compat: a pre-#915 CoSSchedulerSnapshot // payload (no `surplus_sharing` field) must decode with // `surplus_sharing == false` so the runtime sees the field From 572859fd39bdbd3fb051b8d9dcdc6eda7f87ec6b Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sat, 16 May 2026 19:48:34 -0700 Subject: [PATCH 02/20] userspace: publish policy scheduler state --- .../plan-1378-policy-schedulers.md | 24 ++-- pkg/config/compiler.go | 34 ++++++ pkg/config/parser_ast_test.go | 34 ++++-- pkg/dataplane/userspace/manager.go | 106 +++++++++++++++--- pkg/dataplane/userspace/manager_test.go | 96 ++++++++++++++++ pkg/dataplane/userspace/protocol.go | 2 +- pkg/dataplane/userspace/snapshot.go | 6 +- pkg/scheduler/README.md | 4 + userspace-dp/src/main_tests.rs | 49 ++++++++ userspace-dp/src/protocol.rs | 2 + userspace-dp/src/server/README.md | 5 + userspace-dp/src/server/handlers.rs | 92 ++++++++------- 12 files changed, 376 insertions(+), 78 deletions(-) diff --git a/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md b/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md index 109d6c736..02dda7103 100644 --- a/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md +++ b/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md @@ -8,9 +8,9 @@ scheduled policy rules activate and deactivate correctly without the eBPF ## Dependencies -- #1381 must land first. `UpdatePolicyScheduleState` currently dispatches - through the embedded eBPF `DataPlane`; userspace needs either the split - interface or an explicit stub/snapshot branch. +- The safe slice no longer waits on #1381. The userspace manager now shadows + `UpdatePolicyScheduleState` and republishes a userspace snapshot instead of + falling through to the embedded eBPF manager. ## Design @@ -22,10 +22,9 @@ compiled identity. Safe #1378 slice status: this change wires `rule_id`, `scheduler_name`, and `inactive` through userspace policy snapshots and Rust policy evaluation. The -Go snapshot builder can compute inactive bits when supplied scheduler active -state, but the normal daemon publish path still builds snapshots without that -state. Scheduler tick/restart/failover republish and missing-scheduler commit -errors remain gates before scheduled userspace policies are end-to-end enforced. +Go userspace publish path uses the last known scheduler active-state map during +config rebuilds, and scheduler ticks publish one coherent snapshot delta with +updated inactive bits. Missing scheduler references are compile errors. On scheduler state changes, publish one atomic userspace snapshot delta that contains the updated inactive bits for all affected rules. Do not issue @@ -38,9 +37,11 @@ existing sessions unless a separate `policy-rematch` feature is implemented. That matches Junos default behavior: schedulers block new lookups, not existing sessions. -Scheduler granularity is 60 seconds. Tests and docs must use deterministic -clock injection or windows that span multiple evaluator ticks; the earlier -30-second integration target is invalid. +Scheduler granularity is 60 seconds. The wall clock is used only by the Go +control-plane scheduler to decide the next active-state map; workers receive +booleans in the snapshot and never evaluate wall-clock time in the packet path. +Tests and docs must use deterministic scheduler inputs or windows that span +multiple evaluator ticks; the earlier 30-second integration target is invalid. Missing scheduler references fail closed as commit errors. Do not copy the existing eBPF behavior that can default missing scheduler state to active. @@ -50,6 +51,9 @@ existing eBPF behavior that can default missing scheduler state to active. - One inactive-branch per rule on miss path is acceptable; no scheduler clock evaluation occurs in the packet worker. - Snapshot publication is ArcSwap-atomic across all rule inactive bits. +- Snapshots carrying scheduler inactive bits require protocol version 2; the + Rust control server rejects older/unknown snapshot versions instead of + silently ignoring scheduling fields. - Hit counters are keyed by stable rule identity outside rebuilt rule structs so counters survive scheduler snapshot rebuilds. - Do not copy the existing eBPF indexing bug in diff --git a/pkg/config/compiler.go b/pkg/config/compiler.go index 41611ff78..88faef325 100644 --- a/pkg/config/compiler.go +++ b/pkg/config/compiler.go @@ -221,6 +221,9 @@ func compileExpanded(tree *ConfigTree) (*Config, error) { if err := validateThreeColorPolicersStrict(cfg.Firewall.ThreeColorPolicers); err != nil { return nil, err } + if err := validatePolicySchedulerReferencesStrict(cfg); err != nil { + return nil, err + } if warnings := ValidateConfig(cfg); len(warnings) > 0 { for _, w := range warnings { @@ -273,6 +276,37 @@ func validateThreeColorPolicersStrict(policers map[string]*ThreeColorPolicerConf return nil } +func validatePolicySchedulerReferencesStrict(cfg *Config) error { + if cfg == nil { + return nil + } + check := func(pol *Policy) error { + if pol == nil || pol.SchedulerName == "" { + return nil + } + if _, ok := cfg.Schedulers[pol.SchedulerName]; ok { + return nil + } + return fmt.Errorf("policy %q references undefined scheduler %q", pol.Name, pol.SchedulerName) + } + for _, zpp := range cfg.Security.Policies { + if zpp == nil { + continue + } + for _, pol := range zpp.Policies { + if err := check(pol); err != nil { + return err + } + } + } + for _, pol := range cfg.Security.GlobalPolicies { + if err := check(pol); err != nil { + return err + } + } + return nil +} + func validateClassOfServiceStrict(cos *ClassOfServiceConfig) error { if cos == nil { return nil diff --git a/pkg/config/parser_ast_test.go b/pkg/config/parser_ast_test.go index f94d39a66..83bddbca4 100644 --- a/pkg/config/parser_ast_test.go +++ b/pkg/config/parser_ast_test.go @@ -1473,7 +1473,6 @@ security { policy sched-test { match { source-address any; destination-address any; application any; } then { permit; } - scheduler-name missing-sched; } } } @@ -1488,7 +1487,7 @@ security { if err != nil { t.Fatalf("CompileConfig: %v", err) } - var foundIfaceWarn, foundPoolWarn, foundSchedWarn bool + var foundIfaceWarn, foundPoolWarn bool for _, w := range cfg.Warnings { if strings.Contains(w, "missing-iface") && strings.Contains(w, "not in interfaces") { foundIfaceWarn = true @@ -1496,9 +1495,6 @@ security { if strings.Contains(w, "missing-pool") && strings.Contains(w, "not defined") { foundPoolWarn = true } - if strings.Contains(w, "missing-sched") && strings.Contains(w, "not defined") { - foundSchedWarn = true - } } if !foundIfaceWarn { t.Errorf("missing warning for zone referencing unconfigured interface, got: %v", cfg.Warnings) @@ -1506,8 +1502,32 @@ security { if !foundPoolWarn { t.Errorf("missing warning for SNAT referencing undefined pool, got: %v", cfg.Warnings) } - if !foundSchedWarn { - t.Errorf("missing warning for policy referencing undefined scheduler, got: %v", cfg.Warnings) +} + +func TestPolicySchedulerMissingReferenceFailsCompile(t *testing.T) { + input := `security { + policies { + from-zone trust to-zone untrust { + policy sched-test { + match { source-address any; destination-address any; application any; } + then { permit; } + scheduler-name missing-sched; + } + } + } +} +` + parser := NewParser(input) + tree, errs := parser.Parse() + if len(errs) > 0 { + t.Fatalf("parse errors: %v", errs) + } + _, err := CompileConfig(tree) + if err == nil { + t.Fatal("CompileConfig succeeded, want missing scheduler error") + } + if !strings.Contains(err.Error(), `policy "sched-test" references undefined scheduler "missing-sched"`) { + t.Fatalf("CompileConfig error = %v", err) } } diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index a9df84bfd..e48142102 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -58,28 +58,29 @@ type Manager struct { dataplane.DataPlane inner *dataplane.Manager - mu sync.Mutex - sessionMu sync.Mutex // separate lock for session sync requests (Phase 3) - proc *exec.Cmd - cfg config.UserspaceConfig - clusterHA bool - generation uint64 - syncCancel context.CancelFunc - lastStatus ProcessStatus - lastSnapshot *ConfigSnapshot - haGroups map[int]HAGroupStatus - lastIngressIfaces []uint32 - lastRSTv4 []netip.Addr - lastRSTv6 []netip.Addr - lastRSTAttempt time.Time - lastRSTInstallOK bool - lastSnapshotHash [32]byte // content hash of last published snapshot (excludes volatile fields) + mu sync.Mutex + sessionMu sync.Mutex // separate lock for session sync requests (Phase 3) + proc *exec.Cmd + cfg config.UserspaceConfig + clusterHA bool + generation uint64 + syncCancel context.CancelFunc + lastStatus ProcessStatus + lastSnapshot *ConfigSnapshot + policySchedulerActive map[string]bool + haGroups map[int]HAGroupStatus + lastIngressIfaces []uint32 + lastRSTv4 []netip.Addr + lastRSTv6 []netip.Addr + lastRSTAttempt time.Time + lastRSTInstallOK bool + lastSnapshotHash [32]byte // content hash of last published snapshot (excludes volatile fields) // #1197: O(1) neighbor lookup index for the listener hot path. // Keyed by (ifindex, ip-string). Rebuilt whenever lastSnapshot.Neighbors // is replaced. Read under m.mu (existing snapshot lock). neighborIndex map[neighborIndexKey]*NeighborSnapshot // #1197: ifindex set for listener filter; rebuilt on config commit. - monitoredIfindexes map[int]struct{} + monitoredIfindexes map[int]struct{} lastBindingIndices []uint32 neighborsPrewarmed bool ctrlEnableAt time.Time @@ -156,6 +157,23 @@ func New() *Manager { } } +func copyPolicySchedulerActiveState(activeState map[string]bool) map[string]bool { + if activeState == nil { + return nil + } + out := make(map[string]bool, len(activeState)) + for name, active := range activeState { + out[name] = active + } + return out +} + +func (m *Manager) policySchedulerActiveStateSnapshot() map[string]bool { + m.mu.Lock() + defer m.mu.Unlock() + return copyPolicySchedulerActiveState(m.policySchedulerActive) +} + // EventStream returns the event stream instance, or nil if not available. func (m *Manager) EventStream() *EventStream { m.mu.Lock() @@ -265,7 +283,8 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) return nil, err } ucfg := deriveUserspaceConfig(cfg) - snap := buildSnapshot(cfg, ucfg, m.bumpGeneration(), m.readFIBGeneration()) + activeState := m.policySchedulerActiveStateSnapshot() + snap := buildSnapshotWithSchedulerState(cfg, ucfg, m.bumpGeneration(), m.readFIBGeneration(), activeState) m.syncInterfaceAttachments(result, snap) m.mu.Lock() @@ -381,6 +400,57 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) return result, nil } +// UpdatePolicyScheduleState republishes the userspace policy snapshot with one +// coherent inactive-bit view. This shadows the embedded eBPF manager method; +// scheduled userspace policies must not update the policy_rules BPF map. +func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[string]bool) { + activeCopy := copyPolicySchedulerActiveState(activeState) + + m.mu.Lock() + defer m.mu.Unlock() + + m.policySchedulerActive = activeCopy + if cfg == nil { + if m.lastSnapshot == nil { + return + } + cfg = m.lastSnapshot.Config + } + if cfg == nil || m.lastSnapshot == nil { + return + } + + next := *m.lastSnapshot + m.generation++ + next.Generation = m.generation + next.FIBGeneration = m.readFIBGeneration() + next.GeneratedAt = time.Now().UTC() + next.Config = cfg + next.Policies = buildPolicySnapshotsWithSchedulerState(cfg, activeCopy) + m.lastSnapshot = &next + + if m.proc == nil || m.proc.Process == nil { + return + } + publishSnap := next + publishSnap.Neighbors = filterPublishableNeighbors(next.Neighbors) + var status ProcessStatus + if err := m.requestLocked(ControlRequest{Type: "apply_snapshot", Snapshot: &publishSnap}, &status); err != nil { + slog.Warn("userspace: failed to publish policy scheduler state", "err", err) + return + } + m.rebuildNeighborIndex() + m.rebuildMonitoredIfindexes() + m.publishedSnapshot = next.Generation + m.publishedPlanKey = snapshotBindingPlanKey(&next) + if h, ok := snapshotContentHash(&next); ok { + m.lastSnapshotHash = h + } + if err := m.applyHelperStatusLocked(&status); err != nil { + slog.Warn("userspace: failed to sync helper status after policy scheduler publish", "err", err) + } +} + func (m *Manager) syncInterfaceAttachments(result *dataplane.CompileResult, snapshot *ConfigSnapshot) { if result == nil { return diff --git a/pkg/dataplane/userspace/manager_test.go b/pkg/dataplane/userspace/manager_test.go index 2e993fdf8..535edf310 100644 --- a/pkg/dataplane/userspace/manager_test.go +++ b/pkg/dataplane/userspace/manager_test.go @@ -2655,6 +2655,102 @@ func TestBuildPolicySnapshotsRoundTripsSchedulerInactiveAndRuleID(t *testing.T) } } +func TestUpdatePolicyScheduleStatePublishesUserspaceSnapshot(t *testing.T) { + dir := t.TempDir() + controlSock := filepath.Join(dir, "control.sock") + ln, err := net.Listen("unix", controlSock) + if err != nil { + t.Fatalf("listen control socket: %v", err) + } + defer ln.Close() + + reqCh := make(chan ControlRequest, 1) + done := make(chan struct{}, 1) + go func() { + conn, err := ln.Accept() + if err != nil { + return + } + defer conn.Close() + var req ControlRequest + if err := json.NewDecoder(conn).Decode(&req); err != nil { + return + } + reqCh <- req + _ = json.NewEncoder(conn).Encode(ControlResponse{ + OK: true, + Status: &ProcessStatus{ + Enabled: true, + LastSnapshotGeneration: req.Snapshot.Generation, + LastFIBGeneration: req.Snapshot.FIBGeneration, + }, + }) + done <- struct{}{} + }() + + cfg := &config.Config{} + cfg.Security.Policies = []*config.ZonePairPolicies{{ + FromZone: "trust", + ToZone: "untrust", + Policies: []*config.Policy{{ + Name: "scheduled-allow", + SchedulerName: "workhours", + Match: config.PolicyMatch{ + SourceAddresses: []string{"any"}, + DestinationAddresses: []string{"any"}, + Applications: []string{"any"}, + }, + Action: config.PolicyPermit, + }}, + }} + cfg.Schedulers = map[string]*config.SchedulerConfig{ + "workhours": {Name: "workhours"}, + } + + m := New() + m.proc = &exec.Cmd{Process: &os.Process{Pid: os.Getpid()}} + m.cfg.ControlSocket = controlSock + m.generation = 7 + m.lastSnapshot = buildSnapshot(cfg, config.UserspaceConfig{ControlSocket: controlSock}, 7, 0) + + m.UpdatePolicyScheduleState(cfg, map[string]bool{"workhours": false}) + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for apply_snapshot publish") + } + req := <-reqCh + if req.Type != "apply_snapshot" { + t.Fatalf("request type = %q, want apply_snapshot", req.Type) + } + if req.Snapshot == nil { + t.Fatal("apply_snapshot missing snapshot") + } + if req.Snapshot.Version != ProtocolVersion { + t.Fatalf("snapshot version = %d, want %d", req.Snapshot.Version, ProtocolVersion) + } + if req.Snapshot.Generation <= 7 { + t.Fatalf("snapshot generation = %d, want > 7", req.Snapshot.Generation) + } + if len(req.Snapshot.Policies) != 1 { + t.Fatalf("policy count = %d, want 1", len(req.Snapshot.Policies)) + } + pol := req.Snapshot.Policies[0] + if pol.RuleID != "trust->untrust/scheduled-allow" { + t.Fatalf("policy rule_id = %q", pol.RuleID) + } + if pol.SchedulerName != "workhours" { + t.Fatalf("scheduler_name = %q", pol.SchedulerName) + } + if !pol.Inactive { + t.Fatalf("inactive = false, want true for inactive scheduler state") + } + if m.lastSnapshot == nil || len(m.lastSnapshot.Policies) != 1 || !m.lastSnapshot.Policies[0].Inactive { + t.Fatalf("manager lastSnapshot did not keep inactive policy bit: %+v", m.lastSnapshot) + } +} + func TestUserspaceSupportsScreenProfilesBasic(t *testing.T) { cfg := &config.Config{} cfg.Security.Screen = map[string]*config.ScreenProfile{ diff --git a/pkg/dataplane/userspace/protocol.go b/pkg/dataplane/userspace/protocol.go index 4fbcaf042..b95c17d72 100644 --- a/pkg/dataplane/userspace/protocol.go +++ b/pkg/dataplane/userspace/protocol.go @@ -7,7 +7,7 @@ import ( ) const ( - ProtocolVersion = 1 + ProtocolVersion = 2 TypeUserspace = "userspace" ) diff --git a/pkg/dataplane/userspace/snapshot.go b/pkg/dataplane/userspace/snapshot.go index 865fed44a..4b1be0611 100644 --- a/pkg/dataplane/userspace/snapshot.go +++ b/pkg/dataplane/userspace/snapshot.go @@ -21,6 +21,10 @@ import ( ) func buildSnapshot(cfg *config.Config, ucfg config.UserspaceConfig, generation uint64, fibGeneration uint32) *ConfigSnapshot { + return buildSnapshotWithSchedulerState(cfg, ucfg, generation, fibGeneration, nil) +} + +func buildSnapshotWithSchedulerState(cfg *config.Config, ucfg config.UserspaceConfig, generation uint64, fibGeneration uint32, activeState map[string]bool) *ConfigSnapshot { if cfg == nil { return &ConfigSnapshot{ Version: ProtocolVersion, @@ -50,7 +54,7 @@ func buildSnapshot(cfg *config.Config, ucfg config.UserspaceConfig, generation u Routes: buildRouteSnapshots(cfg, interfaces), Flow: buildFlowSnapshot(cfg), DefaultPolicy: policyActionString(cfg.Security.DefaultPolicy), - Policies: buildPolicySnapshots(cfg), + Policies: buildPolicySnapshotsWithSchedulerState(cfg, activeState), SourceNAT: buildSourceNATSnapshots(cfg), StaticNAT: buildStaticNATSnapshots(cfg), DestinationNAT: buildDestinationNATSnapshots(cfg), diff --git a/pkg/scheduler/README.md b/pkg/scheduler/README.md index 1358a376b..e91f62847 100644 --- a/pkg/scheduler/README.md +++ b/pkg/scheduler/README.md @@ -33,3 +33,7 @@ during specific windows. precision through this package. - `updateFn` receives the **full** active-state map, not just the changed entries. Callers compute their own diff if they care. +- The scheduler uses wall-clock time only in the control plane to evaluate + Junos time windows. Packet workers must consume published active/inactive + booleans from the userspace snapshot and must not evaluate scheduler time in + the hot path. diff --git a/userspace-dp/src/main_tests.rs b/userspace-dp/src/main_tests.rs index 108b8406c..964c975c4 100644 --- a/userspace-dp/src/main_tests.rs +++ b/userspace-dp/src/main_tests.rs @@ -884,6 +884,55 @@ fn binding_counters_snapshot_serializes_with_expected_wire_keys() { assert_eq!(round, snap); } +#[test] +fn apply_snapshot_rejects_unsupported_protocol_version() { + let (mut client, server) = std::os::unix::net::UnixStream::pair().expect("control socket pair"); + let state = Arc::new(Mutex::new(ServerState { + status: ProcessStatus::default(), + snapshot: None, + afxdp: afxdp::Coordinator::new(), + state_writer: Arc::new(StateWriter::new()), + })); + let running = Arc::new(AtomicBool::new(true)); + let state_file = format!( + "{}/xpf-policy-scheduler-version-gate-{}.json", + std::env::temp_dir().display(), + std::process::id() + ); + let handle = { + let state = state.clone(); + let running = running.clone(); + std::thread::spawn(move || handle_stream(server, &state_file, state, running)) + }; + + let request = ControlRequest { + request_type: "apply_snapshot".to_string(), + snapshot: Some(ConfigSnapshot { + version: CONFIG_SNAPSHOT_PROTOCOL_VERSION - 1, + generated_at: Utc::now(), + ..ConfigSnapshot::default() + }), + ..ControlRequest::default() + }; + serde_json::to_writer(&mut client, &request).expect("write request"); + std::io::Write::write_all(&mut client, b"\n").expect("newline"); + + let response: ControlResponse = + serde_json::from_reader(std::io::BufReader::new(client)).expect("read response"); + assert!(!response.ok); + assert!( + response + .error + .contains("unsupported snapshot protocol version"), + "unexpected error: {}", + response.error + ); + handle + .join() + .expect("handler thread") + .expect("handler result"); +} + #[test] fn binding_counters_snapshot_tolerates_pre_split_wire() { // #804: a helper snapshot that pre-dates the diff --git a/userspace-dp/src/protocol.rs b/userspace-dp/src/protocol.rs index c057cd873..959eb1b04 100644 --- a/userspace-dp/src/protocol.rs +++ b/userspace-dp/src/protocol.rs @@ -7,6 +7,8 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; +pub(crate) const CONFIG_SNAPSHOT_PROTOCOL_VERSION: i32 = 2; + // --------------------------------------------------------------------------- // Snapshot schema // --------------------------------------------------------------------------- diff --git a/userspace-dp/src/server/README.md b/userspace-dp/src/server/README.md index 6a819b7c1..32d7a3286 100644 --- a/userspace-dp/src/server/README.md +++ b/userspace-dp/src/server/README.md @@ -27,6 +27,11 @@ The shapes are mirrored in `pkg/dataplane/userspace/protocol.go` on the Go side; **the JSON tags ARE the contract** — changing one without updating the other breaks the helper. +`ConfigSnapshot.version` is a compatibility gate, not just documentation. +The helper accepts only the current snapshot protocol version; this prevents a +new daemon from publishing fields such as policy-scheduler inactive bits to a +helper that would silently ignore them. + ## Reconciliation `replan_queues` derives the binding plan from the current diff --git a/userspace-dp/src/server/handlers.rs b/userspace-dp/src/server/handlers.rs index ef5c51e3f..1d2b52e1e 100644 --- a/userspace-dp/src/server/handlers.rs +++ b/userspace-dp/src/server/handlers.rs @@ -52,51 +52,61 @@ pub(crate) fn handle_stream( "ping" | "status" => {} "apply_snapshot" => { if let Some(snapshot) = request.snapshot { - eprintln!( - "CTRL_REQ: apply_snapshot generation={} fib_generation={} forwarding_armed_before={}", - snapshot.generation, snapshot.fib_generation, guard.status.forwarding_armed - ); - guard.status.last_snapshot_generation = snapshot.generation; - guard.status.last_fib_generation = snapshot.fib_generation; - guard.status.last_snapshot_at = Some(snapshot.generated_at); - guard.status.capabilities = snapshot.capabilities.clone(); - let existing_bindings = guard.status.bindings.clone(); - let previous_snapshot = guard.snapshot.as_ref(); - let same_plan = previous_snapshot.is_some_and(|prev| { - let prev_key = snapshot_binding_plan_key(prev); - let next_key = snapshot_binding_plan_key(&snapshot); - let same = prev_key == next_key; - if !same { - eprintln!( - "CTRL_REQ: binding plan changed prev_key={} next_key={}", - prev_key, next_key - ); - } - same - }); - if same_plan { - guard.afxdp.refresh_runtime_snapshot(&snapshot); - guard.snapshot = Some(snapshot); - refresh_status(&mut guard); - persist_state = true; + if snapshot.version != CONFIG_SNAPSHOT_PROTOCOL_VERSION { + response.ok = false; + response.error = format!( + "unsupported snapshot protocol version {} (want {})", + snapshot.version, CONFIG_SNAPSHOT_PROTOCOL_VERSION + ); } else { - let defer_workers = snapshot.defer_workers; - guard.snapshot = Some(snapshot); - let replanned = replan_queues( - guard.snapshot.as_ref(), - guard.status.workers, - &existing_bindings, + eprintln!( + "CTRL_REQ: apply_snapshot generation={} fib_generation={} forwarding_armed_before={}", + snapshot.generation, + snapshot.fib_generation, + guard.status.forwarding_armed ); - guard.status.bindings = replanned; - if defer_workers { - eprintln!( - "CTRL_REQ: apply_snapshot defer_workers=true — skipping worker spawn (RETH MAC pending)" - ); + guard.status.last_snapshot_generation = snapshot.generation; + guard.status.last_fib_generation = snapshot.fib_generation; + guard.status.last_snapshot_at = Some(snapshot.generated_at); + guard.status.capabilities = snapshot.capabilities.clone(); + let existing_bindings = guard.status.bindings.clone(); + let previous_snapshot = guard.snapshot.as_ref(); + let same_plan = previous_snapshot.is_some_and(|prev| { + let prev_key = snapshot_binding_plan_key(prev); + let next_key = snapshot_binding_plan_key(&snapshot); + let same = prev_key == next_key; + if !same { + eprintln!( + "CTRL_REQ: binding plan changed prev_key={} next_key={}", + prev_key, next_key + ); + } + same + }); + if same_plan { + guard.afxdp.refresh_runtime_snapshot(&snapshot); + guard.snapshot = Some(snapshot); + refresh_status(&mut guard); + persist_state = true; } else { - reconcile_status_bindings(&mut guard); + let defer_workers = snapshot.defer_workers; + guard.snapshot = Some(snapshot); + let replanned = replan_queues( + guard.snapshot.as_ref(), + guard.status.workers, + &existing_bindings, + ); + guard.status.bindings = replanned; + if defer_workers { + eprintln!( + "CTRL_REQ: apply_snapshot defer_workers=true — skipping worker spawn (RETH MAC pending)" + ); + } else { + reconcile_status_bindings(&mut guard); + } + refresh_status(&mut guard); + persist_state = true; } - refresh_status(&mut guard); - persist_state = true; } } else { response.ok = false; From 9d369d964e825adf0a56e3de0b6b1a4216b90422 Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sat, 16 May 2026 20:25:30 -0700 Subject: [PATCH 03/20] userspace: serialize policy scheduler snapshots --- .../plan-1378-policy-schedulers.md | 12 ++- pkg/daemon/daemon.go | 2 + pkg/daemon/daemon_apply.go | 6 ++ pkg/daemon/daemon_run.go | 16 ---- pkg/daemon/daemon_scheduler.go | 69 ++++++++++++++++++ pkg/dataplane/userspace/manager.go | 59 +++++++++++++++ pkg/dataplane/userspace/manager_test.go | 73 +++++++++++++++++++ pkg/dataplane/userspace/process.go | 15 +++- pkg/dataplane/userspace/protocol.go | 45 ++++++------ pkg/scheduler/README.md | 7 ++ pkg/scheduler/scheduler.go | 67 ++++++++++++----- pkg/scheduler/scheduler_test.go | 21 ++++++ userspace-dp/src/protocol.rs | 2 + userspace-dp/src/server/README.md | 3 + userspace-dp/src/server/lifecycle.rs | 1 + 15 files changed, 337 insertions(+), 61 deletions(-) create mode 100644 pkg/daemon/daemon_scheduler.go diff --git a/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md b/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md index 02dda7103..366478024 100644 --- a/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md +++ b/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md @@ -22,9 +22,11 @@ compiled identity. Safe #1378 slice status: this change wires `rule_id`, `scheduler_name`, and `inactive` through userspace policy snapshots and Rust policy evaluation. The -Go userspace publish path uses the last known scheduler active-state map during -config rebuilds, and scheduler ticks publish one coherent snapshot delta with -updated inactive bits. Missing scheduler references are compile errors. +daemon reconciles the scheduler lifecycle on every committed config while +holding the apply semaphore; userspace snapshot rebuilds are seeded with that +same active-state map, and runtime scheduler ticks acquire the same semaphore +before publishing one coherent snapshot delta. Missing scheduler references are +compile errors. On scheduler state changes, publish one atomic userspace snapshot delta that contains the updated inactive bits for all affected rules. Do not issue @@ -53,7 +55,9 @@ existing eBPF behavior that can default missing scheduler state to active. - Snapshot publication is ArcSwap-atomic across all rule inactive bits. - Snapshots carrying scheduler inactive bits require protocol version 2; the Rust control server rejects older/unknown snapshot versions instead of - silently ignoring scheduling fields. + silently ignoring scheduling fields, and status exposes the helper's supported + snapshot protocol so new Go refuses to publish scheduled-policy snapshots to + an old helper before the fail-open path can occur. - Hit counters are keyed by stable rule identity outside rebuilt rule structs so counters survive scheduler snapshot rebuilds. - Do not copy the existing eBPF indexing bug in diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 174194e13..80119f732 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -79,6 +79,8 @@ type Daemon struct { snmpAgent *snmp.Agent lldpMgr *lldp.Manager scheduler *scheduler.Scheduler + schedulerCancel context.CancelFunc + policySchedulerEpoch atomic.Uint64 cluster *cluster.Manager sessionSync *cluster.SessionSync syncBulkPrimed atomic.Bool diff --git a/pkg/daemon/daemon_apply.go b/pkg/daemon/daemon_apply.go index db21af78c..c2311b20e 100644 --- a/pkg/daemon/daemon_apply.go +++ b/pkg/daemon/daemon_apply.go @@ -420,6 +420,9 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) { } } + policySchedulerActiveState := d.reconcilePolicySchedulerLocked(cfg) + d.seedPolicySchedulerActiveStateLocked(policySchedulerActiveState) + // 2. Compile eBPF dataplane var compileResult *dataplane.CompileResult if d.dp != nil { @@ -430,6 +433,9 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) { d.recordCompileSuccess() } } + if d.dp != nil && policySchedulerActiveState != nil && compileResult != nil { + d.dp.UpdatePolicyScheduleState(cfg, policySchedulerActiveState) + } // Clear defer flag after Compile so subsequent recompiles (where MAC // is already set) don't skip workers. diff --git a/pkg/daemon/daemon_run.go b/pkg/daemon/daemon_run.go index 0bec2451f..3d175db5b 100644 --- a/pkg/daemon/daemon_run.go +++ b/pkg/daemon/daemon_run.go @@ -38,7 +38,6 @@ import ( "github.com/psaab/xpf/pkg/ra" "github.com/psaab/xpf/pkg/routing" "github.com/psaab/xpf/pkg/rpm" - "github.com/psaab/xpf/pkg/scheduler" "github.com/psaab/xpf/pkg/snmp" "github.com/psaab/xpf/pkg/vrrp" ) @@ -604,21 +603,6 @@ func (d *Daemon) Run(ctx context.Context) error { } } - // Start policy scheduler if configured. - if cfg := d.store.ActiveConfig(); cfg != nil && len(cfg.Schedulers) > 0 && d.dp != nil { - d.scheduler = scheduler.New(cfg.Schedulers, func(activeState map[string]bool) { - slog.Info("scheduler state changed, updating policy rules") - if activeCfg := d.store.ActiveConfig(); activeCfg != nil { - d.dp.UpdatePolicyScheduleState(activeCfg, activeState) - } - }) - wg.Add(1) - go func() { - defer wg.Done() - d.scheduler.Run(ctx) - }() - } - // Start periodic neighbor resolution to keep ARP entries warm for // known forwarding targets (DNAT pools, gateways, address-book hosts). // Without this, bpf_fib_lookup returns NO_NEIGH when ARP expires, diff --git a/pkg/daemon/daemon_scheduler.go b/pkg/daemon/daemon_scheduler.go new file mode 100644 index 000000000..b412ecf07 --- /dev/null +++ b/pkg/daemon/daemon_scheduler.go @@ -0,0 +1,69 @@ +package daemon + +import ( + "context" + "log/slog" + "time" + + "github.com/psaab/xpf/pkg/config" + "github.com/psaab/xpf/pkg/scheduler" +) + +type policySchedulerActiveStateSetter interface { + SetPolicySchedulerActiveState(map[string]bool) +} + +// reconcilePolicySchedulerLocked runs under applySem. It makes the scheduler +// lifecycle follow committed config instead of only daemon startup, and returns +// the active-state map that must be used for the same apply transaction. +func (d *Daemon) reconcilePolicySchedulerLocked(cfg *config.Config) map[string]bool { + if d.schedulerCancel != nil { + d.schedulerCancel() + d.schedulerCancel = nil + } + d.scheduler = nil + epoch := d.policySchedulerEpoch.Add(1) + + if cfg == nil || len(cfg.Schedulers) == 0 { + return nil + } + + sched, activeState := scheduler.NewPrimed(cfg.Schedulers, func(activeState map[string]bool) { + d.publishPolicyScheduleState(epoch, activeState) + }, time.Now()) + d.scheduler = sched + + if d.daemonCtx != nil { + ctx, cancel := context.WithCancel(d.daemonCtx) + d.schedulerCancel = cancel + go sched.Run(ctx) + } + return activeState +} + +func (d *Daemon) publishPolicyScheduleState(epoch uint64, activeState map[string]bool) { + if err := d.applySem.Acquire(context.Background(), 1); err != nil { + slog.Warn("scheduler: failed to acquire apply semaphore", "err", err) + return + } + defer d.applySem.Release(1) + + if epoch != d.policySchedulerEpoch.Load() { + return + } + cfg := d.store.ActiveConfig() + if cfg == nil || d.dp == nil { + return + } + d.seedPolicySchedulerActiveStateLocked(activeState) + d.dp.UpdatePolicyScheduleState(cfg, activeState) +} + +func (d *Daemon) seedPolicySchedulerActiveStateLocked(activeState map[string]bool) { + if d.dp == nil { + return + } + if setter, ok := d.dp.(policySchedulerActiveStateSetter); ok { + setter.SetPolicySchedulerActiveState(activeState) + } +} diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index e48142102..af45f4d50 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -174,6 +174,15 @@ func (m *Manager) policySchedulerActiveStateSnapshot() map[string]bool { return copyPolicySchedulerActiveState(m.policySchedulerActive) } +// SetPolicySchedulerActiveState seeds the active-state map used by the next +// full snapshot build. The daemon calls this while holding applySem so config +// commits and scheduler flips cannot publish hybrid policy snapshots. +func (m *Manager) SetPolicySchedulerActiveState(activeState map[string]bool) { + m.mu.Lock() + defer m.mu.Unlock() + m.policySchedulerActive = copyPolicySchedulerActiveState(activeState) +} + // EventStream returns the event stream instance, or nil if not available. func (m *Manager) EventStream() *EventStream { m.mu.Lock() @@ -358,6 +367,9 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) if err := m.ensureProcessLocked(ucfg); err != nil { return result, err } + if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { + return result, err + } if m.deferWorkers { snap.DeferWorkers = true } @@ -432,6 +444,10 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ if m.proc == nil || m.proc.Process == nil { return } + if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { + slog.Warn("userspace: refusing policy scheduler publish to incompatible helper", "err", err) + return + } publishSnap := next publishSnap.Neighbors = filterPublishableNeighbors(next.Neighbors) var status ProcessStatus @@ -492,6 +508,49 @@ func (m *Manager) readFIBGeneration() uint32 { return gen } +func configHasScheduledPolicy(cfg *config.Config) bool { + if cfg == nil { + return false + } + for _, zpp := range cfg.Security.Policies { + if zpp == nil { + continue + } + for _, pol := range zpp.Policies { + if pol != nil && pol.SchedulerName != "" { + return true + } + } + } + for _, pol := range cfg.Security.GlobalPolicies { + if pol != nil && pol.SchedulerName != "" { + return true + } + } + return false +} + +func (m *Manager) ensurePolicySchedulerProtocolLocked(cfg *config.Config) error { + if !configHasScheduledPolicy(cfg) { + return nil + } + if m.lastStatus.ConfigSnapshotProtocolVersion >= ProtocolVersion { + return nil + } + var status ProcessStatus + if err := m.requestLocked(ControlRequest{Type: "status"}, &status); err == nil { + m.lastStatus.ConfigSnapshotProtocolVersion = status.ConfigSnapshotProtocolVersion + if status.ConfigSnapshotProtocolVersion >= ProtocolVersion { + return nil + } + } + return fmt.Errorf( + "userspace helper config snapshot protocol version %d does not support policy scheduler snapshots (want >= %d)", + m.lastStatus.ConfigSnapshotProtocolVersion, + ProtocolVersion, + ) +} + // bpfKtimeNs returns the current CLOCK_BOOTTIME in nanoseconds, matching // the clock used by BPF's bpf_ktime_get_ns() for session Created timestamps. func (m *Manager) bpfKtimeNs() uint64 { diff --git a/pkg/dataplane/userspace/manager_test.go b/pkg/dataplane/userspace/manager_test.go index 535edf310..a006053fe 100644 --- a/pkg/dataplane/userspace/manager_test.go +++ b/pkg/dataplane/userspace/manager_test.go @@ -2712,6 +2712,7 @@ func TestUpdatePolicyScheduleStatePublishesUserspaceSnapshot(t *testing.T) { m.cfg.ControlSocket = controlSock m.generation = 7 m.lastSnapshot = buildSnapshot(cfg, config.UserspaceConfig{ControlSocket: controlSock}, 7, 0) + m.lastStatus.ConfigSnapshotProtocolVersion = ProtocolVersion m.UpdatePolicyScheduleState(cfg, map[string]bool{"workhours": false}) @@ -2751,6 +2752,78 @@ func TestUpdatePolicyScheduleStatePublishesUserspaceSnapshot(t *testing.T) { } } +func TestUpdatePolicyScheduleStateRefusesOldHelperForScheduledPolicies(t *testing.T) { + dir := t.TempDir() + controlSock := filepath.Join(dir, "control.sock") + ln, err := net.Listen("unix", controlSock) + if err != nil { + t.Fatalf("listen control socket: %v", err) + } + defer ln.Close() + + reqCh := make(chan ControlRequest, 2) + done := make(chan struct{}, 1) + go func() { + conn, err := ln.Accept() + if err != nil { + return + } + defer conn.Close() + var req ControlRequest + if err := json.NewDecoder(conn).Decode(&req); err != nil { + return + } + reqCh <- req + _ = json.NewEncoder(conn).Encode(ControlResponse{ + OK: true, + Status: &ProcessStatus{ + PID: 1234, + }, + }) + done <- struct{}{} + }() + + cfg := &config.Config{} + cfg.Security.Policies = []*config.ZonePairPolicies{{ + FromZone: "trust", + ToZone: "untrust", + Policies: []*config.Policy{{ + Name: "scheduled-allow", + SchedulerName: "workhours", + Action: config.PolicyPermit, + }}, + }} + cfg.Schedulers = map[string]*config.SchedulerConfig{ + "workhours": {Name: "workhours"}, + } + + m := New() + m.proc = &exec.Cmd{Process: &os.Process{Pid: os.Getpid()}} + m.cfg.ControlSocket = controlSock + m.generation = 7 + m.lastSnapshot = buildSnapshot(cfg, config.UserspaceConfig{ControlSocket: controlSock}, 7, 0) + + m.UpdatePolicyScheduleState(cfg, map[string]bool{"workhours": false}) + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for status probe") + } + req := <-reqCh + if req.Type != "status" { + t.Fatalf("first request type = %q, want status", req.Type) + } + select { + case req := <-reqCh: + t.Fatalf("unexpected apply request to old helper: %+v", req) + default: + } + if m.lastSnapshot == nil || len(m.lastSnapshot.Policies) != 1 || !m.lastSnapshot.Policies[0].Inactive { + t.Fatalf("manager should keep local inactive state even when publish is refused: %+v", m.lastSnapshot) + } +} + func TestUserspaceSupportsScreenProfilesBasic(t *testing.T) { cfg := &config.Config{} cfg.Security.Screen = map[string]*config.ScreenProfile{ diff --git a/pkg/dataplane/userspace/process.go b/pkg/dataplane/userspace/process.go index 778ddaa61..a4162c248 100644 --- a/pkg/dataplane/userspace/process.go +++ b/pkg/dataplane/userspace/process.go @@ -24,7 +24,11 @@ import ( func (m *Manager) ensureProcessLocked(cfg config.UserspaceConfig) error { tuneSocketBuffers() if m.proc != nil && m.proc.Process != nil && configEqual(m.cfg, cfg) { - if err := m.requestLocked(ControlRequest{Type: "ping"}, nil); err == nil { + var status ProcessStatus + if err := m.requestLocked(ControlRequest{Type: "ping"}, &status); err == nil { + if status.PID != 0 || status.ConfigSnapshotProtocolVersion != 0 { + m.lastStatus = status + } return nil } slog.Warn("userspace dataplane helper unhealthy, restarting") @@ -108,7 +112,11 @@ func (m *Manager) ensureProcessLocked(cfg config.UserspaceConfig) error { deadline := time.Now().Add(5 * time.Second) for time.Now().Before(deadline) { if _, err := os.Stat(cfg.ControlSocket); err == nil { - if err := m.requestLocked(ControlRequest{Type: "ping"}, nil); err == nil { + var status ProcessStatus + if err := m.requestLocked(ControlRequest{Type: "ping"}, &status); err == nil { + if status.PID != 0 || status.ConfigSnapshotProtocolVersion != 0 { + m.lastStatus = status + } slog.Info("userspace dataplane helper started", "pid", cmd.Process.Pid, "socket", cfg.ControlSocket) return nil } @@ -321,6 +329,9 @@ func (m *Manager) syncSnapshotLocked() error { // for parity with update_neighbors path. publishSnap := *m.lastSnapshot publishSnap.Neighbors = filterPublishableNeighbors(m.lastSnapshot.Neighbors) + if err := m.ensurePolicySchedulerProtocolLocked(publishSnap.Config); err != nil { + return err + } var status ProcessStatus if err := m.requestLocked(ControlRequest{Type: "apply_snapshot", Snapshot: &publishSnap}, &status); err != nil { return fmt.Errorf("publish userspace snapshot: %w", err) diff --git a/pkg/dataplane/userspace/protocol.go b/pkg/dataplane/userspace/protocol.go index b95c17d72..7d9bb999a 100644 --- a/pkg/dataplane/userspace/protocol.go +++ b/pkg/dataplane/userspace/protocol.go @@ -425,28 +425,29 @@ type UserspaceCapabilities struct { } type ProcessStatus struct { - PID int `json:"pid"` - StartedAt time.Time `json:"started_at"` - ControlSocket string `json:"control_socket"` - StateFile string `json:"state_file"` - Workers int `json:"workers"` - RingEntries int `json:"ring_entries"` - HelperMode string `json:"helper_mode"` - IOUringPlanned bool `json:"io_uring_planned"` - IOUringActive bool `json:"io_uring_active,omitempty"` - IOUringMode string `json:"io_uring_mode,omitempty"` - IOUringLastError string `json:"io_uring_last_error,omitempty"` - Enabled bool `json:"enabled"` - ForwardingArmed bool `json:"forwarding_armed,omitempty"` - Capabilities UserspaceCapabilities `json:"capabilities"` - LastSnapshotGeneration uint64 `json:"last_snapshot_generation"` - LastFIBGeneration uint32 `json:"last_fib_generation,omitempty"` - LastSnapshotAt time.Time `json:"last_snapshot_at,omitempty"` - InterfaceAddresses int `json:"interface_addresses,omitempty"` - NeighborEntries int `json:"neighbor_entries,omitempty"` - NeighborGeneration uint64 `json:"neighbor_generation,omitempty"` - RouteEntries int `json:"route_entries,omitempty"` - WorkerHeartbeats []time.Time `json:"worker_heartbeats,omitempty"` + PID int `json:"pid"` + ConfigSnapshotProtocolVersion int `json:"config_snapshot_protocol_version,omitempty"` + StartedAt time.Time `json:"started_at"` + ControlSocket string `json:"control_socket"` + StateFile string `json:"state_file"` + Workers int `json:"workers"` + RingEntries int `json:"ring_entries"` + HelperMode string `json:"helper_mode"` + IOUringPlanned bool `json:"io_uring_planned"` + IOUringActive bool `json:"io_uring_active,omitempty"` + IOUringMode string `json:"io_uring_mode,omitempty"` + IOUringLastError string `json:"io_uring_last_error,omitempty"` + Enabled bool `json:"enabled"` + ForwardingArmed bool `json:"forwarding_armed,omitempty"` + Capabilities UserspaceCapabilities `json:"capabilities"` + LastSnapshotGeneration uint64 `json:"last_snapshot_generation"` + LastFIBGeneration uint32 `json:"last_fib_generation,omitempty"` + LastSnapshotAt time.Time `json:"last_snapshot_at,omitempty"` + InterfaceAddresses int `json:"interface_addresses,omitempty"` + NeighborEntries int `json:"neighbor_entries,omitempty"` + NeighborGeneration uint64 `json:"neighbor_generation,omitempty"` + RouteEntries int `json:"route_entries,omitempty"` + WorkerHeartbeats []time.Time `json:"worker_heartbeats,omitempty"` // #869: per-worker busy/idle runtime telemetry. WorkerRuntime []WorkerRuntimeStatus `json:"worker_runtime,omitempty"` HAGroups []HAGroupStatus `json:"ha_groups,omitempty"` diff --git a/pkg/scheduler/README.md b/pkg/scheduler/README.md index e91f62847..8429fde36 100644 --- a/pkg/scheduler/README.md +++ b/pkg/scheduler/README.md @@ -12,6 +12,9 @@ during specific windows. - `New(schedulers map[string]*config.SchedulerConfig, updateFn func(map[string]bool)) *Scheduler` — `scheduler.go`. The `updateFn` callback fires only on state change, not every tick. +- `NewPrimed(..., now)` — constructor for daemon apply paths that need the + initial active-state map without firing the callback while an external + apply semaphore is already held. - `Run(ctx context.Context)` — `scheduler.go`. - `IsActive(name string) bool` — `scheduler.go`. - `ActiveState() map[string]bool` — `scheduler.go`. Snapshot of every @@ -33,6 +36,10 @@ during specific windows. precision through this package. - `updateFn` receives the **full** active-state map, not just the changed entries. Callers compute their own diff if they care. +- Daemon callers must publish scheduler changes while holding the daemon + apply semaphore. Runtime scheduler callbacks take that semaphore before + touching dataplane state so commits and time-window flips cannot publish + hybrid policy snapshots. - The scheduler uses wall-clock time only in the control plane to evaluate Junos time windows. Packet workers must consume published active/inactive booleans from the userspace snapshot and must not evaluate scheduler time in diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index d81b60793..c0eb7929e 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -18,17 +18,31 @@ type Scheduler struct { updateFn func(activeState map[string]bool) } -// New creates a Scheduler with the given scheduler configs and update callback. -// updateFn is called whenever any scheduler's active state changes, receiving -// the current active state of all schedulers. -func New(schedulers map[string]*config.SchedulerConfig, updateFn func(activeState map[string]bool)) *Scheduler { +// NewPrimed creates a Scheduler, evaluates the initial active-state map, and +// returns that map without firing updateFn from inside the constructor. Daemon +// apply paths use this when they already hold their own serialization lock and +// must publish the initial state as part of the same apply transaction. +func NewPrimed(schedulers map[string]*config.SchedulerConfig, updateFn func(activeState map[string]bool), now time.Time) (*Scheduler, map[string]bool) { s := &Scheduler{ schedulers: schedulers, active: make(map[string]bool), updateFn: updateFn, } - // Compute initial state. - s.evaluate(time.Now()) + s.evaluate(now, false) + return s, s.ActiveState() +} + +// New creates a Scheduler with the given scheduler configs and update callback. +// updateFn is called whenever any scheduler's active state changes, receiving +// the current active state of all schedulers. +func New(schedulers map[string]*config.SchedulerConfig, updateFn func(activeState map[string]bool)) *Scheduler { + s, _ := NewPrimed(schedulers, updateFn, time.Now()) + // Preserve the historical constructor contract: New notifies on initial + // state. NewPrimed is the no-notify variant for callers that publish the + // initial state under an external lock. + if len(s.active) > 0 { + s.notifyActiveState() + } return s } @@ -45,7 +59,7 @@ func (s *Scheduler) Run(ctx context.Context) { slog.Info("scheduler: stopping evaluation loop") return case t := <-ticker.C: - s.evaluate(t) + s.evaluate(t, true) } } } @@ -73,14 +87,13 @@ func (s *Scheduler) Update(schedulers map[string]*config.SchedulerConfig) { s.mu.Lock() s.schedulers = schedulers s.mu.Unlock() - s.evaluate(time.Now()) + s.evaluate(time.Now(), true) } // evaluate checks each scheduler against the current time and fires the // callback if any state changed. -func (s *Scheduler) evaluate(now time.Time) { +func (s *Scheduler) evaluate(now time.Time, notify bool) { s.mu.Lock() - defer s.mu.Unlock() changed := false newActive := make(map[string]bool, len(s.schedulers)) @@ -104,14 +117,34 @@ func (s *Scheduler) evaluate(now time.Time) { s.active = newActive - if changed && s.updateFn != nil { - // Pass a copy so the callback cannot mutate internal state. - cp := make(map[string]bool, len(newActive)) - for k, v := range newActive { - cp[k] = v - } - s.updateFn(cp) + if !changed || !notify || s.updateFn == nil { + s.mu.Unlock() + return + } + cp := copyActiveState(newActive) + updateFn := s.updateFn + s.mu.Unlock() + updateFn(cp) +} + +func (s *Scheduler) notifyActiveState() { + s.mu.RLock() + if s.updateFn == nil { + s.mu.RUnlock() + return + } + cp := copyActiveState(s.active) + updateFn := s.updateFn + s.mu.RUnlock() + updateFn(cp) +} + +func copyActiveState(in map[string]bool) map[string]bool { + out := make(map[string]bool, len(in)) + for k, v := range in { + out[k] = v } + return out } // isWithinWindow determines whether now falls within the time window defined diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index f381c19c3..dc73f504c 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -178,6 +178,27 @@ func TestScheduler_InitialState(t *testing.T) { } } +func TestScheduler_NewPrimedDoesNotNotifyInitialState(t *testing.T) { + var called bool + schedCfg := map[string]*config.SchedulerConfig{ + "always-on": {Name: "always-on"}, + } + + s, state := NewPrimed(schedCfg, func(activeState map[string]bool) { + called = true + }, time.Date(2026, 2, 12, 14, 30, 0, 0, time.UTC)) + + if called { + t.Fatal("NewPrimed fired callback during constructor") + } + if !state["always-on"] { + t.Fatal("initial state missing always-on=true") + } + if !s.IsActive("always-on") { + t.Fatal("IsActive should return true for always-on") + } +} + func TestScheduler_ActiveState(t *testing.T) { schedCfg := map[string]*config.SchedulerConfig{ "always-on": {Name: "always-on"}, diff --git a/userspace-dp/src/protocol.rs b/userspace-dp/src/protocol.rs index 959eb1b04..a0bfb0642 100644 --- a/userspace-dp/src/protocol.rs +++ b/userspace-dp/src/protocol.rs @@ -706,6 +706,8 @@ pub(crate) struct ControlRequest { #[derive(Clone, Debug, Serialize, Deserialize, Default)] pub(crate) struct ProcessStatus { pub pid: i32, + #[serde(rename = "config_snapshot_protocol_version", default)] + pub config_snapshot_protocol_version: i32, #[serde(rename = "started_at")] pub started_at: DateTime, #[serde(rename = "control_socket")] diff --git a/userspace-dp/src/server/README.md b/userspace-dp/src/server/README.md index 32d7a3286..595c2c9b4 100644 --- a/userspace-dp/src/server/README.md +++ b/userspace-dp/src/server/README.md @@ -31,6 +31,9 @@ updating the other breaks the helper. The helper accepts only the current snapshot protocol version; this prevents a new daemon from publishing fields such as policy-scheduler inactive bits to a helper that would silently ignore them. +The helper also reports `config_snapshot_protocol_version` in status so a new +daemon can fail closed before sending scheduled-policy snapshots to an older +helper binary that predates the gate. ## Reconciliation diff --git a/userspace-dp/src/server/lifecycle.rs b/userspace-dp/src/server/lifecycle.rs index 3ce721721..acf7c19cd 100644 --- a/userspace-dp/src/server/lifecycle.rs +++ b/userspace-dp/src/server/lifecycle.rs @@ -73,6 +73,7 @@ pub(crate) fn run() -> Result<(), String> { let state = Arc::new(Mutex::new(ServerState { status: ProcessStatus { pid: std::process::id() as i32, + config_snapshot_protocol_version: CONFIG_SNAPSHOT_PROTOCOL_VERSION, started_at: Utc::now(), control_socket: args.control_socket.clone(), state_file: args.state_file.clone(), From bd591775f3e2620600a80d5846641cddf1239787 Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sat, 16 May 2026 21:31:13 -0700 Subject: [PATCH 04/20] userspace: fail closed scheduled policy drift --- .../plan-1378-policy-schedulers.md | 12 +++-- pkg/dataplane/userspace/manager.go | 26 ++++++++++ pkg/dataplane/userspace/manager_test.go | 46 +++++++++++------- pkg/dataplane/userspace/process.go | 1 + pkg/scheduler/README.md | 5 ++ pkg/scheduler/scheduler.go | 48 +++++++++++++++++-- pkg/scheduler/scheduler_test.go | 29 +++++++++++ userspace-dp/src/server/README.md | 6 ++- 8 files changed, 148 insertions(+), 25 deletions(-) diff --git a/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md b/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md index 366478024..ea8a8eac0 100644 --- a/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md +++ b/docs/pr/1373-retire-ebpf-dataplane/plan-1378-policy-schedulers.md @@ -42,6 +42,9 @@ sessions. Scheduler granularity is 60 seconds. The wall clock is used only by the Go control-plane scheduler to decide the next active-state map; workers receive booleans in the snapshot and never evaluate wall-clock time in the packet path. +The scheduler compares wall elapsed time with Go's monotonic elapsed time at +each evaluation. Backward wall-clock steps or drift beyond tolerance fail +closed for that evaluation by publishing all scheduler bits inactive. Tests and docs must use deterministic scheduler inputs or windows that span multiple evaluator ticks; the earlier 30-second integration target is invalid. @@ -57,7 +60,9 @@ existing eBPF behavior that can default missing scheduler state to active. Rust control server rejects older/unknown snapshot versions instead of silently ignoring scheduling fields, and status exposes the helper's supported snapshot protocol so new Go refuses to publish scheduled-policy snapshots to - an old helper before the fail-open path can occur. + an old helper before the fail-open path can occur. The refusal actively + disarms helper forwarding with `set_forwarding_state armed=false`; recording + a compile error while leaving the old helper armed is not fail-closed. - Hit counters are keyed by stable rule identity outside rebuilt rule structs so counters survive scheduler snapshot rebuilds. - Do not copy the existing eBPF indexing bug in @@ -79,8 +84,9 @@ existing eBPF behavior that can default missing scheduler state to active. - Scheduler atomicity: first-match policy ordering requires affected inactive bits to publish as one coherent snapshot. Per-rule toggles can expose an impossible mixed policy state. -- Clock drift: scheduler state is daemon-clock derived. HA peers must recompute - after failover rather than trusting stale peer-local state. +- Clock drift: scheduler state is daemon-clock derived. The scheduler must + fail closed on wall-clock discontinuity, and HA peers must recompute after + failover rather than trusting stale peer-local state. - Counter continuity: stable rule identity is mandatory because inactive flips and snapshot rebuilds must not reset operator-visible hit counters. - Missing scheduler references: fail-open behavior admits traffic outside the diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index af45f4d50..9a17db43d 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -368,6 +368,7 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) return result, err } if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { + m.disarmPolicySchedulerProtocolFailureLocked(err) return result, err } if m.deferWorkers { @@ -445,6 +446,7 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ return } if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { + m.disarmPolicySchedulerProtocolFailureLocked(err) slog.Warn("userspace: refusing policy scheduler publish to incompatible helper", "err", err) return } @@ -551,6 +553,30 @@ func (m *Manager) ensurePolicySchedulerProtocolLocked(cfg *config.Config) error ) } +func (m *Manager) disarmPolicySchedulerProtocolFailureLocked(protocolErr error) { + if m.proc == nil || m.proc.Process == nil { + return + } + req := ControlRequest{ + Type: "set_forwarding_state", + Forwarding: &ForwardingControlRequest{ + Armed: false, + }, + } + var status ProcessStatus + if err := m.requestLocked(req, &status); err != nil { + slog.Warn("userspace: failed to disarm incompatible helper after policy scheduler protocol error", + "protocol_err", protocolErr, "err", err) + return + } + if err := m.applyHelperStatusLocked(&status); err != nil { + slog.Warn("userspace: failed to sync helper status after policy scheduler fail-closed disarm", + "protocol_err", protocolErr, "err", err) + return + } + slog.Warn("userspace: disarmed helper after policy scheduler protocol error", "err", protocolErr) +} + // bpfKtimeNs returns the current CLOCK_BOOTTIME in nanoseconds, matching // the clock used by BPF's bpf_ktime_get_ns() for session Created timestamps. func (m *Manager) bpfKtimeNs() uint64 { diff --git a/pkg/dataplane/userspace/manager_test.go b/pkg/dataplane/userspace/manager_test.go index a006053fe..163dc6a6f 100644 --- a/pkg/dataplane/userspace/manager_test.go +++ b/pkg/dataplane/userspace/manager_test.go @@ -2764,22 +2764,30 @@ func TestUpdatePolicyScheduleStateRefusesOldHelperForScheduledPolicies(t *testin reqCh := make(chan ControlRequest, 2) done := make(chan struct{}, 1) go func() { - conn, err := ln.Accept() - if err != nil { - return - } - defer conn.Close() - var req ControlRequest - if err := json.NewDecoder(conn).Decode(&req); err != nil { - return + for i := 0; i < 2; i++ { + conn, err := ln.Accept() + if err != nil { + return + } + var req ControlRequest + if err := json.NewDecoder(conn).Decode(&req); err != nil { + conn.Close() + return + } + reqCh <- req + status := &ProcessStatus{ + PID: 1234, + ForwardingArmed: true, + } + if req.Type == "set_forwarding_state" { + status.ForwardingArmed = false + } + _ = json.NewEncoder(conn).Encode(ControlResponse{ + OK: true, + Status: status, + }) + conn.Close() } - reqCh <- req - _ = json.NewEncoder(conn).Encode(ControlResponse{ - OK: true, - Status: &ProcessStatus{ - PID: 1234, - }, - }) done <- struct{}{} }() @@ -2816,8 +2824,14 @@ func TestUpdatePolicyScheduleStateRefusesOldHelperForScheduledPolicies(t *testin } select { case req := <-reqCh: - t.Fatalf("unexpected apply request to old helper: %+v", req) + if req.Type != "set_forwarding_state" || req.Forwarding == nil || req.Forwarding.Armed { + t.Fatalf("second request = %+v, want fail-closed set_forwarding_state armed=false", req) + } default: + t.Fatal("expected fail-closed set_forwarding_state request") + } + if m.lastStatus.ForwardingArmed { + t.Fatal("helper status should be disarmed after protocol mismatch") } if m.lastSnapshot == nil || len(m.lastSnapshot.Policies) != 1 || !m.lastSnapshot.Policies[0].Inactive { t.Fatalf("manager should keep local inactive state even when publish is refused: %+v", m.lastSnapshot) diff --git a/pkg/dataplane/userspace/process.go b/pkg/dataplane/userspace/process.go index a4162c248..8e762068c 100644 --- a/pkg/dataplane/userspace/process.go +++ b/pkg/dataplane/userspace/process.go @@ -330,6 +330,7 @@ func (m *Manager) syncSnapshotLocked() error { publishSnap := *m.lastSnapshot publishSnap.Neighbors = filterPublishableNeighbors(m.lastSnapshot.Neighbors) if err := m.ensurePolicySchedulerProtocolLocked(publishSnap.Config); err != nil { + m.disarmPolicySchedulerProtocolFailureLocked(err) return err } var status ProcessStatus diff --git a/pkg/scheduler/README.md b/pkg/scheduler/README.md index 8429fde36..6e7d26a71 100644 --- a/pkg/scheduler/README.md +++ b/pkg/scheduler/README.md @@ -44,3 +44,8 @@ during specific windows. Junos time windows. Packet workers must consume published active/inactive booleans from the userspace snapshot and must not evaluate scheduler time in the hot path. +- Wall-clock discontinuities are fail-closed. Each evaluation compares + wall elapsed time with Go's monotonic elapsed time from the previous + evaluation; backward wall steps or drift beyond the tolerance publish + all schedulers inactive for that evaluation instead of extending an + allow window with a stale wall-clock assumption. diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index c0eb7929e..251384358 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -12,12 +12,16 @@ import ( // Scheduler periodically evaluates time windows for named schedulers // and notifies a callback when any scheduler's active state changes. type Scheduler struct { - mu sync.RWMutex - schedulers map[string]*config.SchedulerConfig - active map[string]bool - updateFn func(activeState map[string]bool) + mu sync.RWMutex + schedulers map[string]*config.SchedulerConfig + active map[string]bool + updateFn func(activeState map[string]bool) + lastEval time.Time + lastWallUnixNano int64 } +const wallClockDriftTolerance = 5 * time.Second + // NewPrimed creates a Scheduler, evaluates the initial active-state map, and // returns that map without firing updateFn from inside the constructor. Daemon // apply paths use this when they already hold their own serialization lock and @@ -97,9 +101,13 @@ func (s *Scheduler) evaluate(now time.Time, notify bool) { changed := false newActive := make(map[string]bool, len(s.schedulers)) + wallClockUnsafe := s.wallClockUnsafeLocked(now) for name, sched := range s.schedulers { - cur := isWithinWindow(now, sched) + cur := false + if !wallClockUnsafe { + cur = isWithinWindow(now, sched) + } newActive[name] = cur if prev, ok := s.active[name]; !ok || prev != cur { slog.Info("scheduler: state changed", "name", name, "active", cur) @@ -116,6 +124,8 @@ func (s *Scheduler) evaluate(now time.Time, notify bool) { } s.active = newActive + s.lastEval = now + s.lastWallUnixNano = now.UnixNano() if !changed || !notify || s.updateFn == nil { s.mu.Unlock() @@ -127,6 +137,34 @@ func (s *Scheduler) evaluate(now time.Time, notify bool) { updateFn(cp) } +func (s *Scheduler) wallClockUnsafeLocked(now time.Time) bool { + if s.lastEval.IsZero() { + return false + } + wallElapsed := time.Duration(now.UnixNano() - s.lastWallUnixNano) + if wallElapsed < 0 { + slog.Warn("scheduler: wall clock moved backward, failing closed until next evaluation", + "previous", s.lastEval, "current", now) + return true + } + monoElapsed := now.Sub(s.lastEval) + if monoElapsed < 0 { + slog.Warn("scheduler: monotonic clock moved backward, failing closed until next evaluation", + "previous", s.lastEval, "current", now) + return true + } + delta := wallElapsed - monoElapsed + if delta < 0 { + delta = -delta + } + if delta > wallClockDriftTolerance { + slog.Warn("scheduler: wall clock drift exceeded tolerance, failing closed until next evaluation", + "wall_elapsed", wallElapsed, "monotonic_elapsed", monoElapsed, "tolerance", wallClockDriftTolerance) + return true + } + return false +} + func (s *Scheduler) notifyActiveState() { s.mu.RLock() if s.updateFn == nil { diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index dc73f504c..8c8fd825b 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -199,6 +199,35 @@ func TestScheduler_NewPrimedDoesNotNotifyInitialState(t *testing.T) { } } +func TestScheduler_WallClockBackwardStepFailsClosed(t *testing.T) { + var lastState map[string]bool + schedCfg := map[string]*config.SchedulerConfig{ + "business-hours": { + Name: "business-hours", + StartTime: "08:00:00", + StopTime: "17:00:00", + }, + } + now := time.Date(2026, 2, 12, 12, 0, 0, 0, time.UTC) + s, state := NewPrimed(schedCfg, func(activeState map[string]bool) { + lastState = activeState + }, now) + if !state["business-hours"] { + t.Fatal("initial state should be active") + } + + s.evaluate(now.Add(-1*time.Hour), true) + if lastState == nil { + t.Fatal("expected callback after fail-closed state change") + } + if lastState["business-hours"] { + t.Fatalf("wall-clock backward step should fail closed, got state %+v", lastState) + } + if s.IsActive("business-hours") { + t.Fatal("scheduler should remain inactive after backward wall-clock step") + } +} + func TestScheduler_ActiveState(t *testing.T) { schedCfg := map[string]*config.SchedulerConfig{ "always-on": {Name: "always-on"}, diff --git a/userspace-dp/src/server/README.md b/userspace-dp/src/server/README.md index 595c2c9b4..fc89025eb 100644 --- a/userspace-dp/src/server/README.md +++ b/userspace-dp/src/server/README.md @@ -33,7 +33,11 @@ new daemon from publishing fields such as policy-scheduler inactive bits to a helper that would silently ignore them. The helper also reports `config_snapshot_protocol_version` in status so a new daemon can fail closed before sending scheduled-policy snapshots to an older -helper binary that predates the gate. +helper binary that predates the gate. If the daemon detects an incompatible +helper while scheduled policies are configured, it sends +`set_forwarding_state armed=false` before returning the compile/publish error; +the old helper must not keep forwarding a stale snapshot that ignores scheduler +inactive bits. ## Reconciliation From 94aa3f255145ef4473e8a1c775c6a09e1e7be790 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 04:44:21 +0000 Subject: [PATCH 05/20] fix: keep scheduler fail-closed and surface protocol mismatch apply errors Agent-Logs-Url: https://github.com/psaab/xpf/sessions/3a39f0a4-bc8e-440c-8652-dcf9a84b021a Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- pkg/daemon/daemon_apply.go | 25 +++++++++++++++++------ pkg/dataplane/userspace/manager.go | 27 +++++++++++++++---------- pkg/dataplane/userspace/process.go | 4 +++- pkg/scheduler/scheduler.go | 6 ++++-- pkg/scheduler/scheduler_test.go | 32 ++++++++++++++++++++++++++++++ 5 files changed, 74 insertions(+), 20 deletions(-) diff --git a/pkg/daemon/daemon_apply.go b/pkg/daemon/daemon_apply.go index c2311b20e..7af7133d2 100644 --- a/pkg/daemon/daemon_apply.go +++ b/pkg/daemon/daemon_apply.go @@ -4,6 +4,7 @@ package daemon import ( "bytes" "context" + "errors" "fmt" "log/slog" "net" @@ -67,7 +68,9 @@ func (d *Daemon) bootstrapFromFile() error { func (d *Daemon) applyConfig(cfg *config.Config) { _ = d.applySem.Acquire(context.Background(), 1) defer d.applySem.Release(1) - d.applyConfigLocked(cfg) + if err := d.applyConfigLocked(cfg); err != nil { + slog.Warn("apply config failed", "err", err) + } } // commitAndApply atomically promotes the candidate config and @@ -98,7 +101,9 @@ func (d *Daemon) commitAndApply(ctx context.Context, comment string, syncPeer bo if err != nil { return nil, err } - d.applyConfigLocked(compiled) + if err := d.applyConfigLocked(compiled); err != nil { + return nil, err + } if syncPeer { d.syncConfigToPeer() } @@ -121,7 +126,9 @@ func (d *Daemon) syncAndApply(ctx context.Context, configText string, chassisPre return nil, err } if compiled != nil { - d.applyConfigLocked(compiled) + if err := d.applyConfigLocked(compiled); err != nil { + return nil, err + } } return compiled, nil } @@ -138,7 +145,9 @@ func (d *Daemon) commitConfirmedAndApply(ctx context.Context, minutes int, syncP if err != nil { return nil, err } - d.applyConfigLocked(compiled) + if err := d.applyConfigLocked(compiled); err != nil { + return nil, err + } if syncPeer { d.syncConfigToPeer() } @@ -147,10 +156,10 @@ func (d *Daemon) commitConfirmedAndApply(ctx context.Context, minutes int, syncP // applyConfigLocked runs the actual reconcile pipeline. MUST be // called with d.applySem held. -func (d *Daemon) applyConfigLocked(cfg *config.Config) { +func (d *Daemon) applyConfigLocked(cfg *config.Config) error { if d.applyBodyForTest != nil { d.applyBodyForTest(cfg) - return + return nil } // Reset VIP warning suppression so new config gets fresh warnings. d.vipWarnedIfaces = nil @@ -429,6 +438,9 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) { var err error if compileResult, err = d.dp.Compile(cfg); err != nil { d.recordCompileFailure(err) + if errors.Is(err, dpuserspace.ErrPolicySchedulerProtocolIncompatible) { + return err + } } else { d.recordCompileSuccess() } @@ -1020,4 +1032,5 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) { d.applyStep0Tunables(userspaceDP, claimHostTunables, governor, netdevBudget, coalesceExplicit, coalesceEnable, coalesceRX, coalesceTX, rssAllowed) } + return nil } diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index 9a17db43d..3b81b9aac 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -26,6 +26,8 @@ import ( var _ dataplane.DataPlane = (*Manager)(nil) +var ErrPolicySchedulerProtocolIncompatible = errors.New("userspace policy scheduler snapshot protocol incompatible") + // DataplaneMode describes which packet-processing pipeline is active. type DataplaneMode int @@ -368,7 +370,9 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) return result, err } if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { - m.disarmPolicySchedulerProtocolFailureLocked(err) + if disarmErr := m.disarmPolicySchedulerProtocolFailureLocked(err); disarmErr != nil { + return result, errors.Join(err, disarmErr) + } return result, err } if m.deferWorkers { @@ -446,7 +450,10 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ return } if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { - m.disarmPolicySchedulerProtocolFailureLocked(err) + if disarmErr := m.disarmPolicySchedulerProtocolFailureLocked(err); disarmErr != nil { + slog.Warn("userspace: failed to disarm helper after refusing policy scheduler publish", + "protocol_err", err, "err", disarmErr) + } slog.Warn("userspace: refusing policy scheduler publish to incompatible helper", "err", err) return } @@ -547,15 +554,16 @@ func (m *Manager) ensurePolicySchedulerProtocolLocked(cfg *config.Config) error } } return fmt.Errorf( - "userspace helper config snapshot protocol version %d does not support policy scheduler snapshots (want >= %d)", + "%w: userspace helper config snapshot protocol version %d does not support policy scheduler snapshots (want >= %d)", + ErrPolicySchedulerProtocolIncompatible, m.lastStatus.ConfigSnapshotProtocolVersion, ProtocolVersion, ) } -func (m *Manager) disarmPolicySchedulerProtocolFailureLocked(protocolErr error) { +func (m *Manager) disarmPolicySchedulerProtocolFailureLocked(protocolErr error) error { if m.proc == nil || m.proc.Process == nil { - return + return nil } req := ControlRequest{ Type: "set_forwarding_state", @@ -565,16 +573,13 @@ func (m *Manager) disarmPolicySchedulerProtocolFailureLocked(protocolErr error) } var status ProcessStatus if err := m.requestLocked(req, &status); err != nil { - slog.Warn("userspace: failed to disarm incompatible helper after policy scheduler protocol error", - "protocol_err", protocolErr, "err", err) - return + return fmt.Errorf("userspace: disarm helper after policy scheduler protocol error: %w", err) } if err := m.applyHelperStatusLocked(&status); err != nil { - slog.Warn("userspace: failed to sync helper status after policy scheduler fail-closed disarm", - "protocol_err", protocolErr, "err", err) - return + return fmt.Errorf("userspace: sync helper status after policy scheduler fail-closed disarm: %w", err) } slog.Warn("userspace: disarmed helper after policy scheduler protocol error", "err", protocolErr) + return nil } // bpfKtimeNs returns the current CLOCK_BOOTTIME in nanoseconds, matching diff --git a/pkg/dataplane/userspace/process.go b/pkg/dataplane/userspace/process.go index 8e762068c..816ac47ba 100644 --- a/pkg/dataplane/userspace/process.go +++ b/pkg/dataplane/userspace/process.go @@ -330,7 +330,9 @@ func (m *Manager) syncSnapshotLocked() error { publishSnap := *m.lastSnapshot publishSnap.Neighbors = filterPublishableNeighbors(m.lastSnapshot.Neighbors) if err := m.ensurePolicySchedulerProtocolLocked(publishSnap.Config); err != nil { - m.disarmPolicySchedulerProtocolFailureLocked(err) + if disarmErr := m.disarmPolicySchedulerProtocolFailureLocked(err); disarmErr != nil { + return errors.Join(err, disarmErr) + } return err } var status ProcessStatus diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 251384358..e446b65bb 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -124,8 +124,10 @@ func (s *Scheduler) evaluate(now time.Time, notify bool) { } s.active = newActive - s.lastEval = now - s.lastWallUnixNano = now.UnixNano() + if !wallClockUnsafe { + s.lastEval = now + s.lastWallUnixNano = now.UnixNano() + } if !changed || !notify || s.updateFn == nil { s.mu.Unlock() diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index 8c8fd825b..77a2f9b00 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -228,6 +228,38 @@ func TestScheduler_WallClockBackwardStepFailsClosed(t *testing.T) { } } +func TestScheduler_WallClockBackwardStepStaysFailClosedUntilClockRecovers(t *testing.T) { + var lastState map[string]bool + schedCfg := map[string]*config.SchedulerConfig{ + "business-hours": { + Name: "business-hours", + StartTime: "08:00:00", + StopTime: "17:00:00", + }, + } + now := time.Date(2026, 2, 12, 12, 0, 0, 0, time.UTC) + s, state := NewPrimed(schedCfg, func(activeState map[string]bool) { + lastState = activeState + }, now) + if !state["business-hours"] { + t.Fatal("initial state should be active") + } + + s.evaluate(now.Add(-1*time.Hour), true) + if lastState == nil || lastState["business-hours"] { + t.Fatalf("first backward-step evaluation should fail closed, got state %+v", lastState) + } + lastState = nil + + s.evaluate(now.Add(-59*time.Minute), true) + if lastState != nil { + t.Fatalf("second rollback evaluation should not notify without state change, got %+v", lastState) + } + if s.IsActive("business-hours") { + t.Fatal("scheduler should stay inactive while wall-clock remains rolled back") + } +} + func TestScheduler_ActiveState(t *testing.T) { schedCfg := map[string]*config.SchedulerConfig{ "always-on": {Name: "always-on"}, From bdd81d8f4f67c003a8d08eb5ec95821220fe9fe1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 04:46:26 +0000 Subject: [PATCH 06/20] chore: tighten protocol error text and test clarity Agent-Logs-Url: https://github.com/psaab/xpf/sessions/3a39f0a4-bc8e-440c-8652-dcf9a84b021a Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- go.mod | 2 +- pkg/dataplane/userspace/manager.go | 2 +- pkg/scheduler/scheduler_test.go | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index e3175fa5f..9af98c17b 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/insomniacslk/dhcp v0.0.0-20251020182700-175e84fbb167 github.com/mdlayher/ndp v1.1.0 github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/vishvananda/netlink v1.3.1 golang.org/x/net v0.47.0 golang.org/x/sync v0.18.0 @@ -27,7 +28,6 @@ require ( github.com/mdlayher/socket v0.5.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pierrec/lz4/v4 v4.1.14 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index 3b81b9aac..e1a822c4e 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -554,7 +554,7 @@ func (m *Manager) ensurePolicySchedulerProtocolLocked(cfg *config.Config) error } } return fmt.Errorf( - "%w: userspace helper config snapshot protocol version %d does not support policy scheduler snapshots (want >= %d)", + "%w: helper config snapshot protocol version %d < required %d for policy scheduler snapshots", ErrPolicySchedulerProtocolIncompatible, m.lastStatus.ConfigSnapshotProtocolVersion, ProtocolVersion, diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index 77a2f9b00..e781bd461 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -251,6 +251,7 @@ func TestScheduler_WallClockBackwardStepStaysFailClosedUntilClockRecovers(t *tes } lastState = nil + // Keep wall time behind the original baseline so continuity remains unsafe. s.evaluate(now.Add(-59*time.Minute), true) if lastState != nil { t.Fatalf("second rollback evaluation should not notify without state change, got %+v", lastState) From 0135c3ad2798356691c5832d6695b87abc9a802c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 04:47:01 +0000 Subject: [PATCH 07/20] chore: restore go.mod classification drift Agent-Logs-Url: https://github.com/psaab/xpf/sessions/3a39f0a4-bc8e-440c-8652-dcf9a84b021a Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 9af98c17b..e3175fa5f 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,6 @@ require ( github.com/insomniacslk/dhcp v0.0.0-20251020182700-175e84fbb167 github.com/mdlayher/ndp v1.1.0 github.com/prometheus/client_golang v1.23.2 - github.com/prometheus/client_model v0.6.2 github.com/vishvananda/netlink v1.3.1 golang.org/x/net v0.47.0 golang.org/x/sync v0.18.0 @@ -28,6 +27,7 @@ require ( github.com/mdlayher/socket v0.5.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pierrec/lz4/v4 v4.1.14 // indirect + github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect From 5fe64290bfdf2cc3de697ebf044533c427c4a1c2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 04:48:38 +0000 Subject: [PATCH 08/20] chore: finalize round-4 blocker validation Agent-Logs-Url: https://github.com/psaab/xpf/sessions/3a39f0a4-bc8e-440c-8652-dcf9a84b021a Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index e3175fa5f..9af98c17b 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/insomniacslk/dhcp v0.0.0-20251020182700-175e84fbb167 github.com/mdlayher/ndp v1.1.0 github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/vishvananda/netlink v1.3.1 golang.org/x/net v0.47.0 golang.org/x/sync v0.18.0 @@ -27,7 +28,6 @@ require ( github.com/mdlayher/socket v0.5.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pierrec/lz4/v4 v4.1.14 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect From f4c38b054e9e0ce92717d1aebd753dfd5e64f98b Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sat, 16 May 2026 21:59:15 -0700 Subject: [PATCH 09/20] userspace: avoid caching rejected scheduler snapshots --- pkg/dataplane/userspace/manager.go | 12 ++++++++++-- pkg/dataplane/userspace/manager_test.go | 4 ++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index e1a822c4e..23d0179bd 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -326,13 +326,19 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) pendingXSKStartup = false samePlanRefresh = false } - m.lastSnapshot = snap // #1197 v4 (Codex code-review v3 #1+#2): rebuild listener // caches ONLY after a successful apply_snapshot. Doing it // here (before publish) leaves the listener thinking // userspace-dp has entries it doesn't if apply_snapshot fails. // Moved to the post-success path below (after line 343). if pendingXSKStartup { + if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { + if disarmErr := m.disarmPolicySchedulerProtocolFailureLocked(err); disarmErr != nil { + return result, errors.Join(err, disarmErr) + } + return result, err + } + m.lastSnapshot = snap if err := m.syncIngressIfaceMapLocked(snap); err != nil { return result, err } @@ -389,6 +395,7 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) if err := m.requestLocked(ControlRequest{Type: "apply_snapshot", Snapshot: &publishSnap}, &status); err != nil { return result, fmt.Errorf("publish userspace snapshot: %w", err) } + m.lastSnapshot = snap // #1197 v4: apply_snapshot succeeded — userspace-dp has the // new neighbors. NOW rebuild listener caches; before this // point the index would shadow events for entries the @@ -444,9 +451,9 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ next.GeneratedAt = time.Now().UTC() next.Config = cfg next.Policies = buildPolicySnapshotsWithSchedulerState(cfg, activeCopy) - m.lastSnapshot = &next if m.proc == nil || m.proc.Process == nil { + m.lastSnapshot = &next return } if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { @@ -464,6 +471,7 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ slog.Warn("userspace: failed to publish policy scheduler state", "err", err) return } + m.lastSnapshot = &next m.rebuildNeighborIndex() m.rebuildMonitoredIfindexes() m.publishedSnapshot = next.Generation diff --git a/pkg/dataplane/userspace/manager_test.go b/pkg/dataplane/userspace/manager_test.go index 163dc6a6f..7eeafc747 100644 --- a/pkg/dataplane/userspace/manager_test.go +++ b/pkg/dataplane/userspace/manager_test.go @@ -2833,8 +2833,8 @@ func TestUpdatePolicyScheduleStateRefusesOldHelperForScheduledPolicies(t *testin if m.lastStatus.ForwardingArmed { t.Fatal("helper status should be disarmed after protocol mismatch") } - if m.lastSnapshot == nil || len(m.lastSnapshot.Policies) != 1 || !m.lastSnapshot.Policies[0].Inactive { - t.Fatalf("manager should keep local inactive state even when publish is refused: %+v", m.lastSnapshot) + if m.lastSnapshot == nil || len(m.lastSnapshot.Policies) != 1 || m.lastSnapshot.Policies[0].Inactive { + t.Fatalf("manager should not cache rejected inactive state when publish is refused: %+v", m.lastSnapshot) } } From f0c503db469a25e25d17e4860555cd49a54ef612 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 05:13:19 +0000 Subject: [PATCH 10/20] fix: cache deferred userspace snapshot only after map sync succeeds Agent-Logs-Url: https://github.com/psaab/xpf/sessions/8be45280-5d1e-434e-b8d7-7ee35e5789f3 Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- _Log.md | 8 ++++++++ pkg/dataplane/userspace/manager.go | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/_Log.md b/_Log.md index 1980d3714..aa8d332fb 100644 --- a/_Log.md +++ b/_Log.md @@ -11,6 +11,14 @@ - **Action**: PR #1395 cleanup — reverted an unintended `go.mod` direct/indirect dependency classification change introduced by automated tooling so the round-4 fix stays scoped to three-color policer compiler logic/tests/docs. - **File(s)**: `go.mod`, `_Log.md` +- **Timestamp**: 2026-05-17T05:12:00Z + - **Action**: Round-5 follow-up fix — in userspace pending-XSK-startup compile path, defer `lastSnapshot` cache update until ingress/local/NAT map sync succeeds so sync failures cannot poison cached snapshot state with an unpublished generation. + - **File(s)**: `pkg/dataplane/userspace/manager.go`, `_Log.md` + +- **Timestamp**: 2026-05-17T04:48:51Z + - **Action**: Re-restored `go.mod` after a subsequent tooling pass reintroduced the same direct/indirect dependency classification flip. + - **File(s)**: `go.mod`, `_Log.md` + - **Timestamp**: 2026-05-17T05:03:00Z - **Action**: PR #1395 round-4 follow-up cleanup — moved three-color mode marker assignment outside repeated same-mode child loops to avoid redundant writes while preserving duplicate-sibling merge semantics. - **File(s)**: `pkg/config/compiler_firewall.go`, `_Log.md` diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index 23d0179bd..e6c6cea55 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -338,7 +338,6 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) } return result, err } - m.lastSnapshot = snap if err := m.syncIngressIfaceMapLocked(snap); err != nil { return result, err } @@ -348,6 +347,7 @@ func (m *Manager) Compile(cfg *config.Config) (*dataplane.CompileResult, error) if err := m.syncInterfaceNATAddressMapsLocked(snap); err != nil { return result, err } + m.lastSnapshot = snap m.cfg = ucfg slog.Info( "userspace: deferring snapshot publish during XSK startup", From 363f92a55ffe8233ba79726a4ef345dd1883dd7f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 05:15:15 +0000 Subject: [PATCH 11/20] chore: restore go.mod after unintended dependency flip Agent-Logs-Url: https://github.com/psaab/xpf/sessions/8be45280-5d1e-434e-b8d7-7ee35e5789f3 Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- _Log.md | 4 ++++ go.mod | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/_Log.md b/_Log.md index aa8d332fb..abc8acf97 100644 --- a/_Log.md +++ b/_Log.md @@ -15,6 +15,10 @@ - **Action**: Round-5 follow-up fix — in userspace pending-XSK-startup compile path, defer `lastSnapshot` cache update until ingress/local/NAT map sync succeeds so sync failures cannot poison cached snapshot state with an unpublished generation. - **File(s)**: `pkg/dataplane/userspace/manager.go`, `_Log.md` +- **Timestamp**: 2026-05-17T05:16:00Z + - **Action**: Restored `go.mod` after an unintended direct/indirect dependency classification flip introduced by an automation-only progress update. + - **File(s)**: `go.mod`, `_Log.md` + - **Timestamp**: 2026-05-17T04:48:51Z - **Action**: Re-restored `go.mod` after a subsequent tooling pass reintroduced the same direct/indirect dependency classification flip. - **File(s)**: `go.mod`, `_Log.md` diff --git a/go.mod b/go.mod index 9af98c17b..e3175fa5f 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,6 @@ require ( github.com/insomniacslk/dhcp v0.0.0-20251020182700-175e84fbb167 github.com/mdlayher/ndp v1.1.0 github.com/prometheus/client_golang v1.23.2 - github.com/prometheus/client_model v0.6.2 github.com/vishvananda/netlink v1.3.1 golang.org/x/net v0.47.0 golang.org/x/sync v0.18.0 @@ -28,6 +27,7 @@ require ( github.com/mdlayher/socket v0.5.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pierrec/lz4/v4 v4.1.14 // indirect + github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect From 32000b868e0d74ce9aa47e878894bd30694ee340 Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sat, 16 May 2026 22:34:47 -0700 Subject: [PATCH 12/20] userspace: harden policy scheduler fail-closed paths --- pkg/daemon/daemon_run.go | 1 + pkg/daemon/daemon_scheduler.go | 14 +++++++---- pkg/daemon/daemon_scheduler_test.go | 31 +++++++++++++++++++++++++ pkg/dataplane/userspace/manager_test.go | 11 +++++++++ pkg/dataplane/userspace/snapshot.go | 5 +++- pkg/scheduler/scheduler.go | 25 +++++++++++++++----- pkg/scheduler/scheduler_test.go | 21 +++++++++++++---- 7 files changed, 92 insertions(+), 16 deletions(-) create mode 100644 pkg/daemon/daemon_scheduler_test.go diff --git a/pkg/daemon/daemon_run.go b/pkg/daemon/daemon_run.go index 3d175db5b..0c92c29eb 100644 --- a/pkg/daemon/daemon_run.go +++ b/pkg/daemon/daemon_run.go @@ -72,6 +72,7 @@ func collectAppliedTunnels(cfg *config.Config) []*config.TunnelConfig { // Run starts the daemon and blocks until shutdown. func (d *Daemon) Run(ctx context.Context) error { d.daemonCtx = ctx + d.startPolicySchedulerLoopLocked() // Wrap the default slog handler to support system syslog forwarding. // Syslog clients are added later when config is applied. diff --git a/pkg/daemon/daemon_scheduler.go b/pkg/daemon/daemon_scheduler.go index b412ecf07..c53f9674f 100644 --- a/pkg/daemon/daemon_scheduler.go +++ b/pkg/daemon/daemon_scheduler.go @@ -32,13 +32,17 @@ func (d *Daemon) reconcilePolicySchedulerLocked(cfg *config.Config) map[string]b d.publishPolicyScheduleState(epoch, activeState) }, time.Now()) d.scheduler = sched + d.startPolicySchedulerLoopLocked() + return activeState +} - if d.daemonCtx != nil { - ctx, cancel := context.WithCancel(d.daemonCtx) - d.schedulerCancel = cancel - go sched.Run(ctx) +func (d *Daemon) startPolicySchedulerLoopLocked() { + if d.daemonCtx == nil || d.scheduler == nil || d.schedulerCancel != nil { + return } - return activeState + ctx, cancel := context.WithCancel(d.daemonCtx) + d.schedulerCancel = cancel + go d.scheduler.Run(ctx) } func (d *Daemon) publishPolicyScheduleState(epoch uint64, activeState map[string]bool) { diff --git a/pkg/daemon/daemon_scheduler_test.go b/pkg/daemon/daemon_scheduler_test.go new file mode 100644 index 000000000..d388ab8b3 --- /dev/null +++ b/pkg/daemon/daemon_scheduler_test.go @@ -0,0 +1,31 @@ +package daemon + +import ( + "context" + "testing" + "time" + + "github.com/psaab/xpf/pkg/config" + "github.com/psaab/xpf/pkg/scheduler" +) + +func TestStartPolicySchedulerLoopLockedWaitsForDaemonContext(t *testing.T) { + sched, _ := scheduler.NewPrimed(map[string]*config.SchedulerConfig{ + "always": {Name: "always"}, + }, func(map[string]bool) {}, time.Now()) + + d := &Daemon{scheduler: sched} + d.startPolicySchedulerLoopLocked() + if d.schedulerCancel != nil { + t.Fatal("scheduler loop started before daemon context was available") + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + d.daemonCtx = ctx + d.startPolicySchedulerLoopLocked() + if d.schedulerCancel == nil { + t.Fatal("scheduler loop did not start after daemon context became available") + } + d.schedulerCancel() +} diff --git a/pkg/dataplane/userspace/manager_test.go b/pkg/dataplane/userspace/manager_test.go index 7eeafc747..55a822a64 100644 --- a/pkg/dataplane/userspace/manager_test.go +++ b/pkg/dataplane/userspace/manager_test.go @@ -2614,6 +2614,16 @@ func TestBuildPolicySnapshotsRoundTripsSchedulerInactiveAndRuleID(t *testing.T) Action: config.PolicyDeny, }} + unseeded := buildPolicySnapshots(cfg) + if len(unseeded) != 2 { + t.Fatalf("len(unseeded) = %d, want 2", len(unseeded)) + } + for _, pol := range unseeded { + if !pol.Inactive { + t.Fatalf("policy %q inactive = false with nil scheduler state, want fail-closed true", pol.RuleID) + } + } + snap := buildPolicySnapshotsWithSchedulerState(cfg, map[string]bool{ "workhours": false, "always": true, @@ -2810,6 +2820,7 @@ func TestUpdatePolicyScheduleStateRefusesOldHelperForScheduledPolicies(t *testin m.cfg.ControlSocket = controlSock m.generation = 7 m.lastSnapshot = buildSnapshot(cfg, config.UserspaceConfig{ControlSocket: controlSock}, 7, 0) + m.lastSnapshot.Policies[0].Inactive = false m.UpdatePolicyScheduleState(cfg, map[string]bool{"workhours": false}) diff --git a/pkg/dataplane/userspace/snapshot.go b/pkg/dataplane/userspace/snapshot.go index 4b1be0611..84db97eb2 100644 --- a/pkg/dataplane/userspace/snapshot.go +++ b/pkg/dataplane/userspace/snapshot.go @@ -2056,9 +2056,12 @@ func stablePolicyRuleID(fromZone, toZone, ruleName string) string { } func policyRuleInactive(schedulerName string, activeState map[string]bool) bool { - if schedulerName == "" || activeState == nil { + if schedulerName == "" { return false } + if activeState == nil { + return true + } active, ok := activeState[schedulerName] return !ok || !active } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index e446b65bb..a19e5f8c4 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -18,9 +18,13 @@ type Scheduler struct { updateFn func(activeState map[string]bool) lastEval time.Time lastWallUnixNano int64 + unsafeUntil time.Time } -const wallClockDriftTolerance = 5 * time.Second +const ( + wallClockDriftTolerance = 5 * time.Second + wallClockRecoveryHold = 2 * time.Minute +) // NewPrimed creates a Scheduler, evaluates the initial active-state map, and // returns that map without firing updateFn from inside the constructor. Daemon @@ -101,7 +105,15 @@ func (s *Scheduler) evaluate(now time.Time, notify bool) { changed := false newActive := make(map[string]bool, len(s.schedulers)) - wallClockUnsafe := s.wallClockUnsafeLocked(now) + wallClockDiscontinuous := s.wallClockDiscontinuousLocked(now) + wallClockUnsafe := wallClockDiscontinuous + if !wallClockUnsafe && !s.unsafeUntil.IsZero() { + if now.Before(s.unsafeUntil) { + wallClockUnsafe = true + } else { + s.unsafeUntil = time.Time{} + } + } for name, sched := range s.schedulers { cur := false @@ -124,10 +136,11 @@ func (s *Scheduler) evaluate(now time.Time, notify bool) { } s.active = newActive - if !wallClockUnsafe { - s.lastEval = now - s.lastWallUnixNano = now.UnixNano() + if wallClockDiscontinuous { + s.unsafeUntil = now.Add(wallClockRecoveryHold) } + s.lastEval = now + s.lastWallUnixNano = now.UnixNano() if !changed || !notify || s.updateFn == nil { s.mu.Unlock() @@ -139,7 +152,7 @@ func (s *Scheduler) evaluate(now time.Time, notify bool) { updateFn(cp) } -func (s *Scheduler) wallClockUnsafeLocked(now time.Time) bool { +func (s *Scheduler) wallClockDiscontinuousLocked(now time.Time) bool { if s.lastEval.IsZero() { return false } diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index e781bd461..09c058513 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -245,19 +245,32 @@ func TestScheduler_WallClockBackwardStepStaysFailClosedUntilClockRecovers(t *tes t.Fatal("initial state should be active") } - s.evaluate(now.Add(-1*time.Hour), true) + // Simulate the real NTP rollback shape: monotonic time advances while + // wall time appears to move backward relative to the previous wall sample. + s.mu.Lock() + s.lastEval = now + s.lastWallUnixNano = now.Add(time.Hour).UnixNano() + s.mu.Unlock() + + s.evaluate(now.Add(time.Second), true) if lastState == nil || lastState["business-hours"] { t.Fatalf("first backward-step evaluation should fail closed, got state %+v", lastState) } lastState = nil - // Keep wall time behind the original baseline so continuity remains unsafe. - s.evaluate(now.Add(-59*time.Minute), true) + // The recovery hold keeps the scheduler closed for more than one tick, + // even after the new wall/monotonic samples are internally consistent. + s.evaluate(now.Add(time.Minute), true) if lastState != nil { t.Fatalf("second rollback evaluation should not notify without state change, got %+v", lastState) } if s.IsActive("business-hours") { - t.Fatal("scheduler should stay inactive while wall-clock remains rolled back") + t.Fatal("scheduler should stay inactive during wall-clock recovery hold") + } + + s.evaluate(now.Add(3*time.Minute), true) + if lastState == nil || !lastState["business-hours"] { + t.Fatalf("scheduler should recover after hold window, got state %+v", lastState) } } From 1074c62b9a0b4aba8c06c6f523751ac0782a9d22 Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sat, 16 May 2026 23:27:56 -0700 Subject: [PATCH 13/20] Harden policy scheduler apply lifecycle --- pkg/daemon/daemon.go | 1 + pkg/daemon/daemon_apply.go | 28 +++++++----- pkg/daemon/daemon_scheduler.go | 29 ++++++++++++- pkg/daemon/daemon_scheduler_test.go | 57 +++++++++++++++++++++++++ pkg/dataplane/userspace/manager.go | 7 ++- pkg/dataplane/userspace/manager_test.go | 33 ++++++++++++++ pkg/scheduler/README.md | 6 +++ 7 files changed, 145 insertions(+), 16 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 80119f732..cf19e1cf0 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -80,6 +80,7 @@ type Daemon struct { lldpMgr *lldp.Manager scheduler *scheduler.Scheduler schedulerCancel context.CancelFunc + policySchedulerConfigHash [32]byte policySchedulerEpoch atomic.Uint64 cluster *cluster.Manager sessionSync *cluster.SessionSync diff --git a/pkg/daemon/daemon_apply.go b/pkg/daemon/daemon_apply.go index 7af7133d2..0d2ec2eba 100644 --- a/pkg/daemon/daemon_apply.go +++ b/pkg/daemon/daemon_apply.go @@ -4,7 +4,6 @@ package daemon import ( "bytes" "context" - "errors" "fmt" "log/slog" "net" @@ -403,6 +402,8 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { // programming is done. This avoids the double-bind that causes EBUSY // on mlx5 zero-copy queues. rethMACPending := false + deferWorkersActive := false + var clearDeferWorkers func() if d.cluster != nil && cfg.Chassis.Cluster != nil && d.dp != nil { cc := cfg.Chassis.Cluster for rethName, physName := range cfg.RethToPhysical() { @@ -425,6 +426,15 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { type deferSetter interface{ SetDeferWorkers(bool) } if ds, ok := d.dp.(deferSetter); ok { ds.SetDeferWorkers(true) + deferWorkersActive = true + clearDeferWorkers = func() { + ds.SetDeferWorkers(false) + } + defer func() { + if deferWorkersActive { + clearDeferWorkers() + } + }() } } } @@ -438,24 +448,22 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { var err error if compileResult, err = d.dp.Compile(cfg); err != nil { d.recordCompileFailure(err) - if errors.Is(err, dpuserspace.ErrPolicySchedulerProtocolIncompatible) { - return err - } + return err } else { d.recordCompileSuccess() } } if d.dp != nil && policySchedulerActiveState != nil && compileResult != nil { - d.dp.UpdatePolicyScheduleState(cfg, policySchedulerActiveState) + if _, isUserspace := d.dp.(*dpuserspace.Manager); !isUserspace { + d.dp.UpdatePolicyScheduleState(cfg, policySchedulerActiveState) + } } // Clear defer flag after Compile so subsequent recompiles (where MAC // is already set) don't skip workers. - if rethMACPending { - type deferSetter interface{ SetDeferWorkers(bool) } - if ds, ok := d.dp.(deferSetter); ok { - ds.SetDeferWorkers(false) - } + if deferWorkersActive { + clearDeferWorkers() + deferWorkersActive = false } // 2.1. Wire aggressive session aging config to GC. diff --git a/pkg/daemon/daemon_scheduler.go b/pkg/daemon/daemon_scheduler.go index c53f9674f..2a7b18a16 100644 --- a/pkg/daemon/daemon_scheduler.go +++ b/pkg/daemon/daemon_scheduler.go @@ -2,6 +2,8 @@ package daemon import ( "context" + "crypto/sha256" + "encoding/json" "log/slog" "time" @@ -17,6 +19,12 @@ type policySchedulerActiveStateSetter interface { // lifecycle follow committed config instead of only daemon startup, and returns // the active-state map that must be used for the same apply transaction. func (d *Daemon) reconcilePolicySchedulerLocked(cfg *config.Config) map[string]bool { + hash, hasSchedulers := policySchedulerConfigHash(cfg) + if hasSchedulers && d.scheduler != nil && hash == d.policySchedulerConfigHash { + d.startPolicySchedulerLoopLocked() + return d.scheduler.ActiveState() + } + if d.schedulerCancel != nil { d.schedulerCancel() d.schedulerCancel = nil @@ -24,7 +32,8 @@ func (d *Daemon) reconcilePolicySchedulerLocked(cfg *config.Config) map[string]b d.scheduler = nil epoch := d.policySchedulerEpoch.Add(1) - if cfg == nil || len(cfg.Schedulers) == 0 { + if !hasSchedulers { + d.policySchedulerConfigHash = [32]byte{} return nil } @@ -32,10 +41,22 @@ func (d *Daemon) reconcilePolicySchedulerLocked(cfg *config.Config) map[string]b d.publishPolicyScheduleState(epoch, activeState) }, time.Now()) d.scheduler = sched + d.policySchedulerConfigHash = hash d.startPolicySchedulerLoopLocked() return activeState } +func policySchedulerConfigHash(cfg *config.Config) ([32]byte, bool) { + if cfg == nil || len(cfg.Schedulers) == 0 { + return [32]byte{}, false + } + b, err := json.Marshal(cfg.Schedulers) + if err != nil { + return [32]byte{}, false + } + return sha256.Sum256(b), true +} + func (d *Daemon) startPolicySchedulerLoopLocked() { if d.daemonCtx == nil || d.scheduler == nil || d.schedulerCancel != nil { return @@ -46,7 +67,11 @@ func (d *Daemon) startPolicySchedulerLoopLocked() { } func (d *Daemon) publishPolicyScheduleState(epoch uint64, activeState map[string]bool) { - if err := d.applySem.Acquire(context.Background(), 1); err != nil { + ctx := d.daemonCtx + if ctx == nil { + ctx = context.Background() + } + if err := d.applySem.Acquire(ctx, 1); err != nil { slog.Warn("scheduler: failed to acquire apply semaphore", "err", err) return } diff --git a/pkg/daemon/daemon_scheduler_test.go b/pkg/daemon/daemon_scheduler_test.go index d388ab8b3..ffc7df643 100644 --- a/pkg/daemon/daemon_scheduler_test.go +++ b/pkg/daemon/daemon_scheduler_test.go @@ -7,6 +7,7 @@ import ( "github.com/psaab/xpf/pkg/config" "github.com/psaab/xpf/pkg/scheduler" + "golang.org/x/sync/semaphore" ) func TestStartPolicySchedulerLoopLockedWaitsForDaemonContext(t *testing.T) { @@ -29,3 +30,59 @@ func TestStartPolicySchedulerLoopLockedWaitsForDaemonContext(t *testing.T) { } d.schedulerCancel() } + +func TestReconcilePolicySchedulerLockedKeepsByteIdenticalScheduler(t *testing.T) { + cfg := &config.Config{ + Schedulers: map[string]*config.SchedulerConfig{ + "always": {Name: "always"}, + }, + } + d := &Daemon{} + + first := d.reconcilePolicySchedulerLocked(cfg) + if d.scheduler == nil { + t.Fatal("scheduler was not created") + } + sched := d.scheduler + epoch := d.policySchedulerEpoch.Load() + + second := d.reconcilePolicySchedulerLocked(&config.Config{ + Schedulers: map[string]*config.SchedulerConfig{ + "always": {Name: "always"}, + }, + }) + if d.scheduler != sched { + t.Fatal("byte-identical scheduler config recreated the scheduler") + } + if got := d.policySchedulerEpoch.Load(); got != epoch { + t.Fatalf("epoch = %d, want unchanged %d", got, epoch) + } + if first["always"] != second["always"] { + t.Fatalf("active state changed across identical reconcile: first=%v second=%v", first, second) + } +} + +func TestPublishPolicyScheduleStateUsesDaemonContext(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + d := &Daemon{ + daemonCtx: ctx, + applySem: semaphore.NewWeighted(1), + } + if err := d.applySem.Acquire(context.Background(), 1); err != nil { + t.Fatalf("acquire semaphore: %v", err) + } + defer d.applySem.Release(1) + + done := make(chan struct{}) + go func() { + d.publishPolicyScheduleState(0, map[string]bool{"always": true}) + close(done) + }() + + select { + case <-done: + case <-time.After(500 * time.Millisecond): + t.Fatal("publishPolicyScheduleState blocked on apply semaphore after daemon context cancellation") + } +} diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index e6c6cea55..7c63bb490 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -443,6 +443,9 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ if cfg == nil || m.lastSnapshot == nil { return } + if m.proc == nil || m.proc.Process == nil { + return + } next := *m.lastSnapshot m.generation++ @@ -452,10 +455,6 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ next.Config = cfg next.Policies = buildPolicySnapshotsWithSchedulerState(cfg, activeCopy) - if m.proc == nil || m.proc.Process == nil { - m.lastSnapshot = &next - return - } if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { if disarmErr := m.disarmPolicySchedulerProtocolFailureLocked(err); disarmErr != nil { slog.Warn("userspace: failed to disarm helper after refusing policy scheduler publish", diff --git a/pkg/dataplane/userspace/manager_test.go b/pkg/dataplane/userspace/manager_test.go index 55a822a64..586a49147 100644 --- a/pkg/dataplane/userspace/manager_test.go +++ b/pkg/dataplane/userspace/manager_test.go @@ -2849,6 +2849,39 @@ func TestUpdatePolicyScheduleStateRefusesOldHelperForScheduledPolicies(t *testin } } +func TestUpdatePolicyScheduleStateWithoutHelperDoesNotMutateSnapshot(t *testing.T) { + cfg := &config.Config{} + cfg.Security.Policies = []*config.ZonePairPolicies{{ + FromZone: "trust", + ToZone: "untrust", + Policies: []*config.Policy{{ + Name: "scheduled-allow", + SchedulerName: "workhours", + Action: config.PolicyPermit, + }}, + }} + cfg.Schedulers = map[string]*config.SchedulerConfig{ + "workhours": {Name: "workhours"}, + } + + m := New() + m.generation = 7 + m.lastSnapshot = buildSnapshot(cfg, config.UserspaceConfig{}, 7, 0) + m.lastSnapshot.Policies[0].Inactive = false + + m.UpdatePolicyScheduleState(cfg, map[string]bool{"workhours": false}) + + if m.generation != 7 { + t.Fatalf("generation = %d, want unchanged 7", m.generation) + } + if m.lastSnapshot == nil || len(m.lastSnapshot.Policies) != 1 || m.lastSnapshot.Policies[0].Inactive { + t.Fatalf("lastSnapshot mutated without helper: %+v", m.lastSnapshot) + } + if got, ok := m.policySchedulerActive["workhours"]; !ok || got { + t.Fatalf("policySchedulerActive[workhours] = %t, present=%t; want false and present", got, ok) + } +} + func TestUserspaceSupportsScreenProfilesBasic(t *testing.T) { cfg := &config.Config{} cfg.Security.Screen = map[string]*config.ScreenProfile{ diff --git a/pkg/scheduler/README.md b/pkg/scheduler/README.md index 6e7d26a71..38522ad59 100644 --- a/pkg/scheduler/README.md +++ b/pkg/scheduler/README.md @@ -40,6 +40,12 @@ during specific windows. apply semaphore. Runtime scheduler callbacks take that semaphore before touching dataplane state so commits and time-window flips cannot publish hybrid policy snapshots. +- The daemon reconciler keeps an existing scheduler instance when the + committed scheduler config is byte-identical. This preserves the + monotonic/wall-clock recovery state and avoids resetting timers on + no-op commits. Runtime publishes use the daemon context when acquiring + the apply semaphore, so shutdown cancels a blocked scheduler publish + instead of leaving a goroutine parked behind a long apply. - The scheduler uses wall-clock time only in the control plane to evaluate Junos time windows. Packet workers must consume published active/inactive booleans from the userspace snapshot and must not evaluate scheduler time in From 33baa497d5f242acc3dd85b044e82609b8caf982 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 06:47:16 +0000 Subject: [PATCH 14/20] fix: keep degraded compile contract except protocol mismatch Agent-Logs-Url: https://github.com/psaab/xpf/sessions/1bb33a9b-c0bb-4973-8771-b143edac1b95 Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- pkg/daemon/compile_error_policy_test.go | 21 +++++++++++++++++++++ pkg/daemon/daemon_apply.go | 9 ++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 pkg/daemon/compile_error_policy_test.go diff --git a/pkg/daemon/compile_error_policy_test.go b/pkg/daemon/compile_error_policy_test.go new file mode 100644 index 000000000..fec27bcf9 --- /dev/null +++ b/pkg/daemon/compile_error_policy_test.go @@ -0,0 +1,21 @@ +package daemon + +import ( + "errors" + "fmt" + "testing" + + dpuserspace "github.com/psaab/xpf/pkg/dataplane/userspace" +) + +func TestCompileErrorMustAbortApply(t *testing.T) { + if !compileErrorMustAbortApply(dpuserspace.ErrPolicySchedulerProtocolIncompatible) { + t.Fatal("protocol incompatibility must abort apply") + } + if !compileErrorMustAbortApply(fmt.Errorf("wrapped: %w", dpuserspace.ErrPolicySchedulerProtocolIncompatible)) { + t.Fatal("wrapped protocol incompatibility must abort apply") + } + if compileErrorMustAbortApply(errors.New("compile failed for unrelated dataplane reason")) { + t.Fatal("non-protocol compile failures must not abort apply") + } +} diff --git a/pkg/daemon/daemon_apply.go b/pkg/daemon/daemon_apply.go index 0d2ec2eba..447e244b9 100644 --- a/pkg/daemon/daemon_apply.go +++ b/pkg/daemon/daemon_apply.go @@ -4,6 +4,7 @@ package daemon import ( "bytes" "context" + "errors" "fmt" "log/slog" "net" @@ -448,7 +449,9 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { var err error if compileResult, err = d.dp.Compile(cfg); err != nil { d.recordCompileFailure(err) - return err + if compileErrorMustAbortApply(err) { + return err + } } else { d.recordCompileSuccess() } @@ -1042,3 +1045,7 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { } return nil } + +func compileErrorMustAbortApply(err error) bool { + return errors.Is(err, dpuserspace.ErrPolicySchedulerProtocolIncompatible) +} From 29a0c2780539fccb59535ed8f79451e88bb07877 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 06:49:14 +0000 Subject: [PATCH 15/20] Preserve degraded compile contract; fail closed only on userspace scheduler protocol mismatch Agent-Logs-Url: https://github.com/psaab/xpf/sessions/1bb33a9b-c0bb-4973-8771-b143edac1b95 Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index e3175fa5f..9af98c17b 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/insomniacslk/dhcp v0.0.0-20251020182700-175e84fbb167 github.com/mdlayher/ndp v1.1.0 github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/vishvananda/netlink v1.3.1 golang.org/x/net v0.47.0 golang.org/x/sync v0.18.0 @@ -27,7 +28,6 @@ require ( github.com/mdlayher/socket v0.5.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pierrec/lz4/v4 v4.1.14 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect From 2fda5797daa07863c46172018afb617ab933938a Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sun, 17 May 2026 00:16:32 -0700 Subject: [PATCH 16/20] daemon: pin policy scheduler apply edge cases --- pkg/daemon/daemon_apply.go | 16 +++-- pkg/daemon/policy_scheduler_apply_test.go | 88 +++++++++++++++++++++++ pkg/scheduler/scheduler.go | 6 +- 3 files changed, 102 insertions(+), 8 deletions(-) create mode 100644 pkg/daemon/policy_scheduler_apply_test.go diff --git a/pkg/daemon/daemon_apply.go b/pkg/daemon/daemon_apply.go index 447e244b9..6eb1997e6 100644 --- a/pkg/daemon/daemon_apply.go +++ b/pkg/daemon/daemon_apply.go @@ -456,11 +456,7 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { d.recordCompileSuccess() } } - if d.dp != nil && policySchedulerActiveState != nil && compileResult != nil { - if _, isUserspace := d.dp.(*dpuserspace.Manager); !isUserspace { - d.dp.UpdatePolicyScheduleState(cfg, policySchedulerActiveState) - } - } + d.publishInitialPolicySchedulerStateLocked(cfg, policySchedulerActiveState, compileResult) // Clear defer flag after Compile so subsequent recompiles (where MAC // is already set) don't skip workers. @@ -1049,3 +1045,13 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { func compileErrorMustAbortApply(err error) bool { return errors.Is(err, dpuserspace.ErrPolicySchedulerProtocolIncompatible) } + +func (d *Daemon) publishInitialPolicySchedulerStateLocked(cfg *config.Config, activeState map[string]bool, compileResult *dataplane.CompileResult) { + if d.dp == nil || activeState == nil || compileResult == nil { + return + } + if _, isUserspace := d.dp.(*dpuserspace.Manager); isUserspace { + return + } + d.dp.UpdatePolicyScheduleState(cfg, activeState) +} diff --git a/pkg/daemon/policy_scheduler_apply_test.go b/pkg/daemon/policy_scheduler_apply_test.go new file mode 100644 index 000000000..ee238ea1e --- /dev/null +++ b/pkg/daemon/policy_scheduler_apply_test.go @@ -0,0 +1,88 @@ +package daemon + +import ( + "errors" + "testing" + + "github.com/psaab/xpf/pkg/cluster" + "github.com/psaab/xpf/pkg/config" + "github.com/psaab/xpf/pkg/dataplane" + dpuserspace "github.com/psaab/xpf/pkg/dataplane/userspace" +) + +type policySchedulerApplyTestDP struct { + *dataplane.Manager + + compileErr error + compileCalls int + deferStates []bool + updateCalls int + updateStateSeen map[string]bool +} + +func (d *policySchedulerApplyTestDP) Compile(*config.Config) (*dataplane.CompileResult, error) { + d.compileCalls++ + if d.compileErr != nil { + return nil, d.compileErr + } + return &dataplane.CompileResult{}, nil +} + +func (d *policySchedulerApplyTestDP) SetDeferWorkers(v bool) { + d.deferStates = append(d.deferStates, v) +} + +func (d *policySchedulerApplyTestDP) UpdatePolicyScheduleState(_ *config.Config, activeState map[string]bool) { + d.updateCalls++ + d.updateStateSeen = activeState +} + +func TestApplyConfigClearsDeferWorkersOnAbortCompileError(t *testing.T) { + dp := &policySchedulerApplyTestDP{ + compileErr: dpuserspace.ErrPolicySchedulerProtocolIncompatible, + } + d := &Daemon{ + cluster: &cluster.Manager{}, + dp: dp, + } + cfg := &config.Config{ + Chassis: config.ChassisConfig{ + Cluster: &config.ClusterConfig{ + ClusterID: 1, + NodeID: 0, + }, + }, + Interfaces: config.InterfacesConfig{ + Interfaces: map[string]*config.InterfaceConfig{ + "reth0": {Name: "reth0", RedundancyGroup: 1}, + "lo": {Name: "lo", RedundantParent: "reth0"}, + }, + }, + } + + if err := d.applyConfigLocked(cfg); !errors.Is(err, dpuserspace.ErrPolicySchedulerProtocolIncompatible) { + t.Fatalf("applyConfigLocked error = %v, want protocol incompatibility", err) + } + if dp.compileCalls != 1 { + t.Fatalf("Compile calls = %d, want 1", dp.compileCalls) + } + if len(dp.deferStates) != 2 || !dp.deferStates[0] || dp.deferStates[1] { + t.Fatalf("defer worker states = %v, want [true false]", dp.deferStates) + } +} + +func TestApplyConfigPublishesScheduleStateToNonUserspaceDataplane(t *testing.T) { + dp := &policySchedulerApplyTestDP{} + d := &Daemon{dp: dp} + cfg := &config.Config{} + activeState := map[string]bool{"workhours": true} + + d.publishInitialPolicySchedulerStateLocked(cfg, activeState, &dataplane.CompileResult{}) + + if dp.updateCalls != 1 { + t.Fatalf("UpdatePolicyScheduleState calls = %d, want 1", dp.updateCalls) + } + if got, ok := dp.updateStateSeen["workhours"]; !ok || !got { + t.Fatalf("active state for workhours = %t, present=%t; want active true", got, ok) + } +} diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index a19e5f8c4..2a299b56b 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -158,13 +158,13 @@ func (s *Scheduler) wallClockDiscontinuousLocked(now time.Time) bool { } wallElapsed := time.Duration(now.UnixNano() - s.lastWallUnixNano) if wallElapsed < 0 { - slog.Warn("scheduler: wall clock moved backward, failing closed until next evaluation", + slog.Warn("scheduler: wall clock moved backward, failing closed during recovery hold", "previous", s.lastEval, "current", now) return true } monoElapsed := now.Sub(s.lastEval) if monoElapsed < 0 { - slog.Warn("scheduler: monotonic clock moved backward, failing closed until next evaluation", + slog.Warn("scheduler: monotonic clock moved backward, failing closed during recovery hold", "previous", s.lastEval, "current", now) return true } @@ -173,7 +173,7 @@ func (s *Scheduler) wallClockDiscontinuousLocked(now time.Time) bool { delta = -delta } if delta > wallClockDriftTolerance { - slog.Warn("scheduler: wall clock drift exceeded tolerance, failing closed until next evaluation", + slog.Warn("scheduler: wall clock drift exceeded tolerance, failing closed during recovery hold", "wall_elapsed", wallElapsed, "monotonic_elapsed", monoElapsed, "tolerance", wallClockDriftTolerance) return true } From e243e510a31fbc2777734643d963f113efe645bb Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sun, 17 May 2026 00:38:58 -0700 Subject: [PATCH 17/20] daemon: preserve scheduler state across aborting applies --- pkg/config/compiler.go | 4 -- pkg/config/parser_ast_test.go | 13 ++--- pkg/daemon/daemon_apply.go | 4 +- pkg/daemon/daemon_scheduler.go | 59 ++++++++++++++++++++--- pkg/daemon/policy_scheduler_apply_test.go | 46 ++++++++++++++++++ pkg/dataplane/userspace/manager.go | 32 ++++++++---- pkg/dataplane/userspace/manager_test.go | 6 +++ pkg/dataplane/userspace/maps_sync.go | 12 +---- pkg/scheduler/scheduler_test.go | 25 ++++++++++ 9 files changed, 164 insertions(+), 37 deletions(-) diff --git a/pkg/config/compiler.go b/pkg/config/compiler.go index 88faef325..2fdf4d08e 100644 --- a/pkg/config/compiler.go +++ b/pkg/config/compiler.go @@ -221,10 +221,6 @@ func compileExpanded(tree *ConfigTree) (*Config, error) { if err := validateThreeColorPolicersStrict(cfg.Firewall.ThreeColorPolicers); err != nil { return nil, err } - if err := validatePolicySchedulerReferencesStrict(cfg); err != nil { - return nil, err - } - if warnings := ValidateConfig(cfg); len(warnings) > 0 { for _, w := range warnings { cfg.Warnings = append(cfg.Warnings, w) diff --git a/pkg/config/parser_ast_test.go b/pkg/config/parser_ast_test.go index 83bddbca4..604f50591 100644 --- a/pkg/config/parser_ast_test.go +++ b/pkg/config/parser_ast_test.go @@ -1504,7 +1504,7 @@ security { } } -func TestPolicySchedulerMissingReferenceFailsCompile(t *testing.T) { +func TestPolicySchedulerMissingReferenceWarns(t *testing.T) { input := `security { policies { from-zone trust to-zone untrust { @@ -1522,12 +1522,13 @@ func TestPolicySchedulerMissingReferenceFailsCompile(t *testing.T) { if len(errs) > 0 { t.Fatalf("parse errors: %v", errs) } - _, err := CompileConfig(tree) - if err == nil { - t.Fatal("CompileConfig succeeded, want missing scheduler error") + cfg, err := CompileConfig(tree) + if err != nil { + t.Fatalf("CompileConfig returned error for warning-only missing scheduler reference: %v", err) } - if !strings.Contains(err.Error(), `policy "sched-test" references undefined scheduler "missing-sched"`) { - t.Fatalf("CompileConfig error = %v", err) + warnings := strings.Join(cfg.Warnings, "\n") + if !strings.Contains(warnings, `policy "sched-test": scheduler "missing-sched" not defined`) { + t.Fatalf("CompileConfig warnings = %v, want missing scheduler warning", cfg.Warnings) } } diff --git a/pkg/daemon/daemon_apply.go b/pkg/daemon/daemon_apply.go index 6eb1997e6..e4a0acd63 100644 --- a/pkg/daemon/daemon_apply.go +++ b/pkg/daemon/daemon_apply.go @@ -440,7 +440,8 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { } } - policySchedulerActiveState := d.reconcilePolicySchedulerLocked(cfg) + policySchedulerApplyTime := time.Now() + policySchedulerActiveState := d.policySchedulerActiveStateForApplyLocked(cfg, policySchedulerApplyTime) d.seedPolicySchedulerActiveStateLocked(policySchedulerActiveState) // 2. Compile eBPF dataplane @@ -456,6 +457,7 @@ func (d *Daemon) applyConfigLocked(cfg *config.Config) error { d.recordCompileSuccess() } } + policySchedulerActiveState = d.reconcilePolicySchedulerLockedAt(cfg, policySchedulerApplyTime) d.publishInitialPolicySchedulerStateLocked(cfg, policySchedulerActiveState, compileResult) // Clear defer flag after Compile so subsequent recompiles (where MAC diff --git a/pkg/daemon/daemon_scheduler.go b/pkg/daemon/daemon_scheduler.go index 2a7b18a16..d0b090a21 100644 --- a/pkg/daemon/daemon_scheduler.go +++ b/pkg/daemon/daemon_scheduler.go @@ -3,8 +3,8 @@ package daemon import ( "context" "crypto/sha256" - "encoding/json" "log/slog" + "sort" "time" "github.com/psaab/xpf/pkg/config" @@ -19,6 +19,10 @@ type policySchedulerActiveStateSetter interface { // lifecycle follow committed config instead of only daemon startup, and returns // the active-state map that must be used for the same apply transaction. func (d *Daemon) reconcilePolicySchedulerLocked(cfg *config.Config) map[string]bool { + return d.reconcilePolicySchedulerLockedAt(cfg, time.Now()) +} + +func (d *Daemon) reconcilePolicySchedulerLockedAt(cfg *config.Config, now time.Time) map[string]bool { hash, hasSchedulers := policySchedulerConfigHash(cfg) if hasSchedulers && d.scheduler != nil && hash == d.policySchedulerConfigHash { d.startPolicySchedulerLoopLocked() @@ -39,22 +43,65 @@ func (d *Daemon) reconcilePolicySchedulerLocked(cfg *config.Config) map[string]b sched, activeState := scheduler.NewPrimed(cfg.Schedulers, func(activeState map[string]bool) { d.publishPolicyScheduleState(epoch, activeState) - }, time.Now()) + }, now) d.scheduler = sched d.policySchedulerConfigHash = hash d.startPolicySchedulerLoopLocked() return activeState } +func (d *Daemon) policySchedulerActiveStateForApplyLocked(cfg *config.Config, now time.Time) map[string]bool { + hash, hasSchedulers := policySchedulerConfigHash(cfg) + if !hasSchedulers { + return nil + } + if d.scheduler != nil && hash == d.policySchedulerConfigHash { + return d.scheduler.ActiveState() + } + _, activeState := scheduler.NewPrimed(cfg.Schedulers, func(map[string]bool) {}, now) + return activeState +} + func policySchedulerConfigHash(cfg *config.Config) ([32]byte, bool) { if cfg == nil || len(cfg.Schedulers) == 0 { return [32]byte{}, false } - b, err := json.Marshal(cfg.Schedulers) - if err != nil { - return [32]byte{}, false + h := sha256.New() + names := make([]string, 0, len(cfg.Schedulers)) + for name := range cfg.Schedulers { + names = append(names, name) + } + sort.Strings(names) + for _, name := range names { + writePolicySchedulerHashString(h, name) + sched := cfg.Schedulers[name] + if sched == nil { + writePolicySchedulerHashString(h, "") + continue + } + writePolicySchedulerHashString(h, sched.Name) + writePolicySchedulerHashString(h, sched.StartTime) + writePolicySchedulerHashString(h, sched.StopTime) + writePolicySchedulerHashString(h, sched.StartDate) + writePolicySchedulerHashString(h, sched.StopDate) + if sched.Daily { + _, _ = h.Write([]byte{1}) + } else { + _, _ = h.Write([]byte{0}) + } + } + var out [32]byte + copy(out[:], h.Sum(nil)) + return out, true +} + +func writePolicySchedulerHashString(h interface{ Write([]byte) (int, error) }, s string) { + var lenBuf [8]byte + for i := 0; i < len(lenBuf); i++ { + lenBuf[i] = byte(uint64(len(s)) >> (8 * i)) } - return sha256.Sum256(b), true + _, _ = h.Write(lenBuf[:]) + _, _ = h.Write([]byte(s)) } func (d *Daemon) startPolicySchedulerLoopLocked() { diff --git a/pkg/daemon/policy_scheduler_apply_test.go b/pkg/daemon/policy_scheduler_apply_test.go index ee238ea1e..a6755390e 100644 --- a/pkg/daemon/policy_scheduler_apply_test.go +++ b/pkg/daemon/policy_scheduler_apply_test.go @@ -3,11 +3,13 @@ package daemon import ( "errors" "testing" + "time" "github.com/psaab/xpf/pkg/cluster" "github.com/psaab/xpf/pkg/config" "github.com/psaab/xpf/pkg/dataplane" dpuserspace "github.com/psaab/xpf/pkg/dataplane/userspace" + "github.com/psaab/xpf/pkg/scheduler" ) type policySchedulerApplyTestDP struct { @@ -71,6 +73,46 @@ func TestApplyConfigClearsDeferWorkersOnAbortCompileError(t *testing.T) { } } +func TestApplyConfigProtocolAbortPreservesExistingScheduler(t *testing.T) { + oldCfg := &config.Config{ + Schedulers: map[string]*config.SchedulerConfig{ + "old": {Name: "old"}, + }, + } + oldScheduler, oldState := scheduler.NewPrimed(oldCfg.Schedulers, func(map[string]bool) {}, testPolicySchedulerApplyNow()) + oldHash, _ := policySchedulerConfigHash(oldCfg) + dp := &policySchedulerApplyTestDP{ + compileErr: dpuserspace.ErrPolicySchedulerProtocolIncompatible, + } + d := &Daemon{ + dp: dp, + scheduler: oldScheduler, + policySchedulerConfigHash: oldHash, + } + d.policySchedulerEpoch.Store(42) + newCfg := &config.Config{ + Schedulers: map[string]*config.SchedulerConfig{ + "new": {Name: "new"}, + }, + } + + if err := d.applyConfigLocked(newCfg); !errors.Is(err, dpuserspace.ErrPolicySchedulerProtocolIncompatible) { + t.Fatalf("applyConfigLocked error = %v, want protocol incompatibility", err) + } + if d.scheduler != oldScheduler { + t.Fatal("protocol abort replaced scheduler before apply completed") + } + if got := d.policySchedulerEpoch.Load(); got != 42 { + t.Fatalf("policySchedulerEpoch = %d, want unchanged 42", got) + } + if d.policySchedulerConfigHash != oldHash { + t.Fatal("protocol abort changed scheduler config hash") + } + if got := d.scheduler.ActiveState()["old"]; got != oldState["old"] { + t.Fatalf("old scheduler active state = %t, want %t", got, oldState["old"]) + } +} + func TestApplyConfigPublishesScheduleStateToNonUserspaceDataplane(t *testing.T) { dp := &policySchedulerApplyTestDP{} d := &Daemon{dp: dp} @@ -86,3 +128,7 @@ func TestApplyConfigPublishesScheduleStateToNonUserspaceDataplane(t *testing.T) t.Fatalf("active state for workhours = %t, present=%t; want active true", got, ok) } } + +func testPolicySchedulerApplyNow() time.Time { + return time.Date(2026, 5, 17, 12, 0, 0, 0, time.UTC) +} diff --git a/pkg/dataplane/userspace/manager.go b/pkg/dataplane/userspace/manager.go index 7c63bb490..a532a5870 100644 --- a/pkg/dataplane/userspace/manager.go +++ b/pkg/dataplane/userspace/manager.go @@ -447,14 +447,6 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ return } - next := *m.lastSnapshot - m.generation++ - next.Generation = m.generation - next.FIBGeneration = m.readFIBGeneration() - next.GeneratedAt = time.Now().UTC() - next.Config = cfg - next.Policies = buildPolicySnapshotsWithSchedulerState(cfg, activeCopy) - if err := m.ensurePolicySchedulerProtocolLocked(cfg); err != nil { if disarmErr := m.disarmPolicySchedulerProtocolFailureLocked(err); disarmErr != nil { slog.Warn("userspace: failed to disarm helper after refusing policy scheduler publish", @@ -463,6 +455,14 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ slog.Warn("userspace: refusing policy scheduler publish to incompatible helper", "err", err) return } + next := *m.lastSnapshot + nextGeneration := m.generation + 1 + next.Generation = nextGeneration + next.FIBGeneration = m.readFIBGeneration() + next.GeneratedAt = time.Now().UTC() + next.Config = cfg + next.Policies = buildPolicySnapshotsWithSchedulerState(cfg, activeCopy) + publishSnap := next publishSnap.Neighbors = filterPublishableNeighbors(next.Neighbors) var status ProcessStatus @@ -470,6 +470,7 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ slog.Warn("userspace: failed to publish policy scheduler state", "err", err) return } + m.generation = nextGeneration m.lastSnapshot = &next m.rebuildNeighborIndex() m.rebuildMonitoredIfindexes() @@ -555,7 +556,7 @@ func (m *Manager) ensurePolicySchedulerProtocolLocked(cfg *config.Config) error } var status ProcessStatus if err := m.requestLocked(ControlRequest{Type: "status"}, &status); err == nil { - m.lastStatus.ConfigSnapshotProtocolVersion = status.ConfigSnapshotProtocolVersion + m.recordHelperStatusLocked(&status) if status.ConfigSnapshotProtocolVersion >= ProtocolVersion { return nil } @@ -568,6 +569,18 @@ func (m *Manager) ensurePolicySchedulerProtocolLocked(cfg *config.Config) error ) } +func (m *Manager) recordHelperStatusLocked(status *ProcessStatus) { + status.DataplaneMode = m.mode.String() + status.ConfiguredMode = m.configuredMode.String() + status.EntryPrograms = m.entryProgramsLocked() + status.FallbackCounters = m.readFallbackStatsLocked() + if m.eventStream != nil { + es := m.eventStream.Status() + status.EventStream = &es + } + m.lastStatus = *status +} + func (m *Manager) disarmPolicySchedulerProtocolFailureLocked(protocolErr error) error { if m.proc == nil || m.proc.Process == nil { return nil @@ -583,6 +596,7 @@ func (m *Manager) disarmPolicySchedulerProtocolFailureLocked(protocolErr error) return fmt.Errorf("userspace: disarm helper after policy scheduler protocol error: %w", err) } if err := m.applyHelperStatusLocked(&status); err != nil { + m.recordHelperStatusLocked(&status) return fmt.Errorf("userspace: sync helper status after policy scheduler fail-closed disarm: %w", err) } slog.Warn("userspace: disarmed helper after policy scheduler protocol error", "err", protocolErr) diff --git a/pkg/dataplane/userspace/manager_test.go b/pkg/dataplane/userspace/manager_test.go index 586a49147..700562ea1 100644 --- a/pkg/dataplane/userspace/manager_test.go +++ b/pkg/dataplane/userspace/manager_test.go @@ -2844,6 +2844,12 @@ func TestUpdatePolicyScheduleStateRefusesOldHelperForScheduledPolicies(t *testin if m.lastStatus.ForwardingArmed { t.Fatal("helper status should be disarmed after protocol mismatch") } + if m.generation != 7 { + t.Fatalf("generation = %d, want unchanged 7 after protocol mismatch", m.generation) + } + if m.lastStatus.PID != 1234 { + t.Fatalf("lastStatus PID = %d, want status probe PID 1234", m.lastStatus.PID) + } if m.lastSnapshot == nil || len(m.lastSnapshot.Policies) != 1 || m.lastSnapshot.Policies[0].Inactive { t.Fatalf("manager should not cache rejected inactive state when publish is refused: %+v", m.lastSnapshot) } diff --git a/pkg/dataplane/userspace/maps_sync.go b/pkg/dataplane/userspace/maps_sync.go index a57ca825c..504dcb5e8 100644 --- a/pkg/dataplane/userspace/maps_sync.go +++ b/pkg/dataplane/userspace/maps_sync.go @@ -604,17 +604,7 @@ ctrlReady: // even for packets that bypassed the BPF pipeline (#332). m.syncBPFCountersLocked(status) - // Populate runtime mode and observability fields in status. - status.DataplaneMode = m.mode.String() - status.ConfiguredMode = m.configuredMode.String() - status.EntryPrograms = m.entryProgramsLocked() - status.FallbackCounters = m.readFallbackStatsLocked() - if m.eventStream != nil { - es := m.eventStream.Status() - status.EventStream = &es - } - - m.lastStatus = *status + m.recordHelperStatusLocked(status) return nil } diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index 09c058513..8c4459b7f 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -274,6 +274,31 @@ func TestScheduler_WallClockBackwardStepStaysFailClosedUntilClockRecovers(t *tes } } +func TestScheduler_MonotonicAdvanceDoesNotFailClosed(t *testing.T) { + schedCfg := map[string]*config.SchedulerConfig{ + "business-hours": { + Name: "business-hours", + StartTime: "00:00:00", + StopTime: "23:59:59", + }, + } + start := time.Now() + s, state := NewPrimed(schedCfg, func(map[string]bool) {}, start) + if !state["business-hours"] { + t.Fatal("initial state should be active") + } + + // time.Add preserves Go's monotonic reading. This exercises the real + // monotonic path; time.Date-only tests would silently skip it. + s.evaluate(start.Add(time.Minute), true) + if s.IsActive("business-hours") == false { + t.Fatal("monotonic time advance with matching wall time should stay active") + } + if !s.unsafeUntil.IsZero() { + t.Fatalf("unsafeUntil = %v, want zero for monotonic advance", s.unsafeUntil) + } +} + func TestScheduler_ActiveState(t *testing.T) { schedCfg := map[string]*config.SchedulerConfig{ "always-on": {Name: "always-on"}, From ffa694bafe69df273461a9ee90f8163e4021d177 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 07:57:17 +0000 Subject: [PATCH 18/20] fix: align scheduler rule slot updates for ebpf/dpdk Agent-Logs-Url: https://github.com/psaab/xpf/sessions/4088459f-cc3c-47bb-8ec6-831e389d7a37 Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- pkg/config/compiler.go | 8 ++ pkg/config/parser_ast_test.go | 28 ++++++ pkg/dataplane/dpdk/dpdk_cgo.go | 52 +++++------ pkg/dataplane/maps.go | 55 +++++------ pkg/dataplane/policy_scheduler.go | 123 +++++++++++++++++++++++++ pkg/dataplane/policy_scheduler_test.go | 88 ++++++++++++++++++ 6 files changed, 296 insertions(+), 58 deletions(-) create mode 100644 pkg/dataplane/policy_scheduler.go create mode 100644 pkg/dataplane/policy_scheduler_test.go diff --git a/pkg/config/compiler.go b/pkg/config/compiler.go index 2fdf4d08e..7dedfeea1 100644 --- a/pkg/config/compiler.go +++ b/pkg/config/compiler.go @@ -548,6 +548,14 @@ func ValidateConfig(cfg *Config) []string { } } } + for _, p := range cfg.Security.GlobalPolicies { + if p.SchedulerName != "" { + if _, ok := cfg.Schedulers[p.SchedulerName]; !ok { + warnings = append(warnings, fmt.Sprintf( + "policy %q: scheduler %q not defined", p.Name, p.SchedulerName)) + } + } + } // Validate routing-instance interface references for _, ri := range cfg.RoutingInstances { diff --git a/pkg/config/parser_ast_test.go b/pkg/config/parser_ast_test.go index 604f50591..a12b0cd2c 100644 --- a/pkg/config/parser_ast_test.go +++ b/pkg/config/parser_ast_test.go @@ -1532,6 +1532,34 @@ func TestPolicySchedulerMissingReferenceWarns(t *testing.T) { } } +func TestGlobalPolicySchedulerMissingReferenceWarns(t *testing.T) { + input := `security { + policies { + global { + policy global-sched-test { + match { source-address any; destination-address any; application any; } + then { permit; } + scheduler-name missing-global-sched; + } + } + } +} +` + parser := NewParser(input) + tree, errs := parser.Parse() + if len(errs) > 0 { + t.Fatalf("parse errors: %v", errs) + } + cfg, err := CompileConfig(tree) + if err != nil { + t.Fatalf("CompileConfig returned error for warning-only missing global scheduler reference: %v", err) + } + warnings := strings.Join(cfg.Warnings, "\n") + if !strings.Contains(warnings, `policy "global-sched-test": scheduler "missing-global-sched" not defined`) { + t.Fatalf("CompileConfig warnings = %v, want missing global scheduler warning", cfg.Warnings) + } +} + func TestMultiTermApplication(t *testing.T) { input := `applications { application ssh-long { diff --git a/pkg/dataplane/dpdk/dpdk_cgo.go b/pkg/dataplane/dpdk/dpdk_cgo.go index d2d04056f..fca9e8725 100644 --- a/pkg/dataplane/dpdk/dpdk_cgo.go +++ b/pkg/dataplane/dpdk/dpdk_cgo.go @@ -348,37 +348,33 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ return } - policySetID := uint32(0) - for _, zpp := range cfg.Security.Policies { - for i, pol := range zpp.Policies { - if pol.SchedulerName == "" { - policySetID++ - continue - } - - active, exists := activeState[pol.SchedulerName] - if !exists { - active = true // default active if scheduler not found - } + slots, err := dataplane.BuildScheduledPolicyRuleSlots(cfg) + if err != nil { + slog.Warn("dpdk policy scheduler: failed to build scheduled rule slots", "err", err) + return + } + for _, slot := range slots { + active, exists := activeState[slot.SchedulerName] + if !exists { + active = true // default active if scheduler not found + } - idx := policySetID*C.MAX_RULES_PER_POLICY + uint32(i) - ptr := (*C.struct_policy_rule)(unsafe.Pointer( - uintptr(unsafe.Pointer(shm.policy_rules)) + - uintptr(idx)*unsafe.Sizeof(C.struct_policy_rule{}))) + idx := slot.AbsoluteRuleIdx + ptr := (*C.struct_policy_rule)(unsafe.Pointer( + uintptr(unsafe.Pointer(shm.policy_rules)) + + uintptr(idx)*unsafe.Sizeof(C.struct_policy_rule{}))) - var newActive C.uint8_t - if active { - newActive = 1 - } - if ptr.active != newActive { - ptr.active = newActive - slog.Info("DPDK policy schedule state updated", - "policy", pol.Name, - "scheduler", pol.SchedulerName, - "active", active) - } + var newActive C.uint8_t + if active { + newActive = 1 + } + if ptr.active != newActive { + ptr.active = newActive + slog.Info("DPDK policy schedule state updated", + "policy", slot.PolicyName, + "scheduler", slot.SchedulerName, + "active", active) } - policySetID++ } } diff --git a/pkg/dataplane/maps.go b/pkg/dataplane/maps.go index c66627f4b..bf10d591a 100644 --- a/pkg/dataplane/maps.go +++ b/pkg/dataplane/maps.go @@ -1500,39 +1500,34 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ return } - policySetID := uint32(0) - for _, zpp := range cfg.Security.Policies { - for i, pol := range zpp.Policies { - if pol.SchedulerName == "" { - policySetID++ - continue - } - - active, exists := activeState[pol.SchedulerName] - if !exists { - active = true // default active if scheduler not found - } + slots, err := BuildScheduledPolicyRuleSlots(cfg) + if err != nil { + slog.Warn("policy scheduler: failed to build scheduled rule slots", "err", err) + return + } + for _, slot := range slots { + active, exists := activeState[slot.SchedulerName] + if !exists { + active = true // default active if scheduler not found + } - idx := policySetID*MaxRulesPerPolicy + uint32(i) - var rule PolicyRule - if err := zm.Lookup(idx, &rule); err != nil { - continue - } + var rule PolicyRule + if err := zm.Lookup(slot.AbsoluteRuleIdx, &rule); err != nil { + continue + } - var newActive uint8 - if active { - newActive = 1 - } - if rule.Active != newActive { - rule.Active = newActive - zm.Update(idx, rule, ebpf.UpdateAny) - slog.Info("policy schedule state updated", - "policy", pol.Name, - "scheduler", pol.SchedulerName, - "active", active) - } + var newActive uint8 + if active { + newActive = 1 + } + if rule.Active != newActive { + rule.Active = newActive + zm.Update(slot.AbsoluteRuleIdx, rule, ebpf.UpdateAny) + slog.Info("policy schedule state updated", + "policy", slot.PolicyName, + "scheduler", slot.SchedulerName, + "active", active) } - policySetID++ } } diff --git a/pkg/dataplane/policy_scheduler.go b/pkg/dataplane/policy_scheduler.go new file mode 100644 index 000000000..2f7d8cf07 --- /dev/null +++ b/pkg/dataplane/policy_scheduler.go @@ -0,0 +1,123 @@ +package dataplane + +import ( + "fmt" + + "github.com/psaab/xpf/pkg/config" +) + +type ScheduledPolicyRuleSlot struct { + PolicyName string + SchedulerName string + AbsoluteRuleIdx uint32 +} + +func BuildScheduledPolicyRuleSlots(cfg *config.Config) ([]ScheduledPolicyRuleSlot, error) { + if cfg == nil { + return nil, nil + } + + var slots []ScheduledPolicyRuleSlot + policySetID := uint32(0) + + for _, zpp := range cfg.Security.Policies { + if zpp == nil { + policySetID++ + continue + } + setSlots, err := scheduledPolicyRuleSlotsForPolicies(policySetID, zpp.Policies, cfg) + if err != nil { + return nil, err + } + slots = append(slots, setSlots...) + policySetID++ + } + + if len(cfg.Security.GlobalPolicies) > 0 { + setSlots, err := scheduledPolicyRuleSlotsForPolicies(policySetID, cfg.Security.GlobalPolicies, cfg) + if err != nil { + return nil, err + } + slots = append(slots, setSlots...) + } + + return slots, nil +} + +func scheduledPolicyRuleSlotsForPolicies(policySetID uint32, policies []*config.Policy, cfg *config.Config) ([]ScheduledPolicyRuleSlot, error) { + var ( + slots []ScheduledPolicyRuleSlot + seq uint32 + ) + + for _, pol := range policies { + if pol == nil { + continue + } + ruleCount, err := expandedPolicyRuleCount(cfg, pol) + if err != nil { + return nil, err + } + if pol.SchedulerName == "" { + seq += ruleCount + continue + } + for i := uint32(0); i < ruleCount; i++ { + slots = append(slots, ScheduledPolicyRuleSlot{ + PolicyName: pol.Name, + SchedulerName: pol.SchedulerName, + AbsoluteRuleIdx: policySetID*MaxRulesPerPolicy + seq + i, + }) + } + seq += ruleCount + } + + return slots, nil +} + +func expandedPolicyRuleCount(cfg *config.Config, pol *config.Policy) (uint32, error) { + hasAny := false + for _, appName := range pol.Match.Applications { + if appName == "any" { + hasAny = true + break + } + } + if hasAny || len(pol.Match.Applications) == 0 { + return 1, nil + } + + seen := make(map[string]struct{}) + count := uint32(0) + for _, appName := range pol.Match.Applications { + if appName == "" { + continue + } + if _, isSet := cfg.Applications.ApplicationSets[appName]; isSet { + expanded, err := config.ExpandApplicationSet(appName, &cfg.Applications) + if err != nil { + return 0, fmt.Errorf("policy %q expand app-set %q: %w", pol.Name, appName, err) + } + for _, expandedApp := range expanded { + if expandedApp == "" { + continue + } + if _, ok := seen[expandedApp]; ok { + continue + } + seen[expandedApp] = struct{}{} + count++ + } + continue + } + if _, ok := seen[appName]; ok { + continue + } + seen[appName] = struct{}{} + count++ + } + if count == 0 { + return 1, nil + } + return count, nil +} diff --git a/pkg/dataplane/policy_scheduler_test.go b/pkg/dataplane/policy_scheduler_test.go new file mode 100644 index 000000000..0e6f7f670 --- /dev/null +++ b/pkg/dataplane/policy_scheduler_test.go @@ -0,0 +1,88 @@ +package dataplane + +import ( + "testing" + + "github.com/psaab/xpf/pkg/config" +) + +func TestBuildScheduledPolicyRuleSlotsHandlesExpandedAndGlobalPolicies(t *testing.T) { + cfg := &config.Config{ + Applications: config.ApplicationsConfig{ + ApplicationSets: map[string]*config.ApplicationSet{ + "set-web": { + Name: "set-web", + Applications: []string{ + "junos-http", + "junos-https", + }, + }, + }, + }, + Security: config.SecurityConfig{ + Policies: []*config.ZonePairPolicies{ + { + FromZone: "trust", + ToZone: "untrust", + Policies: []*config.Policy{ + { + Name: "unscheduled", + SchedulerName: "", + Match: config.PolicyMatch{ + Applications: []string{"any"}, + }, + }, + { + Name: "scheduled-zp", + SchedulerName: "workhours", + Match: config.PolicyMatch{ + Applications: []string{"set-web"}, + }, + }, + }, + }, + }, + GlobalPolicies: []*config.Policy{ + { + Name: "scheduled-global", + SchedulerName: "afterhours", + Match: config.PolicyMatch{ + Applications: []string{"set-web"}, + }, + }, + }, + }, + } + + slots, err := BuildScheduledPolicyRuleSlots(cfg) + if err != nil { + t.Fatalf("BuildScheduledPolicyRuleSlots returned error: %v", err) + } + if len(slots) != 4 { + t.Fatalf("slot count = %d, want 4", len(slots)) + } + for _, slot := range slots { + if slot.PolicyName == "unscheduled" { + t.Fatalf("unscheduled policy unexpectedly included in slots: %+v", slot) + } + } + + globalBase := uint32(len(cfg.Security.Policies)) * MaxRulesPerPolicy + + want := []struct { + policy string + scheduler string + index uint32 + }{ + {policy: "scheduled-zp", scheduler: "workhours", index: 1}, + {policy: "scheduled-zp", scheduler: "workhours", index: 2}, + {policy: "scheduled-global", scheduler: "afterhours", index: globalBase}, + {policy: "scheduled-global", scheduler: "afterhours", index: globalBase + 1}, + } + for i, w := range want { + got := slots[i] + if got.PolicyName != w.policy || got.SchedulerName != w.scheduler || got.AbsoluteRuleIdx != w.index { + t.Fatalf("slot[%d] = %+v, want policy=%q scheduler=%q index=%d", i, got, w.policy, w.scheduler, w.index) + } + } +} From fb36b3ea24493e59645743d0dcd50b771409ed58 Mon Sep 17 00:00:00 2001 From: Paul Saab Date: Sun, 17 May 2026 01:19:54 -0700 Subject: [PATCH 19/20] dataplane: schedule compiled policy slots --- pkg/config/compiler.go | 2 +- pkg/config/parser_ast_test.go | 6 +-- pkg/dataplane/README.md | 4 +- pkg/dataplane/compiler.go | 32 ++++++++++++ pkg/dataplane/compiler_test.go | 91 ++++++++++++++++++++++++++++++++++ pkg/dataplane/dpdk/dpdk_cgo.go | 15 +++--- pkg/dataplane/maps.go | 16 +++--- 7 files changed, 144 insertions(+), 22 deletions(-) diff --git a/pkg/config/compiler.go b/pkg/config/compiler.go index 7dedfeea1..f80311dbd 100644 --- a/pkg/config/compiler.go +++ b/pkg/config/compiler.go @@ -552,7 +552,7 @@ func ValidateConfig(cfg *Config) []string { if p.SchedulerName != "" { if _, ok := cfg.Schedulers[p.SchedulerName]; !ok { warnings = append(warnings, fmt.Sprintf( - "policy %q: scheduler %q not defined", p.Name, p.SchedulerName)) + "global policy %q: scheduler %q not defined", p.Name, p.SchedulerName)) } } } diff --git a/pkg/config/parser_ast_test.go b/pkg/config/parser_ast_test.go index a12b0cd2c..20fbcc257 100644 --- a/pkg/config/parser_ast_test.go +++ b/pkg/config/parser_ast_test.go @@ -1536,10 +1536,10 @@ func TestGlobalPolicySchedulerMissingReferenceWarns(t *testing.T) { input := `security { policies { global { - policy global-sched-test { + policy sched-global { match { source-address any; destination-address any; application any; } then { permit; } - scheduler-name missing-global-sched; + scheduler-name missing-sched; } } } @@ -1555,7 +1555,7 @@ func TestGlobalPolicySchedulerMissingReferenceWarns(t *testing.T) { t.Fatalf("CompileConfig returned error for warning-only missing global scheduler reference: %v", err) } warnings := strings.Join(cfg.Warnings, "\n") - if !strings.Contains(warnings, `policy "global-sched-test": scheduler "missing-global-sched" not defined`) { + if !strings.Contains(warnings, `global policy "sched-global": scheduler "missing-sched" not defined`) { t.Fatalf("CompileConfig warnings = %v, want missing global scheduler warning", cfg.Warnings) } } diff --git a/pkg/dataplane/README.md b/pkg/dataplane/README.md index 5a8306e6e..9e31c3db8 100644 --- a/pkg/dataplane/README.md +++ b/pkg/dataplane/README.md @@ -34,8 +34,8 @@ sent while exact queues were still backlogged. NAT, static NAT, NAT64 prefixes, NPTv6, screen profiles, default policy, flow timeouts, firewall filters, flow config, port mirroring. -- `CompileResult` — `compiler.go`. Zone/policy/NAT/app IDs and the - per-interface networkd configs. +- `CompileResult` — `compiler.go`. Zone/policy/NAT/app IDs, compiled + policy-scheduler rule slots, and the per-interface networkd configs. - Session iteration: `IterateSessions`, `BatchIterateSessions`, `IterateSessionsV6`, `BatchIterateSessionsV6`. diff --git a/pkg/dataplane/compiler.go b/pkg/dataplane/compiler.go index 4bd6761ae..dce17ca60 100644 --- a/pkg/dataplane/compiler.go +++ b/pkg/dataplane/compiler.go @@ -33,6 +33,8 @@ type CompileResult struct { PolicySets int // number of policy sets created FilterIDs map[string]uint32 // "inet:name" or "inet6:name" -> filter_id + PolicyScheduleRuleSlots []PolicyScheduleRuleSlot + Lo0FilterV4 uint32 // lo0 inet filter ID (0=none), set by compileFirewallFilters Lo0FilterV6 uint32 // lo0 inet6 filter ID (0=none), set by compileFirewallFilters @@ -68,6 +70,18 @@ type CompileResult struct { ethtoolApplied map[string]bool } +// PolicyScheduleRuleSlot records the exact compiled policy_rules map slot for a +// scheduled policy. A single policy can compile into multiple dense app-term +// slots; runtime scheduler updates must toggle those compiled slots rather than +// recomputing indexes from the original config policy position. +type PolicyScheduleRuleSlot struct { + PolicySetID uint32 + RuleIndex uint32 + RuleID uint32 + PolicyName string + SchedulerName string +} + // cachedInterfaceByName returns a cached *net.Interface, performing the // syscall only on the first lookup for each name. func (r *CompileResult) cachedInterfaceByName(name string) (*net.Interface, error) { @@ -792,6 +806,15 @@ func compilePolicies(dp DataPlane, cfg *config.Config, result *CompileResult) er } result.PolicyNames[rule.RuleID] = pol.Name + if pol.SchedulerName != "" { + result.PolicyScheduleRuleSlots = append(result.PolicyScheduleRuleSlots, PolicyScheduleRuleSlot{ + PolicySetID: policySetID, + RuleIndex: uint32(i), + RuleID: rule.RuleID, + PolicyName: pol.Name, + SchedulerName: pol.SchedulerName, + }) + } slog.Debug("policy rule compiled", "from", zpp.FromZone, "to", zpp.ToZone, @@ -913,6 +936,15 @@ func compilePolicies(dp DataPlane, cfg *config.Config, result *CompileResult) er } result.PolicyNames[rule.RuleID] = pol.Name + if pol.SchedulerName != "" { + result.PolicyScheduleRuleSlots = append(result.PolicyScheduleRuleSlots, PolicyScheduleRuleSlot{ + PolicySetID: policySetID, + RuleIndex: uint32(i), + RuleID: rule.RuleID, + PolicyName: pol.Name, + SchedulerName: pol.SchedulerName, + }) + } slog.Debug("global policy rule compiled", "policy", pol.Name, "action", rule.Action, diff --git a/pkg/dataplane/compiler_test.go b/pkg/dataplane/compiler_test.go index 7fc9005aa..2fbf93c59 100644 --- a/pkg/dataplane/compiler_test.go +++ b/pkg/dataplane/compiler_test.go @@ -7,6 +7,97 @@ import ( "github.com/psaab/xpf/pkg/config" ) +type policyScheduleSlotTestDP struct { + DataPlane + + rules []PolicyRule +} + +func (d *policyScheduleSlotTestDP) SetZonePairPolicy(fromZone, toZone uint16, ps PolicySet) error { + return nil +} + +func (d *policyScheduleSlotTestDP) SetPolicyRule(policySetID uint32, ruleIndex uint32, rule PolicyRule) error { + d.rules = append(d.rules, rule) + return nil +} + +func (d *policyScheduleSlotTestDP) DeleteStaleZonePairPolicies(written map[ZonePairKey]bool) {} + +func TestCompilePoliciesRecordsExpandedPolicyScheduleSlots(t *testing.T) { + cfg := &config.Config{} + cfg.Security.Policies = []*config.ZonePairPolicies{{ + FromZone: "trust", + ToZone: "untrust", + Policies: []*config.Policy{ + { + Name: "plain", + Action: config.PolicyPermit, + Match: config.PolicyMatch{ + SourceAddresses: []string{"any"}, + DestinationAddresses: []string{"any"}, + Applications: []string{"any"}, + }, + }, + { + Name: "scheduled", + SchedulerName: "workhours", + Action: config.PolicyPermit, + Match: config.PolicyMatch{ + SourceAddresses: []string{"any"}, + DestinationAddresses: []string{"any"}, + Applications: []string{"app-a", "app-b"}, + }, + }, + }, + }} + cfg.Security.GlobalPolicies = []*config.Policy{{ + Name: "global-scheduled", + SchedulerName: "night", + Action: config.PolicyPermit, + Match: config.PolicyMatch{ + SourceAddresses: []string{"any"}, + DestinationAddresses: []string{"any"}, + Applications: []string{"app-c", "app-d"}, + }, + }} + result := &CompileResult{ + ZoneIDs: map[string]uint16{ + "trust": 1, + "untrust": 2, + }, + AppIDs: map[string]uint32{ + "app-a": 1, + "app-b": 2, + "app-c": 3, + "app-d": 4, + }, + } + dp := &policyScheduleSlotTestDP{} + + if err := compilePolicies(dp, cfg, result); err != nil { + t.Fatalf("compilePolicies: %v", err) + } + + want := []PolicyScheduleRuleSlot{ + {PolicySetID: 0, RuleIndex: 1, RuleID: 1, PolicyName: "scheduled", SchedulerName: "workhours"}, + {PolicySetID: 0, RuleIndex: 2, RuleID: 2, PolicyName: "scheduled", SchedulerName: "workhours"}, + {PolicySetID: 1, RuleIndex: 0, RuleID: MaxRulesPerPolicy, PolicyName: "global-scheduled", SchedulerName: "night"}, + {PolicySetID: 1, RuleIndex: 1, RuleID: MaxRulesPerPolicy + 1, PolicyName: "global-scheduled", SchedulerName: "night"}, + } + if len(result.PolicyScheduleRuleSlots) != len(want) { + t.Fatalf("got %d slots, want %d: %#v", len(result.PolicyScheduleRuleSlots), len(want), result.PolicyScheduleRuleSlots) + } + for i := range want { + if got := result.PolicyScheduleRuleSlots[i]; got != want[i] { + t.Fatalf("slot %d = %#v, want %#v", i, got, want[i]) + } + } + if len(dp.rules) != 5 { + t.Fatalf("compiled %d policy rules, want 5", len(dp.rules)) + } +} + func TestExpandFilterTermNegateFlags(t *testing.T) { prefixLists := map[string]*config.PrefixList{ "rfc1918": { diff --git a/pkg/dataplane/dpdk/dpdk_cgo.go b/pkg/dataplane/dpdk/dpdk_cgo.go index fca9e8725..0aa740466 100644 --- a/pkg/dataplane/dpdk/dpdk_cgo.go +++ b/pkg/dataplane/dpdk/dpdk_cgo.go @@ -342,24 +342,23 @@ func (m *Manager) SetDefaultPolicy(action uint8) error { return nil } -func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[string]bool) { +func (m *Manager) UpdatePolicyScheduleState(_ *config.Config, activeState map[string]bool) { shm := m.platform.shm - if shm == nil || cfg == nil { + if shm == nil { return } - - slots, err := dataplane.BuildScheduledPolicyRuleSlots(cfg) - if err != nil { - slog.Warn("dpdk policy scheduler: failed to build scheduled rule slots", "err", err) + result := m.LastCompileResult() + if result == nil { return } - for _, slot := range slots { + + for _, slot := range result.PolicyScheduleRuleSlots { active, exists := activeState[slot.SchedulerName] if !exists { active = true // default active if scheduler not found } - idx := slot.AbsoluteRuleIdx + idx := slot.PolicySetID*C.MAX_RULES_PER_POLICY + slot.RuleIndex ptr := (*C.struct_policy_rule)(unsafe.Pointer( uintptr(unsafe.Pointer(shm.policy_rules)) + uintptr(idx)*unsafe.Sizeof(C.struct_policy_rule{}))) diff --git a/pkg/dataplane/maps.go b/pkg/dataplane/maps.go index bf10d591a..29bc70e53 100644 --- a/pkg/dataplane/maps.go +++ b/pkg/dataplane/maps.go @@ -1494,25 +1494,25 @@ func (m *Manager) ClearFilterConfigs() error { // UpdatePolicyScheduleState iterates policy rules and toggles the Active flag // based on scheduler state. Only rules whose scheduler state changed are updated. -func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[string]bool) { +func (m *Manager) UpdatePolicyScheduleState(_ *config.Config, activeState map[string]bool) { zm, ok := m.maps["policy_rules"] if !ok { return } - - slots, err := BuildScheduledPolicyRuleSlots(cfg) - if err != nil { - slog.Warn("policy scheduler: failed to build scheduled rule slots", "err", err) + result := m.LastCompileResult() + if result == nil { return } - for _, slot := range slots { + + for _, slot := range result.PolicyScheduleRuleSlots { active, exists := activeState[slot.SchedulerName] if !exists { active = true // default active if scheduler not found } + idx := slot.PolicySetID*MaxRulesPerPolicy + slot.RuleIndex var rule PolicyRule - if err := zm.Lookup(slot.AbsoluteRuleIdx, &rule); err != nil { + if err := zm.Lookup(idx, &rule); err != nil { continue } @@ -1522,7 +1522,7 @@ func (m *Manager) UpdatePolicyScheduleState(cfg *config.Config, activeState map[ } if rule.Active != newActive { rule.Active = newActive - zm.Update(slot.AbsoluteRuleIdx, rule, ebpf.UpdateAny) + zm.Update(idx, rule, ebpf.UpdateAny) slog.Info("policy schedule state updated", "policy", slot.PolicyName, "scheduler", slot.SchedulerName, From 3a79c99747a8aeec86184cb9500a6ccde85ff4f4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 17 May 2026 08:32:00 +0000 Subject: [PATCH 20/20] dataplane: remove obsolete scheduler slot helper Agent-Logs-Url: https://github.com/psaab/xpf/sessions/913eb4a1-a048-478c-832f-e80f4c7fd212 Co-authored-by: psaab <196946+psaab@users.noreply.github.com> --- pkg/dataplane/policy_scheduler.go | 123 ------------------------- pkg/dataplane/policy_scheduler_test.go | 88 ------------------ 2 files changed, 211 deletions(-) delete mode 100644 pkg/dataplane/policy_scheduler.go delete mode 100644 pkg/dataplane/policy_scheduler_test.go diff --git a/pkg/dataplane/policy_scheduler.go b/pkg/dataplane/policy_scheduler.go deleted file mode 100644 index 2f7d8cf07..000000000 --- a/pkg/dataplane/policy_scheduler.go +++ /dev/null @@ -1,123 +0,0 @@ -package dataplane - -import ( - "fmt" - - "github.com/psaab/xpf/pkg/config" -) - -type ScheduledPolicyRuleSlot struct { - PolicyName string - SchedulerName string - AbsoluteRuleIdx uint32 -} - -func BuildScheduledPolicyRuleSlots(cfg *config.Config) ([]ScheduledPolicyRuleSlot, error) { - if cfg == nil { - return nil, nil - } - - var slots []ScheduledPolicyRuleSlot - policySetID := uint32(0) - - for _, zpp := range cfg.Security.Policies { - if zpp == nil { - policySetID++ - continue - } - setSlots, err := scheduledPolicyRuleSlotsForPolicies(policySetID, zpp.Policies, cfg) - if err != nil { - return nil, err - } - slots = append(slots, setSlots...) - policySetID++ - } - - if len(cfg.Security.GlobalPolicies) > 0 { - setSlots, err := scheduledPolicyRuleSlotsForPolicies(policySetID, cfg.Security.GlobalPolicies, cfg) - if err != nil { - return nil, err - } - slots = append(slots, setSlots...) - } - - return slots, nil -} - -func scheduledPolicyRuleSlotsForPolicies(policySetID uint32, policies []*config.Policy, cfg *config.Config) ([]ScheduledPolicyRuleSlot, error) { - var ( - slots []ScheduledPolicyRuleSlot - seq uint32 - ) - - for _, pol := range policies { - if pol == nil { - continue - } - ruleCount, err := expandedPolicyRuleCount(cfg, pol) - if err != nil { - return nil, err - } - if pol.SchedulerName == "" { - seq += ruleCount - continue - } - for i := uint32(0); i < ruleCount; i++ { - slots = append(slots, ScheduledPolicyRuleSlot{ - PolicyName: pol.Name, - SchedulerName: pol.SchedulerName, - AbsoluteRuleIdx: policySetID*MaxRulesPerPolicy + seq + i, - }) - } - seq += ruleCount - } - - return slots, nil -} - -func expandedPolicyRuleCount(cfg *config.Config, pol *config.Policy) (uint32, error) { - hasAny := false - for _, appName := range pol.Match.Applications { - if appName == "any" { - hasAny = true - break - } - } - if hasAny || len(pol.Match.Applications) == 0 { - return 1, nil - } - - seen := make(map[string]struct{}) - count := uint32(0) - for _, appName := range pol.Match.Applications { - if appName == "" { - continue - } - if _, isSet := cfg.Applications.ApplicationSets[appName]; isSet { - expanded, err := config.ExpandApplicationSet(appName, &cfg.Applications) - if err != nil { - return 0, fmt.Errorf("policy %q expand app-set %q: %w", pol.Name, appName, err) - } - for _, expandedApp := range expanded { - if expandedApp == "" { - continue - } - if _, ok := seen[expandedApp]; ok { - continue - } - seen[expandedApp] = struct{}{} - count++ - } - continue - } - if _, ok := seen[appName]; ok { - continue - } - seen[appName] = struct{}{} - count++ - } - if count == 0 { - return 1, nil - } - return count, nil -} diff --git a/pkg/dataplane/policy_scheduler_test.go b/pkg/dataplane/policy_scheduler_test.go deleted file mode 100644 index 0e6f7f670..000000000 --- a/pkg/dataplane/policy_scheduler_test.go +++ /dev/null @@ -1,88 +0,0 @@ -package dataplane - -import ( - "testing" - - "github.com/psaab/xpf/pkg/config" -) - -func TestBuildScheduledPolicyRuleSlotsHandlesExpandedAndGlobalPolicies(t *testing.T) { - cfg := &config.Config{ - Applications: config.ApplicationsConfig{ - ApplicationSets: map[string]*config.ApplicationSet{ - "set-web": { - Name: "set-web", - Applications: []string{ - "junos-http", - "junos-https", - }, - }, - }, - }, - Security: config.SecurityConfig{ - Policies: []*config.ZonePairPolicies{ - { - FromZone: "trust", - ToZone: "untrust", - Policies: []*config.Policy{ - { - Name: "unscheduled", - SchedulerName: "", - Match: config.PolicyMatch{ - Applications: []string{"any"}, - }, - }, - { - Name: "scheduled-zp", - SchedulerName: "workhours", - Match: config.PolicyMatch{ - Applications: []string{"set-web"}, - }, - }, - }, - }, - }, - GlobalPolicies: []*config.Policy{ - { - Name: "scheduled-global", - SchedulerName: "afterhours", - Match: config.PolicyMatch{ - Applications: []string{"set-web"}, - }, - }, - }, - }, - } - - slots, err := BuildScheduledPolicyRuleSlots(cfg) - if err != nil { - t.Fatalf("BuildScheduledPolicyRuleSlots returned error: %v", err) - } - if len(slots) != 4 { - t.Fatalf("slot count = %d, want 4", len(slots)) - } - for _, slot := range slots { - if slot.PolicyName == "unscheduled" { - t.Fatalf("unscheduled policy unexpectedly included in slots: %+v", slot) - } - } - - globalBase := uint32(len(cfg.Security.Policies)) * MaxRulesPerPolicy - - want := []struct { - policy string - scheduler string - index uint32 - }{ - {policy: "scheduled-zp", scheduler: "workhours", index: 1}, - {policy: "scheduled-zp", scheduler: "workhours", index: 2}, - {policy: "scheduled-global", scheduler: "afterhours", index: globalBase}, - {policy: "scheduled-global", scheduler: "afterhours", index: globalBase + 1}, - } - for i, w := range want { - got := slots[i] - if got.PolicyName != w.policy || got.SchedulerName != w.scheduler || got.AbsoluteRuleIdx != w.index { - t.Fatalf("slot[%d] = %+v, want policy=%q scheduler=%q index=%d", i, got, w.policy, w.scheduler, w.index) - } - } -}