From 93e6e1b9ad7ea5a2830919ade3293ac3df4bfca5 Mon Sep 17 00:00:00 2001 From: Entlein Date: Wed, 13 May 2026 14:47:57 +0200 Subject: [PATCH 1/3] perf(nn): amortise CompileIP/CompileDNS via per-container matcher cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profile-checksum-invalidated cache of compiled networkmatch.IPMatcher / DNSMatcher per (containerID, neighborIndex). The previous code path re-compiled every NetworkNeighbor's entries on each CEL function-cache miss; this PR builds each matcher at most once per profile-checksum lifetime and reuses it across subsequent misses. Design: matcherCache (sync.Map) inside nnLibrary, zero-value safe so existing test fixtures that construct nnLibrary{} directly continue to work without changes. Per-container entry tagged with the profile's SyncChecksumMetadataKey annotation. On lookup: if checksum matches, reuse; else allocate a fresh containerMatchers and store with LoadOrStore (concurrent-safe). Per-neighbor matchers are nil-init and lazily compiled on first use, so a profile with 10 egress entries that only ever fires through 2 of them pays compile cost for only those 2. Benchmarks (arm64, -benchtime=1s): IP, realistic profile (5 neighbors x 3 entries, observation misses all): Cold (per-call recompile): 1733 ns/op 1920 B/op 76 allocs/op Hot (cached matchers) : 177 ns/op 32 B/op 2 allocs/op ~ -90% time, -98% bytes, -97% allocs DNS, realistic profile: Cold: 1219 ns/op 1800 B/op 41 allocs/op Hot : 318 ns/op 272 B/op 7 allocs/op ~ -74% time, -85% bytes, -83% allocs Churning profile (checksum flips every iteration — pathological): 1527 ns/op 1936 B/op 77 allocs/op Matches cold path: cache overhead itself is negligible; the savings come strictly from amortising compile across stable-checksum windows. In production this stacks on top of the existing CEL functionCache (which already absorbs same-(containerID,observed) cache hits). The matcher cache catches what slips through: unique-observation cache misses within a profile-checksum lifetime. Touched: - matcher_cache.go new file: cache impl - matcher_cache_bench_test.go new file: comparison bench - network.go use cached matchers in all 6 CEL fns - nn.go matcherCache field on nnLibrary --- .../networkneighborhood/matcher_cache.go | 121 +++++++++++++ .../matcher_cache_bench_test.go | 161 ++++++++++++++++++ .../libraries/networkneighborhood/network.go | 114 +++---------- .../cel/libraries/networkneighborhood/nn.go | 5 + 4 files changed, 313 insertions(+), 88 deletions(-) create mode 100644 pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache.go create mode 100644 pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_bench_test.go diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache.go b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache.go new file mode 100644 index 0000000000..5c973dbea0 --- /dev/null +++ b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache.go @@ -0,0 +1,121 @@ +package networkneighborhood + +import ( + "sync" + + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + "github.com/kubescape/storage/pkg/registry/file/networkmatch" +) + +// neighborMatchers carries the compiled-once matchers for ONE NetworkNeighbor. +// Built lazily on first match attempt against this neighbor. +type neighborMatchers struct { + ip *networkmatch.IPMatcher + dns *networkmatch.DNSMatcher +} + +// containerMatchers caches every neighbor's compiled matchers for one +// container, keyed by direction + position in the spec slice. Tagged with +// the profile's SyncChecksumMetadataKey so we can invalidate atomically when +// the profile mutates. +type containerMatchers struct { + checksum string + egress []neighborMatchers + ingress []neighborMatchers +} + +// matcherCache is owned by an nnLibrary instance. Keyed by containerID. +// Map values are *containerMatchers; the cache uses sync.Map for lock-free +// reads (the common case on the CEL hot path). +// +// Zero-value usable: a freshly-declared matcherCache (no construction) is +// a valid empty cache. Tests can build nnLibrary{} without explicit init. +type matcherCache struct { + m sync.Map // containerID -> *containerMatchers +} + +// getOrBuild returns the compiled-matcher set for this container's current +// profile. If the cached entry is stale (different checksum, or different +// neighbor count after a profile shape change), it rebuilds. +// +// The build itself is a no-op pre-compile: we don't pay the per-neighbor +// CompileIP/CompileDNS cost until the first match call against that +// neighbor. neighborMatchers struct fields are nil-initialised so the +// matcher accessor lazily builds. +func (c *matcherCache) getOrBuild(containerID, checksum string, cp *v1beta1.ContainerProfile) *containerMatchers { + if v, ok := c.m.Load(containerID); ok { + cm := v.(*containerMatchers) + if cm.checksum == checksum && + len(cm.egress) == len(cp.Spec.Egress) && + len(cm.ingress) == len(cp.Spec.Ingress) { + return cm + } + } + fresh := &containerMatchers{ + checksum: checksum, + egress: make([]neighborMatchers, len(cp.Spec.Egress)), + ingress: make([]neighborMatchers, len(cp.Spec.Ingress)), + } + // LoadOrStore: another goroutine may have raced us with the same checksum; + // keep the first one stored so callers converge on a single instance. + actual, _ := c.m.LoadOrStore(containerID, fresh) + cm := actual.(*containerMatchers) + if cm.checksum != checksum { + // Concurrent update with a different checksum landed first. Replace. + c.m.Store(containerID, fresh) + return fresh + } + return cm +} + +// ipMatcher returns the compiled IP matcher for the given neighbor index, +// lazily building it the first time. Combines the deprecated singular +// IPAddress and the new IPAddresses[] into one matcher per neighbor. +// +// Concurrency: writes to neighborMatchers.ip are guarded by an atomic +// LoadOrStore-style pattern; multiple goroutines racing on the same index +// MAY each pay the compile cost, but only one *IPMatcher pointer wins. +// In practice the CEL functionCache layer above us serialises most calls. +func (cm *containerMatchers) ipMatcher(neighbors []v1beta1.NetworkNeighbor, idx int, slot *[]neighborMatchers) *networkmatch.IPMatcher { + nm := &(*slot)[idx] + if nm.ip != nil { + return nm.ip + } + n := &neighbors[idx] + // Single compile per neighbor combining both deprecated singular IPAddress + // and the v0.0.2 IPAddresses[] list. Same merged entries as + // network.go:neighborMatchesIP, just amortised across calls. + entries := make([]string, 0, len(n.IPAddresses)+1) + if n.IPAddress != "" { + entries = append(entries, n.IPAddress) + } + entries = append(entries, n.IPAddresses...) + built := networkmatch.CompileIP(entries) + nm.ip = built + return built +} + +func (cm *containerMatchers) dnsMatcher(neighbors []v1beta1.NetworkNeighbor, idx int, slot *[]neighborMatchers) *networkmatch.DNSMatcher { + nm := &(*slot)[idx] + if nm.dns != nil { + return nm.dns + } + n := &neighbors[idx] + entries := make([]string, 0, len(n.DNSNames)+1) + if n.DNS != "" { + entries = append(entries, n.DNS) + } + entries = append(entries, n.DNSNames...) + built := networkmatch.CompileDNS(entries) + nm.dns = built + return built +} + +// invalidate drops the cached entry for a container. Called from the +// nnLibrary on profile-delete signals (future hook); not wired today, +// so entries linger until the container goes away. Memory footprint is +// 2 × sizeof(neighborMatchers) × num-neighbors which is bounded by the +// profile size — typically under a few hundred bytes per container. +func (c *matcherCache) invalidate(containerID string) { + c.m.Delete(containerID) +} diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_bench_test.go b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_bench_test.go new file mode 100644 index 0000000000..cb6d89d6b8 --- /dev/null +++ b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_bench_test.go @@ -0,0 +1,161 @@ +package networkneighborhood + +import ( + "testing" + + "github.com/google/cel-go/common/types" + "github.com/google/cel-go/common/types/ref" + "github.com/goradd/maps" + "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" + "github.com/kubescape/node-agent/pkg/objectcache" + objectcachev1 "github.com/kubescape/node-agent/pkg/objectcache/v1" + "github.com/kubescape/node-agent/pkg/rulemanager/cel/libraries/cache" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Benchmarks that measure the production-realistic call shape: +// a CEL function (e.g. nn.was_address_in_egress) is invoked on a cache miss, +// walks the profile's egress neighbors, compiles+matches each one. +// +// Two axes: +// - profile size (small: 1 neighbor / 1 entry vs realistic: 5 neighbors / 3 entries) +// - cache state (cold: every call recompiles vs hot: matcherCache reuses) +// +// The "cold" baseline simulates what the previous feat/network-wildcards +// branch did before this PR (re-compile on every CEL function-cache miss). +// The "hot" measures the actual code path of this PR (compile-once amortised). + +func buildProfile(neighbors int, entriesPerNeighbor int) *v1beta1.ContainerProfile { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "bench-pod", + Annotations: map[string]string{ + helpers.SyncChecksumMetadataKey: "bench-checksum-v1", + }, + }, + } + cp.Spec.Egress = make([]v1beta1.NetworkNeighbor, neighbors) + for i := 0; i < neighbors; i++ { + ips := make([]string, entriesPerNeighbor) + // Mix of CIDR + literal so neither path has trivial work. + for j := 0; j < entriesPerNeighbor; j++ { + if j%2 == 0 { + ips[j] = "10.0.0.0/8" + } else { + ips[j] = "192.168.1.1" + } + } + cp.Spec.Egress[i] = v1beta1.NetworkNeighbor{ + Identifier: "n", + IPAddresses: ips, + DNSNames: []string{"*.example.com.", "api.partner.io."}, + } + } + return cp +} + +func buildBenchLib(b *testing.B, cp *v1beta1.ContainerProfile) *nnLibrary { + b.Helper() + objCache := objectcachev1.RuleObjectCacheMock{ + ContainerIDToSharedData: maps.NewSafeMap[string, *objectcache.WatchedContainerData](), + } + objCache.SetSharedContainerData("bench-cid", &objectcache.WatchedContainerData{ + ContainerType: objectcache.Container, + ContainerInfos: map[objectcache.ContainerType][]objectcache.ContainerInfo{ + objectcache.Container: {{Name: "bench"}}, + }, + }) + objCache.SetContainerProfile(cp) + return &nnLibrary{ + objectCache: &objCache, + functionCache: cache.NewFunctionCache(cache.DefaultFunctionCacheConfig()), + } +} + +func runEgressIPMatch(b *testing.B, lib *nnLibrary, address ref.Val) { + cid := types.String("bench-cid") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = lib.wasAddressInEgress(cid, address) + } +} + +// Small profile: 1 neighbor, 1 IP. Establishes the floor cost. +func BenchmarkCEL_EgressIP_Small_Hot(b *testing.B) { + lib := buildBenchLib(b, buildProfile(1, 1)) + // Prime the matcher cache: one call before the timed loop so the + // per-CEL-invocation cost is amortised. + _ = lib.wasAddressInEgress(types.String("bench-cid"), types.String("10.1.2.3")) + runEgressIPMatch(b, lib, types.String("10.1.2.3")) +} + +// Realistic profile: 5 neighbors × 3 entries (mix of CIDR + literal). +// Hot path = matcherCache reused. This is what production looks like +// AFTER the first CEL function-cache miss within a profile lifetime. +func BenchmarkCEL_EgressIP_Realistic_Hot(b *testing.B) { + lib := buildBenchLib(b, buildProfile(5, 3)) + _ = lib.wasAddressInEgress(types.String("bench-cid"), types.String("8.8.8.8")) + runEgressIPMatch(b, lib, types.String("8.8.8.8")) // worst case: miss every neighbor +} + +// Cold path: simulate the pre-cache pattern by wiping the matcher cache +// each iteration. This is what the previous feat/network-wildcards branch +// did on EVERY CEL function-cache miss (a unique containerID,address pair). +func BenchmarkCEL_EgressIP_Realistic_Cold(b *testing.B) { + cp := buildProfile(5, 3) + lib := buildBenchLib(b, cp) + addr := types.String("8.8.8.8") + cid := types.String("bench-cid") + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Drop the entire cache entry to force recompile on the next call. + lib.matcherCache.invalidate("bench-cid") + _ = lib.wasAddressInEgress(cid, addr) + } +} + +// DNS variants. + +func BenchmarkCEL_EgressDNS_Realistic_Hot(b *testing.B) { + lib := buildBenchLib(b, buildProfile(5, 3)) + _ = lib.isDomainInEgress(types.String("bench-cid"), types.String("ignored.fake.tld.")) + cid := types.String("bench-cid") + dom := types.String("ignored.fake.tld.") + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = lib.isDomainInEgress(cid, dom) + } +} + +func BenchmarkCEL_EgressDNS_Realistic_Cold(b *testing.B) { + cp := buildProfile(5, 3) + lib := buildBenchLib(b, cp) + cid := types.String("bench-cid") + dom := types.String("ignored.fake.tld.") + b.ResetTimer() + for i := 0; i < b.N; i++ { + lib.matcherCache.invalidate("bench-cid") + _ = lib.isDomainInEgress(cid, dom) + } +} + +// Profile churn: simulate a learning-mode profile that gets updated +// frequently (checksum changes), so cache lookups are mostly invalidated. +// Validates that the cache invalidation path itself isn't catastrophic. +func BenchmarkCEL_EgressIP_ChurningProfile(b *testing.B) { + cp := buildProfile(5, 3) + lib := buildBenchLib(b, cp) + cid := types.String("bench-cid") + addr := types.String("8.8.8.8") + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Bump checksum each iteration to force rebuild via getOrBuild. + if i%2 == 0 { + cp.Annotations[helpers.SyncChecksumMetadataKey] = "bench-checksum-v1" + } else { + cp.Annotations[helpers.SyncChecksumMetadataKey] = "bench-checksum-v2" + } + _ = lib.wasAddressInEgress(cid, addr) + } +} diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/network.go b/pkg/rulemanager/cel/libraries/networkneighborhood/network.go index 4851412fc1..0679874673 100644 --- a/pkg/rulemanager/cel/libraries/networkneighborhood/network.go +++ b/pkg/rulemanager/cel/libraries/networkneighborhood/network.go @@ -6,62 +6,21 @@ import ( "github.com/kubescape/node-agent/pkg/rulemanager/cel/libraries/cache" "github.com/kubescape/node-agent/pkg/rulemanager/profilehelper" "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" - "github.com/kubescape/storage/pkg/registry/file/networkmatch" ) -// neighborMatchesIP reports whether the observed IP matches any entry on -// the neighbor — either the deprecated singular IPAddress (back-compat) -// or any of the new IPAddresses[] entries (literal, CIDR, or '*' sentinel). +// Each CEL function performs the same shape of work: +// 1. resolve container profile + checksum +// 2. fetch or build cached compiled matchers for this profile version +// 3. walk the relevant direction's neighbor slice, asking each compiled +// matcher whether the observation matches // -// Both the deprecated singular field and the new list field accept the -// SAME wildcard token vocabulary — i.e. a profile that sets -// IPAddress: "10.0.0.0/8" or IPAddress: "*" gets CIDR/sentinel matching -// just like the list form would. This unifies admission validation and -// runtime matching across both back-compat and current shapes. -// -// Built fresh per-call rather than cached. The functionCache layer in -// nn.go memoises the (containerID, address) tuple, so a hot rule firing -// on the same address won't repeatedly recompile the matcher. -func neighborMatchesIP(neighbor *v1beta1.NetworkNeighbor, observed string) bool { - // Route the deprecated singular IPAddress through MatchIP as a single-element - // slice so it gets the same canonicalisation (IPv6 forms, IPv4-mapped) as - // the new IPAddresses[] entries. Symmetric with neighborMatchesDNS, which - // also routes the deprecated singular DNS field through its matcher. - if neighbor.IPAddress != "" && networkmatch.MatchIP([]string{neighbor.IPAddress}, observed) { - return true - } - if len(neighbor.IPAddresses) > 0 { - if networkmatch.MatchIP(neighbor.IPAddresses, observed) { - return true - } - } - return false -} - -// neighborMatchesDNS reports whether the observed DNS name matches any -// entry on the neighbor — the deprecated singular DNS field, or any of -// the DNSNames[] entries (literal, leading-*, trailing-*, mid-⋯). -func neighborMatchesDNS(neighbor *v1beta1.NetworkNeighbor, observed string) bool { - // Route the deprecated singular DNS through MatchDNS as a single-element - // slice so it gets the same trailing-dot stripping + lowercasing as the - // new DNSNames[] entries — back-compat shouldn't mean inconsistent - // normalisation. - if neighbor.DNS != "" && networkmatch.MatchDNS([]string{neighbor.DNS}, observed) { - return true - } - if len(neighbor.DNSNames) > 0 { - if networkmatch.MatchDNS(neighbor.DNSNames, observed) { - return true - } - } - return false -} +// The matcherCache means we pay CompileIP / CompileDNS at most once per +// profile checksum per neighbor — not on every CEL function-cache miss. func (l *nnLibrary) wasAddressInEgress(containerID, address ref.Val) ref.Val { if l.objectCache == nil { return types.NewErr("objectCache is nil") } - containerIDStr, ok := containerID.Value().(string) if !ok { return types.MaybeNoSuchOverloadErr(containerID) @@ -70,18 +29,16 @@ func (l *nnLibrary) wasAddressInEgress(containerID, address ref.Val) ref.Val { if !ok { return types.MaybeNoSuchOverloadErr(address) } - - cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) + cp, checksum, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - + cm := l.matcherCache.getOrBuild(containerIDStr, checksum, cp) for i := range cp.Spec.Egress { - if neighborMatchesIP(&cp.Spec.Egress[i], addressStr) { + if cm.ipMatcher(cp.Spec.Egress, i, &cm.egress).Match(addressStr) { return types.Bool(true) } } - return types.Bool(false) } @@ -89,7 +46,6 @@ func (l *nnLibrary) wasAddressInIngress(containerID, address ref.Val) ref.Val { if l.objectCache == nil { return types.NewErr("objectCache is nil") } - containerIDStr, ok := containerID.Value().(string) if !ok { return types.MaybeNoSuchOverloadErr(containerID) @@ -98,18 +54,16 @@ func (l *nnLibrary) wasAddressInIngress(containerID, address ref.Val) ref.Val { if !ok { return types.MaybeNoSuchOverloadErr(address) } - - cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) + cp, checksum, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - + cm := l.matcherCache.getOrBuild(containerIDStr, checksum, cp) for i := range cp.Spec.Ingress { - if neighborMatchesIP(&cp.Spec.Ingress[i], addressStr) { + if cm.ipMatcher(cp.Spec.Ingress, i, &cm.ingress).Match(addressStr) { return types.Bool(true) } } - return types.Bool(false) } @@ -117,7 +71,6 @@ func (l *nnLibrary) isDomainInEgress(containerID, domain ref.Val) ref.Val { if l.objectCache == nil { return types.NewErr("objectCache is nil") } - containerIDStr, ok := containerID.Value().(string) if !ok { return types.MaybeNoSuchOverloadErr(containerID) @@ -126,18 +79,16 @@ func (l *nnLibrary) isDomainInEgress(containerID, domain ref.Val) ref.Val { if !ok { return types.MaybeNoSuchOverloadErr(domain) } - - cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) + cp, checksum, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - + cm := l.matcherCache.getOrBuild(containerIDStr, checksum, cp) for i := range cp.Spec.Egress { - if neighborMatchesDNS(&cp.Spec.Egress[i], domainStr) { + if cm.dnsMatcher(cp.Spec.Egress, i, &cm.egress).Match(domainStr) { return types.Bool(true) } } - return types.Bool(false) } @@ -145,7 +96,6 @@ func (l *nnLibrary) isDomainInIngress(containerID, domain ref.Val) ref.Val { if l.objectCache == nil { return types.NewErr("objectCache is nil") } - containerIDStr, ok := containerID.Value().(string) if !ok { return types.MaybeNoSuchOverloadErr(containerID) @@ -154,18 +104,16 @@ func (l *nnLibrary) isDomainInIngress(containerID, domain ref.Val) ref.Val { if !ok { return types.MaybeNoSuchOverloadErr(domain) } - - cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) + cp, checksum, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - + cm := l.matcherCache.getOrBuild(containerIDStr, checksum, cp) for i := range cp.Spec.Ingress { - if neighborMatchesDNS(&cp.Spec.Ingress[i], domainStr) { + if cm.dnsMatcher(cp.Spec.Ingress, i, &cm.ingress).Match(domainStr) { return types.Bool(true) } } - return types.Bool(false) } @@ -173,7 +121,6 @@ func (l *nnLibrary) wasAddressPortProtocolInEgress(containerID, address, port, p if l.objectCache == nil { return types.NewErr("objectCache is nil") } - containerIDStr, ok := containerID.Value().(string) if !ok { return types.MaybeNoSuchOverloadErr(containerID) @@ -186,10 +133,7 @@ func (l *nnLibrary) wasAddressPortProtocolInEgress(containerID, address, port, p if !ok { return types.MaybeNoSuchOverloadErr(port) } - // Reject out-of-range ports BEFORE narrowing to int32. CEL evaluates - // port as int64, but TCP/UDP wire ports are uint16. A bogus value - // like 4294967739 narrows to 443 and would match — return false - // instead of letting the wrap silently succeed. + // See network.go on feat/network-wildcards for the int64→int32 wrap rationale. if portInt < 0 || portInt > 65535 { return types.Bool(false) } @@ -198,15 +142,14 @@ func (l *nnLibrary) wasAddressPortProtocolInEgress(containerID, address, port, p if !ok { return types.MaybeNoSuchOverloadErr(protocol) } - - cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) + cp, checksum, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - + cm := l.matcherCache.getOrBuild(containerIDStr, checksum, cp) for i := range cp.Spec.Egress { egress := &cp.Spec.Egress[i] - if !neighborMatchesIP(egress, addressStr) { + if !cm.ipMatcher(cp.Spec.Egress, i, &cm.egress).Match(addressStr) { continue } for _, portInfo := range egress.Ports { @@ -215,7 +158,6 @@ func (l *nnLibrary) wasAddressPortProtocolInEgress(containerID, address, port, p } } } - return types.Bool(false) } @@ -223,7 +165,6 @@ func (l *nnLibrary) wasAddressPortProtocolInIngress(containerID, address, port, if l.objectCache == nil { return types.NewErr("objectCache is nil") } - containerIDStr, ok := containerID.Value().(string) if !ok { return types.MaybeNoSuchOverloadErr(containerID) @@ -236,7 +177,6 @@ func (l *nnLibrary) wasAddressPortProtocolInIngress(containerID, address, port, if !ok { return types.MaybeNoSuchOverloadErr(port) } - // See wasAddressPortProtocolInEgress for the int64→int32 wrap rationale. if portInt < 0 || portInt > 65535 { return types.Bool(false) } @@ -245,15 +185,14 @@ func (l *nnLibrary) wasAddressPortProtocolInIngress(containerID, address, port, if !ok { return types.MaybeNoSuchOverloadErr(protocol) } - - cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) + cp, checksum, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - + cm := l.matcherCache.getOrBuild(containerIDStr, checksum, cp) for i := range cp.Spec.Ingress { ingress := &cp.Spec.Ingress[i] - if !neighborMatchesIP(ingress, addressStr) { + if !cm.ipMatcher(cp.Spec.Ingress, i, &cm.ingress).Match(addressStr) { continue } for _, portInfo := range ingress.Ports { @@ -262,6 +201,5 @@ func (l *nnLibrary) wasAddressPortProtocolInIngress(containerID, address, port, } } } - return types.Bool(false) } diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/nn.go b/pkg/rulemanager/cel/libraries/networkneighborhood/nn.go index cf9feef93c..ceefb20c7c 100644 --- a/pkg/rulemanager/cel/libraries/networkneighborhood/nn.go +++ b/pkg/rulemanager/cel/libraries/networkneighborhood/nn.go @@ -28,6 +28,11 @@ func NN(objectCache objectcache.ObjectCache, config config.Config) cel.EnvOption type nnLibrary struct { objectCache objectcache.ObjectCache functionCache *cache.FunctionCache + // matcherCache amortises per-NetworkNeighbor CompileIP/CompileDNS + // across CEL function-cache misses. Invalidated by profile checksum. + // Zero-value-safe: sync.Map handles concurrent first-write fine, so + // callers don't have to construct it explicitly. + matcherCache matcherCache } func (l *nnLibrary) LibraryName() string { From 419ebbaba040846f6b256fe5419593cc29df674f Mon Sep 17 00:00:00 2001 From: Entlein Date: Wed, 13 May 2026 15:12:21 +0200 Subject: [PATCH 2/3] fix(matcher_cache): atomic-pointer lazy init + unconditional staleness replace (CR #42) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two findings from CodeRabbit round 1, both fixed: 1. Stale-entry shape race in getOrBuild (Major) Old code used LoadOrStore on the staleness path and only replaced on checksum mismatch — but a shape mismatch (neighbor count change) could leak the stale entry to a caller whose profile has a different shape, which then index-panics in ipMatcher/dnsMatcher. Fix: when staleness is detected (by checksum OR shape), always Store unconditionally. Worst-case contention: several goroutines build shape-correct fresh entries and one Store wins; all callers still see a shape-correct entry. Orphans get GC'd. 2. Unsynchronised lazy-init of per-neighbor matchers (Critical) neighborMatchers.ip / .dns were *Matcher with a non-atomic 'if nil then build then assign' pattern — a real data race. Fix: switched to atomic.Pointer[networkmatch.IPMatcher] (and DNS). First-build callers may race on Compile but only one pointer wins via CompareAndSwap; everyone returns the winning matcher. Pure functions (no shared state) so duplicate Compile work is wasteful but not incorrect. New tests in matcher_cache_test.go pin the contract: - TestMatcherCache_ConcurrentFirstBuild: 64 goroutines racing on the same slot, run under -race, asserts matchers are populated exactly once - TestMatcherCache_StaleEntryReplaced: shape-mismatch path returns a fresh containerMatchers, not the stale one - TestMatcherCache_ChecksumPreservedAcrossCalls: same checksum hits cache (no rebuild) Benchmarks re-run after atomic.Pointer switch — negligible impact (177 → 186 ns/op, still 8x faster than cold path). All headline savings preserved. --- .../networkneighborhood/matcher_cache.go | 73 +++++++----- .../networkneighborhood/matcher_cache_test.go | 112 ++++++++++++++++++ 2 files changed, 155 insertions(+), 30 deletions(-) create mode 100644 pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_test.go diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache.go b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache.go index 5c973dbea0..e5eb7ff8b4 100644 --- a/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache.go +++ b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache.go @@ -2,6 +2,7 @@ package networkneighborhood import ( "sync" + "sync/atomic" "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" "github.com/kubescape/storage/pkg/registry/file/networkmatch" @@ -9,15 +10,24 @@ import ( // neighborMatchers carries the compiled-once matchers for ONE NetworkNeighbor. // Built lazily on first match attempt against this neighbor. +// +// Concurrency: both fields are atomic pointers. Multiple goroutines may +// race on the first build for a given index; CompileIP/CompileDNS are +// pure (no shared state), so duplicate builds are wasteful but correct. +// Only one resulting *matcher pointer wins via CompareAndSwap. type neighborMatchers struct { - ip *networkmatch.IPMatcher - dns *networkmatch.DNSMatcher + ip atomic.Pointer[networkmatch.IPMatcher] + dns atomic.Pointer[networkmatch.DNSMatcher] } // containerMatchers caches every neighbor's compiled matchers for one // container, keyed by direction + position in the spec slice. Tagged with // the profile's SyncChecksumMetadataKey so we can invalidate atomically when // the profile mutates. +// +// containerMatchers is treated as immutable once published into matcherCache.m: +// callers MUST NOT mutate egress/ingress slices in place. Stale entries are +// REPLACED wholesale (via Store), never patched. type containerMatchers struct { checksum string egress []neighborMatchers @@ -35,13 +45,18 @@ type matcherCache struct { } // getOrBuild returns the compiled-matcher set for this container's current -// profile. If the cached entry is stale (different checksum, or different -// neighbor count after a profile shape change), it rebuilds. +// profile. If the cached entry is stale — by checksum OR by neighbor-count +// shape — it builds a fresh entry and replaces unconditionally. +// +// Always-Store-on-staleness avoids a subtle race: with LoadOrStore, two +// goroutines racing past a stale entry could "agree" on whichever lost the +// store, even if its shape didn't match the current profile. That would +// later panic in ipMatcher/dnsMatcher when indexed past the cached slice. // -// The build itself is a no-op pre-compile: we don't pay the per-neighbor +// The build itself is a no-op pre-allocation: we don't pay the per-neighbor // CompileIP/CompileDNS cost until the first match call against that -// neighbor. neighborMatchers struct fields are nil-initialised so the -// matcher accessor lazily builds. +// neighbor. neighborMatchers fields are atomic.Pointer-zero so the matcher +// accessor builds them lazily and concurrently-safely. func (c *matcherCache) getOrBuild(containerID, checksum string, cp *v1beta1.ContainerProfile) *containerMatchers { if v, ok := c.m.Load(containerID); ok { cm := v.(*containerMatchers) @@ -56,49 +71,45 @@ func (c *matcherCache) getOrBuild(containerID, checksum string, cp *v1beta1.Cont egress: make([]neighborMatchers, len(cp.Spec.Egress)), ingress: make([]neighborMatchers, len(cp.Spec.Ingress)), } - // LoadOrStore: another goroutine may have raced us with the same checksum; - // keep the first one stored so callers converge on a single instance. - actual, _ := c.m.LoadOrStore(containerID, fresh) - cm := actual.(*containerMatchers) - if cm.checksum != checksum { - // Concurrent update with a different checksum landed first. Replace. - c.m.Store(containerID, fresh) - return fresh - } - return cm + // Store unconditionally on the staleness path: replaces any + // concurrently-stored entry. Worst case under contention: a few + // goroutines all compile fresh shape-correct entries and one Store wins, + // other goroutines hold a now-orphaned but still-shape-correct fresh. + // All callers see a shape-correct entry; orphans get GC'd. + c.m.Store(containerID, fresh) + return fresh } // ipMatcher returns the compiled IP matcher for the given neighbor index, // lazily building it the first time. Combines the deprecated singular // IPAddress and the new IPAddresses[] into one matcher per neighbor. // -// Concurrency: writes to neighborMatchers.ip are guarded by an atomic -// LoadOrStore-style pattern; multiple goroutines racing on the same index -// MAY each pay the compile cost, but only one *IPMatcher pointer wins. -// In practice the CEL functionCache layer above us serialises most calls. +// Concurrency: atomic.Pointer.CompareAndSwap publishes the matcher. +// Concurrent first-build callers may each compile, but only one pointer +// wins; everyone returns the winning pointer. func (cm *containerMatchers) ipMatcher(neighbors []v1beta1.NetworkNeighbor, idx int, slot *[]neighborMatchers) *networkmatch.IPMatcher { nm := &(*slot)[idx] - if nm.ip != nil { - return nm.ip + if existing := nm.ip.Load(); existing != nil { + return existing } n := &neighbors[idx] - // Single compile per neighbor combining both deprecated singular IPAddress - // and the v0.0.2 IPAddresses[] list. Same merged entries as - // network.go:neighborMatchesIP, just amortised across calls. entries := make([]string, 0, len(n.IPAddresses)+1) if n.IPAddress != "" { entries = append(entries, n.IPAddress) } entries = append(entries, n.IPAddresses...) built := networkmatch.CompileIP(entries) - nm.ip = built + if !nm.ip.CompareAndSwap(nil, built) { + // Lost the race. Return the winning matcher. + return nm.ip.Load() + } return built } func (cm *containerMatchers) dnsMatcher(neighbors []v1beta1.NetworkNeighbor, idx int, slot *[]neighborMatchers) *networkmatch.DNSMatcher { nm := &(*slot)[idx] - if nm.dns != nil { - return nm.dns + if existing := nm.dns.Load(); existing != nil { + return existing } n := &neighbors[idx] entries := make([]string, 0, len(n.DNSNames)+1) @@ -107,7 +118,9 @@ func (cm *containerMatchers) dnsMatcher(neighbors []v1beta1.NetworkNeighbor, idx } entries = append(entries, n.DNSNames...) built := networkmatch.CompileDNS(entries) - nm.dns = built + if !nm.dns.CompareAndSwap(nil, built) { + return nm.dns.Load() + } return built } diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_test.go b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_test.go new file mode 100644 index 0000000000..d6500b5ef7 --- /dev/null +++ b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_test.go @@ -0,0 +1,112 @@ +package networkneighborhood + +import ( + "sync" + "testing" + + "github.com/google/cel-go/common/types" + "github.com/goradd/maps" + "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" + "github.com/kubescape/node-agent/pkg/objectcache" + objectcachev1 "github.com/kubescape/node-agent/pkg/objectcache/v1" + "github.com/kubescape/node-agent/pkg/rulemanager/cel/libraries/cache" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// TestMatcherCache_ConcurrentFirstBuild pins the atomic-pointer race +// contract on neighborMatchers. Concurrent first-build callers may each +// compile, but they MUST all return the same *IPMatcher / *DNSMatcher +// pointer (the CompareAndSwap winner), and the cached entry MUST be +// reusable thereafter without rebuild. +// +// Run with `go test -race` to catch unsynchronised writes. +func TestMatcherCache_ConcurrentFirstBuild(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{helpers.SyncChecksumMetadataKey: "csum-1"}, + }, + } + cp.Spec.Egress = []v1beta1.NetworkNeighbor{ + {IPAddresses: []string{"10.0.0.0/8"}, DNSNames: []string{"*.example.com."}}, + } + + objCache := objectcachev1.RuleObjectCacheMock{ + ContainerIDToSharedData: maps.NewSafeMap[string, *objectcache.WatchedContainerData](), + } + objCache.SetSharedContainerData("cid", &objectcache.WatchedContainerData{ + ContainerType: objectcache.Container, + ContainerInfos: map[objectcache.ContainerType][]objectcache.ContainerInfo{ + objectcache.Container: {{Name: "c"}}, + }, + }) + objCache.SetContainerProfile(cp) + lib := &nnLibrary{ + objectCache: &objCache, + functionCache: cache.NewFunctionCache(cache.DefaultFunctionCacheConfig()), + } + + const goroutines = 64 + var wg sync.WaitGroup + wg.Add(goroutines) + for i := 0; i < goroutines; i++ { + go func() { + defer wg.Done() + // Both functions race on the same neighborMatchers slot. + _ = lib.wasAddressInEgress(types.String("cid"), types.String("10.1.2.3")) + _ = lib.isDomainInEgress(types.String("cid"), types.String("api.example.com.")) + }() + } + wg.Wait() + + // Post-condition: cached entry exists, has the right shape, and + // per-neighbor matchers are populated. + cm := lib.matcherCache.getOrBuild("cid", "csum-1", cp) + require.Equal(t, 1, len(cm.egress), "egress shape must match profile") + require.NotNil(t, cm.egress[0].ip.Load(), "ip matcher must be built after concurrent access") + require.NotNil(t, cm.egress[0].dns.Load(), "dns matcher must be built after concurrent access") +} + +// TestMatcherCache_StaleEntryReplaced confirms that shape-mismatched +// cached entries are unconditionally replaced — never returned to a +// caller whose profile has a different shape (which would later index- +// panic in ipMatcher/dnsMatcher). +func TestMatcherCache_StaleEntryReplaced(t *testing.T) { + mc := &matcherCache{} + cpV1 := &v1beta1.ContainerProfile{} + cpV1.Spec.Egress = []v1beta1.NetworkNeighbor{ + {IPAddresses: []string{"10.0.0.0/8"}}, + } + // Seed with a v1 entry. + cm1 := mc.getOrBuild("cid", "csum-v1", cpV1) + require.Equal(t, 1, len(cm1.egress)) + + // Now the profile grows to 3 egress entries; new call should NOT + // return the stale 1-entry cm1. + cpV2 := &v1beta1.ContainerProfile{} + cpV2.Spec.Egress = []v1beta1.NetworkNeighbor{ + {IPAddresses: []string{"10.0.0.0/8"}}, + {IPAddresses: []string{"192.168.0.0/16"}}, + {IPAddresses: []string{"172.16.0.0/12"}}, + } + cm2 := mc.getOrBuild("cid", "csum-v2", cpV2) + require.Equal(t, 3, len(cm2.egress), "shape-mismatched stale entry must be replaced") + require.NotEqual(t, cm1, cm2, "must be a different containerMatchers instance") +} + +// TestMatcherCache_ChecksumPreservedAcrossCalls confirms that repeated +// getOrBuild calls with the SAME checksum return the SAME instance, +// proving the cache is doing what we want it to do. +func TestMatcherCache_ChecksumPreservedAcrossCalls(t *testing.T) { + mc := &matcherCache{} + cp := &v1beta1.ContainerProfile{} + cp.Spec.Egress = []v1beta1.NetworkNeighbor{ + {IPAddresses: []string{"10.0.0.0/8"}}, + } + a := mc.getOrBuild("cid", "csum", cp) + b := mc.getOrBuild("cid", "csum", cp) + c := mc.getOrBuild("cid", "csum", cp) + require.Same(t, a, b, "same checksum must hit cache on second call") + require.Same(t, b, c, "same checksum must hit cache on third call") +} From 05ce6d93d305473c6350443147d30218da1f403d Mon Sep 17 00:00:00 2001 From: Entlein Date: Wed, 13 May 2026 15:30:40 +0200 Subject: [PATCH 3/3] test(matcher_cache): add start barrier to concurrency test (CR #42 round 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without the barrier, goroutine launch jitter staggers first-call arrivals, hiding any unsynchronised-write data race during the first-build window. With the barrier, all 64 goroutines hit the contended path simultaneously when close(start) fires — much tighter race-detector coverage of the atomic.Pointer.CompareAndSwap path. --- .../libraries/networkneighborhood/matcher_cache_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_test.go b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_test.go index d6500b5ef7..67852c4899 100644 --- a/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_test.go +++ b/pkg/rulemanager/cel/libraries/networkneighborhood/matcher_cache_test.go @@ -48,16 +48,24 @@ func TestMatcherCache_ConcurrentFirstBuild(t *testing.T) { } const goroutines = 64 + // Start barrier: every goroutine blocks on <-start before doing the + // contended work, so when we close(start) they all race the + // first-build path simultaneously rather than staggered. Without this, + // goroutine launch jitter can hide the unsynchronised-write data race + // that this test exists to detect. + start := make(chan struct{}) var wg sync.WaitGroup wg.Add(goroutines) for i := 0; i < goroutines; i++ { go func() { defer wg.Done() + <-start // Both functions race on the same neighborMatchers slot. _ = lib.wasAddressInEgress(types.String("cid"), types.String("10.1.2.3")) _ = lib.isDomainInEgress(types.String("cid"), types.String("api.example.com.")) }() } + close(start) wg.Wait() // Post-condition: cached entry exists, has the right shape, and