diff --git a/pkg/registry/file/dynamicpathdetector/compare_exec_args.go b/pkg/registry/file/dynamicpathdetector/compare_exec_args.go new file mode 100644 index 000000000..31550a467 --- /dev/null +++ b/pkg/registry/file/dynamicpathdetector/compare_exec_args.go @@ -0,0 +1,95 @@ +package dynamicpathdetector + +// CompareExecArgs reports whether a runtime exec argument vector matches a +// profile argument vector. The profile vector may contain two wildcard +// tokens: +// +// DynamicIdentifier ("⋯") — matches exactly one argument position. +// WildcardIdentifier ("*") — matches zero or more consecutive arguments. +// +// Anything else is a literal-equality match. The match is anchored at both +// ends: every runtime argument must be consumed by the profile vector, +// either by a literal, a DynamicIdentifier, or absorbed into a +// WildcardIdentifier run. +// +// Empty profileArgs is treated as "no argv constraint" — i.e. matches any +// runtime arg vector. This keeps path-only Execs entries (the common case +// in user-defined ApplicationProfiles, which omit the Args field) from +// silently triggering R0040 just because the rule started consulting +// was_executed_with_args. A user that wants to assert "this exec must have +// no args" can write Args: []string{} in their profile and the empty +// runtime vector still matches by virtue of the wildcard semantics. +// +// Implementation is index-based recursive backtracking with memoisation +// on (profileIndex, runtimeIndex) state pairs. The naive backtracking +// form would degrade to exponential time on adversarial inputs like +// `[*, *, *, …, x]` against a long literal vector — every prefix `*` +// has multiple split choices and the suffix mismatch only surfaces +// at the very end, so each path gets re-explored. Memoisation bounds +// the work at O(len(profile) * len(runtime)) — i.e. quadratic in the +// vector lengths, the standard wildcard-match complexity. CodeRabbit +// flagged this as a Major on PR #27. +func CompareExecArgs(profileArgs, runtimeArgs []string) bool { + // Outer-level empty profile = "no argv constraint" — wildcard match. + // The inner matcher keeps strict empty-empty semantics so anchoring + // during recursion (`profile fully consumed but runtime has more`) + // remains a mismatch. + if len(profileArgs) == 0 { + return true + } + + // State key for memoisation: (pi, ri) is the suffix-matching position + // in profile and runtime vectors respectively. Because both sides only + // shrink (we never re-enter a prefix), there are at most + // (len(profile)+1) * (len(runtime)+1) reachable states. + type state struct{ pi, ri int } + memo := make(map[state]bool, (len(profileArgs)+1)*(len(runtimeArgs)+1)) + seen := make(map[state]bool, (len(profileArgs)+1)*(len(runtimeArgs)+1)) + + var match func(pi, ri int) bool + match = func(pi, ri int) bool { + s := state{pi: pi, ri: ri} + if seen[s] { + return memo[s] + } + seen[s] = true + + // Profile fully consumed → runtime must also be fully consumed + // (anchored match). + if pi == len(profileArgs) { + memo[s] = ri == len(runtimeArgs) + return memo[s] + } + + head := profileArgs[pi] + + if head == WildcardIdentifier { + // Try absorbing 0..(remaining runtime) into this *, + // then match the rest. First successful split wins. + for k := ri; k <= len(runtimeArgs); k++ { + if match(pi+1, k) { + memo[s] = true + return true + } + } + memo[s] = false + return false + } + + // Non-wildcard head needs a runtime argument to consume. + if ri == len(runtimeArgs) { + memo[s] = false + return false + } + + if head == DynamicIdentifier || head == runtimeArgs[ri] { + memo[s] = match(pi+1, ri+1) + return memo[s] + } + + memo[s] = false + return false + } + + return match(0, 0) +} diff --git a/pkg/registry/file/dynamicpathdetector/tests/compare_exec_args_test.go b/pkg/registry/file/dynamicpathdetector/tests/compare_exec_args_test.go new file mode 100644 index 000000000..82ab0da4a --- /dev/null +++ b/pkg/registry/file/dynamicpathdetector/tests/compare_exec_args_test.go @@ -0,0 +1,217 @@ +package dynamicpathdetectortests + +import ( + "testing" + "time" + + "github.com/kubescape/storage/pkg/registry/file/dynamicpathdetector" +) + +// CompareExecArgs matches a runtime argument vector against a profile +// argument vector that may contain two wildcard tokens: +// +// "⋯" (DynamicIdentifier) — matches exactly ONE argument position. +// "*" (WildcardIdentifier) — matches ZERO OR MORE consecutive args. +// +// Anything else is a literal string match. The match must be exact across +// the full vectors — extra runtime args after the profile is exhausted (and +// no trailing wildcard absorbs them) is a non-match. + +func TestCompareExecArgs_LiteralMatch(t *testing.T) { + cases := []struct { + name string + profile []string + runtime []string + want bool + }{ + // Empty profileArgs = "no argv constraint" — matches any runtime. + // Pinned this way so path-only Execs entries in user-defined + // ApplicationProfiles don't silently trigger R0040 when the rule + // consults was_executed_with_args. See storage/node-agent issue + // where Test_28 (and others using path-only entries) failed because + // the strict empty-empty match was firing R0040 on every legit exec. + {"both empty", nil, nil, true}, + {"empty profile, non-empty runtime", nil, []string{"a"}, true}, + {"empty profile, multi-arg runtime", nil, []string{"a", "b", "c"}, true}, + {"non-empty profile, empty runtime", []string{"a"}, nil, false}, + {"single literal match", []string{"--help"}, []string{"--help"}, true}, + {"single literal mismatch", []string{"--help"}, []string{"--version"}, false}, + {"profile longer than runtime", []string{"a", "b"}, []string{"a"}, false}, + {"runtime longer than profile (no wildcard)", []string{"a"}, []string{"a", "b"}, false}, + {"multi-literal match", []string{"-l", "-a", "/tmp"}, []string{"-l", "-a", "/tmp"}, true}, + {"multi-literal mismatch in middle", []string{"-l", "-a", "/tmp"}, []string{"-l", "-z", "/tmp"}, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := dynamicpathdetector.CompareExecArgs(tc.profile, tc.runtime); got != tc.want { + t.Errorf("CompareExecArgs(%v, %v) = %v, want %v", tc.profile, tc.runtime, got, tc.want) + } + }) + } +} + +func TestCompareExecArgs_DynamicIdentifier(t *testing.T) { + cases := []struct { + name string + profile []string + runtime []string + want bool + }{ + {"⋯ matches one arg", []string{"⋯"}, []string{"anything"}, true}, + {"⋯ does NOT match zero args", []string{"⋯"}, nil, false}, + {"⋯ does NOT match two args", []string{"⋯"}, []string{"a", "b"}, false}, + {"⋯ in middle, full vector matches", []string{"--user", "⋯", "--port", "8080"}, []string{"--user", "alice", "--port", "8080"}, true}, + {"⋯ in middle, surrounding literal mismatch", []string{"--user", "⋯", "--port", "8080"}, []string{"--user", "alice", "--port", "9090"}, false}, + {"adjacent ⋯⋯ matches exactly two args", []string{"⋯", "⋯"}, []string{"a", "b"}, true}, + {"adjacent ⋯⋯ rejects one arg", []string{"⋯", "⋯"}, []string{"a"}, false}, + {"adjacent ⋯⋯ rejects three args", []string{"⋯", "⋯"}, []string{"a", "b", "c"}, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := dynamicpathdetector.CompareExecArgs(tc.profile, tc.runtime); got != tc.want { + t.Errorf("CompareExecArgs(%v, %v) = %v, want %v", tc.profile, tc.runtime, got, tc.want) + } + }) + } +} + +func TestCompareExecArgs_WildcardIdentifier(t *testing.T) { + cases := []struct { + name string + profile []string + runtime []string + want bool + }{ + {"* matches empty runtime", []string{"*"}, nil, true}, + {"* matches one arg", []string{"*"}, []string{"a"}, true}, + {"* matches many args", []string{"*"}, []string{"a", "b", "c", "d"}, true}, + {"trailing * with prefix match", []string{"-c", "*"}, []string{"-c", "echo hi"}, true}, + {"trailing * absorbs nothing when runtime exact-prefix length", []string{"-c", "*"}, []string{"-c"}, true}, + {"trailing * mismatch in literal prefix", []string{"-c", "*"}, []string{"-x", "echo hi"}, false}, + {"middle * matches and re-anchors on literal", []string{"sh", "*", "exit"}, []string{"sh", "-c", "echo hi", "exit"}, true}, + {"middle * with literal that does not appear", []string{"sh", "*", "exit"}, []string{"sh", "-c", "echo hi"}, false}, + {"middle * matches when zero args between anchors", []string{"sh", "*", "exit"}, []string{"sh", "exit"}, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := dynamicpathdetector.CompareExecArgs(tc.profile, tc.runtime); got != tc.want { + t.Errorf("CompareExecArgs(%v, %v) = %v, want %v", tc.profile, tc.runtime, got, tc.want) + } + }) + } +} + +func TestCompareExecArgs_MixedTokens(t *testing.T) { + cases := []struct { + name string + profile []string + runtime []string + want bool + }{ + {"⋯ then * — needs at least one arg before the *", + []string{"⋯", "*"}, []string{"a"}, true}, + {"⋯ then * — empty runtime fails (⋯ needs one)", + []string{"⋯", "*"}, nil, false}, + {"⋯ then * — many args ok", + []string{"⋯", "*"}, []string{"a", "b", "c"}, true}, + {"* then ⋯ — needs at least one arg for ⋯", + []string{"*", "⋯"}, []string{"x"}, true}, + {"* then ⋯ — empty runtime fails", + []string{"*", "⋯"}, nil, false}, + {"literal, ⋯, * — typical user pattern", + []string{"--user", "⋯", "*"}, []string{"--user", "alice", "--verbose", "--out", "/tmp"}, true}, + {"literal, ⋯, * — runtime too short for ⋯", + []string{"--user", "⋯", "*"}, []string{"--user"}, false}, + {"only ⋯, runtime empty — fails (⋯ requires exactly one)", + []string{"⋯"}, []string{}, false}, + {"only *, runtime empty — passes", + []string{"*"}, []string{}, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := dynamicpathdetector.CompareExecArgs(tc.profile, tc.runtime); got != tc.want { + t.Errorf("CompareExecArgs(%v, %v) = %v, want %v", tc.profile, tc.runtime, got, tc.want) + } + }) + } +} + +func TestCompareExecArgs_RealisticPatterns(t *testing.T) { + cases := []struct { + name string + profile []string + runtime []string + want bool + }{ + {"curl with any URL", []string{"-s", "⋯"}, []string{"-s", "https://example.com"}, true}, + {"sh -c with any command", + []string{"-c", "*"}, + []string{"-c", "while true; do sleep 1; done"}, + true, + }, + {"echo with any number of words", + []string{"hello", "*"}, + []string{"hello", "world", "from", "test"}, + true, + }, + {"ls -l in arbitrary directory", + []string{"-l", "⋯"}, + []string{"-l", "/var/log"}, + true, + }, + {"ls without args fails wildcard arg pattern", + []string{"-l", "⋯"}, + []string{"-l"}, + false, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := dynamicpathdetector.CompareExecArgs(tc.profile, tc.runtime); got != tc.want { + t.Errorf("CompareExecArgs(%v, %v) = %v, want %v", tc.profile, tc.runtime, got, tc.want) + } + }) + } +} + +// TestCompareExecArgs_ReDoSResistance pins that the matcher handles +// adversarial wildcard-heavy inputs in bounded time. The classic +// catastrophic-backtracking case is `[*, *, *, …, "literal"]` vs a +// long literal-runtime vector that mismatches the trailing literal +// — every prefix * has multiple split choices and the suffix +// mismatch only surfaces at the very end, so each path gets +// re-explored. With memoisation this is O(P*R); without it, naïve +// recursion would be exponential. +// +// CodeRabbit flagged the unmemoised version on PR #27 (Major). +func TestCompareExecArgs_ReDoSResistance(t *testing.T) { + // 20 leading wildcards + a literal that won't match. Without + // memoisation, the naïve matcher tries roughly 2^20 path splits + // before failing — observable as a many-second test. The + // memoised version completes in microseconds. + profile := make([]string, 0, 21) + for i := 0; i < 20; i++ { + profile = append(profile, dynamicpathdetector.WildcardIdentifier) + } + profile = append(profile, "needle-that-does-not-exist") + + runtime := make([]string, 0, 50) + for i := 0; i < 50; i++ { + runtime = append(runtime, "a") + } + + start := time.Now() + got := dynamicpathdetector.CompareExecArgs(profile, runtime) + elapsed := time.Since(start) + + if got { + t.Errorf("expected mismatch for trailing-literal that isn't in runtime") + } + // Memoised matcher: 21 * 51 = ~1100 states, each O(R) work for + // the wildcard split → total bound ~50K ops. Generous budget of + // 100ms catches any regression to the unmemoised form (which + // would be measured in seconds, not milliseconds, on this input). + if elapsed > 100*time.Millisecond { + t.Errorf("matcher took %v on adversarial input — memoisation regression?", elapsed) + } +}