diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs index 05150948cd..b86e614adc 100644 --- a/eng/skill-validator/src/Commands/ValidateCommand.cs +++ b/eng/skill-validator/src/Commands/ValidateCommand.cs @@ -31,6 +31,9 @@ public static RootCommand Create() var reporterOpt = new Option("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true }; var noOverfittingCheckOpt = new Option("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" }; var overfittingFixOpt = new Option("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" }; + var selectivityTestOpt = new Option("--selectivity-test") { Description = "Run selectivity test using should_activate / should_not_activate prompts from eval.yaml" }; + var selectivityMinRecallOpt = new Option("--selectivity-min-recall") { Description = "Minimum recall (activation on should_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 }; + var selectivityMinPrecisionOpt = new Option("--selectivity-min-precision") { Description = "Minimum precision (non-activation on should_not_activate prompts) to pass (0-1)", DefaultValueFactory = _ => 0.8 }; var command = new RootCommand("Validate that agent skills meaningfully improve agent performance") { @@ -54,6 +57,9 @@ public static RootCommand Create() reporterOpt, noOverfittingCheckOpt, overfittingFixOpt, + selectivityTestOpt, + selectivityMinRecallOpt, + selectivityMinPrecisionOpt, }; command.SetAction(async (parseResult, _) => @@ -99,6 +105,9 @@ public static RootCommand Create() TestsDir = parseResult.GetValue(testsDirOpt), OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt), OverfittingFix = parseResult.GetValue(overfittingFixOpt), + SelectivityTest = parseResult.GetValue(selectivityTestOpt), + SelectivityMinRecall = 
parseResult.GetValue(selectivityMinRecallOpt), + SelectivityMinPrecision = parseResult.GetValue(selectivityMinPrecisionOpt), }; return await Run(config); @@ -415,6 +424,36 @@ internal static List CheckAggregateDescriptionLimits(IReadOnlyList 0 } || skill.EvalConfig.ShouldNotActivatePrompts is { Count: > 0 })) + { + log("🎯 Running selectivity test (standalone)..."); + var selectivityResult = await ExecuteSelectivityTest(skill, config, spinner); + log($"🎯 Selectivity: recall={selectivityResult.Recall:P0}, precision={selectivityResult.Precision:P0} — {(selectivityResult.Passed ? "PASSED" : "FAILED")}"); + + return new SkillVerdict + { + SkillName = skill.Name, + SkillPath = skill.Path, + Passed = selectivityResult.Passed, + Scenarios = [], + OverallImprovementScore = 0, + Reason = selectivityResult.Passed + ? "Selectivity test passed" + : $"Selectivity test failed: {selectivityResult.Reason}", + FailureKind = selectivityResult.Passed ? null : "selectivity_failure", + ProfileWarnings = profile.Warnings, + SelectivityResult = selectivityResult, + }; + } + + log("⏭ Skipping (no selectivity prompts in eval.yaml)"); + return null; + } + // Launch overfitting check in parallel with scenario execution var workDir = Path.GetTempPath(); Task overfittingTask = Task.FromResult(null); @@ -578,8 +617,8 @@ private static async Task ExecuteRun( runLog("running agents..."); var agentTasks = await Task.WhenAll( - AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)), - AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog))); + AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, Log: runLog)), + AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, Log: runLog))); var baselineMetrics = agentTasks[0]; var withSkillMetrics = agentTasks[1]; @@ -724,4 +763,75 @@ private static string 
SanitizeErrorMessage(string? message)
         var singleLine = raw.ReplaceLineEndings(" ");
         return singleLine.Length > 150 ? singleLine[..150] + "…" : singleLine;
     }
+
+    /// <summary>
+    /// Probes every selectivity prompt of <paramref name="skill"/> in parallel and scores the outcome.
+    /// Recall is the share of should_activate prompts that loaded the skill; "precision" here is the
+    /// share of should_not_activate prompts that did NOT load it. A metric with no prompts counts as 1.0.
+    /// </summary>
+    private static async Task<SelectivityResult> ExecuteSelectivityTest(SkillInfo skill, ValidatorConfig config, Spinner spinner)
+    {
+        var prefix = $"[{skill.Name}/selectivity]";
+        var log = (string msg) => spinner.Log($"{prefix} {msg}");
+
+        // Start every probe before awaiting any of them so they run concurrently.
+        var tasks = new List<Task<SelectivityPromptResult>>();
+        foreach (var prompt in skill.EvalConfig!.ShouldActivatePrompts ?? [])
+        {
+            log($"Testing should_activate: \"{Truncate(prompt, 60)}\"");
+            tasks.Add(ProbeAndLog(skill, prompt, expectedActivation: true, config, log));
+        }
+        foreach (var prompt in skill.EvalConfig.ShouldNotActivatePrompts ?? [])
+        {
+            log($"Testing should_not_activate: \"{Truncate(prompt, 60)}\"");
+            tasks.Add(ProbeAndLog(skill, prompt, expectedActivation: false, config, log));
+        }
+
+        var promptResults = (await Task.WhenAll(tasks)).ToList();
+
+        // Share of prompts with the given expectation whose observed activation matched it (1.0 if none).
+        double MatchRate(bool expected)
+        {
+            var group = promptResults.Where(r => r.ExpectedActivation == expected).ToList();
+            return group.Count > 0 ? (double)group.Count(r => r.SkillActivated == expected) / group.Count : 1.0;
+        }
+
+        double recall = MatchRate(expected: true);
+        double precision = MatchRate(expected: false);
+
+        bool passed = recall >= config.SelectivityMinRecall && precision >= config.SelectivityMinPrecision;
+        var reasons = new List<string>();
+        if (recall < config.SelectivityMinRecall)
+            reasons.Add($"Recall {recall:P0} below threshold {config.SelectivityMinRecall:P0}");
+        if (precision < config.SelectivityMinPrecision)
+            reasons.Add($"Precision {precision:P0} below threshold {config.SelectivityMinPrecision:P0}");
+        string reason = passed ? "Selectivity test passed" : string.Join("; ", reasons);
+
+        return new SelectivityResult(promptResults, recall, precision, passed, reason);
+    }
+
+    /// <summary>Runs one probe and logs whether the observed activation matched the expectation.</summary>
+    private static async Task<SelectivityPromptResult> ProbeAndLog(
+        SkillInfo skill, string prompt, bool expectedActivation, ValidatorConfig config, Action<string> log)
+    {
+        var activated = await TestSkillActivation(skill, prompt, config, log);
+        var outcome = expectedActivation
+            ? (activated ? "✅ activated" : "❌ NOT activated")
+            : (activated ? "❌ activated (unexpected)" : "✅ correctly NOT activated");
+        log($" → {outcome}: \"{Truncate(prompt, 50)}\"");
+        return new SelectivityPromptResult(prompt, ExpectedActivation: expectedActivation, SkillActivated: activated);
+    }
+
+    /// <summary>Asks the agent runner whether the skill loads for the prompt; <paramref name="log"/> receives verbose probe output.</summary>
+    private static async Task<bool> TestSkillActivation(
+        SkillInfo skill, string prompt, ValidatorConfig config, Action<string>? log = null)
+    {
+        var scenario = new EvalScenario(Name: "selectivity-probe", Prompt: prompt, Rubric: [], Timeout: 15);
+        return await AgentRunner.ProbeSkillActivation(
+            new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, Log: log));
+    }
+
+    /// <summary>Shortens a string to at most maxLength characters, ending in "…" when cut; a non-positive limit yields "".</summary>
+    private static string Truncate(string value, int maxLength) =>
+        value.Length <= maxLength ? value : (maxLength <= 0 ? string.Empty : value[..(maxLength - 1)] + "…");
 }
diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs
index 6081bcb8d1..b1cdc6ebf1 100644
--- a/eng/skill-validator/src/Models/Models.cs
+++ b/eng/skill-validator/src/Models/Models.cs
@@ -70,7 +70,10 @@ public sealed record EvalScenario(
     int? MaxTokens = null,
     bool ExpectActivation = true);
 
-public sealed record EvalConfig(IReadOnlyList Scenarios);
+public sealed record EvalConfig(
+    IReadOnlyList<EvalScenario> Scenarios,
+    IReadOnlyList<string>? ShouldActivatePrompts = null,    // prompts expected to load the skill
+    IReadOnlyList<string>? ShouldNotActivatePrompts = null); // prompts expected NOT to load the skill
 
 // --- Skill info ---
 
@@ -259,6 +262,7 @@ public sealed class SkillVerdict
     public IReadOnlyList? ProfileWarnings { get; set; }
     public bool SkillNotActivated { get; set; }
     public OverfittingResult? OverfittingResult { get; set; }
+    public SelectivityResult? SelectivityResult { get; set; }
 }
 
 // --- Overfitting assessment ---
@@ -306,6 +310,20 @@ public sealed record OverfittingJudgeOptions(
     int Timeout,
     string WorkDir);
 
+// --- Selectivity test ---
+
+public sealed record SelectivityPromptResult(
+    string Prompt,
+    bool ExpectedActivation, // true for should_activate prompts, false for should_not_activate prompts
+    bool SkillActivated);    // what the probe actually observed
+
+public sealed record SelectivityResult(
+    IReadOnlyList<SelectivityPromptResult> PromptResults,
+    double Recall,    // fraction of should_activate prompts that activated the skill
+    double Precision, // fraction of should_not_activate prompts that did NOT activate the skill
+    bool Passed,
+    string Reason);
+
 // --- Config ---
 
 public sealed record ReporterSpec(ReporterType Type);
@@ -340,6 +358,9 @@ public sealed record ValidatorConfig { public string? 
TestsDir { get; init; }
     public bool OverfittingCheck { get; init; } = true;
     public bool OverfittingFix { get; init; }
+    public bool SelectivityTest { get; init; }
+    public double SelectivityMinRecall { get; init; } = 0.8;
+    public double SelectivityMinPrecision { get; init; } = 0.8;
 }
 
 public static class DefaultWeights
diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs
index 38c246567b..667899153a 100644
--- a/eng/skill-validator/src/Services/AgentRunner.cs
+++ b/eng/skill-validator/src/Services/AgentRunner.cs
@@ -311,6 +311,80 @@ private static async Task RunAgentCore(RunOptions options, Cancellat
         return metrics;
     }
 
+    /// <summary>
+    /// Lightweight probe that sends a prompt and checks whether the skill is activated.
+    /// Completes as soon as a SkillInvokedEvent is seen, when the session goes idle, or
+    /// when the probe timeout elapses. Designed to run many probes in parallel via Task.WhenAll.
+    /// </summary>
+    /// <remarks>
+    /// Best-effort: a failed probe is reported through the logger and counted as whatever
+    /// activation state had been observed so far, so one broken session cannot abort a batch.
+    /// The timeout is fixed; <c>options.Scenario.Timeout</c> is not consulted.
+    /// </remarks>
+    public static async Task<bool> ProbeSkillActivation(RunOptions options)
+    {
+        // Enough for the agent to reach the skill-loading decision.
+        const int ProbeTimeoutMs = 30_000;
+
+        var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
+        Directory.CreateDirectory(workDir);
+        _workDirs.Add(workDir);
+
+        var write = options.Log ?? (msg => Console.Error.WriteLine(msg));
+        if (options.Verbose)
+            write($" 📂 {workDir} (skilled)");
+
+        bool skillActivated = false;
+        // Keep the awaiting code (which disposes the session) from running inline in the event handler or timer callback.
+        var done = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
+
+        try
+        {
+            var client = await GetSharedClient(options.Verbose);
+            await using var session = await client.CreateSessionAsync(
+                BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers));
+
+            using var cts = new CancellationTokenSource(ProbeTimeoutMs);
+            using var timeoutRegistration = cts.Token.Register(() => done.TrySetResult(skillActivated));
+
+            session.On(evt =>
+            {
+                switch (evt)
+                {
+                    // Skill loaded → we have our answer, bail immediately
+                    case SkillInvokedEvent invoked:
+                        skillActivated = true;
+                        done.TrySetResult(true);
+                        if (options.Verbose)
+                            write($" 📘 Skill invoked: {invoked.Data.Name}");
+                        break;
+
+                    // Session finished without loading the skill → not activated
+                    case SessionIdleEvent:
+                        done.TrySetResult(skillActivated);
+                        break;
+
+                    case SessionErrorEvent err:
+                        done.TrySetException(new InvalidOperationException(err.Data.Message ?? "Session error"));
+                        break;
+
+                    case ToolExecutionStartEvent tool when options.Verbose:
+                        write($" 🔧 {tool.Data.ToolName}");
+                        break;
+                }
+            });
+
+            await session.SendAsync(new MessageOptions { Prompt = options.Scenario.Prompt });
+            return await done.Task;
+        }
+        catch (Exception ex)
+        {
+            // Surface the failure instead of silently reporting "not activated".
+            write($" ⚠️ Selectivity probe failed ({ex.GetType().Name}): {ex.Message}");
+            return skillActivated;
+        }
+    }
+
     private static async Task SetupWorkDir(EvalScenario scenario, string? skillPath, string? 
evalPath)
     {
         var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
diff --git a/eng/skill-validator/src/Services/EvalSchema.cs b/eng/skill-validator/src/Services/EvalSchema.cs
index 30bc9816ef..edf6a8a614 100644
--- a/eng/skill-validator/src/Services/EvalSchema.cs
+++ b/eng/skill-validator/src/Services/EvalSchema.cs
@@ -21,7 +21,7 @@ public static EvalConfig ParseEvalConfig(string yamlContent)
         if (scenarios is not { Count: > 0 })
             throw new InvalidOperationException("Eval config must have at least one scenario");
 
-        return new EvalConfig(scenarios);
+        return new EvalConfig(scenarios, raw.Selectivity?.ShouldActivate, raw.Selectivity?.ShouldNotActivate);
     }
 
     public static (bool Success, EvalConfig? Data, IReadOnlyList? Errors) ValidateEvalConfig(string yamlContent)
@@ -128,6 +128,16 @@ internal sealed class RawAgentFrontmatter
     internal sealed class RawEvalConfig
     {
         public List? Scenarios { get; set; }
+        public RawSelectivity? Selectivity { get; set; }
+    }
+
+    // Optional top-level `selectivity:` section of eval.yaml: prompts that should / should not load the skill.
+    internal sealed class RawSelectivity
+    {
+        [YamlMember(Alias = "should_activate")]
+        public List<string>? ShouldActivate { get; set; }
+        [YamlMember(Alias = "should_not_activate")]
+        public List<string>? ShouldNotActivate { get; set; }
     }
 
     internal sealed class RawScenario
diff --git a/eng/skill-validator/src/Services/Reporter.cs b/eng/skill-validator/src/Services/Reporter.cs
index 83e542b62d..adb15d5536 100644
--- a/eng/skill-validator/src/Services/Reporter.cs
+++ b/eng/skill-validator/src/Services/Reporter.cs
@@ -63,22 +63,34 @@ private static void ReportConsole(IReadOnlyList verdicts, bool ver
         {
             var icon = verdict.Passed ? 
"\x1b[32m✓\x1b[0m" : "\x1b[31m✗\x1b[0m"; var name = $"\x1b[1m{verdict.SkillName}\x1b[0m"; - var score = FormatScore(verdict.OverallImprovementScore); - var scoreLine = $"{icon} {name} {score}"; - if (verdict.ConfidenceInterval is { } ci) + // Selectivity-only verdict: no scenarios or score to display + bool isSelectivityOnly = verdict.Scenarios.Count == 0 && verdict.SelectivityResult is not null; + + if (isSelectivityOnly) { - var ciStr = $"[{FormatPct(ci.Low)}, {FormatPct(ci.High)}]"; - var sigStr = verdict.IsSignificant == true - ? "\x1b[32msignificant\x1b[0m" - : "\x1b[33mnot significant\x1b[0m"; - scoreLine += $" \x1b[2m{ciStr}\x1b[0m {sigStr}"; + Console.WriteLine($"{icon} {name} \x1b[2m(selectivity only)\x1b[0m"); + Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m"); } - if (verdict.NormalizedGain is { } ng) - scoreLine += $" \x1b[2m(g={FormatPct(ng)})\x1b[0m"; + else + { + var score = FormatScore(verdict.OverallImprovementScore); + + var scoreLine = $"{icon} {name} {score}"; + if (verdict.ConfidenceInterval is { } ci) + { + var ciStr = $"[{FormatPct(ci.Low)}, {FormatPct(ci.High)}]"; + var sigStr = verdict.IsSignificant == true + ? "\x1b[32msignificant\x1b[0m" + : "\x1b[33mnot significant\x1b[0m"; + scoreLine += $" \x1b[2m{ciStr}\x1b[0m {sigStr}"; + } + if (verdict.NormalizedGain is { } ng) + scoreLine += $" \x1b[2m(g={FormatPct(ng)})\x1b[0m"; - Console.WriteLine(scoreLine); - Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m"); + Console.WriteLine(scoreLine); + Console.WriteLine($" \x1b[2m{verdict.Reason}\x1b[0m"); + } if (!verdict.Passed && verdict.ProfileWarnings is { Count: > 0 }) { @@ -133,6 +145,21 @@ private static void ReportConsole(IReadOnlyList verdicts, bool ver Console.WriteLine($" \x1b[2m•\x1b[0m [{item.Classification}] \x1b[2m{item.AssertionSummary}\x1b[0m\n \x1b[2m— {item.Reasoning}\x1b[0m"); } } + if (verdict.SelectivityResult is { } selResult) + { + Console.WriteLine(); + var selIcon = selResult.Passed ? 
"✅" : "🔴"; + Console.WriteLine($" 🎯 Selectivity: recall={selResult.Recall:P0}, precision={selResult.Precision:P0} {selIcon}"); + foreach (var pr in selResult.PromptResults) + { + var expected = pr.ExpectedActivation ? "should activate" : "should NOT activate"; + var correct = (pr.ExpectedActivation == pr.SkillActivated); + var prIcon = correct ? "\x1b[32m✓\x1b[0m" : "\x1b[31m✗\x1b[0m"; + var activatedStr = pr.SkillActivated ? "activated" : "not activated"; + var prompt = pr.Prompt.Length > 60 ? pr.Prompt[..59] + "…" : pr.Prompt; + Console.WriteLine($" {prIcon} \x1b[2m\"{prompt}\" — {expected} → {activatedStr}\x1b[0m"); + } + } if (verdict.Scenarios.Count > 0) { Console.WriteLine(); diff --git a/eng/skill-validator/src/SkillValidator.csproj b/eng/skill-validator/src/SkillValidator.csproj index 925c30c898..c48b07bdcf 100644 --- a/eng/skill-validator/src/SkillValidator.csproj +++ b/eng/skill-validator/src/SkillValidator.csproj @@ -18,7 +18,7 @@ true - --results-dir "$([MSBuild]::NormalizePath('$(ArtifactsPath)', 'TestResults', '$(AssemblyName)'))" --parallel-skills 3 --parallel-scenarios 3 --parallel-runs 3 + --results-dir "$([MSBuild]::NormalizePath('$(ArtifactsPath)', 'TestResults', '$(AssemblyName)'))" --parallel-skills 3 --parallel-scenarios 3 --parallel-runs 3 diff --git a/eng/skill-validator/src/SkillValidatorJsonContext.cs b/eng/skill-validator/src/SkillValidatorJsonContext.cs index 90e5420e72..fb370ef526 100644 --- a/eng/skill-validator/src/SkillValidatorJsonContext.cs +++ b/eng/skill-validator/src/SkillValidatorJsonContext.cs @@ -33,6 +33,8 @@ namespace SkillValidator; [JsonSerializable(typeof(PairwiseMagnitude))] [JsonSerializable(typeof(AssertionType))] [JsonSerializable(typeof(MCPServerDef))] +[JsonSerializable(typeof(SelectivityPromptResult))] +[JsonSerializable(typeof(SelectivityResult))] [JsonSerializable(typeof(JsonElement))] [JsonSerializable(typeof(Dictionary))] [JsonSerializable(typeof(Dictionary))] diff --git 
a/eng/skill-validator/src/SkillValidatorYamlContext.cs b/eng/skill-validator/src/SkillValidatorYamlContext.cs index 3369930b8f..2056105ba3 100644 --- a/eng/skill-validator/src/SkillValidatorYamlContext.cs +++ b/eng/skill-validator/src/SkillValidatorYamlContext.cs @@ -11,4 +11,5 @@ namespace SkillValidator; [YamlSerializable(typeof(EvalSchema.RawSetup))] [YamlSerializable(typeof(EvalSchema.RawSetupFile))] [YamlSerializable(typeof(EvalSchema.RawAssertion))] +[YamlSerializable(typeof(EvalSchema.RawSelectivity))] public partial class SkillValidatorYamlContext : StaticContext; diff --git a/tests/dotnet-msbuild/build-perf-diagnostics/eval.yaml b/tests/dotnet-msbuild/build-perf-diagnostics/eval.yaml index be492fba00..8982b987e5 100644 --- a/tests/dotnet-msbuild/build-perf-diagnostics/eval.yaml +++ b/tests/dotnet-msbuild/build-perf-diagnostics/eval.yaml @@ -15,3 +15,17 @@ scenarios: - "Identified EnforceCodeStyleInBuild should be conditional on CI, not always true" - "Solution preserves full analyzer enforcement in CI pipelines while speeding dev builds" timeout: 160 + +selectivity: + should_activate: + - "My .NET build takes over 5 minutes, how can I speed it up?" + - "How do I analyze a binlog to find slow targets in MSBuild?" + - "Roslyn analyzers are making my compilation really slow, what can I do?" + - "I want to profile my MSBuild build to understand where time is being spent" + - "Our CI builds are fast but local dev builds are painfully slow, how do I diagnose this?" + should_not_activate: + - "How do I add a NuGet package reference to my project?" + - "My unit tests are failing with a NullReferenceException" + - "How do I configure Docker for my .NET application?" + - "What's the difference between .NET 8 and .NET 9?" + - "How do I set up Entity Framework Core migrations?"