From c42609a9c6fa88c4c5f0e9b4731ca410f9be70f8 Mon Sep 17 00:00:00 2001 From: Cameron Aavik <99771732+caaavik-msft@users.noreply.github.com> Date: Wed, 4 Mar 2026 09:17:44 +1000 Subject: [PATCH 1/6] Add --run-in-docker to run Copilot CLI in docker container --- eng/skill-validator/README.md | 16 + .../src/Commands/ValidateCommand.cs | 47 ++- eng/skill-validator/src/Docker/Dockerfile | 11 + eng/skill-validator/src/Models/Models.cs | 1 + .../src/Services/AgentRunner.cs | 48 ++- .../src/Services/DockerCopilotServer.cs | 373 ++++++++++++++++++ eng/skill-validator/src/Services/Judge.cs | 4 +- .../src/Services/PairwiseJudge.cs | 4 +- eng/skill-validator/src/SkillValidator.csproj | 4 + .../tests/DockerCopilotServerTests.cs | 245 ++++++++++++ 10 files changed, 720 insertions(+), 33 deletions(-) create mode 100644 eng/skill-validator/src/Docker/Dockerfile create mode 100644 eng/skill-validator/src/Services/DockerCopilotServer.cs create mode 100644 eng/skill-validator/tests/DockerCopilotServerTests.cs diff --git a/eng/skill-validator/README.md b/eng/skill-validator/README.md index c1b8a29d21..f4fe84e288 100644 --- a/eng/skill-validator/README.md +++ b/eng/skill-validator/README.md @@ -63,6 +63,21 @@ dotnet run --project src/SkillValidator -- --require-evals ./skills/ dotnet run --project src/SkillValidator -- --verdict-warn-only --require-evals ./skills/ ``` +## Docker mode + +Use `--run-in-docker` to run agent sessions and setup commands in a Docker container instead of the host environment. 
+ +Requirements: +- Docker installed and running +- `GITHUB_TOKEN` set in your environment (for example: `export GITHUB_TOKEN="$(gh auth token)"`) + +```bash +# Run validation in Docker +dotnet run --project src/SkillValidator -- --run-in-docker ./skills/ +``` + +When Docker mode is enabled, skill-validator builds the runtime image from `src/Docker/Dockerfile`, mounts run workdirs at `/work`, mounts discovered skill directories read-only under `/skills/*`, and stops/removes the container during normal cleanup (and on process exit). + ## Writing eval files Each skill can include a `tests/eval.yaml`: @@ -246,6 +261,7 @@ The default of 5 runs provides sufficient precision for significance testing (va | `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors and `--require-evals` still fail. | | `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) | | `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions | +| `--run-in-docker` | `false` | Run agent sessions and setup commands inside a Docker container (requires `GITHUB_TOKEN`). | | `--verbose` | `false` | Show tool calls and agent events during runs | | `--reporter ` | `console`, `json`, `markdown` | Output format: `console`, `json`, `junit`, `markdown`. | | `--results-dir ` | `.skill-validator-results` | Directory for file reporter output. | diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs index 195ea06841..31cc43b123 100644 --- a/eng/skill-validator/src/Commands/ValidateCommand.cs +++ b/eng/skill-validator/src/Commands/ValidateCommand.cs @@ -30,6 +30,7 @@ public static RootCommand Create() var reporterOpt = new Option("--reporter") { Description = "Reporter (console, json, junit, markdown). 
Can be repeated.", AllowMultipleArgumentsPerToken = true }; var noOverfittingCheckOpt = new Option("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" }; var overfittingFixOpt = new Option("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" }; + var runInDockerOpt = new Option("--run-in-docker") { Description = "Run agents and judges inside Docker containers" }; var command = new RootCommand("Validate that agent skills meaningfully improve agent performance") { @@ -53,6 +54,7 @@ public static RootCommand Create() reporterOpt, noOverfittingCheckOpt, overfittingFixOpt, + runInDockerOpt, }; command.SetAction(async (parseResult, _) => @@ -98,6 +100,7 @@ public static RootCommand Create() TestsDir = parseResult.GetValue(testsDirOpt), OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt), OverfittingFix = parseResult.GetValue(overfittingFixOpt), + RunInDocker = parseResult.GetValue(runInDockerOpt), }; return await Run(config); @@ -117,6 +120,26 @@ public static RootCommand Create() public static async Task Run(ValidatorConfig config) { + // Discover skills first (needed to determine Docker volume mounts) + var allSkills = new List(); + foreach (var path in config.SkillPaths) + { + var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir); + allSkills.AddRange(skills); + } + + if (allSkills.Count == 0) + { + Console.Error.WriteLine("No skills found in the specified paths."); + return 1; + } + + Console.WriteLine($"Found {allSkills.Count} skill(s)\n"); + + // Set up DockerCopilotServer with skill directories to mount + if (config.RunInDocker) + DockerCopilotServer.Initialize(config.Verbose, allSkills); + // Validate model early try { @@ -148,22 +171,6 @@ public static async Task Run(ValidatorConfig config) if (config.Verbose) Console.WriteLine($"Results dir: {config.ResultsDir}"); - // Discover skills - var allSkills = new List(); - foreach (var path 
in config.SkillPaths) - { - var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir); - allSkills.AddRange(skills); - } - - if (allSkills.Count == 0) - { - Console.Error.WriteLine("No skills found in the specified paths."); - return 1; - } - - Console.WriteLine($"Found {allSkills.Count} skill(s)\n"); - if (config.Runs < 5) Console.WriteLine($"\x1b[33m⚠ Running with {config.Runs} run(s). For statistically significant results, use --runs 5 or higher.\x1b[0m"); @@ -202,6 +209,8 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, config.Model, config.JudgeModel, config.ResultsDir); await AgentRunner.StopSharedClient(); + if (DockerCopilotServer.Instance is { } dockerServer) + await dockerServer.StopAsync(); await AgentRunner.CleanupWorkDirs(); // Always fail on execution errors, even in --verdict-warn-only mode @@ -262,13 +271,13 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, log(warning); // Launch overfitting check in parallel with scenario execution - var workDir = Path.GetTempPath(); + var overfittingWorkDir = DockerCopilotServer.Instance is not null ? 
"/tmp" : Path.GetTempPath(); Task overfittingTask = Task.FromResult(null); if (config.OverfittingCheck && skill.EvalConfig is not null) { log("🔍 Running overfitting check (parallel)..."); overfittingTask = Services.OverfittingJudge.Analyze(skill, new OverfittingJudgeOptions( - config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir)); + config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir)); } bool singleScenario = skill.EvalConfig.Scenarios.Count == 1; @@ -301,7 +310,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, try { await Services.OverfittingJudge.GenerateFix(skill, overfittingResult, new OverfittingJudgeOptions( - config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir)); + config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir)); log("📝 Generated eval.fixed.yaml with suggested improvements"); } catch (Exception ex) diff --git a/eng/skill-validator/src/Docker/Dockerfile b/eng/skill-validator/src/Docker/Dockerfile new file mode 100644 index 0000000000..1dfb7ed0c1 --- /dev/null +++ b/eng/skill-validator/src/Docker/Dockerfile @@ -0,0 +1,11 @@ +FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build + +ARG COPILOT_SDK_VERSION +RUN dotnet new console -o /tmp/dl \ + && dotnet add /tmp/dl package GitHub.Copilot.SDK --version $COPILOT_SDK_VERSION \ + && dotnet build /tmp/dl -c Release \ + && cp /tmp/dl/bin/Release/net10.0/runtimes/*/native/copilot /usr/local/bin/copilot \ + && chmod +x /usr/local/bin/copilot \ + && rm -rf /tmp/dl + +RUN copilot --version \ No newline at end of file diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs index 71be6ea41a..17f00d3213 100644 --- a/eng/skill-validator/src/Models/Models.cs +++ b/eng/skill-validator/src/Models/Models.cs @@ -299,6 +299,7 @@ public sealed record ValidatorConfig public string? 
TestsDir { get; init; } public bool OverfittingCheck { get; init; } = true; public bool OverfittingFix { get; init; } + public bool RunInDocker { get; init; } = false; } public static class DefaultWeights diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs index 1f711e3629..e59f74e0b3 100644 --- a/eng/skill-validator/src/Services/AgentRunner.cs +++ b/eng/skill-validator/src/Services/AgentRunner.cs @@ -29,14 +29,20 @@ public static async Task GetSharedClient(bool verbose) { if (_sharedClient is not null) return _sharedClient; - var options = new CopilotClientOptions - { - LogLevel = verbose ? "info" : "none", - }; - var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN"); - if (!string.IsNullOrEmpty(githubToken)) - options.GitHubToken = githubToken; + var options = new CopilotClientOptions(); + if (DockerCopilotServer.Instance is {} dockerServer) + { + _workDirs.Add(dockerServer.GetHostDir()); + options.CliUrl = await dockerServer.GetCliUrlAsync(); + } + else + { + options.LogLevel = verbose ? "info" : "none"; + var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN"); + if (!string.IsNullOrEmpty(githubToken)) + options.GitHubToken = githubToken; + } _sharedClient = new CopilotClient(options); await _sharedClient.StartAsync(); @@ -82,6 +88,9 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st if (string.IsNullOrEmpty(reqPath)) return true; + if (DockerCopilotServer.Instance is {} dockerServer && dockerServer.TryMapContainerPathToHost(reqPath, out var mappedPath)) + reqPath = mappedPath; + var resolved = Path.GetFullPath(reqPath); var allowedDirs = new List { Path.GetFullPath(workDir) }; if (skillPath is not null) allowedDirs.Add(Path.GetFullPath(skillPath)); @@ -95,10 +104,11 @@ internal static SessionConfig BuildSessionConfig( SkillInfo? skill, string model, string workDir, IReadOnlyDictionary? 
mcpServers = null) { + var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath(); var skillPath = skill is not null ? Path.GetDirectoryName(skill.Path) : null; // Create a unique temporary config directory for this session to not share any data - var configDir = Path.Combine(Path.GetTempPath(), $"skill-validator-cfg-{Guid.NewGuid():N}"); + var configDir = Path.Combine(parentDir, $"skill-validator-cfg-{Guid.NewGuid():N}"); Directory.CreateDirectory(configDir); _workDirs.Add(configDir); @@ -122,13 +132,19 @@ internal static SessionConfig BuildSessionConfig( } } + var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(workDir) ?? workDir; + var sessionSkillPath = skillPath is not null + ? (DockerCopilotServer.Instance?.MapHostPathToContainer(skillPath) ?? skillPath) + : null; + var sessionConfigDir = DockerCopilotServer.Instance?.MapHostPathToContainer(configDir) ?? configDir; + return new SessionConfig { Model = model, Streaming = true, - WorkingDirectory = workDir, - SkillDirectories = skill is not null ? [skillPath!] : [], - ConfigDir = configDir, + WorkingDirectory = sessionWorkDir, + SkillDirectories = skill is not null ? [sessionSkillPath!] : [], + ConfigDir = sessionConfigDir, McpServers = sdkMcp, InfiniteSessions = new InfiniteSessionConfig { Enabled = false }, OnPermissionRequest = (request, _) => @@ -273,7 +289,8 @@ public static async Task RunAgent(RunOptions options) private static async Task SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath) { - var workDir = Path.Combine(Path.GetTempPath(), $"skill-validator-{Guid.NewGuid():N}"); + var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath(); + var workDir = Path.Combine(parentDir, $"skill-validator-{Guid.NewGuid():N}"); Directory.CreateDirectory(workDir); _workDirs.Add(workDir); @@ -319,6 +336,13 @@ private static async Task SetupWorkDir(EvalScenario scenario, string? 
sk { try { + if (DockerCopilotServer.Instance is {} dockerServer) + { + var containerDir = dockerServer.MapHostPathToContainer(workDir); + await dockerServer.ExecAsync(containerDir, cmd); + continue; + } + var psi = new ProcessStartInfo { FileName = OperatingSystem.IsWindows() ? "cmd.exe" : "/bin/sh", diff --git a/eng/skill-validator/src/Services/DockerCopilotServer.cs b/eng/skill-validator/src/Services/DockerCopilotServer.cs new file mode 100644 index 0000000000..1fb93abc80 --- /dev/null +++ b/eng/skill-validator/src/Services/DockerCopilotServer.cs @@ -0,0 +1,373 @@ +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Text.RegularExpressions; +using GitHub.Copilot.SDK; +using SkillValidator.Models; + +namespace SkillValidator.Services; + +public partial class DockerCopilotServer +{ + record ContainerState(int HostPort); + + public static DockerCopilotServer? Instance { get; private set; } + + public static void Initialize(bool verbose, IReadOnlyList skills) + { + Instance = Create(verbose, skills); + } + + internal static DockerCopilotServer Create(bool verbose, IReadOnlyList skills) + { + return new DockerCopilotServer(verbose, BuildSkillMounts(skills)); + } + + private const int InternalPort = 4321; + private const string ImageBaseName = "skill-validator-base"; + + private readonly string _invocationId = Guid.NewGuid().ToString("N")[..8]; + private readonly bool _verbose; + private readonly Lazy> _lazyStartTask; + + /// Host skill directory → container mount point (e.g. "/skills/dotnet"). + private readonly Dictionary _skillMounts; + + private ContainerState? _containerState; + private EventHandler? _processExitHandler; + private ConsoleCancelEventHandler? 
_cancelKeyPressHandler; + + private DockerCopilotServer(bool verbose, Dictionary skillMounts) + { + _verbose = verbose; + _skillMounts = skillMounts; + _lazyStartTask = new Lazy>(() => StartAsync()); + } + + internal static Dictionary BuildSkillMounts(IReadOnlyList skills) + { + var mounts = new Dictionary(StringComparer.OrdinalIgnoreCase); + var usedNames = new Dictionary(StringComparer.OrdinalIgnoreCase); + + // Mount the grandparent directory of each SKILL.md (i.e. the parent of skill.Path) + foreach (var skill in skills) + { + var fullPath = Path.GetFullPath(Path.GetDirectoryName(skill.Path)!); + if (mounts.ContainsKey(fullPath)) + continue; + var name = Path.GetFileName(fullPath); + if (usedNames.TryGetValue(name, out var count)) + { + usedNames[name] = count + 1; + name = $"{name}-{count}"; + } + else + { + usedNames[name] = 1; + } + mounts[fullPath] = $"/skills/{name}"; + } + return mounts; + } + + public string GetHostDir() => Path.Combine(Path.GetTempPath(), $"skill-validator-container-{_invocationId}"); + + private string GetContainerName() => $"skill-validator-{_invocationId}"; + + public async Task GetCliUrlAsync(CancellationToken ct = default) + { + var state = await GetOrStartContainerAsync(ct).ConfigureAwait(false); + return $"localhost:{state.HostPort}"; + } + + private void RegisterProcessExitHandler() + { + if (_processExitHandler is not null || _cancelKeyPressHandler is not null) + return; + + _processExitHandler = (_, _) => + { + try + { + StopAsync().GetAwaiter().GetResult(); + } + catch (Exception ex) + { + if (_verbose) + Console.Error.WriteLine($"🐳 Failed to stop container on process exit: {ex.Message}"); + } + }; + _cancelKeyPressHandler = (_, _) => + { + try + { + StopAsync().GetAwaiter().GetResult(); + } + catch (Exception ex) + { + if (_verbose) + Console.Error.WriteLine($"🐳 Failed to stop container on Ctrl+C: {ex.Message}"); + } + }; + + AppDomain.CurrentDomain.ProcessExit += _processExitHandler; + Console.CancelKeyPress += 
_cancelKeyPressHandler; + } + + private void UnregisterProcessExitHandler() + { + if (_processExitHandler is null && _cancelKeyPressHandler is null) + return; + + if (_processExitHandler is not null) + AppDomain.CurrentDomain.ProcessExit -= _processExitHandler; + if (_cancelKeyPressHandler is not null) + Console.CancelKeyPress -= _cancelKeyPressHandler; + _processExitHandler = null; + _cancelKeyPressHandler = null; + } + + public async Task StopAsync(CancellationToken ct = default) + { + var containerName = GetContainerName(); + try + { + if (_containerState is null) + return; + + try + { + await RunDockerCommandAsync(["stop", containerName], ct); + } + catch { /* container may already be stopped */ } + + try + { + await RunDockerCommandAsync(["rm", containerName], ct); + } + catch { /* container may already be removed */ } + + _containerState = null; + + if (_verbose) + Console.Error.WriteLine($"🐳 Container {containerName} stopped and removed."); + } + finally + { + UnregisterProcessExitHandler(); + } + } + + public string MapHostPathToContainer(string hostPath) + { + var fullPath = Path.GetFullPath(hostPath); + + // Check work dir mount + var hostDir = GetHostDir(); + if (fullPath.StartsWith(hostDir, StringComparison.OrdinalIgnoreCase)) + { + var relativePath = Path.GetRelativePath(hostDir, fullPath); + return Path.Combine("/work", relativePath).Replace("\\", "/"); + } + + // Check skill dir mounts + foreach (var (hostSkillDir, containerMount) in _skillMounts) + { + if (fullPath.StartsWith(hostSkillDir, StringComparison.OrdinalIgnoreCase)) + { + var relativePath = Path.GetRelativePath(hostSkillDir, fullPath); + return Path.Combine(containerMount, relativePath).Replace("\\", "/"); + } + } + + throw new ArgumentException($"Host path is not mapped into the container: {hostPath}"); + } + + public bool TryMapContainerPathToHost(string containerPath, [NotNullWhen(true)] out string? 
hostPath) + { + if (containerPath.StartsWith("/work/") || containerPath == "/work") + { + var relativePath = containerPath == "/work" ? "." : containerPath["/work/".Length..]; + hostPath = Path.GetFullPath(Path.Combine(GetHostDir(), relativePath)); + return true; + } + + foreach (var (hostSkillDir, containerMount) in _skillMounts) + { + var prefix = containerMount + "/"; + if (containerPath.StartsWith(prefix) || containerPath == containerMount) + { + var relativePath = containerPath == containerMount ? "." : containerPath[prefix.Length..]; + hostPath = Path.GetFullPath(Path.Combine(hostSkillDir, relativePath)); + return true; + } + } + + hostPath = null; + return false; + } + + public async Task ExecAsync(string workDir, string command, CancellationToken ct = default) + { + _ = await GetOrStartContainerAsync(ct); + + await RunDockerCommandAsync(["exec", "--workdir", workDir, GetContainerName(), "/bin/sh", "-c", command], ct); // docker exec only parses options that precede the container name + } + + private async Task GetOrStartContainerAsync(CancellationToken ct = default) + { + if (_containerState is not null) + return _containerState; + + _containerState = await _lazyStartTask.Value.WaitAsync(ct).ConfigureAwait(false); + return _containerState; + } + + private async Task StartAsync(CancellationToken ct = default) + { + var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN") + ?? throw new InvalidOperationException("GITHUB_TOKEN environment variable is required when running in Docker. 
You can get it with 'gh auth token'."); + + if (_verbose) + Console.Error.WriteLine("🐳 Building Docker image ..."); + + var sdkVersion = GetCopilotSdkVersion(); + var imageName = $"{ImageBaseName}:{sdkVersion}"; + var dockerFilePath = Path.Combine(AppContext.BaseDirectory, "docker", "Dockerfile"); + + await RunDockerCommandAsync( + ["build", "-t", imageName, "--build-arg", $"COPILOT_SDK_VERSION={sdkVersion}", "-f", dockerFilePath, "."], ct); + + if (_verbose) + Console.Error.WriteLine("🐳 Docker image built successfully."); + + var containerName = GetContainerName(); + + if (_verbose) + Console.Error.WriteLine($"🐳 Starting container {containerName}..."); + + var runArgs = new List + { + "run", + "--name", containerName, + "-p", $"0:{InternalPort}", // Map internal port to a random host port + "-e", $"GITHUB_TOKEN={githubToken}", + "-v", $"{GetHostDir()}:/work", // Mount host dir to /work in container + }; + + // Mount skill directories as read-only volumes + foreach (var (hostDir, containerMount) in _skillMounts) + runArgs.AddRange(["-v", $"{hostDir}:{containerMount}:ro"]); + + runArgs.AddRange([ + imageName, + // Start the Copilot server in headless mode, listening on the internal port, and using the GITHUB_TOKEN from env + "copilot", + "--headless", + "--port", InternalPort.ToString(), + "--auth-token-env", "GITHUB_TOKEN", + "--no-auto-login", + "--no-auto-update", + "--log-level", (_verbose ? "info" : "none") + ]); + + using var process = StartNonDetached(runArgs); + + using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct); + cts.CancelAfter(TimeSpan.FromSeconds(30)); + + try + { + while (!cts.Token.IsCancellationRequested) + { + var line = await process.StandardOutput.ReadLineAsync(cts.Token); + if (line is null) + { + var stderr = await process.StandardError.ReadToEndAsync(ct); + throw new InvalidOperationException( + $"Container {containerName} exited before becoming ready. 
stderr: {stderr}"); + } + + var match = ListeningPattern().Match(line); + if (match.Success) + break; + } + + if (cts.Token.IsCancellationRequested) + throw new TimeoutException($"Container {containerName} did not become ready within 30s."); + } + finally + { + if (!process.HasExited) + { + process.Kill(); + await process.WaitForExitAsync(); + } + } + + var output = await RunDockerCommandAsync(["port", GetContainerName(), InternalPort.ToString()], ct); + var portMatch = PortPattern().Match(output); + if (!portMatch.Success) + throw new InvalidOperationException($"Could not parse port mapping from: {output}"); + + var port = int.Parse(portMatch.Groups[1].Value); + + if (_verbose) + Console.Error.WriteLine($"🐳 Container {containerName} ready (port {port})"); + + RegisterProcessExitHandler(); + + return new ContainerState(port); + } + + private static async Task RunDockerCommandAsync(IReadOnlyList args, CancellationToken ct = default) + { + using var proc = StartNonDetached(args); + + var stdoutTask = proc.StandardOutput.ReadToEndAsync(ct); + var stderrTask = proc.StandardError.ReadToEndAsync(ct); + await proc.WaitForExitAsync(ct); + var stdout = await stdoutTask; + var stderr = await stderrTask; + + if (proc.ExitCode != 0) + { + var output = string.IsNullOrWhiteSpace(stderr) ? stdout : stderr; + throw new InvalidOperationException( + $"docker {args[0]} failed (exit {proc.ExitCode}): {output.Trim()}"); + } + + return stdout.Trim(); + } + + private static Process StartNonDetached(IReadOnlyList args) + { + var psi = new ProcessStartInfo("docker", args) + { + RedirectStandardOutput = true, + RedirectStandardError = true, + UseShellExecute = false, + CreateNoWindow = true, + }; + + return Process.Start(psi) + ?? throw new InvalidOperationException("Failed to start docker run process"); + } + + internal static string GetCopilotSdkVersion() + { + var attr = typeof(CopilotClient).Assembly + .GetCustomAttribute(); + var version = attr?.InformationalVersion + ?? 
throw new InvalidOperationException("Could not determine GitHub.Copilot.SDK version from assembly."); + // Strip the commit hash suffix (e.g. "0.1.26+abc123" → "0.1.26") + var plusIndex = version.IndexOf('+'); + return plusIndex >= 0 ? version[..plusIndex] : version; + } + + [GeneratedRegex(@"listening on port (\d+)", RegexOptions.IgnoreCase)] + private static partial Regex ListeningPattern(); + + [GeneratedRegex(@":(\d+)$", RegexOptions.Multiline)] + private static partial Regex PortPattern(); +} diff --git a/eng/skill-validator/src/Services/Judge.cs b/eng/skill-validator/src/Services/Judge.cs index 53596143f8..916fbbe8b2 100644 --- a/eng/skill-validator/src/Services/Judge.cs +++ b/eng/skill-validator/src/Services/Judge.cs @@ -50,11 +50,13 @@ private static async Task JudgeRunOnce( { var client = await AgentRunner.GetSharedClient(options.Verbose); + var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(options.WorkDir) ?? options.WorkDir; + await using var session = await client.CreateSessionAsync(new SessionConfig { Model = options.Model, Streaming = true, - WorkingDirectory = options.WorkDir, + WorkingDirectory = sessionWorkDir, SystemMessage = new SystemMessageConfig { Mode = SystemMessageMode.Replace, diff --git a/eng/skill-validator/src/Services/PairwiseJudge.cs b/eng/skill-validator/src/Services/PairwiseJudge.cs index c41973492f..37eb1d089d 100644 --- a/eng/skill-validator/src/Services/PairwiseJudge.cs +++ b/eng/skill-validator/src/Services/PairwiseJudge.cs @@ -86,11 +86,13 @@ private static async Task JudgeCall( var client = await AgentRunner.GetSharedClient(options.Verbose); var rubric = scenario.Rubric ?? []; + var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(options.WorkDir) ?? 
options.WorkDir; + await using var session = await client.CreateSessionAsync(new SessionConfig { Model = options.Model, Streaming = true, - WorkingDirectory = options.WorkDir, + WorkingDirectory = sessionWorkDir, SystemMessage = new SystemMessageConfig { Mode = SystemMessageMode.Replace, diff --git a/eng/skill-validator/src/SkillValidator.csproj b/eng/skill-validator/src/SkillValidator.csproj index 914507c18b..f65b684d8f 100644 --- a/eng/skill-validator/src/SkillValidator.csproj +++ b/eng/skill-validator/src/SkillValidator.csproj @@ -23,4 +23,8 @@ + + + + diff --git a/eng/skill-validator/tests/DockerCopilotServerTests.cs b/eng/skill-validator/tests/DockerCopilotServerTests.cs new file mode 100644 index 0000000000..ff7464f1a5 --- /dev/null +++ b/eng/skill-validator/tests/DockerCopilotServerTests.cs @@ -0,0 +1,245 @@ +using SkillValidator.Models; +using SkillValidator.Services; + +namespace SkillValidator.Tests; + +public class BuildSkillMountsTests +{ + private static SkillInfo MakeSkill(string path) => new( + Name: Path.GetFileName(path), + Description: "test", + Path: path, + SkillMdPath: Path.Combine(path, "SKILL.md"), + SkillMdContent: "# Test", + EvalPath: null, + EvalConfig: null); + + [Fact] + public void MountsGrandparentOfSkillMd() + { + // skill.Path = /home/user/plugins/dotnet/dotnet-msbuild + // grandparent of SKILL.md = parent of skill.Path = /home/user/plugins/dotnet + var skills = new[] { MakeSkill("/home/user/plugins/dotnet/dotnet-msbuild") }; + var mounts = DockerCopilotServer.BuildSkillMounts(skills); + + var expected = Path.GetFullPath("/home/user/plugins/dotnet"); + Assert.Single(mounts); + Assert.True(mounts.ContainsKey(expected)); + Assert.Equal("/skills/dotnet", mounts[expected]); + } + + [Fact] + public void DeduplicatesSkillsInSameParentDirectory() + { + var skills = new[] + { + MakeSkill("/home/user/plugins/dotnet/skill-a"), + MakeSkill("/home/user/plugins/dotnet/skill-b"), + }; + var mounts = DockerCopilotServer.BuildSkillMounts(skills); + 
+ Assert.Single(mounts); + } + + [Fact] + public void HandlesNameCollisionsWithIncrementingSuffix() + { + // Two different parent dirs both named "plugins" + var skills = new[] + { + MakeSkill("/home/user/area1/plugins/skill-a"), + MakeSkill("/home/user/area2/plugins/skill-b"), + }; + var mounts = DockerCopilotServer.BuildSkillMounts(skills); + + Assert.Equal(2, mounts.Count); + var containerPaths = mounts.Values.OrderBy(v => v).ToList(); + Assert.Equal("/skills/plugins", containerPaths[0]); + Assert.Equal("/skills/plugins-1", containerPaths[1]); + } + + [Fact] + public void MultipleDistinctParentsGetSeparateMounts() + { + var skills = new[] + { + MakeSkill("/home/user/plugins/dotnet/skill-a"), + MakeSkill("/home/user/plugins/python/skill-b"), + }; + var mounts = DockerCopilotServer.BuildSkillMounts(skills); + + Assert.Equal(2, mounts.Count); + Assert.Contains(mounts.Values, v => v == "/skills/dotnet"); + Assert.Contains(mounts.Values, v => v == "/skills/python"); + } + + [Fact] + public void EmptySkillListProducesEmptyMounts() + { + var mounts = DockerCopilotServer.BuildSkillMounts([]); + Assert.Empty(mounts); + } +} + +public class MapHostPathToContainerTests +{ + [Fact] + public void MapsWorkDirPath() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + var hostDir = server.GetHostDir(); + Directory.CreateDirectory(hostDir); + + try + { + var subPath = Path.Combine(hostDir, "skill-validator-abc123"); + var result = server.MapHostPathToContainer(subPath); + Assert.Equal("/work/skill-validator-abc123", result); + } + finally + { + Directory.Delete(hostDir, true); + } + } + + [Fact] + public void MapsWorkDirNestedPath() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + var hostDir = server.GetHostDir(); + Directory.CreateDirectory(hostDir); + + try + { + var subPath = Path.Combine(hostDir, "run1", "subdir", "file.txt"); + var result = server.MapHostPathToContainer(subPath); + 
Assert.Equal("/work/run1/subdir/file.txt", result); + } + finally + { + Directory.Delete(hostDir, true); + } + } + + [Fact] + public void MapsSkillDirPath() + { + // Create a real temp directory to use as a skill path + var tempParent = Path.Combine(Path.GetTempPath(), $"test-skills-{Guid.NewGuid():N}"); + var skillDir = Path.Combine(tempParent, "dotnet", "dotnet-msbuild"); + Directory.CreateDirectory(skillDir); + + try + { + var skill = new SkillInfo( + Name: "dotnet-msbuild", + Description: "test", + Path: skillDir, + SkillMdPath: Path.Combine(skillDir, "SKILL.md"), + SkillMdContent: "# Test", + EvalPath: null, + EvalConfig: null); + + var server = DockerCopilotServer.Create(verbose: false, skills: [skill]); + var fullParent = Path.GetFullPath(tempParent); + + // Map a path inside the skill's parent directory + var result = server.MapHostPathToContainer(Path.Combine(fullParent, "dotnet", "dotnet-msbuild", "SKILL.md")); + Assert.StartsWith("/skills/", result); + Assert.EndsWith("/dotnet-msbuild/SKILL.md", result); + } + finally + { + Directory.Delete(tempParent, true); + } + } + + [Fact] + public void ThrowsForUnmappedPath() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + Assert.Throws(() => + server.MapHostPathToContainer("/some/random/path")); + } +} + +public class TryMapContainerPathToHostTests +{ + [Fact] + public void MapsWorkPathToHost() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + var hostDir = server.GetHostDir(); + + Assert.True(server.TryMapContainerPathToHost("/work/run1/file.txt", out var hostPath)); + Assert.Equal(Path.Combine(hostDir, "run1", "file.txt"), hostPath); + } + + [Fact] + public void MapsWorkRootToHost() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + Assert.True(server.TryMapContainerPathToHost("/work", out var hostPath)); + Assert.NotNull(hostPath); + } + + [Fact] + public void MapsSkillPathToHost() + { + var tempParent = 
Path.Combine(Path.GetTempPath(), $"test-skills-{Guid.NewGuid():N}"); + var skillDir = Path.Combine(tempParent, "dotnet", "my-skill"); + Directory.CreateDirectory(skillDir); + + try + { + var skill = new SkillInfo( + Name: "my-skill", + Description: "test", + Path: skillDir, + SkillMdPath: Path.Combine(skillDir, "SKILL.md"), + SkillMdContent: "# Test", + EvalPath: null, + EvalConfig: null); + + var server = DockerCopilotServer.Create(verbose: false, skills: [skill]); + + // The mount is the parent of skill.Path (/skills/dotnet → tempParent/dotnet) + var mounts = DockerCopilotServer.BuildSkillMounts([skill]); + var containerMount = mounts.Values.First(); + + Assert.True(server.TryMapContainerPathToHost($"{containerMount}/my-skill/SKILL.md", out var hostPath)); + Assert.EndsWith(Path.Combine("my-skill", "SKILL.md"), hostPath); + } + finally + { + Directory.Delete(tempParent, true); + } + } + + [Fact] + public void ReturnsFalseForUnmappedPath() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + Assert.False(server.TryMapContainerPathToHost("/tmp/something", out _)); + } + + [Fact] + public void ReturnsFalseForPartialWorkPrefix() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + Assert.False(server.TryMapContainerPathToHost("/workspace/file.txt", out _)); + } +} + +public class GetCopilotSdkVersionTests +{ + [Fact] + public void ReturnsSemverWithoutCommitHash() + { + var version = DockerCopilotServer.GetCopilotSdkVersion(); + + // Should be something like "0.1.26", not "0.1.26+abc123" + Assert.DoesNotContain("+", version); + Assert.Matches(@"^\d+\.\d+\.\d+", version); + } +} From c842f7b0bbcf91b6deaf8b7a265d3026e2876a79 Mon Sep 17 00:00:00 2001 From: Cameron Aavik <99771732+caaavik-msft@users.noreply.github.com> Date: Fri, 6 Mar 2026 15:43:44 +1000 Subject: [PATCH 2/6] Address PR comments --- .../src/Services/DockerCopilotServer.cs | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git 
a/eng/skill-validator/src/Services/DockerCopilotServer.cs b/eng/skill-validator/src/Services/DockerCopilotServer.cs index 1fb93abc80..942ed8fe7d 100644 --- a/eng/skill-validator/src/Services/DockerCopilotServer.cs +++ b/eng/skill-validator/src/Services/DockerCopilotServer.cs @@ -164,18 +164,22 @@ public string MapHostPathToContainer(string hostPath) // Check work dir mount var hostDir = GetHostDir(); - if (fullPath.StartsWith(hostDir, StringComparison.OrdinalIgnoreCase)) + var relativeToWork = Path.GetRelativePath(hostDir, fullPath); + if (!Path.IsPathRooted(relativeToWork) && + !relativeToWork.StartsWith(".." + Path.DirectorySeparatorChar, StringComparison.Ordinal) && + relativeToWork != "..") { - var relativePath = Path.GetRelativePath(hostDir, fullPath); - return Path.Combine("/work", relativePath).Replace("\\", "/"); + return Path.Combine("/work", relativeToWork).Replace("\\", "/"); } // Check skill dir mounts foreach (var (hostSkillDir, containerMount) in _skillMounts) { - if (fullPath.StartsWith(hostSkillDir, StringComparison.OrdinalIgnoreCase)) + var relativePath = Path.GetRelativePath(hostSkillDir, fullPath); + if (!Path.IsPathRooted(relativePath) && + !relativePath.StartsWith(".." + Path.DirectorySeparatorChar, StringComparison.Ordinal) && + relativePath != "..") { - var relativePath = Path.GetRelativePath(hostSkillDir, fullPath); return Path.Combine(containerMount, relativePath).Replace("\\", "/"); } } @@ -225,18 +229,18 @@ private async Task GetOrStartContainerAsync(CancellationToken ct private async Task StartAsync(CancellationToken ct = default) { - var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN") - ?? throw new InvalidOperationException("GITHUB_TOKEN environment variable is required when running in Docker. You can get it with 'gh auth token'."); + if (string.IsNullOrEmpty(Environment.GetEnvironmentVariable("GITHUB_TOKEN"))) + throw new InvalidOperationException("GITHUB_TOKEN environment variable is required when running in Docker. 
You can get it with 'gh auth token'."); if (_verbose) Console.Error.WriteLine("🐳 Building Docker image ..."); var sdkVersion = GetCopilotSdkVersion(); var imageName = $"{ImageBaseName}:{sdkVersion}"; - var dockerFilePath = Path.Combine(AppContext.BaseDirectory, "docker", "Dockerfile"); + var dockerFilePath = Path.Combine(AppContext.BaseDirectory, "Docker", "Dockerfile"); await RunDockerCommandAsync( - ["build", "-t", imageName, "--build-arg", $"COPILOT_SDK_VERSION={sdkVersion}", "-f", dockerFilePath, "."], ct); + ["build", "-t", imageName, "--build-arg", $"COPILOT_SDK_VERSION={sdkVersion}", "-f", dockerFilePath, Path.GetDirectoryName(dockerFilePath)!], ct); if (_verbose) Console.Error.WriteLine("🐳 Docker image built successfully."); @@ -246,18 +250,21 @@ await RunDockerCommandAsync( if (_verbose) Console.Error.WriteLine($"🐳 Starting container {containerName}..."); + var hostDir = GetHostDir(); + Directory.CreateDirectory(hostDir); + var runArgs = new List { "run", "--name", containerName, "-p", $"0:{InternalPort}", // Map internal port to a random host port - "-e", $"GITHUB_TOKEN={githubToken}", - "-v", $"{GetHostDir()}:/work", // Mount host dir to /work in container + "-e", "GITHUB_TOKEN", + "-v", $"{hostDir}:/work", // Mount host dir to /work in container }; // Mount skill directories as read-only volumes - foreach (var (hostDir, containerMount) in _skillMounts) - runArgs.AddRange(["-v", $"{hostDir}:{containerMount}:ro"]); + foreach (var (hostSkillDir, containerMount) in _skillMounts) + runArgs.AddRange(["-v", $"{hostSkillDir}:{containerMount}:ro"]); runArgs.AddRange([ imageName, From bba74a644cabc7b9c5f7c5198cd03b539ab5ff35 Mon Sep 17 00:00:00 2001 From: Cameron Aavik <99771732+caaavik-msft@users.noreply.github.com> Date: Fri, 6 Mar 2026 16:58:45 +1000 Subject: [PATCH 3/6] Change temp dir prefix for docker container host dir --- eng/skill-validator/src/Services/DockerCopilotServer.cs | 2 +- eng/skill-validator/tests/DockerCopilotServerTests.cs | 4 ++-- 2 files 
changed, 3 insertions(+), 3 deletions(-) diff --git a/eng/skill-validator/src/Services/DockerCopilotServer.cs b/eng/skill-validator/src/Services/DockerCopilotServer.cs index 942ed8fe7d..685b73d29c 100644 --- a/eng/skill-validator/src/Services/DockerCopilotServer.cs +++ b/eng/skill-validator/src/Services/DockerCopilotServer.cs @@ -70,7 +70,7 @@ internal static Dictionary BuildSkillMounts(IReadOnlyList Path.Combine(Path.GetTempPath(), $"skill-validator-container-{_invocationId}"); + public string GetHostDir() => Path.Combine(Path.GetTempPath(), $"sv-container-{_invocationId}"); private string GetContainerName() => $"skill-validator-{_invocationId}"; diff --git a/eng/skill-validator/tests/DockerCopilotServerTests.cs b/eng/skill-validator/tests/DockerCopilotServerTests.cs index ff7464f1a5..381eb962dd 100644 --- a/eng/skill-validator/tests/DockerCopilotServerTests.cs +++ b/eng/skill-validator/tests/DockerCopilotServerTests.cs @@ -92,9 +92,9 @@ public void MapsWorkDirPath() try { - var subPath = Path.Combine(hostDir, "skill-validator-abc123"); + var subPath = Path.Combine(hostDir, "sv-abc123"); var result = server.MapHostPathToContainer(subPath); - Assert.Equal("/work/skill-validator-abc123", result); + Assert.Equal("/work/sv-abc123", result); } finally { From 659a68fd63ff1c099009db57fa4668c3ce3eeff7 Mon Sep 17 00:00:00 2001 From: Cameron Aavik <99771732+caaavik-msft@users.noreply.github.com> Date: Fri, 6 Mar 2026 19:35:03 +1000 Subject: [PATCH 4/6] Extract path mapping logic to helper function --- .../src/Services/DockerCopilotServer.cs | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/eng/skill-validator/src/Services/DockerCopilotServer.cs b/eng/skill-validator/src/Services/DockerCopilotServer.cs index 685b73d29c..d22bd78526 100644 --- a/eng/skill-validator/src/Services/DockerCopilotServer.cs +++ b/eng/skill-validator/src/Services/DockerCopilotServer.cs @@ -163,25 +163,14 @@ public string MapHostPathToContainer(string hostPath) var 
fullPath = Path.GetFullPath(hostPath); // Check work dir mount - var hostDir = GetHostDir(); - var relativeToWork = Path.GetRelativePath(hostDir, fullPath); - if (!Path.IsPathRooted(relativeToWork) && - !relativeToWork.StartsWith(".." + Path.DirectorySeparatorChar, StringComparison.Ordinal) && - relativeToWork != "..") - { - return Path.Combine("/work", relativeToWork).Replace("\\", "/"); - } + if (TryMapToContainerMount(fullPath, GetHostDir(), "/work", out var workResult)) + return workResult; // Check skill dir mounts foreach (var (hostSkillDir, containerMount) in _skillMounts) { - var relativePath = Path.GetRelativePath(hostSkillDir, fullPath); - if (!Path.IsPathRooted(relativePath) && - !relativePath.StartsWith(".." + Path.DirectorySeparatorChar, StringComparison.Ordinal) && - relativePath != "..") - { - return Path.Combine(containerMount, relativePath).Replace("\\", "/"); - } + if (TryMapToContainerMount(fullPath, hostSkillDir, containerMount, out var skillResult)) + return skillResult; } throw new ArgumentException($"Host path is not mapped into the container: {hostPath}"); @@ -361,6 +350,25 @@ private static Process StartNonDetached(IReadOnlyList args) ?? throw new InvalidOperationException("Failed to start docker run process"); } + private static bool TryMapToContainerMount( + string fullPath, + string hostDir, + string containerMount, + [NotNullWhen(true)] out string? containerPath) + { + var relativePath = Path.GetRelativePath(hostDir, fullPath); + if (!Path.IsPathRooted(relativePath) && + !relativePath.StartsWith(".." 
+ Path.DirectorySeparatorChar, StringComparison.Ordinal) && + relativePath != "..") + { + containerPath = Path.Combine(containerMount, relativePath).Replace("\\", "/"); + return true; + } + + containerPath = null; + return false; + } + internal static string GetCopilotSdkVersion() { var attr = typeof(CopilotClient).Assembly From f547651b20d40947d01fc178d7188e8a5a3bf066 Mon Sep 17 00:00:00 2001 From: Cameron Aavik <99771732+caaavik-msft@users.noreply.github.com> Date: Fri, 6 Mar 2026 19:46:04 +1000 Subject: [PATCH 5/6] Re-add newline at end of README.md that disappeared in merge. --- eng/skill-validator/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/skill-validator/README.md b/eng/skill-validator/README.md index a0d7f3eac5..192b021838 100644 --- a/eng/skill-validator/README.md +++ b/eng/skill-validator/README.md @@ -304,4 +304,4 @@ Results include bootstrap confidence intervals computed across individual runs. - **not significant**: the CI crosses zero — could be noise - **g=**: normalized gain, controlling for ceiling effects (a skill improving a strong baseline is harder than improving a weak one) -The default of 5 runs provides sufficient precision for significance testing (validated by [SkillsBench](https://arxiv.org/abs/2602.12670)). \ No newline at end of file +The default of 5 runs provides sufficient precision for significance testing (validated by [SkillsBench](https://arxiv.org/abs/2602.12670)). 
From 40292f8db9dee8c5ec84c75eb4602ea23fd85d8c Mon Sep 17 00:00:00 2001 From: Cameron Aavik <99771732+caaavik-msft@users.noreply.github.com> Date: Sat, 7 Mar 2026 05:42:00 +1000 Subject: [PATCH 6/6] Fix docker exec command --- eng/skill-validator/src/Services/DockerCopilotServer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eng/skill-validator/src/Services/DockerCopilotServer.cs b/eng/skill-validator/src/Services/DockerCopilotServer.cs index d22bd78526..14a51e5784 100644 --- a/eng/skill-validator/src/Services/DockerCopilotServer.cs +++ b/eng/skill-validator/src/Services/DockerCopilotServer.cs @@ -204,7 +204,7 @@ public async Task ExecAsync(string workDir, string command, CancellationToken ct { _ = await GetOrStartContainerAsync(ct); - await RunDockerCommandAsync(["exec", GetContainerName(), "--workdir", workDir, "/bin/sh", "-c", command], ct); + await RunDockerCommandAsync(["exec", "--workdir", workDir, GetContainerName(), "/bin/sh", "-c", command], ct); } private async Task GetOrStartContainerAsync(CancellationToken ct = default)