From 6e99797ecbe385da7454fe14d484f42c54eef1fd Mon Sep 17 00:00:00 2001 From: "Ashrya Agrawal (from Dev Box)" Date: Thu, 18 Jun 2026 13:45:39 -0700 Subject: [PATCH 1/2] feat(evaluate): add Azure OpenAI judge (--eval-engine azure-openai) Score each MCP tool-schema checklist item with your own Azure OpenAI deployment via the OpenAI Responses API, authenticated with Microsoft Entra ID (DefaultAzureCredential; API-key auth is not supported). - Check-by-check: each assertion is one model call at temperature 0 with the full tool schema as context, fanned out in parallel (A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY, default 100). Avoids the output truncation the whole-tool approach hit on large tools. - Single-flight Entra token cache so high concurrency does not storm `az`. - Checklist checkpointed each round; a re-run resumes only unscored checks. - Explicit-only engine (never auto-selected); fixed retry of 3 with backoff. - New --report-name overrides the host-derived output/report name (needed when multiple servers share one gateway host). - OpenAI pinned to 2.8.0 to keep System.Text.Json and System.Diagnostics.DiagnosticSource at 9.0.8 (no shared-dependency bump). - MCP discovery now tracks Mcp-Session-Id for session-required servers. - Docs (evaluate instructions + FAQ), CHANGELOG, and tests included. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 2 + .../a365-evaluate-faq.md | 4 +- .../a365-evaluate-instructions.md | 36 +- src/Directory.Packages.props | 5 + .../Commands/DevelopMcpCommand.cs | 21 +- .../Microsoft.Agents.A365.DevTools.Cli.csproj | 3 + .../Models/Evaluate/EvaluateEnums.cs | 1 + .../Program.cs | 3 + .../Services/Evaluate/AzureOpenAiLauncher.cs | 431 ++++++++++++++++++ .../Services/Evaluate/ChecklistEvaluator.cs | 279 +++++++++--- .../Evaluate/CodingAgentLauncherBase.cs | 19 + .../Services/Evaluate/EvalModelConstants.cs | 54 +++ .../Evaluate/EvaluationPipelineService.cs | 23 +- .../Services/Evaluate/ICodingAgentLauncher.cs | 40 ++ .../Evaluate/IEvaluationPipelineService.cs | 5 +- .../Evaluate/SchemaDiscoveryService.cs | 19 + .../Services/Evaluate/SemanticCheckPrompts.cs | 52 +++ .../EvaluateCommandInvocationTests.cs | 48 +- .../AzureOpenAiLauncherAvailabilityTests.cs | 130 ++++++ .../Evaluate/AzureOpenAiLauncherTests.cs | 233 ++++++++++ .../EvaluationPipelineServiceTests.cs | 105 +++++ 21 files changed, 1436 insertions(+), 77 deletions(-) create mode 100644 src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs create mode 100644 src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs create mode 100644 src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherTests.cs diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c9615c2..c6015a2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ Agents provisioned before this release need `Agent365.Observability.OtelWrite` g - Log separator written at the start of each CLI invocation now redacts values for secret-bearing options (e.g. `--idp-client-secret`) so they are not written to the log file in plain text. - Authentication context (tenant and user) is now logged at the `Information` level whenever the resolved sign-in identity changes, giving operators a clear audit trail in the log file of who the CLI is acting as, without exposing credentials. - `a365 develop-mcp evaluate` command for evaluating MCP server tool schema quality — runs deterministic and semantic checks (via GitHub Copilot or Claude Code CLIs), computes maturity scoring, and generates an interactive HTML report +- `--eval-engine azure-openai` on `develop-mcp evaluate` — scores each checklist item with your own Azure OpenAI deployment instead of a local coding agent. Authenticates with Microsoft Entra ID only (`DefaultAzureCredential`; API-key auth is not supported); configured via `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` (optional `A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY`, default 100). Each assertion is judged independently at temperature 0, in parallel, with the full tool schema as context. +- `--report-name` on `develop-mcp evaluate` — overrides the server name used for the output file names and the report header (derived from the URL host by default); needed when multiple servers share one gateway host so their reports don't collide. - `--skip-sp-provisioning` option on `setup all` — skips the interactive in-line provisioning of missing resource service principals (issue #429). Default: setup prompts per-resource and runs `az ad sp create --id ` using the operator's `az login`. With this flag, missing SPs are excluded from the consent URL and listed in the Action Required block with the `az` command and a per-SP consent URL. Implicitly enabled when stdin is redirected (CI / pipe scenarios). - Action Required block in the `setup all` summary now lists missing service principals — each entry shows the resource, pending scopes, the `az ad sp create` command, and the per-SP consent URL needed to complete provisioning. - `logs export [command] [--output ]` — exports a redacted copy of a CLI diagnostic log safe to share with Microsoft support. Redacts JWT tokens, email addresses, OS-path usernames, and tenant-specific GUIDs; replaces identical values with consistent aliases so log correlation is preserved. Preserves diagnostic IDs that aren't sensitive but are useful for debugging — `TraceId`, `CorrelationId`, Microsoft Graph `request-id` and `client-request-id` values, and well-known public Microsoft / Agent 365 resource appIds (such as the Microsoft Graph appId `00000003-0000-0000-c000-000000000000`). Omit `[command]` to export all available logs at once. diff --git a/docs/agent365-guided-setup/a365-evaluate-faq.md b/docs/agent365-guided-setup/a365-evaluate-faq.md index 1b7bbc1f..e637b664 100644 --- a/docs/agent365-guided-setup/a365-evaluate-faq.md +++ b/docs/agent365-guided-setup/a365-evaluate-faq.md @@ -37,7 +37,9 @@ Microsoft's role is supplying the `a365` CLI that orchestrates steps 1–2. Step By default the evaluation uses **Claude Haiku 4.5**, invoked through whichever coding agent CLI you have installed (GitHub Copilot CLI or Claude Code CLI). You can override the engine with `--eval-engine github-copilot` or `--eval-engine claude-code`. -To run a **different model** without waiting for a CLI update, set an environment variable before the run: `A365_EVAL_COPILOT_MODEL` for GitHub Copilot (needs an exact model ID, e.g. `claude-haiku-4.5`) or `A365_EVAL_CLAUDE_MODEL` for Claude Code (accepts an alias, e.g. `haiku`). +You can also score with **your own Azure OpenAI deployment** via `--eval-engine azure-openai`. Set `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`, and sign in to Entra ID (e.g. `az login`). This engine authenticates with **Microsoft Entra ID only** (`DefaultAzureCredential`) — **API-key authentication is not supported**. Unlike the coding-agent engines it needs no local CLI: the `a365` CLI calls your Azure OpenAI deployment directly, under your own Azure subscription, billing, and access policies. + +To run a **different model** without waiting for a CLI update, set an environment variable before the run: `A365_EVAL_COPILOT_MODEL` for GitHub Copilot (needs an exact model ID, e.g. `claude-haiku-4.5`) or `A365_EVAL_CLAUDE_MODEL` for Claude Code (accepts an alias, e.g. `haiku`). For Azure OpenAI, the model is whatever you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`. The CLI that runs the model is **yours**, installed from npm by you, authenticated with your credentials. Microsoft does not host or resell the model API call — it is made directly by your CLI to the AI provider under your own terms of service and billing. The `a365` CLI specifies the model flag but does not mediate the API call. diff --git a/docs/agent365-guided-setup/a365-evaluate-instructions.md b/docs/agent365-guided-setup/a365-evaluate-instructions.md index 6388be9f..4de1cb3f 100644 --- a/docs/agent365-guided-setup/a365-evaluate-instructions.md +++ b/docs/agent365-guided-setup/a365-evaluate-instructions.md @@ -32,15 +32,17 @@ Wait for the answer. Store as `needsAuth`: 1. Auto — try GitHub Copilot first, fall back to Claude Code 2. GitHub Copilot only 3. Claude Code only -4. None — generate the checklist and let me (or my own LLM) score it manually (bring-your-own-LLM) +4. Azure OpenAI — score with your own Azure OpenAI deployment via Entra ID (no local coding agent; typically the fastest option) +5. None — generate the checklist and let me (or my own LLM) score it manually (bring-your-own-LLM) Wait for the answer. Store as `evalEngine`: - If **1 (Auto)**: `evalEngine = "auto"` - If **2 (GitHub Copilot)**: `evalEngine = "github-copilot"` - If **3 (Claude Code)**: `evalEngine = "claude-code"` -- If **4 (None)**: `evalEngine = "none"` +- If **4 (Azure OpenAI)**: `evalEngine = "azure-openai"` — also confirm the user has set `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` and is signed in to Entra ID (see Step 2). +- If **5 (None)**: `evalEngine = "none"` -> **Note:** Auto and the named engines require the corresponding CLI to be installed on the workstation (`copilot` or `claude`). If neither is installed and `evalEngine` is `auto` or a named engine, the pipeline will stop after writing the checklist and print BYO-LLM instructions — that is expected, not an error. +> **Note:** The local engines (`auto`, `github-copilot`, `claude-code`) require the corresponding CLI on the workstation (`copilot` or `claude`); if none is installed the pipeline stops after writing the checklist and prints BYO-LLM instructions — that is expected, not an error. The `azure-openai` engine needs **no local CLI** — instead it requires the `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` environment variables and an Entra ID sign-in (e.g. `az login`); see Step 2. After all three questions are answered, create all todos for the path and mark Todo 1 in-progress: @@ -133,6 +135,27 @@ Tell the user one of the following is required, and that without one of them the - Install Claude Code (see above), or - Switch to `evalEngine = "none"` and score the checklist with their own LLM after the run. +### If `evalEngine = "azure-openai"` + +No local coding agent is used — the `a365` CLI calls your Azure OpenAI deployment directly. Verify two things: + +1. **Endpoint and deployment are configured.** The CLI reads these from the environment (they are not flags): + +```bash +echo "$A365_EVAL_AZURE_OPENAI_ENDPOINT" # e.g. https://.services.ai.azure.com/openai/v1 +echo "$A365_EVAL_AZURE_OPENAI_DEPLOYMENT" # e.g. gpt-4.1 +``` + +If either is empty, ask the user for their Azure OpenAI endpoint (include the `/openai/v1` path) and deployment name, then set both variables. + +2. **The workstation is signed in to Entra ID.** The engine authenticates with `DefaultAzureCredential` and requests a token for `https://ai.azure.com/.default`. The simplest sign-in is: + +```bash +az login +``` + +> **Authentication — Microsoft Entra ID only.** The `azure-openai` engine supports **Entra ID authentication only**, via `DefaultAzureCredential` (Azure CLI sign-in, managed identity, a service principal supplied through environment variables, or Visual Studio / VS Code sign-in). **API-key authentication is NOT supported** — no key is read from the environment or any config file. The model call is made directly by the `a365` CLI to *your* Azure OpenAI deployment, under your Azure subscription, billing, and access policies. + ### Step 2 completion > Mark Todo 2 as **completed**. Mark Todo 3 as **in-progress**. Proceed to Step 3. @@ -188,8 +211,11 @@ These settings can come from environment variables instead of (or alongside) the | `A365_MCP_AUTH_TOKEN` | Bearer token for the MCP server, used when `--auth-token` is not passed. **Preferred over the flag** — it keeps the token out of process listings (`ps` / Task Manager) and shell history. If you pass `--auth-token`, the CLI prints a one-time warning recommending this variable. | | `A365_EVAL_COPILOT_MODEL` | Override the GitHub Copilot model (exact model ID, e.g. `claude-haiku-4.5`). | | `A365_EVAL_CLAUDE_MODEL` | Override the Claude Code model (alias, e.g. `haiku`). | +| `A365_EVAL_AZURE_OPENAI_ENDPOINT` | **Required for `--eval-engine azure-openai`.** Azure OpenAI endpoint, including the API path (e.g. `https://.services.ai.azure.com/openai/v1`). | +| `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` | **Required for `--eval-engine azure-openai`.** Deployment (model) name to score with (e.g. `gpt-4.1`). | +| `A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY` | Parallel check-scoring calls for the `azure-openai` engine. Default `100`. | -The model defaults to Claude Haiku 4.5; override only to move to a newer model without waiting for a CLI release. +For the coding-agent engines the model defaults to Claude Haiku 4.5; override only to move to a newer model without waiting for a CLI release. The `azure-openai` engine uses whatever deployment you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` and authenticates with Entra ID only (no API key — see Step 2). ### What you will see @@ -268,6 +294,8 @@ If any step results in an error, stop and analyze the error message carefully. | `Unauthorized` from `tools/list` | Wrong or expired bearer token | Re-acquire the token (Question 2). | | `Could not parse server URL` | URL doesn't include the protocol | Add `http://` or `https://` to the URL. | | `No coding agent detected` after `[2/5]` | Neither `copilot` nor `claude` is on PATH | Install one (Step 2) or pass `--eval-engine none`. | +| `Azure OpenAI is not available` / engine not selected | `azure-openai` chosen but `A365_EVAL_AZURE_OPENAI_ENDPOINT` / `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` are unset | Set both env vars (Step 2). | +| `Azure OpenAI authentication failed` | Not signed in to Entra ID (or no usable credential) for the `azure-openai` engine | Run `az login` (or configure a managed identity / service-principal credential). | | `Failed to write report to ` | Output dir not writable | Choose a different `--output-dir` or fix permissions. | | Telemetry warning at debug level | Non-blocking — the marker call failed | Ignore. The evaluation runs regardless. | diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props index f3fccf3a..6d92f5e4 100644 --- a/src/Directory.Packages.props +++ b/src/Directory.Packages.props @@ -32,6 +32,11 @@ + + + + diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Models/Evaluate/EvaluateEnums.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Models/Evaluate/EvaluateEnums.cs index deeffc40..d2b8a860 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Models/Evaluate/EvaluateEnums.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Models/Evaluate/EvaluateEnums.cs @@ -56,5 +56,6 @@ public enum EvalEngine Auto, GitHubCopilot, ClaudeCode, + AzureOpenAI, None } diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Program.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Program.cs index 6661d462..07cc52c2 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Program.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Program.cs @@ -401,6 +401,9 @@ private static void ConfigureServices(IServiceCollection services, LogLevel mini // adding its ICodingAgentLauncher implementation file plus one line here. services.AddSingleton(); services.AddSingleton(); + // Azure OpenAI judge (BYO model via Entra ID). Explicit-only (AutoDetectable=false), + // so it never participates in --eval-engine auto despite being registered here. + services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); services.AddSingleton(); diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs new file mode 100644 index 00000000..0942c5cb --- /dev/null +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs @@ -0,0 +1,431 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.ClientModel; +using System.ClientModel.Primitives; +using System.Text.Json; +using Azure.Core; +using Azure.Identity; +using Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate; +using Microsoft.Extensions.Logging; +using OpenAI; +using OpenAI.Responses; + +namespace Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate; + +// The OpenAI Responses API surface is annotated [Experimental("OPENAI001")] in the +// 2.x SDK. Suppress for this file; the API shape used here is small and stable. +#pragma warning disable OPENAI001 + +/// +/// Scores semantic checks with a user-provided Azure OpenAI deployment via the OpenAI +/// Responses API, authenticated with Entra ID (). +/// +/// This engine scores each check INDEPENDENTLY ( = true): one +/// model call per assertion, with the full tool schema passed as context and temperature 0. +/// Per-check calls keep every response tiny (no truncation on large tools) and low-variance; +/// the evaluator fans them out concurrently. The coding-agent launchers (Copilot, Claude) +/// instead edit a whole-tool file via , which this engine does not use. +/// +/// Configuration is environment-driven (no secrets on the command line): +/// A365_EVAL_AZURE_OPENAI_ENDPOINT (required) e.g. https://foo.services.ai.azure.com/openai/v1 +/// A365_EVAL_AZURE_OPENAI_DEPLOYMENT (required) e.g. gpt-4.1 +/// A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY (optional) parallel check calls, default 100 +/// The Entra ID scope (https://ai.azure.com/.default) and per-call retry count (3) are fixed. +/// +/// Explicit-only: is false, so --eval-engine auto +/// never selects it. A plain evaluate run never sends content to a model endpoint or +/// spends tokens unless the user opts in with --eval-engine azure-openai. +/// +internal sealed class AzureOpenAiLauncher : ICodingAgentLauncher +{ + private readonly ILogger _logger; + + // DefaultAzureCredential probes several credential sources on construction; build it + // once (the launcher is a DI singleton) and reuse across all check calls. + private DefaultAzureCredential? _credential; + + // The ResponsesClient is thread-safe for concurrent calls; build it once and share it + // across every per-check scoring call (avoids thousands of client instances at high dop). + private ResponsesClient? _client; + private readonly object _clientLock = new(); + + public AzureOpenAiLauncher(ILogger logger) + { + ArgumentNullException.ThrowIfNull(logger); + _logger = logger; + } + + public EvalEngine Engine => EvalEngine.AzureOpenAI; + + public string DisplayName => "Azure OpenAI"; + + // Unused by this engine: it builds its own self-contained prompt from the checklist + // content rather than a coding-agent prompt that names read/edit tools. Present only + // to satisfy the interface. + public SemanticCheckPrompts.AgentToolset Toolset => new(ReadToolName: "read", EditToolName: "edit"); + + // No CLI binary backs this engine; it is never probed on PATH (IsAvailableAsync is + // overridden) and is excluded from the auto-detection list. + public string CliCommand => "azure-openai"; + + public bool AutoDetectable => false; + + public string AvailabilityHint => + $"the {EvalModelConstants.AzureOpenAiEndpointEnvVar} and {EvalModelConstants.AzureOpenAiDeploymentEnvVar} environment variables"; + + // Direct API: score many checks concurrently (vs. 1 for subprocess agents). + public int MaxConcurrency => EvalModelConstants.AzureOpenAiMaxConcurrency; + + // This engine evaluates each check on its own (per-check), not a whole-tool file. + public bool ScoresPerCheck => true; + + /// + /// Available when the endpoint and deployment environment variables are set to a + /// usable value. Credentials are not probed here (that would require a token call); + /// an auth failure surfaces from with actionable guidance. + /// + public Task IsAvailableAsync(CancellationToken cancellationToken = default) + { + var endpoint = EvalModelConstants.AzureOpenAiEndpoint; + var deployment = EvalModelConstants.AzureOpenAiDeployment; + + if (endpoint is null || deployment is null) + { + _logger.LogDebug("Azure OpenAI engine unavailable: {EndpointVar} and/or {DeploymentVar} not set.", + EvalModelConstants.AzureOpenAiEndpointEnvVar, EvalModelConstants.AzureOpenAiDeploymentEnvVar); + return Task.FromResult(false); + } + + if (!Uri.TryCreate(endpoint, UriKind.Absolute, out _)) + { + _logger.LogWarning("Azure OpenAI endpoint '{Endpoint}' (from {EndpointVar}) is not a valid absolute URL.", + endpoint, EvalModelConstants.AzureOpenAiEndpointEnvVar); + return Task.FromResult(false); + } + + return Task.FromResult(true); + } + + /// + /// Not used by this engine — it scores per check via . + /// The evaluator routes per-check engines down a different path and never calls this. + public Task LaunchAsync(string prompt, string workingDirectory, TimeSpan timeout, CancellationToken cancellationToken = default) + => throw new NotSupportedException("Azure OpenAI scores checks individually; the evaluator calls ScoreCheckAsync instead of LaunchAsync."); + + /// + public async Task ScoreCheckAsync(string context, string checkPrompt, CancellationToken cancellationToken = default) + { + ArgumentException.ThrowIfNullOrWhiteSpace(context); + ArgumentException.ThrowIfNullOrWhiteSpace(checkPrompt); + + var endpoint = EvalModelConstants.AzureOpenAiEndpoint; + var deployment = EvalModelConstants.AzureOpenAiDeployment; + if (endpoint is null || deployment is null) + { + // IsAvailableAsync gates this; guard anyway so a direct call can't NRE. + _logger.LogError("Azure OpenAI engine requires {EndpointVar} and {DeploymentVar}.", + EvalModelConstants.AzureOpenAiEndpointEnvVar, EvalModelConstants.AzureOpenAiDeploymentEnvVar); + return null; + } + + try + { + var client = GetClient(endpoint, deployment); + var prompt = SemanticCheckPrompts.BuildSingleCheckPrompt(context, checkPrompt); + var options = new CreateResponseOptions(new[] { ResponseItem.CreateUserMessageItem(prompt) }, deployment) + { + // Deterministic scoring to minimize hallucination and variance across checks. + Temperature = 0f, + }; + + ClientResult result = await client.CreateResponseAsync(options, cancellationToken); + return ParseEvaluation(result.Value.GetOutputText()); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (AuthenticationFailedException ex) + { + _logger.LogError("Azure OpenAI authentication failed for scope {Scope}. Run 'az login' or verify your Entra credentials. {Message}", + EvalModelConstants.AzureOpenAiScope, ex.Message); + return null; + } + catch (ClientResultException ex) + { + // HTTP failure surfaced by the OpenAI SDK (e.g. 401 auth, 404 wrong deployment, 429 throttling after retries). + _logger.LogWarning("Azure OpenAI request failed (HTTP {Status}): {Message}", ex.Status, ex.Message); + return null; + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Azure OpenAI check scoring failed unexpectedly."); + return null; + } + } + + /// + /// Builds (once) and reuses a single . The client is thread-safe + /// for concurrent calls, so every check scoring shares it. Retry count is configurable. + /// + private ResponsesClient GetClient(string endpoint, string deployment) + { + if (_client is not null) + { + return _client; + } + + lock (_clientLock) + { + if (_client is null) + { + _credential ??= new DefaultAzureCredential(); + var scope = EvalModelConstants.AzureOpenAiScope; + // Bridge the Azure.Core credential to System.ClientModel's auth abstraction the + // OpenAI SDK consumes, so no Azure.Identity version bump is required. + var authPolicy = new BearerTokenPolicy(new AzureCredentialTokenProvider(_credential, scope), scope); + _client = new ResponsesClient(deployment, authPolicy, new OpenAIClientOptions + { + Endpoint = new Uri(endpoint), + // Retry 429/transient with exponential backoff honoring Retry-After. Fixed at 3; + // if checks still fail after that, re-running the command resumes only the + // unscored ones. The logging subclass surfaces each retry so throttling is visible. + RetryPolicy = new LoggingRetryPolicy(maxRetries: 3, _logger), + }); + } + + return _client; + } + } + + /// + /// Parses the model's per-check response into a , tolerating + /// code fences and stray prose. Returns null when no usable {score, reason} is present. + /// + internal static CheckEvaluation? ParseEvaluation(string? output) + { + var json = ExtractJson(output); + if (json is null) + { + return null; + } + + try + { + using var doc = JsonDocument.Parse(json); + var root = doc.RootElement; + if (!root.TryGetProperty("score", out var scoreEl)) + { + return null; + } + + bool score = scoreEl.ValueKind switch + { + JsonValueKind.True => true, + JsonValueKind.False => false, + JsonValueKind.String => bool.TryParse(scoreEl.GetString(), out var b) && b, + _ => false, + }; + + var reason = root.TryGetProperty("reason", out var reasonEl) && reasonEl.ValueKind == JsonValueKind.String + ? reasonEl.GetString()! + : "No reason provided."; + + return new CheckEvaluation(score, reason); + } + catch (JsonException) + { + return null; + } + } + + /// + /// Extracts a JSON object from raw model output: strips Markdown code fences and any + /// surrounding prose, then verifies the candidate parses. Returns null when no valid + /// JSON object can be recovered (the evaluator then retries the attempt). + /// + internal static string? ExtractJson(string? output) + { + if (string.IsNullOrWhiteSpace(output)) + { + return null; + } + + var text = output.Trim(); + + // Strip a leading ```json (or ```) fence and its closing ``` if present. + if (text.StartsWith("```", StringComparison.Ordinal)) + { + var firstNewline = text.IndexOf('\n'); + if (firstNewline >= 0) + { + text = text[(firstNewline + 1)..]; + } + + var lastFence = text.LastIndexOf("```", StringComparison.Ordinal); + if (lastFence >= 0) + { + text = text[..lastFence]; + } + + text = text.Trim(); + } + + // Narrow to the outermost { ... } in case the model added stray prose. + var start = text.IndexOf('{'); + var end = text.LastIndexOf('}'); + if (start < 0 || end <= start) + { + return null; + } + + var candidate = text[start..(end + 1)]; + try + { + using var _ = JsonDocument.Parse(candidate, new JsonDocumentOptions + { + AllowTrailingCommas = true, + CommentHandling = JsonCommentHandling.Skip + }); + return candidate; + } + catch (JsonException) + { + return null; + } + } +} + +// The classes below (AzureCredentialTokenProvider, LoggingRetryPolicy) use no experimental +// OpenAI APIs, so the OPENAI001 suppression should not extend to them. +#pragma warning restore OPENAI001 + +/// +/// Adapts an Azure.Core (e.g. ) +/// to System.ClientModel's , which the OpenAI SDK's +/// consumes. This bridge lets the evaluate command authenticate +/// with Entra ID without bumping the repo's shared Azure.Identity version. +/// +internal sealed class AzureCredentialTokenProvider : AuthenticationTokenProvider +{ + private readonly TokenCredential _credential; + private readonly string[] _scopes; + + // Single-flight token cache. Under high concurrency (hundreds/thousands of parallel + // per-check calls) the underlying credential would otherwise be hit once per call — + // and AzureCliCredential spawns an `az` process per acquisition, so the burst storms the + // CLI and times out. We acquire one token, cache it until shortly before expiry, and + // serialize refreshes through a gate so only a single acquisition is ever in flight. + private readonly SemaphoreSlim _gate = new(1, 1); + private static readonly TimeSpan RefreshSkew = TimeSpan.FromMinutes(5); + private volatile CachedToken? _cache; + + private sealed record CachedToken(string Value, DateTimeOffset ExpiresOn); + + public AzureCredentialTokenProvider(TokenCredential credential, string scope) + { + ArgumentNullException.ThrowIfNull(credential); + ArgumentException.ThrowIfNullOrWhiteSpace(scope); + _credential = credential; + _scopes = new[] { scope }; + } + + public override GetTokenOptions CreateTokenOptions(IReadOnlyDictionary properties) => new(properties); + + public override AuthenticationToken GetToken(GetTokenOptions options, CancellationToken cancellationToken) + => GetTokenAsync(options, cancellationToken).AsTask().GetAwaiter().GetResult(); + + public override async ValueTask GetTokenAsync(GetTokenOptions options, CancellationToken cancellationToken) + { + var cached = _cache; + if (IsFresh(cached)) + { + return ToToken(cached!); + } + + await _gate.WaitAsync(cancellationToken).ConfigureAwait(false); + try + { + // Another caller may have refreshed while we waited on the gate. + cached = _cache; + if (IsFresh(cached)) + { + return ToToken(cached!); + } + + AccessToken token = await _credential.GetTokenAsync(new TokenRequestContext(_scopes), cancellationToken).ConfigureAwait(false); + var fresh = new CachedToken(token.Token, token.ExpiresOn); + _cache = fresh; + return ToToken(fresh); + } + finally + { + _gate.Release(); + } + } + + private static bool IsFresh(CachedToken? c) => c is not null && c.ExpiresOn - DateTimeOffset.UtcNow > RefreshSkew; + + private static AuthenticationToken ToToken(CachedToken c) => new(c.Value, "Bearer", c.ExpiresOn, refreshOn: null); +} + +/// +/// A that logs whenever it retries a request, so rate-limit +/// (HTTP 429) and transient failures are visible instead of being silently absorbed. The +/// retry decision, exponential backoff, and Retry-After handling are inherited unchanged from +/// the base policy; this subclass only adds the log line on each retry. +/// +internal sealed class LoggingRetryPolicy : ClientRetryPolicy +{ + private readonly ILogger _logger; + + public LoggingRetryPolicy(int maxRetries, ILogger logger) : base(maxRetries) + { + ArgumentNullException.ThrowIfNull(logger); + _logger = logger; + } + + protected override bool ShouldRetry(PipelineMessage message, Exception? exception) + { + var shouldRetry = base.ShouldRetry(message, exception); + if (shouldRetry) + { + LogRetry(message.Response?.Status ?? 0, exception); + } + + return shouldRetry; + } + + protected override async ValueTask ShouldRetryAsync(PipelineMessage message, Exception? exception) + { + var shouldRetry = await base.ShouldRetryAsync(message, exception).ConfigureAwait(false); + if (shouldRetry) + { + LogRetry(message.Response?.Status ?? 0, exception); + } + + return shouldRetry; + } + + private void LogRetry(int status, Exception? exception) + { + if (status == 429) + { + _logger.LogWarning(" Azure OpenAI rate-limited (HTTP 429); backing off and retrying the call."); + } + else if (status == 503) + { + _logger.LogWarning(" Azure OpenAI unavailable (HTTP 503); backing off and retrying the call."); + } + else if (status >= 400) + { + _logger.LogDebug(" Azure OpenAI transient failure (HTTP {Status}); retrying the call.", status); + } + else + { + // No HTTP response captured (network error / timeout); the exception carries the detail. + _logger.LogDebug(exception, " Azure OpenAI transient error; retrying the call."); + } + } +} diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ChecklistEvaluator.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ChecklistEvaluator.cs index d23fd5b0..ecf0e545 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ChecklistEvaluator.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ChecklistEvaluator.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System.Collections.Concurrent; using System.Text.Json; using System.Text.RegularExpressions; using Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate; @@ -114,57 +115,14 @@ public async Task EvaluateAsync( // actually did the work (rather than the user's "auto" request). EvalEngine? engineUsed = null; - // Evaluate each tool using extract-evaluate-merge pattern. - // The full checklist is ~1MB which is too large for coding agents. - // Instead, extract each tool to a small temp file (~25KB), have the - // agent evaluate it, then merge the results back into the checklist. - for (int i = 0; i < checklist.Tools.Count; i++) - { - cancellationToken.ThrowIfCancellationRequested(); - - var tool = checklist.Tools[i]; - var unevaluated = CountUnevaluatedSemanticChecks(tool); - if (unevaluated == 0) - { - continue; - } - - // Heartbeat BEFORE the tool runs so the user sees forward motion immediately. - // Each tool can take minutes (multi-attempt Copilot/Claude invocations), so - // logging only on completion leaves long silent gaps that look like a hang. - _logger.LogInformation(" [{Current}/{Total}] {ToolName} ({CheckCount} checks) ... running", - i + 1, checklist.Tools.Count, tool.Name, unevaluated); - - var toolEngine = await EvaluateToolChecks(tool, enginesToTry, cancellationToken); - if (toolEngine is not null) - { - engineUsed ??= toolEngine; - _logger.LogInformation(" [{Current}/{Total}] {ToolName} ... ok", - i + 1, checklist.Tools.Count, tool.Name); - } - else - { - _logger.LogWarning(" [{Current}/{Total}] {ToolName} ... failed (continuing)", - i + 1, checklist.Tools.Count, tool.Name); - } - } - - // Evaluate server-level checks (extract server_checks + tool list summary) - var serverUnevaluated = checklist.ServerChecks.Count(c => c.Type == CheckType.Semantic && c.Score is null); - if (serverUnevaluated > 0) - { - _logger.LogInformation(" server-level checks ({Count} checks) ... running", serverUnevaluated); - var serverEngine = await EvaluateServerChecks(checklist, enginesToTry, cancellationToken); - if (serverEngine is not null) - { - engineUsed ??= serverEngine; - _logger.LogInformation(" server-level checks ... ok"); - } - else - { - _logger.LogWarning(" server-level checks ... failed (continuing)"); - } - } + // Pick the scoring path from the engine. A per-check judge (Azure OpenAI) evaluates + // each assertion independently with the full tool schema as context; subprocess coding + // agents edit a whole-tool file. A per-check engine is explicit-only, so when chosen it + // is the sole entry in enginesToTry. + var primaryLauncher = enginesToTry.Count == 1 ? LauncherFor(enginesToTry[0]) : null; + engineUsed = primaryLauncher?.ScoresPerCheck == true + ? await EvaluatePerCheck(checklist, primaryLauncher, checklistPath, cancellationToken) + : await EvaluatePerTool(checklist, enginesToTry, cancellationToken); // Write the updated checklist back (with all merged results) var updatedJson = JsonSerializer.Serialize(checklist, WriteOptions); @@ -196,6 +154,209 @@ public async Task EvaluateAsync( }; } + /// + /// Whole-tool scoring path for subprocess coding agents: extract each tool to a sandbox + /// file, have the agent edit it, then merge results back. Tools run concurrently up to the + /// engine's MaxConcurrency (1 for coding agents). Returns the engine that did the work. + /// + private async Task EvaluatePerTool(EvaluationChecklist checklist, List enginesToTry, CancellationToken cancellationToken) + { + if (enginesToTry.Count == 0) + { + return null; + } + + EvalEngine? engineUsed = null; + + // When several engines could be tried via fallthrough, use the most conservative dop. + // Each tool writes to its own sandbox and mutates only its own ToolChecklist, so + // concurrent iterations don't share state; only engineUsed needs guarding. + int dop = enginesToTry.Count == 0 + ? 1 + : Math.Max(1, enginesToTry.Min(e => LauncherFor(e)?.MaxConcurrency ?? 1)); + + var engineLock = new object(); + await Parallel.ForEachAsync( + Enumerable.Range(0, checklist.Tools.Count), + new ParallelOptions { MaxDegreeOfParallelism = dop, CancellationToken = cancellationToken }, + async (i, ct) => + { + var tool = checklist.Tools[i]; + var unevaluated = CountUnevaluatedSemanticChecks(tool); + if (unevaluated == 0) + { + return; + } + + // Heartbeat BEFORE the tool runs so the user sees forward motion immediately. + _logger.LogInformation(" [{Current}/{Total}] {ToolName} ({CheckCount} checks) ... running", + i + 1, checklist.Tools.Count, tool.Name, unevaluated); + + var toolEngine = await EvaluateToolChecks(tool, enginesToTry, ct); + if (toolEngine is not null) + { + lock (engineLock) + { + engineUsed ??= toolEngine; + } + _logger.LogInformation(" [{Current}/{Total}] {ToolName} ... ok", + i + 1, checklist.Tools.Count, tool.Name); + } + else + { + _logger.LogWarning(" [{Current}/{Total}] {ToolName} ... failed (continuing)", + i + 1, checklist.Tools.Count, tool.Name); + } + }); + + var serverUnevaluated = checklist.ServerChecks.Count(c => c.Type == CheckType.Semantic && c.Score is null); + if (serverUnevaluated > 0) + { + _logger.LogInformation(" server-level checks ({Count} checks) ... running", serverUnevaluated); + var serverEngine = await EvaluateServerChecks(checklist, enginesToTry, cancellationToken); + if (serverEngine is not null) + { + engineUsed ??= serverEngine; + _logger.LogInformation(" server-level checks ... ok"); + } + else + { + _logger.LogWarning(" server-level checks ... failed (continuing)"); + } + } + + return engineUsed; + } + + /// + /// Per-check scoring path for a direct-API judge (Azure OpenAI): score every unscored + /// Semantic check independently, passing the FULL tool schema as context for each single + /// assertion, fanning out concurrently up to the engine's MaxConcurrency. Failed checks are + /// retried up to rounds. Results are collected concurrently then + /// applied to the checklist items serially; the checklist is persisted after each round so an + /// interrupted run resumes from the scored items rather than re-scoring everything. + /// + private async Task EvaluatePerCheck(EvaluationChecklist checklist, ICodingAgentLauncher launcher, string checklistPath, CancellationToken cancellationToken) + { + var work = new List<(ChecklistItem Item, string Context)>(); + + foreach (var tool in checklist.Tools) + { + var toolContext = BuildToolContext(tool); + foreach (var item in tool.Checks.ToolName.Concat(tool.Checks.ToolDescription).Concat(tool.Checks.SchemaStructure)) + { + if (item.Type == CheckType.Semantic && item.Score is null) + { + work.Add((item, toolContext)); + } + } + + foreach (var (paramName, paramChecks) in tool.Checks.Parameters) + { + var paramContext = $"{toolContext}\n\nParameter under evaluation: \"{paramName}\""; + foreach (var item in paramChecks.ParamName.Concat(paramChecks.ParamDescription)) + { + if (item.Type == CheckType.Semantic && item.Score is null) + { + work.Add((item, paramContext)); + } + } + } + } + + var serverContext = BuildServerContext(checklist); + foreach (var item in checklist.ServerChecks) + { + if (item.Type == CheckType.Semantic && item.Score is null) + { + work.Add((item, serverContext)); + } + } + + if (work.Count == 0) + { + return launcher.Engine; + } + + int dop = Math.Max(1, launcher.MaxConcurrency); + _logger.LogInformation(" Scoring {Count} check(s) independently — check-by-check, concurrency {Dop}", work.Count, dop); + + int succeeded = 0; + var pending = work; + for (int attempt = 1; attempt <= MaxAttempts && pending.Count > 0; attempt++) + { + // Score concurrently but do NOT mutate the shared ChecklistItem objects inside the + // parallel body — collect (item, result) pairs, then apply them serially afterward. + var scored = new ConcurrentBag<(ChecklistItem Item, CheckEvaluation Result)>(); + var failed = new ConcurrentBag<(ChecklistItem Item, string Context)>(); + int doneInRound = 0; + int roundTotal = pending.Count; + await Parallel.ForEachAsync( + pending, + new ParallelOptions { MaxDegreeOfParallelism = dop, CancellationToken = cancellationToken }, + async (w, ct) => + { + var result = await launcher.ScoreCheckAsync(w.Context, w.Item.Prompt, ct); + if (result is not null) + { + scored.Add((w.Item, result)); + } + else + { + failed.Add(w); + } + + var n = Interlocked.Increment(ref doneInRound); + if (n % 50 == 0 || n == roundTotal) + { + _logger.LogInformation(" ... {Done}/{Total} checks scored", n, roundTotal); + } + }); + + // Apply results serially — no concurrent writes to ChecklistItem fields. + foreach (var (item, result) in scored) + { + item.Score = result.Score; + item.Reason = result.Reason; + } + + succeeded += scored.Count; + pending = failed.ToList(); + + // Checkpoint after each round so an interrupted run resumes from the persisted scores + // rather than re-scoring everything. + await WriteChecklistAsync(checklist, checklistPath, cancellationToken); + + if (pending.Count > 0 && attempt < MaxAttempts) + { + _logger.LogInformation(" {Count} check(s) failed; retrying (attempt {Next}/{Max})", pending.Count, attempt + 1, MaxAttempts); + } + } + + if (pending.Count > 0) + { + _logger.LogWarning(" {Count} check(s) could not be scored after {Max} attempt(s)", pending.Count, MaxAttempts); + } + + return succeeded > 0 ? launcher.Engine : null; + } + + /// Full tool schema string passed as per-check context for a tool's checks. + private static string BuildToolContext(ToolChecklist tool) + { + var schema = tool.InputSchema.HasValue + ? JsonSerializer.Serialize(tool.InputSchema.Value, WriteOptions) + : "{}"; + return $"Tool name: {tool.Name}\nTool description: {tool.Description}\nInput schema (JSON):\n{schema}"; + } + + /// Tool-set summary passed as per-check context for server-level checks. + private static string BuildServerContext(EvaluationChecklist checklist) + { + var summary = string.Join("\n", checklist.Tools.Select(t => $"- {t.Name}: {t.Description}")); + return $"Evaluate against the full tool set of this MCP server:\n{summary}"; + } + /// /// Extracts a single tool to a temp file, invokes the coding agent to evaluate /// its semantic checks, then merges the scored results back into the tool object. @@ -540,10 +701,17 @@ private async Task> BuildEngineList(EvalEngine requested, Cance return []; } - // Auto: detect all available engines, preserving the registered priority order + // Auto: detect all available engines, preserving the registered priority order. + // Skip explicit-only engines (e.g. a remote API judge) so auto never selects one + // the user didn't ask for — those run only via an explicit --eval-engine value. var available = new List(); foreach (var launcher in _launchers) { + if (!launcher.AutoDetectable) + { + continue; + } + if (await launcher.IsAvailableAsync(cancellationToken)) { _logger.LogDebug("Detected {Engine}", launcher.Engine); @@ -638,7 +806,8 @@ private void LogManualEvaluationInstructions(string checklistPath, int unscoredC if (requested == EvalEngine.Auto) { // Built from the registry so a newly added engine appears here automatically. - var probed = string.Join(" and ", _launchers.Select(l => $"{l.DisplayName} (`{l.CliCommand}`)")); + // Only auto-detectable engines are probed under `auto`, so list just those. + var probed = string.Join(" and ", _launchers.Where(l => l.AutoDetectable).Select(l => $"{l.DisplayName} (`{l.CliCommand}`)")); _logger.LogWarning(" No coding agent CLI detected (looked for {Probed}). Run with -v to see why each probe failed.", probed); } else @@ -646,8 +815,8 @@ private void LogManualEvaluationInstructions(string checklistPath, int unscoredC // The user asked for one specific engine; name only that one, not both. var launcher = LauncherFor(requested); var name = launcher?.DisplayName ?? FormatEngineName(requested); - var binary = launcher?.CliCommand ?? requested.ToString(); - _logger.LogWarning(" {Name} CLI not found on PATH (looked for `{Binary}`). Run with -v to see why the probe failed.", name, binary); + var hint = launcher?.AvailabilityHint ?? $"`{requested}`"; + _logger.LogWarning(" {Name} is not available — needs {Hint}. Run with -v for details.", name, hint); } } else if (agentAttempted) diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/CodingAgentLauncherBase.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/CodingAgentLauncherBase.cs index 598592a8..521a4643 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/CodingAgentLauncherBase.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/CodingAgentLauncherBase.cs @@ -66,6 +66,17 @@ protected CodingAgentLauncherBase(CommandExecutor executor, ILogger logger) /// public abstract string CliCommand { get; } + /// + /// Subprocess agents run one at a time; a direct-API engine overrides this. + public virtual int MaxConcurrency => 1; + + /// + /// CLI-backed engines are auto-detectable; subclasses that incur remote cost override this. + public virtual bool AutoDetectable => true; + + /// + public virtual string AvailabilityHint => $"the {CliCommand} CLI on PATH"; + /// public Task IsAvailableAsync(CancellationToken cancellationToken = default) => ProbeCommandAsync(CliCommand, "--version", cancellationToken); @@ -73,6 +84,14 @@ public Task IsAvailableAsync(CancellationToken cancellationToken = default /// public abstract Task LaunchAsync(string prompt, string workingDirectory, TimeSpan timeout, CancellationToken cancellationToken = default); + /// + /// Subprocess coding agents edit a whole-tool file; per-check scoring does not apply. + public virtual bool ScoresPerCheck => false; + + /// + public virtual Task ScoreCheckAsync(string context, string checkPrompt, CancellationToken cancellationToken = default) + => throw new NotSupportedException($"{DisplayName} edits a whole-tool checklist file and does not score checks individually."); + /// /// Runs a process and waits for it to complete, capturing stdout/stderr. /// Optionally pipes content via stdin. Kills the process on timeout to diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvalModelConstants.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvalModelConstants.cs index 954525e8..30cc5f10 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvalModelConstants.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvalModelConstants.cs @@ -19,8 +19,16 @@ internal static class EvalModelConstants /// Environment variable that overrides the Claude Code model. public const string ClaudeModelEnvVar = "A365_EVAL_CLAUDE_MODEL"; + /// Environment variable holding the Azure OpenAI endpoint, including the + /// API path (e.g. https://my-resource.services.ai.azure.com/openai/v1). + public const string AzureOpenAiEndpointEnvVar = "A365_EVAL_AZURE_OPENAI_ENDPOINT"; + + /// Environment variable holding the Azure OpenAI deployment (model) name. + public const string AzureOpenAiDeploymentEnvVar = "A365_EVAL_AZURE_OPENAI_DEPLOYMENT"; + private const string DefaultCopilotModel = "claude-haiku-4.5"; private const string DefaultClaudeModel = "haiku"; + private const string DefaultAzureOpenAiScope = "https://ai.azure.com/.default"; /// Copilot model ID; overridable via . public static string CopilotModel => FromEnvOrDefault(CopilotModelEnvVar, DefaultCopilotModel); @@ -28,9 +36,55 @@ internal static class EvalModelConstants /// Claude model ID/alias; overridable via . public static string ClaudeModel => FromEnvOrDefault(ClaudeModelEnvVar, DefaultClaudeModel); + /// Azure OpenAI endpoint, or null when unset. There is no default: it is + /// resource-specific and must be supplied via . + public static string? AzureOpenAiEndpoint => NullIfBlank(Environment.GetEnvironmentVariable(AzureOpenAiEndpointEnvVar)); + + /// Azure OpenAI deployment (model) name, or null when unset. Supplied via + /// . + public static string? AzureOpenAiDeployment => NullIfBlank(Environment.GetEnvironmentVariable(AzureOpenAiDeploymentEnvVar)); + + /// Entra ID token scope for Azure OpenAI (fixed; not user-configurable). + public static string AzureOpenAiScope => DefaultAzureOpenAiScope; + + /// Environment variable overriding how many checks the Azure OpenAI judge scores concurrently. + public const string AzureOpenAiMaxConcurrencyEnvVar = "A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY"; + + private const int DefaultAzureOpenAiMaxConcurrency = 100; + + /// Max concurrent Azure OpenAI scoring calls; overridable via + /// . Clamped to [1, 4096]. Higher is faster but + /// more likely to hit endpoint rate limits (429), which are retried with backoff per call. + public static int AzureOpenAiMaxConcurrency + { + get + { + var raw = Environment.GetEnvironmentVariable(AzureOpenAiMaxConcurrencyEnvVar); + return int.TryParse(raw, out var n) && n >= 1 ? Math.Min(n, 4096) : DefaultAzureOpenAiMaxConcurrency; + } + } + private static string FromEnvOrDefault(string envVar, string fallback) { var value = Environment.GetEnvironmentVariable(envVar); return string.IsNullOrWhiteSpace(value) ? fallback : value; } + + private static string? NullIfBlank(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + { + return null; + } + + var trimmed = value.Trim(); + // Strip a single pair of wrapping quotes that copy-paste from a portal or `SET VAR="..."` can leave. + if (trimmed.Length >= 2 && + ((trimmed[0] == '"' && trimmed[^1] == '"') || (trimmed[0] == '\'' && trimmed[^1] == '\''))) + { + trimmed = trimmed[1..^1].Trim(); + } + + return string.IsNullOrWhiteSpace(trimmed) ? null : trimmed; + } } diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs index eaa0ad47..3090ba10 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs @@ -49,7 +49,7 @@ public EvaluationPipelineService( } /// - public async Task RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken) + public async Task RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken, string? reportName = null) { try { @@ -81,10 +81,15 @@ public async Task RunAsync(string serverUrl, string outputDir, string evalE var engine = ParseEvalEngine(evalEngine); - // Trust boundary: when an agent will run, the server's tool names and - // descriptions are handed to a locally running coding agent. Surface this - // once per run. (--eval-engine none keeps everything local, so it doesn't apply.) - if (engine != EvalEngine.None) + // Trust boundary: when an engine will run, the server's tool names and + // descriptions are handed to it for scoring. Surface this once per run, and be + // accurate about where the content goes — a locally running coding agent vs a + // remote Azure OpenAI endpoint. (--eval-engine none keeps everything local.) + if (engine == EvalEngine.AzureOpenAI) + { + _logger.LogWarning("The server's tool names and descriptions are sent to your configured Azure OpenAI endpoint for scoring. Only evaluate MCP servers you trust."); + } + else if (engine != EvalEngine.None) { _logger.LogWarning("The server's tool names and descriptions are sent to a locally running coding agent for scoring. Only evaluate MCP servers you trust."); } @@ -101,7 +106,10 @@ public async Task RunAsync(string serverUrl, string outputDir, string evalE // Run the derived name through the same sanitizer as the report filename so // any invalid-for-filesystem characters (?, *, <, etc.) from the fallback path // don't crash Path.Combine / File.Exists downstream. - var serverName = DeriveServerName(serverUrl); + // A friendly --report-name overrides the host-derived name for both the output + // file names and the report's displayed server name. Essential when many servers + // sit behind one gateway host (which would otherwise collide on the same name). + var serverName = !string.IsNullOrWhiteSpace(reportName) ? reportName! : DeriveServerName(serverUrl); var safeServerName = ReportGenerator.SanitizeFileName(serverName); var checklistPath = Path.Combine(outputDir, $"{safeServerName}_checklist.json"); @@ -299,13 +307,14 @@ internal static EvalEngine ParseEvalEngine(string value) "auto" => EvalEngine.Auto, "github-copilot" => EvalEngine.GitHubCopilot, "claude-code" => EvalEngine.ClaudeCode, + "azure-openai" => EvalEngine.AzureOpenAI, "none" => EvalEngine.None, _ => throw new EvaluationException( ErrorCodes.EvaluationFailed, $"Unknown eval engine: '{value}'.", mitigationSteps: new List { - "Use one of: auto, github-copilot, claude-code, none" + "Use one of: auto, github-copilot, claude-code, azure-openai, none" }) }; } diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ICodingAgentLauncher.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ICodingAgentLauncher.cs index c24725d1..6741d8b1 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ICodingAgentLauncher.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ICodingAgentLauncher.cs @@ -32,6 +32,27 @@ internal interface ICodingAgentLauncher /// The CLI binary name on PATH (e.g. "copilot", "claude"). string CliCommand { get; } + /// + /// How many tool evaluations may run concurrently with this engine. Subprocess agents + /// return 1 (serial, to avoid spawning many heavy CLI processes); a direct-API judge + /// returns a higher value to score many tools in parallel. + /// + int MaxConcurrency { get; } + + /// + /// Whether --eval-engine auto may select this engine once its prerequisites + /// are present. Local coding agents are auto-detectable; engines that call a remote + /// model endpoint (and may incur cost) are explicit-only and return false. + /// + bool AutoDetectable { get; } + + /// + /// Human-readable description of what must be present for this engine to run, used in + /// the "engine not available" guidance (e.g. "the copilot CLI on PATH", or the + /// required environment variables for an API-based engine). + /// + string AvailabilityHint { get; } + /// /// Returns true when the agent's CLI is installed and responds on PATH. /// @@ -43,4 +64,23 @@ internal interface ICodingAgentLauncher /// Returns true when the subprocess exits 0. /// Task LaunchAsync(string prompt, string workingDirectory, TimeSpan timeout, CancellationToken cancellationToken = default); + + /// + /// Whether this engine scores each checklist item independently — one model call per + /// check, with the full tool schema passed as context — rather than editing a whole-tool + /// file. A direct-API judge returns true; subprocess coding agents return false and use + /// instead. + /// + bool ScoresPerCheck { get; } + + /// + /// Scores a single checklist item. is the full tool schema the + /// assertion is evaluated against (or the tool-set summary for server-level checks). + /// Returns the score and one-sentence reason, or null if the call failed. Only meaningful + /// when is true. + /// + Task ScoreCheckAsync(string context, string checkPrompt, CancellationToken cancellationToken = default); } + +/// Result of scoring one checklist item: pass/fail plus a one-sentence reason. +internal sealed record CheckEvaluation(bool Score, string Reason); diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs index 591cfc71..72fa72a5 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs @@ -17,6 +17,9 @@ public interface IEvaluationPipelineService /// Coding agent engine name (auto, github-copilot, claude-code, none). /// Optional bearer token for MCP server authentication. /// Cancellation token. + /// Optional friendly name for the evaluated server. When provided it is + /// used (sanitized) for the output filenames and the report's server name instead of the value + /// derived from the URL host — needed when many servers share one gateway host. /// Process exit code: 0 on success or an intentional bring-your-own-LLM stop; 1 when evaluation could not be performed as requested. - Task RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken); + Task RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken, string? reportName = null); } diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs index 2d36e4de..7783959d 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs @@ -26,6 +26,11 @@ internal sealed class SchemaDiscoveryService : ISchemaDiscoveryService, IDisposa private readonly ILogger _logger; private readonly HttpClient _httpClient; + // MCP Streamable HTTP session id issued by the server on initialize. Servers that require + // a session (per spec) reject session-less follow-up requests with HTTP 400; we echo this + // header on every subsequent request. Reset per discovery (the service is a DI singleton). + private string? _mcpSessionId; + public SchemaDiscoveryService(ILogger logger, HttpMessageHandler? handler = null) { ArgumentNullException.ThrowIfNull(logger); @@ -63,6 +68,7 @@ public async Task> DiscoverToolsAsync(string serverUrl, string? } _logger.LogDebug("Starting MCP schema discovery against {ServerUrl}", serverUrl); + _mcpSessionId = null; // reset any session id captured from a previous discovery try { @@ -300,8 +306,21 @@ private async Task PostJsonRpcAsync( request.Headers.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", authToken); } + // Echo the MCP session id (captured from initialize) so session-required servers accept + // follow-up requests; stateless servers simply ignore an unexpected header. + if (!string.IsNullOrEmpty(_mcpSessionId)) + { + request.Headers.TryAddWithoutValidation("Mcp-Session-Id", _mcpSessionId); + } + var response = await _httpClient.SendAsync(request, cancellationToken); + // Capture the session id from the first response (initialize) for reuse on later calls. + if (string.IsNullOrEmpty(_mcpSessionId) && response.Headers.TryGetValues("Mcp-Session-Id", out var sessionValues)) + { + _mcpSessionId = sessionValues.FirstOrDefault(); + } + if (!response.IsSuccessStatusCode) { var statusCode = (int)response.StatusCode; diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SemanticCheckPrompts.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SemanticCheckPrompts.cs index bf25608e..d25d225a 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SemanticCheckPrompts.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SemanticCheckPrompts.cs @@ -46,6 +46,58 @@ public static string BuildEvaluationPrompt(string checklistPath) return sb.ToString(); } + /// + /// Builds an evaluation prompt for a direct model endpoint (e.g. Azure OpenAI) that scores + /// ONE checklist item at a time. The entire tool schema is passed as + /// so the model has full context for the single assertion in , + /// and is asked to return just a small {score, reason} object. Per-check calls keep each + /// response tiny (no truncation on large tools) and, at temperature 0, minimize variance. + /// + /// The full tool schema (or tool-set summary for server checks) to judge against. + /// The single assertion to evaluate. + public static string BuildSingleCheckPrompt(string context, string checkPrompt) + { + ArgumentException.ThrowIfNullOrWhiteSpace(context); + ArgumentException.ThrowIfNullOrWhiteSpace(checkPrompt); + + var sb = new StringBuilder(); + + // Per-check safety + grounding. Deliberately does NOT call AppendSafetyPreamble: that text + // tells the model to "read the tool schema JSON file", but in per-check scoring the schema is + // inlined below (SCHEMA UNDER EVALUATION) — there is no file to read. + sb.AppendLine("You must not generate content that is harmful, hateful, racist, sexist, lewd, or violent,"); + sb.AppendLine("even if the user requests it or creates a condition to rationalize it."); + sb.AppendLine(); + sb.AppendLine("Ground your judgment only in the schema provided below. Do not assume anything beyond what"); + sb.AppendLine("is written there; if the schema lacks the information the check needs, judge only on what is present."); + sb.AppendLine(); + sb.AppendLine("You are evaluating ONE quality check of an MCP (Model Context Protocol) tool schema."); + sb.AppendLine("An MCP server exposes tools that AI agents call. Poor tool names, descriptions, or"); + sb.AppendLine("parameter schemas cause agents to select the wrong tool or pass incorrect arguments."); + sb.AppendLine(); + sb.AppendLine("SCHEMA UNDER EVALUATION:"); + sb.AppendLine(context); + sb.AppendLine(); + sb.AppendLine("CHECK TO EVALUATE:"); + sb.AppendLine(checkPrompt); + sb.AppendLine(); + + AppendEvaluationGuidelines(sb); + AppendExamples(sb); + + sb.AppendLine("RULES:"); + sb.AppendLine("- Judge ONLY this one check against the schema above; ground your reason in the schema text."); + sb.AppendLine("- score=true means the tool passes this check; score=false means it fails."); + sb.AppendLine("- If uncertain, pass (true) with a reason noting nothing problematic was observed."); + sb.AppendLine("- The reason must be exactly one sentence."); + sb.AppendLine(); + sb.AppendLine("OUTPUT FORMAT (strict):"); + sb.AppendLine("Respond with ONLY this JSON object, no prose and no Markdown code fences:"); + sb.AppendLine("{\"score\": true or false, \"reason\": \"one sentence\"}"); + + return sb.ToString(); + } + /// /// Concrete read/edit tool names for the target coding agent. Embedded into /// the prompt so the agent is told exactly what to use rather than guessing. diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs index 0aeed5f1..feb956ec 100644 --- a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs +++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs @@ -41,7 +41,7 @@ private static IEvaluationPipelineService PipelineReturning(int exitCode) { var pipeline = Substitute.For(); pipeline - .RunAsync(Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any()) + .RunAsync(Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any()) .Returns(exitCode); return pipeline; } @@ -99,7 +99,8 @@ await pipeline.Received(1).RunAsync( Arg.Any(), Arg.Any(), "env-token-xyz", - Arg.Any()); + Arg.Any(), + Arg.Any()); } finally { @@ -196,6 +197,47 @@ await pipeline.Received(1).RunAsync( "out", "claude-code", Arg.Any(), - Arg.Any()); + Arg.Any(), + Arg.Any()); + } + + [Fact] + public async Task InvokeAsync_AzureOpenAiEngine_ForwardsEngineAsThirdArgument() + { + var pipeline = PipelineReturning(0); + var evaluate = GetEvaluateSubcommand(Substitute.For(), pipeline); + var parser = BuildFaithfulParser(evaluate); + + var exitCode = await parser.InvokeAsync( + new[] { "--server-url", "http://localhost/mcp", "--eval-engine", "azure-openai" }); + + exitCode.Should().Be(0, because: "RunAsync returned 0, which the handler must propagate as the process exit code"); + await pipeline.Received(1).RunAsync( + Arg.Any(), + Arg.Any(), + "azure-openai", + Arg.Any(), + Arg.Any(), + Arg.Any()); + } + + [Fact] + public async Task InvokeAsync_ReportNameFlag_ForwardsReportNameAsSixthArgument() + { + var pipeline = PipelineReturning(0); + var evaluate = GetEvaluateSubcommand(Substitute.For(), pipeline); + var parser = BuildFaithfulParser(evaluate); + + var exitCode = await parser.InvokeAsync( + new[] { "--server-url", "http://localhost/mcp", "--eval-engine", "none", "--report-name", "salesforce" }); + + exitCode.Should().Be(0, because: "RunAsync returned 0, which the handler must propagate as the process exit code"); + await pipeline.Received(1).RunAsync( + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + Arg.Any(), + "salesforce"); } } diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs new file mode 100644 index 00000000..b4824f1c --- /dev/null +++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs @@ -0,0 +1,130 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using FluentAssertions; +using Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate; +using Microsoft.Extensions.Logging.Abstractions; +using Xunit; + +namespace Microsoft.Agents.A365.DevTools.Cli.Tests.Services.Evaluate; + +/// +/// Disables parallelization for , which +/// mutate the process-wide endpoint and deployment environment variables that +/// IsAvailableAsync reads. +/// +// This serializes tests WITHIN the collection only. No other test class in the suite reads +// A365_EVAL_AZURE_OPENAI_ENDPOINT or A365_EVAL_AZURE_OPENAI_DEPLOYMENT, so cross-collection +// races are not a concern; revisit if another suite begins reading those variables. +[CollectionDefinition("AzureOpenAiAvailability", DisableParallelization = true)] +public class AzureOpenAiAvailabilityCollection { } + +/// +/// Tests , which gates whether the engine +/// can run by inspecting the endpoint and deployment environment variables. These mutate +/// process-wide state, so the suite runs in its own non-parallel collection and each test +/// saves and restores both variables in a finally block. +/// +[Collection("AzureOpenAiAvailability")] +public class AzureOpenAiLauncherAvailabilityTests +{ + private const string EndpointVar = EvalModelConstants.AzureOpenAiEndpointEnvVar; + private const string DeploymentVar = EvalModelConstants.AzureOpenAiDeploymentEnvVar; + + private static AzureOpenAiLauncher CreateLauncher() => + new(NullLogger.Instance); + + /// + /// Sets both environment variables, runs , then restores the + /// original values regardless of outcome so no test leaks state into another. + /// + private static async Task WithEnvAsync(string? endpoint, string? deployment, Func body) + { + var originalEndpoint = Environment.GetEnvironmentVariable(EndpointVar); + var originalDeployment = Environment.GetEnvironmentVariable(DeploymentVar); + try + { + Environment.SetEnvironmentVariable(EndpointVar, endpoint); + Environment.SetEnvironmentVariable(DeploymentVar, deployment); + await body(); + } + finally + { + Environment.SetEnvironmentVariable(EndpointVar, originalEndpoint); + Environment.SetEnvironmentVariable(DeploymentVar, originalDeployment); + } + } + + [Fact] + public async Task IsAvailableAsync_BothUnset_ReturnsFalse() + { + await WithEnvAsync(endpoint: null, deployment: null, async () => + { + var available = await CreateLauncher().IsAvailableAsync(); + + available.Should().BeFalse( + because: "with neither variable set the engine has nothing to connect to and must report unavailable"); + }); + } + + [Fact] + public async Task IsAvailableAsync_OnlyEndpointSet_ReturnsFalse() + { + await WithEnvAsync(endpoint: "https://x.openai.azure.com/openai/v1", deployment: null, async () => + { + var available = await CreateLauncher().IsAvailableAsync(); + + available.Should().BeFalse( + because: "a deployment name is also required; an endpoint alone is not enough to run the judge"); + }); + } + + [Fact] + public async Task IsAvailableAsync_OnlyDeploymentSet_ReturnsFalse() + { + await WithEnvAsync(endpoint: null, deployment: "gpt-4.1", async () => + { + var available = await CreateLauncher().IsAvailableAsync(); + + available.Should().BeFalse( + because: "an endpoint is also required; a deployment name alone is not enough to run the judge"); + }); + } + + [Fact] + public async Task IsAvailableAsync_EndpointNotAbsoluteUrl_ReturnsFalse() + { + await WithEnvAsync(endpoint: "not-a-url", deployment: "gpt-4.1", async () => + { + var available = await CreateLauncher().IsAvailableAsync(); + + available.Should().BeFalse( + because: "a non-absolute endpoint cannot be used to build a client, so the engine must report unavailable up front"); + }); + } + + [Fact] + public async Task IsAvailableAsync_BothSetWithValidHttpsEndpoint_ReturnsTrue() + { + await WithEnvAsync(endpoint: "https://x.openai.azure.com/openai/v1", deployment: "gpt-4.1", async () => + { + var available = await CreateLauncher().IsAvailableAsync(); + + available.Should().BeTrue( + because: "both variables are set and the endpoint is a valid absolute URL, so the engine is configured to run"); + }); + } + + [Fact] + public async Task IsAvailableAsync_EndpointWithWrappingQuotes_ReturnsTrue() + { + // Copy-paste from a portal or `SET VAR="..."` can leave literal wrapping quotes. + await WithEnvAsync(endpoint: "\"https://x.openai.azure.com/openai/v1\"", deployment: "gpt-4.1", async () => + { + var available = await CreateLauncher().IsAvailableAsync(); + + available.Should().BeTrue( + because: "a single pair of wrapping quotes must be stripped so a quoted endpoint still resolves to a valid absolute URL"); + }); + } +} diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherTests.cs new file mode 100644 index 00000000..c31a3e55 --- /dev/null +++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherTests.cs @@ -0,0 +1,233 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using FluentAssertions; +using Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate; +using Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate; +using Microsoft.Extensions.Logging.Abstractions; +using Xunit; + +namespace Microsoft.Agents.A365.DevTools.Cli.Tests.Services.Evaluate; + +/// +/// Pure-function and property tests for — its response +/// parsing (ParseEvaluation/ExtractJson), its static metadata, and the +/// per-check prompt builder. None of these touch the environment or the network, so this +/// suite is parallel-safe. Availability (which reads environment variables) is covered +/// separately in a non-parallel collection. +/// +public class AzureOpenAiLauncherTests +{ + private static AzureOpenAiLauncher CreateLauncher() => + new(NullLogger.Instance); + + // ----------------------------------------------------------------------- + // ParseEvaluation - valid inputs + // ----------------------------------------------------------------------- + + [Theory] + [InlineData("{\"score\": true, \"reason\": \"x\"}", true, "x")] + [InlineData("{\"score\": false, \"reason\": \"nope\"}", false, "nope")] + [InlineData("{\"score\": \"true\", \"reason\": \"stringified pass\"}", true, "stringified pass")] + [InlineData("{\"score\": \"false\", \"reason\": \"stringified fail\"}", false, "stringified fail")] + public void ParseEvaluation_WellFormedJson_ParsesScoreAndReason(string output, bool expectedScore, string expectedReason) + { + var result = AzureOpenAiLauncher.ParseEvaluation(output); + + result.Should().NotBeNull(because: "a well-formed {score, reason} object must parse into a CheckEvaluation"); + result!.Score.Should().Be(expectedScore, because: "the score must reflect the boolean (or stringified boolean) in the response"); + result.Reason.Should().Be(expectedReason, because: "the reason text must be carried through verbatim"); + } + + [Fact] + public void ParseEvaluation_FencedJson_StripsFenceAndParses() + { + var output = "```json\n{\"score\": true, \"reason\": \"fenced response\"}\n```"; + + var result = AzureOpenAiLauncher.ParseEvaluation(output); + + result.Should().NotBeNull(because: "a model that wraps its JSON in a Markdown code fence must still be parsed"); + result!.Score.Should().BeTrue(); + result.Reason.Should().Be("fenced response"); + } + + [Fact] + public void ParseEvaluation_ProseWrappedJson_ExtractsObjectAndParses() + { + var output = "Here is my judgment: {\"score\": false, \"reason\": \"prose around the object\"} - hope that helps."; + + var result = AzureOpenAiLauncher.ParseEvaluation(output); + + result.Should().NotBeNull(because: "stray prose around the JSON object must be tolerated by narrowing to the outermost braces"); + result!.Score.Should().BeFalse(); + result.Reason.Should().Be("prose around the object"); + } + + [Fact] + public void ParseEvaluation_ReasonContainingBraces_PreservesBraces() + { + var output = "{\"score\": true, \"reason\": \"Schema {ok} valid.\"}"; + + var result = AzureOpenAiLauncher.ParseEvaluation(output); + + result.Should().NotBeNull(because: "braces inside the reason string must not break extraction of the outermost object"); + result!.Score.Should().BeTrue(); + result.Reason.Should().Be("Schema {ok} valid.", + because: "the literal braces inside the reason value must be preserved, not truncated by brace matching"); + } + + // ----------------------------------------------------------------------- + // ParseEvaluation - invalid inputs return null + // ----------------------------------------------------------------------- + + [Theory] + [InlineData(null)] + [InlineData("")] + [InlineData("no json here at all")] + [InlineData("{\"reason\":\"missing score\"}")] + public void ParseEvaluation_InvalidOrIncomplete_ReturnsNull(string? output) + { + var result = AzureOpenAiLauncher.ParseEvaluation(output); + + result.Should().BeNull( + because: "without a usable score the evaluator must treat the response as unscored and retry, not record a bogus result"); + } + + [Fact] + public void ParseEvaluation_PresentButNullScore_ParsesAsFalse() + { + // The "score" property is present (so it is not a missing-field case) but its value is + // JSON null. ParseEvaluation only returns null when the property is absent; a present + // score of any non-true/non-"true" kind collapses to false, which is the safe (fail) + // default for a malformed score rather than discarding the response entirely. + var result = AzureOpenAiLauncher.ParseEvaluation("{\"score\": null, \"reason\": \"explicit null score\"}"); + + result.Should().NotBeNull(because: "a present score property means the object is parsed, not rejected"); + result!.Score.Should().BeFalse(because: "a null score is not a pass, so it conservatively collapses to false"); + result.Reason.Should().Be("explicit null score"); + } + + // ----------------------------------------------------------------------- + // ExtractJson + // ----------------------------------------------------------------------- + + [Fact] + public void ExtractJson_NestedObject_PreservesInnerStructure() + { + var output = "{\"score\": true, \"reason\": \"ok\", \"nested\": {\"a\": 1}}"; + + var json = AzureOpenAiLauncher.ExtractJson(output); + + json.Should().NotBeNull(because: "a well-formed JSON object must be extractable from plain input"); + json.Should().Contain("\"nested\"", because: "a nested object must be retained within the outermost braces"); + json.Should().Contain("{\"a\": 1}", because: "the inner object must not be truncated"); + } + + [Fact] + public void ExtractJson_BraceInsideString_DoesNotTruncate() + { + var output = "{\"score\": true, \"reason\": \"has a } brace in text\"}"; + + var json = AzureOpenAiLauncher.ExtractJson(output); + + json.Should().NotBeNull(because: "a single self-contained object with no trailing prose must be extractable"); + json.Should().Be("{\"score\": true, \"reason\": \"has a } brace in text\"}", + because: "LastIndexOf finds the outermost closing brace, which is correct when no prose after the object contains additional braces"); + } + + [Fact] + public void ExtractJson_FencedJson_StripsFences() + { + var output = "```json\n{\"score\": true, \"reason\": \"ok\"}\n```"; + + var json = AzureOpenAiLauncher.ExtractJson(output); + + json.Should().NotBeNull(because: "a fenced JSON block must be returned after fence stripping"); + json.Should().NotContain("```", because: "Markdown code fences must be stripped before the JSON is returned"); + json.Should().Be("{\"score\": true, \"reason\": \"ok\"}", + because: "only the inner JSON object should remain after fence and whitespace stripping"); + } + + [Theory] + [InlineData(null)] + [InlineData("")] + [InlineData(" ")] + [InlineData("no braces at all")] + public void ExtractJson_NullBlankOrNoBrace_ReturnsNull(string? output) + { + var json = AzureOpenAiLauncher.ExtractJson(output); + + json.Should().BeNull( + because: "with no recoverable JSON object the caller must get null rather than an unparseable candidate"); + } + + // ----------------------------------------------------------------------- + // Static metadata / properties + // ----------------------------------------------------------------------- + + [Fact] + public void Engine_IsAzureOpenAi() + { + CreateLauncher().Engine.Should().Be(EvalEngine.AzureOpenAI); + } + + [Fact] + public void DisplayName_IsAzureOpenAi() + { + CreateLauncher().DisplayName.Should().Be("Azure OpenAI"); + } + + [Fact] + public void ScoresPerCheck_IsTrue() + { + CreateLauncher().ScoresPerCheck.Should().BeTrue( + because: "the Azure OpenAI judge scores each assertion independently with one model call per check"); + } + + [Fact] + public void AutoDetectable_IsFalse() + { + CreateLauncher().AutoDetectable.Should().BeFalse( + because: "--eval-engine auto must never select Azure OpenAI; a plain run must not spend tokens unless the user opts in"); + } + + [Fact] + public void AvailabilityHint_NamesBothEnvironmentVariables() + { + var hint = CreateLauncher().AvailabilityHint; + + hint.Should().Contain(EvalModelConstants.AzureOpenAiEndpointEnvVar, + because: "the hint must tell the user which endpoint environment variable to set"); + hint.Should().Contain(EvalModelConstants.AzureOpenAiDeploymentEnvVar, + because: "the hint must tell the user which deployment environment variable to set"); + } + + // ----------------------------------------------------------------------- + // BuildSingleCheckPrompt + // ----------------------------------------------------------------------- + + [Fact] + public void BuildSingleCheckPrompt_IncludesContextCheckAndStrictJsonFormat() + { + const string context = "TOOL_SCHEMA_MARKER_42"; + const string checkPrompt = "CHECK_PROMPT_MARKER_99"; + + var prompt = SemanticCheckPrompts.BuildSingleCheckPrompt(context, checkPrompt); + + prompt.Should().Contain(context, because: "the model needs the tool schema as grounding for the single check"); + prompt.Should().Contain(checkPrompt, because: "the assertion under evaluation must appear in the prompt"); + prompt.Should().Contain("{\"score\": true or false, \"reason\": \"one sentence\"}", + because: "the strict output contract pins the exact JSON shape the parser expects"); + } + + [Fact] + public void BuildSingleCheckPrompt_OmitsFileAndEditStrategyInstructions() + { + var prompt = SemanticCheckPrompts.BuildSingleCheckPrompt("some schema", "some check"); + + prompt.Should().NotContain("JSON file", + because: "per-check scoring inlines the schema in the prompt; there is no file for the model to read"); + prompt.Should().NotContain("EDIT STRATEGY", + because: "the per-check prompt returns a small object directly and never edits a checklist file"); + } +} diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/EvaluationPipelineServiceTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/EvaluationPipelineServiceTests.cs index 2f862e82..caa477b7 100644 --- a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/EvaluationPipelineServiceTests.cs +++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/EvaluationPipelineServiceTests.cs @@ -4,7 +4,11 @@ using FluentAssertions; using Microsoft.Agents.A365.DevTools.Cli.Exceptions; using Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate; +using Microsoft.Agents.A365.DevTools.Cli.Services; using Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using NSubstitute; using Xunit; namespace Microsoft.Agents.A365.DevTools.Cli.Tests.Services.Evaluate; @@ -25,6 +29,8 @@ public class EvaluationPipelineServiceTests [InlineData("GITHUB-COPILOT", EvalEngine.GitHubCopilot)] [InlineData("claude-code", EvalEngine.ClaudeCode)] [InlineData("Claude-Code", EvalEngine.ClaudeCode)] + [InlineData("azure-openai", EvalEngine.AzureOpenAI)] + [InlineData("Azure-OpenAI", EvalEngine.AzureOpenAI)] [InlineData("none", EvalEngine.None)] [InlineData("NONE", EvalEngine.None)] public void ParseEvalEngine_ValidValues_ReturnsCorrectEnum(string input, EvalEngine expected) @@ -104,4 +110,103 @@ public void DeriveServerName_EmptyString_ReturnsUnknownServer() result.Should().Be("unknown-server", because: "empty input must fall back to a stable placeholder so report generation still has a filename"); } + + // ----------------------------------------------------------------------- + // Trust-boundary warning (RunAsync) + // + // The warning fires immediately after ParseEvalEngine and before any server + // discovery, so a real pipeline with a substitute logger and mocked + // sub-dependencies surfaces it without contacting any MCP server or Azure + // OpenAI endpoint. The evaluator is stubbed to report CouldNotEvaluate so the + // pipeline short-circuits (exit 1) right after the warning, never reaching + // analysis or report generation. + // ----------------------------------------------------------------------- + + private static EvaluationPipelineService BuildPipelineWithEvaluatorOutcome( + ILogger logger, + EvaluationOutcome outcome) + { + var evaluator = Substitute.For(); + evaluator + .EvaluateAsync(Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(call => new ChecklistEvaluationResult + { + Checklist = call.Arg(), + Outcome = outcome, + }); + + var generator = Substitute.For(); + generator + .Generate(Arg.Any>(), Arg.Any(), Arg.Any()) + .Returns(new EvaluationChecklist()); + + // Return an empty (non-null) tool list so discovery logging does not NRE; the run still + // short-circuits at the evaluator outcome, which is all this warning test exercises. + var discovery = Substitute.For(); + discovery + .DiscoverToolsAsync(Arg.Any(), Arg.Any(), Arg.Any()) + .Returns(new List()); + + return new EvaluationPipelineService( + logger, + discovery, + generator, + evaluator, + Substitute.For(), + Substitute.For(), + Substitute.For()); + } + + private static void ReceivedWarningContaining(ILogger logger, string fragment) => + logger.Received().Log( + LogLevel.Warning, + Arg.Any(), + Arg.Is(o => o.ToString()!.Contains(fragment)), + Arg.Any(), + Arg.Any>()); + + private static void DidNotReceiveWarningContaining(ILogger logger, string fragment) => + logger.DidNotReceive().Log( + LogLevel.Warning, + Arg.Any(), + Arg.Is(o => o.ToString()!.Contains(fragment)), + Arg.Any(), + Arg.Any>()); + + // A unique, non-existent output directory so the pipeline always takes the fresh-discovery + // path (a leftover "_checklist.json" on disk would otherwise flip it to the resume + // path). Nothing is written here: the stubbed evaluator short-circuits the run before report + // generation, so no cleanup is needed. + private static string UniqueOutputDir() => + Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N")); + + [Fact] + public async Task RunAsync_AzureOpenAiEngine_WarnsAboutAzureOpenAiEndpointNotLocalAgent() + { + var logger = Substitute.For>(); + var pipeline = BuildPipelineWithEvaluatorOutcome(logger, EvaluationOutcome.CouldNotEvaluate); + + var exitCode = await pipeline.RunAsync( + "http://localhost/mcp", UniqueOutputDir(), "azure-openai", authToken: null, CancellationToken.None); + + exitCode.Should().Be(1, + because: "a CouldNotEvaluate outcome means scoring did not happen as requested, so the run must fail"); + ReceivedWarningContaining(logger, "Azure OpenAI endpoint"); + DidNotReceiveWarningContaining(logger, "locally running coding agent"); + } + + [Fact] + public async Task RunAsync_LocalAgentEngine_WarnsAboutLocalAgentNotAzureOpenAi() + { + var logger = Substitute.For>(); + var pipeline = BuildPipelineWithEvaluatorOutcome(logger, EvaluationOutcome.CouldNotEvaluate); + + var exitCode = await pipeline.RunAsync( + "http://localhost/mcp", UniqueOutputDir(), "claude-code", authToken: null, CancellationToken.None); + + exitCode.Should().Be(1, + because: "a CouldNotEvaluate outcome means scoring did not happen as requested, so the run must fail"); + ReceivedWarningContaining(logger, "locally running coding agent"); + DidNotReceiveWarningContaining(logger, "Azure OpenAI endpoint"); + } } From 070b26274074b1e963409ed0898a510a35b56497 Mon Sep 17 00:00:00 2001 From: "Ashrya Agrawal (from Dev Box)" Date: Mon, 22 Jun 2026 15:24:23 -0700 Subject: [PATCH 2/2] fix(evaluate): address PR #463 review and drop --report-name option SchemaDiscovery: replace the shared mutable _mcpSessionId field with a per-call McpSession holder so concurrent discoveries on the DI singleton no longer race; validate the server-supplied session id as visible ASCII (0x21-0x7E) and echo it via Headers.Add to block CR/LF header injection. AzureOpenAiLauncher: require an HTTPS endpoint in IsAvailableAsync (+ test) so tool metadata is never sent over plaintext http. Remove the --report-name option across command, interface, pipeline, tests, and CHANGELOG. Docs: recommend gpt-5.4 (the tested model) for A365_EVAL_AZURE_OPENAI_DEPLOYMENT. Trim the azure-openai CHANGELOG entry and remove the OpenAI version-pin rationale comment. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 3 +- .../a365-evaluate-faq.md | 2 +- .../a365-evaluate-instructions.md | 6 +- src/Directory.Packages.props | 4 - .../Commands/DevelopMcpCommand.cs | 9 +- .../Services/Evaluate/AzureOpenAiLauncher.cs | 12 ++- .../Evaluate/EvaluationPipelineService.cs | 7 +- .../Evaluate/IEvaluationPipelineService.cs | 5 +- .../Evaluate/SchemaDiscoveryService.cs | 82 ++++++++++++++----- .../EvaluateCommandInvocationTests.cs | 31 +------ .../AzureOpenAiLauncherAvailabilityTests.cs | 14 ++++ 11 files changed, 100 insertions(+), 75 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6015a2d..bb6b6a7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,8 +26,7 @@ Agents provisioned before this release need `Agent365.Observability.OtelWrite` g - Log separator written at the start of each CLI invocation now redacts values for secret-bearing options (e.g. `--idp-client-secret`) so they are not written to the log file in plain text. - Authentication context (tenant and user) is now logged at the `Information` level whenever the resolved sign-in identity changes, giving operators a clear audit trail in the log file of who the CLI is acting as, without exposing credentials. - `a365 develop-mcp evaluate` command for evaluating MCP server tool schema quality — runs deterministic and semantic checks (via GitHub Copilot or Claude Code CLIs), computes maturity scoring, and generates an interactive HTML report -- `--eval-engine azure-openai` on `develop-mcp evaluate` — scores each checklist item with your own Azure OpenAI deployment instead of a local coding agent. Authenticates with Microsoft Entra ID only (`DefaultAzureCredential`; API-key auth is not supported); configured via `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` (optional `A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY`, default 100). Each assertion is judged independently at temperature 0, in parallel, with the full tool schema as context. -- `--report-name` on `develop-mcp evaluate` — overrides the server name used for the output file names and the report header (derived from the URL host by default); needed when multiple servers share one gateway host so their reports don't collide. +- `--eval-engine azure-openai` on `develop-mcp evaluate` — scores semantic checks with your own Azure OpenAI deployment instead of a local coding agent, authenticating with Microsoft Entra ID only (`DefaultAzureCredential`; no API key). Configured via `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`. - `--skip-sp-provisioning` option on `setup all` — skips the interactive in-line provisioning of missing resource service principals (issue #429). Default: setup prompts per-resource and runs `az ad sp create --id ` using the operator's `az login`. With this flag, missing SPs are excluded from the consent URL and listed in the Action Required block with the `az` command and a per-SP consent URL. Implicitly enabled when stdin is redirected (CI / pipe scenarios). - Action Required block in the `setup all` summary now lists missing service principals — each entry shows the resource, pending scopes, the `az ad sp create` command, and the per-SP consent URL needed to complete provisioning. - `logs export [command] [--output ]` — exports a redacted copy of a CLI diagnostic log safe to share with Microsoft support. Redacts JWT tokens, email addresses, OS-path usernames, and tenant-specific GUIDs; replaces identical values with consistent aliases so log correlation is preserved. Preserves diagnostic IDs that aren't sensitive but are useful for debugging — `TraceId`, `CorrelationId`, Microsoft Graph `request-id` and `client-request-id` values, and well-known public Microsoft / Agent 365 resource appIds (such as the Microsoft Graph appId `00000003-0000-0000-c000-000000000000`). Omit `[command]` to export all available logs at once. diff --git a/docs/agent365-guided-setup/a365-evaluate-faq.md b/docs/agent365-guided-setup/a365-evaluate-faq.md index e637b664..a4997c8b 100644 --- a/docs/agent365-guided-setup/a365-evaluate-faq.md +++ b/docs/agent365-guided-setup/a365-evaluate-faq.md @@ -39,7 +39,7 @@ By default the evaluation uses **Claude Haiku 4.5**, invoked through whichever c You can also score with **your own Azure OpenAI deployment** via `--eval-engine azure-openai`. Set `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`, and sign in to Entra ID (e.g. `az login`). This engine authenticates with **Microsoft Entra ID only** (`DefaultAzureCredential`) — **API-key authentication is not supported**. Unlike the coding-agent engines it needs no local CLI: the `a365` CLI calls your Azure OpenAI deployment directly, under your own Azure subscription, billing, and access policies. -To run a **different model** without waiting for a CLI update, set an environment variable before the run: `A365_EVAL_COPILOT_MODEL` for GitHub Copilot (needs an exact model ID, e.g. `claude-haiku-4.5`) or `A365_EVAL_CLAUDE_MODEL` for Claude Code (accepts an alias, e.g. `haiku`). For Azure OpenAI, the model is whatever you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`. +To run a **different model** without waiting for a CLI update, set an environment variable before the run: `A365_EVAL_COPILOT_MODEL` for GitHub Copilot (needs an exact model ID, e.g. `claude-haiku-4.5`) or `A365_EVAL_CLAUDE_MODEL` for Claude Code (accepts an alias, e.g. `haiku`). For Azure OpenAI, the model is whatever you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` (`gpt-5.4` recommended — it is the model this engine was tested against). The CLI that runs the model is **yours**, installed from npm by you, authenticated with your credentials. Microsoft does not host or resell the model API call — it is made directly by your CLI to the AI provider under your own terms of service and billing. The `a365` CLI specifies the model flag but does not mediate the API call. diff --git a/docs/agent365-guided-setup/a365-evaluate-instructions.md b/docs/agent365-guided-setup/a365-evaluate-instructions.md index 4de1cb3f..137aa380 100644 --- a/docs/agent365-guided-setup/a365-evaluate-instructions.md +++ b/docs/agent365-guided-setup/a365-evaluate-instructions.md @@ -143,7 +143,7 @@ No local coding agent is used — the `a365` CLI calls your Azure OpenAI deploym ```bash echo "$A365_EVAL_AZURE_OPENAI_ENDPOINT" # e.g. https://.services.ai.azure.com/openai/v1 -echo "$A365_EVAL_AZURE_OPENAI_DEPLOYMENT" # e.g. gpt-4.1 +echo "$A365_EVAL_AZURE_OPENAI_DEPLOYMENT" # deployment name, e.g. gpt-5.4 (recommended) ``` If either is empty, ask the user for their Azure OpenAI endpoint (include the `/openai/v1` path) and deployment name, then set both variables. @@ -212,10 +212,10 @@ These settings can come from environment variables instead of (or alongside) the | `A365_EVAL_COPILOT_MODEL` | Override the GitHub Copilot model (exact model ID, e.g. `claude-haiku-4.5`). | | `A365_EVAL_CLAUDE_MODEL` | Override the Claude Code model (alias, e.g. `haiku`). | | `A365_EVAL_AZURE_OPENAI_ENDPOINT` | **Required for `--eval-engine azure-openai`.** Azure OpenAI endpoint, including the API path (e.g. `https://.services.ai.azure.com/openai/v1`). | -| `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` | **Required for `--eval-engine azure-openai`.** Deployment (model) name to score with (e.g. `gpt-4.1`). | +| `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` | **Required for `--eval-engine azure-openai`.** Deployment (model) name to score with. `gpt-5.4` is recommended — it is the model this engine was tested against. | | `A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY` | Parallel check-scoring calls for the `azure-openai` engine. Default `100`. | -For the coding-agent engines the model defaults to Claude Haiku 4.5; override only to move to a newer model without waiting for a CLI release. The `azure-openai` engine uses whatever deployment you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` and authenticates with Entra ID only (no API key — see Step 2). +For the coding-agent engines the model defaults to Claude Haiku 4.5; override only to move to a newer model without waiting for a CLI release. The `azure-openai` engine uses whatever deployment you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` (`gpt-5.4` recommended) and authenticates with Entra ID only (no API key — see Step 2). ### What you will see diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props index 6d92f5e4..e426181d 100644 --- a/src/Directory.Packages.props +++ b/src/Directory.Packages.props @@ -32,10 +32,6 @@ - diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs index e826b124..24d2a730 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs @@ -89,16 +89,10 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel "--auth-token", "Bearer token for MCP server authentication. Prefer the A365_MCP_AUTH_TOKEN environment variable; a token passed on the command line is visible to process listings (ps / Task Manager) and shell history."); - var reportNameOption = new Option( - "--report-name", - "Friendly name for the evaluated server, used for the output file names and the report's server name (e.g. \"salesforce\"). " + - "Defaults to a name derived from the server URL host. Set this when several servers share one gateway host so their reports don't collide."); - command.AddOption(serverUrlOption); command.AddOption(outputDirOption); command.AddOption(evalEngineOption); command.AddOption(authTokenOption); - command.AddOption(reportNameOption); command.SetHandler(async (System.CommandLine.Invocation.InvocationContext context) => { @@ -106,7 +100,6 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel var outputDir = context.ParseResult.GetValueForOption(outputDirOption)!; var evalEngine = context.ParseResult.GetValueForOption(evalEngineOption)!; var authToken = context.ParseResult.GetValueForOption(authTokenOption); - var reportName = context.ParseResult.GetValueForOption(reportNameOption); var ct = context.GetCancellationToken(); // Secret handling: a token on the command line is visible to other processes @@ -122,7 +115,7 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel authToken = Environment.GetEnvironmentVariable("A365_MCP_AUTH_TOKEN"); } - context.ExitCode = await pipelineService.RunAsync(serverUrl, outputDir, evalEngine, authToken, ct, reportName); + context.ExitCode = await pipelineService.RunAsync(serverUrl, outputDir, evalEngine, authToken, ct); }); return command; diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs index 0942c5cb..876c58c4 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs @@ -97,13 +97,23 @@ public Task IsAvailableAsync(CancellationToken cancellationToken = default return Task.FromResult(false); } - if (!Uri.TryCreate(endpoint, UriKind.Absolute, out _)) + if (!Uri.TryCreate(endpoint, UriKind.Absolute, out var endpointUri)) { _logger.LogWarning("Azure OpenAI endpoint '{Endpoint}' (from {EndpointVar}) is not a valid absolute URL.", endpoint, EvalModelConstants.AzureOpenAiEndpointEnvVar); return Task.FromResult(false); } + // The server's tool names and descriptions are sent to this endpoint for scoring, so + // require HTTPS to avoid transmitting them in plaintext. Real Azure OpenAI endpoints are + // HTTPS, so this also rejects an obviously misconfigured (e.g. http://) value early. + if (!string.Equals(endpointUri.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase)) + { + _logger.LogWarning("Azure OpenAI endpoint '{Endpoint}' (from {EndpointVar}) must use HTTPS.", + endpoint, EvalModelConstants.AzureOpenAiEndpointEnvVar); + return Task.FromResult(false); + } + return Task.FromResult(true); } diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs index 3090ba10..ef18256a 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs @@ -49,7 +49,7 @@ public EvaluationPipelineService( } /// - public async Task RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken, string? reportName = null) + public async Task RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken) { try { @@ -106,10 +106,7 @@ public async Task RunAsync(string serverUrl, string outputDir, string evalE // Run the derived name through the same sanitizer as the report filename so // any invalid-for-filesystem characters (?, *, <, etc.) from the fallback path // don't crash Path.Combine / File.Exists downstream. - // A friendly --report-name overrides the host-derived name for both the output - // file names and the report's displayed server name. Essential when many servers - // sit behind one gateway host (which would otherwise collide on the same name). - var serverName = !string.IsNullOrWhiteSpace(reportName) ? reportName! : DeriveServerName(serverUrl); + var serverName = DeriveServerName(serverUrl); var safeServerName = ReportGenerator.SanitizeFileName(serverName); var checklistPath = Path.Combine(outputDir, $"{safeServerName}_checklist.json"); diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs index 72fa72a5..591cfc71 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs @@ -17,9 +17,6 @@ public interface IEvaluationPipelineService /// Coding agent engine name (auto, github-copilot, claude-code, none). /// Optional bearer token for MCP server authentication. /// Cancellation token. - /// Optional friendly name for the evaluated server. When provided it is - /// used (sanitized) for the output filenames and the report's server name instead of the value - /// derived from the URL host — needed when many servers share one gateway host. /// Process exit code: 0 on success or an intentional bring-your-own-LLM stop; 1 when evaluation could not be performed as requested. - Task RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken, string? reportName = null); + Task RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken); } diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs index 7783959d..b15d10d0 100644 --- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs +++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs @@ -26,11 +26,6 @@ internal sealed class SchemaDiscoveryService : ISchemaDiscoveryService, IDisposa private readonly ILogger _logger; private readonly HttpClient _httpClient; - // MCP Streamable HTTP session id issued by the server on initialize. Servers that require - // a session (per spec) reject session-less follow-up requests with HTTP 400; we echo this - // header on every subsequent request. Reset per discovery (the service is a DI singleton). - private string? _mcpSessionId; - public SchemaDiscoveryService(ILogger logger, HttpMessageHandler? handler = null) { ArgumentNullException.ThrowIfNull(logger); @@ -68,18 +63,22 @@ public async Task> DiscoverToolsAsync(string serverUrl, string? } _logger.LogDebug("Starting MCP schema discovery against {ServerUrl}", serverUrl); - _mcpSessionId = null; // reset any session id captured from a previous discovery + + // The session id is scoped to this single discovery. The service is a DI singleton, so a + // shared mutable field would race if two discoveries ran concurrently — thread a per-call + // holder through the handshake instead. + var session = new McpSession(); try { // Step 1: Initialize - await SendInitializeAsync(serverUrl, authToken, cancellationToken); + await SendInitializeAsync(serverUrl, authToken, session, cancellationToken); // Step 2: Send initialized notification - await SendInitializedNotificationAsync(serverUrl, authToken, cancellationToken); + await SendInitializedNotificationAsync(serverUrl, authToken, session, cancellationToken); // Step 3: List tools - var tools = await SendToolsListAsync(serverUrl, authToken, cancellationToken); + var tools = await SendToolsListAsync(serverUrl, authToken, session, cancellationToken); if (tools.Count == 0) { @@ -145,7 +144,7 @@ public async Task> DiscoverToolsAsync(string serverUrl, string? } } - private async Task SendInitializeAsync(string serverUrl, string? authToken, CancellationToken cancellationToken) + private async Task SendInitializeAsync(string serverUrl, string? authToken, McpSession session, CancellationToken cancellationToken) { _logger.LogDebug("Sending MCP initialize request..."); @@ -166,7 +165,7 @@ private async Task SendInitializeAsync(string serverUrl, string? authToken, Canc id = 1 }); - using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, cancellationToken); + using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, session, cancellationToken); var responseBody = await ReadJsonResponseAsync(response, cancellationToken); // Validate JSON-RPC response @@ -191,7 +190,7 @@ private async Task SendInitializeAsync(string serverUrl, string? authToken, Canc _logger.LogDebug("MCP initialize succeeded."); } - private async Task SendInitializedNotificationAsync(string serverUrl, string? authToken, CancellationToken cancellationToken) + private async Task SendInitializedNotificationAsync(string serverUrl, string? authToken, McpSession session, CancellationToken cancellationToken) { _logger.LogDebug("Sending MCP initialized notification..."); @@ -203,12 +202,12 @@ private async Task SendInitializedNotificationAsync(string serverUrl, string? au }); // Notifications may not return a response body, but we still POST - using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, cancellationToken); + using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, session, cancellationToken); _logger.LogDebug("MCP initialized notification sent."); } - private async Task> SendToolsListAsync(string serverUrl, string? authToken, CancellationToken cancellationToken) + private async Task> SendToolsListAsync(string serverUrl, string? authToken, McpSession session, CancellationToken cancellationToken) { _logger.LogDebug("Sending MCP tools/list request..."); @@ -220,7 +219,7 @@ private async Task> SendToolsListAsync(string serverUrl, string id = 2 }); - using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, cancellationToken); + using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, session, cancellationToken); var responseBody = await ReadJsonResponseAsync(response, cancellationToken); using var doc = JsonDocument.Parse(responseBody); @@ -290,6 +289,7 @@ private async Task PostJsonRpcAsync( string serverUrl, string requestBody, string? authToken, + McpSession session, CancellationToken cancellationToken) { using var request = new HttpRequestMessage(HttpMethod.Post, serverUrl) @@ -307,18 +307,30 @@ private async Task PostJsonRpcAsync( } // Echo the MCP session id (captured from initialize) so session-required servers accept - // follow-up requests; stateless servers simply ignore an unexpected header. - if (!string.IsNullOrEmpty(_mcpSessionId)) + // follow-up requests; stateless servers simply ignore an unexpected header. The value was + // validated as visible ASCII at capture time, so Headers.Add (which validates) won't throw. + if (!string.IsNullOrEmpty(session.Id)) { - request.Headers.TryAddWithoutValidation("Mcp-Session-Id", _mcpSessionId); + request.Headers.Add("Mcp-Session-Id", session.Id); } var response = await _httpClient.SendAsync(request, cancellationToken); // Capture the session id from the first response (initialize) for reuse on later calls. - if (string.IsNullOrEmpty(_mcpSessionId) && response.Headers.TryGetValues("Mcp-Session-Id", out var sessionValues)) + // The id is server-controlled, so validate it before it is ever echoed back into a request + // header: reject anything outside visible ASCII (0x21-0x7E), which blocks CR/LF and other + // control characters that could otherwise enable header injection. + if (string.IsNullOrEmpty(session.Id) && response.Headers.TryGetValues("Mcp-Session-Id", out var sessionValues)) { - _mcpSessionId = sessionValues.FirstOrDefault(); + var candidate = sessionValues.FirstOrDefault(); + if (!string.IsNullOrEmpty(candidate) && IsValidMcpSessionId(candidate)) + { + session.Id = candidate; + } + else if (!string.IsNullOrEmpty(candidate)) + { + _logger.LogWarning("Ignoring malformed Mcp-Session-Id from server (must be visible ASCII 0x21-0x7E)."); + } } if (!response.IsSuccessStatusCode) @@ -383,4 +395,34 @@ private async Task ReadJsonResponseAsync(HttpResponseMessage response, C _logger.LogWarning("Could not extract JSON from SSE response"); return body; } + + /// + /// Validates a server-supplied MCP session id before it is echoed back in a request header. + /// Per the MCP spec the id must contain only visible ASCII characters (0x21-0x7E); enforcing + /// that here rejects CR/LF and other control characters that could enable header injection. + /// + private static bool IsValidMcpSessionId(string value) + { + foreach (var c in value) + { + if (c < '!' || c > '~') + { + return false; + } + } + + return value.Length > 0; + } + + /// + /// Per-discovery holder for the MCP Streamable HTTP session id issued by the server on + /// initialize. Scoped to a single call so concurrent + /// discoveries on this singleton service never share or overwrite each other's session id. + /// Servers that require a session (per spec) reject session-less follow-up requests with + /// HTTP 400, so the id is echoed on every subsequent request once captured. + /// + private sealed class McpSession + { + public string? Id; + } } diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs index feb956ec..3f15df47 100644 --- a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs +++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs @@ -41,7 +41,7 @@ private static IEvaluationPipelineService PipelineReturning(int exitCode) { var pipeline = Substitute.For(); pipeline - .RunAsync(Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any()) + .RunAsync(Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any(), Arg.Any()) .Returns(exitCode); return pipeline; } @@ -99,8 +99,7 @@ await pipeline.Received(1).RunAsync( Arg.Any(), Arg.Any(), "env-token-xyz", - Arg.Any(), - Arg.Any()); + Arg.Any()); } finally { @@ -197,8 +196,7 @@ await pipeline.Received(1).RunAsync( "out", "claude-code", Arg.Any(), - Arg.Any(), - Arg.Any()); + Arg.Any()); } [Fact] @@ -217,27 +215,6 @@ await pipeline.Received(1).RunAsync( Arg.Any(), "azure-openai", Arg.Any(), - Arg.Any(), - Arg.Any()); - } - - [Fact] - public async Task InvokeAsync_ReportNameFlag_ForwardsReportNameAsSixthArgument() - { - var pipeline = PipelineReturning(0); - var evaluate = GetEvaluateSubcommand(Substitute.For(), pipeline); - var parser = BuildFaithfulParser(evaluate); - - var exitCode = await parser.InvokeAsync( - new[] { "--server-url", "http://localhost/mcp", "--eval-engine", "none", "--report-name", "salesforce" }); - - exitCode.Should().Be(0, because: "RunAsync returned 0, which the handler must propagate as the process exit code"); - await pipeline.Received(1).RunAsync( - Arg.Any(), - Arg.Any(), - Arg.Any(), - Arg.Any(), - Arg.Any(), - "salesforce"); + Arg.Any()); } } diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs index b4824f1c..29650dbd 100644 --- a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs +++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs @@ -103,6 +103,20 @@ await WithEnvAsync(endpoint: "not-a-url", deployment: "gpt-4.1", async () => }); } + [Fact] + public async Task IsAvailableAsync_PlaintextHttpEndpoint_ReturnsFalse() + { + // A valid absolute URL, but http:// — the engine transmits tool names and descriptions to + // this endpoint, so a plaintext scheme must be rejected rather than silently used. + await WithEnvAsync(endpoint: "http://x.openai.azure.com/openai/v1", deployment: "gpt-4.1", async () => + { + var available = await CreateLauncher().IsAvailableAsync(); + + available.Should().BeFalse( + because: "tool metadata is sent to the endpoint, so a non-HTTPS endpoint must be treated as unavailable to avoid plaintext transmission"); + }); + } + [Fact] public async Task IsAvailableAsync_BothSetWithValidHttpsEndpoint_ReturnsTrue() {