From 6e99797ecbe385da7454fe14d484f42c54eef1fd Mon Sep 17 00:00:00 2001
From: "Ashrya Agrawal (from Dev Box)" <ashragrawal@microsoft.com>
Date: Thu, 18 Jun 2026 13:45:39 -0700
Subject: [PATCH 1/2] feat(evaluate): add Azure OpenAI judge (--eval-engine
 azure-openai)

Score each MCP tool-schema checklist item with your own Azure OpenAI
deployment via the OpenAI Responses API, authenticated with Microsoft
Entra ID (DefaultAzureCredential; API-key auth is not supported).

- Check-by-check: each assertion is one model call at temperature 0 with
  the full tool schema as context, fanned out in parallel
  (A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY, default 100). Avoids the output
  truncation the whole-tool approach hit on large tools.
- Single-flight Entra token cache so high concurrency does not storm `az`.
- Checklist checkpointed each round; a re-run resumes only unscored checks.
- Explicit-only engine (never auto-selected); fixed retry of 3 with backoff.
- New --report-name overrides the host-derived output/report name (needed
  when multiple servers share one gateway host).
- OpenAI pinned to 2.8.0 to keep System.Text.Json and
  System.Diagnostics.DiagnosticSource at 9.0.8 (no shared-dependency bump).
- MCP discovery now tracks Mcp-Session-Id for session-required servers.
- Docs (evaluate instructions + FAQ), CHANGELOG, and tests included.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md                                  |   2 +
 .../a365-evaluate-faq.md                      |   4 +-
 .../a365-evaluate-instructions.md             |  36 +-
 src/Directory.Packages.props                  |   5 +
 .../Commands/DevelopMcpCommand.cs             |  21 +-
 .../Microsoft.Agents.A365.DevTools.Cli.csproj |   3 +
 .../Models/Evaluate/EvaluateEnums.cs          |   1 +
 .../Program.cs                                |   3 +
 .../Services/Evaluate/AzureOpenAiLauncher.cs  | 431 ++++++++++++++++++
 .../Services/Evaluate/ChecklistEvaluator.cs   | 279 +++++++++---
 .../Evaluate/CodingAgentLauncherBase.cs       |  19 +
 .../Services/Evaluate/EvalModelConstants.cs   |  54 +++
 .../Evaluate/EvaluationPipelineService.cs     |  23 +-
 .../Services/Evaluate/ICodingAgentLauncher.cs |  40 ++
 .../Evaluate/IEvaluationPipelineService.cs    |   5 +-
 .../Evaluate/SchemaDiscoveryService.cs        |  19 +
 .../Services/Evaluate/SemanticCheckPrompts.cs |  52 +++
 .../EvaluateCommandInvocationTests.cs         |  48 +-
 .../AzureOpenAiLauncherAvailabilityTests.cs   | 130 ++++++
 .../Evaluate/AzureOpenAiLauncherTests.cs      | 233 ++++++++++
 .../EvaluationPipelineServiceTests.cs         | 105 +++++
 21 files changed, 1436 insertions(+), 77 deletions(-)
 create mode 100644 src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs
 create mode 100644 src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs
 create mode 100644 src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherTests.cs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9c9615c2..c6015a2d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,8 @@ Agents provisioned before this release need `Agent365.Observability.OtelWrite` g
 - Log separator written at the start of each CLI invocation now redacts values for secret-bearing options (e.g. `--idp-client-secret`) so they are not written to the log file in plain text.
 - Authentication context (tenant and user) is now logged at the `Information` level whenever the resolved sign-in identity changes, giving operators a clear audit trail in the log file of who the CLI is acting as, without exposing credentials.
 - `a365 develop-mcp evaluate` command for evaluating MCP server tool schema quality — runs deterministic and semantic checks (via GitHub Copilot or Claude Code CLIs), computes maturity scoring, and generates an interactive HTML report
+- `--eval-engine azure-openai` on `develop-mcp evaluate` — scores each checklist item with your own Azure OpenAI deployment instead of a local coding agent. Authenticates with Microsoft Entra ID only (`DefaultAzureCredential`; API-key auth is not supported); configured via `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` (optional `A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY`, default 100). Each assertion is judged independently at temperature 0, in parallel, with the full tool schema as context.
+- `--report-name` on `develop-mcp evaluate` — overrides the server name used for the output file names and the report header (derived from the URL host by default); needed when multiple servers share one gateway host so their reports don't collide.
 - `--skip-sp-provisioning` option on `setup all` — skips the interactive in-line provisioning of missing resource service principals (issue #429). Default: setup prompts per-resource and runs `az ad sp create --id <appId>` using the operator's `az login`. With this flag, missing SPs are excluded from the consent URL and listed in the Action Required block with the `az` command and a per-SP consent URL. Implicitly enabled when stdin is redirected (CI / pipe scenarios).
 - Action Required block in the `setup all` summary now lists missing service principals — each entry shows the resource, pending scopes, the `az ad sp create` command, and the per-SP consent URL needed to complete provisioning.
 - `logs export [command] [--output <dir>]` — exports a redacted copy of a CLI diagnostic log safe to share with Microsoft support. Redacts JWT tokens, email addresses, OS-path usernames, and tenant-specific GUIDs; replaces identical values with consistent aliases so log correlation is preserved. Preserves diagnostic IDs that aren't sensitive but are useful for debugging — `TraceId`, `CorrelationId`, Microsoft Graph `request-id` and `client-request-id` values, and well-known public Microsoft / Agent 365 resource appIds (such as the Microsoft Graph appId `00000003-0000-0000-c000-000000000000`). Omit `[command]` to export all available logs at once.
diff --git a/docs/agent365-guided-setup/a365-evaluate-faq.md b/docs/agent365-guided-setup/a365-evaluate-faq.md
index 1b7bbc1f..e637b664 100644
--- a/docs/agent365-guided-setup/a365-evaluate-faq.md
+++ b/docs/agent365-guided-setup/a365-evaluate-faq.md
@@ -37,7 +37,9 @@ Microsoft's role is supplying the `a365` CLI that orchestrates steps 1–2. Step
 
 By default the evaluation uses **Claude Haiku 4.5**, invoked through whichever coding agent CLI you have installed (GitHub Copilot CLI or Claude Code CLI). You can override the engine with `--eval-engine github-copilot` or `--eval-engine claude-code`.
 
-To run a **different model** without waiting for a CLI update, set an environment variable before the run: `A365_EVAL_COPILOT_MODEL` for GitHub Copilot (needs an exact model ID, e.g. `claude-haiku-4.5`) or `A365_EVAL_CLAUDE_MODEL` for Claude Code (accepts an alias, e.g. `haiku`).
+You can also score with **your own Azure OpenAI deployment** via `--eval-engine azure-openai`. Set `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`, and sign in to Entra ID (e.g. `az login`). This engine authenticates with **Microsoft Entra ID only** (`DefaultAzureCredential`) — **API-key authentication is not supported**. Unlike the coding-agent engines it needs no local CLI: the `a365` CLI calls your Azure OpenAI deployment directly, under your own Azure subscription, billing, and access policies.
+
+To run a **different model** without waiting for a CLI update, set an environment variable before the run: `A365_EVAL_COPILOT_MODEL` for GitHub Copilot (needs an exact model ID, e.g. `claude-haiku-4.5`) or `A365_EVAL_CLAUDE_MODEL` for Claude Code (accepts an alias, e.g. `haiku`). For Azure OpenAI, the model is whatever you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`.
 
 The CLI that runs the model is **yours**, installed from npm by you, authenticated with your credentials. Microsoft does not host or resell the model API call — it is made directly by your CLI to the AI provider under your own terms of service and billing. The `a365` CLI specifies the model flag but does not mediate the API call.
 
diff --git a/docs/agent365-guided-setup/a365-evaluate-instructions.md b/docs/agent365-guided-setup/a365-evaluate-instructions.md
index 6388be9f..4de1cb3f 100644
--- a/docs/agent365-guided-setup/a365-evaluate-instructions.md
+++ b/docs/agent365-guided-setup/a365-evaluate-instructions.md
@@ -32,15 +32,17 @@ Wait for the answer. Store as `needsAuth`:
 1. Auto — try GitHub Copilot first, fall back to Claude Code
 2. GitHub Copilot only
 3. Claude Code only
-4. None — generate the checklist and let me (or my own LLM) score it manually (bring-your-own-LLM)
+4. Azure OpenAI — score with your own Azure OpenAI deployment via Entra ID (no local coding agent; typically the fastest option)
+5. None — generate the checklist and let me (or my own LLM) score it manually (bring-your-own-LLM)
 
 Wait for the answer. Store as `evalEngine`:
 - If **1 (Auto)**: `evalEngine = "auto"`
 - If **2 (GitHub Copilot)**: `evalEngine = "github-copilot"`
 - If **3 (Claude Code)**: `evalEngine = "claude-code"`
-- If **4 (None)**: `evalEngine = "none"`
+- If **4 (Azure OpenAI)**: `evalEngine = "azure-openai"` — also confirm the user has set `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` and is signed in to Entra ID (see Step 2).
+- If **5 (None)**: `evalEngine = "none"`
 
-> **Note:** Auto and the named engines require the corresponding CLI to be installed on the workstation (`copilot` or `claude`). If neither is installed and `evalEngine` is `auto` or a named engine, the pipeline will stop after writing the checklist and print BYO-LLM instructions — that is expected, not an error.
+> **Note:** The local engines (`auto`, `github-copilot`, `claude-code`) require the corresponding CLI on the workstation (`copilot` or `claude`); if none is installed the pipeline stops after writing the checklist and prints BYO-LLM instructions — that is expected, not an error. The `azure-openai` engine needs **no local CLI** — instead it requires the `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` environment variables and an Entra ID sign-in (e.g. `az login`); see Step 2.
 
 After all three questions are answered, create all todos for the path and mark Todo 1 in-progress:
 
@@ -133,6 +135,27 @@ Tell the user one of the following is required, and that without one of them the
 - Install Claude Code (see above), or
 - Switch to `evalEngine = "none"` and score the checklist with their own LLM after the run.
 
+### If `evalEngine = "azure-openai"`
+
+No local coding agent is used — the `a365` CLI calls your Azure OpenAI deployment directly. Verify two things:
+
+1. **Endpoint and deployment are configured.** The CLI reads these from the environment (they are not flags):
+
+```bash
+echo "$A365_EVAL_AZURE_OPENAI_ENDPOINT"     # e.g. https://<resource>.services.ai.azure.com/openai/v1
+echo "$A365_EVAL_AZURE_OPENAI_DEPLOYMENT"   # e.g. gpt-4.1
+```
+
+If either is empty, ask the user for their Azure OpenAI endpoint (include the `/openai/v1` path) and deployment name, then set both variables.
+
+2. **The workstation is signed in to Entra ID.** The engine authenticates with `DefaultAzureCredential` and requests a token for `https://ai.azure.com/.default`. The simplest sign-in is:
+
+```bash
+az login
+```
+
+> **Authentication — Microsoft Entra ID only.** The `azure-openai` engine supports **Entra ID authentication only**, via `DefaultAzureCredential` (Azure CLI sign-in, managed identity, a service principal supplied through environment variables, or Visual Studio / VS Code sign-in). **API-key authentication is NOT supported** — no key is read from the environment or any config file. The model call is made directly by the `a365` CLI to *your* Azure OpenAI deployment, under your Azure subscription, billing, and access policies.
+
 ### Step 2 completion
 
 > Mark Todo 2 as **completed**. Mark Todo 3 as **in-progress**. Proceed to Step 3.
@@ -188,8 +211,11 @@ These settings can come from environment variables instead of (or alongside) the
 | `A365_MCP_AUTH_TOKEN` | Bearer token for the MCP server, used when `--auth-token` is not passed. **Preferred over the flag** — it keeps the token out of process listings (`ps` / Task Manager) and shell history. If you pass `--auth-token`, the CLI prints a one-time warning recommending this variable. |
 | `A365_EVAL_COPILOT_MODEL` | Override the GitHub Copilot model (exact model ID, e.g. `claude-haiku-4.5`). |
 | `A365_EVAL_CLAUDE_MODEL` | Override the Claude Code model (alias, e.g. `haiku`). |
+| `A365_EVAL_AZURE_OPENAI_ENDPOINT` | **Required for `--eval-engine azure-openai`.** Azure OpenAI endpoint, including the API path (e.g. `https://<resource>.services.ai.azure.com/openai/v1`). |
+| `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` | **Required for `--eval-engine azure-openai`.** Deployment (model) name to score with (e.g. `gpt-4.1`). |
+| `A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY` | Parallel check-scoring calls for the `azure-openai` engine. Default `100`. |
 
-The model defaults to Claude Haiku 4.5; override only to move to a newer model without waiting for a CLI release.
+For the coding-agent engines the model defaults to Claude Haiku 4.5; override only to move to a newer model without waiting for a CLI release. The `azure-openai` engine uses whatever deployment you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` and authenticates with Entra ID only (no API key — see Step 2).
 
 ### What you will see
 
@@ -268,6 +294,8 @@ If any step results in an error, stop and analyze the error message carefully.
 | `Unauthorized` from `tools/list` | Wrong or expired bearer token | Re-acquire the token (Question 2). |
 | `Could not parse server URL` | URL doesn't include the protocol | Add `http://` or `https://` to the URL. |
 | `No coding agent detected` after `[2/5]` | Neither `copilot` nor `claude` is on PATH | Install one (Step 2) or pass `--eval-engine none`. |
+| `Azure OpenAI is not available` / engine not selected | `azure-openai` chosen but `A365_EVAL_AZURE_OPENAI_ENDPOINT` / `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` are unset | Set both env vars (Step 2). |
+| `Azure OpenAI authentication failed` | Not signed in to Entra ID (or no usable credential) for the `azure-openai` engine | Run `az login` (or configure a managed identity / service-principal credential). |
 | `Failed to write report to <path>` | Output dir not writable | Choose a different `--output-dir` or fix permissions. |
 | Telemetry warning at debug level | Non-blocking — the marker call failed | Ignore. The evaluation runs regardless. |
 
diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props
index f3fccf3a..6d92f5e4 100644
--- a/src/Directory.Packages.props
+++ b/src/Directory.Packages.props
@@ -32,6 +32,11 @@
     <PackageVersion Include="Azure.Identity" Version="1.13.2" />
     <PackageVersion Include="Azure.ResourceManager" Version="1.13.2" />
     <PackageVersion Include="Azure.ResourceManager.AppService" Version="1.2.0" />
+    <!-- OpenAI SDK (Responses API) — Azure OpenAI BYO-LLM judge for the evaluate command.
+         Pinned to 2.8.0: the last release whose transitive System.ClientModel (1.8.1) does NOT
+         force System.Text.Json / System.Diagnostics.DiagnosticSource up to 10.x. 2.9.0+ pull
+         System.ClientModel >= 1.9.0 which requires the 10.x runtime libraries. -->
+    <PackageVersion Include="OpenAI" Version="2.8.0" />
     <!-- Microsoft Graph -->
     <PackageVersion Include="Microsoft.Graph" Version="5.36.0" />
     <!-- Transitive pin: resolves NU1107 conflict between Microsoft.Extensions.Http 9.x (needs >= 9.0.8)
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs
index 941b1d10..e826b124 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs
@@ -56,10 +56,11 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel
         var command = new Command(
             "evaluate",
             "Evaluate MCP server tool schema quality and generate an HTML report. " +
-            "Uses a locally installed coding agent (GitHub Copilot or Claude Code) to score semantic checks. " +
+            "Scores semantic checks with a locally installed coding agent (GitHub Copilot or Claude Code), " +
+            "or with your own Azure OpenAI deployment via --eval-engine azure-openai. " +
             "If no agent is detected, the command stops after writing the checklist so you can score it manually with your own LLM, " +
             "or pass --eval-engine none to skip agent probing entirely. " +
-            "Only run this against MCP servers you trust: the server's tool names and descriptions are sent to a locally running coding agent.");
+            "Only run this against MCP servers you trust: the server's tool names and descriptions are sent to the scoring engine.");
 
         // Use a required option (not a positional argument) for consistency with other
         // develop-mcp subcommands and Azure CLI conventions.
@@ -78,19 +79,26 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel
         var evalEngineOption = new Option<string>(
             "--eval-engine",
             getDefaultValue: () => "auto",
-            "Which local coding agent scores semantic checks. " +
-            "auto: try github-copilot then claude-code. " +
-            "github-copilot or claude-code: use only that engine. " +
+            "Which engine scores semantic checks. " +
+            "auto: try github-copilot then claude-code (local coding agents). " +
+            "github-copilot or claude-code: use only that local agent. " +
+            "azure-openai: use your own Azure OpenAI deployment as the judge via Entra ID (set A365_EVAL_AZURE_OPENAI_ENDPOINT and A365_EVAL_AZURE_OPENAI_DEPLOYMENT). " +
             "none: skip automatic scoring and expect the checklist to be pre-scored (bring-your-own-LLM).");
 
         var authTokenOption = new Option<string?>(
             "--auth-token",
             "Bearer token for MCP server authentication. Prefer the A365_MCP_AUTH_TOKEN environment variable; a token passed on the command line is visible to process listings (ps / Task Manager) and shell history.");
 
+        var reportNameOption = new Option<string?>(
+            "--report-name",
+            "Friendly name for the evaluated server, used for the output file names and the report's server name (e.g. \"salesforce\"). " +
+            "Defaults to a name derived from the server URL host. Set this when several servers share one gateway host so their reports don't collide.");
+
         command.AddOption(serverUrlOption);
         command.AddOption(outputDirOption);
         command.AddOption(evalEngineOption);
         command.AddOption(authTokenOption);
+        command.AddOption(reportNameOption);
 
         command.SetHandler(async (System.CommandLine.Invocation.InvocationContext context) =>
         {
@@ -98,6 +106,7 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel
             var outputDir = context.ParseResult.GetValueForOption(outputDirOption)!;
             var evalEngine = context.ParseResult.GetValueForOption(evalEngineOption)!;
             var authToken = context.ParseResult.GetValueForOption(authTokenOption);
+            var reportName = context.ParseResult.GetValueForOption(reportNameOption);
             var ct = context.GetCancellationToken();
 
             // Secret handling: a token on the command line is visible to other processes
@@ -113,7 +122,7 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel
                 authToken = Environment.GetEnvironmentVariable("A365_MCP_AUTH_TOKEN");
             }
 
-            context.ExitCode = await pipelineService.RunAsync(serverUrl, outputDir, evalEngine, authToken, ct);
+            context.ExitCode = await pipelineService.RunAsync(serverUrl, outputDir, evalEngine, authToken, ct, reportName);
         });
 
         return command;
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Microsoft.Agents.A365.DevTools.Cli.csproj b/src/Microsoft.Agents.A365.DevTools.Cli/Microsoft.Agents.A365.DevTools.Cli.csproj
index 04bcea8c..bd9bdac3 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Microsoft.Agents.A365.DevTools.Cli.csproj
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Microsoft.Agents.A365.DevTools.Cli.csproj
@@ -46,6 +46,9 @@
     <PackageReference Include="Azure.ResourceManager" />
     <PackageReference Include="Azure.ResourceManager.AppService" />
 
+    <!-- OpenAI SDK (Azure OpenAI BYO-LLM judge for evaluate) -->
+    <PackageReference Include="OpenAI" />
+
     <!-- Microsoft Graph -->
     <PackageReference Include="Microsoft.Graph" />
   </ItemGroup>
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Models/Evaluate/EvaluateEnums.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Models/Evaluate/EvaluateEnums.cs
index deeffc40..d2b8a860 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Models/Evaluate/EvaluateEnums.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Models/Evaluate/EvaluateEnums.cs
@@ -56,5 +56,6 @@ public enum EvalEngine
     Auto,
     GitHubCopilot,
     ClaudeCode,
+    AzureOpenAI,
     None
 }
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Program.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Program.cs
index 6661d462..07cc52c2 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Program.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Program.cs
@@ -401,6 +401,9 @@ private static void ConfigureServices(IServiceCollection services, LogLevel mini
         // adding its ICodingAgentLauncher implementation file plus one line here.
         services.AddSingleton<ICodingAgentLauncher, GitHubCopilotLauncher>();
         services.AddSingleton<ICodingAgentLauncher, ClaudeCodeLauncher>();
+        // Azure OpenAI judge (BYO model via Entra ID). Explicit-only (AutoDetectable=false),
+        // so it never participates in --eval-engine auto despite being registered here.
+        services.AddSingleton<ICodingAgentLauncher, AzureOpenAiLauncher>();
         services.AddSingleton<IChecklistEvaluator, ChecklistEvaluator>();
         services.AddSingleton<IEvaluationAnalyzer, EvaluationAnalyzer>();
         services.AddSingleton<IReportGenerator, ReportGenerator>();
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs
new file mode 100644
index 00000000..0942c5cb
--- /dev/null
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs
@@ -0,0 +1,431 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.ClientModel;
+using System.ClientModel.Primitives;
+using System.Text.Json;
+using Azure.Core;
+using Azure.Identity;
+using Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;
+using Microsoft.Extensions.Logging;
+using OpenAI;
+using OpenAI.Responses;
+
+namespace Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate;
+
+// The OpenAI Responses API surface is annotated [Experimental("OPENAI001")] in the
+// 2.x SDK. Suppress for this file; the API shape used here is small and stable.
+#pragma warning disable OPENAI001
+
+/// <summary>
+/// Scores semantic checks with a user-provided Azure OpenAI deployment via the OpenAI
+/// Responses API, authenticated with Entra ID (<see cref="DefaultAzureCredential"/>).
+///
+/// This engine scores each check INDEPENDENTLY (<see cref="ScoresPerCheck"/> = true): one
+/// model call per assertion, with the full tool schema passed as context and temperature 0.
+/// Per-check calls keep every response tiny (no truncation on large tools) and low-variance;
+/// the evaluator fans them out concurrently. The coding-agent launchers (Copilot, Claude)
+/// instead edit a whole-tool file via <see cref="LaunchAsync"/>, which this engine does not use.
+///
+/// Configuration is environment-driven (no secrets on the command line):
+///   <c>A365_EVAL_AZURE_OPENAI_ENDPOINT</c>        (required) e.g. https://foo.services.ai.azure.com/openai/v1
+///   <c>A365_EVAL_AZURE_OPENAI_DEPLOYMENT</c>      (required) e.g. gpt-4.1
+///   <c>A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY</c> (optional) parallel check calls, default 100
+/// The Entra ID scope (https://ai.azure.com/.default) and per-call retry count (3) are fixed.
+///
+/// Explicit-only: <see cref="AutoDetectable"/> is false, so <c>--eval-engine auto</c>
+/// never selects it. A plain evaluate run never sends content to a model endpoint or
+/// spends tokens unless the user opts in with <c>--eval-engine azure-openai</c>.
+/// </summary>
+internal sealed class AzureOpenAiLauncher : ICodingAgentLauncher
+{
+    private readonly ILogger<AzureOpenAiLauncher> _logger;
+
+    // DefaultAzureCredential probes several credential sources on construction; build it
+    // once (the launcher is a DI singleton) and reuse across all check calls.
+    private DefaultAzureCredential? _credential;
+
+    // The ResponsesClient is thread-safe for concurrent calls; build it once and share it
+    // across every per-check scoring call (avoids thousands of client instances at high dop).
+    private ResponsesClient? _client;
+    private readonly object _clientLock = new();
+
+    public AzureOpenAiLauncher(ILogger<AzureOpenAiLauncher> logger)
+    {
+        ArgumentNullException.ThrowIfNull(logger);
+        _logger = logger;
+    }
+
+    public EvalEngine Engine => EvalEngine.AzureOpenAI;
+
+    public string DisplayName => "Azure OpenAI";
+
+    // Unused by this engine: it builds its own self-contained prompt from the checklist
+    // content rather than a coding-agent prompt that names read/edit tools. Present only
+    // to satisfy the interface.
+    public SemanticCheckPrompts.AgentToolset Toolset => new(ReadToolName: "read", EditToolName: "edit");
+
+    // No CLI binary backs this engine; it is never probed on PATH (IsAvailableAsync is
+    // overridden) and is excluded from the auto-detection list.
+    public string CliCommand => "azure-openai";
+
+    public bool AutoDetectable => false;
+
+    public string AvailabilityHint =>
+        $"the {EvalModelConstants.AzureOpenAiEndpointEnvVar} and {EvalModelConstants.AzureOpenAiDeploymentEnvVar} environment variables";
+
+    // Direct API: score many checks concurrently (vs. 1 for subprocess agents).
+    public int MaxConcurrency => EvalModelConstants.AzureOpenAiMaxConcurrency;
+
+    // This engine evaluates each check on its own (per-check), not a whole-tool file.
+    public bool ScoresPerCheck => true;
+
+    /// <summary>
+    /// Available when the endpoint and deployment environment variables are set to a
+    /// usable value. Credentials are not probed here (that would require a token call);
+    /// an auth failure surfaces from <see cref="LaunchAsync"/> with actionable guidance.
+    /// </summary>
+    public Task<bool> IsAvailableAsync(CancellationToken cancellationToken = default)
+    {
+        var endpoint = EvalModelConstants.AzureOpenAiEndpoint;
+        var deployment = EvalModelConstants.AzureOpenAiDeployment;
+
+        if (endpoint is null || deployment is null)
+        {
+            _logger.LogDebug("Azure OpenAI engine unavailable: {EndpointVar} and/or {DeploymentVar} not set.",
+                EvalModelConstants.AzureOpenAiEndpointEnvVar, EvalModelConstants.AzureOpenAiDeploymentEnvVar);
+            return Task.FromResult(false);
+        }
+
+        if (!Uri.TryCreate(endpoint, UriKind.Absolute, out _))
+        {
+            _logger.LogWarning("Azure OpenAI endpoint '{Endpoint}' (from {EndpointVar}) is not a valid absolute URL.",
+                endpoint, EvalModelConstants.AzureOpenAiEndpointEnvVar);
+            return Task.FromResult(false);
+        }
+
+        return Task.FromResult(true);
+    }
+
+    /// <inheritdoc />
+    /// <remarks>Not used by this engine — it scores per check via <see cref="ScoreCheckAsync"/>.
+    /// The evaluator routes per-check engines down a different path and never calls this.</remarks>
+    public Task<bool> LaunchAsync(string prompt, string workingDirectory, TimeSpan timeout, CancellationToken cancellationToken = default)
+        => throw new NotSupportedException("Azure OpenAI scores checks individually; the evaluator calls ScoreCheckAsync instead of LaunchAsync.");
+
+    /// <inheritdoc />
+    public async Task<CheckEvaluation?> ScoreCheckAsync(string context, string checkPrompt, CancellationToken cancellationToken = default)
+    {
+        ArgumentException.ThrowIfNullOrWhiteSpace(context);
+        ArgumentException.ThrowIfNullOrWhiteSpace(checkPrompt);
+
+        var endpoint = EvalModelConstants.AzureOpenAiEndpoint;
+        var deployment = EvalModelConstants.AzureOpenAiDeployment;
+        if (endpoint is null || deployment is null)
+        {
+            // IsAvailableAsync gates this; guard anyway so a direct call can't NRE.
+            _logger.LogError("Azure OpenAI engine requires {EndpointVar} and {DeploymentVar}.",
+                EvalModelConstants.AzureOpenAiEndpointEnvVar, EvalModelConstants.AzureOpenAiDeploymentEnvVar);
+            return null;
+        }
+
+        try
+        {
+            var client = GetClient(endpoint, deployment);
+            var prompt = SemanticCheckPrompts.BuildSingleCheckPrompt(context, checkPrompt);
+            var options = new CreateResponseOptions(new[] { ResponseItem.CreateUserMessageItem(prompt) }, deployment)
+            {
+                // Deterministic scoring to minimize hallucination and variance across checks.
+                Temperature = 0f,
+            };
+
+            ClientResult<ResponseResult> result = await client.CreateResponseAsync(options, cancellationToken);
+            return ParseEvaluation(result.Value.GetOutputText());
+        }
+        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
+        {
+            throw;
+        }
+        catch (AuthenticationFailedException ex)
+        {
+            _logger.LogError("Azure OpenAI authentication failed for scope {Scope}. Run 'az login' or verify your Entra credentials. {Message}",
+                EvalModelConstants.AzureOpenAiScope, ex.Message);
+            return null;
+        }
+        catch (ClientResultException ex)
+        {
+            // HTTP failure surfaced by the OpenAI SDK (e.g. 401 auth, 404 wrong deployment, 429 throttling after retries).
+            _logger.LogWarning("Azure OpenAI request failed (HTTP {Status}): {Message}", ex.Status, ex.Message);
+            return null;
+        }
+        catch (Exception ex)
+        {
+            _logger.LogWarning(ex, "Azure OpenAI check scoring failed unexpectedly.");
+            return null;
+        }
+    }
+
+    /// <summary>
+    /// Builds (once) and reuses a single <see cref="ResponsesClient"/>. The client is thread-safe
+    /// for concurrent calls, so every check scoring shares it. Retry count is configurable.
+    /// </summary>
+    private ResponsesClient GetClient(string endpoint, string deployment)
+    {
+        if (_client is not null)
+        {
+            return _client;
+        }
+
+        lock (_clientLock)
+        {
+            if (_client is null)
+            {
+                _credential ??= new DefaultAzureCredential();
+                var scope = EvalModelConstants.AzureOpenAiScope;
+                // Bridge the Azure.Core credential to System.ClientModel's auth abstraction the
+                // OpenAI SDK consumes, so no Azure.Identity version bump is required.
+                var authPolicy = new BearerTokenPolicy(new AzureCredentialTokenProvider(_credential, scope), scope);
+                _client = new ResponsesClient(deployment, authPolicy, new OpenAIClientOptions
+                {
+                    Endpoint = new Uri(endpoint),
+                    // Retry 429/transient with exponential backoff honoring Retry-After. Fixed at 3;
+                    // if checks still fail after that, re-running the command resumes only the
+                    // unscored ones. The logging subclass surfaces each retry so throttling is visible.
+                    RetryPolicy = new LoggingRetryPolicy(maxRetries: 3, _logger),
+                });
+            }
+
+            return _client;
+        }
+    }
+
+    /// <summary>
+    /// Parses the model's per-check response into a <see cref="CheckEvaluation"/>, tolerating
+    /// code fences and stray prose. Returns null when no usable <c>{score, reason}</c> is present.
+    /// </summary>
+    internal static CheckEvaluation? ParseEvaluation(string? output)
+    {
+        var json = ExtractJson(output);
+        if (json is null)
+        {
+            return null;
+        }
+
+        try
+        {
+            using var doc = JsonDocument.Parse(json);
+            var root = doc.RootElement;
+            if (!root.TryGetProperty("score", out var scoreEl))
+            {
+                return null;
+            }
+
+            bool score = scoreEl.ValueKind switch
+            {
+                JsonValueKind.True => true,
+                JsonValueKind.False => false,
+                JsonValueKind.String => bool.TryParse(scoreEl.GetString(), out var b) && b,
+                _ => false,
+            };
+
+            var reason = root.TryGetProperty("reason", out var reasonEl) && reasonEl.ValueKind == JsonValueKind.String
+                ? reasonEl.GetString()!
+                : "No reason provided.";
+
+            return new CheckEvaluation(score, reason);
+        }
+        catch (JsonException)
+        {
+            return null;
+        }
+    }
+
+    /// <summary>
+    /// Extracts a JSON object from raw model output: strips Markdown code fences and any
+    /// surrounding prose, then verifies the candidate parses. Returns null when no valid
+    /// JSON object can be recovered (the evaluator then retries the attempt).
+    /// </summary>
+    internal static string? ExtractJson(string? output)
+    {
+        if (string.IsNullOrWhiteSpace(output))
+        {
+            return null;
+        }
+
+        var text = output.Trim();
+
+        // Strip a leading ```json (or ```) fence and its closing ``` if present.
+        if (text.StartsWith("```", StringComparison.Ordinal))
+        {
+            var firstNewline = text.IndexOf('\n');
+            if (firstNewline >= 0)
+            {
+                text = text[(firstNewline + 1)..];
+            }
+
+            var lastFence = text.LastIndexOf("```", StringComparison.Ordinal);
+            if (lastFence >= 0)
+            {
+                text = text[..lastFence];
+            }
+
+            text = text.Trim();
+        }
+
+        // Narrow to the outermost { ... } in case the model added stray prose.
+        var start = text.IndexOf('{');
+        var end = text.LastIndexOf('}');
+        if (start < 0 || end <= start)
+        {
+            return null;
+        }
+
+        var candidate = text[start..(end + 1)];
+        try
+        {
+            using var _ = JsonDocument.Parse(candidate, new JsonDocumentOptions
+            {
+                AllowTrailingCommas = true,
+                CommentHandling = JsonCommentHandling.Skip
+            });
+            return candidate;
+        }
+        catch (JsonException)
+        {
+            return null;
+        }
+    }
+}
+
+// The classes below (AzureCredentialTokenProvider, LoggingRetryPolicy) use no experimental
+// OpenAI APIs, so the OPENAI001 suppression should not extend to them.
+#pragma warning restore OPENAI001
+
+/// <summary>
+/// Adapts an Azure.Core <see cref="TokenCredential"/> (e.g. <see cref="DefaultAzureCredential"/>)
+/// to System.ClientModel's <see cref="AuthenticationTokenProvider"/>, which the OpenAI SDK's
+/// <see cref="BearerTokenPolicy"/> consumes. This bridge lets the evaluate command authenticate
+/// with Entra ID without bumping the repo's shared Azure.Identity version.
+/// </summary>
+internal sealed class AzureCredentialTokenProvider : AuthenticationTokenProvider
+{
+    private readonly TokenCredential _credential;
+    private readonly string[] _scopes;
+
+    // Single-flight token cache. Under high concurrency (hundreds/thousands of parallel
+    // per-check calls) the underlying credential would otherwise be hit once per call —
+    // and AzureCliCredential spawns an `az` process per acquisition, so the burst storms the
+    // CLI and times out. We acquire one token, cache it until shortly before expiry, and
+    // serialize refreshes through a gate so only a single acquisition is ever in flight.
+    private readonly SemaphoreSlim _gate = new(1, 1);
+    private static readonly TimeSpan RefreshSkew = TimeSpan.FromMinutes(5);
+    private volatile CachedToken? _cache;
+
+    private sealed record CachedToken(string Value, DateTimeOffset ExpiresOn);
+
+    public AzureCredentialTokenProvider(TokenCredential credential, string scope)
+    {
+        ArgumentNullException.ThrowIfNull(credential);
+        ArgumentException.ThrowIfNullOrWhiteSpace(scope);
+        _credential = credential;
+        _scopes = new[] { scope };
+    }
+
+    public override GetTokenOptions CreateTokenOptions(IReadOnlyDictionary<string, object> properties) => new(properties);
+
+    public override AuthenticationToken GetToken(GetTokenOptions options, CancellationToken cancellationToken)
+        => GetTokenAsync(options, cancellationToken).AsTask().GetAwaiter().GetResult();
+
+    public override async ValueTask<AuthenticationToken> GetTokenAsync(GetTokenOptions options, CancellationToken cancellationToken)
+    {
+        var cached = _cache;
+        if (IsFresh(cached))
+        {
+            return ToToken(cached!);
+        }
+
+        await _gate.WaitAsync(cancellationToken).ConfigureAwait(false);
+        try
+        {
+            // Another caller may have refreshed while we waited on the gate.
+            cached = _cache;
+            if (IsFresh(cached))
+            {
+                return ToToken(cached!);
+            }
+
+            AccessToken token = await _credential.GetTokenAsync(new TokenRequestContext(_scopes), cancellationToken).ConfigureAwait(false);
+            var fresh = new CachedToken(token.Token, token.ExpiresOn);
+            _cache = fresh;
+            return ToToken(fresh);
+        }
+        finally
+        {
+            _gate.Release();
+        }
+    }
+
+    private static bool IsFresh(CachedToken? c) => c is not null && c.ExpiresOn - DateTimeOffset.UtcNow > RefreshSkew;
+
+    private static AuthenticationToken ToToken(CachedToken c) => new(c.Value, "Bearer", c.ExpiresOn, refreshOn: null);
+}
+
+/// <summary>
+/// A <see cref="ClientRetryPolicy"/> that logs whenever it retries a request, so rate-limit
+/// (HTTP 429) and transient failures are visible instead of being silently absorbed. The
+/// retry decision, exponential backoff, and Retry-After handling are inherited unchanged from
+/// the base policy; this subclass only adds the log line on each retry.
+/// </summary>
+internal sealed class LoggingRetryPolicy : ClientRetryPolicy
+{
+    private readonly ILogger _logger;
+
+    public LoggingRetryPolicy(int maxRetries, ILogger logger) : base(maxRetries)
+    {
+        ArgumentNullException.ThrowIfNull(logger);
+        _logger = logger;
+    }
+
+    protected override bool ShouldRetry(PipelineMessage message, Exception? exception)
+    {
+        var shouldRetry = base.ShouldRetry(message, exception);
+        if (shouldRetry)
+        {
+            LogRetry(message.Response?.Status ?? 0, exception);
+        }
+
+        return shouldRetry;
+    }
+
+    protected override async ValueTask<bool> ShouldRetryAsync(PipelineMessage message, Exception? exception)
+    {
+        var shouldRetry = await base.ShouldRetryAsync(message, exception).ConfigureAwait(false);
+        if (shouldRetry)
+        {
+            LogRetry(message.Response?.Status ?? 0, exception);
+        }
+
+        return shouldRetry;
+    }
+
+    private void LogRetry(int status, Exception? exception)
+    {
+        if (status == 429)
+        {
+            _logger.LogWarning("        Azure OpenAI rate-limited (HTTP 429); backing off and retrying the call.");
+        }
+        else if (status == 503)
+        {
+            _logger.LogWarning("        Azure OpenAI unavailable (HTTP 503); backing off and retrying the call.");
+        }
+        else if (status >= 400)
+        {
+            _logger.LogDebug("        Azure OpenAI transient failure (HTTP {Status}); retrying the call.", status);
+        }
+        else
+        {
+            // No HTTP response captured (network error / timeout); the exception carries the detail.
+            _logger.LogDebug(exception, "        Azure OpenAI transient error; retrying the call.");
+        }
+    }
+}
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ChecklistEvaluator.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ChecklistEvaluator.cs
index d23fd5b0..ecf0e545 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ChecklistEvaluator.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ChecklistEvaluator.cs
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 
+using System.Collections.Concurrent;
 using System.Text.Json;
 using System.Text.RegularExpressions;
 using Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;
@@ -114,57 +115,14 @@ public async Task<ChecklistEvaluationResult> EvaluateAsync(
         // actually did the work (rather than the user's "auto" request).
         EvalEngine? engineUsed = null;
 
-        // Evaluate each tool using extract-evaluate-merge pattern.
-        // The full checklist is ~1MB which is too large for coding agents.
-        // Instead, extract each tool to a small temp file (~25KB), have the
-        // agent evaluate it, then merge the results back into the checklist.
-        for (int i = 0; i < checklist.Tools.Count; i++)
-        {
-            cancellationToken.ThrowIfCancellationRequested();
-
-            var tool = checklist.Tools[i];
-            var unevaluated = CountUnevaluatedSemanticChecks(tool);
-            if (unevaluated == 0)
-            {
-                continue;
-            }
-
-            // Heartbeat BEFORE the tool runs so the user sees forward motion immediately.
-            // Each tool can take minutes (multi-attempt Copilot/Claude invocations), so
-            // logging only on completion leaves long silent gaps that look like a hang.
-            _logger.LogInformation("      [{Current}/{Total}] {ToolName} ({CheckCount} checks) ... running",
-                i + 1, checklist.Tools.Count, tool.Name, unevaluated);
-
-            var toolEngine = await EvaluateToolChecks(tool, enginesToTry, cancellationToken);
-            if (toolEngine is not null)
-            {
-                engineUsed ??= toolEngine;
-                _logger.LogInformation("      [{Current}/{Total}] {ToolName} ... ok",
-                    i + 1, checklist.Tools.Count, tool.Name);
-            }
-            else
-            {
-                _logger.LogWarning("      [{Current}/{Total}] {ToolName} ... failed (continuing)",
-                    i + 1, checklist.Tools.Count, tool.Name);
-            }
-        }
-
-        // Evaluate server-level checks (extract server_checks + tool list summary)
-        var serverUnevaluated = checklist.ServerChecks.Count(c => c.Type == CheckType.Semantic && c.Score is null);
-        if (serverUnevaluated > 0)
-        {
-            _logger.LogInformation("      server-level checks ({Count} checks) ... running", serverUnevaluated);
-            var serverEngine = await EvaluateServerChecks(checklist, enginesToTry, cancellationToken);
-            if (serverEngine is not null)
-            {
-                engineUsed ??= serverEngine;
-                _logger.LogInformation("      server-level checks ... ok");
-            }
-            else
-            {
-                _logger.LogWarning("      server-level checks ... failed (continuing)");
-            }
-        }
+        // Pick the scoring path from the engine. A per-check judge (Azure OpenAI) evaluates
+        // each assertion independently with the full tool schema as context; subprocess coding
+        // agents edit a whole-tool file. A per-check engine is explicit-only, so when chosen it
+        // is the sole entry in enginesToTry.
+        var primaryLauncher = enginesToTry.Count == 1 ? LauncherFor(enginesToTry[0]) : null;
+        engineUsed = primaryLauncher?.ScoresPerCheck == true
+            ? await EvaluatePerCheck(checklist, primaryLauncher, checklistPath, cancellationToken)
+            : await EvaluatePerTool(checklist, enginesToTry, cancellationToken);
 
         // Write the updated checklist back (with all merged results)
         var updatedJson = JsonSerializer.Serialize(checklist, WriteOptions);
@@ -196,6 +154,209 @@ public async Task<ChecklistEvaluationResult> EvaluateAsync(
         };
     }
 
+    /// <summary>
+    /// Whole-tool scoring path for subprocess coding agents: extract each tool to a sandbox
+    /// file, have the agent edit it, then merge results back. Tools run concurrently up to the
+    /// engine's MaxConcurrency (1 for coding agents). Returns the engine that did the work.
+    /// </summary>
+    private async Task<EvalEngine?> EvaluatePerTool(EvaluationChecklist checklist, List<EvalEngine> enginesToTry, CancellationToken cancellationToken)
+    {
+        if (enginesToTry.Count == 0)
+        {
+            return null;
+        }
+
+        EvalEngine? engineUsed = null;
+
+        // When several engines could be tried via fallthrough, use the most conservative dop.
+        // Each tool writes to its own sandbox and mutates only its own ToolChecklist, so
+        // concurrent iterations don't share state; only engineUsed needs guarding.
+        int dop = enginesToTry.Count == 0
+            ? 1
+            : Math.Max(1, enginesToTry.Min(e => LauncherFor(e)?.MaxConcurrency ?? 1));
+
+        var engineLock = new object();
+        await Parallel.ForEachAsync(
+            Enumerable.Range(0, checklist.Tools.Count),
+            new ParallelOptions { MaxDegreeOfParallelism = dop, CancellationToken = cancellationToken },
+            async (i, ct) =>
+            {
+                var tool = checklist.Tools[i];
+                var unevaluated = CountUnevaluatedSemanticChecks(tool);
+                if (unevaluated == 0)
+                {
+                    return;
+                }
+
+                // Heartbeat BEFORE the tool runs so the user sees forward motion immediately.
+                _logger.LogInformation("      [{Current}/{Total}] {ToolName} ({CheckCount} checks) ... running",
+                    i + 1, checklist.Tools.Count, tool.Name, unevaluated);
+
+                var toolEngine = await EvaluateToolChecks(tool, enginesToTry, ct);
+                if (toolEngine is not null)
+                {
+                    lock (engineLock)
+                    {
+                        engineUsed ??= toolEngine;
+                    }
+                    _logger.LogInformation("      [{Current}/{Total}] {ToolName} ... ok",
+                        i + 1, checklist.Tools.Count, tool.Name);
+                }
+                else
+                {
+                    _logger.LogWarning("      [{Current}/{Total}] {ToolName} ... failed (continuing)",
+                        i + 1, checklist.Tools.Count, tool.Name);
+                }
+            });
+
+        var serverUnevaluated = checklist.ServerChecks.Count(c => c.Type == CheckType.Semantic && c.Score is null);
+        if (serverUnevaluated > 0)
+        {
+            _logger.LogInformation("      server-level checks ({Count} checks) ... running", serverUnevaluated);
+            var serverEngine = await EvaluateServerChecks(checklist, enginesToTry, cancellationToken);
+            if (serverEngine is not null)
+            {
+                engineUsed ??= serverEngine;
+                _logger.LogInformation("      server-level checks ... ok");
+            }
+            else
+            {
+                _logger.LogWarning("      server-level checks ... failed (continuing)");
+            }
+        }
+
+        return engineUsed;
+    }
+
+    /// <summary>
+    /// Per-check scoring path for a direct-API judge (Azure OpenAI): score every unscored
+    /// Semantic check independently, passing the FULL tool schema as context for each single
+    /// assertion, fanning out concurrently up to the engine's MaxConcurrency. Failed checks are
+    /// retried up to <see cref="MaxAttempts"/> rounds. Results are collected concurrently then
+    /// applied to the checklist items serially; the checklist is persisted after each round so an
+    /// interrupted run resumes from the scored items rather than re-scoring everything.
+    /// </summary>
+    private async Task<EvalEngine?> EvaluatePerCheck(EvaluationChecklist checklist, ICodingAgentLauncher launcher, string checklistPath, CancellationToken cancellationToken)
+    {
+        var work = new List<(ChecklistItem Item, string Context)>();
+
+        foreach (var tool in checklist.Tools)
+        {
+            var toolContext = BuildToolContext(tool);
+            foreach (var item in tool.Checks.ToolName.Concat(tool.Checks.ToolDescription).Concat(tool.Checks.SchemaStructure))
+            {
+                if (item.Type == CheckType.Semantic && item.Score is null)
+                {
+                    work.Add((item, toolContext));
+                }
+            }
+
+            foreach (var (paramName, paramChecks) in tool.Checks.Parameters)
+            {
+                var paramContext = $"{toolContext}\n\nParameter under evaluation: \"{paramName}\"";
+                foreach (var item in paramChecks.ParamName.Concat(paramChecks.ParamDescription))
+                {
+                    if (item.Type == CheckType.Semantic && item.Score is null)
+                    {
+                        work.Add((item, paramContext));
+                    }
+                }
+            }
+        }
+
+        var serverContext = BuildServerContext(checklist);
+        foreach (var item in checklist.ServerChecks)
+        {
+            if (item.Type == CheckType.Semantic && item.Score is null)
+            {
+                work.Add((item, serverContext));
+            }
+        }
+
+        if (work.Count == 0)
+        {
+            return launcher.Engine;
+        }
+
+        int dop = Math.Max(1, launcher.MaxConcurrency);
+        _logger.LogInformation("      Scoring {Count} check(s) independently — check-by-check, concurrency {Dop}", work.Count, dop);
+
+        int succeeded = 0;
+        var pending = work;
+        for (int attempt = 1; attempt <= MaxAttempts && pending.Count > 0; attempt++)
+        {
+            // Score concurrently but do NOT mutate the shared ChecklistItem objects inside the
+            // parallel body — collect (item, result) pairs, then apply them serially afterward.
+            var scored = new ConcurrentBag<(ChecklistItem Item, CheckEvaluation Result)>();
+            var failed = new ConcurrentBag<(ChecklistItem Item, string Context)>();
+            int doneInRound = 0;
+            int roundTotal = pending.Count;
+            await Parallel.ForEachAsync(
+                pending,
+                new ParallelOptions { MaxDegreeOfParallelism = dop, CancellationToken = cancellationToken },
+                async (w, ct) =>
+                {
+                    var result = await launcher.ScoreCheckAsync(w.Context, w.Item.Prompt, ct);
+                    if (result is not null)
+                    {
+                        scored.Add((w.Item, result));
+                    }
+                    else
+                    {
+                        failed.Add(w);
+                    }
+
+                    var n = Interlocked.Increment(ref doneInRound);
+                    if (n % 50 == 0 || n == roundTotal)
+                    {
+                        _logger.LogInformation("      ... {Done}/{Total} checks scored", n, roundTotal);
+                    }
+                });
+
+            // Apply results serially — no concurrent writes to ChecklistItem fields.
+            foreach (var (item, result) in scored)
+            {
+                item.Score = result.Score;
+                item.Reason = result.Reason;
+            }
+
+            succeeded += scored.Count;
+            pending = failed.ToList();
+
+            // Checkpoint after each round so an interrupted run resumes from the persisted scores
+            // rather than re-scoring everything.
+            await WriteChecklistAsync(checklist, checklistPath, cancellationToken);
+
+            if (pending.Count > 0 && attempt < MaxAttempts)
+            {
+                _logger.LogInformation("      {Count} check(s) failed; retrying (attempt {Next}/{Max})", pending.Count, attempt + 1, MaxAttempts);
+            }
+        }
+
+        if (pending.Count > 0)
+        {
+            _logger.LogWarning("      {Count} check(s) could not be scored after {Max} attempt(s)", pending.Count, MaxAttempts);
+        }
+
+        return succeeded > 0 ? launcher.Engine : null;
+    }
+
+    /// <summary>Full tool schema string passed as per-check context for a tool's checks.</summary>
+    private static string BuildToolContext(ToolChecklist tool)
+    {
+        var schema = tool.InputSchema.HasValue
+            ? JsonSerializer.Serialize(tool.InputSchema.Value, WriteOptions)
+            : "{}";
+        return $"Tool name: {tool.Name}\nTool description: {tool.Description}\nInput schema (JSON):\n{schema}";
+    }
+
+    /// <summary>Tool-set summary passed as per-check context for server-level checks.</summary>
+    private static string BuildServerContext(EvaluationChecklist checklist)
+    {
+        var summary = string.Join("\n", checklist.Tools.Select(t => $"- {t.Name}: {t.Description}"));
+        return $"Evaluate against the full tool set of this MCP server:\n{summary}";
+    }
+
     /// <summary>
     /// Extracts a single tool to a temp file, invokes the coding agent to evaluate
     /// its semantic checks, then merges the scored results back into the tool object.
@@ -540,10 +701,17 @@ private async Task<List<EvalEngine>> BuildEngineList(EvalEngine requested, Cance
             return [];
         }
 
-        // Auto: detect all available engines, preserving the registered priority order
+        // Auto: detect all available engines, preserving the registered priority order.
+        // Skip explicit-only engines (e.g. a remote API judge) so auto never selects one
+        // the user didn't ask for — those run only via an explicit --eval-engine value.
         var available = new List<EvalEngine>();
         foreach (var launcher in _launchers)
         {
+            if (!launcher.AutoDetectable)
+            {
+                continue;
+            }
+
             if (await launcher.IsAvailableAsync(cancellationToken))
             {
                 _logger.LogDebug("Detected {Engine}", launcher.Engine);
@@ -638,7 +806,8 @@ private void LogManualEvaluationInstructions(string checklistPath, int unscoredC
             if (requested == EvalEngine.Auto)
             {
                 // Built from the registry so a newly added engine appears here automatically.
-                var probed = string.Join(" and ", _launchers.Select(l => $"{l.DisplayName} (`{l.CliCommand}`)"));
+                // Only auto-detectable engines are probed under `auto`, so list just those.
+                var probed = string.Join(" and ", _launchers.Where(l => l.AutoDetectable).Select(l => $"{l.DisplayName} (`{l.CliCommand}`)"));
                 _logger.LogWarning("      No coding agent CLI detected (looked for {Probed}). Run with -v to see why each probe failed.", probed);
             }
             else
@@ -646,8 +815,8 @@ private void LogManualEvaluationInstructions(string checklistPath, int unscoredC
                 // The user asked for one specific engine; name only that one, not both.
                 var launcher = LauncherFor(requested);
                 var name = launcher?.DisplayName ?? FormatEngineName(requested);
-                var binary = launcher?.CliCommand ?? requested.ToString();
-                _logger.LogWarning("      {Name} CLI not found on PATH (looked for `{Binary}`). Run with -v to see why the probe failed.", name, binary);
+                var hint = launcher?.AvailabilityHint ?? $"`{requested}`";
+                _logger.LogWarning("      {Name} is not available — needs {Hint}. Run with -v for details.", name, hint);
             }
         }
         else if (agentAttempted)
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/CodingAgentLauncherBase.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/CodingAgentLauncherBase.cs
index 598592a8..521a4643 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/CodingAgentLauncherBase.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/CodingAgentLauncherBase.cs
@@ -66,6 +66,17 @@ protected CodingAgentLauncherBase(CommandExecutor executor, ILogger logger)
     /// <inheritdoc />
     public abstract string CliCommand { get; }
 
+    /// <inheritdoc />
+    /// <remarks>Subprocess agents run one at a time; a direct-API engine overrides this.</remarks>
+    public virtual int MaxConcurrency => 1;
+
+    /// <inheritdoc />
+    /// <remarks>CLI-backed engines are auto-detectable; subclasses that incur remote cost override this.</remarks>
+    public virtual bool AutoDetectable => true;
+
+    /// <inheritdoc />
+    public virtual string AvailabilityHint => $"the {CliCommand} CLI on PATH";
+
     /// <inheritdoc />
     public Task<bool> IsAvailableAsync(CancellationToken cancellationToken = default)
         => ProbeCommandAsync(CliCommand, "--version", cancellationToken);
@@ -73,6 +84,14 @@ public Task<bool> IsAvailableAsync(CancellationToken cancellationToken = default
     /// <inheritdoc />
     public abstract Task<bool> LaunchAsync(string prompt, string workingDirectory, TimeSpan timeout, CancellationToken cancellationToken = default);
 
+    /// <inheritdoc />
+    /// <remarks>Subprocess coding agents edit a whole-tool file; per-check scoring does not apply.</remarks>
+    public virtual bool ScoresPerCheck => false;
+
+    /// <inheritdoc />
+    public virtual Task<CheckEvaluation?> ScoreCheckAsync(string context, string checkPrompt, CancellationToken cancellationToken = default)
+        => throw new NotSupportedException($"{DisplayName} edits a whole-tool checklist file and does not score checks individually.");
+
     /// <summary>
     /// Runs a process and waits for it to complete, capturing stdout/stderr.
     /// Optionally pipes content via stdin. Kills the process on timeout to
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvalModelConstants.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvalModelConstants.cs
index 954525e8..30cc5f10 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvalModelConstants.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvalModelConstants.cs
@@ -19,8 +19,16 @@ internal static class EvalModelConstants
     /// <summary>Environment variable that overrides the Claude Code model.</summary>
     public const string ClaudeModelEnvVar = "A365_EVAL_CLAUDE_MODEL";
 
+    /// <summary>Environment variable holding the Azure OpenAI endpoint, including the
+    /// API path (e.g. https://my-resource.services.ai.azure.com/openai/v1).</summary>
+    public const string AzureOpenAiEndpointEnvVar = "A365_EVAL_AZURE_OPENAI_ENDPOINT";
+
+    /// <summary>Environment variable holding the Azure OpenAI deployment (model) name.</summary>
+    public const string AzureOpenAiDeploymentEnvVar = "A365_EVAL_AZURE_OPENAI_DEPLOYMENT";
+
     private const string DefaultCopilotModel = "claude-haiku-4.5";
     private const string DefaultClaudeModel = "haiku";
+    private const string DefaultAzureOpenAiScope = "https://ai.azure.com/.default";
 
     /// <summary>Copilot model ID; overridable via <see cref="CopilotModelEnvVar"/>.</summary>
     public static string CopilotModel => FromEnvOrDefault(CopilotModelEnvVar, DefaultCopilotModel);
@@ -28,9 +36,55 @@ internal static class EvalModelConstants
     /// <summary>Claude model ID/alias; overridable via <see cref="ClaudeModelEnvVar"/>.</summary>
     public static string ClaudeModel => FromEnvOrDefault(ClaudeModelEnvVar, DefaultClaudeModel);
 
+    /// <summary>Azure OpenAI endpoint, or null when unset. There is no default: it is
+    /// resource-specific and must be supplied via <see cref="AzureOpenAiEndpointEnvVar"/>.</summary>
+    public static string? AzureOpenAiEndpoint => NullIfBlank(Environment.GetEnvironmentVariable(AzureOpenAiEndpointEnvVar));
+
+    /// <summary>Azure OpenAI deployment (model) name, or null when unset. Supplied via
+    /// <see cref="AzureOpenAiDeploymentEnvVar"/>.</summary>
+    public static string? AzureOpenAiDeployment => NullIfBlank(Environment.GetEnvironmentVariable(AzureOpenAiDeploymentEnvVar));
+
+    /// <summary>Entra ID token scope for Azure OpenAI (fixed; not user-configurable).</summary>
+    public static string AzureOpenAiScope => DefaultAzureOpenAiScope;
+
+    /// <summary>Environment variable overriding how many checks the Azure OpenAI judge scores concurrently.</summary>
+    public const string AzureOpenAiMaxConcurrencyEnvVar = "A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY";
+
+    private const int DefaultAzureOpenAiMaxConcurrency = 100;
+
+    /// <summary>Max concurrent Azure OpenAI scoring calls; overridable via
+    /// <see cref="AzureOpenAiMaxConcurrencyEnvVar"/>. Clamped to [1, 4096]. Higher is faster but
+    /// more likely to hit endpoint rate limits (429), which are retried with backoff per call.</summary>
+    public static int AzureOpenAiMaxConcurrency
+    {
+        get
+        {
+            var raw = Environment.GetEnvironmentVariable(AzureOpenAiMaxConcurrencyEnvVar);
+            return int.TryParse(raw, out var n) && n >= 1 ? Math.Min(n, 4096) : DefaultAzureOpenAiMaxConcurrency;
+        }
+    }
+
     private static string FromEnvOrDefault(string envVar, string fallback)
     {
         var value = Environment.GetEnvironmentVariable(envVar);
         return string.IsNullOrWhiteSpace(value) ? fallback : value;
     }
+
+    private static string? NullIfBlank(string? value)
+    {
+        if (string.IsNullOrWhiteSpace(value))
+        {
+            return null;
+        }
+
+        var trimmed = value.Trim();
+        // Strip a single pair of wrapping quotes that copy-paste from a portal or `SET VAR="..."` can leave.
+        if (trimmed.Length >= 2 &&
+            ((trimmed[0] == '"' && trimmed[^1] == '"') || (trimmed[0] == '\'' && trimmed[^1] == '\'')))
+        {
+            trimmed = trimmed[1..^1].Trim();
+        }
+
+        return string.IsNullOrWhiteSpace(trimmed) ? null : trimmed;
+    }
 }
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs
index eaa0ad47..3090ba10 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs
@@ -49,7 +49,7 @@ public EvaluationPipelineService(
     }
 
     /// <inheritdoc />
-    public async Task<int> RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken)
+    public async Task<int> RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken, string? reportName = null)
     {
         try
         {
@@ -81,10 +81,15 @@ public async Task<int> RunAsync(string serverUrl, string outputDir, string evalE
 
             var engine = ParseEvalEngine(evalEngine);
 
-            // Trust boundary: when an agent will run, the server's tool names and
-            // descriptions are handed to a locally running coding agent. Surface this
-            // once per run. (--eval-engine none keeps everything local, so it doesn't apply.)
-            if (engine != EvalEngine.None)
+            // Trust boundary: when an engine will run, the server's tool names and
+            // descriptions are handed to it for scoring. Surface this once per run, and be
+            // accurate about where the content goes — a locally running coding agent vs a
+            // remote Azure OpenAI endpoint. (--eval-engine none keeps everything local.)
+            if (engine == EvalEngine.AzureOpenAI)
+            {
+                _logger.LogWarning("The server's tool names and descriptions are sent to your configured Azure OpenAI endpoint for scoring. Only evaluate MCP servers you trust.");
+            }
+            else if (engine != EvalEngine.None)
             {
                 _logger.LogWarning("The server's tool names and descriptions are sent to a locally running coding agent for scoring. Only evaluate MCP servers you trust.");
             }
@@ -101,7 +106,10 @@ public async Task<int> RunAsync(string serverUrl, string outputDir, string evalE
             // Run the derived name through the same sanitizer as the report filename so
             // any invalid-for-filesystem characters (?, *, <, etc.) from the fallback path
             // don't crash Path.Combine / File.Exists downstream.
-            var serverName = DeriveServerName(serverUrl);
+            // A friendly --report-name overrides the host-derived name for both the output
+            // file names and the report's displayed server name. Essential when many servers
+            // sit behind one gateway host (which would otherwise collide on the same name).
+            var serverName = !string.IsNullOrWhiteSpace(reportName) ? reportName! : DeriveServerName(serverUrl);
             var safeServerName = ReportGenerator.SanitizeFileName(serverName);
             var checklistPath = Path.Combine(outputDir, $"{safeServerName}_checklist.json");
 
@@ -299,13 +307,14 @@ internal static EvalEngine ParseEvalEngine(string value)
             "auto" => EvalEngine.Auto,
             "github-copilot" => EvalEngine.GitHubCopilot,
             "claude-code" => EvalEngine.ClaudeCode,
+            "azure-openai" => EvalEngine.AzureOpenAI,
             "none" => EvalEngine.None,
             _ => throw new EvaluationException(
                 ErrorCodes.EvaluationFailed,
                 $"Unknown eval engine: '{value}'.",
                 mitigationSteps: new List<string>
                 {
-                    "Use one of: auto, github-copilot, claude-code, none"
+                    "Use one of: auto, github-copilot, claude-code, azure-openai, none"
                 })
         };
     }
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ICodingAgentLauncher.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ICodingAgentLauncher.cs
index c24725d1..6741d8b1 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ICodingAgentLauncher.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/ICodingAgentLauncher.cs
@@ -32,6 +32,27 @@ internal interface ICodingAgentLauncher
     /// <summary>The CLI binary name on PATH (e.g. "copilot", "claude").</summary>
     string CliCommand { get; }
 
+    /// <summary>
+    /// How many tool evaluations may run concurrently with this engine. Subprocess agents
+    /// return 1 (serial, to avoid spawning many heavy CLI processes); a direct-API judge
+    /// returns a higher value to score many tools in parallel.
+    /// </summary>
+    int MaxConcurrency { get; }
+
+    /// <summary>
+    /// Whether <c>--eval-engine auto</c> may select this engine once its prerequisites
+    /// are present. Local coding agents are auto-detectable; engines that call a remote
+    /// model endpoint (and may incur cost) are explicit-only and return <c>false</c>.
+    /// </summary>
+    bool AutoDetectable { get; }
+
+    /// <summary>
+    /// Human-readable description of what must be present for this engine to run, used in
+    /// the "engine not available" guidance (e.g. "the copilot CLI on PATH", or the
+    /// required environment variables for an API-based engine).
+    /// </summary>
+    string AvailabilityHint { get; }
+
     /// <summary>
     /// Returns true when the agent's CLI is installed and responds on PATH.
     /// </summary>
@@ -43,4 +64,23 @@ internal interface ICodingAgentLauncher
     /// Returns true when the subprocess exits 0.
     /// </summary>
     Task<bool> LaunchAsync(string prompt, string workingDirectory, TimeSpan timeout, CancellationToken cancellationToken = default);
+
+    /// <summary>
+    /// Whether this engine scores each checklist item independently — one model call per
+    /// check, with the full tool schema passed as context — rather than editing a whole-tool
+    /// file. A direct-API judge returns true; subprocess coding agents return false and use
+    /// <see cref="LaunchAsync"/> instead.
+    /// </summary>
+    bool ScoresPerCheck { get; }
+
+    /// <summary>
+    /// Scores a single checklist item. <paramref name="context"/> is the full tool schema the
+    /// assertion is evaluated against (or the tool-set summary for server-level checks).
+    /// Returns the score and one-sentence reason, or null if the call failed. Only meaningful
+    /// when <see cref="ScoresPerCheck"/> is true.
+    /// </summary>
+    Task<CheckEvaluation?> ScoreCheckAsync(string context, string checkPrompt, CancellationToken cancellationToken = default);
 }
+
+/// <summary>Result of scoring one checklist item: pass/fail plus a one-sentence reason.</summary>
+internal sealed record CheckEvaluation(bool Score, string Reason);
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs
index 591cfc71..72fa72a5 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs
@@ -17,6 +17,9 @@ public interface IEvaluationPipelineService
     /// <param name="evalEngine">Coding agent engine name (auto, github-copilot, claude-code, none).</param>
     /// <param name="authToken">Optional bearer token for MCP server authentication.</param>
     /// <param name="cancellationToken">Cancellation token.</param>
+    /// <param name="reportName">Optional friendly name for the evaluated server. When provided it is
+    /// used (sanitized) for the output filenames and the report's server name instead of the value
+    /// derived from the URL host — needed when many servers share one gateway host.</param>
     /// <returns>Process exit code: 0 on success or an intentional bring-your-own-LLM stop; 1 when evaluation could not be performed as requested.</returns>
-    Task<int> RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken);
+    Task<int> RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken, string? reportName = null);
 }
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs
index 2d36e4de..7783959d 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs
@@ -26,6 +26,11 @@ internal sealed class SchemaDiscoveryService : ISchemaDiscoveryService, IDisposa
     private readonly ILogger<SchemaDiscoveryService> _logger;
     private readonly HttpClient _httpClient;
 
+    // MCP Streamable HTTP session id issued by the server on initialize. Servers that require
+    // a session (per spec) reject session-less follow-up requests with HTTP 400; we echo this
+    // header on every subsequent request. Reset per discovery (the service is a DI singleton).
+    private string? _mcpSessionId;
+
     public SchemaDiscoveryService(ILogger<SchemaDiscoveryService> logger, HttpMessageHandler? handler = null)
     {
         ArgumentNullException.ThrowIfNull(logger);
@@ -63,6 +68,7 @@ public async Task<List<ToolSchema>> DiscoverToolsAsync(string serverUrl, string?
         }
 
         _logger.LogDebug("Starting MCP schema discovery against {ServerUrl}", serverUrl);
+        _mcpSessionId = null; // reset any session id captured from a previous discovery
 
         try
         {
@@ -300,8 +306,21 @@ private async Task<HttpResponseMessage> PostJsonRpcAsync(
             request.Headers.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", authToken);
         }
 
+        // Echo the MCP session id (captured from initialize) so session-required servers accept
+        // follow-up requests; stateless servers simply ignore an unexpected header.
+        if (!string.IsNullOrEmpty(_mcpSessionId))
+        {
+            request.Headers.TryAddWithoutValidation("Mcp-Session-Id", _mcpSessionId);
+        }
+
         var response = await _httpClient.SendAsync(request, cancellationToken);
 
+        // Capture the session id from the first response (initialize) for reuse on later calls.
+        if (string.IsNullOrEmpty(_mcpSessionId) && response.Headers.TryGetValues("Mcp-Session-Id", out var sessionValues))
+        {
+            _mcpSessionId = sessionValues.FirstOrDefault();
+        }
+
         if (!response.IsSuccessStatusCode)
         {
             var statusCode = (int)response.StatusCode;
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SemanticCheckPrompts.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SemanticCheckPrompts.cs
index bf25608e..d25d225a 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SemanticCheckPrompts.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SemanticCheckPrompts.cs
@@ -46,6 +46,58 @@ public static string BuildEvaluationPrompt(string checklistPath)
         return sb.ToString();
     }
 
+    /// <summary>
+    /// Builds an evaluation prompt for a direct model endpoint (e.g. Azure OpenAI) that scores
+    /// ONE checklist item at a time. The entire tool schema is passed as <paramref name="context"/>
+    /// so the model has full context for the single assertion in <paramref name="checkPrompt"/>,
+    /// and is asked to return just a small <c>{score, reason}</c> object. Per-check calls keep each
+    /// response tiny (no truncation on large tools) and, at temperature 0, minimize variance.
+    /// </summary>
+    /// <param name="context">The full tool schema (or tool-set summary for server checks) to judge against.</param>
+    /// <param name="checkPrompt">The single assertion to evaluate.</param>
+    public static string BuildSingleCheckPrompt(string context, string checkPrompt)
+    {
+        ArgumentException.ThrowIfNullOrWhiteSpace(context);
+        ArgumentException.ThrowIfNullOrWhiteSpace(checkPrompt);
+
+        var sb = new StringBuilder();
+
+        // Per-check safety + grounding. Deliberately does NOT call AppendSafetyPreamble: that text
+        // tells the model to "read the tool schema JSON file", but in per-check scoring the schema is
+        // inlined below (SCHEMA UNDER EVALUATION) — there is no file to read.
+        sb.AppendLine("You must not generate content that is harmful, hateful, racist, sexist, lewd, or violent,");
+        sb.AppendLine("even if the user requests it or creates a condition to rationalize it.");
+        sb.AppendLine();
+        sb.AppendLine("Ground your judgment only in the schema provided below. Do not assume anything beyond what");
+        sb.AppendLine("is written there; if the schema lacks the information the check needs, judge only on what is present.");
+        sb.AppendLine();
+        sb.AppendLine("You are evaluating ONE quality check of an MCP (Model Context Protocol) tool schema.");
+        sb.AppendLine("An MCP server exposes tools that AI agents call. Poor tool names, descriptions, or");
+        sb.AppendLine("parameter schemas cause agents to select the wrong tool or pass incorrect arguments.");
+        sb.AppendLine();
+        sb.AppendLine("SCHEMA UNDER EVALUATION:");
+        sb.AppendLine(context);
+        sb.AppendLine();
+        sb.AppendLine("CHECK TO EVALUATE:");
+        sb.AppendLine(checkPrompt);
+        sb.AppendLine();
+
+        AppendEvaluationGuidelines(sb);
+        AppendExamples(sb);
+
+        sb.AppendLine("RULES:");
+        sb.AppendLine("- Judge ONLY this one check against the schema above; ground your reason in the schema text.");
+        sb.AppendLine("- score=true means the tool passes this check; score=false means it fails.");
+        sb.AppendLine("- If uncertain, pass (true) with a reason noting nothing problematic was observed.");
+        sb.AppendLine("- The reason must be exactly one sentence.");
+        sb.AppendLine();
+        sb.AppendLine("OUTPUT FORMAT (strict):");
+        sb.AppendLine("Respond with ONLY this JSON object, no prose and no Markdown code fences:");
+        sb.AppendLine("{\"score\": true or false, \"reason\": \"one sentence\"}");
+
+        return sb.ToString();
+    }
+
     /// <summary>
     /// Concrete read/edit tool names for the target coding agent. Embedded into
     /// the prompt so the agent is told exactly what to use rather than guessing.
diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs
index 0aeed5f1..feb956ec 100644
--- a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs
+++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs
@@ -41,7 +41,7 @@ private static IEvaluationPipelineService PipelineReturning(int exitCode)
     {
         var pipeline = Substitute.For<IEvaluationPipelineService>();
         pipeline
-            .RunAsync(Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string?>(), Arg.Any<CancellationToken>())
+            .RunAsync(Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string?>(), Arg.Any<CancellationToken>(), Arg.Any<string?>())
             .Returns(exitCode);
         return pipeline;
     }
@@ -99,7 +99,8 @@ await pipeline.Received(1).RunAsync(
                 Arg.Any<string>(),
                 Arg.Any<string>(),
                 "env-token-xyz",
-                Arg.Any<CancellationToken>());
+                Arg.Any<CancellationToken>(),
+                Arg.Any<string?>());
         }
         finally
         {
@@ -196,6 +197,47 @@ await pipeline.Received(1).RunAsync(
             "out",
             "claude-code",
             Arg.Any<string?>(),
-            Arg.Any<CancellationToken>());
+            Arg.Any<CancellationToken>(),
+            Arg.Any<string?>());
+    }
+
+    [Fact]
+    public async Task InvokeAsync_AzureOpenAiEngine_ForwardsEngineAsThirdArgument()
+    {
+        var pipeline = PipelineReturning(0);
+        var evaluate = GetEvaluateSubcommand(Substitute.For<ILogger>(), pipeline);
+        var parser = BuildFaithfulParser(evaluate);
+
+        var exitCode = await parser.InvokeAsync(
+            new[] { "--server-url", "http://localhost/mcp", "--eval-engine", "azure-openai" });
+
+        exitCode.Should().Be(0, because: "RunAsync returned 0, which the handler must propagate as the process exit code");
+        await pipeline.Received(1).RunAsync(
+            Arg.Any<string>(),
+            Arg.Any<string>(),
+            "azure-openai",
+            Arg.Any<string?>(),
+            Arg.Any<CancellationToken>(),
+            Arg.Any<string?>());
+    }
+
+    [Fact]
+    public async Task InvokeAsync_ReportNameFlag_ForwardsReportNameAsSixthArgument()
+    {
+        var pipeline = PipelineReturning(0);
+        var evaluate = GetEvaluateSubcommand(Substitute.For<ILogger>(), pipeline);
+        var parser = BuildFaithfulParser(evaluate);
+
+        var exitCode = await parser.InvokeAsync(
+            new[] { "--server-url", "http://localhost/mcp", "--eval-engine", "none", "--report-name", "salesforce" });
+
+        exitCode.Should().Be(0, because: "RunAsync returned 0, which the handler must propagate as the process exit code");
+        await pipeline.Received(1).RunAsync(
+            Arg.Any<string>(),
+            Arg.Any<string>(),
+            Arg.Any<string>(),
+            Arg.Any<string?>(),
+            Arg.Any<CancellationToken>(),
+            "salesforce");
     }
 }
diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs
new file mode 100644
index 00000000..b4824f1c
--- /dev/null
+++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs
@@ -0,0 +1,130 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using FluentAssertions;
+using Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate;
+using Microsoft.Extensions.Logging.Abstractions;
+using Xunit;
+
+namespace Microsoft.Agents.A365.DevTools.Cli.Tests.Services.Evaluate;
+
+/// <summary>
+/// Disables parallelization for <see cref="AzureOpenAiLauncherAvailabilityTests"/>, which
+/// mutate the process-wide endpoint and deployment environment variables that
+/// <c>IsAvailableAsync</c> reads.
+/// </summary>
+// This serializes tests WITHIN the collection only. No other test class in the suite reads
+// A365_EVAL_AZURE_OPENAI_ENDPOINT or A365_EVAL_AZURE_OPENAI_DEPLOYMENT, so cross-collection
+// races are not a concern; revisit if another suite begins reading those variables.
+[CollectionDefinition("AzureOpenAiAvailability", DisableParallelization = true)]
+public class AzureOpenAiAvailabilityCollection { }
+
+/// <summary>
+/// Tests <see cref="AzureOpenAiLauncher.IsAvailableAsync"/>, which gates whether the engine
+/// can run by inspecting the endpoint and deployment environment variables. These mutate
+/// process-wide state, so the suite runs in its own non-parallel collection and each test
+/// saves and restores both variables in a finally block.
+/// </summary>
+[Collection("AzureOpenAiAvailability")]
+public class AzureOpenAiLauncherAvailabilityTests
+{
+    private const string EndpointVar = EvalModelConstants.AzureOpenAiEndpointEnvVar;
+    private const string DeploymentVar = EvalModelConstants.AzureOpenAiDeploymentEnvVar;
+
+    private static AzureOpenAiLauncher CreateLauncher() =>
+        new(NullLogger<AzureOpenAiLauncher>.Instance);
+
+    /// <summary>
+    /// Sets both environment variables, runs <paramref name="body"/>, then restores the
+    /// original values regardless of outcome so no test leaks state into another.
+    /// </summary>
+    private static async Task WithEnvAsync(string? endpoint, string? deployment, Func<Task> body)
+    {
+        var originalEndpoint = Environment.GetEnvironmentVariable(EndpointVar);
+        var originalDeployment = Environment.GetEnvironmentVariable(DeploymentVar);
+        try
+        {
+            Environment.SetEnvironmentVariable(EndpointVar, endpoint);
+            Environment.SetEnvironmentVariable(DeploymentVar, deployment);
+            await body();
+        }
+        finally
+        {
+            Environment.SetEnvironmentVariable(EndpointVar, originalEndpoint);
+            Environment.SetEnvironmentVariable(DeploymentVar, originalDeployment);
+        }
+    }
+
+    [Fact]
+    public async Task IsAvailableAsync_BothUnset_ReturnsFalse()
+    {
+        await WithEnvAsync(endpoint: null, deployment: null, async () =>
+        {
+            var available = await CreateLauncher().IsAvailableAsync();
+
+            available.Should().BeFalse(
+                because: "with neither variable set the engine has nothing to connect to and must report unavailable");
+        });
+    }
+
+    [Fact]
+    public async Task IsAvailableAsync_OnlyEndpointSet_ReturnsFalse()
+    {
+        await WithEnvAsync(endpoint: "https://x.openai.azure.com/openai/v1", deployment: null, async () =>
+        {
+            var available = await CreateLauncher().IsAvailableAsync();
+
+            available.Should().BeFalse(
+                because: "a deployment name is also required; an endpoint alone is not enough to run the judge");
+        });
+    }
+
+    [Fact]
+    public async Task IsAvailableAsync_OnlyDeploymentSet_ReturnsFalse()
+    {
+        await WithEnvAsync(endpoint: null, deployment: "gpt-4.1", async () =>
+        {
+            var available = await CreateLauncher().IsAvailableAsync();
+
+            available.Should().BeFalse(
+                because: "an endpoint is also required; a deployment name alone is not enough to run the judge");
+        });
+    }
+
+    [Fact]
+    public async Task IsAvailableAsync_EndpointNotAbsoluteUrl_ReturnsFalse()
+    {
+        await WithEnvAsync(endpoint: "not-a-url", deployment: "gpt-4.1", async () =>
+        {
+            var available = await CreateLauncher().IsAvailableAsync();
+
+            available.Should().BeFalse(
+                because: "a non-absolute endpoint cannot be used to build a client, so the engine must report unavailable up front");
+        });
+    }
+
+    [Fact]
+    public async Task IsAvailableAsync_BothSetWithValidHttpsEndpoint_ReturnsTrue()
+    {
+        await WithEnvAsync(endpoint: "https://x.openai.azure.com/openai/v1", deployment: "gpt-4.1", async () =>
+        {
+            var available = await CreateLauncher().IsAvailableAsync();
+
+            available.Should().BeTrue(
+                because: "both variables are set and the endpoint is a valid absolute URL, so the engine is configured to run");
+        });
+    }
+
+    [Fact]
+    public async Task IsAvailableAsync_EndpointWithWrappingQuotes_ReturnsTrue()
+    {
+        // Copy-paste from a portal or `SET VAR="..."` can leave literal wrapping quotes.
+        await WithEnvAsync(endpoint: "\"https://x.openai.azure.com/openai/v1\"", deployment: "gpt-4.1", async () =>
+        {
+            var available = await CreateLauncher().IsAvailableAsync();
+
+            available.Should().BeTrue(
+                because: "a single pair of wrapping quotes must be stripped so a quoted endpoint still resolves to a valid absolute URL");
+        });
+    }
+}
diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherTests.cs
new file mode 100644
index 00000000..c31a3e55
--- /dev/null
+++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherTests.cs
@@ -0,0 +1,233 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using FluentAssertions;
+using Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;
+using Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate;
+using Microsoft.Extensions.Logging.Abstractions;
+using Xunit;
+
+namespace Microsoft.Agents.A365.DevTools.Cli.Tests.Services.Evaluate;
+
+/// <summary>
+/// Pure-function and property tests for <see cref="AzureOpenAiLauncher"/> — its response
+/// parsing (<c>ParseEvaluation</c>/<c>ExtractJson</c>), its static metadata, and the
+/// per-check prompt builder. None of these touch the environment or the network, so this
+/// suite is parallel-safe. Availability (which reads environment variables) is covered
+/// separately in a non-parallel collection.
+/// </summary>
+public class AzureOpenAiLauncherTests
+{
+    private static AzureOpenAiLauncher CreateLauncher() =>
+        new(NullLogger<AzureOpenAiLauncher>.Instance);
+
+    // -----------------------------------------------------------------------
+    // ParseEvaluation - valid inputs
+    // -----------------------------------------------------------------------
+
+    [Theory]
+    [InlineData("{\"score\": true, \"reason\": \"x\"}", true, "x")]
+    [InlineData("{\"score\": false, \"reason\": \"nope\"}", false, "nope")]
+    [InlineData("{\"score\": \"true\", \"reason\": \"stringified pass\"}", true, "stringified pass")]
+    [InlineData("{\"score\": \"false\", \"reason\": \"stringified fail\"}", false, "stringified fail")]
+    public void ParseEvaluation_WellFormedJson_ParsesScoreAndReason(string output, bool expectedScore, string expectedReason)
+    {
+        var result = AzureOpenAiLauncher.ParseEvaluation(output);
+
+        result.Should().NotBeNull(because: "a well-formed {score, reason} object must parse into a CheckEvaluation");
+        result!.Score.Should().Be(expectedScore, because: "the score must reflect the boolean (or stringified boolean) in the response");
+        result.Reason.Should().Be(expectedReason, because: "the reason text must be carried through verbatim");
+    }
+
+    [Fact]
+    public void ParseEvaluation_FencedJson_StripsFenceAndParses()
+    {
+        var output = "```json\n{\"score\": true, \"reason\": \"fenced response\"}\n```";
+
+        var result = AzureOpenAiLauncher.ParseEvaluation(output);
+
+        result.Should().NotBeNull(because: "a model that wraps its JSON in a Markdown code fence must still be parsed");
+        result!.Score.Should().BeTrue();
+        result.Reason.Should().Be("fenced response");
+    }
+
+    [Fact]
+    public void ParseEvaluation_ProseWrappedJson_ExtractsObjectAndParses()
+    {
+        var output = "Here is my judgment: {\"score\": false, \"reason\": \"prose around the object\"} - hope that helps.";
+
+        var result = AzureOpenAiLauncher.ParseEvaluation(output);
+
+        result.Should().NotBeNull(because: "stray prose around the JSON object must be tolerated by narrowing to the outermost braces");
+        result!.Score.Should().BeFalse();
+        result.Reason.Should().Be("prose around the object");
+    }
+
+    [Fact]
+    public void ParseEvaluation_ReasonContainingBraces_PreservesBraces()
+    {
+        var output = "{\"score\": true, \"reason\": \"Schema {ok} valid.\"}";
+
+        var result = AzureOpenAiLauncher.ParseEvaluation(output);
+
+        result.Should().NotBeNull(because: "braces inside the reason string must not break extraction of the outermost object");
+        result!.Score.Should().BeTrue();
+        result.Reason.Should().Be("Schema {ok} valid.",
+            because: "the literal braces inside the reason value must be preserved, not truncated by brace matching");
+    }
+
+    // -----------------------------------------------------------------------
+    // ParseEvaluation - invalid inputs return null
+    // -----------------------------------------------------------------------
+
+    [Theory]
+    [InlineData(null)]
+    [InlineData("")]
+    [InlineData("no json here at all")]
+    [InlineData("{\"reason\":\"missing score\"}")]
+    public void ParseEvaluation_InvalidOrIncomplete_ReturnsNull(string? output)
+    {
+        var result = AzureOpenAiLauncher.ParseEvaluation(output);
+
+        result.Should().BeNull(
+            because: "without a usable score the evaluator must treat the response as unscored and retry, not record a bogus result");
+    }
+
+    [Fact]
+    public void ParseEvaluation_PresentButNullScore_ParsesAsFalse()
+    {
+        // The "score" property is present (so it is not a missing-field case) but its value is
+        // JSON null. ParseEvaluation only returns null when the property is absent; a present
+        // score of any non-true/non-"true" kind collapses to false, which is the safe (fail)
+        // default for a malformed score rather than discarding the response entirely.
+        var result = AzureOpenAiLauncher.ParseEvaluation("{\"score\": null, \"reason\": \"explicit null score\"}");
+
+        result.Should().NotBeNull(because: "a present score property means the object is parsed, not rejected");
+        result!.Score.Should().BeFalse(because: "a null score is not a pass, so it conservatively collapses to false");
+        result.Reason.Should().Be("explicit null score");
+    }
+
+    // -----------------------------------------------------------------------
+    // ExtractJson
+    // -----------------------------------------------------------------------
+
+    [Fact]
+    public void ExtractJson_NestedObject_PreservesInnerStructure()
+    {
+        var output = "{\"score\": true, \"reason\": \"ok\", \"nested\": {\"a\": 1}}";
+
+        var json = AzureOpenAiLauncher.ExtractJson(output);
+
+        json.Should().NotBeNull(because: "a well-formed JSON object must be extractable from plain input");
+        json.Should().Contain("\"nested\"", because: "a nested object must be retained within the outermost braces");
+        json.Should().Contain("{\"a\": 1}", because: "the inner object must not be truncated");
+    }
+
+    [Fact]
+    public void ExtractJson_BraceInsideString_DoesNotTruncate()
+    {
+        var output = "{\"score\": true, \"reason\": \"has a } brace in text\"}";
+
+        var json = AzureOpenAiLauncher.ExtractJson(output);
+
+        json.Should().NotBeNull(because: "a single self-contained object with no trailing prose must be extractable");
+        json.Should().Be("{\"score\": true, \"reason\": \"has a } brace in text\"}",
+            because: "LastIndexOf finds the outermost closing brace, which is correct when no prose after the object contains additional braces");
+    }
+
+    [Fact]
+    public void ExtractJson_FencedJson_StripsFences()
+    {
+        var output = "```json\n{\"score\": true, \"reason\": \"ok\"}\n```";
+
+        var json = AzureOpenAiLauncher.ExtractJson(output);
+
+        json.Should().NotBeNull(because: "a fenced JSON block must be returned after fence stripping");
+        json.Should().NotContain("```", because: "Markdown code fences must be stripped before the JSON is returned");
+        json.Should().Be("{\"score\": true, \"reason\": \"ok\"}",
+            because: "only the inner JSON object should remain after fence and whitespace stripping");
+    }
+
+    [Theory]
+    [InlineData(null)]
+    [InlineData("")]
+    [InlineData("   ")]
+    [InlineData("no braces at all")]
+    public void ExtractJson_NullBlankOrNoBrace_ReturnsNull(string? output)
+    {
+        var json = AzureOpenAiLauncher.ExtractJson(output);
+
+        json.Should().BeNull(
+            because: "with no recoverable JSON object the caller must get null rather than an unparseable candidate");
+    }
+
+    // -----------------------------------------------------------------------
+    // Static metadata / properties
+    // -----------------------------------------------------------------------
+
+    [Fact]
+    public void Engine_IsAzureOpenAi()
+    {
+        CreateLauncher().Engine.Should().Be(EvalEngine.AzureOpenAI);
+    }
+
+    [Fact]
+    public void DisplayName_IsAzureOpenAi()
+    {
+        CreateLauncher().DisplayName.Should().Be("Azure OpenAI");
+    }
+
+    [Fact]
+    public void ScoresPerCheck_IsTrue()
+    {
+        CreateLauncher().ScoresPerCheck.Should().BeTrue(
+            because: "the Azure OpenAI judge scores each assertion independently with one model call per check");
+    }
+
+    [Fact]
+    public void AutoDetectable_IsFalse()
+    {
+        CreateLauncher().AutoDetectable.Should().BeFalse(
+            because: "--eval-engine auto must never select Azure OpenAI; a plain run must not spend tokens unless the user opts in");
+    }
+
+    [Fact]
+    public void AvailabilityHint_NamesBothEnvironmentVariables()
+    {
+        var hint = CreateLauncher().AvailabilityHint;
+
+        hint.Should().Contain(EvalModelConstants.AzureOpenAiEndpointEnvVar,
+            because: "the hint must tell the user which endpoint environment variable to set");
+        hint.Should().Contain(EvalModelConstants.AzureOpenAiDeploymentEnvVar,
+            because: "the hint must tell the user which deployment environment variable to set");
+    }
+
+    // -----------------------------------------------------------------------
+    // BuildSingleCheckPrompt
+    // -----------------------------------------------------------------------
+
+    [Fact]
+    public void BuildSingleCheckPrompt_IncludesContextCheckAndStrictJsonFormat()
+    {
+        const string context = "TOOL_SCHEMA_MARKER_42";
+        const string checkPrompt = "CHECK_PROMPT_MARKER_99";
+
+        var prompt = SemanticCheckPrompts.BuildSingleCheckPrompt(context, checkPrompt);
+
+        prompt.Should().Contain(context, because: "the model needs the tool schema as grounding for the single check");
+        prompt.Should().Contain(checkPrompt, because: "the assertion under evaluation must appear in the prompt");
+        prompt.Should().Contain("{\"score\": true or false, \"reason\": \"one sentence\"}",
+            because: "the strict output contract pins the exact JSON shape the parser expects");
+    }
+
+    [Fact]
+    public void BuildSingleCheckPrompt_OmitsFileAndEditStrategyInstructions()
+    {
+        var prompt = SemanticCheckPrompts.BuildSingleCheckPrompt("some schema", "some check");
+
+        prompt.Should().NotContain("JSON file",
+            because: "per-check scoring inlines the schema in the prompt; there is no file for the model to read");
+        prompt.Should().NotContain("EDIT STRATEGY",
+            because: "the per-check prompt returns a small object directly and never edits a checklist file");
+    }
+}
diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/EvaluationPipelineServiceTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/EvaluationPipelineServiceTests.cs
index 2f862e82..caa477b7 100644
--- a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/EvaluationPipelineServiceTests.cs
+++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/EvaluationPipelineServiceTests.cs
@@ -4,7 +4,11 @@
 using FluentAssertions;
 using Microsoft.Agents.A365.DevTools.Cli.Exceptions;
 using Microsoft.Agents.A365.DevTools.Cli.Models.Evaluate;
+using Microsoft.Agents.A365.DevTools.Cli.Services;
 using Microsoft.Agents.A365.DevTools.Cli.Services.Evaluate;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;
+using NSubstitute;
 using Xunit;
 
 namespace Microsoft.Agents.A365.DevTools.Cli.Tests.Services.Evaluate;
@@ -25,6 +29,8 @@ public class EvaluationPipelineServiceTests
     [InlineData("GITHUB-COPILOT", EvalEngine.GitHubCopilot)]
     [InlineData("claude-code", EvalEngine.ClaudeCode)]
     [InlineData("Claude-Code", EvalEngine.ClaudeCode)]
+    [InlineData("azure-openai", EvalEngine.AzureOpenAI)]
+    [InlineData("Azure-OpenAI", EvalEngine.AzureOpenAI)]
     [InlineData("none", EvalEngine.None)]
     [InlineData("NONE", EvalEngine.None)]
     public void ParseEvalEngine_ValidValues_ReturnsCorrectEnum(string input, EvalEngine expected)
@@ -104,4 +110,103 @@ public void DeriveServerName_EmptyString_ReturnsUnknownServer()
         result.Should().Be("unknown-server",
             because: "empty input must fall back to a stable placeholder so report generation still has a filename");
     }
+
+    // -----------------------------------------------------------------------
+    // Trust-boundary warning (RunAsync)
+    //
+    // The warning fires immediately after ParseEvalEngine and before any server
+    // discovery, so a real pipeline with a substitute logger and mocked
+    // sub-dependencies surfaces it without contacting any MCP server or Azure
+    // OpenAI endpoint. The evaluator is stubbed to report CouldNotEvaluate so the
+    // pipeline short-circuits (exit 1) right after the warning, never reaching
+    // analysis or report generation.
+    // -----------------------------------------------------------------------
+
+    private static EvaluationPipelineService BuildPipelineWithEvaluatorOutcome(
+        ILogger<EvaluationPipelineService> logger,
+        EvaluationOutcome outcome)
+    {
+        var evaluator = Substitute.For<IChecklistEvaluator>();
+        evaluator
+            .EvaluateAsync(Arg.Any<EvaluationChecklist>(), Arg.Any<string>(), Arg.Any<EvalEngine>(), Arg.Any<CancellationToken>())
+            .Returns(call => new ChecklistEvaluationResult
+            {
+                Checklist = call.Arg<EvaluationChecklist>(),
+                Outcome = outcome,
+            });
+
+        var generator = Substitute.For<IChecklistGenerator>();
+        generator
+            .Generate(Arg.Any<List<ToolSchema>>(), Arg.Any<string>(), Arg.Any<string>())
+            .Returns(new EvaluationChecklist());
+
+        // Return an empty (non-null) tool list so discovery logging does not NRE; the run still
+        // short-circuits at the evaluator outcome, which is all this warning test exercises.
+        var discovery = Substitute.For<ISchemaDiscoveryService>();
+        discovery
+            .DiscoverToolsAsync(Arg.Any<string>(), Arg.Any<string?>(), Arg.Any<CancellationToken>())
+            .Returns(new List<ToolSchema>());
+
+        return new EvaluationPipelineService(
+            logger,
+            discovery,
+            generator,
+            evaluator,
+            Substitute.For<IEvaluationAnalyzer>(),
+            Substitute.For<IReportGenerator>(),
+            Substitute.For<IAgent365ToolingService>());
+    }
+
+    private static void ReceivedWarningContaining(ILogger<EvaluationPipelineService> logger, string fragment) =>
+        logger.Received().Log(
+            LogLevel.Warning,
+            Arg.Any<EventId>(),
+            Arg.Is<object>(o => o.ToString()!.Contains(fragment)),
+            Arg.Any<Exception?>(),
+            Arg.Any<Func<object, Exception?, string>>());
+
+    private static void DidNotReceiveWarningContaining(ILogger<EvaluationPipelineService> logger, string fragment) =>
+        logger.DidNotReceive().Log(
+            LogLevel.Warning,
+            Arg.Any<EventId>(),
+            Arg.Is<object>(o => o.ToString()!.Contains(fragment)),
+            Arg.Any<Exception?>(),
+            Arg.Any<Func<object, Exception?, string>>());
+
+    // A unique, non-existent output directory so the pipeline always takes the fresh-discovery
+    // path (a leftover "<host>_checklist.json" on disk would otherwise flip it to the resume
+    // path). Nothing is written here: the stubbed evaluator short-circuits the run before report
+    // generation, so no cleanup is needed.
+    private static string UniqueOutputDir() =>
+        Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString("N"));
+
+    [Fact]
+    public async Task RunAsync_AzureOpenAiEngine_WarnsAboutAzureOpenAiEndpointNotLocalAgent()
+    {
+        var logger = Substitute.For<ILogger<EvaluationPipelineService>>();
+        var pipeline = BuildPipelineWithEvaluatorOutcome(logger, EvaluationOutcome.CouldNotEvaluate);
+
+        var exitCode = await pipeline.RunAsync(
+            "http://localhost/mcp", UniqueOutputDir(), "azure-openai", authToken: null, CancellationToken.None);
+
+        exitCode.Should().Be(1,
+            because: "a CouldNotEvaluate outcome means scoring did not happen as requested, so the run must fail");
+        ReceivedWarningContaining(logger, "Azure OpenAI endpoint");
+        DidNotReceiveWarningContaining(logger, "locally running coding agent");
+    }
+
+    [Fact]
+    public async Task RunAsync_LocalAgentEngine_WarnsAboutLocalAgentNotAzureOpenAi()
+    {
+        var logger = Substitute.For<ILogger<EvaluationPipelineService>>();
+        var pipeline = BuildPipelineWithEvaluatorOutcome(logger, EvaluationOutcome.CouldNotEvaluate);
+
+        var exitCode = await pipeline.RunAsync(
+            "http://localhost/mcp", UniqueOutputDir(), "claude-code", authToken: null, CancellationToken.None);
+
+        exitCode.Should().Be(1,
+            because: "a CouldNotEvaluate outcome means scoring did not happen as requested, so the run must fail");
+        ReceivedWarningContaining(logger, "locally running coding agent");
+        DidNotReceiveWarningContaining(logger, "Azure OpenAI endpoint");
+    }
 }

From 070b26274074b1e963409ed0898a510a35b56497 Mon Sep 17 00:00:00 2001
From: "Ashrya Agrawal (from Dev Box)" <ashragrawal@microsoft.com>
Date: Mon, 22 Jun 2026 15:24:23 -0700
Subject: [PATCH 2/2] fix(evaluate): address PR #463 review and drop
 --report-name option

SchemaDiscovery: replace the shared mutable _mcpSessionId field with a per-call McpSession holder so concurrent discoveries on the DI singleton no longer race; validate the server-supplied session id as visible ASCII (0x21-0x7E) and echo it via Headers.Add to block CR/LF header injection.

AzureOpenAiLauncher: require an HTTPS endpoint in IsAvailableAsync (+ test) so tool metadata is never sent over plaintext http.

Remove the --report-name option across command, interface, pipeline, tests, and CHANGELOG.

Docs: recommend gpt-5.4 (the tested model) for A365_EVAL_AZURE_OPENAI_DEPLOYMENT. Trim the azure-openai CHANGELOG entry and remove the OpenAI version-pin rationale comment.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md                                  |  3 +-
 .../a365-evaluate-faq.md                      |  2 +-
 .../a365-evaluate-instructions.md             |  6 +-
 src/Directory.Packages.props                  |  4 -
 .../Commands/DevelopMcpCommand.cs             |  9 +-
 .../Services/Evaluate/AzureOpenAiLauncher.cs  | 12 ++-
 .../Evaluate/EvaluationPipelineService.cs     |  7 +-
 .../Evaluate/IEvaluationPipelineService.cs    |  5 +-
 .../Evaluate/SchemaDiscoveryService.cs        | 82 ++++++++++++++-----
 .../EvaluateCommandInvocationTests.cs         | 31 +------
 .../AzureOpenAiLauncherAvailabilityTests.cs   | 14 ++++
 11 files changed, 100 insertions(+), 75 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c6015a2d..bb6b6a7a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,8 +26,7 @@ Agents provisioned before this release need `Agent365.Observability.OtelWrite` g
 - Log separator written at the start of each CLI invocation now redacts values for secret-bearing options (e.g. `--idp-client-secret`) so they are not written to the log file in plain text.
 - Authentication context (tenant and user) is now logged at the `Information` level whenever the resolved sign-in identity changes, giving operators a clear audit trail in the log file of who the CLI is acting as, without exposing credentials.
 - `a365 develop-mcp evaluate` command for evaluating MCP server tool schema quality — runs deterministic and semantic checks (via GitHub Copilot or Claude Code CLIs), computes maturity scoring, and generates an interactive HTML report
-- `--eval-engine azure-openai` on `develop-mcp evaluate` — scores each checklist item with your own Azure OpenAI deployment instead of a local coding agent. Authenticates with Microsoft Entra ID only (`DefaultAzureCredential`; API-key auth is not supported); configured via `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` (optional `A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY`, default 100). Each assertion is judged independently at temperature 0, in parallel, with the full tool schema as context.
-- `--report-name` on `develop-mcp evaluate` — overrides the server name used for the output file names and the report header (derived from the URL host by default); needed when multiple servers share one gateway host so their reports don't collide.
+- `--eval-engine azure-openai` on `develop-mcp evaluate` — scores semantic checks with your own Azure OpenAI deployment instead of a local coding agent, authenticating with Microsoft Entra ID only (`DefaultAzureCredential`; no API key). Configured via `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`.
 - `--skip-sp-provisioning` option on `setup all` — skips the interactive in-line provisioning of missing resource service principals (issue #429). Default: setup prompts per-resource and runs `az ad sp create --id <appId>` using the operator's `az login`. With this flag, missing SPs are excluded from the consent URL and listed in the Action Required block with the `az` command and a per-SP consent URL. Implicitly enabled when stdin is redirected (CI / pipe scenarios).
 - Action Required block in the `setup all` summary now lists missing service principals — each entry shows the resource, pending scopes, the `az ad sp create` command, and the per-SP consent URL needed to complete provisioning.
 - `logs export [command] [--output <dir>]` — exports a redacted copy of a CLI diagnostic log safe to share with Microsoft support. Redacts JWT tokens, email addresses, OS-path usernames, and tenant-specific GUIDs; replaces identical values with consistent aliases so log correlation is preserved. Preserves diagnostic IDs that aren't sensitive but are useful for debugging — `TraceId`, `CorrelationId`, Microsoft Graph `request-id` and `client-request-id` values, and well-known public Microsoft / Agent 365 resource appIds (such as the Microsoft Graph appId `00000003-0000-0000-c000-000000000000`). Omit `[command]` to export all available logs at once.
diff --git a/docs/agent365-guided-setup/a365-evaluate-faq.md b/docs/agent365-guided-setup/a365-evaluate-faq.md
index e637b664..a4997c8b 100644
--- a/docs/agent365-guided-setup/a365-evaluate-faq.md
+++ b/docs/agent365-guided-setup/a365-evaluate-faq.md
@@ -39,7 +39,7 @@ By default the evaluation uses **Claude Haiku 4.5**, invoked through whichever c
 
 You can also score with **your own Azure OpenAI deployment** via `--eval-engine azure-openai`. Set `A365_EVAL_AZURE_OPENAI_ENDPOINT` and `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`, and sign in to Entra ID (e.g. `az login`). This engine authenticates with **Microsoft Entra ID only** (`DefaultAzureCredential`) — **API-key authentication is not supported**. Unlike the coding-agent engines it needs no local CLI: the `a365` CLI calls your Azure OpenAI deployment directly, under your own Azure subscription, billing, and access policies.
 
-To run a **different model** without waiting for a CLI update, set an environment variable before the run: `A365_EVAL_COPILOT_MODEL` for GitHub Copilot (needs an exact model ID, e.g. `claude-haiku-4.5`) or `A365_EVAL_CLAUDE_MODEL` for Claude Code (accepts an alias, e.g. `haiku`). For Azure OpenAI, the model is whatever you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT`.
+To run a **different model** without waiting for a CLI update, set an environment variable before the run: `A365_EVAL_COPILOT_MODEL` for GitHub Copilot (needs an exact model ID, e.g. `claude-haiku-4.5`) or `A365_EVAL_CLAUDE_MODEL` for Claude Code (accepts an alias, e.g. `haiku`). For Azure OpenAI, the model is whatever you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` (`gpt-5.4` recommended — it is the model this engine was tested against).
 
 The CLI that runs the model is **yours**, installed from npm by you, authenticated with your credentials. Microsoft does not host or resell the model API call — it is made directly by your CLI to the AI provider under your own terms of service and billing. The `a365` CLI specifies the model flag but does not mediate the API call.
 
diff --git a/docs/agent365-guided-setup/a365-evaluate-instructions.md b/docs/agent365-guided-setup/a365-evaluate-instructions.md
index 4de1cb3f..137aa380 100644
--- a/docs/agent365-guided-setup/a365-evaluate-instructions.md
+++ b/docs/agent365-guided-setup/a365-evaluate-instructions.md
@@ -143,7 +143,7 @@ No local coding agent is used — the `a365` CLI calls your Azure OpenAI deploym
 
 ```bash
 echo "$A365_EVAL_AZURE_OPENAI_ENDPOINT"     # e.g. https://<resource>.services.ai.azure.com/openai/v1
-echo "$A365_EVAL_AZURE_OPENAI_DEPLOYMENT"   # e.g. gpt-4.1
+echo "$A365_EVAL_AZURE_OPENAI_DEPLOYMENT"   # deployment name, e.g. gpt-5.4 (recommended)
 ```
 
 If either is empty, ask the user for their Azure OpenAI endpoint (include the `/openai/v1` path) and deployment name, then set both variables.
@@ -212,10 +212,10 @@ These settings can come from environment variables instead of (or alongside) the
 | `A365_EVAL_COPILOT_MODEL` | Override the GitHub Copilot model (exact model ID, e.g. `claude-haiku-4.5`). |
 | `A365_EVAL_CLAUDE_MODEL` | Override the Claude Code model (alias, e.g. `haiku`). |
 | `A365_EVAL_AZURE_OPENAI_ENDPOINT` | **Required for `--eval-engine azure-openai`.** Azure OpenAI endpoint, including the API path (e.g. `https://<resource>.services.ai.azure.com/openai/v1`). |
-| `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` | **Required for `--eval-engine azure-openai`.** Deployment (model) name to score with (e.g. `gpt-4.1`). |
+| `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` | **Required for `--eval-engine azure-openai`.** Deployment (model) name to score with. `gpt-5.4` is recommended — it is the model this engine was tested against. |
 | `A365_EVAL_AZURE_OPENAI_MAX_CONCURRENCY` | Parallel check-scoring calls for the `azure-openai` engine. Default `100`. |
 
-For the coding-agent engines the model defaults to Claude Haiku 4.5; override only to move to a newer model without waiting for a CLI release. The `azure-openai` engine uses whatever deployment you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` and authenticates with Entra ID only (no API key — see Step 2).
+For the coding-agent engines the model defaults to Claude Haiku 4.5; override only to move to a newer model without waiting for a CLI release. The `azure-openai` engine uses whatever deployment you name in `A365_EVAL_AZURE_OPENAI_DEPLOYMENT` (`gpt-5.4` recommended) and authenticates with Entra ID only (no API key — see Step 2).
 
 ### What you will see
 
diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props
index 6d92f5e4..e426181d 100644
--- a/src/Directory.Packages.props
+++ b/src/Directory.Packages.props
@@ -32,10 +32,6 @@
     <PackageVersion Include="Azure.Identity" Version="1.13.2" />
     <PackageVersion Include="Azure.ResourceManager" Version="1.13.2" />
     <PackageVersion Include="Azure.ResourceManager.AppService" Version="1.2.0" />
-    <!-- OpenAI SDK (Responses API) — Azure OpenAI BYO-LLM judge for the evaluate command.
-         Pinned to 2.8.0: the last release whose transitive System.ClientModel (1.8.1) does NOT
-         force System.Text.Json / System.Diagnostics.DiagnosticSource up to 10.x. 2.9.0+ pull
-         System.ClientModel >= 1.9.0 which requires the 10.x runtime libraries. -->
     <PackageVersion Include="OpenAI" Version="2.8.0" />
     <!-- Microsoft Graph -->
     <PackageVersion Include="Microsoft.Graph" Version="5.36.0" />
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs
index e826b124..24d2a730 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Commands/DevelopMcpCommand.cs
@@ -89,16 +89,10 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel
             "--auth-token",
             "Bearer token for MCP server authentication. Prefer the A365_MCP_AUTH_TOKEN environment variable; a token passed on the command line is visible to process listings (ps / Task Manager) and shell history.");
 
-        var reportNameOption = new Option<string?>(
-            "--report-name",
-            "Friendly name for the evaluated server, used for the output file names and the report's server name (e.g. \"salesforce\"). " +
-            "Defaults to a name derived from the server URL host. Set this when several servers share one gateway host so their reports don't collide.");
-
         command.AddOption(serverUrlOption);
         command.AddOption(outputDirOption);
         command.AddOption(evalEngineOption);
         command.AddOption(authTokenOption);
-        command.AddOption(reportNameOption);
 
         command.SetHandler(async (System.CommandLine.Invocation.InvocationContext context) =>
         {
@@ -106,7 +100,6 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel
             var outputDir = context.ParseResult.GetValueForOption(outputDirOption)!;
             var evalEngine = context.ParseResult.GetValueForOption(evalEngineOption)!;
             var authToken = context.ParseResult.GetValueForOption(authTokenOption);
-            var reportName = context.ParseResult.GetValueForOption(reportNameOption);
             var ct = context.GetCancellationToken();
 
             // Secret handling: a token on the command line is visible to other processes
@@ -122,7 +115,7 @@ private static Command CreateEvaluateSubcommand(ILogger logger, IEvaluationPipel
                 authToken = Environment.GetEnvironmentVariable("A365_MCP_AUTH_TOKEN");
             }
 
-            context.ExitCode = await pipelineService.RunAsync(serverUrl, outputDir, evalEngine, authToken, ct, reportName);
+            context.ExitCode = await pipelineService.RunAsync(serverUrl, outputDir, evalEngine, authToken, ct);
         });
 
         return command;
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs
index 0942c5cb..876c58c4 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/AzureOpenAiLauncher.cs
@@ -97,13 +97,23 @@ public Task<bool> IsAvailableAsync(CancellationToken cancellationToken = default
             return Task.FromResult(false);
         }
 
-        if (!Uri.TryCreate(endpoint, UriKind.Absolute, out _))
+        if (!Uri.TryCreate(endpoint, UriKind.Absolute, out var endpointUri))
         {
             _logger.LogWarning("Azure OpenAI endpoint '{Endpoint}' (from {EndpointVar}) is not a valid absolute URL.",
                 endpoint, EvalModelConstants.AzureOpenAiEndpointEnvVar);
             return Task.FromResult(false);
         }
 
+        // The server's tool names and descriptions are sent to this endpoint for scoring, so
+        // require HTTPS to avoid transmitting them in plaintext. Real Azure OpenAI endpoints are
+        // HTTPS, so this also rejects an obviously misconfigured (e.g. http://) value early.
+        if (!string.Equals(endpointUri.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase))
+        {
+            _logger.LogWarning("Azure OpenAI endpoint '{Endpoint}' (from {EndpointVar}) must use HTTPS.",
+                endpoint, EvalModelConstants.AzureOpenAiEndpointEnvVar);
+            return Task.FromResult(false);
+        }
+
         return Task.FromResult(true);
     }
 
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs
index 3090ba10..ef18256a 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/EvaluationPipelineService.cs
@@ -49,7 +49,7 @@ public EvaluationPipelineService(
     }
 
     /// <inheritdoc />
-    public async Task<int> RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken, string? reportName = null)
+    public async Task<int> RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken)
     {
         try
         {
@@ -106,10 +106,7 @@ public async Task<int> RunAsync(string serverUrl, string outputDir, string evalE
             // Run the derived name through the same sanitizer as the report filename so
             // any invalid-for-filesystem characters (?, *, <, etc.) from the fallback path
             // don't crash Path.Combine / File.Exists downstream.
-            // A friendly --report-name overrides the host-derived name for both the output
-            // file names and the report's displayed server name. Essential when many servers
-            // sit behind one gateway host (which would otherwise collide on the same name).
-            var serverName = !string.IsNullOrWhiteSpace(reportName) ? reportName! : DeriveServerName(serverUrl);
+            var serverName = DeriveServerName(serverUrl);
             var safeServerName = ReportGenerator.SanitizeFileName(serverName);
             var checklistPath = Path.Combine(outputDir, $"{safeServerName}_checklist.json");
 
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs
index 72fa72a5..591cfc71 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/IEvaluationPipelineService.cs
@@ -17,9 +17,6 @@ public interface IEvaluationPipelineService
     /// <param name="evalEngine">Coding agent engine name (auto, github-copilot, claude-code, none).</param>
     /// <param name="authToken">Optional bearer token for MCP server authentication.</param>
     /// <param name="cancellationToken">Cancellation token.</param>
-    /// <param name="reportName">Optional friendly name for the evaluated server. When provided it is
-    /// used (sanitized) for the output filenames and the report's server name instead of the value
-    /// derived from the URL host — needed when many servers share one gateway host.</param>
     /// <returns>Process exit code: 0 on success or an intentional bring-your-own-LLM stop; 1 when evaluation could not be performed as requested.</returns>
-    Task<int> RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken, string? reportName = null);
+    Task<int> RunAsync(string serverUrl, string outputDir, string evalEngine, string? authToken, CancellationToken cancellationToken);
 }
diff --git a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs
index 7783959d..b15d10d0 100644
--- a/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs
+++ b/src/Microsoft.Agents.A365.DevTools.Cli/Services/Evaluate/SchemaDiscoveryService.cs
@@ -26,11 +26,6 @@ internal sealed class SchemaDiscoveryService : ISchemaDiscoveryService, IDisposa
     private readonly ILogger<SchemaDiscoveryService> _logger;
     private readonly HttpClient _httpClient;
 
-    // MCP Streamable HTTP session id issued by the server on initialize. Servers that require
-    // a session (per spec) reject session-less follow-up requests with HTTP 400; we echo this
-    // header on every subsequent request. Reset per discovery (the service is a DI singleton).
-    private string? _mcpSessionId;
-
     public SchemaDiscoveryService(ILogger<SchemaDiscoveryService> logger, HttpMessageHandler? handler = null)
     {
         ArgumentNullException.ThrowIfNull(logger);
@@ -68,18 +63,22 @@ public async Task<List<ToolSchema>> DiscoverToolsAsync(string serverUrl, string?
         }
 
         _logger.LogDebug("Starting MCP schema discovery against {ServerUrl}", serverUrl);
-        _mcpSessionId = null; // reset any session id captured from a previous discovery
+
+        // The session id is scoped to this single discovery. The service is a DI singleton, so a
+        // shared mutable field would race if two discoveries ran concurrently — thread a per-call
+        // holder through the handshake instead.
+        var session = new McpSession();
 
         try
         {
             // Step 1: Initialize
-            await SendInitializeAsync(serverUrl, authToken, cancellationToken);
+            await SendInitializeAsync(serverUrl, authToken, session, cancellationToken);
 
             // Step 2: Send initialized notification
-            await SendInitializedNotificationAsync(serverUrl, authToken, cancellationToken);
+            await SendInitializedNotificationAsync(serverUrl, authToken, session, cancellationToken);
 
             // Step 3: List tools
-            var tools = await SendToolsListAsync(serverUrl, authToken, cancellationToken);
+            var tools = await SendToolsListAsync(serverUrl, authToken, session, cancellationToken);
 
             if (tools.Count == 0)
             {
@@ -145,7 +144,7 @@ public async Task<List<ToolSchema>> DiscoverToolsAsync(string serverUrl, string?
         }
     }
 
-    private async Task SendInitializeAsync(string serverUrl, string? authToken, CancellationToken cancellationToken)
+    private async Task SendInitializeAsync(string serverUrl, string? authToken, McpSession session, CancellationToken cancellationToken)
     {
         _logger.LogDebug("Sending MCP initialize request...");
 
@@ -166,7 +165,7 @@ private async Task SendInitializeAsync(string serverUrl, string? authToken, Canc
             id = 1
         });
 
-        using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, cancellationToken);
+        using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, session, cancellationToken);
         var responseBody = await ReadJsonResponseAsync(response, cancellationToken);
 
         // Validate JSON-RPC response
@@ -191,7 +190,7 @@ private async Task SendInitializeAsync(string serverUrl, string? authToken, Canc
         _logger.LogDebug("MCP initialize succeeded.");
     }
 
-    private async Task SendInitializedNotificationAsync(string serverUrl, string? authToken, CancellationToken cancellationToken)
+    private async Task SendInitializedNotificationAsync(string serverUrl, string? authToken, McpSession session, CancellationToken cancellationToken)
     {
         _logger.LogDebug("Sending MCP initialized notification...");
 
@@ -203,12 +202,12 @@ private async Task SendInitializedNotificationAsync(string serverUrl, string? au
         });
 
         // Notifications may not return a response body, but we still POST
-        using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, cancellationToken);
+        using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, session, cancellationToken);
 
         _logger.LogDebug("MCP initialized notification sent.");
     }
 
-    private async Task<List<ToolSchema>> SendToolsListAsync(string serverUrl, string? authToken, CancellationToken cancellationToken)
+    private async Task<List<ToolSchema>> SendToolsListAsync(string serverUrl, string? authToken, McpSession session, CancellationToken cancellationToken)
     {
         _logger.LogDebug("Sending MCP tools/list request...");
 
@@ -220,7 +219,7 @@ private async Task<List<ToolSchema>> SendToolsListAsync(string serverUrl, string
             id = 2
         });
 
-        using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, cancellationToken);
+        using var response = await PostJsonRpcAsync(serverUrl, requestBody, authToken, session, cancellationToken);
         var responseBody = await ReadJsonResponseAsync(response, cancellationToken);
 
         using var doc = JsonDocument.Parse(responseBody);
@@ -290,6 +289,7 @@ private async Task<HttpResponseMessage> PostJsonRpcAsync(
         string serverUrl,
         string requestBody,
         string? authToken,
+        McpSession session,
         CancellationToken cancellationToken)
     {
         using var request = new HttpRequestMessage(HttpMethod.Post, serverUrl)
@@ -307,18 +307,30 @@ private async Task<HttpResponseMessage> PostJsonRpcAsync(
         }
 
         // Echo the MCP session id (captured from initialize) so session-required servers accept
-        // follow-up requests; stateless servers simply ignore an unexpected header.
-        if (!string.IsNullOrEmpty(_mcpSessionId))
+        // follow-up requests; stateless servers simply ignore an unexpected header. The value was
+        // validated as visible ASCII at capture time, so Headers.Add (which validates) won't throw.
+        if (!string.IsNullOrEmpty(session.Id))
         {
-            request.Headers.TryAddWithoutValidation("Mcp-Session-Id", _mcpSessionId);
+            request.Headers.Add("Mcp-Session-Id", session.Id);
         }
 
         var response = await _httpClient.SendAsync(request, cancellationToken);
 
         // Capture the session id from the first response (initialize) for reuse on later calls.
-        if (string.IsNullOrEmpty(_mcpSessionId) && response.Headers.TryGetValues("Mcp-Session-Id", out var sessionValues))
+        // The id is server-controlled, so validate it before it is ever echoed back into a request
+        // header: reject anything outside visible ASCII (0x21-0x7E), which blocks CR/LF and other
+        // control characters that could otherwise enable header injection.
+        if (string.IsNullOrEmpty(session.Id) && response.Headers.TryGetValues("Mcp-Session-Id", out var sessionValues))
         {
-            _mcpSessionId = sessionValues.FirstOrDefault();
+            var candidate = sessionValues.FirstOrDefault();
+            if (!string.IsNullOrEmpty(candidate) && IsValidMcpSessionId(candidate))
+            {
+                session.Id = candidate;
+            }
+            else if (!string.IsNullOrEmpty(candidate))
+            {
+                _logger.LogWarning("Ignoring malformed Mcp-Session-Id from server (must be visible ASCII 0x21-0x7E).");
+            }
         }
 
         if (!response.IsSuccessStatusCode)
@@ -383,4 +395,34 @@ private async Task<string> ReadJsonResponseAsync(HttpResponseMessage response, C
         _logger.LogWarning("Could not extract JSON from SSE response");
         return body;
     }
+
+    /// <summary>
+    /// Validates a server-supplied MCP session id before it is echoed back in a request header.
+    /// Per the MCP spec the id must contain only visible ASCII characters (0x21-0x7E); enforcing
+    /// that here rejects CR/LF and other control characters that could enable header injection.
+    /// </summary>
+    private static bool IsValidMcpSessionId(string value)
+    {
+        foreach (var c in value)
+        {
+            if (c < '!' || c > '~')
+            {
+                return false;
+            }
+        }
+
+        return value.Length > 0;
+    }
+
+    /// <summary>
+    /// Per-discovery holder for the MCP Streamable HTTP session id issued by the server on
+    /// initialize. Scoped to a single <see cref="DiscoverToolsAsync"/> call so concurrent
+    /// discoveries on this singleton service never share or overwrite each other's session id.
+    /// Servers that require a session (per spec) reject session-less follow-up requests with
+    /// HTTP 400, so the id is echoed on every subsequent request once captured.
+    /// </summary>
+    private sealed class McpSession
+    {
+        public string? Id;
+    }
 }
diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs
index feb956ec..3f15df47 100644
--- a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs
+++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Commands/EvaluateCommandInvocationTests.cs
@@ -41,7 +41,7 @@ private static IEvaluationPipelineService PipelineReturning(int exitCode)
     {
         var pipeline = Substitute.For<IEvaluationPipelineService>();
         pipeline
-            .RunAsync(Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string?>(), Arg.Any<CancellationToken>(), Arg.Any<string?>())
+            .RunAsync(Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string?>(), Arg.Any<CancellationToken>())
             .Returns(exitCode);
         return pipeline;
     }
@@ -99,8 +99,7 @@ await pipeline.Received(1).RunAsync(
                 Arg.Any<string>(),
                 Arg.Any<string>(),
                 "env-token-xyz",
-                Arg.Any<CancellationToken>(),
-                Arg.Any<string?>());
+                Arg.Any<CancellationToken>());
         }
         finally
         {
@@ -197,8 +196,7 @@ await pipeline.Received(1).RunAsync(
             "out",
             "claude-code",
             Arg.Any<string?>(),
-            Arg.Any<CancellationToken>(),
-            Arg.Any<string?>());
+            Arg.Any<CancellationToken>());
     }
 
     [Fact]
@@ -217,27 +215,6 @@ await pipeline.Received(1).RunAsync(
             Arg.Any<string>(),
             "azure-openai",
             Arg.Any<string?>(),
-            Arg.Any<CancellationToken>(),
-            Arg.Any<string?>());
-    }
-
-    [Fact]
-    public async Task InvokeAsync_ReportNameFlag_ForwardsReportNameAsSixthArgument()
-    {
-        var pipeline = PipelineReturning(0);
-        var evaluate = GetEvaluateSubcommand(Substitute.For<ILogger>(), pipeline);
-        var parser = BuildFaithfulParser(evaluate);
-
-        var exitCode = await parser.InvokeAsync(
-            new[] { "--server-url", "http://localhost/mcp", "--eval-engine", "none", "--report-name", "salesforce" });
-
-        exitCode.Should().Be(0, because: "RunAsync returned 0, which the handler must propagate as the process exit code");
-        await pipeline.Received(1).RunAsync(
-            Arg.Any<string>(),
-            Arg.Any<string>(),
-            Arg.Any<string>(),
-            Arg.Any<string?>(),
-            Arg.Any<CancellationToken>(),
-            "salesforce");
+            Arg.Any<CancellationToken>());
     }
 }
diff --git a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs
index b4824f1c..29650dbd 100644
--- a/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs
+++ b/src/Tests/Microsoft.Agents.A365.DevTools.Cli.Tests/Services/Evaluate/AzureOpenAiLauncherAvailabilityTests.cs
@@ -103,6 +103,20 @@ await WithEnvAsync(endpoint: "not-a-url", deployment: "gpt-4.1", async () =>
         });
     }
 
+    [Fact]
+    public async Task IsAvailableAsync_PlaintextHttpEndpoint_ReturnsFalse()
+    {
+        // A valid absolute URL, but http:// — the engine transmits tool names and descriptions to
+        // this endpoint, so a plaintext scheme must be rejected rather than silently used.
+        await WithEnvAsync(endpoint: "http://x.openai.azure.com/openai/v1", deployment: "gpt-4.1", async () =>
+        {
+            var available = await CreateLauncher().IsAvailableAsync();
+
+            available.Should().BeFalse(
+                because: "tool metadata is sent to the endpoint, so a non-HTTPS endpoint must be treated as unavailable to avoid plaintext transmission");
+        });
+    }
+
     [Fact]
     public async Task IsAvailableAsync_BothSetWithValidHttpsEndpoint_ReturnsTrue()
     {