Skip to content

Commit e6fdded

Browse files
rlundeen2 and Copilot authored
FEAT: Updating Scorer Metrics Update Workflow and SelfAskRefusalScorer update (#1549)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 1ccde34 commit e6fdded

38 files changed

Lines changed: 3550 additions & 1549 deletions

.env_example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ AZURE_OPENAI_GPT4_CHAT_KEY="xxxxx"
5656
AZURE_OPENAI_GPT4_CHAT_MODEL="deployment-name"
5757
AZURE_OPENAI_GPT4_CHAT_UNDERLYING_MODEL=""
5858

59+
AZURE_OPENAI_GPT5_4_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"
60+
AZURE_OPENAI_GPT5_4_KEY="xxxxx"
61+
AZURE_OPENAI_GPT5_4_MODEL="gpt-5.4"
62+
AZURE_OPENAI_GPT5_4_UNDERLYING_MODEL="gpt-5.4"
63+
5964
# Endpoints that host models with fewer safety mechanisms (e.g. via adversarial fine tuning
6065
# or content filters turned off) can be defined below and used in adversarial attack testing scenarios.
6166
AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"

build_scripts/evaluate_scorers.py

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,44 +9,69 @@
99
1010
Usage:
1111
python build_scripts/evaluate_scorers.py
12+
python build_scripts/evaluate_scorers.py --tags refusal
13+
python build_scripts/evaluate_scorers.py --tags refusal,default
14+
python build_scripts/evaluate_scorers.py --max-concurrency 3
1215
"""
1316

17+
import argparse
1418
import asyncio
1519
import sys
1620
import time
1721

1822
from tqdm import tqdm
1923

2024
from pyrit.common.path import SCORER_EVALS_PATH
25+
from pyrit.exceptions.exception_context import ComponentRole, execution_context
2126
from pyrit.registry import ScorerRegistry
2227
from pyrit.setup import IN_MEMORY, initialize_pyrit_async
2328
from pyrit.setup.initializers import ScorerInitializer, TargetInitializer
2429

2530

26-
async def evaluate_scorers() -> None:
31+
async def evaluate_scorers(tags: list[str] | None = None, max_concurrency: int = 5) -> None:
2732
"""
2833
Evaluate multiple scorers against their configured datasets.
2934
3035
This will:
3136
1. Initialize PyRIT with in-memory database
3237
2. Register all scorers from ScorerInitializer into the ScorerRegistry
33-
3. Iterate through all registered scorers
38+
3. Iterate through registered scorers (optionally filtered by tags)
3439
4. Run evaluate_async() on each scorer
3540
5. Save results to scorer_evals directory
41+
42+
Args:
43+
tags: Optional list of tags to filter which scorers to evaluate.
44+
When provided, only scorers matching any of the tags are evaluated.
45+
When None, all scorers are evaluated.
46+
max_concurrency: Maximum number of concurrent scoring requests per scorer.
47+
Defaults to 5.
3648
"""
3749
print("Initializing PyRIT...")
3850
target_init = TargetInitializer()
3951
target_init.params = {"tags": ["default", "scorer"]}
52+
scorer_init = ScorerInitializer()
4053
await initialize_pyrit_async(
4154
memory_db_type=IN_MEMORY,
42-
initializers=[target_init, ScorerInitializer()],
55+
initializers=[target_init, scorer_init],
4356
)
4457

4558
registry = ScorerRegistry.get_registry_singleton()
46-
scorer_names = registry.get_names()
59+
60+
# Filter scorers by tags if specified
61+
if tags:
62+
scorer_names: list[str] = []
63+
for tag in tags:
64+
entries = registry.get_by_tag(tag=tag)
65+
scorer_names.extend(entry.name for entry in entries if entry.name not in scorer_names)
66+
scorer_names.sort()
67+
print(f"\nFiltering by tags: {tags}")
68+
else:
69+
scorer_names = registry.get_names()
4770

4871
if not scorer_names:
4972
print("No scorers registered. Check environment variable configuration.")
73+
if tags:
74+
print(f" (filtered by tags: {tags})")
5075
return
5176

5277
print(f"\nEvaluating {len(scorer_names)} scorer(s)...\n")
@@ -68,10 +93,14 @@ async def evaluate_scorers() -> None:
6893

6994
try:
7095
print(" Status: Running evaluations...")
71-
results = await scorer.evaluate_async(
72-
num_scorer_trials=3,
73-
max_concurrency=10,
74-
)
96+
with execution_context(
97+
component_role=ComponentRole.OBJECTIVE_SCORER,
98+
component_identifier=scorer.get_identifier(),
99+
):
100+
results = await scorer.evaluate_async(
101+
num_scorer_trials=3,
102+
max_concurrency=max_concurrency,
103+
)
75104

76105
elapsed_time = time.time() - start_time
77106

@@ -95,21 +124,48 @@ async def evaluate_scorers() -> None:
95124
print("=" * 60)
96125

97126

127+
def parse_args() -> argparse.Namespace:
128+
"""Parse command-line arguments."""
129+
parser = argparse.ArgumentParser(
130+
description="Evaluate PyRIT scorers against human-labeled datasets.",
131+
)
132+
parser.add_argument(
133+
"--tags",
134+
type=str,
135+
default=None,
136+
help="Comma-separated list of tags to filter which scorers to evaluate (e.g., --tags refusal,default)",
137+
)
138+
parser.add_argument(
139+
"--max-concurrency",
140+
type=int,
141+
default=5,
142+
help="Maximum number of concurrent scoring requests per scorer (default: 5)",
143+
)
144+
return parser.parse_args()
145+
146+
98147
if __name__ == "__main__":
148+
args = parse_args()
149+
tag_list = [t.strip() for t in args.tags.split(",")] if args.tags else None
150+
max_concurrency = args.max_concurrency
151+
99152
print("=" * 60)
100153
print("PyRIT Scorer Evaluation Script")
101154
print("=" * 60)
102155
print("This script will evaluate multiple scorers against human-labeled")
103156
print("datasets. This is a long-running process that may take several")
104157
print("minutes to hours depending on the number of scorers and datasets.")
105158
print()
159+
if tag_list:
160+
print(f"Filtering by tags: {tag_list}")
161+
print(f"Max concurrency: {max_concurrency}")
106162
print("Results will be saved to the registry files in:")
107163
print(f" {SCORER_EVALS_PATH}")
108164
print("=" * 60)
109165
print()
110166

111167
try:
112-
asyncio.run(evaluate_scorers())
168+
asyncio.run(evaluate_scorers(tags=tag_list, max_concurrency=max_concurrency))
113169
except KeyboardInterrupt:
114170
print("\n\nEvaluation interrupted by user.")
115171
sys.exit(1)

doc/code/scoring/8_scorer_metrics.ipynb

Lines changed: 114 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@
278278
"name": "stdout",
279279
"output_type": "stream",
280280
"text": [
281-
"Found 11 scorer configurations in the registry\n",
281+
"Found 10 scorer configurations in the metrics file\n",
282282
"\n",
283283
"Top 5 configurations by F1 Score:\n",
284284
"--------------------------------------------------------------------------------\n",
@@ -295,12 +295,12 @@
295295
"\u001b[36m • model_name: gpt-4o\u001b[0m\n",
296296
"\n",
297297
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
298-
"\u001b[36m • Accuracy: 84.84%\u001b[0m\n",
299-
"\u001b[36m • Accuracy Std Error: ±0.0185\u001b[0m\n",
300-
"\u001b[36m • F1 Score: 0.8606\u001b[0m\n",
301-
"\u001b[36m • Precision: 0.7928\u001b[0m\n",
302-
"\u001b[32m • Recall: 0.9412\u001b[0m\n",
303-
"\u001b[36m • Average Score Time: 1.27s\u001b[0m\n",
298+
"\u001b[36m • Accuracy: 83.29%\u001b[0m\n",
299+
"\u001b[36m • Accuracy Std Error: ±0.0188\u001b[0m\n",
300+
"\u001b[36m • F1 Score: 0.8472\u001b[0m\n",
301+
"\u001b[36m • Precision: 0.7593\u001b[0m\n",
302+
"\u001b[32m • Recall: 0.9581\u001b[0m\n",
303+
"\u001b[36m • Average Score Time: 0.80s\u001b[0m\n",
304304
"\n",
305305
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
306306
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
@@ -314,12 +314,12 @@
314314
"\u001b[36m • model_name: gpt-4o-unsafe\u001b[0m\n",
315315
"\n",
316316
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
317-
"\u001b[36m • Accuracy: 79.26%\u001b[0m\n",
318-
"\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n",
319-
"\u001b[36m • F1 Score: 0.8259\u001b[0m\n",
320-
"\u001b[36m • Precision: 0.7088\u001b[0m\n",
321-
"\u001b[32m • Recall: 0.9893\u001b[0m\n",
322-
"\u001b[36m • Average Score Time: 1.52s\u001b[0m\n",
317+
"\u001b[36m • Accuracy: 79.24%\u001b[0m\n",
318+
"\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n",
319+
"\u001b[36m • F1 Score: 0.8210\u001b[0m\n",
320+
"\u001b[36m • Precision: 0.7041\u001b[0m\n",
321+
"\u001b[32m • Recall: 0.9843\u001b[0m\n",
322+
"\u001b[36m • Average Score Time: 0.99s\u001b[0m\n",
323323
"\n",
324324
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
325325
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
@@ -334,12 +334,12 @@
334334
"\u001b[36m • temperature: 0.9\u001b[0m\n",
335335
"\n",
336336
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
337-
"\u001b[36m • Accuracy: 78.46%\u001b[0m\n",
338-
"\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n",
339-
"\u001b[36m • F1 Score: 0.8204\u001b[0m\n",
340-
"\u001b[36m • Precision: 0.7008\u001b[0m\n",
341-
"\u001b[32m • Recall: 0.9893\u001b[0m\n",
342-
"\u001b[36m • Average Score Time: 1.77s\u001b[0m\n",
337+
"\u001b[36m • Accuracy: 77.72%\u001b[0m\n",
338+
"\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n",
339+
"\u001b[36m • F1 Score: 0.8095\u001b[0m\n",
340+
"\u001b[31m • Precision: 0.6900\u001b[0m\n",
341+
"\u001b[32m • Recall: 0.9791\u001b[0m\n",
342+
"\u001b[36m • Average Score Time: 1.36s\u001b[0m\n",
343343
"\n",
344344
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
345345
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
@@ -350,12 +350,12 @@
350350
"\u001b[36m • temperature: 0.9\u001b[0m\n",
351351
"\n",
352352
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
353-
"\u001b[36m • Accuracy: 78.46%\u001b[0m\n",
354-
"\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n",
355-
"\u001b[36m • F1 Score: 0.7582\u001b[0m\n",
356-
"\u001b[36m • Precision: 0.8581\u001b[0m\n",
357-
"\u001b[31m • Recall: 0.6791\u001b[0m\n",
358-
"\u001b[36m • Average Score Time: 2.39s\u001b[0m\n",
353+
"\u001b[36m • Accuracy: 81.27%\u001b[0m\n",
354+
"\u001b[36m • Accuracy Std Error: ±0.0196\u001b[0m\n",
355+
"\u001b[36m • F1 Score: 0.7836\u001b[0m\n",
356+
"\u001b[36m • Precision: 0.8874\u001b[0m\n",
357+
"\u001b[36m • Recall: 0.7016\u001b[0m\n",
358+
"\u001b[36m • Average Score Time: 2.01s\u001b[0m\n",
359359
"\n",
360360
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
361361
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
@@ -366,19 +366,19 @@
366366
"\u001b[36m • temperature: 0.9\u001b[0m\n",
367367
"\n",
368368
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
369-
"\u001b[36m • Accuracy: 73.40%\u001b[0m\n",
370-
"\u001b[36m • Accuracy Std Error: ±0.0228\u001b[0m\n",
371-
"\u001b[31m • F1 Score: 0.6732\u001b[0m\n",
372-
"\u001b[36m • Precision: 0.8655\u001b[0m\n",
373-
"\u001b[31m • Recall: 0.5508\u001b[0m\n",
374-
"\u001b[36m • Average Score Time: 2.23s\u001b[0m\n",
369+
"\u001b[36m • Accuracy: 74.18%\u001b[0m\n",
370+
"\u001b[36m • Accuracy Std Error: ±0.0220\u001b[0m\n",
371+
"\u001b[31m • F1 Score: 0.6731\u001b[0m\n",
372+
"\u001b[36m • Precision: 0.8678\u001b[0m\n",
373+
"\u001b[31m • Recall: 0.5497\u001b[0m\n",
374+
"\u001b[36m • Average Score Time: 1.83s\u001b[0m\n",
375375
"\n",
376376
"================================================================================\n",
377-
"Best Accuracy: 84.84%\n",
377+
"Best Accuracy: 83.29%\n",
378378
"Best Precision: 0.989\n",
379-
"Best Recall: 0.989\n",
380-
"Fastest: 0.129 seconds\n",
381-
"Slowest: 3.520 seconds\n"
379+
"Best Recall: 0.984\n",
380+
"Fastest: 0.134 seconds\n",
381+
"Slowest: 2.390 seconds\n"
382382
]
383383
}
384384
],
@@ -388,7 +388,7 @@
388388
"# Load all objective scorer metrics - returns ScorerMetricsWithIdentity[ObjectiveScorerMetrics]\n",
389389
"all_scorers = get_all_objective_metrics()\n",
390390
"\n",
391-
"print(f\"Found {len(all_scorers)} scorer configurations in the registry\\n\")\n",
391+
"print(f\"Found {len(all_scorers)} scorer configurations in the metrics file\\n\")\n",
392392
"\n",
393393
"# Sort by F1 score - type checker knows entry.metrics is ObjectiveScorerMetrics\n",
394394
"sorted_by_f1 = sorted(all_scorers, key=lambda x: x.metrics.f1_score, reverse=True)\n",
@@ -650,6 +650,84 @@
650650
" - For harm scorers: 0.0-1.0 float values\n",
651651
"- `data_type`: Type of content (defaults to \"text\")"
652652
]
653+
},
654+
{
655+
"cell_type": "markdown",
656+
"id": "18",
657+
"metadata": {},
658+
"source": [
659+
"## Batch Evaluation with `evaluate_scorers.py`\n",
660+
"\n",
661+
"While `evaluate_async()` runs evaluations for a single scorer, the `evaluate_scorers.py` script\n",
662+
"evaluates **all registered scorers** in bulk. This is useful for benchmarking after changing scorer\n",
663+
"prompts, adding new variants, or updating human-labeled datasets.\n",
664+
"\n",
665+
"The script initializes PyRIT with `ScorerInitializer` (which registers all configured scorers),\n",
666+
"then runs `evaluate_async()` on each one. Results are saved to the JSONL registry files in\n",
667+
"`pyrit/datasets/scorer_evals/`.\n",
668+
"\n",
669+
"### Basic Usage\n",
670+
"\n",
671+
"```bash\n",
672+
"# Evaluate all registered scorers (long-running — can take hours)\n",
673+
"python build_scripts/evaluate_scorers.py\n",
674+
"\n",
675+
"# Evaluate only scorers with specific tags\n",
676+
"python build_scripts/evaluate_scorers.py --tags refusal\n",
677+
"python build_scripts/evaluate_scorers.py --tags refusal,default\n",
678+
"\n",
679+
"# Control parallelism (default: 5, lower if hitting rate limits)\n",
680+
"python build_scripts/evaluate_scorers.py --max-concurrency 3\n",
681+
"```\n",
682+
"\n",
683+
"### Tags\n",
684+
"\n",
685+
"`ScorerInitializer` applies tags to scorers during registration. These tags let you target\n",
686+
"specific subsets for evaluation:\n",
687+
"\n",
688+
"- `refusal` — The 4 standalone refusal scorer variants\n",
689+
"- `default` — All scorers registered by default\n",
690+
"- `best_refusal_f1` — The refusal variant with the highest F1 (set dynamically from metrics)\n",
691+
"- `best_objective_f1` — The objective scorer with the highest F1\n",
692+
"\n",
693+
"### Recommended Workflow: Refusal → Dependent Scorers\n",
694+
"\n",
695+
"When refusal scorer prompts or datasets change, the recommended workflow is:\n",
696+
"\n",
697+
"**Step 1: Evaluate refusal scorers first**\n",
698+
"\n",
699+
"```bash\n",
700+
"python build_scripts/evaluate_scorers.py --tags refusal\n",
701+
"```\n",
702+
"\n",
703+
"This evaluates only the 4 refusal variants and writes results to\n",
704+
"`refusal_scorer/refusal_metrics.jsonl`. After this step, `ScorerInitializer` can determine which\n",
705+
"refusal variant has the best F1 and tag it as `best_refusal_f1`.\n",
706+
"\n",
707+
"**Step 2: Re-evaluate all scorers**\n",
708+
"\n",
709+
"```bash\n",
710+
"python build_scripts/evaluate_scorers.py\n",
711+
"```\n",
712+
"\n",
713+
"On the next full run, `ScorerInitializer` reads the refusal metrics from Step 1, picks the best\n",
714+
"refusal variant, and uses it to build dependent scorers (e.g., `TrueFalseInverterScorer` wrapping\n",
715+
"the best refusal scorer). This ensures objective scorers that depend on refusal detection use the\n",
716+
"best-performing refusal prompt.\n",
717+
"\n",
718+
"Scorers whose metrics are already up-to-date (same dataset version, sufficient trials) are\n",
719+
"automatically skipped, so re-running the full script is efficient.\n",
720+
"\n",
721+
"**Step 3: Commit updated metrics**\n",
722+
"\n",
723+
"```bash\n",
724+
"git add pyrit/datasets/scorer_evals/\n",
725+
"git commit -m \"chore: update scorer metrics\"\n",
726+
"```\n",
727+
"\n",
728+
"The updated JSONL files should be checked in so that `ScorerInitializer` can read them at runtime\n",
729+
"to select the best scorers."
730+
]
653731
}
654732
],
655733
"metadata": {

0 commit comments

Comments (0)