Skip to content

Commit e6fdded

Browse files
rlundeen2 and Copilot authored
FEAT: Updating Scorer Metrics Update Workflow and SelfAskRefusalScorer update (#1549)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 1ccde34 commit e6fdded

38 files changed

Lines changed: 3550 additions & 1549 deletions

.env_example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ AZURE_OPENAI_GPT4_CHAT_KEY="xxxxx"
5656
AZURE_OPENAI_GPT4_CHAT_MODEL="deployment-name"
5757
AZURE_OPENAI_GPT4_CHAT_UNDERLYING_MODEL=""
5858

59+
AZURE_OPENAI_GPT5_4_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"
60+
AZURE_OPENAI_GPT5_4_KEY="xxxxx"
61+
AZURE_OPENAI_GPT5_4_MODEL="gpt-5.4"
62+
AZURE_OPENAI_GPT5_4_UNDERLYING_MODEL="gpt-5.4"
63+
5964
# Endpoints that host models with fewer safety mechanisms (e.g. via adversarial fine tuning
6065
# or content filters turned off) can be defined below and used in adversarial attack testing scenarios.
6166
AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"

build_scripts/evaluate_scorers.py

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,44 +9,69 @@
99
1010
Usage:
1111
python build_scripts/evaluate_scorers.py
12+
python build_scripts/evaluate_scorers.py --tags refusal
13+
python build_scripts/evaluate_scorers.py --tags refusal,default
14+
python build_scripts/evaluate_scorers.py --max-concurrency 3
1215
"""
1316

17+
import argparse
1418
import asyncio
1519
import sys
1620
import time
1721

1822
from tqdm import tqdm
1923

2024
from pyrit.common.path import SCORER_EVALS_PATH
25+
from pyrit.exceptions.exception_context import ComponentRole, execution_context
2126
from pyrit.registry import ScorerRegistry
2227
from pyrit.setup import IN_MEMORY, initialize_pyrit_async
2328
from pyrit.setup.initializers import ScorerInitializer, TargetInitializer
2429

2530

26-
async def evaluate_scorers() -> None:
31+
async def evaluate_scorers(tags: list[str] | None = None, max_concurrency: int = 5) -> None:
2732
"""
2833
Evaluate multiple scorers against their configured datasets.
2934
3035
This will:
3136
1. Initialize PyRIT with in-memory database
3237
2. Register all scorers from ScorerInitializer into the ScorerRegistry
33-
3. Iterate through all registered scorers
38+
3. Iterate through registered scorers (optionally filtered by tags)
3439
4. Run evaluate_async() on each scorer
3540
5. Save results to scorer_evals directory
41+
42+
Args:
43+
tags: Optional list of tags to filter which scorers to evaluate.
44+
When provided, only scorers matching any of the tags are evaluated.
45+
When None, all scorers are evaluated.
46+
max_concurrency: Maximum number of concurrent scoring requests per scorer.
47+
Defaults to 5.
3648
"""
3749
print("Initializing PyRIT...")
3850
target_init = TargetInitializer()
3951
target_init.params = {"tags": ["default", "scorer"]}
52+
scorer_init = ScorerInitializer()
4053
await initialize_pyrit_async(
4154
memory_db_type=IN_MEMORY,
42-
initializers=[target_init, ScorerInitializer()],
55+
initializers=[target_init, scorer_init],
4356
)
4457

4558
registry = ScorerRegistry.get_registry_singleton()
46-
scorer_names = registry.get_names()
59+
60+
# Filter scorers by tags if specified
61+
if tags:
62+
scorer_names: list[str] = []
63+
for tag in tags:
64+
entries = registry.get_by_tag(tag=tag)
65+
scorer_names.extend(entry.name for entry in entries if entry.name not in scorer_names)
66+
scorer_names.sort()
67+
print(f"\nFiltering by tags: {tags}")
68+
else:
69+
scorer_names = registry.get_names()
4770

4871
if not scorer_names:
4972
print("No scorers registered. Check environment variable configuration.")
73+
if tags:
74+
print(f" (filtered by tags: {tags})")
5075
return
5176

5277
print(f"\nEvaluating {len(scorer_names)} scorer(s)...\n")
@@ -68,10 +93,14 @@ async def evaluate_scorers() -> None:
6893

6994
try:
7095
print(" Status: Running evaluations...")
71-
results = await scorer.evaluate_async(
72-
num_scorer_trials=3,
73-
max_concurrency=10,
74-
)
96+
with execution_context(
97+
component_role=ComponentRole.OBJECTIVE_SCORER,
98+
component_identifier=scorer.get_identifier(),
99+
):
100+
results = await scorer.evaluate_async(
101+
num_scorer_trials=3,
102+
max_concurrency=max_concurrency,
103+
)
75104

76105
elapsed_time = time.time() - start_time
77106

@@ -95,21 +124,48 @@ async def evaluate_scorers() -> None:
95124
print("=" * 60)
96125

97126

127+
def parse_args() -> argparse.Namespace:
128+
"""Parse command-line arguments."""
129+
parser = argparse.ArgumentParser(
130+
description="Evaluate PyRIT scorers against human-labeled datasets.",
131+
)
132+
parser.add_argument(
133+
"--tags",
134+
type=str,
135+
default=None,
136+
help="Comma-separated list of tags to filter which scorers to evaluate (e.g., --tags refusal,default)",
137+
)
138+
parser.add_argument(
139+
"--max-concurrency",
140+
type=int,
141+
default=5,
142+
help="Maximum number of concurrent scoring requests per scorer (default: 5)",
143+
)
144+
return parser.parse_args()
145+
146+
98147
if __name__ == "__main__":
148+
args = parse_args()
149+
tag_list = [t.strip() for t in args.tags.split(",")] if args.tags else None
150+
max_concurrency = args.max_concurrency
151+
99152
print("=" * 60)
100153
print("PyRIT Scorer Evaluation Script")
101154
print("=" * 60)
102155
print("This script will evaluate multiple scorers against human-labeled")
103156
print("datasets. This is a long-running process that may take several")
104157
print("minutes to hours depending on the number of scorers and datasets.")
105158
print()
159+
if tag_list:
160+
print(f"Filtering by tags: {tag_list}")
161+
print(f"Max concurrency: {max_concurrency}")
106162
print("Results will be saved to the registry files in:")
107163
print(f" {SCORER_EVALS_PATH}")
108164
print("=" * 60)
109165
print()
110166

111167
try:
112-
asyncio.run(evaluate_scorers())
168+
asyncio.run(evaluate_scorers(tags=tag_list, max_concurrency=max_concurrency))
113169
except KeyboardInterrupt:
114170
print("\n\nEvaluation interrupted by user.")
115171
sys.exit(1)

doc/code/scoring/8_scorer_metrics.ipynb

Lines changed: 114 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@
278278
"name": "stdout",
279279
"output_type": "stream",
280280
"text": [
281-
"Found 11 scorer configurations in the registry\n",
281+
"Found 10 scorer configurations in the metrics file\n",
282282
"\n",
283283
"Top 5 configurations by F1 Score:\n",
284284
"--------------------------------------------------------------------------------\n",
@@ -295,12 +295,12 @@
295295
"\u001b[36m • model_name: gpt-4o\u001b[0m\n",
296296
"\n",
297297
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
298-
"\u001b[36m • Accuracy: 84.84%\u001b[0m\n",
299-
"\u001b[36m • Accuracy Std Error: ±0.0185\u001b[0m\n",
300-
"\u001b[36m • F1 Score: 0.8606\u001b[0m\n",
301-
"\u001b[36m • Precision: 0.7928\u001b[0m\n",
302-
"\u001b[32m • Recall: 0.9412\u001b[0m\n",
303-
"\u001b[36m • Average Score Time: 1.27s\u001b[0m\n",
298+
"\u001b[36m • Accuracy: 83.29%\u001b[0m\n",
299+
"\u001b[36m • Accuracy Std Error: ±0.0188\u001b[0m\n",
300+
"\u001b[36m • F1 Score: 0.8472\u001b[0m\n",
301+
"\u001b[36m • Precision: 0.7593\u001b[0m\n",
302+
"\u001b[32m • Recall: 0.9581\u001b[0m\n",
303+
"\u001b[36m • Average Score Time: 0.80s\u001b[0m\n",
304304
"\n",
305305
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
306306
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
@@ -314,12 +314,12 @@
314314
"\u001b[36m • model_name: gpt-4o-unsafe\u001b[0m\n",
315315
"\n",
316316
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
317-
"\u001b[36m • Accuracy: 79.26%\u001b[0m\n",
318-
"\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n",
319-
"\u001b[36m • F1 Score: 0.8259\u001b[0m\n",
320-
"\u001b[36m • Precision: 0.7088\u001b[0m\n",
321-
"\u001b[32m • Recall: 0.9893\u001b[0m\n",
322-
"\u001b[36m • Average Score Time: 1.52s\u001b[0m\n",
317+
"\u001b[36m • Accuracy: 79.24%\u001b[0m\n",
318+
"\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n",
319+
"\u001b[36m • F1 Score: 0.8210\u001b[0m\n",
320+
"\u001b[36m • Precision: 0.7041\u001b[0m\n",
321+
"\u001b[32m • Recall: 0.9843\u001b[0m\n",
322+
"\u001b[36m • Average Score Time: 0.99s\u001b[0m\n",
323323
"\n",
324324
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
325325
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
@@ -334,12 +334,12 @@
334334
"\u001b[36m • temperature: 0.9\u001b[0m\n",
335335
"\n",
336336
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
337-
"\u001b[36m • Accuracy: 78.46%\u001b[0m\n",
338-
"\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n",
339-
"\u001b[36m • F1 Score: 0.8204\u001b[0m\n",
340-
"\u001b[36m • Precision: 0.7008\u001b[0m\n",
341-
"\u001b[32m • Recall: 0.9893\u001b[0m\n",
342-
"\u001b[36m • Average Score Time: 1.77s\u001b[0m\n",
337+
"\u001b[36m • Accuracy: 77.72%\u001b[0m\n",
338+
"\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n",
339+
"\u001b[36m • F1 Score: 0.8095\u001b[0m\n",
340+
"\u001b[31m • Precision: 0.6900\u001b[0m\n",
341+
"\u001b[32m • Recall: 0.9791\u001b[0m\n",
342+
"\u001b[36m • Average Score Time: 1.36s\u001b[0m\n",
343343
"\n",
344344
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
345345
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
@@ -350,12 +350,12 @@
350350
"\u001b[36m • temperature: 0.9\u001b[0m\n",
351351
"\n",
352352
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
353-
"\u001b[36m • Accuracy: 78.46%\u001b[0m\n",
354-
"\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n",
355-
"\u001b[36m • F1 Score: 0.7582\u001b[0m\n",
356-
"\u001b[36m • Precision: 0.8581\u001b[0m\n",
357-
"\u001b[31m • Recall: 0.6791\u001b[0m\n",
358-
"\u001b[36m • Average Score Time: 2.39s\u001b[0m\n",
353+
"\u001b[36m • Accuracy: 81.27%\u001b[0m\n",
354+
"\u001b[36m • Accuracy Std Error: ±0.0196\u001b[0m\n",
355+
"\u001b[36m • F1 Score: 0.7836\u001b[0m\n",
356+
"\u001b[36m • Precision: 0.8874\u001b[0m\n",
357+
"\u001b[36m • Recall: 0.7016\u001b[0m\n",
358+
"\u001b[36m • Average Score Time: 2.01s\u001b[0m\n",
359359
"\n",
360360
"\u001b[1m 📊 Scorer Information\u001b[0m\n",
361361
"\u001b[37m ▸ Scorer Identifier\u001b[0m\n",
@@ -366,19 +366,19 @@
366366
"\u001b[36m • temperature: 0.9\u001b[0m\n",
367367
"\n",
368368
"\u001b[37m ▸ Performance Metrics\u001b[0m\n",
369-
"\u001b[36m • Accuracy: 73.40%\u001b[0m\n",
370-
"\u001b[36m • Accuracy Std Error: ±0.0228\u001b[0m\n",
371-
"\u001b[31m • F1 Score: 0.6732\u001b[0m\n",
372-
"\u001b[36m • Precision: 0.8655\u001b[0m\n",
373-
"\u001b[31m • Recall: 0.5508\u001b[0m\n",
374-
"\u001b[36m • Average Score Time: 2.23s\u001b[0m\n",
369+
"\u001b[36m • Accuracy: 74.18%\u001b[0m\n",
370+
"\u001b[36m • Accuracy Std Error: ±0.0220\u001b[0m\n",
371+
"\u001b[31m • F1 Score: 0.6731\u001b[0m\n",
372+
"\u001b[36m • Precision: 0.8678\u001b[0m\n",
373+
"\u001b[31m • Recall: 0.5497\u001b[0m\n",
374+
"\u001b[36m • Average Score Time: 1.83s\u001b[0m\n",
375375
"\n",
376376
"================================================================================\n",
377-
"Best Accuracy: 84.84%\n",
377+
"Best Accuracy: 83.29%\n",
378378
"Best Precision: 0.989\n",
379-
"Best Recall: 0.989\n",
380-
"Fastest: 0.129 seconds\n",
381-
"Slowest: 3.520 seconds\n"
379+
"Best Recall: 0.984\n",
380+
"Fastest: 0.134 seconds\n",
381+
"Slowest: 2.390 seconds\n"
382382
]
383383
}
384384
],
@@ -388,7 +388,7 @@
388388
"# Load all objective scorer metrics - returns ScorerMetricsWithIdentity[ObjectiveScorerMetrics]\n",
389389
"all_scorers = get_all_objective_metrics()\n",
390390
"\n",
391-
"print(f\"Found {len(all_scorers)} scorer configurations in the registry\\n\")\n",
391+
"print(f\"Found {len(all_scorers)} scorer configurations in the metrics file\\n\")\n",
392392
"\n",
393393
"# Sort by F1 score - type checker knows entry.metrics is ObjectiveScorerMetrics\n",
394394
"sorted_by_f1 = sorted(all_scorers, key=lambda x: x.metrics.f1_score, reverse=True)\n",
@@ -650,6 +650,84 @@
650650
" - For harm scorers: 0.0-1.0 float values\n",
651651
"- `data_type`: Type of content (defaults to \"text\")"
652652
]
653+
},
654+
{
655+
"cell_type": "markdown",
656+
"id": "18",
657+
"metadata": {},
658+
"source": [
659+
"## Batch Evaluation with `evaluate_scorers.py`\n",
660+
"\n",
661+
"While `evaluate_async()` runs evaluations for a single scorer, the `evaluate_scorers.py` script\n",
662+
"evaluates **all registered scorers** in bulk. This is useful for benchmarking after changing scorer\n",
663+
"prompts, adding new variants, or updating human-labeled datasets.\n",
664+
"\n",
665+
"The script initializes PyRIT with `ScorerInitializer` (which registers all configured scorers),\n",
666+
"then runs `evaluate_async()` on each one. Results are saved to the JSONL registry files in\n",
667+
"`pyrit/datasets/scorer_evals/`.\n",
668+
"\n",
669+
"### Basic Usage\n",
670+
"\n",
671+
"```bash\n",
672+
"# Evaluate all registered scorers (long-running — can take hours)\n",
673+
"python build_scripts/evaluate_scorers.py\n",
674+
"\n",
675+
"# Evaluate only scorers with specific tags\n",
676+
"python build_scripts/evaluate_scorers.py --tags refusal\n",
677+
"python build_scripts/evaluate_scorers.py --tags refusal,default\n",
678+
"\n",
679+
"# Control parallelism (default: 5, lower if hitting rate limits)\n",
680+
"python build_scripts/evaluate_scorers.py --max-concurrency 3\n",
681+
"```\n",
682+
"\n",
683+
"### Tags\n",
684+
"\n",
685+
"`ScorerInitializer` applies tags to scorers during registration. These tags let you target\n",
686+
"specific subsets for evaluation:\n",
687+
"\n",
688+
"- `refusal` — The 4 standalone refusal scorer variants\n",
689+
"- `default` — All scorers registered by default\n",
690+
"- `best_refusal_f1` — The refusal variant with the highest F1 (set dynamically from metrics)\n",
691+
"- `best_objective_f1` — The objective scorer with the highest F1\n",
692+
"\n",
693+
"### Recommended Workflow: Refusal → Dependent Scorers\n",
694+
"\n",
695+
"When refusal scorer prompts or datasets change, the recommended workflow is:\n",
696+
"\n",
697+
"**Step 1: Evaluate refusal scorers first**\n",
698+
"\n",
699+
"```bash\n",
700+
"python build_scripts/evaluate_scorers.py --tags refusal\n",
701+
"```\n",
702+
"\n",
703+
"This evaluates only the 4 refusal variants and writes results to\n",
704+
"`refusal_scorer/refusal_metrics.jsonl`. After this step, `ScorerInitializer` can determine which\n",
705+
"refusal variant has the best F1 and tag it as `best_refusal_f1`.\n",
706+
"\n",
707+
"**Step 2: Re-evaluate all scorers**\n",
708+
"\n",
709+
"```bash\n",
710+
"python build_scripts/evaluate_scorers.py\n",
711+
"```\n",
712+
"\n",
713+
"On the next full run, `ScorerInitializer` reads the refusal metrics from Step 1, picks the best\n",
714+
"refusal variant, and uses it to build dependent scorers (e.g., `TrueFalseInverterScorer` wrapping\n",
715+
"the best refusal scorer). This ensures objective scorers that depend on refusal detection use the\n",
716+
"best-performing refusal prompt.\n",
717+
"\n",
718+
"Scorers whose metrics are already up-to-date (same dataset version, sufficient trials) are\n",
719+
"automatically skipped, so re-running the full script is efficient.\n",
720+
"\n",
721+
"**Step 3: Commit updated metrics**\n",
722+
"\n",
723+
"```bash\n",
724+
"git add pyrit/datasets/scorer_evals/\n",
725+
"git commit -m \"chore: update scorer metrics\"\n",
726+
"```\n",
727+
"\n",
728+
"The updated JSONL files should be checked in so that `ScorerInitializer` can read them at runtime\n",
729+
"to select the best scorers."
730+
]
653731
}
654732
],
655733
"metadata": {

0 commit comments

Comments (0)