|
278 | 278 | "name": "stdout", |
279 | 279 | "output_type": "stream", |
280 | 280 | "text": [ |
281 | | - "Found 11 scorer configurations in the registry\n", |
| 281 | + "Found 10 scorer configurations in the metrics file\n", |
282 | 282 | "\n", |
283 | 283 | "Top 5 configurations by F1 Score:\n", |
284 | 284 | "--------------------------------------------------------------------------------\n", |
|
295 | 295 | "\u001b[36m • model_name: gpt-4o\u001b[0m\n", |
296 | 296 | "\n", |
297 | 297 | "\u001b[37m ▸ Performance Metrics\u001b[0m\n", |
298 | | - "\u001b[36m • Accuracy: 84.84%\u001b[0m\n", |
299 | | - "\u001b[36m • Accuracy Std Error: ±0.0185\u001b[0m\n", |
300 | | - "\u001b[36m • F1 Score: 0.8606\u001b[0m\n", |
301 | | - "\u001b[36m • Precision: 0.7928\u001b[0m\n", |
302 | | - "\u001b[32m • Recall: 0.9412\u001b[0m\n", |
303 | | - "\u001b[36m • Average Score Time: 1.27s\u001b[0m\n", |
| 298 | + "\u001b[36m • Accuracy: 83.29%\u001b[0m\n", |
| 299 | + "\u001b[36m • Accuracy Std Error: ±0.0188\u001b[0m\n", |
| 300 | + "\u001b[36m • F1 Score: 0.8472\u001b[0m\n", |
| 301 | + "\u001b[36m • Precision: 0.7593\u001b[0m\n", |
| 302 | + "\u001b[32m • Recall: 0.9581\u001b[0m\n", |
| 303 | + "\u001b[36m • Average Score Time: 0.80s\u001b[0m\n", |
304 | 304 | "\n", |
305 | 305 | "\u001b[1m 📊 Scorer Information\u001b[0m\n", |
306 | 306 | "\u001b[37m ▸ Scorer Identifier\u001b[0m\n", |
|
314 | 314 | "\u001b[36m • model_name: gpt-4o-unsafe\u001b[0m\n", |
315 | 315 | "\n", |
316 | 316 | "\u001b[37m ▸ Performance Metrics\u001b[0m\n", |
317 | | - "\u001b[36m • Accuracy: 79.26%\u001b[0m\n", |
318 | | - "\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n", |
319 | | - "\u001b[36m • F1 Score: 0.8259\u001b[0m\n", |
320 | | - "\u001b[36m • Precision: 0.7088\u001b[0m\n", |
321 | | - "\u001b[32m • Recall: 0.9893\u001b[0m\n", |
322 | | - "\u001b[36m • Average Score Time: 1.52s\u001b[0m\n", |
| 317 | + "\u001b[36m • Accuracy: 79.24%\u001b[0m\n", |
| 318 | + "\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n", |
| 319 | + "\u001b[36m • F1 Score: 0.8210\u001b[0m\n", |
| 320 | + "\u001b[36m • Precision: 0.7041\u001b[0m\n", |
| 321 | + "\u001b[32m • Recall: 0.9843\u001b[0m\n", |
| 322 | + "\u001b[36m • Average Score Time: 0.99s\u001b[0m\n", |
323 | 323 | "\n", |
324 | 324 | "\u001b[1m 📊 Scorer Information\u001b[0m\n", |
325 | 325 | "\u001b[37m ▸ Scorer Identifier\u001b[0m\n", |
|
334 | 334 | "\u001b[36m • temperature: 0.9\u001b[0m\n", |
335 | 335 | "\n", |
336 | 336 | "\u001b[37m ▸ Performance Metrics\u001b[0m\n", |
337 | | - "\u001b[36m • Accuracy: 78.46%\u001b[0m\n", |
338 | | - "\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n", |
339 | | - "\u001b[36m • F1 Score: 0.8204\u001b[0m\n", |
340 | | - "\u001b[36m • Precision: 0.7008\u001b[0m\n", |
341 | | - "\u001b[32m • Recall: 0.9893\u001b[0m\n", |
342 | | - "\u001b[36m • Average Score Time: 1.77s\u001b[0m\n", |
| 337 | + "\u001b[36m • Accuracy: 77.72%\u001b[0m\n", |
| 338 | + "\u001b[36m • Accuracy Std Error: ±0.0209\u001b[0m\n", |
| 339 | + "\u001b[36m • F1 Score: 0.8095\u001b[0m\n", |
| 340 | + "\u001b[31m • Precision: 0.6900\u001b[0m\n", |
| 341 | + "\u001b[32m • Recall: 0.9791\u001b[0m\n", |
| 342 | + "\u001b[36m • Average Score Time: 1.36s\u001b[0m\n", |
343 | 343 | "\n", |
344 | 344 | "\u001b[1m 📊 Scorer Information\u001b[0m\n", |
345 | 345 | "\u001b[37m ▸ Scorer Identifier\u001b[0m\n", |
|
350 | 350 | "\u001b[36m • temperature: 0.9\u001b[0m\n", |
351 | 351 | "\n", |
352 | 352 | "\u001b[37m ▸ Performance Metrics\u001b[0m\n", |
353 | | - "\u001b[36m • Accuracy: 78.46%\u001b[0m\n", |
354 | | - "\u001b[36m • Accuracy Std Error: ±0.0212\u001b[0m\n", |
355 | | - "\u001b[36m • F1 Score: 0.7582\u001b[0m\n", |
356 | | - "\u001b[36m • Precision: 0.8581\u001b[0m\n", |
357 | | - "\u001b[31m • Recall: 0.6791\u001b[0m\n", |
358 | | - "\u001b[36m • Average Score Time: 2.39s\u001b[0m\n", |
| 353 | + "\u001b[36m • Accuracy: 81.27%\u001b[0m\n", |
| 354 | + "\u001b[36m • Accuracy Std Error: ±0.0196\u001b[0m\n", |
| 355 | + "\u001b[36m • F1 Score: 0.7836\u001b[0m\n", |
| 356 | + "\u001b[36m • Precision: 0.8874\u001b[0m\n", |
| 357 | + "\u001b[36m • Recall: 0.7016\u001b[0m\n", |
| 358 | + "\u001b[36m • Average Score Time: 2.01s\u001b[0m\n", |
359 | 359 | "\n", |
360 | 360 | "\u001b[1m 📊 Scorer Information\u001b[0m\n", |
361 | 361 | "\u001b[37m ▸ Scorer Identifier\u001b[0m\n", |
|
366 | 366 | "\u001b[36m • temperature: 0.9\u001b[0m\n", |
367 | 367 | "\n", |
368 | 368 | "\u001b[37m ▸ Performance Metrics\u001b[0m\n", |
369 | | - "\u001b[36m • Accuracy: 73.40%\u001b[0m\n", |
370 | | - "\u001b[36m • Accuracy Std Error: ±0.0228\u001b[0m\n", |
371 | | - "\u001b[31m • F1 Score: 0.6732\u001b[0m\n", |
372 | | - "\u001b[36m • Precision: 0.8655\u001b[0m\n", |
373 | | - "\u001b[31m • Recall: 0.5508\u001b[0m\n", |
374 | | - "\u001b[36m • Average Score Time: 2.23s\u001b[0m\n", |
| 369 | + "\u001b[36m • Accuracy: 74.18%\u001b[0m\n", |
| 370 | + "\u001b[36m • Accuracy Std Error: ±0.0220\u001b[0m\n", |
| 371 | + "\u001b[31m • F1 Score: 0.6731\u001b[0m\n", |
| 372 | + "\u001b[36m • Precision: 0.8678\u001b[0m\n", |
| 373 | + "\u001b[31m • Recall: 0.5497\u001b[0m\n", |
| 374 | + "\u001b[36m • Average Score Time: 1.83s\u001b[0m\n", |
375 | 375 | "\n", |
376 | 376 | "================================================================================\n", |
377 | | - "Best Accuracy: 84.84%\n", |
| 377 | + "Best Accuracy: 83.29%\n", |
378 | 378 | "Best Precision: 0.989\n", |
379 | | - "Best Recall: 0.989\n", |
380 | | - "Fastest: 0.129 seconds\n", |
381 | | - "Slowest: 3.520 seconds\n" |
| 379 | + "Best Recall: 0.984\n", |
| 380 | + "Fastest: 0.134 seconds\n", |
| 381 | + "Slowest: 2.390 seconds\n" |
382 | 382 | ] |
383 | 383 | } |
384 | 384 | ], |
|
388 | 388 | "# Load all objective scorer metrics - returns ScorerMetricsWithIdentity[ObjectiveScorerMetrics]\n", |
389 | 389 | "all_scorers = get_all_objective_metrics()\n", |
390 | 390 | "\n", |
391 | | - "print(f\"Found {len(all_scorers)} scorer configurations in the registry\\n\")\n", |
| 391 | + "print(f\"Found {len(all_scorers)} scorer configurations in the metrics file\\n\")\n", |
392 | 392 | "\n", |
393 | 393 | "# Sort by F1 score - type checker knows entry.metrics is ObjectiveScorerMetrics\n", |
394 | 394 | "sorted_by_f1 = sorted(all_scorers, key=lambda x: x.metrics.f1_score, reverse=True)\n", |
|
650 | 650 | " - For harm scorers: 0.0-1.0 float values\n", |
651 | 651 | "- `data_type`: Type of content (defaults to \"text\")" |
652 | 652 | ] |
| 653 | + }, |
| 654 | + { |
| 655 | + "cell_type": "markdown", |
| 656 | + "id": "18", |
| 657 | + "metadata": {}, |
| 658 | + "source": [ |
| 659 | + "## Batch Evaluation with `evaluate_scorers.py`\n", |
| 660 | + "\n", |
| 661 | + "While `evaluate_async()` runs evaluations for a single scorer, the `evaluate_scorers.py` script\n", |
| 662 | + "evaluates **all registered scorers** in bulk. This is useful for benchmarking after changing scorer\n", |
| 663 | + "prompts, adding new variants, or updating human-labeled datasets.\n", |
| 664 | + "\n", |
| 665 | + "The script initializes PyRIT with `ScorerInitializer` (which registers all configured scorers),\n", |
| 666 | + "then runs `evaluate_async()` on each one. Results are saved to the JSONL registry files in\n", |
| 667 | + "`pyrit/datasets/scorer_evals/`.\n", |
| 668 | + "\n", |
| 669 | + "### Basic Usage\n", |
| 670 | + "\n", |
| 671 | + "```bash\n", |
| 672 | + "# Evaluate all registered scorers (long-running — can take hours)\n", |
| 673 | + "python build_scripts/evaluate_scorers.py\n", |
| 674 | + "\n", |
| 675 | + "# Evaluate only scorers with specific tags\n", |
| 676 | + "python build_scripts/evaluate_scorers.py --tags refusal\n", |
| 677 | + "python build_scripts/evaluate_scorers.py --tags refusal,default\n", |
| 678 | + "\n", |
| 679 | + "# Control parallelism (default: 5, lower if hitting rate limits)\n", |
| 680 | + "python build_scripts/evaluate_scorers.py --max-concurrency 3\n", |
| 681 | + "```\n", |
| 682 | + "\n", |
| 683 | + "### Tags\n", |
| 684 | + "\n", |
| 685 | + "`ScorerInitializer` applies tags to scorers during registration. These tags let you target\n", |
| 686 | + "specific subsets for evaluation:\n", |
| 687 | + "\n", |
| 688 | + "- `refusal` — The 4 standalone refusal scorer variants\n", |
| 689 | + "- `default` — All scorers registered by default\n", |
| 690 | + "- `best_refusal_f1` — The refusal variant with the highest F1 (set dynamically from metrics)\n", |
| 691 | + "- `best_objective_f1` — The objective scorer with the highest F1\n", |
| 692 | + "\n", |
| 693 | + "### Recommended Workflow: Refusal → Dependent Scorers\n", |
| 694 | + "\n", |
| 695 | + "When refusal scorer prompts or datasets change, the recommended workflow is:\n", |
| 696 | + "\n", |
| 697 | + "**Step 1: Evaluate refusal scorers first**\n", |
| 698 | + "\n", |
| 699 | + "```bash\n", |
| 700 | + "python build_scripts/evaluate_scorers.py --tags refusal\n", |
| 701 | + "```\n", |
| 702 | + "\n", |
| 703 | + "This evaluates only the 4 refusal variants and writes results to\n", |
| 704 | + "`refusal_scorer/refusal_metrics.jsonl`. After this step, `ScorerInitializer` can determine which\n", |
| 705 | + "refusal variant has the best F1 and tag it as `best_refusal_f1`.\n", |
| 706 | + "\n", |
| 707 | + "**Step 2: Re-evaluate all scorers**\n", |
| 708 | + "\n", |
| 709 | + "```bash\n", |
| 710 | + "python build_scripts/evaluate_scorers.py\n", |
| 711 | + "```\n", |
| 712 | + "\n", |
| 713 | + "On the next full run, `ScorerInitializer` reads the refusal metrics from Step 1, picks the best\n", |
| 714 | + "refusal variant, and uses it to build dependent scorers (e.g., `TrueFalseInverterScorer` wrapping\n", |
| 715 | + "the best refusal scorer). This ensures objective scorers that depend on refusal detection use the\n", |
| 716 | + "best-performing refusal prompt.\n", |
| 717 | + "\n", |
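| | + "Conceptually, the dependent-scorer construction looks like this; `select_best_by_f1` and\n", |
| | + "`refusal_variants` are hypothetical stand-ins for `ScorerInitializer`'s internal logic:\n", |
| | + "\n", |
| | + "```python\n", |
| | + "from pyrit.score import TrueFalseInverterScorer\n", |
| | + "\n", |
| | + "# select_best_by_f1 is hypothetical; ScorerInitializer does the equivalent by\n", |
| | + "# reading refusal_metrics.jsonl and comparing F1 scores across variants.\n", |
| | + "best_refusal = select_best_by_f1(refusal_variants)\n", |
| | + "\n", |
| | + "# Inverting turns \"did not refuse\" into a success signal for objective scoring.\n", |
| | + "objective_scorer = TrueFalseInverterScorer(scorer=best_refusal)\n", |
| | + "```\n", |
| | + "\n", |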
| 718 | + "Scorers whose metrics are already up-to-date (same dataset version, sufficient trials) are\n", |
| 719 | + "automatically skipped, so re-running the full script is efficient.\n", |
| 720 | + "\n", |
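| | + "The skip decision amounts to a staleness check along these lines (field names here are\n", |
| | + "assumptions, not the actual metrics schema):\n", |
| | + "\n", |
| | + "```python\n", |
| | + "# Hypothetical sketch of the up-to-date check; real field names may differ.\n", |
| | + "def needs_evaluation(scorer, existing_metrics) -> bool:\n", |
| | + "    metrics = existing_metrics.get(scorer.identifier)\n", |
| | + "    if metrics is None:\n", |
| | + "        return True  # never evaluated\n", |
| | + "    return (metrics.dataset_version != scorer.dataset_version\n", |
| | + "            or metrics.num_trials < MIN_TRIALS)\n", |
| | + "```\n", |
| | + "\n", |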
| 721 | + "**Step 3: Commit updated metrics**\n", |
| 722 | + "\n", |
| 723 | + "```bash\n", |
| 724 | + "git add pyrit/datasets/scorer_evals/\n", |
| 725 | + "git commit -m \"chore: update scorer metrics\"\n", |
| 726 | + "```\n", |
| 727 | + "\n", |
| 728 | + "The updated JSONL files should be checked in so that `ScorerInitializer` can read them at runtime\n", |
| 729 | + "to select the best scorers." |
| 730 | + ] |
653 | 731 | } |
654 | 732 | ], |
655 | 733 | "metadata": { |
|