From 69d3ca81b67c13e2f17702630111ea1c2baa88d1 Mon Sep 17 00:00:00 2001 From: Varun Joginpalli Date: Wed, 1 Apr 2026 22:52:50 +0000 Subject: [PATCH] PR 1: Scenario Doc Pages --- doc/code/scenarios/2_content_harms.ipynb | 759 ++++++++++++++++++++ doc/code/scenarios/2_content_harms.py | 104 +++ doc/code/scenarios/3_psychosocial.ipynb | 782 +++++++++++++++++++++ doc/code/scenarios/3_psychosocial.py | 125 ++++ doc/code/scenarios/4_cyber.ipynb | 750 ++++++++++++++++++++ doc/code/scenarios/4_cyber.py | 92 +++ doc/code/scenarios/5_jailbreak.ipynb | 753 ++++++++++++++++++++ doc/code/scenarios/5_jailbreak.py | 101 +++ doc/code/scenarios/6_leakage.ipynb | 803 ++++++++++++++++++++++ doc/code/scenarios/6_leakage.py | 139 ++++ doc/code/scenarios/7_scam.ipynb | 753 ++++++++++++++++++++ doc/code/scenarios/7_scam.py | 94 +++ doc/code/scenarios/8_garak_encoding.ipynb | 761 ++++++++++++++++++++ doc/code/scenarios/8_garak_encoding.py | 111 +++ doc/myst.yml | 7 + 15 files changed, 6134 insertions(+) create mode 100644 doc/code/scenarios/2_content_harms.ipynb create mode 100644 doc/code/scenarios/2_content_harms.py create mode 100644 doc/code/scenarios/3_psychosocial.ipynb create mode 100644 doc/code/scenarios/3_psychosocial.py create mode 100644 doc/code/scenarios/4_cyber.ipynb create mode 100644 doc/code/scenarios/4_cyber.py create mode 100644 doc/code/scenarios/5_jailbreak.ipynb create mode 100644 doc/code/scenarios/5_jailbreak.py create mode 100644 doc/code/scenarios/6_leakage.ipynb create mode 100644 doc/code/scenarios/6_leakage.py create mode 100644 doc/code/scenarios/7_scam.ipynb create mode 100644 doc/code/scenarios/7_scam.py create mode 100644 doc/code/scenarios/8_garak_encoding.ipynb create mode 100644 doc/code/scenarios/8_garak_encoding.py diff --git a/doc/code/scenarios/2_content_harms.ipynb b/doc/code/scenarios/2_content_harms.ipynb new file mode 100644 index 0000000000..64b62a22b7 --- /dev/null +++ b/doc/code/scenarios/2_content_harms.ipynb @@ -0,0 +1,759 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8bd70cfc", + "metadata": {}, + "source": [ + "# 2. Content Harms Scenario\n", + "\n", + "The `ContentHarms` scenario tests whether a target model can be induced to generate harmful content across\n", + "seven harm categories: hate, fairness, violence, sexual, harassment, misinformation, and leakage. It combines\n", + "single-turn attacks (PromptSending, RolePlay) with multi-turn techniques (ManyShot, TAP) to provide broad\n", + "coverage of content safety risks.\n", + "\n", + "## Available Strategies\n", + "\n", + "Each strategy targets a specific harm category with its own dataset:\n", + "\n", + "| Strategy | CLI Value | Description |\n", + "|----------|-----------|-------------|\n", + "| ALL | `all` | Aggregate — runs all 7 harm categories |\n", + "| Hate | `hate` | Tests for hateful content generation |\n", + "| Fairness | `fairness` | Tests for unfair or biased content |\n", + "| Violence | `violence` | Tests for violent content generation |\n", + "| Sexual | `sexual` | Tests for sexual content generation |\n", + "| Harassment | `harassment` | Tests for harassing content generation |\n", + "| Misinformation | `misinformation` | Tests for misinformation generation |\n", + "| Leakage | `leakage` | Tests for data leakage in content |\n", + "\n", + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9f5f1183", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-01T21:58:32.721052Z", + "iopub.status.busy": "2026-04-01T21:58:32.720917Z", + "iopub.status.idle": "2026-04-01T21:58:46.139110Z", + "shell.execute_reply": "2026-04-01T21:58:46.137113Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found default environment files: ['./.pyrit/.env', './.pyrit/.env.local']\n", + "Loaded environment file: ./.pyrit/.env\n", + "Loaded environment file: ./.pyrit/.env.local\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + "Loading datasets - this can take a few minutes: 0%| | 0/58 [00:00