diff --git a/.github/workflows/dependabot-automerge.yml b/.github/workflows/dependabot-automerge.yml
new file mode 100644
index 00000000..d7529fa3
--- /dev/null
+++ b/.github/workflows/dependabot-automerge.yml
@@ -0,0 +1,23 @@
+name: Dependabot Auto-merge
+
+on:
+  pull_request:
+    branches: ["main"]
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  auto-merge:
+    name: Auto-merge Dependabot PR
+    runs-on: ubuntu-latest
+    # Gate on the PR author, not github.actor: the actor is whoever triggered
+    # the latest event, which is not necessarily the account that opened the PR.
+    if: github.event.pull_request.user.login == 'dependabot[bot]'
+    steps:
+      - name: Enable auto-merge
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{ github.event.pull_request.html_url }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6b7a995b..1420e940 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## v0.10.1 (2026-05-22)
+
+### fix
+
+- replace hardcoded OpenAI model defaults with local Ollama defaults
+- resolve top-level 'judge' dict before falling back to gpt-4-0613 default
+- move examples/ inside hackagent package for correct wheel packaging
+
 ## v0.10.0 (2026-05-22)
 
 ### ✨ Features
diff --git a/hackagent/attacks/evaluator/evaluation_step.py b/hackagent/attacks/evaluator/evaluation_step.py
index ef68c080..a05daada 100644
--- a/hackagent/attacks/evaluator/evaluation_step.py
+++ b/hackagent/attacks/evaluator/evaluation_step.py
@@ -51,6 +51,11 @@ def execute(self, input_data):
 from hackagent.attacks.shared.router_factory import extract_passthrough_request_config
 from hackagent.attacks.evaluator.sync import sync_evaluation_to_server
 from hackagent.attacks.techniques.advprefix.config import EvaluatorConfig
+from hackagent.attacks.techniques.config import (
+    DEFAULT_JUDGE_IDENTIFIER,
+    DEFAULT_LOCAL_AGENT_TYPE,
+    DEFAULT_LOCAL_MODEL_ENDPOINT,
+)
 from hackagent.server.client import AuthenticatedClient
 from hackagent.router.types import AgentTypeEnum
 
@@ -349,14 +354,17 @@ def _build_base_eval_config(
     def _resolve_judges_from_config(
         self,
         technique_params: Optional[Dict[str, Any]] = None,
-        default_judge: str = "gpt-4-0613",
-        default_type: str = "jailbreakbench",
+        default_judge: str = DEFAULT_JUDGE_IDENTIFIER,
+        default_type: str = "harmbench",
     ) -> List[Dict[str, Any]]:
         """
         Resolve the judges list from ``_raw_config``.
 
-        If no top-level ``judges`` key is present, builds a single-judge
-        fallback from *technique_params* for backward compatibility.
+        Resolution order:
+        1. Top-level ``judges`` list in raw config.
+        2. Top-level ``judge`` dict in raw config (wrapped in a list).
+        3. ``technique_params["judge"]`` string (legacy fallback).
+        4. ``default_judge`` / ``default_type`` hardcoded defaults.
 
         Args:
             technique_params: Technique-specific params dict with legacy
@@ -371,6 +379,11 @@ def _resolve_judges_from_config(
         if isinstance(judges, list) and judges:
             return judges
 
+        # Use the top-level "judge" dict if present (e.g. from Ollama/local configs).
+        raw_judge = self._raw_config.get("judge")
+        if isinstance(raw_judge, dict) and raw_judge:
+            return [raw_judge]
+
         tp = technique_params or {}
         judge_model = tp.get("judge", default_judge)
         judge_type = tp.get("judge_type") or self.infer_judge_type(
@@ -380,6 +393,11 @@ def _resolve_judges_from_config(
             "identifier": judge_model,
             "type": judge_type,
         }
+        # For the built-in local default, inject Ollama connectivity so it
+        # works out-of-the-box without any API key.
+        if judge_model == DEFAULT_JUDGE_IDENTIFIER:
+            fallback.setdefault("endpoint", DEFAULT_LOCAL_MODEL_ENDPOINT)
+            fallback.setdefault("agent_type", DEFAULT_LOCAL_AGENT_TYPE)
         for key in (
             "endpoint",
             "agent_type",
diff --git a/hackagent/attacks/techniques/flipattack/attack.py b/hackagent/attacks/techniques/flipattack/attack.py
index 8b41db5a..1dcba302 100644
--- a/hackagent/attacks/techniques/flipattack/attack.py
+++ b/hackagent/attacks/techniques/flipattack/attack.py
@@ -41,6 +41,7 @@
 from hackagent.router.router import AgentRouter
 from hackagent.attacks.techniques.base import BaseAttack
 from hackagent.attacks.shared.tui import with_tui_logging
+from hackagent.attacks.techniques.config import DEFAULT_JUDGE_IDENTIFIER
 
 from . import generation, evaluation
 from .config import DEFAULT_FLIPATTACK_CONFIG
@@ -461,7 +462,7 @@ def run(self, goals: List[str]) -> List[Dict]:
             "cot": flipattack_params.get("cot", False),
             "lang_gpt": flipattack_params.get("lang_gpt", False),
             "few_shot": flipattack_params.get("few_shot", False),
-            "judge": flipattack_params.get("judge", "gpt-4-0613"),
+            "judge": flipattack_params.get("judge", DEFAULT_JUDGE_IDENTIFIER),
         }
 
         # Initialize goal contexts upfront so goal elapsed_s covers the full
diff --git a/hackagent/cli/tui/attack_specs.py b/hackagent/cli/tui/attack_specs.py
index 772df0bc..9e07ea2e 100644
--- a/hackagent/cli/tui/attack_specs.py
+++ b/hackagent/cli/tui/attack_specs.py
@@ -26,6 +26,8 @@
 from enum import Enum
 from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 
+from hackagent.attacks.techniques.config import DEFAULT_ATTACKER_IDENTIFIER
+
 
 # =====================================================================
 # Field / Spec primitives
@@ -578,7 +580,7 @@ def get_all_attack_specs() -> Dict[str, AttackConfigSpec]:
                 key="attacker.model",
                 label="Attacker Model",
                 field_type=FieldType.STRING,
-                default="gpt-4",
+                default=DEFAULT_ATTACKER_IDENTIFIER,
                 description="Model ID for the attacker LLM that generates prompts.",
                 section="Attacker LLM",
             ),
@@ -1305,7 +1307,7 @@ def get_all_attack_specs() -> Dict[str, AttackConfigSpec]:
                 key="attacker.identifier",
                 label="Attacker Model",
                 field_type=FieldType.STRING,
-                default="gpt-4o-mini",
+                default=DEFAULT_ATTACKER_IDENTIFIER,
                 description="Model identifier for persuasive paraphrasing.",
                 section="Attacker LLM",
             ),
diff --git a/pyproject.toml b/pyproject.toml
index 2fef50df..47a1d729 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "hackagent"
-version = "0.10.0"
+version = "0.10.1"
 description = "HackAgent is an open-source security toolkit to detect vulnerabilities of your AI Agents."
 authors = [
     {name = "AI Security Lab", email = "ais@ai4i.it"}
diff --git a/tests/integration/attacks/test_evaluation_step.py b/tests/integration/attacks/test_evaluation_step.py
index aec5cfd6..52d279f4 100644
--- a/tests/integration/attacks/test_evaluation_step.py
+++ b/tests/integration/attacks/test_evaluation_step.py
@@ -41,6 +41,7 @@
     JUDGE_TYPE_LABELS,
     MERGE_KEYS,
 )
+from hackagent.attacks.techniques.config import DEFAULT_JUDGE_IDENTIFIER
 from hackagent.router.types import AgentTypeEnum
 
 logger = logging.getLogger(__name__)
@@ -297,9 +298,9 @@ def test_fallback_to_defaults_with_no_params(self):
         judges = step._resolve_judges_from_config()
 
         assert len(judges) == 1
-        assert judges[0]["identifier"] == "gpt-4-0613"
-        # default_type in _resolve_judges_from_config is "jailbreakbench"
-        assert judges[0]["type"] == "jailbreakbench"
+        assert judges[0]["identifier"] == DEFAULT_JUDGE_IDENTIFIER
+        # default_type in _resolve_judges_from_config is "harmbench"
+        assert judges[0]["type"] == "harmbench"
 
     def test_multiple_judges(self):
         """Test with multiple judges configured."""
diff --git a/uv.lock b/uv.lock
index b4c831c5..a6ce3655 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2368,7 +2368,7 @@ wheels = [
 
 [[package]]
 name = "hackagent"
-version = "0.9.1"
+version = "0.10.1"
 source = { editable = "." }
 dependencies = [
     { name = "click" },