Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/workflows/dependabot-automerge.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Enables GitHub's auto-merge (squash) on pull requests opened by Dependabot
# against main. The PR is still only merged once every required status check
# and branch-protection rule has passed.
name: Dependabot Auto-merge

on:
  pull_request:
    branches: ["main"]

# `gh pr merge --auto` needs to write to the pull request and, on merge,
# to the repository contents.
permissions:
  contents: write
  pull-requests: write

jobs:
  auto-merge:
    name: Auto-merge Dependabot PR
    runs-on: ubuntu-latest
    # Gate on the PR *author*, not on `github.actor`. `github.actor` is the
    # account that triggered this particular run, so it does not reliably
    # identify who opened the pull request.
    if: github.event.pull_request.user.login == 'dependabot[bot]'
    steps:
      - name: Enable auto-merge
        run: gh pr merge --auto --squash "$PR_URL"
        env:
          # Passed through env (not interpolated into `run`) so the value is
          # never evaluated as shell source.
          PR_URL: ${{ github.event.pull_request.html_url }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## v0.10.1 (2026-05-22)

### fix

- replace hardcoded OpenAI model defaults with local Ollama defaults
- resolve top-level 'judge' dict before falling back to gpt-4-0613 default
- move examples/ inside hackagent package for correct wheel packaging

## v0.10.0 (2026-05-22)

### ✨ Features
Expand Down
26 changes: 22 additions & 4 deletions hackagent/attacks/evaluator/evaluation_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def execute(self, input_data):
from hackagent.attacks.shared.router_factory import extract_passthrough_request_config
from hackagent.attacks.evaluator.sync import sync_evaluation_to_server
from hackagent.attacks.techniques.advprefix.config import EvaluatorConfig
from hackagent.attacks.techniques.config import (
DEFAULT_JUDGE_IDENTIFIER,
DEFAULT_LOCAL_AGENT_TYPE,
DEFAULT_LOCAL_MODEL_ENDPOINT,
)
from hackagent.server.client import AuthenticatedClient
from hackagent.router.types import AgentTypeEnum

Expand Down Expand Up @@ -349,14 +354,17 @@ def _build_base_eval_config(
def _resolve_judges_from_config(
self,
technique_params: Optional[Dict[str, Any]] = None,
default_judge: str = "gpt-4-0613",
default_type: str = "jailbreakbench",
default_judge: str = DEFAULT_JUDGE_IDENTIFIER,
default_type: str = "harmbench",
) -> List[Dict[str, Any]]:
"""
Resolve the judges list from ``_raw_config``.

If no top-level ``judges`` key is present, builds a single-judge
fallback from *technique_params* for backward compatibility.
Resolution order:
1. Top-level ``judges`` list in raw config.
2. Top-level ``judge`` dict in raw config (wrapped in a list).
3. ``technique_params["judge"]`` string (legacy fallback).
4. ``default_judge`` / ``default_type`` hardcoded defaults.

Args:
technique_params: Technique-specific params dict with legacy
Expand All @@ -371,6 +379,11 @@ def _resolve_judges_from_config(
if isinstance(judges, list) and judges:
return judges

# Use the top-level "judge" dict if present (e.g. from Ollama/local configs).
raw_judge = self._raw_config.get("judge")
if isinstance(raw_judge, dict) and raw_judge:
return [raw_judge]

tp = technique_params or {}
judge_model = tp.get("judge", default_judge)
judge_type = tp.get("judge_type") or self.infer_judge_type(
Expand All @@ -380,6 +393,11 @@ def _resolve_judges_from_config(
"identifier": judge_model,
"type": judge_type,
}
# For the built-in local default, inject Ollama connectivity so it
# works out-of-the-box without any API key.
if judge_model == DEFAULT_JUDGE_IDENTIFIER:
fallback.setdefault("endpoint", DEFAULT_LOCAL_MODEL_ENDPOINT)
fallback.setdefault("agent_type", DEFAULT_LOCAL_AGENT_TYPE)
for key in (
"endpoint",
"agent_type",
Expand Down
3 changes: 2 additions & 1 deletion hackagent/attacks/techniques/flipattack/attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from hackagent.router.router import AgentRouter
from hackagent.attacks.techniques.base import BaseAttack
from hackagent.attacks.shared.tui import with_tui_logging
from hackagent.attacks.techniques.config import DEFAULT_JUDGE_IDENTIFIER

from . import generation, evaluation
from .config import DEFAULT_FLIPATTACK_CONFIG
Expand Down Expand Up @@ -461,7 +462,7 @@ def run(self, goals: List[str]) -> List[Dict]:
"cot": flipattack_params.get("cot", False),
"lang_gpt": flipattack_params.get("lang_gpt", False),
"few_shot": flipattack_params.get("few_shot", False),
"judge": flipattack_params.get("judge", "gpt-4-0613"),
"judge": flipattack_params.get("judge", DEFAULT_JUDGE_IDENTIFIER),
}

# Initialize goal contexts upfront so goal elapsed_s covers the full
Expand Down
6 changes: 4 additions & 2 deletions hackagent/cli/tui/attack_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from enum import Enum
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

from hackagent.attacks.techniques.config import DEFAULT_ATTACKER_IDENTIFIER


# =====================================================================
# Field / Spec primitives
Expand Down Expand Up @@ -578,7 +580,7 @@ def get_all_attack_specs() -> Dict[str, AttackConfigSpec]:
key="attacker.model",
label="Attacker Model",
field_type=FieldType.STRING,
default="gpt-4",
default=DEFAULT_ATTACKER_IDENTIFIER,
description="Model ID for the attacker LLM that generates prompts.",
section="Attacker LLM",
),
Expand Down Expand Up @@ -1305,7 +1307,7 @@ def get_all_attack_specs() -> Dict[str, AttackConfigSpec]:
key="attacker.identifier",
label="Attacker Model",
field_type=FieldType.STRING,
default="gpt-4o-mini",
default=DEFAULT_ATTACKER_IDENTIFIER,
description="Model identifier for persuasive paraphrasing.",
section="Attacker LLM",
),
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "hackagent"
version = "0.10.0"
version = "0.10.1"
description = "HackAgent is an open-source security toolkit to detect vulnerabilities of your AI Agents."
authors = [
{name = "AI Security Lab", email = "ais@ai4i.it"}
Expand Down
7 changes: 4 additions & 3 deletions tests/integration/attacks/test_evaluation_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
JUDGE_TYPE_LABELS,
MERGE_KEYS,
)
from hackagent.attacks.techniques.config import DEFAULT_JUDGE_IDENTIFIER
from hackagent.router.types import AgentTypeEnum

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -297,9 +298,9 @@ def test_fallback_to_defaults_with_no_params(self):
judges = step._resolve_judges_from_config()

assert len(judges) == 1
assert judges[0]["identifier"] == "gpt-4-0613"
# default_type in _resolve_judges_from_config is "jailbreakbench"
assert judges[0]["type"] == "jailbreakbench"
assert judges[0]["identifier"] == DEFAULT_JUDGE_IDENTIFIER
# default_type in _resolve_judges_from_config is "harmbench"
assert judges[0]["type"] == "harmbench"

def test_multiple_judges(self):
"""Test with multiple judges configured."""
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading