AISecurityLab · Nicola Franco (franconicola) · May 23, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -135,8 +135,11 @@ jobs:
           fail_ci_if_error: true
 
   # ── Integration (offline) ─────────────────────────────────────────────────
-  # Covers tests/integration/storage/ (LocalBackend + SQLite) and
-  # tests/integration/tui/ (mock-based). No external services required.
+  # Covers tests/integration/tui/ — mock-based TUI integration tests
+  # that exercise widget composition + lifecycle without external
+  # services. (The former tests/integration/storage/ moved to
+  # tests/unit/server/storage/ since it only exercised the in-memory
+  # local backend.)
   integration-offline:
     name: Integration Tests (Offline)
     runs-on: ubuntu-latest
@@ -161,7 +164,6 @@ jobs:
       - name: Run offline integration tests with coverage
         run: >
           uv run pytest
-          tests/integration/storage/
           tests/integration/tui/
           --run-integration
           -n auto
@@ -180,17 +182,32 @@ jobs:
           retention-days: 1
 
   # ── Integration (Ollama) ──────────────────────────────────────────────────
-  # Covers tests/integration/adapters/ and tests/integration/attacks/.
+  # Covers tests/integration/router/ and tests/integration/attacks/.
   # Requires a running Ollama instance with tinyllama.
+  #
+  # Sharded across two runners (real CPUs, real parallelism):
+  #   - shard=fast → ``-m "not slow"`` (the bulk; a few minutes)
+  #   - shard=slow → ``-m "slow"`` (advprefix multi-judge; ~14 min on CPU)
+  # Within each shard pytest-xdist spreads tests across runner cores
+  # with ``-n auto --dist=loadfile`` and Ollama is allowed to serve
+  # multiple concurrent requests via ``OLLAMA_NUM_PARALLEL=4``.
   integration-ollama:
-    name: Integration Tests (Ollama)
+    name: Integration Tests (Ollama, ${{ matrix.shard }})
     runs-on: ubuntu-latest
     timeout-minutes: 30
     if: github.event_name == 'pull_request' && github.base_ref == 'main'
+    strategy:
+      fail-fast: false
+      matrix:
+        shard:
+          - fast
+          - slow
     env:
       HACKAGENT_API_KEY: ${{ secrets.HACKAGENT_API_KEY }}
       OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
       OLLAMA_MODEL: tinyllama
+      # Let Ollama serve concurrent requests from pytest-xdist workers.
+      OLLAMA_NUM_PARALLEL: "4"
       TEST_MAX_TOKENS_FAST: "15"
       TEST_MAX_TOKENS_MEDIUM: "25"
       TEST_MAX_TOKENS_SLOW: "40"
@@ -222,28 +239,36 @@ jobs:
             ollama-models-tinyllama-
 
       - name: Pull Ollama model
+        # Integration tests reuse tinyllama for the target, attacker,
+        # judges, and category classifier (via
+        # ``_fast_classifier_config``). The orchestrator's implicit
+        # default classifier (``gemma3:4b``) is much slower on CPU
+        # runners and not pulled here on purpose.
         run: ollama pull tinyllama
 
       - name: Install dependencies
         run: uv sync --group dev
 
       - name: Run Ollama integration tests with coverage
+        # Each shard handles one ``-m`` selector so the slow advprefix
+        # test (~14 min on CPU) runs on its own runner instead of
+        # bottlenecking the rest of the suite.
         run: >
           uv run pytest
-          tests/integration/adapters/
+          tests/integration/router/
           tests/integration/attacks/
           --run-integration
-          -n 2
+          -n auto
           --dist=loadfile
-          -m "not slow"
+          -m "${{ matrix.shard == 'slow' && 'slow' || 'not slow' }}"
           -v --tb=short
           --cov --cov-fail-under=0
           --cov-report=xml:reports/coverage.xml
 
       - name: Upload Ollama-integration coverage artifact
         uses: actions/upload-artifact@v7
         with:
-          name: coverage-integration-ollama
+          name: coverage-integration-ollama-${{ matrix.shard }}
           path: reports/.coverage
           include-hidden-files: true
           retention-days: 1

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,8 +18,17 @@ repos:
   - repo: local
     hooks:
       - id: pytest
-        name: pytest
-        entry: uv run pytest --run-integration --ignore=tests/e2e/attacks
+        name: pytest (unit only)
+        # Integration + e2e tests run in GitHub Actions (see
+        # ``.github/workflows/ci.yml``). The local pre-commit only
+        # runs the unit suite so commits stay snappy. To run the full
+        # integration suite locally on demand:
+        #   uv run pytest tests/integration/ --run-integration
+        #
+        # ``-n 4`` (not ``-n auto``) so the hook works both on 4-vCPU
+        # CI runners and on shared HPC login nodes that advertise 64+
+        # logical CPUs but enforce per-user thread limits.
+        entry: uv run pytest tests/unit/ -n 4
         language: system
         pass_filenames: false
         files: ^(.*\.py|pyproject\.toml|poetry\.lock|.*requirements.*\.txt|.*package\.json|.*package-lock\.json)$
diff --git a/docs/docs/api-index.md b/docs/docs/api-index.md
@@ -20,4 +20,4 @@ For practical usage examples, see the [Python SDK Quickstart](./sdk/python-quick
 
 ---
 
-*Auto-generated from hackagent v0.6.0.*
+*Auto-generated from hackagent v0.10.1.*
diff --git a/docs/docs/cli/initialization.md b/docs/docs/cli/initialization.md
@@ -19,26 +19,22 @@ The initialization wizard will:
 1. **Display the HackAgent ASCII logo**
 2. **Set verbosity level** — Control logging detail (0=ERROR to 3=DEBUG)
 3. **Save configuration** — Stored in `~/.config/hackagent/config.json`
 
 ## Example Session
 
 ```bash
 $ hackagent init
 
-╭────────────────────────────────────────────────────────────────────────╮
-│                                                                        │
-│                                                                        │
-│                                                                        │
-│  ███████╗███████╗ ██████╗███████╗██╗   ██╗██╗  ██╗██╗     ██╗ █████╗   │
-│  ██╔════╝██╔════╝██╔════╝██╔════╝██║   ██║██║  ██║██║     ██║██╔══██╗  │
-│  ███████╗█████╗  ██║     █████╗  ██║   ██║███████║██║     ██║███████║  │
-│  ╚════██║██╔══╝  ██║     ██╔══╝  ╚██╗ ██╔╝╚════██║██║     ██║██╔══██║  │
-│  ███████║███████╗╚██████╗███████╗ ╚████╔╝      ██║███████╗██║██║  ██║  │
-│  ╚══════╝╚══════╝ ╚═════╝╚══════╝  ╚═══╝       ╚═╝╚══════╝╚═╝╚═╝  ╚═╝  │
-│                                                                        │
-│                                                                        │
-│                                                                        │
-╰────────────────────────────────────────────────────────────────────────╯
+╭──────────────────────────────────────────────────────────────────────────────────╮
+│                                                                                  │
+│   ██╗  ██╗ █████╗  ██████╗██╗  ██╗ █████╗  ██████╗ ███████╗███╗   ██╗████████╗   │
+│   ██║  ██║██╔══██╗██╔════╝██║ ██╔╝██╔══██╗██╔════╝ ██╔════╝████╗  ██║╚══██╔══╝   │
+│   ███████║███████║██║     █████╔╝ ███████║██║  ███╗█████╗  ██╔██╗ ██║   ██║      │
+│   ██╔══██║██╔══██║██║     ██╔═██╗ ██╔══██║██║   ██║██╔══╝  ██║╚██╗██║   ██║      │
+│   ██║  ██║██║  ██║╚██████╗██║  ██╗██║  ██║╚██████╔╝███████╗██║ ╚████║   ██║      │
+│   ╚═╝  ╚═╝╚═╝  ╚═╝ ╚═════╝╚═╝  ╚═╝╚═╝  ╚═╝ ╚═════╝ ╚══════╝╚═╝  ╚═══╝   ╚═╝      │
+│                                                                                  │
+╰──────────────────────────────────────────────────────────────────────────────────╯
 
 🔧 HackAgent CLI Setup Wizard
 Welcome! Let's get you set up for AI agent security testing.

diff --git a/docs/docs/hackagent/agent.md b/docs/docs/hackagent/agent.md
@@ -35,11 +35,14 @@ attack methodologies.
 def __init__(endpoint: str,
              name: Optional[str] = None,
              agent_type: Union[AgentTypeEnum, str] = AgentTypeEnum.UNKNOWN,
+             base_url: Optional[str] = None,
+             api_key: Optional[str] = None,
              raise_on_unexpected_status: bool = False,
              timeout: Optional[float] = None,
              metadata: Optional[Dict[str, Any]] = None,
              target_config: Optional[Dict[str, Any]] = None,
-             adapter_operational_config: Optional[Dict[str, Any]] = None)
+             adapter_operational_config: Optional[Dict[str, Any]] = None,
+             thinking: Optional[bool] = None)
 ```
 
 Initializes the HackAgent client and prepares it for interaction.
@@ -75,6 +78,10 @@ attack strategies.
   generation defaults such as `name`4, `name`5,
   and `name`0.
 - `name`7 - Optional configuration for the agent adapter.
+- `thinking` - Optional OLLAMA-only control for reasoning traces.
+  When set to `False`, requests sent through the target OLLAMA adapter
+  include `agent_type`0 to disable thinking output. Ignored for
+  non-OLLAMA target agent types.
 
 #### attack\_strategies
 
@@ -91,8 +98,7 @@ Lazy-loaded attack strategies dictionary.
 def hack(attack_config: Dict[str, Any],
          run_config_override: Optional[Dict[str, Any]] = None,
          fail_on_run_error: bool = True,
-         _tui_app: Optional[Any] = None,
-         _tui_log_callback: Optional[Any] = None) -> Any
+         _tui_event_bus: Optional[Any] = None) -> Any
 ```
 
 Executes a specified attack strategy against the configured victim agent.

diff --git a/docs/docs/hackagent/attacks/evaluator/evaluation_step.md b/docs/docs/hackagent/attacks/evaluator/evaluation_step.md
@@ -96,7 +96,7 @@ Prepare evaluated items for backend sync:
 - Add _run_id if missing
 - Ensure result_id exists
 - Build judge_keys
-- Call _sync_to_server
+- Call _sync_to_server (only if not already synced by the attack)
 
 #### get\_statistics
 

diff --git a/docs/docs/hackagent/attacks/evaluator/sync.md b/docs/docs/hackagent/attacks/evaluator/sync.md
@@ -28,11 +28,13 @@ Usage:
 #### update\_single\_result
 
 ```python
-def update_single_result(result_id: str,
-                         success: bool,
-                         evaluation_notes: str,
-                         backend: Any,
-                         logger: Optional[logging.Logger] = None) -> bool
+def update_single_result(
+        result_id: str,
+        success: bool,
+        evaluation_notes: str,
+        backend: Any = None,
+        logger: Optional[logging.Logger] = None,
+        metadata_updates: Optional[Dict[str, Any]] = None) -> bool
 ```
 
 Update a single Result&#x27;s evaluation status via the storage backend.
@@ -62,19 +64,19 @@ def sync_evaluation_to_server(
 
 Sync evaluation results to the server, aggregating the best per result_id.
 
-Multiple completion rows may share the same `result_id` (one per goal).
+Multiple completion rows may share the same ``result_id`` (one per goal).
 This function aggregates to find the best (success wins over failure)
-evaluation per `result_id`, then PATCHes the server once per goal.
+evaluation per ``result_id``, then PATCHes the server once per goal.
 
 **Arguments**:
 
 - `evaluated_data` - List of dicts with evaluation results. Each dict
-  should contain `result_id` and evaluation score keys.
+  should contain ``result_id`` and evaluation score keys.
 - `client` - Authenticated client for API calls.
 - `logger` - Optional logger instance.
 - `judge_keys` - Optional list of dicts mapping judge types to their
-  column names, e.g. ``[\{&quot;key&quot;: &quot;eval_jb&quot;, &quot;explanation&quot;: &quot;explanation_jb&quot;,
-- `1 - &quot;JailbreakBench&quot;}]`. If None, auto-detects from
+  column names, e.g. ``[{&quot;key&quot;: &quot;eval_jb&quot;, &quot;explanation&quot;: &quot;explanation_jb&quot;,
+- ``1 - &quot;JailbreakBench&quot;}]``. If None, auto-detects from
   known column patterns.
 
 

diff --git a/docs/docs/hackagent/attacks/objectives/base.md b/docs/docs/hackagent/attacks/objectives/base.md
@@ -33,11 +33,11 @@ Usage:
     )
 
     # Use in attack configuration
-    attack_config = \{
+    attack_config = {
         &quot;objective&quot;: &quot;prompt_injection&quot;,
         &quot;technique&quot;: &quot;advprefix&quot;,  # or &quot;template&quot;
         &quot;goals&quot;: [...]
-    \}
+    }
 
 #### \_\_init\_\_
 

diff --git a/docs/docs/hackagent/attacks/orchestrator.md b/docs/docs/hackagent/attacks/orchestrator.md
@@ -88,8 +88,7 @@ def execute(attack_config: Dict[str, Any],
             fail_on_run_error: bool,
             max_wait_time_seconds: Optional[int] = None,
             poll_interval_seconds: Optional[int] = None,
-            _tui_app: Optional[Any] = None,
-            _tui_log_callback: Optional[Any] = None) -> Any
+            _tui_event_bus: Optional[Any] = None) -> Any
 ```
 
 Execute attack with server tracking.
@@ -108,8 +107,9 @@ Standard workflow:
 - `fail_on_run_error` - Whether to raise on errors
 - `max_wait_time_seconds` - Unused for local execution
 - `poll_interval_seconds` - Unused for local execution
-- `_tui_app` - Optional TUI app for logging
-- `_tui_log_callback` - Optional TUI log callback
+- `_tui_event_bus` - Optional `hackagent.cli.tui.events.TUIEventBus`
+  that receives structured events (step start/end, tool calls,
+  progress, etc.) during execution.
 
 
 **Returns**:

diff --git a/docs/docs/hackagent/attacks/shared/response_utils.md b/docs/docs/hackagent/attacks/shared/response_utils.md
@@ -34,9 +34,9 @@ def extract_response_content(
 Extract text content from an LLM response in various formats.
 
 Handles the following response formats:
-1. **OpenAI-style object** — `response.choices[0].message.content`
-2. **Dictionary** — `response[&quot;generated_text&quot;]` or
-`response[&quot;processed_response&quot;]`
+1. **OpenAI-style object** — ``response.choices[0].message.content``
+2. **Dictionary** — ``response[&quot;generated_text&quot;]`` or
+``response[&quot;processed_response&quot;]``
 3. **String** — returned as-is
 4. **None / empty** — returns None
 
@@ -58,7 +58,7 @@ Handles the following response formats:
   &gt;&gt;&gt; # OpenAI-style response
   &gt;&gt;&gt; content = extract_response_content(openai_response)
   &gt;&gt;&gt; # Dict-style response
-  &gt;&gt;&gt; content = extract_response_content(\{&quot;generated_text&quot;: &quot;Hello!&quot;\})
+  &gt;&gt;&gt; content = extract_response_content({&quot;generated_text&quot;: &quot;Hello!&quot;})
   &gt;&gt;&gt; # Plain string
   &gt;&gt;&gt; content = extract_response_content(&quot;Hello!&quot;)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -20,4 +20,4 @@ For practical usage examples, see the [Python SDK Quickstart](./sdk/python-quick

		---

		Auto-generated from hackagent v0.6.0.
		Auto-generated from hackagent v0.10.1.