From 9004c81a5dee9c46ef9eef95de30a38cfe23e346 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Sun, 24 May 2026 11:20:19 +0200 Subject: [PATCH 01/52] feat: defining new data structure for Discovery + Extraction of Tenders --- src/bandai/models/__init__.py | 12 ++ src/bandai/models/discovery_models.py | 161 ++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 src/bandai/models/discovery_models.py diff --git a/src/bandai/models/__init__.py b/src/bandai/models/__init__.py index fecc7b8..7b0845c 100644 --- a/src/bandai/models/__init__.py +++ b/src/bandai/models/__init__.py @@ -17,6 +17,13 @@ parse_company_profile, ) +from .discovery_models import ( + AccessMode, Status, + + TenderOverview, DiscoveryResult, + TenderInfo, ContractingAuthority +) + __all__ = [ # Pipeline models "RawContract", @@ -34,4 +41,9 @@ "clear_profile_cache", "load_company_profile", "parse_company_profile", + + ## Discovery + Extration Models + "AccessMode", "Status", + "TenderOverview", "DiscoveryResult", + "TenderInfo", "ContractingAuthority" ] diff --git a/src/bandai/models/discovery_models.py b/src/bandai/models/discovery_models.py new file mode 100644 index 0000000..a42fa46 --- /dev/null +++ b/src/bandai/models/discovery_models.py @@ -0,0 +1,161 @@ +from pydantic import BaseModel, HttpUrl, Field +from typing import Optional, Literal +import datetime as dt + +### ----------------------------------------------------------------------------------- +### +### Discovery Agents - Tender Overview Models +### +### ----------------------------------------------------------------------------------- + +AccessMode = Literal['api', 'html'] +Status = Literal['success', 'error', 'partial'] + +class TenderOverview(BaseModel): + """ + Output Template for the Discovery Agent. + It contains an Information overview of the different available Tenders on a Portal. 
+ """ + title: Optional[str] = Field( + default = None, + description = "Tender title (if available) from Terders' lists" + ) + url: HttpUrl = Field(description = "Direct URL page/endpoint for reaching tenders Information") + + portal: str = Field(description = "Orginal Portal's Name (i.e. ANAC, TED, MePA)") + + access_mode: AccessMode = Field( + default = 'html', + description = "Extration Access Mode" + ) + + metadata: Optional[dict] = Field( + default = None, + description = "Avaible Data during the Discovery Process (i.e. CIG, deadline)" + ) + + discovery_datetime: dt.datetime = Field( + default_factory = dt.datetime.now, + description = "Timestamp of first discovery of the tender" + ) + + +class DiscoveryResult(BaseModel): + """ + It contains a list of discovered tenders on a portal by a Discovery Agent. + """ + + tenders: list[TenderOverview] + portal: str + status: Status = 'success' + error: Optional[str] = None + + @property + def tenders_count(self): + return len(self.tenders) + + + +### ----------------------------------------------------------------------------------- +### +### Extractor Agents - Tender Complete Information Models +### +### ----------------------------------------------------------------------------------- + +class ContractingAuthority(BaseModel): + """ + Contracting Authority Information of a Tender + """ + + name: str + tax_code: Optional[str] = None + pec: Optional[str] = None + rup: Optional[str] = None + + +class TenderInfo(BaseModel): + """ + Output Template for the Extractor Agent. + It contains all the Information Available of a Tender in a structured way. + """ + + ## Identifiers + + cig: Optional[str] = Field(default = None, description = "Identification Bid Code") + cup: Optional[str] = Field(default = None, description = "Unified Project Code") + title: str + url: HttpUrl + + ## Classification + cpv: Optional[list[str]] = Field( + default = None, + description = "CPV Codes list (i.e. 
['72000000', '48000000'])" + ) + contract_type: Optional[str] = Field( + default = None, + description = "Services / Supplies / Construction Work" + ) + + ## Contracting Authority + contracting_authority: Optional[ContractingAuthority] = None + + ## Economical Value + base_amount: Optional[float] = Field(default = None, description = "Base auction amount in Euros") + max_amount: Optional[float] = Field(default = None, description = "Maximum value, including renewals") + # award_criterion: Optional[str] = Field( + # default = None, + # description = "OEPV (quality + price) or OPB (price only)" + # ) + + ## Date Information + publication_date: Optional[dt.date] = None + deadline: Optional[dt.datetime] = Field( + default = None, + description="Bid submission deadline" + ) + contract_duration_months: Optional[int] = None + + ## Requirements (as Text) + # financial_requirements: Optional[str] = None + # techincal_requirements: Optional[str] = None + + ## Documentation + url_docs: Optional[list[HttpUrl]] = Field( + default = None, + description = "Links to tender documents, specifications, and PDF attachments" + ) + + ## Extraction Meta Data + portal: str + extraction_date: dt.datetime = Field(default_factory = dt.datetime.now) + status: Status = Field( + default = 'success', + description = "'success', 'partial' (for missing fields), 'error'" + ) + error: Optional[str] = None + + +if __name__ == "__main__": + + tenders_list = [] + + for i in range(2): + tender_overview = TenderOverview( + title = f"Title {i}", + url=f"https://dati.anticorruzione.it/api/gara/{i + 234}", + portal = "ANAC", + access_mode = 'api' if (i % 2 == 0) else "html", + metadata = {} if (i % 2 == 0) else dict(cig='ABC1234567') + ) + + tenders_list.append(tender_overview) + +discovery_result = DiscoveryResult( + tenders = tenders_list, + portal = "ANAC", + tenders_count = len(tenders_list), + status = 'success', +) + + +print(discovery_result.model_dump_json(indent=2)) From 
08ffcdb80c9c33a534f5352e560cf9c32c5dc77e Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 09:19:32 +0200 Subject: [PATCH 02/52] feat: added support for transforming HTML Content into Markdown --- src/bandai/utils.py | 136 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 2 deletions(-) diff --git a/src/bandai/utils.py b/src/bandai/utils.py index e313562..9eac39a 100644 --- a/src/bandai/utils.py +++ b/src/bandai/utils.py @@ -3,10 +3,16 @@ import json import logging import re -from pathlib import Path -from typing import Any + +from typing import Any, Literal, Optional import yaml +from pathlib import Path + +from bs4 import BeautifulSoup +from markdownify import markdownify +from typing import Literal, Optional + log = logging.getLogger(__name__) @@ -113,3 +119,129 @@ def is_implicit_no_go(text: str) -> bool: """Fast keyword check for abandonment language (pre-LLM, zero cost).""" lower = text.lower() return any(kw in lower for kw in NO_GO_KEYWORDS) + + +### ----------------------------------------------------------------------------------- +### +### Defining Parser/Tranformation for HTML to Markdown Format +### +### ----------------------------------------------------------------------------------- + + +## All the Tags/Classes that needs to be removed from the original HTML Content +_TAGS_TO_REMOVE: list[str] = [ + 'nav', 'header', 'footer', 'aside', + 'script', 'style', 'noscript', + + '.side-bar', '.cookie-banner', '.cookie-notice', + '.breadcrumb', '.pagination', '.social-share' +] + +## Main Content Containers +_MAIN_CONTENT_SELECTORS = [ + ## Extraction + "main", "article", "[role='main']", + "#content", ".content", + "#main-content", ".main-content", + + ## Discovery + 'table' +] + +PADDING_LINES: int = 5 + + +## Conversion Mode: +## - Discovery - No Images, no Styling, just raw content with links. +## Try to keep all relevant information about terders' urls for reachability. 
+## - Extraction - Focus on the main content and outer links to documents and attachments. +## - Full - Keep everything (structure and content), aside from styling +ConversionMode = Literal['discovery', 'extraction', 'full'] + + +_MODE_CONFIG = { + "discovery": { + "description": "Clean Text with urls for reaching Single Tenders", + "strip": ["img"] + + }, + "extraction": { + "description": "Single-page Tender Extraction of Information", + "strip": ["img"] + }, + "full": { + "description": "Complete Output with no further stripping (debug mode)", + "strip": ['a', 'img'] + }, +} + + +def html_to_markdown( + html_content: str, + mode: ConversionMode, + max_chars: int = 12000, + max_lines: Optional[int] = None +) -> str: + """ + Converter from HTML to Markdown content. + Based on the requested Mode, it tries to strip all non-relevant information. + + :param html_content: Raw HTML Content to strip + :type html_content: str + :param mode: * 'discovery' -> Try to keep lists structure with titles and url for reachability, + * 'extraction' -> Keep all relevant information and documents linkage, + * 'full' -> No further stripping + :type mode: ConversionMode + :param max_chars: Maximum length of the Output (for Context Window Reasons, Saturation etc.) + :type max_chars: int + :param max_lines: Maximum number of Lines to keep, if provided. Used for counting the Results. 
+ :type max_lines: Optional[int] + """ + + ## Init + config = _MODE_CONFIG[mode] + soup = BeautifulSoup(html_content, 'html.parser') + + ## Removing all Basic Tags/Classes + for selector in _TAGS_TO_REMOVE: + for tag in soup.select(selector): + tag.decompose() ## Destroys Tags content Recoursively + + + ## Main Content Identification + main_content = None + for selector in _MAIN_CONTENT_SELECTORS: + main_content = soup.select_one(selector) + if main_content: + break + + ## Fallback on the Body tag + target = main_content if main_content else soup + if not target: + return "" + + + ## Markdown Conversion + markdown = markdownify( + html = str(target), + heading_style = "ATX", ## Titles with '#' symbol + strip = config['strip'], + newline_style = 'backslash' + ) + + + ## Possible Multiline Spacing + markdown = re.sub(r"\n{3,}", "\n\n", markdown).strip() + + if max_lines: + lines = markdown.split('/n') + if len(lines) > max_lines: + truncated = lines[:max_lines] + markdown = '/n'.join(truncated) + + + ## Markdown Content being too long + if len(markdown) > max_chars: + markdown = markdown[:max_chars] + + return markdown \ No newline at end of file From 048e10842bd0ef94dac871732bb1b363a2b3c035 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 09:21:41 +0200 Subject: [PATCH 03/52] feat: updated pyproject.toml and new dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9802b2d..6cc74e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "pyyaml>=6.0", "python-dotenv>=1.0.0", "ollama>=0.6.2", + "playwright>=1.60.0", ] [project.optional-dependencies] From 025e64e126440312daeedda6fb5518dd292a0af0 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 10:33:25 +0200 Subject: [PATCH 04/52] feat: updated portals configuration to do discovery and extraction of information --- src/bandai/config/portals.py | 56 +++++++++++++++++++++++++++++++--- 
src/bandai/config/portals.yaml | 47 ++++++++++++++++++++++++++-- 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/src/bandai/config/portals.py b/src/bandai/config/portals.py index e32bb20..7ea44dd 100644 --- a/src/bandai/config/portals.py +++ b/src/bandai/config/portals.py @@ -2,16 +2,52 @@ import logging import os -from pathlib import Path + +from pydantic import BaseModel, Field, HttpUrl +from typing import Literal, Optional import yaml -from pydantic import BaseModel, Field +from pathlib import Path from bandai.utils import CONFIG_DIR log = logging.getLogger(__name__) + +### ----------------------------------------------------------------------------------- +### +### Defining Configuration Directives for Extraction Process +### +### ----------------------------------------------------------------------------------- + +class SelectorIdentifier(BaseModel): + type: str + text: str + +class ActionDescription(BaseModel): + selector: SelectorIdentifier + action_type: Literal['click'] + wait_after_action: Literal['networkidle'] | int + + +class DiscoveryProcess(BaseModel): + + base_search_url: HttpUrl + elements_per_page: int + + ## Main Selector + list_wrapper_selector: SelectorIdentifier + next_page_selector: SelectorIdentifier + + actions_to_perform: list[ActionDescription] + + +class ExtractionProcess(BaseModel): + base_resource_url: HttpUrl + main_content_selector: SelectorIdentifier + + # Portal Config Model @@ -19,9 +55,21 @@ class PortalConfig(BaseModel): """Immutable configuration for a single procurement portal.""" name: str - base_url: str + info: Optional[str] = None + base_url: Optional[str] = None reliability: float = Field(..., ge=0.0, le=1.0) - country_filter: str | None = None + country_filter: Optional[str] = None + + discovery: Optional[DiscoveryProcess] = None + extraction: Optional[ExtractionProcess] = None + + def is_ready_for_discovery(self) -> bool: + return self.discovery is not None + + def is_ready_for_extraction(self) -> bool: 
+ return self.discovery is not None + + # Loading diff --git a/src/bandai/config/portals.yaml b/src/bandai/config/portals.yaml index 9592588..e30ae4c 100644 --- a/src/bandai/config/portals.yaml +++ b/src/bandai/config/portals.yaml @@ -11,14 +11,57 @@ # automatically by the Scout crew on the next run. portals: - - name: "ANAC / Simog" + - name: "ANAC" + info: "ANAC / Simog" base_url: "https://www.anticorruzione.it/-/bandi-di-gara" reliability: 1.00 - - name: "TED (EU)" + discovery: + base_search_url: "https://pubblicitalegale.anticorruzione.it/bandi" + elements_per_page: 10 + list_wrapper_selector: + type: "css" + text: "app-bandi-di-gara div:has( app-bando-di-gara-preview)" + next_page_selector: + type: "css" + text: ".pagination .page-item:last-child" + actions_to_perform: [] + + extraction: + base_resource_url: "https://pubblicitalegale.anticorruzione.it/bandi" + main_content_selector: + type: "css" + text: "app-bando-di-gara-detail" + + + - name: "TED" + info: "TED (EU)" base_url: "https://www.ted.europa.eu/en/search/result" reliability: 0.90 country_filter: "IT" + + discovery: + base_search_url: "https://ted.europa.eu/en/search/result?FT=Italy" + elements_per_page: 50 + list_wrapper_selector: + type: "css" + text: "table:has( thead)" + next_page_selector: + type: "aria-label" + text: "Go to the next page" + actions_to_perform: [] + # - selector: + # type: "css" + # text: "th:has( #deadline-column-id)" + # action_type: "click" + # wait_after_action: "networkidle" + + extraction: + base_resource_url: https://ted.europa.eu/ + main_content_selector: + type: "css" + text: "#notice-accordion" + - name: "MePA" base_url: "https://www.acquistinretepa.it/opencms/opencms/main/pa/" From 944019f151e70f74eb67bbc3aec50c8ece3c26bd Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 10:36:00 +0200 Subject: [PATCH 05/52] refactor: added discovery models to models.py --- src/bandai/models/discovery_models.py | 161 -------------------------- 
src/bandai/models/models.py | 139 +++++++++++++++++++++- 2 files changed, 137 insertions(+), 163 deletions(-) delete mode 100644 src/bandai/models/discovery_models.py diff --git a/src/bandai/models/discovery_models.py b/src/bandai/models/discovery_models.py deleted file mode 100644 index a42fa46..0000000 --- a/src/bandai/models/discovery_models.py +++ /dev/null @@ -1,161 +0,0 @@ -from pydantic import BaseModel, HttpUrl, Field -from typing import Optional, Literal -import datetime as dt - -### ----------------------------------------------------------------------------------- -### -### Discovery Agents - Tender Overview Models -### -### ----------------------------------------------------------------------------------- - -AccessMode = Literal['api', 'html'] -Status = Literal['success', 'error', 'partial'] - -class TenderOverview(BaseModel): - """ - Output Template for the Discovery Agent. - It contains an Information overview of the different available Tenders on a Portal. - """ - title: Optional[str] = Field( - default = None, - description = "Tender title (if available) from Terders' lists" - ) - url: HttpUrl = Field(description = "Direct URL page/endpoint for reaching tenders Information") - - portal: str = Field(description = "Orginal Portal's Name (i.e. ANAC, TED, MePA)") - - access_mode: AccessMode = Field( - default = 'html', - description = "Extration Access Mode" - ) - - metadata: Optional[dict] = Field( - default = None, - description = "Avaible Data during the Discovery Process (i.e. CIG, deadline)" - ) - - discovery_datetime: dt.datetime = Field( - default_factory = dt.datetime.now, - description = "Timestamp of first discovery of the tender" - ) - - -class DiscoveryResult(BaseModel): - """ - It contains a list of discovered tenders on a portal by a Discovery Agent. 
- """ - - tenders: list[TenderOverview] - portal: str - status: Status = 'success' - error: Optional[str] = None - - @property - def tenders_count(self): - return len(self.tenders) - - - -### ----------------------------------------------------------------------------------- -### -### Extractor Agents - Tender Complete Information Models -### -### ----------------------------------------------------------------------------------- - -class ContractingAuthority(BaseModel): - """ - Contracting Authority Information of a Tender - """ - - name: str - tax_code: Optional[str] = None - pec: Optional[str] = None - rup: Optional[str] = None - - -class TenderInfo(BaseModel): - """ - Output Template for the Extractor Agent. - It contains all the Information Available of a Tender in a structured way. - """ - - ## Identifiers - - cig: Optional[str] = Field(default = None, description = "Identification Bid Code") - cup: Optional[str] = Field(default = None, description = "Unified Project Code") - title: str - url: HttpUrl - - ## Classification - cpv: Optional[list[str]] = Field( - default = None, - description = "CPV Codes list (i.e. 
['72000000', '48000000'])" - ) - contract_type: Optional[str] = Field( - default = None, - description = "Services / Supplies / Construction Work" - ) - - ## Contracting Authority - contracting_authority: Optional[ContractingAuthority] = None - - ## Economical Value - base_amount: Optional[float] = Field(default = None, description = "Base auction amount in Euros") - max_amount: Optional[float] = Field(default = None, description = "Maximum value, including renewals") - # award_criterion: Optional[str] = Field( - # default = None, - # description = "OEPV (quality + price) or OPB (price only)" - # ) - - ## Date Information - publication_date: Optional[dt.date] = None - deadline: Optional[dt.datetime] = Field( - default = None, - description="Bid submission deadline" - ) - contract_duration_months: Optional[int] = None - - ## Requirements (as Text) - # financial_requirements: Optional[str] = None - # techincal_requirements: Optional[str] = None - - ## Documentation - url_docs: Optional[list[HttpUrl]] = Field( - default = None, - description = "Links to tender documents, specifications, and PDF attachments" - ) - - ## Extraction Meta Data - portal: str - extraction_date: dt.datetime = Field(default_factory = dt.datetime.now) - status: Status = Field( - default = 'success', - description = "'success', 'partial' (for missing fields), 'error'" - ) - error: Optional[str] = None - - -if __name__ == "__main__": - - tenders_list = [] - - for i in range(2): - tender_overview = TenderOverview( - title = f"Title {i}", - url=f"https://dati.anticorruzione.it/api/gara/{i + 234}", - portal = "ANAC", - access_mode = 'api' if (i % 2 == 0) else "html", - metadata = {} if (i % 2 == 0) else dict(cig='ABC1234567') - ) - - tenders_list.append(tender_overview) - -discovery_result = DiscoveryResult( - tenders = tenders_list, - portal = "ANAC", - tenders_count = len(tenders_list), - status = 'success', -) - - -print(discovery_result.model_dump_json(indent=2)) diff --git 
a/src/bandai/models/models.py b/src/bandai/models/models.py index ca8c468..b0d7f16 100644 --- a/src/bandai/models/models.py +++ b/src/bandai/models/models.py @@ -1,6 +1,141 @@ from __future__ import annotations -from typing import Literal -from pydantic import BaseModel, Field + +from typing import Literal, Optional +from pydantic import BaseModel, Field, HttpUrl + +import datetime as dt + +### ----------------------------------------------------------------------------------- +### +### Discovery - Tender Overview Models +### +### ----------------------------------------------------------------------------------- + +AccessMode = Literal['api', 'html'] +Status = Literal['success', 'error', 'partial'] + +class TenderOverview(BaseModel): + """ + Output Template for the Discovery Agent. + It contains an Information overview of the different available Tenders on a Portal. + """ + title: Optional[str] = Field( + default = None, + description = "Tender title (if available) from Terders' lists" + ) + url: HttpUrl | str = Field(description = "Direct (complete or relative) URL page/endpoint for reaching tenders Information") + + portal: str = Field(description = "Orginal Portal's Name (i.e. ANAC, TED, MePA)") + + access_mode: AccessMode = Field( + default = 'html', + description = "Extration Access Mode" + ) + + metadata: Optional[dict] = Field( + default = None, + description = "Avaible Data during the Discovery Process (i.e. CIG, deadline)" + ) + + discovery_datetime: dt.datetime = Field( + default_factory = dt.datetime.now, + description = "Timestamp of first discovery of the tender" + ) + + +class DiscoveryResult(BaseModel): + """ + It contains a list of discovered tenders on a portal by a Discovery Agent. 
+ """ + + tenders: list[TenderOverview] + portal: str + status: Status = 'success' + error: Optional[str] = None + + @property + def tenders_count(self): + return len(self.tenders) + + +### ----------------------------------------------------------------------------------- +### +### Extractor Agents - Tender Complete Information Models +### +### ----------------------------------------------------------------------------------- + +class ContractingAuthority(BaseModel): + """ + Contracting Authority Information of a Tender + """ + + name: str + tax_code: Optional[str] = None + pec: Optional[str] = None + rup: Optional[str] = None + + +class TenderInfo(BaseModel): + """ + Output Template for the Extractor Agent. + It contains all the Information Available of a Tender in a structured way. + """ + + ## Identifiers + cig: Optional[str] = Field(default = None, description = "Identification Bid Code") + cup: Optional[str] = Field(default = None, description = "Unified Project Code") + title: str + url: HttpUrl + + ## Classification + cpv: Optional[list[str]] = Field( + default = None, + description = "CPV Codes list (i.e. 
['72000000', '48000000'])" + ) + contract_type: Optional[str] = Field( + default = None, + description = "Services / Supplies / Construction Work" + ) + + ## Contracting Authority + contracting_authority: Optional[ContractingAuthority] = None + + ## Economical Value + base_amount: Optional[float] = Field(default = None, description = "Base auction amount in Euros") + max_amount: Optional[float] = Field(default = None, description = "Maximum value, including renewals") + # award_criterion: Optional[str] = Field( + # default = None, + # description = "OEPV (quality + price) or OPB (price only)" + # ) + + ## Date Information + publication_date: Optional[dt.date] = None + deadline: Optional[dt.datetime] = Field( + default = None, + description="Bid submission deadline" + ) + contract_duration_months: Optional[int] = None + + ## Requirements (as Text) + # financial_requirements: Optional[str] = None + # techincal_requirements: Optional[str] = None + + ## Documentation + url_docs: Optional[list[HttpUrl]] = Field( + default = None, + description = "Links to tender documents, specifications, and PDF attachments" + ) + + ## Extraction Meta Data + portal: str + extraction_date: dt.datetime = Field(default_factory = dt.datetime.now) + status: Status = Field( + default = 'success', + description = "'success', 'partial' (for missing fields), 'error'" + ) + error: Optional[str] = None + + # Scouting Models From 0a860f0068805f6d953170e28f0df6c6cec532d8 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 11:26:04 +0200 Subject: [PATCH 06/52] feat: added Tenders Overview Extractor Tool --- src/bandai/tools/crawler_tools.py | 289 +++++++++++++++++++++++++++++- 1 file changed, 288 insertions(+), 1 deletion(-) diff --git a/src/bandai/tools/crawler_tools.py b/src/bandai/tools/crawler_tools.py index 415b646..f57fdbe 100644 --- a/src/bandai/tools/crawler_tools.py +++ b/src/bandai/tools/crawler_tools.py @@ -3,12 +3,24 @@ import json import logging import random +import 
asyncio + from pathlib import Path -from typing import Type +from typing import Type, Literal, Optional from crewai.tools import BaseTool # type: ignore from pydantic import BaseModel, Field, field_validator +from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout + +from bandai.config import BANDI_PORTALS +from bandai.config.portals import ( + PortalConfig, + SelectorIdentifier, ActionDescription, + DiscoveryProcess, ExtractionProcess +) +from bandai.utils import html_to_markdown + log = logging.getLogger(__name__) @@ -175,3 +187,278 @@ def _run(self, title: str, sections: dict[str, str], output_path: str) -> str: except IOError as e: log.error("Failed to write proposal to %s: %s", output_path, e) return f"Error writing proposal to {output_path}: {e}" + + + +### ----------------------------------------------------------------------------------- +### +### Defining Tools for Discovery and Construction of Tenders Overview +### +### ----------------------------------------------------------------------------------- + +PROCESS_CONFIG: dict[str, PortalConfig] = { + portal['name']: portal + for portal in BANDI_PORTALS if portal.is_ready_for_discovery() +} + +## Input Schema +class TendersOverviewExtractorInput(BaseModel): + """ + Input Schema for the Tenders' Overview Extraction Tool. + """ + portal_name: str = Field(description = "Available Portal Name to reach.") + tenders_count: int = Field( + default = 10, + description = "Number of Tenders to get from the Portal" + ) + + +## Main Tool Class +class TendersOverviewExtractorTool(BaseTool): + """ + Navigates a tender listing portal, given as input, across multiple pages and 4 + returns all tenders found in Markdown format. + In this way, the content is LLM-ready to be parsed into custom data structures (TenderOverview). + Pagination and portal-specific actions are handled based on the portal configuration file. 
+ """ + + name: str = "Tenders' Overview Extractor" + description: str = ( + "This Tool can nagigate a portal searching for different tenders, presented in a list format." + "It can handle pagination automatically." + "All the content will be turned into Markdown format - parsable to extract TenderOverview objects." + ) + args_schema: type[BaseModel] = TendersOverviewExtractorInput + + ## Internal Config + max_chars: int = 12000 + timeout_ms: int = 30000 + headless_mode: bool = True #### DEBUG + + + def _run(self, portal_name: str, tenders_count: int) -> str: + """ + Synchronous entry point for CrewAI Agent. + """ + return asyncio.run(self._extract(portal_name, tenders_count)) + + async def _extract(self, portal_name: str, tenders_count: int) -> str: + """ + Loads asynchronously a Web Page containing a list-like content and returns its content in Markdown Format. + + Performed Steps: + 1. Read the Portal Configuration to perform the process. + 2. Opens Playwright in headless mode. + 3. If provided, applies portal-specific actions on the first page (Main table interacion). + 4. Extract the list wrapper HTML content + 5. Converts the content into Markdown through a utility function. + 6. Loops over pages if needed. + 7. Returns the aggregated Markdown content for the Agent to parse + """ + + ### Portal Name configuration: + if portal_name not in PROCESS_CONFIG.keys(): + return self._error( + portal_name = portal_name, + error_message = ( + "No Configuration found for the current portal. 
" + f"Available: { list(PROCESS_CONFIG.keys()) }" + ) + ) + + portal_cfg: DiscoveryProcess = PROCESS_CONFIG[portal_name].discovery + + search_url: str = portal_cfg.base_search_url + + elements_per_page: int = portal_cfg.elements_per_page + max_pages: int = (tenders_count // elements_per_page) + 1 + max_lines_on_last_page: int = (tenders_count % elements_per_page) + 1 + + if max_lines_on_last_page == 1: + max_pages -= 1 + max_lines_on_last_page = None + + pages_markdown: list[str] = [] + + ## Main Instruction Process - Playwright Loop + browser = None + async with async_playwright() as p: + try: + browser = await p.chromium.launch(headless = self.headless_mode) + page = await browser.new_page() + + ## Prevents unuseful resources from loading to speed up the loading process + await page.route( + "**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf}", + lambda route: route.abort() + ) + + try: + ## Main Page + await page.goto( + search_url, + wait_until = "networkidle", ## Waits for the Network traffic to become stable + timeout = self.timeout_ms + ) + + for page_num in range(max_pages): + log.info(f"[TendersOverviewExtraction] [{portal_name}] Page {page_num}/{max_pages}") + + ## Useful for Filtering or Tag Selection + if page_num == 0: + await self._do_apply_action_on_first_page( + page = page, portal_cfg = portal_cfg + ) + + ## List Wrapper - Current Content + html = await self._do_get_list_html_content( + page = page, portal_cfg = portal_cfg + ) + if not html: + log.warning(f"[TendersOverviewExtraction] Empty content on page {page_num}, stopping") + break + + ## Markdown Conversion + markdown = html_to_markdown( + html, + mode = 'discovery', + max_lines = max_lines_on_last_page if page_num == max_pages - 1 else None + ) + + if markdown.strip(): + pages_markdown.append(f"\n{markdown}") + log.info(f"[TendersOverviewExtraction] Page {page_num} converted successfully. 
({len(markdown)} chars).") + + + ## Go to nex Page + has_next = await self._do_go_to_next_page( + page = page, portal_cfg = portal_cfg + ) + + if not has_next: + log.info(f"[TendersOverviewExtraction] No next page after page {page_num}, stopping") + break + + except PlaywrightTimeout: + log.warning(f"[TendersOverviewExtraction] Timeout on {portal_name}, Partial Recovery of the page.") + + ## We can try and get all the Content that was already loaded in the page. + html = await page.content() + if not html: + return self._error(portal_name, "Timeout exceeded, empty page.") + + + ## Markdown Extraction + markdown = html_to_markdown(html, mode = 'discovery', max_chars = self.max_chars) + + if not markdown.strip(): + return self._error(portal_name, "Page loaded successfully, no content found after stripping [Markdown production].") + + log.info(f"[TendersOverviewExtraction] Extracted {len(markdown)} chars from {portal_name}") + + return markdown + + except Exception as e: + log.error(f"[TendersOverviewExtraction] Unexpected Error on {portal_name}: {e}") + return self._error(portal_name, str(e)) + + finally: + if browser: + await browser.close() + + if not pages_markdown: + return self._error(portal_name, "No content collected across all pages") + + header = ( + f"# Tender listings from {portal_name}\n" + f"Pages navigated: {len(pages_markdown)}\n\n" + ) + return header + "\n\n---\n\n".join(pages_markdown) + + + ## Support Methods + def _build_locator(self, selector: SelectorIdentifier) -> str: + """ + Builds a Playwright locator string from a selector config. + """ + match selector.type: + case "css": + return selector.text + # case "aria-label": + # return f"[aria-label='{cfg['selector_text']}']" + + case _: ## In general [type='content'] + return f"[{selector.type}='{selector.text}']" + + async def _do_apply_action_on_first_page(self, page, portal_cfg: DiscoveryProcess) -> None: + """ + Applies actions/filter on the main table for the first page. 
+ Useful for sorting based on Values using dynamic lists, etc. + """ + ## Apply actions like sorting based on Values using dynamic lists etc. + for action in (portal_cfg.actions_to_perform or []): + + locator_str: str = self._build_locator(action.selector) + log.info(f"[TendersOverviewExtraction] Applying action \"{action.action_type}\" on element: {locator_str}") + + match action.action_type: + case "click": + await page.locator(locator_str).click() + + wait = action.wait_after_action or "networkidle" + if isinstance(wait, int): + await page.wait_for_timeout(wait) + else: + await page.wait_for_load_state(wait) + + async def _do_get_list_html_content(self, page, portal_cfg: DiscoveryProcess) -> Optional[str]: + """ + Extracts HTML from the configured list wrapper element. + """ + wrapper_cfg = portal_cfg.list_wrapper_selector + + # No wrapper configured — fallback to full page + if not wrapper_cfg: + return await page.content() + + locator_str = self._build_locator(wrapper_cfg) + try: + return await page.locator(locator_str).first.inner_html(timeout=5000) + + except Exception as e: + log.warning(f"[TendersOverviewExtraction] List wrapper '{locator_str}' not found: {e}, falling back to full page") + return await page.content() + + async def _do_go_to_next_page(self, page, portal_cfg: DiscoveryProcess) -> bool: + """ + Clicks the next page button if available and not disabled. + Returns True if navigation occurred, False otherwise. 
+ """ + next_cfg = portal_cfg.next_page_selector + if not next_cfg: + return False + + locator_str = self._build_locator(next_cfg) + next_btn = page.locator(locator_str).first + + if await next_btn.count() == 0: + return False + + is_disabled = await next_btn.get_attribute("disabled") + if is_disabled is not None: + return False + + await next_btn.click() + + ## Waiting for JS to render properly + await page.wait_for_load_state("networkidle") + + return True + + + def _error(self, portal_name: str, error_message: str) -> str: + """ + Returns an error message that an Agent can parse. + """ + return f"[ERROR on TendersOverviewExtraction] Portal: {portal_name} | Reason: {error_message}" + From 5f421b5e9b002a7301859aa1db3a0f2fdbfdb613 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 15:15:53 +0200 Subject: [PATCH 07/52] fix: small corrections --- src/bandai/tools/crawler_tools.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/bandai/tools/crawler_tools.py b/src/bandai/tools/crawler_tools.py index f57fdbe..07ade63 100644 --- a/src/bandai/tools/crawler_tools.py +++ b/src/bandai/tools/crawler_tools.py @@ -197,7 +197,7 @@ def _run(self, title: str, sections: dict[str, str], output_path: str) -> str: ### ----------------------------------------------------------------------------------- PROCESS_CONFIG: dict[str, PortalConfig] = { - portal['name']: portal + portal.name: portal for portal in BANDI_PORTALS if portal.is_ready_for_discovery() } @@ -268,7 +268,7 @@ async def _extract(self, portal_name: str, tenders_count: int) -> str: portal_cfg: DiscoveryProcess = PROCESS_CONFIG[portal_name].discovery - search_url: str = portal_cfg.base_search_url + search_url: str = str(portal_cfg.base_search_url) elements_per_page: int = portal_cfg.elements_per_page max_pages: int = (tenders_count // elements_per_page) + 1 @@ -285,7 +285,15 @@ async def _extract(self, portal_name: str, tenders_count: int) -> str: async with 
async_playwright() as p: try: browser = await p.chromium.launch(headless = self.headless_mode) - page = await browser.new_page() + context = await browser.new_context( + user_agent = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ), + viewport={ 'width': 1920, 'height': 1080 } + ) + page = await context.new_page() ## Prevents unuseful resources from loading to speed up the loading process await page.route( @@ -423,10 +431,16 @@ async def _do_get_list_html_content(self, page, portal_cfg: DiscoveryProcess) -> locator_str = self._build_locator(wrapper_cfg) try: - return await page.locator(locator_str).first.inner_html(timeout=5000) + await page.wait_for_selector( + locator_str, + state = 'visible', + timeout = 10000 + ) + + return await page.locator(locator_str).first.inner_html() except Exception as e: - log.warning(f"[TendersOverviewExtraction] List wrapper '{locator_str}' not found: {e}, falling back to full page") + log.warning(f"[TendersOverviewExtraction] Timeout waiting for '{locator_str}': {e}. Falling back to full page.") return await page.content() async def _do_go_to_next_page(self, page, portal_cfg: DiscoveryProcess) -> bool: @@ -460,5 +474,4 @@ def _error(self, portal_name: str, error_message: str) -> str: """ Returns an error message that an Agent can parse. 
""" - return f"[ERROR on TendersOverviewExtraction] Portal: {portal_name} | Reason: {error_message}" - + return f"[ERROR on TendersOverviewExtraction] Portal: {portal_name} | Reason: {error_message}" \ No newline at end of file From 2cf406ce859374d56b188cb3229c18d06ecee708 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 15:16:20 +0200 Subject: [PATCH 08/52] feat: added browser context --- src/bandai/tools/crawler_tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bandai/tools/crawler_tools.py b/src/bandai/tools/crawler_tools.py index 07ade63..dac34df 100644 --- a/src/bandai/tools/crawler_tools.py +++ b/src/bandai/tools/crawler_tools.py @@ -11,7 +11,7 @@ from crewai.tools import BaseTool # type: ignore from pydantic import BaseModel, Field, field_validator -from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout +from playwright.async_api import async_playwright, Page, TimeoutError as PlaywrightTimeout from bandai.config import BANDI_PORTALS from bandai.config.portals import ( @@ -398,7 +398,7 @@ def _build_locator(self, selector: SelectorIdentifier) -> str: case _: ## In general [type='content'] return f"[{selector.type}='{selector.text}']" - async def _do_apply_action_on_first_page(self, page, portal_cfg: DiscoveryProcess) -> None: + async def _do_apply_action_on_first_page(self, page: Page, portal_cfg: DiscoveryProcess) -> None: """ Applies actions/filter on the main table for the first page. Useful for sorting based on Values using dynamic lists, etc. @@ -419,7 +419,7 @@ async def _do_apply_action_on_first_page(self, page, portal_cfg: DiscoveryProces else: await page.wait_for_load_state(wait) - async def _do_get_list_html_content(self, page, portal_cfg: DiscoveryProcess) -> Optional[str]: + async def _do_get_list_html_content(self, page: Page, portal_cfg: DiscoveryProcess) -> Optional[str]: """ Extracts HTML from the configured list wrapper element. 
""" @@ -443,7 +443,7 @@ async def _do_get_list_html_content(self, page, portal_cfg: DiscoveryProcess) -> log.warning(f"[TendersOverviewExtraction] Timeout waiting for '{locator_str}': {e}. Falling back to full page.") return await page.content() - async def _do_go_to_next_page(self, page, portal_cfg: DiscoveryProcess) -> bool: + async def _do_go_to_next_page(self, page: Page, portal_cfg: DiscoveryProcess) -> bool: """ Clicks the next page button if available and not disabled. Returns True if navigation occurred, False otherwise. From f5e3478e63e54f818b0a30f521a65637f8dd4ef5 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 15:40:14 +0200 Subject: [PATCH 09/52] fix: possible missing value --- src/bandai/config/portals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bandai/config/portals.py b/src/bandai/config/portals.py index 7ea44dd..0da139d 100644 --- a/src/bandai/config/portals.py +++ b/src/bandai/config/portals.py @@ -56,7 +56,7 @@ class PortalConfig(BaseModel): name: str info: Optional[str] = None - base_url: Optional[str] = None + base_url: Optional[HttpUrl] = None reliability: float = Field(..., ge=0.0, le=1.0) country_filter: Optional[str] = None From ba9822a3eadcf5f9eda9a57d6058076393a4eb99 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 15:47:28 +0200 Subject: [PATCH 10/52] feat: added Single Page Loader Tool --- src/bandai/tools/crawler_tools.py | 188 ++++++++++++++++++++++++++++-- 1 file changed, 177 insertions(+), 11 deletions(-) diff --git a/src/bandai/tools/crawler_tools.py b/src/bandai/tools/crawler_tools.py index dac34df..5d21eed 100644 --- a/src/bandai/tools/crawler_tools.py +++ b/src/bandai/tools/crawler_tools.py @@ -9,7 +9,7 @@ from typing import Type, Literal, Optional from crewai.tools import BaseTool # type: ignore -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, HttpUrl, Field, field_validator from playwright.async_api import async_playwright, 
Page, TimeoutError as PlaywrightTimeout @@ -197,8 +197,11 @@ def _run(self, title: str, sections: dict[str, str], output_path: str) -> str: ### ----------------------------------------------------------------------------------- PROCESS_CONFIG: dict[str, PortalConfig] = { - portal.name: portal - for portal in BANDI_PORTALS if portal.is_ready_for_discovery() + portal.name: portal for portal in BANDI_PORTALS +} + +DISCOVERABLE_PORTALS: dict[str, PortalConfig] = { + portal.name: portal for portal in BANDI_PORTALS if portal.is_ready_for_discovery() } ## Input Schema @@ -231,8 +234,8 @@ class TendersOverviewExtractorTool(BaseTool): args_schema: type[BaseModel] = TendersOverviewExtractorInput ## Internal Config - max_chars: int = 12000 - timeout_ms: int = 30000 + max_chars: int = 12_000 + timeout_ms: int = 30_000 headless_mode: bool = True #### DEBUG @@ -257,16 +260,16 @@ async def _extract(self, portal_name: str, tenders_count: int) -> str: """ ### Portal Name configuration: - if portal_name not in PROCESS_CONFIG.keys(): + if portal_name not in DISCOVERABLE_PORTALS.keys(): return self._error( portal_name = portal_name, error_message = ( "No Configuration found for the current portal. 
" - f"Available: { list(PROCESS_CONFIG.keys()) }" + f"Available: { list(DISCOVERABLE_PORTALS.keys()) }" ) ) - portal_cfg: DiscoveryProcess = PROCESS_CONFIG[portal_name].discovery + portal_cfg: DiscoveryProcess = DISCOVERABLE_PORTALS[portal_name].discovery search_url: str = str(portal_cfg.base_search_url) @@ -291,7 +294,7 @@ async def _extract(self, portal_name: str, tenders_count: int) -> str: "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ), - viewport={ 'width': 1920, 'height': 1080 } + viewport = { 'width': 1920, 'height': 1080 } ) page = await context.new_page() @@ -434,7 +437,7 @@ async def _do_get_list_html_content(self, page: Page, portal_cfg: DiscoveryProce await page.wait_for_selector( locator_str, state = 'visible', - timeout = 10000 + timeout = 10_000 ) return await page.locator(locator_str).first.inner_html() @@ -474,4 +477,167 @@ def _error(self, portal_name: str, error_message: str) -> str: """ Returns an error message that an Agent can parse. """ - return f"[ERROR on TendersOverviewExtraction] Portal: {portal_name} | Reason: {error_message}" \ No newline at end of file + return f"[ERROR on TendersOverviewExtraction] Portal: {portal_name} | Reason: {error_message}" + + +### ----------------------------------------------------------------------------------- +### +### Defining Tools for Information Extraction of a Tender +### +### ----------------------------------------------------------------------------------- + +class SinglePageLoaderInput(BaseModel): + """ + Input Schema for PageLoaderTool. + """ + portal_name: str = Field(description = "Name of portal to Portal to reach") + url: HttpUrl | str = Field(description = "URL of the Tender's Detail Page to Load") + +## Main Tool Class +class SinglePageLoaderTool(BaseTool): + """ + Loads the Content of a Single Web Page and returns the content in a clean Markdown format. + It can handle both static and dynamic, JS-rendered pages. 
+ """ + + name: str = "Single Page Loader" + description: str = ( + "It can load a Web Page containg a specific Tender's Details." + "All the content will be turned into Markdown format - parsable to extract TenderInfo objects." + ) + + args_schema: type[BaseModel] = SinglePageLoaderInput + + ## Internal Config + max_chars: int = 12_000 + timeout_ms: int = 30_000 + headless_mode: bool = True #### DEBUG + + def _run(self, portal_name: str, url: HttpUrl | str) -> str: + """ + Synchronous Entry point for CrewAI Agent. + """ + return asyncio.run(self._load_page(portal_name, str(url))) + + async def _load_page(self, portal_name: str, url: str) -> str: + """ + Loads asynchronously a Web Page and returns its content in Markdown Format. + + Performed Steps: + 1. Opens Playwright in headless mode, + 2. Reaches the page by using the provided url. + 3. Waits for the complete load of the page, if necessary. + 4. Converts the selected HTML Content into Markdown through a utility function. + """ + + ### Portal Name configuration + portal_cfg: Optional[ExtractionProcess] = None + if portal_name in PROCESS_CONFIG.keys(): + portal_cfg = PROCESS_CONFIG[portal_name].extraction + + if portal_cfg and not url.startswith('http'): + url = str(portal_cfg.base_resource_url).rstrip('/') + url + + log.info(f"[SinglePageLoader] Loading: {url} [{portal_name}]") + + ## Main Instruction Process - Playwright Loop + browser = None + async with async_playwright() as p: + try: + browser = await p.chromium.launch(headless = self.headless_mode) + context = await browser.new_context( + user_agent = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ), + viewport = { 'width': 1920, 'height': 1080 } + ) + page = await context.new_page() + + ## Prevents unuseful resources from loading to speed up the loading process + await page.route( + "**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf}", + lambda route: route.abort() + ) + + try: + 
await page.goto( + url, + wait_until = "networkidle", + timeout = self.timeout_ms + ) + + except PlaywrightTimeout: + log.warning(f"[SinglePageLoader] Timeout on {url}, attempting partial recovery") + + if portal_cfg: + ## Get Main Content on specific portal + html = await self._do_get_main_html_content( + page = page, portal_cfg = portal_cfg + ) + else: + ## Generic Fallback + html = await page.content() + + if not html: + return self._error(url, "Page loaded but returned empty content") + + + ## Markdown Conversion + markdown = html_to_markdown( + html, + mode = 'extraction', + max_chars = self.max_chars + ) + + if not markdown.strip(): + return self._error(url, "No content found after cleaning") + + log.info(f"[SinglePageLoader] Extracted {len(markdown)} chars from {url}") + + return markdown + + except Exception as e: + log.error(f"[SinglePageLoader] Error on {url}: {e}") + return self._error(url, str(e)) + + finally: + if browser: + await browser.close() + + + + ## Support Methods + def _build_locator(self, selector: SelectorIdentifier) -> str: + """ + Builds a Playwright locator string from a selector config. + """ + match selector.type: + case "css": + return selector.text + case _: ## In general [type='content'] + return f"[{selector.type}='{selector.text}']" + + async def _do_get_main_html_content(self, page: Page, portal_cfg: ExtractionProcess) -> Optional[str]: + """ + Extracts HTML from the configured list wrapper element. 
+ """ + wrapper_cfg = portal_cfg.main_content_selector + + # No wrapper configured — fallback to full page + if not wrapper_cfg: + return await page.content() + + locator_str = self._build_locator(wrapper_cfg) + try: + return await page.locator(locator_str).first.inner_html(timeout = 10_000) + + except Exception as e: + log.warning(f"[SinglePageLoader] Element '{locator_str}' not found: {e}, falling back to full page") + return await page.content() + + + def _error(self, url: str, reason: str) -> str: + return f"[ERROR SinglePageLoader] URL: {url} | Reason: {reason}" + From 99743fc2d8ab140c355649f907ce268830757e64 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 16:24:51 +0200 Subject: [PATCH 11/52] feat: added new agent --- src/bandai/config/agents_scout.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/bandai/config/agents_scout.yaml b/src/bandai/config/agents_scout.yaml index 0161e5d..5262f66 100644 --- a/src/bandai/config/agents_scout.yaml +++ b/src/bandai/config/agents_scout.yaml @@ -40,3 +40,31 @@ preference_filter_agent: strategic priorities. You understand imprecise human language and can translate "we prefer Sardinia, mid-size, cloud-heavy" into a filtered, ranked shortlist with a rationale for each decision. + + +discovery_agent: + role: > + Public Tender Discovery Specialist + goal: > + Navigate a Public Procurement Portals to find and collect an overview of + available tenders, returning them as structured data ready for be inspected + farther if necessary. + backstory: > + You are an expert in Italian and European public procurement. + You know how to navigate complex procurement portals like TED and ANAC. + Your job is to find tenders efficiently and return clean, structured overviews. + You always verify that every TenderOverview has at minimum a valid URL. + If a tender has no URL, you skip it. 
+ +extractor_agent: + role: > + Public Tender Data Extraction Specialist + goal: > + Given some basic tender information and its URL, load the detail + page and extract all relevant information into a TenderInfo object. + backstory: > + You are an expert in parsing Italian and European public procurement documents. + You can read complex tender pages and extract key fields like CIG, CPV codes, + deadlines, amounts, and contracting authority details. + When a field is not available on the page, you set it to null rather than guessing. + You always preserve links to attached documents. From 743cdd5f06c2808723b5a7a7c05d7847c53cc7dc Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 16:25:07 +0200 Subject: [PATCH 12/52] feat: added new tasks --- src/bandai/config/tasks_scout.yaml | 81 ++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index 81d6fcd..496ce2b 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -88,3 +88,84 @@ preference_filter_task: A raw JSON array starting with '[' and ending with ']'. No text before or after the array. agent: preference_filter_agent + + + +discovery_task: + description: > + Search the {portal_name} portal for public tenders aimed at Italian + public administrations. Use the TendersOverviewExtractorTool to build + the response JSON structure. + Collect exactly {tenders_count} tender overviews. 
+ For each tender found, extract the following fields:\n + - title: tender title if available, null otherwise\n + - url: direct URL to the tender detail page (required) + - portal: always '{portal_name}'\n + - access_mode: 'html' for web pages, 'api' for API endpoints\n + - metadata: any additional data available (deadline, CIG)\n + + Rules:\n + - Do not guess missing fields, use null\n + - Return exactly the JSON schema as specified + expected_output: > + A valid object following the DiscoveryResult schema:\n + {\n + "tenders": [\n + {\n + "title": "string or null",\n + "url": "https://...",\n + "portal": "{portal_name}",\n + "access_mode": "html",\n + "metadata": {}\n + }\n + ],\n + "portal": "{portal_name}",\n + "status": "success"\n + } + agent: discovery_agent + +extraction_task: + description: > + Load the tender detail page at: [{tender.portal.upper(}}] {tender.url}. + Known title: {tender.title or 'not available'}\n\n + Extract all available information from the page and build a TenderInfo object.\n + Fields to extract:\n + - cig: Codice Identificativo Gara\n + - cup: Codice Unico di Progetto\n + - title: full tender title\n + - url: the page URL\n + - cpv: list of CPV codes\n + - contract_type: i.e Services / Supplies / Works\n + - contracting_authority: name and tax_code\n + - base_amount: base auction amount in euros\n + - max_amount: maximum value including renewals\n + - publication_date: ISO format (YYYY-MM-DD)\n + - deadline: bid submission deadline, ISO format\n + - contract_duration_months: integer\n + - url_docs: list of links to documents and attachments\n\n + Rules:\n + - Set missing fields to null, never guess\n + - Preserve all document links found on the page\n + - Return status 'partial' if less than half the fields are populated + expected_output: > + A valid object following the DiscoveryResult schema:\n + {\n + "cig": "string or null",\n + "cup": "string or null",\n + "title": "string",\n + "url": "https://...",\n + "cpv": ["code1", 
"code2"] or null,\n + "contract_type": "string or null",\n + "contracting_authority": {"name": "string or null", "tax_code": "string or null"} or null,\n + "base_amount": float or null,\n + "max_amount": float or null,\n + "publication_date": "YYYY-MM-DD or null",\n + "deadline": "YYYY-MM-DDTHH:MM:SS or null",\n + "contract_duration_months": integer or null,\n + "url_docs": ["https://..."] or null,\n + "portal": "string",\n + "status": "success" or "partial" or "error",\n + "error": null\n + } + agent: extractor_agent + From 046c993a9c5235756c32dd4fc559543617d22bb8 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 20:12:40 +0200 Subject: [PATCH 13/52] feat: added guardrail for single json object --- src/bandai/guardrails.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/bandai/guardrails.py b/src/bandai/guardrails.py index b129849..5985b5f 100644 --- a/src/bandai/guardrails.py +++ b/src/bandai/guardrails.py @@ -74,6 +74,41 @@ def validate_json_array( ) +# JSON Object Validator + + +def validate_json_obj( + result: TaskOutput +): + """ + Validate that the task output is a parseable JSON object. 
+ """ + + raw: str = result.raw + + if not raw or not raw.strip(): + return (False, "Output must not be empty") + + raw = re.sub(r"```json\s*", "", raw) + raw = re.sub(r"```\s*", "", raw) + raw = raw.strip() + + start = raw.find("{") + end = raw.rfind("}") + + if start == -1 or end == -1: + return (False, "No JSON object found in the output") + + json_str = raw[start:end+1] + + try: + parsed = json.loads(json_str) + return (True, raw) + except json.JSONDecodeError as e: + return (False, "JSON Object is not Valid") + + + # Compliance Verdict Validation From 87499e28e9f9bdd3c312d839689a05004a761725 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 20:13:15 +0200 Subject: [PATCH 14/52] feat: added new agents and tassk for discovery and extraction --- src/bandai/config/agents_scout.yaml | 34 ++++--- src/bandai/config/tasks_scout.yaml | 136 +++++++++++++++------------- 2 files changed, 92 insertions(+), 78 deletions(-) diff --git a/src/bandai/config/agents_scout.yaml b/src/bandai/config/agents_scout.yaml index 5262f66..8d75311 100644 --- a/src/bandai/config/agents_scout.yaml +++ b/src/bandai/config/agents_scout.yaml @@ -46,25 +46,29 @@ discovery_agent: role: > Public Tender Discovery Specialist goal: > - Navigate a Public Procurement Portals to find and collect an overview of - available tenders, returning them as structured data ready for be inspected - farther if necessary. + Navigate the {portal} Public Procurement Portal to find and collect an overview of + available tenders. Parse the Markdown returned by the tool and produce + ONLY a valid JSON object following the DiscoveryResult schema. + Do not add any explanation, preamble, or markdown formatting. backstory: > - You are an expert in Italian and European public procurement. - You know how to navigate complex procurement portals like TED and ANAC. - Your job is to find tenders efficiently and return clean, structured overviews. 
- You always verify that every TenderOverview has at minimum a valid URL. + You are an expert in Italian and European public procurement. + You know what to expect when navigating procurement portals like TED and ANAC. + Your job is to find tenders efficiently and return clean, structured overviews. + You NEVER explain your reasoning or add commentary. + Your entire response is always raw JSON and nothing else. If a tender has no URL, you skip it. -extractor_agent: +extraction_agent: role: > Public Tender Data Extraction Specialist goal: > - Given some basic tender information and its URL, load the detail - page and extract all relevant information into a TenderInfo object. + Given some basic tender information and its URL, load the detail + page and parse the Markdown returned by the tool to produce + ONLY a valid JSON object following the TenderInfo schema. + Do not add any explanation, preamble, or markdown formatting. backstory: > - You are an expert in parsing Italian and European public procurement documents. - You can read complex tender pages and extract key fields like CIG, CPV codes, - deadlines, amounts, and contracting authority details. - When a field is not available on the page, you set it to null rather than guessing. - You always preserve links to attached documents. + You are an expert in parsing Italian and European public procurement documents. + You can read complex tender pages and extract key fields like CIG, CPV codes, + deadlines, amounts, and contracting authority details. You NEVER explain your + reasoning or add commentary. Your entire response is always raw JSON and nothing else. + When a field is not available on the page, you set it to null rather than guessing. 
diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index 496ce2b..8e19aa7 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -93,79 +93,89 @@ preference_filter_task: discovery_task: description: > - Search the {portal_name} portal for public tenders aimed at Italian - public administrations. Use the TendersOverviewExtractorTool to build - the response JSON structure. - Collect exactly {tenders_count} tender overviews. - For each tender found, extract the following fields:\n - - title: tender title if available, null otherwise\n + Search the {portal_name} portal for public tenders aimed at Italian + public administrations. The TendersOverviewExtractorTool will return + a Markdown containing a list of tenders. Parse the Markdown and extract + each tender you find. Collect exactly {tenders_count} tender overviews. + + For each tender found, parse the information of the following fields: + - title: tender title if available, null otherwise - url: direct URL to the tender detail page (required) - - portal: always '{portal_name}'\n - - access_mode: 'html' for web pages, 'api' for API endpoints\n - - metadata: any additional data available (deadline, CIG)\n + - portal: always '{portal_name}' + - access_mode: 'html' for web pages, 'api' for API endpoints + - metadata: any additional data available (deadline, CIG) - Rules:\n - - Do not guess missing fields, use null\n + Rules: + - Do not guess missing fields, use null - Return exactly the JSON schema as specified + + IMPORTANT: Your final answer must be ONLY the raw JSON object. + Do not write explanations, do not use markdown code blocks, + do not add any text before or after the JSON. 
expected_output: > - A valid object following the DiscoveryResult schema:\n - {\n - "tenders": [\n - {\n - "title": "string or null",\n - "url": "https://...",\n - "portal": "{portal_name}",\n - "access_mode": "html",\n - "metadata": {}\n - }\n - ],\n - "portal": "{portal_name}",\n - "status": "success"\n + A valid object following the DiscoveryResult schema: + { + "tenders": [ + { + "title": "string or null", + "url": "https://...", + "portal": "{portal_name}", + "access_mode": "html", + "metadata": {} + } + ], + "portal": "{portal_name}", + "status": "success" } agent: discovery_agent extraction_task: description: > - Load the tender detail page at: [{tender.portal.upper(}}] {tender.url}. - Known title: {tender.title or 'not available'}\n\n - Extract all available information from the page and build a TenderInfo object.\n - Fields to extract:\n - - cig: Codice Identificativo Gara\n - - cup: Codice Unico di Progetto\n - - title: full tender title\n - - url: the page URL\n - - cpv: list of CPV codes\n - - contract_type: i.e Services / Supplies / Works\n - - contracting_authority: name and tax_code\n - - base_amount: base auction amount in euros\n - - max_amount: maximum value including renewals\n - - publication_date: ISO format (YYYY-MM-DD)\n - - deadline: bid submission deadline, ISO format\n - - contract_duration_months: integer\n - - url_docs: list of links to documents and attachments\n\n - Rules:\n - - Set missing fields to null, never guess\n - - Preserve all document links found on the page\n + Load the tender detail page at: [{portal_name}] {given_url}. + Known title: {title}. + Extract all available information from the page and build a TenderInfo object. 
+ Fields to extract: + - cig: Codice Identificativo Gara + - cup: Codice Unico di Progetto + - title: full tender title + - url: the page URL + - cpv: list of CPV codes + - contract_type: i.e Services / Supplies / Works + - contracting_authority: name and tax_code + - base_amount: base auction amount in euros + - max_amount: maximum value including renewals + - publication_date: ISO format (YYYY-MM-DD) + - deadline: bid submission deadline, ISO format + - contract_duration_months: integer + - url_docs: list of links to documents and attachments + + Rules: + - Set missing fields to null, never guess + - Preserve all document links found on the page - Return status 'partial' if less than half the fields are populated + + IMPORTANT: Your final answer must be ONLY the raw JSON object. + Do not write explanations, do not use markdown code blocks, + do not add any text before or after the JSON. expected_output: > - A valid object following the DiscoveryResult schema:\n - {\n - "cig": "string or null",\n - "cup": "string or null",\n - "title": "string",\n - "url": "https://...",\n - "cpv": ["code1", "code2"] or null,\n - "contract_type": "string or null",\n - "contracting_authority": {"name": "string or null", "tax_code": "string or null"} or null,\n - "base_amount": float or null,\n - "max_amount": float or null,\n - "publication_date": "YYYY-MM-DD or null",\n - "deadline": "YYYY-MM-DDTHH:MM:SS or null",\n - "contract_duration_months": integer or null,\n - "url_docs": ["https://..."] or null,\n - "portal": "string",\n - "status": "success" or "partial" or "error",\n - "error": null\n + A valid object following the DiscoveryResult schema: + { + "cig": "string or null", + "cup": "string or null", + "title": "string", + "url": "https://...", + "cpv": ["code1", "code2"] or null, + "contract_type": "string or null", + "contracting_authority": {"name": "string or null", "tax_code": "string or null"} or null, + "base_amount": float or null, + "max_amount": float or null, + 
"publication_date": "YYYY-MM-DD or null", + "deadline": "YYYY-MM-DDTHH:MM:SS or null", + "contract_duration_months": integer or null, + "url_docs": ["https://..."] or null, + "portal": "string", + "status": "success" or "partial" or "error", + "error": null } - agent: extractor_agent + agent: extraction_agent From f69dc6ba4b02bd92f83486127c1bffe8cfa74bb4 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Tue, 26 May 2026 20:13:58 +0200 Subject: [PATCH 15/52] fix: updated import on models.py --- src/bandai/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bandai/models/__init__.py b/src/bandai/models/__init__.py index 7b0845c..6edf43a 100644 --- a/src/bandai/models/__init__.py +++ b/src/bandai/models/__init__.py @@ -17,7 +17,7 @@ parse_company_profile, ) -from .discovery_models import ( +from .models import ( AccessMode, Status, TenderOverview, DiscoveryResult, From f510f445bc2cf2f04bc991558977078e57606ce6 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 11:41:53 +0200 Subject: [PATCH 16/52] test: disabling all portal except 'TED' --- src/bandai/config/portals.yaml | 52 ++++++++++++++++------------------ 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/bandai/config/portals.yaml b/src/bandai/config/portals.yaml index e30ae4c..4906be2 100644 --- a/src/bandai/config/portals.yaml +++ b/src/bandai/config/portals.yaml @@ -11,28 +11,27 @@ # automatically by the Scout crew on the next run. 
portals: - - name: "ANAC" - info: "ANAC / Simog" - base_url: "https://www.anticorruzione.it/-/bandi-di-gara" - reliability: 1.00 + # - name: "ANAC" + # info: "ANAC / Simog" + # base_url: "https://www.anticorruzione.it/-/bandi-di-gara" + # reliability: 1.00 - discovery: - base_search_url: "https://pubblicitalegale.anticorruzione.it/bandi" - elements_per_page: 10 - list_wrapper_selector: - type: "css" - text: "app-bandi-di-gara div:has( app-bando-di-gara-preview)" - next_page_selector: - type: "css" - text: ".pagination .page-item:last-child" - actions_to_perform: [] - - extraction: - base_resource_url: "https://pubblicitalegale.anticorruzione.it/bandi" - main_content_selector: - type: "css" - text: "app-bando-di-gara-detail" + # discovery: + # base_search_url: "https://pubblicitalegale.anticorruzione.it/bandi" + # elements_per_page: 10 + # list_wrapper_selector: + # type: "css" + # text: "app-bandi-di-gara div:has( app-bando-di-gara-preview)" + # next_page_selector: + # type: "css" + # text: ".pagination .page-item:last-child" + # actions_to_perform: [] + # extraction: + # base_resource_url: "https://pubblicitalegale.anticorruzione.it/bandi" + # main_content_selector: + # type: "css" + # text: "app-bando-di-gara-detail" - name: "TED" info: "TED (EU)" @@ -62,11 +61,10 @@ portals: type: "css" text: "#notice-accordion" + # - name: "MePA" + # base_url: "https://www.acquistinretepa.it/opencms/opencms/main/pa/" + # reliability: 0.85 - - name: "MePA" - base_url: "https://www.acquistinretepa.it/opencms/opencms/main/pa/" - reliability: 0.85 - - - name: "Sardegna CAT" - base_url: "https://www.sardegacat.it/eprocurement/createWorkspace.do" - reliability: 0.75 + # - name: "Sardegna CAT" + # base_url: "https://www.sardegacat.it/eprocurement/createWorkspace.do" + # reliability: 0.75 From 0148fb1b99731b7e9554fc065822ed642ef26035 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 11:42:31 +0200 Subject: [PATCH 17/52] feat: update config for discovery and extraction 
agents/tasks --- src/bandai/config/agents_scout.yaml | 11 ++-- src/bandai/config/tasks_scout.yaml | 79 +++++++++++++++-------------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/src/bandai/config/agents_scout.yaml b/src/bandai/config/agents_scout.yaml index 8d75311..fff0d8b 100644 --- a/src/bandai/config/agents_scout.yaml +++ b/src/bandai/config/agents_scout.yaml @@ -48,7 +48,7 @@ discovery_agent: goal: > Navigate the {portal} Public Procurement Portal to find and collect an overview of available tenders. Parse the Markdown returned by the tool and produce - ONLY a valid JSON object following the DiscoveryResult schema. + ONLY a valid JSON Array following the TenderOverview schema. Do not add any explanation, preamble, or markdown formatting. backstory: > You are an expert in Italian and European public procurement. @@ -62,10 +62,11 @@ extraction_agent: role: > Public Tender Data Extraction Specialist goal: > - Given some basic tender information and its URL, load the detail - page and parse the Markdown returned by the tool to produce - ONLY a valid JSON object following the TenderInfo schema. - Do not add any explanation, preamble, or markdown formatting. + You will be given a list of tenders from the {portal_name} portal. + Process them ONE BY ONE in order. For each tender, call the tool with + its URL, parse the Markdown result into a TenderInfo object, then move + to the next. Only after ALL tenders are processed, return the complete + JSON array. Return ONLY a valid JSON array. No explanation, no markdown. backstory: > You are an expert in parsing Italian and European public procurement documents. 
You can read complex tender pages and extract key fields like CIG, CPV codes, diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index 8e19aa7..303d8bf 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -109,31 +109,28 @@ discovery_task: - Do not guess missing fields, use null - Return exactly the JSON schema as specified - IMPORTANT: Your final answer must be ONLY the raw JSON object. + IMPORTANT: Your final answer must be ONLY the raw JSON array. Do not write explanations, do not use markdown code blocks, do not add any text before or after the JSON. expected_output: > - A valid object following the DiscoveryResult schema: - { - "tenders": [ - { - "title": "string or null", - "url": "https://...", - "portal": "{portal_name}", - "access_mode": "html", - "metadata": {} - } - ], - "portal": "{portal_name}", - "status": "success" - } + A valid object following the TenderOverview schema: + [ + { + "title": "string or null", + "url": "https://...", + "portal": "string", + "access_mode": "html", + "metadata": {} + } + ] agent: discovery_agent extraction_task: description: > - Load the tender detail page at: [{portal_name}] {given_url}. - Known title: {title}. - Extract all available information from the page and build a TenderInfo object. + You are given a list of {tenders_count} tenders from the {portal_name} portal. + For each of the tenders in the list you must: + 1. Load the tender detail page at its URL. + 2. Extract all available information from the page and build a TenderInfo object. Fields to extract: - cig: Codice Identificativo Gara - cup: Codice Unico di Progetto @@ -149,33 +146,39 @@ extraction_task: - contract_duration_months: integer - url_docs: list of links to documents and attachments + 3. After processing ALL {tenders_count} tenders, collect + every TenderInfo into a single JSON array and return it as + your final answer. Do not return partial results mid-loop. 
+ Rules: - Set missing fields to null, never guess - Preserve all document links found on the page - Return status 'partial' if less than half the fields are populated - IMPORTANT: Your final answer must be ONLY the raw JSON object. + IMPORTANT: Your final answer must be ONLY the raw JSON array. Do not write explanations, do not use markdown code blocks, do not add any text before or after the JSON. expected_output: > - A valid object following the DiscoveryResult schema: - { - "cig": "string or null", - "cup": "string or null", - "title": "string", - "url": "https://...", - "cpv": ["code1", "code2"] or null, - "contract_type": "string or null", - "contracting_authority": {"name": "string or null", "tax_code": "string or null"} or null, - "base_amount": float or null, - "max_amount": float or null, - "publication_date": "YYYY-MM-DD or null", - "deadline": "YYYY-MM-DDTHH:MM:SS or null", - "contract_duration_months": integer or null, - "url_docs": ["https://..."] or null, - "portal": "string", - "status": "success" or "partial" or "error", - "error": null - } + A valid array of objects following the TenderInfo schema: + [ + { + "cig": "string or null", + "cup": "string or null", + "title": "string", + "url": "https://...", + "cpv": ["code1", "code2"] or null, + "contract_type": "string or null", + "contracting_authority": {"name": "string or null", "tax_code": "string or null"} or null, + "base_amount": float or null, + "max_amount": float or null, + "publication_date": "YYYY-MM-DD or null", + "deadline": "YYYY-MM-DDTHH:MM:SS or null", + "contract_duration_months": integer or null, + "url_docs": ["https://..."] or null, + "portal": "string", + "status": "success" or "partial" or "error", + "error": null + } + ] agent: extraction_agent From 311656af3f28087e8a766e5dcd7a3ed444a083b7 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 12:08:00 +0200 Subject: [PATCH 18/52] fear: updated ScoutCrew to include the new agents --- src/bandai/crews/scout_crew.py 
| 119 ++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 39 deletions(-) diff --git a/src/bandai/crews/scout_crew.py b/src/bandai/crews/scout_crew.py index e274e8b..b34dc71 100644 --- a/src/bandai/crews/scout_crew.py +++ b/src/bandai/crews/scout_crew.py @@ -15,8 +15,14 @@ ) from bandai.guardrails import validate_json_array from bandai.knowledge_sources import get_all_knowledge_sources -from bandai.models import RawContract, ResolvedContract, load_company_profile -from bandai.tools.crawler_tools import ContractDetailTool, TenderCrawlerTool +from bandai.models import ( + RawContract, ResolvedContract, + load_company_profile +) +from bandai.tools.crawler_tools import ( + TendersOverviewExtractorTool, SinglePageLoaderTool, + ContractDetailTool, TenderCrawlerTool + ) from bandai.utils import load_yaml_config log = logging.getLogger(__name__) @@ -41,55 +47,90 @@ class ScoutCrew: agents: list[BaseAgent] tasks: list[Task] + tenders_count_per_portal: int = 3 + def build(self, user_preferences: str) -> tuple[Crew, Task]: """Build and return (crew_instance, preference_filter_task).""" ac = load_yaml_config("agents_scout.yaml") tc = load_yaml_config("tasks_scout.yaml") # Load company profile once (cached by @lru_cache). 
- company = load_company_profile() - ateco_codes_str = ", ".join(company.ateco_codes) + # company = load_company_profile() + # ateco_codes_str = ", ".join(company.ateco_codes) - # Crawler agents + tasks (one per portal, all async) - crawler_agents: list[Agent] = [] - crawl_tasks: list[Task] = [] + # Crawler agents + tasks + # For each portal: Discovery + Extraction (all async) + disc_agents: list[Agent] = [] + disc_tasks: list[Task] = [] + + extr_agents: list[Agent] = [] + extr_tasks: list[Task] = [] for portal in BANDI_PORTALS: - agent_cfg = ac["crawler_agent"] - task_cfg = tc["crawl_task"] - - ag = Agent( - role=agent_cfg["role"].format(portal_name=portal.name), - goal=agent_cfg["goal"].format( - portal_name=portal.name, - portal_url=portal.base_url, + disc_agent_cfg = ac["discovery_agent"] + disc_task_cfg = tc["discovery_task"] + + disc_ag = Agent( + role = disc_agent_cfg["role"], + goal = disc_agent_cfg["goal"].format(portal = portal.name), + backstory = disc_agent_cfg["backstory"], + tools = [TendersOverviewExtractorTool()], + llm = get_llm(fast = True), + verbose = True, + max_iter = 5, + max_retry_limit = 2, + respect_context_window = True, + allow_delegation = False, + ) + + disc_t = Task( + description = disc_task_cfg["description"].format( + portal_name = portal.name, + tenders_count = self.tenders_count_per_portal, ), - backstory=agent_cfg["backstory"].format(portal_name=portal.name), - tools=[TenderCrawlerTool(), ContractDetailTool()], - llm=get_llm(fast=True), - verbose=True, - max_iter=5, - max_retry_limit=2, - respect_context_window=True, - allow_delegation=False, + expected_output = disc_task_cfg["expected_output"], + agent = disc_ag, + async_execution = True, + # output_pydantic = list[TenderOverview], + guardrail = lambda r: validate_json_array(r, strip_fences=True), + guardrail_max_retries = 3, + ) + + disc_agents.append(disc_ag) + disc_tasks.append(disc_t) + + extr_agent_cfg = ac["extraction_agent"] + extr_task_cfg = tc["extraction_task"] + + 
extr_ag = Agent( + role = extr_agent_cfg["role"], + goal = extr_agent_cfg["goal"].format(portal_name = portal.name), + backstory = extr_agent_cfg["backstory"], + tools = [SinglePageLoaderTool()], + llm = get_llm(fast = True), + verbose = True, + max_iter = self.tenders_count_per_portal * 2 + 2, + max_retry_limit = 2, + respect_context_window = True, + allow_delegation = False, ) - t = Task( - description=task_cfg["description"].format( - portal_name=portal.name, - portal_url=portal.base_url, - ateco_codes=ateco_codes_str, + extr_t = Task( + description = extr_task_cfg["description"].format( + portal_name = portal.name, + tenders_count = self.tenders_count_per_portal, ), - expected_output=task_cfg["expected_output"], - agent=ag, - async_execution=True, - output_pydantic=list[RawContract], - guardrail=lambda r: validate_json_array(r, strip_fences=True), - guardrail_max_retries=3, + expected_output = extr_task_cfg["expected_output"], + context = [disc_t], + agent = extr_ag, + async_execution = True, + # output_pydantic = list[TenderOverview], + guardrail = lambda r: validate_json_array(r, strip_fences=True), + guardrail_max_retries = 3, ) - crawler_agents.append(ag) - crawl_tasks.append(t) + extr_agents.append(extr_ag) + extr_tasks.append(extr_t) # Resolution Agent - deduplicates and ranks results res_cfg = ac["resolution_agent"] @@ -115,7 +156,7 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: ), expected_output=res_task_cfg["expected_output"], agent=resolution_agent, - context=crawl_tasks, + context=extr_tasks, output_pydantic=list[ResolvedContract], guardrail=validate_json_array, guardrail_max_retries=3, @@ -148,8 +189,8 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: guardrail_max_retries=3, ) - all_agents = crawler_agents + [resolution_agent, preference_filter_agent] - all_tasks = crawl_tasks + [resolution_task, preference_filter_task] + all_agents = disc_agents + extr_agents + [resolution_agent, preference_filter_agent] + all_tasks 
= disc_tasks + extr_tasks + [resolution_task, preference_filter_task] built_crew = Crew( agents=all_agents, From 36a8aa02fd43c01559a63930980a3d9699538853 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 12:08:52 +0200 Subject: [PATCH 19/52] refactor: removed unused imports --- src/bandai/crews/scout_crew.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bandai/crews/scout_crew.py b/src/bandai/crews/scout_crew.py index b34dc71..46744a9 100644 --- a/src/bandai/crews/scout_crew.py +++ b/src/bandai/crews/scout_crew.py @@ -16,13 +16,13 @@ from bandai.guardrails import validate_json_array from bandai.knowledge_sources import get_all_knowledge_sources from bandai.models import ( - RawContract, ResolvedContract, + ResolvedContract, load_company_profile ) from bandai.tools.crawler_tools import ( TendersOverviewExtractorTool, SinglePageLoaderTool, - ContractDetailTool, TenderCrawlerTool - ) + ContractDetailTool +) from bandai.utils import load_yaml_config log = logging.getLogger(__name__) From 37afa24b8b16620bfd96f0a3bca08520b1418a40 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 12:43:33 +0200 Subject: [PATCH 20/52] fix: add missing modules --- pyproject.toml | 2 ++ src/bandai/main.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6cc74e9..015a95c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "python-dotenv>=1.0.0", "ollama>=0.6.2", "playwright>=1.60.0", + "markdownify>=1.2.2", ] [project.optional-dependencies] @@ -23,6 +24,7 @@ dev = ["pytest>=8.0.0", "pytest-cov>=5.0.0"] [project.scripts] bandai = "bandai.main:run" run_crew = "bandai.main:run" +install_chromium = "bandai.main:install_chromium" train = "bandai.main:train" replay = "bandai.main:replay" test = "bandai.main:test" diff --git a/src/bandai/main.py b/src/bandai/main.py index 604e668..949a101 100644 --- 
a/src/bandai/main.py +++ b/src/bandai/main.py @@ -139,6 +139,11 @@ def run_pytest() -> None: _run_command(["pytest", "tests/", "-v", *sys.argv[1:]]) +def install_chromium() -> None: + """Install the Chromium browser required by Playwright crawler tools.""" + _run_command([sys.executable, "-m", "playwright", "install", "chromium"]) + + def run_with_trigger() -> None: """Run the BandAI pipeline from an external trigger (webhook/API).""" run() From bb1d9ecc6602ee0b951fcf48616b18e075221f49 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 14:37:06 +0200 Subject: [PATCH 21/52] feat: update scout task prompts --- src/bandai/config/tasks_scout.yaml | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index 303d8bf..c186c2d 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -42,11 +42,24 @@ resolution_task: Required fields: canonical_contract_id, title, contracting_authority, deadline, value_eur, cpv_codes, sources, consensus_score, canonical_url. + The context records are TenderInfo objects. Map them to ResolvedContract + exactly as follows: + - canonical_contract_id: use cig; if absent use cup; if absent use url. + - title: copy title; if absent use "Unknown tender". + - contracting_authority: use contracting_authority.name; if absent use "Unknown authority". + - deadline: copy deadline; if absent use "Unknown deadline". + - value_eur: use max_amount; if absent use base_amount; if absent use null. + - cpv_codes: use cpv; if absent use an empty array []. + - sources: include portal and canonical url when available; never return []. + - consensus_score: use the winning portal reliability weight from the table. + - canonical_url: copy url; if absent use "". + 4. OUTPUT RULES (mandatory): Return ONLY the raw JSON array. 
No introductory text, no explanation, no markdown fences. The response must start with '[' and end with ']'. value_eur must be a number (float), never a string with currency symbols. canonical_url must be a string; use "" if unknown, never null. + Do not return null for any field except value_eur. expected_output: > A JSON array of ResolvedContract objects, deduplicated and ranked by value_eur descending. @@ -82,6 +95,7 @@ preference_filter_task: no markdown fences. The response must start with '[' and end with ']'. value_eur must be a number (float), never a string with currency symbols. canonical_url must be a string; use "" if unknown, never null. + Do not return null for any field except value_eur. expected_output: > A filtered, ranked JSON array of ResolvedContract objects with fit_reason added to each entry. @@ -89,22 +103,20 @@ preference_filter_task: No text before or after the array. agent: preference_filter_agent - - discovery_task: description: > Search the {portal_name} portal for public tenders aimed at Italian public administrations. The TendersOverviewExtractorTool will return a Markdown containing a list of tenders. Parse the Markdown and extract each tender you find. Collect exactly {tenders_count} tender overviews. - + For each tender found, parse the information of the following fields: - title: tender title if available, null otherwise - url: direct URL to the tender detail page (required) - portal: always '{portal_name}' - access_mode: 'html' for web pages, 'api' for API endpoints - metadata: any additional data available (deadline, CIG) - + Rules: - Do not guess missing fields, use null - Return exactly the JSON schema as specified @@ -112,6 +124,9 @@ discovery_task: IMPORTANT: Your final answer must be ONLY the raw JSON array. Do not write explanations, do not use markdown code blocks, do not add any text before or after the JSON. + Do NOT return tool-call JSON. + Do NOT return the Markdown you loaded. 
Convert the Markdown content into + TenderOverview objects and return only the JSON array. expected_output: > A valid object following the TenderOverview schema: [ @@ -158,6 +173,12 @@ extraction_task: IMPORTANT: Your final answer must be ONLY the raw JSON array. Do not write explanations, do not use markdown code blocks, do not add any text before or after the JSON. + Do NOT return tool-call JSON. + Do NOT narrate your extraction process. + Do NOT write phrases like "we need to extract" or "let's start". + Do NOT return the Markdown you loaded. Convert the Markdown content into + TenderInfo objects and return only the JSON array. + Your final answer must start with '[' and end with ']'. expected_output: > A valid array of objects following the TenderInfo schema: [ @@ -181,4 +202,3 @@ extraction_task: } ] agent: extraction_agent - From 018d05a06311cfc6c951a4c618594fea077e7fe4 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 14:37:34 +0200 Subject: [PATCH 22/52] fix: async execution failure --- src/bandai/crews/scout_crew.py | 97 ++++++++++++++++------------------ 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/src/bandai/crews/scout_crew.py b/src/bandai/crews/scout_crew.py index 46744a9..1e1e336 100644 --- a/src/bandai/crews/scout_crew.py +++ b/src/bandai/crews/scout_crew.py @@ -13,16 +13,14 @@ get_embedder, get_memory, ) -from bandai.guardrails import validate_json_array -from bandai.knowledge_sources import get_all_knowledge_sources -from bandai.models import ( - ResolvedContract, - load_company_profile -) -from bandai.tools.crawler_tools import ( - TendersOverviewExtractorTool, SinglePageLoaderTool, - ContractDetailTool +from bandai.guardrails import ( + validate_json_array, + validate_resolved_contract_array, + validate_tender_info_array, ) +from bandai.knowledge_sources import get_all_knowledge_sources +from bandai.models import ResolvedContract +from 
bandai.tools.crawler_tools import TendersOverviewExtractorTool, SinglePageLoaderTool from bandai.utils import load_yaml_config log = logging.getLogger(__name__) @@ -58,7 +56,7 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: # company = load_company_profile() # ateco_codes_str = ", ".join(company.ateco_codes) - # Crawler agents + tasks + # Crawler agents + tasks # For each portal: Discovery + Extraction (all async) disc_agents: list[Agent] = [] disc_tasks: list[Task] = [] @@ -71,29 +69,29 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: disc_task_cfg = tc["discovery_task"] disc_ag = Agent( - role = disc_agent_cfg["role"], - goal = disc_agent_cfg["goal"].format(portal = portal.name), - backstory = disc_agent_cfg["backstory"], - tools = [TendersOverviewExtractorTool()], - llm = get_llm(fast = True), - verbose = True, - max_iter = 5, - max_retry_limit = 2, - respect_context_window = True, - allow_delegation = False, + role=disc_agent_cfg["role"], + goal=disc_agent_cfg["goal"].format(portal=portal.name), + backstory=disc_agent_cfg["backstory"], + tools=[TendersOverviewExtractorTool()], + llm=get_llm(fast=True), + verbose=True, + max_iter=5, + max_retry_limit=2, + respect_context_window=True, + allow_delegation=False, ) disc_t = Task( - description = disc_task_cfg["description"].format( - portal_name = portal.name, - tenders_count = self.tenders_count_per_portal, + description=disc_task_cfg["description"].format( + portal_name=portal.name, + tenders_count=self.tenders_count_per_portal, ), - expected_output = disc_task_cfg["expected_output"], - agent = disc_ag, - async_execution = True, + expected_output=disc_task_cfg["expected_output"], + agent=disc_ag, + async_execution=True, # output_pydantic = list[TenderOverview], - guardrail = lambda r: validate_json_array(r, strip_fences=True), - guardrail_max_retries = 3, + guardrail=lambda r: validate_json_array(r, strip_fences=True), + guardrail_max_retries=3, ) disc_agents.append(disc_ag) @@ 
-103,30 +101,30 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: extr_task_cfg = tc["extraction_task"] extr_ag = Agent( - role = extr_agent_cfg["role"], - goal = extr_agent_cfg["goal"].format(portal_name = portal.name), - backstory = extr_agent_cfg["backstory"], - tools = [SinglePageLoaderTool()], - llm = get_llm(fast = True), - verbose = True, - max_iter = self.tenders_count_per_portal * 2 + 2, - max_retry_limit = 2, - respect_context_window = True, - allow_delegation = False, + role=extr_agent_cfg["role"], + goal=extr_agent_cfg["goal"].format(portal_name=portal.name), + backstory=extr_agent_cfg["backstory"], + tools=[SinglePageLoaderTool()], + llm=get_llm(fast=True), + verbose=True, + max_iter=self.tenders_count_per_portal * 2 + 2, + max_retry_limit=2, + respect_context_window=True, + allow_delegation=False, ) extr_t = Task( - description = extr_task_cfg["description"].format( - portal_name = portal.name, - tenders_count = self.tenders_count_per_portal, + description=extr_task_cfg["description"].format( + portal_name=portal.name, + tenders_count=self.tenders_count_per_portal, ), - expected_output = extr_task_cfg["expected_output"], - context = [disc_t], - agent = extr_ag, - async_execution = True, + expected_output=extr_task_cfg["expected_output"], + context=[disc_t], + agent=extr_ag, + async_execution=False, # output_pydantic = list[TenderOverview], - guardrail = lambda r: validate_json_array(r, strip_fences=True), - guardrail_max_retries = 3, + guardrail=lambda r: validate_tender_info_array(r, strip_fences=True), + guardrail_max_retries=3, ) extr_agents.append(extr_ag) @@ -140,7 +138,6 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: role=res_cfg["role"], goal=res_cfg["goal"], backstory=res_cfg["backstory"], - tools=[ContractDetailTool()], llm=get_llm(fast=False), verbose=True, max_iter=8, @@ -158,7 +155,7 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: agent=resolution_agent, context=extr_tasks, 
output_pydantic=list[ResolvedContract], - guardrail=validate_json_array, + guardrail=validate_resolved_contract_array, guardrail_max_retries=3, ) @@ -185,7 +182,7 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: agent=preference_filter_agent, context=[resolution_task], output_pydantic=list[ResolvedContract], - guardrail=lambda r: validate_json_array(r, strip_fences=True), + guardrail=lambda r: validate_resolved_contract_array(r, strip_fences=True), guardrail_max_retries=3, ) From adcac0ffda824e7ce5bd48e3f9c7cddeef0bbda3 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 14:38:09 +0200 Subject: [PATCH 23/52] feat: update json guardrail extractors --- src/bandai/guardrails.py | 142 +++++++++++++++++++++++++++++++++------ 1 file changed, 123 insertions(+), 19 deletions(-) diff --git a/src/bandai/guardrails.py b/src/bandai/guardrails.py index 5985b5f..4eefbb3 100644 --- a/src/bandai/guardrails.py +++ b/src/bandai/guardrails.py @@ -6,6 +6,8 @@ from crewai import TaskOutput # type: ignore +from bandai.utils import extract_json_array_text + log = logging.getLogger(__name__) @@ -41,6 +43,18 @@ def validate_json_array( parsed = json.loads(raw) if isinstance(parsed, list): return (True, raw) + if isinstance(parsed, dict) and {"name", "parameters"}.issubset(parsed): + return ( + False, + "Do not return tool-call JSON with 'name' and 'parameters'. " + "Use the Markdown you received, extract the tender fields, and " + "return ONLY a JSON array of objects.", + ) + try: + extracted = extract_json_array_text(raw) + return (True, extracted) + except ValueError: + pass return ( False, "Output must be a JSON array (list), not a single object.", @@ -48,25 +62,14 @@ def validate_json_array( except json.JSONDecodeError: pass - # Strategy 2: fenced JSON extraction (preferred when present). 
- fenced = re.search(r"```(?:json)?\s*(\[[\s\S]*?\])\s*```", raw) - if fenced: - extracted = fenced.group(1) - try: - parsed = json.loads(extracted) - if isinstance(parsed, list): - return (True, extracted) - except json.JSONDecodeError: - pass - - # Strategy 3: find the first valid JSON array in the output. - for candidate in re.findall(r"\[[\s\S]*?\]", raw): - try: - parsed = json.loads(candidate) - if isinstance(parsed, list): - return (True, candidate) - except json.JSONDecodeError: - continue + # Strategy 2: find the first complete JSON array in the output. + try: + extracted = extract_json_array_text(raw) + parsed = json.loads(extracted) + if isinstance(parsed, list): + return (True, extracted) + except (ValueError, json.JSONDecodeError): + pass return ( False, @@ -74,6 +77,107 @@ def validate_json_array( ) +def validate_resolved_contract_array( + result: TaskOutput, + *, + strip_fences: bool = False, +): + """Validate a ResolvedContract JSON array and reject unusable null records.""" + ok, payload = validate_json_array(result, strip_fences=strip_fences) + if not ok: + return (ok, payload) + + try: + contracts = json.loads(payload) + except json.JSONDecodeError: + return (False, "Output is not valid JSON. Return a properly formatted JSON array.") + + required_non_null = ( + "canonical_contract_id", + "title", + "contracting_authority", + "deadline", + "cpv_codes", + "sources", + "consensus_score", + "canonical_url", + ) + for index, contract in enumerate(contracts, start=1): + if not isinstance(contract, dict): + return (False, f"Item {index} must be a JSON object.") + + missing = [ + field + for field in required_non_null + if contract.get(field) is None + ] + if missing: + return ( + False, + "ResolvedContract fields cannot be null except value_eur. " + f"Item {index} has null/missing fields: {', '.join(missing)}. 
" + "Map TenderInfo fields as follows: canonical_contract_id=cig " + "or cup or url; contracting_authority=contracting_authority.name; " + "value_eur=max_amount or base_amount; cpv_codes=cpv or []; " + "sources=[portal and/or url]; consensus_score=portal reliability weight.", + ) + + if not isinstance(contract.get("cpv_codes"), list): + return (False, f"Item {index} field cpv_codes must be a list, never null.") + if not isinstance(contract.get("sources"), list) or not contract["sources"]: + return (False, f"Item {index} field sources must be a non-empty list.") + if not isinstance(contract.get("consensus_score"), (int, float)): + return (False, f"Item {index} field consensus_score must be a number.") + + return (True, payload) + + +def validate_tender_info_array( + result: TaskOutput, + *, + strip_fences: bool = False, +): + """Validate that the extractor returned TenderInfo objects, not prose.""" + ok, payload = validate_json_array(result, strip_fences=strip_fences) + if not ok: + return ( + False, + f"{payload} Do not describe your extraction steps. Return ONLY the final JSON array.", + ) + + try: + tenders = json.loads(payload) + except json.JSONDecodeError: + return (False, "Output is not valid JSON. Return a properly formatted JSON array.") + + required_non_null = ("title", "url", "portal", "status") + for index, tender in enumerate(tenders, start=1): + if not isinstance(tender, dict): + return (False, f"Item {index} must be a TenderInfo JSON object.") + + missing = [ + field + for field in required_non_null + if tender.get(field) is None + ] + if missing: + return ( + False, + f"Item {index} is missing required TenderInfo fields: {', '.join(missing)}. 
" + "Return ONLY a JSON array of TenderInfo objects; no reasoning text.", + ) + + cpv = tender.get("cpv") + if cpv is not None and not isinstance(cpv, list): + return (False, f"Item {index} field cpv must be a list or null.") + + authority = tender.get("contracting_authority") + if authority is not None and not isinstance(authority, dict): + return (False, f"Item {index} field contracting_authority must be an object or null.") + + return (True, payload) + + # JSON Object Validator From f9af5db9e182bad8ed74b29cd5d2c7c1e8394211 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 14:39:20 +0200 Subject: [PATCH 24/52] fix: '/n' error in utils --- src/bandai/utils.py | 135 ++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 67 deletions(-) diff --git a/src/bandai/utils.py b/src/bandai/utils.py index 9eac39a..e3d9705 100644 --- a/src/bandai/utils.py +++ b/src/bandai/utils.py @@ -10,9 +10,7 @@ from pathlib import Path from bs4 import BeautifulSoup -from markdownify import markdownify -from typing import Literal, Optional - +from markdownify import markdownify # type: ignore log = logging.getLogger(__name__) @@ -53,33 +51,63 @@ def load_yaml_config(filename: str, *, use_cache: bool = True) -> dict[str, Any] def contract_to_summary(c: dict) -> str: """Format a contract dict into a human-readable summary string.""" + value = c.get("value_eur") or 0 + cpv_codes = c.get("cpv_codes") or [] + if isinstance(cpv_codes, str): + cpv_codes = [cpv_codes] + return ( f"Titolo : {c.get('title', 'N/A')}\n" f"Canonical Contract ID: {c.get('canonical_contract_id', 'N/A')}\n" f"Stazione appaltante : {c.get('contracting_authority', 'N/A')}\n" - f"Importo a base d'asta: EUR {c.get('value_eur', 0):,.0f}\n" + f"Importo a base d'asta: EUR {value:,.0f}\n" f"Scadenza : {c.get('deadline', 'N/A')}\n" - f"CPV : {', '.join(c.get('cpv_codes', []))}\n" + f"CPV : {', '.join(str(code) for code in cpv_codes)}\n" 
f"URL : {c.get('canonical_url', 'N/A')}" ) # JSON Extraction +_JSON_ARRAY_WRAPPER_KEYS = ("tenders", "items", "root", "contracts", "results", "data") -def extract_json_array(raw: str) -> list: - """Extract a JSON array from a raw LLM output string.""" - start = raw.find("[") - if start == -1: - raise ValueError("No JSON array found in the input.") - end = raw.find("]", start) - if end != -1: - candidate = raw[start : end + 1] - return json.loads(candidate) +def _extract_array_from_wrapper(value: Any) -> list | None: + if isinstance(value, list): + return value + + if isinstance(value, dict): + for key in _JSON_ARRAY_WRAPPER_KEYS: + wrapped = value.get(key) + if isinstance(wrapped, list): + return wrapped + + return None + + +def extract_json_array_text(raw: str) -> str: + """Extract the first complete JSON array text from a raw LLM output.""" + decoder = json.JSONDecoder() + + for start, char in enumerate(raw): + if char not in "[{": + continue + + try: + parsed, end = decoder.raw_decode(raw[start:]) + except json.JSONDecodeError: + continue + + array = _extract_array_from_wrapper(parsed) + if array is not None: + return json.dumps(array, ensure_ascii=False) - candidate = raw[start:] - return json.loads(candidate) + raise ValueError("No JSON array found in the input.") + + +def extract_json_array(raw: str) -> list: + """Extract a JSON array from a raw LLM output string.""" + return json.loads(extract_json_array_text(raw)) # Implicit NO-GO Detection @@ -129,23 +157,20 @@ def is_implicit_no_go(text: str) -> bool: ## All the Tags/Classes that needs to be removed from the original HTML Content -_TAGS_TO_REMOVE: list[str] = [ - 'nav', 'header', 'footer', 'aside', - 'script', 'style', 'noscript', - - '.side-bar', '.cookie-banner', '.cookie-notice', - '.breadcrumb', '.pagination', '.social-share' -] +_TAGS_TO_REMOVE: list[str] = ["nav", "header", "footer", "aside", "script", "style", "noscript", ".side-bar", ".cookie-banner", ".cookie-notice", ".breadcrumb", 
".pagination", ".social-share"] ## Main Content Containers _MAIN_CONTENT_SELECTORS = [ ## Extraction - "main", "article", "[role='main']", - "#content", ".content", - "#main-content", ".main-content", - - ## Discovery - 'table' + "main", + "article", + "[role='main']", + "#content", + ".content", + "#main-content", + ".main-content", + ## Discovery + "table", ] PADDING_LINES: int = 5 @@ -156,40 +181,25 @@ def is_implicit_no_go(text: str) -> bool: ## Try to keep all relevant information about terders' urls for reachability. ## - Extraction - Focus on the main content and outer links to documents and attachments. ## - Full - Keep everything (structure and content), aside from styling -ConversionMode = Literal['discovery', 'extraction', 'full'] +ConversionMode = Literal["discovery", "extraction", "full"] _MODE_CONFIG = { - "discovery": { - "description": "Clean Text with urls for reaching Single Tenders", - "strip": ["img"] - - }, - "extraction": { - "description": "Single-page Tender Extraction of Information", - "strip": ["img"] - }, - "full": { - "description": "Complete Output with no further stripping (debug mode)", - "strip": ['a', 'img'] - }, + "discovery": {"description": "Clean Text with urls for reaching Single Tenders", "strip": ["img"]}, + "extraction": {"description": "Single-page Tender Extraction of Information", "strip": ["img"]}, + "full": {"description": "Complete Output with no further stripping (debug mode)", "strip": ["a", "img"]}, } -def html_to_markdown( - html_content: str, - mode: ConversionMode, - max_chars: int = 12000, - max_lines: Optional[int] = None -) -> str: +def html_to_markdown(html_content: str, mode: ConversionMode, max_chars: int = 12000, max_lines: Optional[int] = None) -> str: """ Converter from HTML to Markdown content. Based on the requested Mode, it tries to strip all non-relevant information. 
- + :param html_content: Raw HTML Content to strip :type html_content: str - :param mode: * 'discovery' -> Try to keep lists structure with titles and url for reachability, - * 'extraction' -> Keep all relevant information and documents linkage, + :param mode: * 'discovery' -> Try to keep lists structure with titles and url for reachability, + * 'extraction' -> Keep all relevant information and documents linkage, * 'full' -> No further stripping :type mode: ConversionMode :param max_chars: Maximum length of the Output (for Context Window Reasons, Saturation etc.) @@ -200,14 +210,13 @@ def html_to_markdown( ## Init config = _MODE_CONFIG[mode] - soup = BeautifulSoup(html_content, 'html.parser') + soup = BeautifulSoup(html_content, "html.parser") ## Removing all Basic Tags/Classes for selector in _TAGS_TO_REMOVE: for tag in soup.select(selector): - tag.decompose() ## Destroys Tags content Recoursively + tag.decompose() ## Destroys Tags content Recoursively - ## Main Content Identification main_content = None for selector in _MAIN_CONTENT_SELECTORS: @@ -220,28 +229,20 @@ def html_to_markdown( if not target: return "" - ## Markdown Conversion - markdown = markdownify( - html = str(target), - heading_style = "ATX", ## Titles with '#' symbol - strip = config['strip'], - newline_style = 'backslash' - ) - + markdown = markdownify(html=str(target), heading_style="ATX", strip=config["strip"], newline_style="backslash") ## Titles with '#' symbol ## Possible Multiline Spacing markdown = re.sub(r"\n{3,}", "\n\n", markdown).strip() if max_lines: - lines = markdown.split('/n') + lines = markdown.split("\n") if len(lines) > max_lines: truncated = lines[:max_lines] - markdown = '/n'.join(truncated) - + markdown = "\n".join(truncated) ## Markdown Content being too long if len(markdown) > max_chars: markdown = markdown[:max_chars] - return markdown \ No newline at end of file + return markdown From 5bea166217c06edd8605d5600a45365d488e9fd0 Mon Sep 17 00:00:00 2001 From: Alessandro 
Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 14:44:18 +0200 Subject: [PATCH 25/52] fix: extract array from wrapper always returns None --- src/bandai/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bandai/utils.py b/src/bandai/utils.py index e3d9705..cf0f7a2 100644 --- a/src/bandai/utils.py +++ b/src/bandai/utils.py @@ -74,7 +74,9 @@ def contract_to_summary(c: dict) -> str: def _extract_array_from_wrapper(value: Any) -> list | None: if isinstance(value, list): - return value + if not value or all(isinstance(item, dict) for item in value): + return value + return None if isinstance(value, dict): for key in _JSON_ARRAY_WRAPPER_KEYS: From 303f91a01cc8c0b09e60ffdfc1761ef65597d329 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 14:45:55 +0200 Subject: [PATCH 26/52] fix: update contract parsing in flow --- src/bandai/flow.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/bandai/flow.py b/src/bandai/flow.py index f24adff..0d373e6 100644 --- a/src/bandai/flow.py +++ b/src/bandai/flow.py @@ -4,15 +4,15 @@ from datetime import datetime from typing import Callable -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, TypeAdapter from crewai.flow.flow import Flow, listen, router, start # type: ignore from bandai.crews.compliance_crew import ComplianceCrew from bandai.crews.proposal_crew import ProposalCrew from bandai.crews.scout_crew import ScoutCrew -from bandai.models import ComplianceVerdict, FinalProposal +from bandai.models import ComplianceVerdict, FinalProposal, ResolvedContract from bandai.config import _MAX_REVIEW_ITERATIONS -from bandai.utils import contract_to_summary, is_implicit_no_go +from bandai.utils import contract_to_summary, extract_json_array, is_implicit_no_go from bandai.io import save_json log = logging.getLogger("bandai.flow") @@ -122,10 +122,12 @@ def 
run_scouting(self) -> None: ) built_crew.kickoff() - # Use CrewAI structured output instead of manual JSON parsing. raw = final_task.output.raw + log.info("Raw scout output:\n%s", raw) try: - contracts = [c if isinstance(c, dict) else c.model_dump() for c in (final_task.output.pydantic or [])] + parsed = extract_json_array(raw) + resolved = TypeAdapter(list[ResolvedContract]).validate_python(parsed) + contracts = [contract.model_dump() for contract in resolved] except Exception: log.error("Scout output was not parseable. Raw:\n%s", raw) contracts = [] From 3df6c20281117d32507e48b3b5894bef50ae830d Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 16:26:47 +0200 Subject: [PATCH 27/52] feat: updated resolution agent prompt --- src/bandai/config/agents_scout.yaml | 50 ++++++--- src/bandai/config/tasks_scout.yaml | 165 +++++++++++++++++++--------- 2 files changed, 147 insertions(+), 68 deletions(-) diff --git a/src/bandai/config/agents_scout.yaml b/src/bandai/config/agents_scout.yaml index fff0d8b..81f6827 100644 --- a/src/bandai/config/agents_scout.yaml +++ b/src/bandai/config/agents_scout.yaml @@ -11,22 +11,22 @@ crawler_agent: ICT sector and you never miss a relevant bando. You always return clean, structured JSON. You never explain tool usage or show example outputs. -resolution_agent: - role: > - Resolution Agent - goal: > - Receive all raw tender lists from every Crawler, detect duplicates by Contract ID or - by (contracting_authority + value_eur ± 5%), apply weighted consensus - polling using fixed portal reliability scores, and return a single - deduplicated list of authoritative ResolvedContract objects ranked by - value_eur descending. - backstory: > - You are an expert data reconciliation specialist who has spent years - harmonising open procurement data across Italian and European portals. You - understand that ANAC is authoritative for Contract ID assignment and that TED is the - golden source for EU-wide notices. 
You weight each portal's data accordingly - and you never guess - if a field is missing you mark it null rather than - inventing a value. +# resolution_agent: +# role: > +# Resolution Agent +# goal: > +# Receive all raw tender lists from every Crawler, detect duplicates by Contract ID or +# by (contracting_authority + value_eur ± 5%), apply weighted consensus +# polling using fixed portal reliability scores, and return a single +# deduplicated list of authoritative ResolvedContract objects ranked by +# value_eur descending. +# backstory: > +# You are an expert data reconciliation specialist who has spent years +# harmonising open procurement data across Italian and European portals. You +# understand that ANAC is authoritative for Contract ID assignment and that TED is the +# golden source for EU-wide notices. You weight each portal's data accordingly +# and you never guess - if a field is missing you mark it null rather than +# inventing a value. preference_filter_agent: role: > @@ -73,3 +73,21 @@ extraction_agent: deadlines, amounts, and contracting authority details. You NEVER explain your reasoning or add commentary. Your entire response is always raw JSON and nothing else. When a field is not available on the page, you set it to null rather than guessing. + + +resolution_agent: + role: > + Resolution Agent + goal: > + You will be given {portals_count} lists containing various tenders + from different portals. Detect duplicates by Contract ID or, if missing, + by (Contracting Authority + Tender Amount ± 5%). Apply weighted consensus + polling using fixed portal reliability scores, and return a SINGLE + deduplicated JSON array of authoritative ResolvedContract objects + ranked by the Tender Amount in descending order. + backstory: > + You are an expert data reconciliation specialist who has spent years + harmonising open procurement data across Italian and European portals.
You + understand that ANAC is authoritative for Contract ID assignment and that TED is the + golden source for EU-wide notices. You weight each portal's data accordingly + and if a field is missing you mark it null rather than guessing. diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index 303d8bf..4be0a36 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -17,42 +17,42 @@ crawl_task: set to null rather than omitted. No extra text before or after the array. agent: crawler_agent -resolution_task: - description: > - You have received the tender lists from all Crawler agents (in context). - Perform the following steps: - - 1. DEDUPLICATION - Group entries with the same Contract ID. - If Contract ID is absent, group by (contracting_authority + value_eur ± 5%). - - 2. CONSENSUS POLLING - Apply the fixed portal reliability weights below - do NOT alter them: - {portal_weight_table} - - For each duplicate group: - - Select the canonical record from the highest-weight portal. - - On a tie, prefer the entry with the most non-null fields. - - consensus_score = (winning_portal_weight) x - (portals_that_listed_this / total_portals_crawled) - - 3. OUTPUT - Return a JSON array of ResolvedContract objects sorted by - value_eur descending (biggest opportunity first). - Required fields: canonical_contract_id, title, contracting_authority, - deadline, value_eur, cpv_codes, sources, consensus_score, canonical_url. - - 4. OUTPUT RULES (mandatory): - Return ONLY the raw JSON array. No introductory text, no explanation, - no markdown fences. The response must start with '[' and end with ']'. - value_eur must be a number (float), never a string with currency symbols. - canonical_url must be a string; use "" if unknown, never null. - expected_output: > - A JSON array of ResolvedContract objects, deduplicated and ranked by - value_eur descending. - A raw JSON array starting with '[' and ending with ']'. 
- No text before or after the array. - agent: resolution_agent +# resolution_task: +# description: > +# You have received the tender lists from all Crawler agents (in context). +# Perform the following steps: + +# 1. DEDUPLICATION +# Group entries with the same Contract ID. +# If Contract ID is absent, group by (contracting_authority + value_eur ± 5%). + +# 2. CONSENSUS POLLING +# Apply the fixed portal reliability weights below - do NOT alter them: +# {portal_weight_table} + +# For each duplicate group: +# - Select the canonical record from the highest-weight portal. +# - On a tie, prefer the entry with the most non-null fields. +# - consensus_score = (winning_portal_weight) x +# (portals_that_listed_this / total_portals_crawled) + +# 3. OUTPUT +# Return a JSON array of ResolvedContract objects sorted by +# value_eur descending (biggest opportunity first). +# Required fields: canonical_contract_id, title, contracting_authority, +# deadline, value_eur, cpv_codes, sources, consensus_score, canonical_url. + +# 4. OUTPUT RULES (mandatory): +# Return ONLY the raw JSON array. No introductory text, no explanation, +# no markdown fences. The response must start with '[' and end with ']'. +# value_eur must be a number (float), never a string with currency symbols. +# canonical_url must be a string; use "" if unknown, never null. +# expected_output: > +# A JSON array of ResolvedContract objects, deduplicated and ranked by +# value_eur descending. +# A raw JSON array starting with '[' and ending with ']'. +# No text before or after the array. +# agent: resolution_agent preference_filter_task: description: > @@ -129,26 +129,29 @@ extraction_task: description: > You are given a list of {tenders_count} tenders from the {portal_name} portal. For each of the tenders in the list you must: + 1. Load the tender detail page at its URL. + 2. Extract all available information from the page and build a TenderInfo object. 
- Fields to extract: - - cig: Codice Identificativo Gara - - cup: Codice Unico di Progetto - - title: full tender title - - url: the page URL - - cpv: list of CPV codes - - contract_type: i.e Services / Supplies / Works - - contracting_authority: name and tax_code - - base_amount: base auction amount in euros - - max_amount: maximum value including renewals - - publication_date: ISO format (YYYY-MM-DD) - - deadline: bid submission deadline, ISO format - - contract_duration_months: integer - - url_docs: list of links to documents and attachments + + Fields to extract: + - cig: Codice Identificativo Gara + - cup: Codice Unico di Progetto + - title: full tender title + - url: the page URL + - cpv: list of CPV codes + - contract_type: i.e Services / Supplies / Works + - contracting_authority: name and tax_code + - base_amount: base auction amount in euros + - max_amount: maximum value including renewals + - publication_date: ISO format (YYYY-MM-DD) + - deadline: bid submission deadline, ISO format + - contract_duration_months: integer + - url_docs: list of links to documents and attachments 3. After processing ALL {tenders_count} tenders, collect - every TenderInfo into a single JSON array and return it as - your final answer. Do not return partial results mid-loop. + every TenderInfo into a single JSON array and return it as + your final answer. Do not return partial results mid-loop. Rules: - Set missing fields to null, never guess @@ -182,3 +185,61 @@ extraction_task: ] agent: extraction_agent +resolution_task: + description: > + You are given {portals_count} list of {tenders_count} tenders each. + Your job is to do the following: + + 1. DEDUPLICATION + Group entries with the same Contract ID. + If Contract ID is absent, group by (Contracting Authority + Tender Amount ± 5%). + + 2. 
CONSENSUS POLLING + Apply the fixed portal reliability weights below - do NOT alter them: + {portal_weight_table} + + For each group of duplicates, select a SINGLE RECORD from the highest-weight portal. + On a tie, prefer the entry with the most non-null fields. + The Consensus Score is computed as (winning_portal_weight) x + (portals_that_listed_this / total_portals_crawled) + + 3. After processing ALL the tenders, collect every Tender into a single + JSON array, SORTED by the Tender Amount in DESCENDING order. + If the amount is null, treat it as 0 for sorting but keep the value as null. + Then, return it as your final answer. Do not return partial results mid-loop. + + REQUIRED Fields: + - canonical_contract_id: Codice Identificativo Gara (CIG) + - title: full tender title + - contracting_authority: Name of Contracting Authority [tax code if given] + - deadline: bid submission deadline, ISO format + - value_eur: auction amount in euros (float) + - cpv_codes: list of CPV codes + - sources: list of portals where the tender was present + - consensus_score: Consensus Score + - canonical_url: url of the Tender Page + + Rules: + - Set missing fields to null, never guess + + IMPORTANT: Your final answer must be ONLY the raw JSON array. + Do not write explanations, do not use markdown code blocks, + do not add any text before or after the JSON. + + {current_date} + expected_output: > + A valid array of objects following the ResolvedContract schema: + [ + { + "canonical_contract_id": "string or null", + "title": "string", + "contracting_authority": "string or null", + "deadline": "YYYY-MM-DDTHH:MM:SS or null", + "value_eur": float or null, + "sources": ["TED", "ANAC", ...], + "consensus_score": float, + "cpv_codes": ["code1", "code2"] or null, + "canonical_url": "https://..."
+ } + ] + agent: resolution_agent From ed27f269da5dfa1fe16021620da2a6bd92b5b0fd Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 16:35:56 +0200 Subject: [PATCH 28/52] fix: correction on comment --- src/bandai/config/tasks_scout.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index 47d53d7..b67032b 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -54,12 +54,12 @@ crawl_task: # - consensus_score: use the winning portal reliability weight from the table. # - canonical_url: copy url; if absent use "". - 4. OUTPUT RULES (mandatory): - Return ONLY the raw JSON array. No introductory text, no explanation, - no markdown fences. The response must start with '[' and end with ']'. - value_eur must be a number (float), never a string with currency symbols. - canonical_url must be a string; use "" if unknown, never null. - Do not return null for any field except value_eur. + # 4. OUTPUT RULES (mandatory): + # Return ONLY the raw JSON array. No introductory text, no explanation, + # no markdown fences. The response must start with '[' and end with ']'. + # value_eur must be a number (float), never a string with currency symbols. + # canonical_url must be a string; use "" if unknown, never null. + # Do not return null for any field except value_eur. expected_output: > A JSON array of ResolvedContract objects, deduplicated and ranked by value_eur descending. 
From 8b6bd4993872c7e240f5a6b0177b6fb5691b9d1c Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 16:37:04 +0200 Subject: [PATCH 29/52] refactor: cleaning task_scout.yaml --- src/bandai/config/tasks_scout.yaml | 141 ++++++++--------------------- 1 file changed, 36 insertions(+), 105 deletions(-) diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index b67032b..6475fb5 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -1,108 +1,3 @@ -crawl_task: - description: > - Search {portal_name} for public IT/cloud/cybersecurity tenders aimed at - Italian public administrations. - - Relevant CPV codes: 72000000, 72212517, 48900000, 72315000. Company ATECO - codes: {ateco_codes}. Portal entry URL: {portal_url} - - Return a JSON array of RawContract objects with keys: portal, url, title, - contract_id, contracting_authority, deadline, value_eur, cpv_codes, raw_text. - - Use the TenderCrawlerTool to perform the search, then return ONLY the raw - JSON array produced by the tool. Do NOT include tool call JSON, markdown - fences, explanations, or example placeholders like "...". - expected_output: > - A JSON array of RawContract dicts. Every field that cannot be found must be - set to null rather than omitted. No extra text before or after the array. - agent: crawler_agent - -# resolution_task: -# description: > -# You have received the tender lists from all Crawler agents (in context). -# Perform the following steps: - -# 1. DEDUPLICATION -# Group entries with the same Contract ID. -# If Contract ID is absent, group by (contracting_authority + value_eur ± 5%). - -# 2. CONSENSUS POLLING -# Apply the fixed portal reliability weights below - do NOT alter them: -# {portal_weight_table} - -# For each duplicate group: -# - Select the canonical record from the highest-weight portal. -# - On a tie, prefer the entry with the most non-null fields. 
-# - consensus_score = (winning_portal_weight) x -# (portals_that_listed_this / total_portals_crawled) - -# 3. OUTPUT -# Return a JSON array of ResolvedContract objects sorted by -# value_eur descending (biggest opportunity first). -# Required fields: canonical_contract_id, title, contracting_authority, -# deadline, value_eur, cpv_codes, sources, consensus_score, canonical_url. - - # The context records are TenderInfo objects. Map them to ResolvedContract - # exactly as follows: - # - canonical_contract_id: use cig; if absent use cup; if absent use url. - # - title: copy title; if absent use "Unknown tender". - # - contracting_authority: use contracting_authority.name; if absent use "Unknown authority". - # - deadline: copy deadline; if absent use "Unknown deadline". - # - value_eur: use max_amount; if absent use base_amount; if absent use null. - # - cpv_codes: use cpv; if absent use an empty array []. - # - sources: include portal and canonical url when available; never return []. - # - consensus_score: use the winning portal reliability weight from the table. - # - canonical_url: copy url; if absent use "". - - # 4. OUTPUT RULES (mandatory): - # Return ONLY the raw JSON array. No introductory text, no explanation, - # no markdown fences. The response must start with '[' and end with ']'. - # value_eur must be a number (float), never a string with currency symbols. - # canonical_url must be a string; use "" if unknown, never null. - # Do not return null for any field except value_eur. - expected_output: > - A JSON array of ResolvedContract objects, deduplicated and ranked by - value_eur descending. - A raw JSON array starting with '[' and ending with ']'. - No text before or after the array. - agent: resolution_agent - -preference_filter_task: - description: > - You have the full list of ResolvedContracts from the Resolution Agent (context). - - USER PREFERENCES (natural language): - {user_preferences} - - Instructions: - 1. 
Filter out contracts that clearly contradict the preferences. - 2. Re-rank the remaining ones by fit with preferences (not just value_eur). - 3. For each kept contract add a one-line fit_reason. - 4. For each removed contract add a one-line removal_reason. - 5. Return the filtered + ranked JSON array. - - NOTE: the human will review your output before it is finalised. - They may add further instructions or corrections in natural language. - - OUTPUT: - Return a JSON array of ResolvedContract objects sorted by - value_eur descending (biggest opportunity first). - Required fields: canonical_contract_id, title, contracting_authority, - deadline, value_eur, cpv_codes, sources, consensus_score, canonical_url. - - OUTPUT RULES (mandatory): - Return ONLY the raw JSON array. No introductory text, no explanation, - no markdown fences. The response must start with '[' and end with ']'. - value_eur must be a number (float), never a string with currency symbols. - canonical_url must be a string; use "" if unknown, never null. - Do not return null for any field except value_eur. - expected_output: > - A filtered, ranked JSON array of ResolvedContract objects with fit_reason - added to each entry. - A raw JSON array starting with '[' and ending with ']'. - No text before or after the array. - agent: preference_filter_agent - discovery_task: description: > Search the {portal_name} portal for public tenders aimed at Italian @@ -264,3 +159,39 @@ resolution_task: } ] agent: resolution_agent + +preference_filter_task: + description: > + You have the full list of ResolvedContracts from the Resolution Agent (context). + + USER PREFERENCES (natural language): + {user_preferences} + + Instructions: + 1. Filter out contracts that clearly contradict the preferences. + 2. Re-rank the remaining ones by fit with preferences (not just value_eur). + 3. For each kept contract add a one-line fit_reason. + 4. For each removed contract add a one-line removal_reason. + 5. 
Return the filtered + ranked JSON array. + + NOTE: the human will review your output before it is finalised. + They may add further instructions or corrections in natural language. + + OUTPUT: + Return a JSON array of ResolvedContract objects sorted by + value_eur descending (biggest opportunity first). + Required fields: canonical_contract_id, title, contracting_authority, + deadline, value_eur, cpv_codes, sources, consensus_score, canonical_url. + + OUTPUT RULES (mandatory): + Return ONLY the raw JSON array. No introductory text, no explanation, + no markdown fences. The response must start with '[' and end with ']'. + value_eur must be a number (float), never a string with currency symbols. + canonical_url must be a string; use "" if unknown, never null. + Do not return null for any field except value_eur. + expected_output: > + A filtered, ranked JSON array of ResolvedContract objects with fit_reason + added to each entry. + A raw JSON array starting with '[' and ending with ']'. + No text before or after the array. + agent: preference_filter_agent \ No newline at end of file From 11f78e117855a38e9213b0b3dc67339803c33045 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 16:41:33 +0200 Subject: [PATCH 30/52] refactor: cleaning agents_scout.yaml --- src/bandai/config/agents_scout.yaml | 57 +++++++---------------------- 1 file changed, 13 insertions(+), 44 deletions(-) diff --git a/src/bandai/config/agents_scout.yaml b/src/bandai/config/agents_scout.yaml index 81f6827..d404c61 100644 --- a/src/bandai/config/agents_scout.yaml +++ b/src/bandai/config/agents_scout.yaml @@ -1,47 +1,3 @@ -crawler_agent: - role: > - Crawler - {portal_name} - goal: > - Exhaustively search {portal_name} ({portal_url}) for public tenders matching - Italian ICT / cloud / cybersecurity CPV codes. Return every relevant tender - as a structured JSON array of RawContract objects. - backstory: > - You are a specialist web-scraping agent trained on the quirks of - {portal_name}. 
You know which filters to apply for Italian PA tenders in the - ICT sector and you never miss a relevant bando. You always return clean, - structured JSON. You never explain tool usage or show example outputs. - -# resolution_agent: -# role: > -# Resolution Agent -# goal: > -# Receive all raw tender lists from every Crawler, detect duplicates by Contract ID or -# by (contracting_authority + value_eur ± 5%), apply weighted consensus -# polling using fixed portal reliability scores, and return a single -# deduplicated list of authoritative ResolvedContract objects ranked by -# value_eur descending. -# backstory: > -# You are an expert data reconciliation specialist who has spent years -# harmonising open procurement data across Italian and European portals. You -# understand that ANAC is authoritative for Contract ID assignment and that TED is the -# golden source for EU-wide notices. You weight each portal's data accordingly -# and you never guess - if a field is missing you mark it null rather than -# inventing a value. - -preference_filter_agent: - role: > - Preference Filter - goal: > - Given the list of resolved contracts and the user's natural language - preferences, re-rank and filter the list. Remove contracts that clearly - don't match the preferences. Explain each kept/removed decision briefly. - backstory: > - You bridge the gap between raw procurement data and the company's - strategic priorities. You understand imprecise human language and can - translate "we prefer Sardinia, mid-size, cloud-heavy" into a filtered, - ranked shortlist with a rationale for each decision. - - discovery_agent: role: > Public Tender Discovery Specialist @@ -91,3 +47,16 @@ resolution_agent: understand that ANAC is authoritative for Contract ID assignment and that TED is the golden source for EU-wide notices. You weight each portal's data accordingly and if a field is missing you mark it null rather than guessing. 
+ +preference_filter_agent: + role: > + Preference Filter + goal: > + Given the list of resolved contracts and the user's natural language + preferences, re-rank and filter the list. Remove contracts that clearly + don't match the preferences. Explain each kept/removed decision briefly. + backstory: > + You bridge the gap between raw procurement data and the company's + strategic priorities. You understand imprecise human language and can + translate "we prefer Sardinia, mid-size, cloud-heavy" into a filtered, + ranked shortlist with a rationale for each decision. \ No newline at end of file From 39f8f977c0320e16eda196fb716c41b8a0d1fc95 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 16:42:05 +0200 Subject: [PATCH 31/52] refactor: reordering of tasks definitions --- src/bandai/config/tasks_scout.yaml | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index 6475fb5..61978a0 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -4,14 +4,14 @@ discovery_task: public administrations. The TendersOverviewExtractorTool will return a Markdown containing a list of tenders. Parse the Markdown and extract each tender you find. Collect exactly {tenders_count} tender overviews. - + For each tender found, parse the information of the following fields: - title: tender title if available, null otherwise - url: direct URL to the tender detail page (required) - portal: always '{portal_name}' - access_mode: 'html' for web pages, 'api' for API endpoints - metadata: any additional data available (deadline, CIG) - + Rules: - Do not guess missing fields, use null - Return exactly the JSON schema as specified @@ -19,9 +19,6 @@ discovery_task: IMPORTANT: Your final answer must be ONLY the raw JSON array. Do not write explanations, do not use markdown code blocks, do not add any text before or after the JSON. 
- Do NOT return tool-call JSON. - Do NOT return the Markdown you loaded. Convert the Markdown content into - TenderOverview objects and return only the JSON array. expected_output: > A valid object following the TenderOverview schema: [ @@ -71,12 +68,6 @@ extraction_task: IMPORTANT: Your final answer must be ONLY the raw JSON array. Do not write explanations, do not use markdown code blocks, do not add any text before or after the JSON. - Do NOT return tool-call JSON. - Do NOT narrate your extraction process. - Do NOT write phrases like "we need to extract" or "let's start". - Do NOT return the Markdown you loaded. Convert the Markdown content into - TenderInfo objects and return only the JSON array. - Your final answer must start with '[' and end with ']'. expected_output: > A valid array of objects following the TenderInfo schema: [ From 5baf9b773a8b8d49f4c10e72eedb7c6c9a584b42 Mon Sep 17 00:00:00 2001 From: BrunoCarusoc Date: Wed, 27 May 2026 16:49:29 +0200 Subject: [PATCH 32/52] feat: update resultion_task formatting, reordering of tasks --- src/bandai/crews/scout_crew.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/bandai/crews/scout_crew.py b/src/bandai/crews/scout_crew.py index 1e1e336..5a96acf 100644 --- a/src/bandai/crews/scout_crew.py +++ b/src/bandai/crews/scout_crew.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +import datetime as dt from crewai import Agent, Crew, Process, Task # type: ignore from crewai.agents.agent_builder.base_agent import BaseAgent # type: ignore @@ -51,6 +52,7 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: """Build and return (crew_instance, preference_filter_task).""" ac = load_yaml_config("agents_scout.yaml") tc = load_yaml_config("tasks_scout.yaml") + len_portals = len(BANDI_PORTALS) # Load company profile once (cached by @lru_cache). 
# company = load_company_profile() @@ -58,10 +60,9 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: # Crawler agents + tasks # For each portal: Discovery + Extraction (all async) - disc_agents: list[Agent] = [] - disc_tasks: list[Task] = [] + all_agents: list[Agent] = [] + all_tasks: list[Task] = [] - extr_agents: list[Agent] = [] extr_tasks: list[Task] = [] for portal in BANDI_PORTALS: @@ -88,14 +89,14 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: ), expected_output=disc_task_cfg["expected_output"], agent=disc_ag, - async_execution=True, + # async_execution=True, # output_pydantic = list[TenderOverview], guardrail=lambda r: validate_json_array(r, strip_fences=True), guardrail_max_retries=3, ) - disc_agents.append(disc_ag) - disc_tasks.append(disc_t) + all_agents.append(disc_ag) + all_tasks.append(disc_t) extr_agent_cfg = ac["extraction_agent"] extr_task_cfg = tc["extraction_task"] @@ -127,7 +128,8 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: guardrail_max_retries=3, ) - extr_agents.append(extr_ag) + all_agents.append(extr_ag) + all_tasks.append(extr_t) extr_tasks.append(extr_t) # Resolution Agent - deduplicates and ranks results @@ -136,20 +138,23 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: resolution_agent = Agent( role=res_cfg["role"], - goal=res_cfg["goal"], + goal=res_cfg["goal"].format(portals_count = len_portals), backstory=res_cfg["backstory"], llm=get_llm(fast=False), verbose=True, max_iter=8, max_retry_limit=2, respect_context_window=True, - inject_date=True, # temporal awareness for deadline handling + # inject_date=True, # temporal awareness for deadline handling allow_delegation=False, ) resolution_task = Task( description=res_task_cfg["description"].format( portal_weight_table=_build_weight_table(), + tenders_count = self.tenders_count_per_portal, + portal_weight_table=_build_weight_table(), + current_date = f"Current Date: { str(dt.datetime.now().date()) }" ), 
expected_output=res_task_cfg["expected_output"], agent=resolution_agent, @@ -186,8 +191,8 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: guardrail_max_retries=3, ) - all_agents = disc_agents + extr_agents + [resolution_agent, preference_filter_agent] - all_tasks = disc_tasks + extr_tasks + [resolution_task, preference_filter_task] + all_agents = all_agents + [resolution_agent, preference_filter_agent] + all_tasks = all_tasks + [resolution_task, preference_filter_task] built_crew = Crew( agents=all_agents, From 576d2f9a14a358cb7b1950088d6485abe39d29cb Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:34:49 +0200 Subject: [PATCH 33/52] feat: update task compliance prompt --- src/bandai/config/tasks_compliance.yaml | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/bandai/config/tasks_compliance.yaml b/src/bandai/config/tasks_compliance.yaml index c804f80..42c17f8 100644 --- a/src/bandai/config/tasks_compliance.yaml +++ b/src/bandai/config/tasks_compliance.yaml @@ -10,10 +10,13 @@ advocate_task: Turnover (3y) : {turnover} Past contracts : {past_contracts} - Instructions: 1. List every EXPLICIT requirement in the tender. 2. For each, - state: MEETS / POTENTIALLY MEETS (with caveats) / DOES NOT MEET. 3. For - "potentially" items, propose a concrete mitigation. 4. Assign - confidence_score (0-1) and overall_sentiment. 5. Return a JSON object + Instructions: 1. Use only the CONTRACT text and COMPANY PROFILE above. + Do not invent missing tender requirements. 2. List every EXPLICIT + requirement in the tender. If the contract summary is insufficient, say so + in summary and keep confidence_score low. 3. For each requirement, state: + MEETS / POTENTIALLY MEETS (with caveats) / DOES NOT MEET. 4. For + "potentially" items, propose a concrete mitigation. 5. Assign + confidence_score (0-1) and overall_sentiment. 6. 
Return a JSON object matching the AdvocateAnalysis schema. expected_output: > A valid JSON object: requirements_met, requirements_potentially_met, @@ -27,12 +30,13 @@ auditor_task: CONTRACT: {contract_summary} - Instructions: 1. Review each Advocate claim. 2. HARD BLOCKERS: requirements - the company cannot meet before the deadline. 3. SOFT RISKS: uncertain or - third-party-dependent requirements. 4. ADVOCATE_OVERESTIMATES: legally or - factually wrong Advocate claims. 5. Assign risk_score (0 = no risk, 1 = - certain failure). 6. Return a JSON object matching the AuditorChallenge - schema. + Instructions: 1. Use only the CONTRACT text, COMPANY PROFILE and Advocate + output in context. Do not invent missing requirements. 2. Review each + Advocate claim. 3. HARD BLOCKERS: requirements the company cannot meet + before the deadline. 4. SOFT RISKS: uncertain or third-party-dependent + requirements. 5. ADVOCATE_OVERESTIMATES: legally or factually wrong + Advocate claims. 6. Assign risk_score (0 = no risk, 1 = certain failure). + 7. Return a JSON object matching the AuditorChallenge schema. expected_output: > A valid JSON object: hard_blockers, soft_risks, advocate_overestimates, overall_sentiment, risk_score, summary. From 97857fb957050371119f02ba3153fbcc5cac22e7 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:35:02 +0200 Subject: [PATCH 34/52] fix: remove current_date in task scout --- src/bandai/config/tasks_scout.yaml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/bandai/config/tasks_scout.yaml b/src/bandai/config/tasks_scout.yaml index 61978a0..3479bdc 100644 --- a/src/bandai/config/tasks_scout.yaml +++ b/src/bandai/config/tasks_scout.yaml @@ -4,14 +4,14 @@ discovery_task: public administrations. The TendersOverviewExtractorTool will return a Markdown containing a list of tenders. Parse the Markdown and extract each tender you find. 
Collect exactly {tenders_count} tender overviews. - + For each tender found, parse the information of the following fields: - title: tender title if available, null otherwise - url: direct URL to the tender detail page (required) - portal: always '{portal_name}' - access_mode: 'html' for web pages, 'api' for API endpoints - metadata: any additional data available (deadline, CIG) - + Rules: - Do not guess missing fields, use null - Return exactly the JSON schema as specified @@ -38,9 +38,9 @@ extraction_task: For each of the tenders in the list you must: 1. Load the tender detail page at its URL. - + 2. Extract all available information from the page and build a TenderInfo object. - + Fields to extract: - cig: Codice Identificativo Gara - cup: Codice Unico di Progetto @@ -132,8 +132,6 @@ resolution_task: IMPORTANT: Your final answer must be ONLY the raw JSON array. Do not write explanations, do not use markdown code blocks, do not add any text before or after the JSON. - - {current_date} expected_output: > A valid array of objects following the ResolvedContract schema: [ @@ -185,4 +183,4 @@ preference_filter_task: added to each entry. A raw JSON array starting with '[' and ending with ']'. No text before or after the array. 
- agent: preference_filter_agent \ No newline at end of file + agent: preference_filter_agent From 0f9d445afd60a7f11d9e92bb633cf12c982b3d65 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:35:13 +0200 Subject: [PATCH 35/52] feat: update company profile --- knowledge/company_profile.json | 49 +++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/knowledge/company_profile.json b/knowledge/company_profile.json index ec86e0d..2303f03 100644 --- a/knowledge/company_profile.json +++ b/knowledge/company_profile.json @@ -3,7 +3,8 @@ "vat_number": "IT12345678901", "ateco_codes": [ "62.01.09", - "62.02.00" + "62.02.00", + "72.20.00" ], "certifications": [ "ISO 9001:2015", @@ -36,17 +37,57 @@ "title": "Data Analysis for Municipal Services", "value_eur": 80000.0, "cpv_codes": [ - "72200000" + "72200000", + "72312100", + "79311400" ], "year": 2021, "authority": "Comune di Milano", "topics": [ "data analysis", - "public services" + "public services", + "statistical reporting" + ] + }, + { + "title": "Applied Socio-Economic Data Analysis for Public Policy", + "value_eur": 220000.0, + "cpv_codes": [ + "73110000", + "79311400", + "79315000", + "72312100" + ], + "year": 2024, + "authority": "Universita degli Studi di Cagliari", + "topics": [ + "research services", + "inequality indicators", + "survey data analysis", + "public policy evaluation" ] } ], "departments": { + "Data Research & Policy Analytics": { + "capabilities": [ + "Applied research design for public-sector socio-economic studies", + "Statistical analysis of administrative, survey and open data sources", + "Indicator design, reproducible reporting and policy evaluation dashboards" + ], + "certifications": [ + "ISO 9001:2015", + "ISO 27001:2022" + ], + "case_studies": [ + "Osservatorio indicatori territoriali Universita di Cagliari (2024) - EUR 220k, inequality and public-service access analysis" + ], + 
"kpis": { + "datasets_integrated": 18, + "avg_report_delivery_weeks": 6, + "reproducible_analysis_percent": 100 + } + }, "Cloud Infrastructure": { "capabilities": [ "Design and management of multi-cloud environments (AWS, Azure, GCP)", @@ -134,4 +175,4 @@ } } } -} \ No newline at end of file +} From 6b6b30fe4350ff1be65f3d120f2141cf819ee4cb Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:35:31 +0200 Subject: [PATCH 36/52] fix: remove mock compliance tool --- src/bandai/crews/compliance_crew.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/bandai/crews/compliance_crew.py b/src/bandai/crews/compliance_crew.py index cdb38dd..51292ef 100644 --- a/src/bandai/crews/compliance_crew.py +++ b/src/bandai/crews/compliance_crew.py @@ -14,7 +14,6 @@ ComplianceVerdict, load_company_profile, ) -from bandai.tools.crawler_tools import ComplianceCheckerTool from bandai.utils import load_yaml_config log = logging.getLogger(__name__) @@ -49,7 +48,6 @@ def build(self, contract_summary: str) -> tuple[Crew, Task]: role=ac["advocate"]["role"], goal=ac["advocate"]["goal"], backstory=ac["advocate"]["backstory"], - tools=[ComplianceCheckerTool()], llm=get_llm(), verbose=True, max_retry_limit=2, @@ -67,7 +65,6 @@ def build(self, contract_summary: str) -> tuple[Crew, Task]: role=ac["auditor"]["role"], goal=ac["auditor"]["goal"], backstory=ac["auditor"]["backstory"], - tools=[ComplianceCheckerTool()], llm=get_llm(), verbose=True, max_retry_limit=2, From d1821caa821d0f4f73c73ace520ac01f5c1ce77b Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:35:53 +0200 Subject: [PATCH 37/52] feat: clean up scout crew --- src/bandai/crews/scout_crew.py | 117 +++++++++++++++------------------ 1 file changed, 52 insertions(+), 65 deletions(-) diff --git a/src/bandai/crews/scout_crew.py b/src/bandai/crews/scout_crew.py index 5a96acf..3a74c4d 100644 --- 
a/src/bandai/crews/scout_crew.py +++ b/src/bandai/crews/scout_crew.py @@ -1,10 +1,6 @@ from __future__ import annotations -import logging -import datetime as dt - from crewai import Agent, Crew, Process, Task # type: ignore -from crewai.agents.agent_builder.base_agent import BaseAgent # type: ignore from bandai.config import ( BANDI_PORTALS, @@ -20,12 +16,9 @@ validate_tender_info_array, ) from bandai.knowledge_sources import get_all_knowledge_sources -from bandai.models import ResolvedContract from bandai.tools.crawler_tools import TendersOverviewExtractorTool, SinglePageLoaderTool from bandai.utils import load_yaml_config -log = logging.getLogger(__name__) - def _build_weight_table() -> str: """Build a human-readable weight table for the resolution prompt.""" @@ -34,45 +27,46 @@ def _build_weight_table() -> str: return "\n".join(rows) +def _validate_discovery_output(result): + return validate_json_array(result, strip_fences=True) + + +def _validate_extraction_output(result): + return validate_tender_info_array(result, strip_fences=True) + + +def _validate_resolved_output(result): + return validate_resolved_contract_array(result, strip_fences=True) + + class ScoutCrew: """ Scout Crew - discovers and deduplicates Italian public tenders. - Because the number of crawler agents is dynamic (one per portal), - we build agents and tasks programmatically. Crawler tasks are marked - async to allow concurrent discovery when supported by the runtime. + Because the number of portals is dynamic, discovery and extraction agents + are built programmatically from the portal configuration. 
""" - agents: list[BaseAgent] - tasks: list[Task] - tenders_count_per_portal: int = 3 def build(self, user_preferences: str) -> tuple[Crew, Task]: """Build and return (crew_instance, preference_filter_task).""" ac = load_yaml_config("agents_scout.yaml") tc = load_yaml_config("tasks_scout.yaml") - len_portals = len(BANDI_PORTALS) + portals_count = len(BANDI_PORTALS) - # Load company profile once (cached by @lru_cache). - # company = load_company_profile() - # ateco_codes_str = ", ".join(company.ateco_codes) - - # Crawler agents + tasks - # For each portal: Discovery + Extraction (all async) all_agents: list[Agent] = [] all_tasks: list[Task] = [] - - extr_tasks: list[Task] = [] + extraction_tasks: list[Task] = [] for portal in BANDI_PORTALS: - disc_agent_cfg = ac["discovery_agent"] - disc_task_cfg = tc["discovery_task"] + discovery_agent_cfg = ac["discovery_agent"] + discovery_task_cfg = tc["discovery_task"] - disc_ag = Agent( - role=disc_agent_cfg["role"], - goal=disc_agent_cfg["goal"].format(portal=portal.name), - backstory=disc_agent_cfg["backstory"], + discovery_agent = Agent( + role=discovery_agent_cfg["role"], + goal=discovery_agent_cfg["goal"].format(portal=portal.name), + backstory=discovery_agent_cfg["backstory"], tools=[TendersOverviewExtractorTool()], llm=get_llm(fast=True), verbose=True, @@ -82,29 +76,27 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: allow_delegation=False, ) - disc_t = Task( - description=disc_task_cfg["description"].format( + discovery_task = Task( + description=discovery_task_cfg["description"].format( portal_name=portal.name, tenders_count=self.tenders_count_per_portal, ), - expected_output=disc_task_cfg["expected_output"], - agent=disc_ag, - # async_execution=True, - # output_pydantic = list[TenderOverview], - guardrail=lambda r: validate_json_array(r, strip_fences=True), + expected_output=discovery_task_cfg["expected_output"], + agent=discovery_agent, + guardrail=_validate_discovery_output, guardrail_max_retries=3, 
) - all_agents.append(disc_ag) - all_tasks.append(disc_t) + all_agents.append(discovery_agent) + all_tasks.append(discovery_task) - extr_agent_cfg = ac["extraction_agent"] - extr_task_cfg = tc["extraction_task"] + extraction_agent_cfg = ac["extraction_agent"] + extraction_task_cfg = tc["extraction_task"] - extr_ag = Agent( - role=extr_agent_cfg["role"], - goal=extr_agent_cfg["goal"].format(portal_name=portal.name), - backstory=extr_agent_cfg["backstory"], + extraction_agent = Agent( + role=extraction_agent_cfg["role"], + goal=extraction_agent_cfg["goal"].format(portal_name=portal.name), + backstory=extraction_agent_cfg["backstory"], tools=[SinglePageLoaderTool()], llm=get_llm(fast=True), verbose=True, @@ -114,23 +106,21 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: allow_delegation=False, ) - extr_t = Task( - description=extr_task_cfg["description"].format( + extraction_task = Task( + description=extraction_task_cfg["description"].format( portal_name=portal.name, tenders_count=self.tenders_count_per_portal, ), - expected_output=extr_task_cfg["expected_output"], - context=[disc_t], - agent=extr_ag, - async_execution=False, - # output_pydantic = list[TenderOverview], - guardrail=lambda r: validate_tender_info_array(r, strip_fences=True), + expected_output=extraction_task_cfg["expected_output"], + context=[discovery_task], + agent=extraction_agent, + guardrail=_validate_extraction_output, guardrail_max_retries=3, ) - all_agents.append(extr_ag) - all_tasks.append(extr_t) - extr_tasks.append(extr_t) + all_agents.append(extraction_agent) + all_tasks.append(extraction_task) + extraction_tasks.append(extraction_task) # Resolution Agent - deduplicates and ranks results res_cfg = ac["resolution_agent"] @@ -138,28 +128,26 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: resolution_agent = Agent( role=res_cfg["role"], - goal=res_cfg["goal"].format(portals_count = len_portals), + goal=res_cfg["goal"].format(portals_count=portals_count), 
backstory=res_cfg["backstory"], llm=get_llm(fast=False), verbose=True, max_iter=8, max_retry_limit=2, respect_context_window=True, - # inject_date=True, # temporal awareness for deadline handling + inject_date=True, allow_delegation=False, ) resolution_task = Task( description=res_task_cfg["description"].format( + tenders_count=self.tenders_count_per_portal, + portals_count=portals_count, portal_weight_table=_build_weight_table(), - tenders_count = self.tenders_count_per_portal, - portal_weight_table=_build_weight_table(), - current_date = f"Current Date: { str(dt.datetime.now().date()) }" ), expected_output=res_task_cfg["expected_output"], agent=resolution_agent, - context=extr_tasks, - output_pydantic=list[ResolvedContract], + context=extraction_tasks, guardrail=validate_resolved_contract_array, guardrail_max_retries=3, ) @@ -186,13 +174,12 @@ def build(self, user_preferences: str) -> tuple[Crew, Task]: expected_output=preference_task_cfg["expected_output"], agent=preference_filter_agent, context=[resolution_task], - output_pydantic=list[ResolvedContract], - guardrail=lambda r: validate_resolved_contract_array(r, strip_fences=True), + guardrail=_validate_resolved_output, guardrail_max_retries=3, ) - all_agents = all_agents + [resolution_agent, preference_filter_agent] - all_tasks = all_tasks + [resolution_task, preference_filter_task] + all_agents.extend([resolution_agent, preference_filter_agent]) + all_tasks.extend([resolution_task, preference_filter_task]) built_crew = Crew( agents=all_agents, From 1158ac079f244ed8f4f87b0030c08e228018fb3e Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:36:13 +0200 Subject: [PATCH 38/52] feat: clean up crawler tools --- src/bandai/tools/__init__.py | 4 +- src/bandai/tools/crawler_tools.py | 134 +----------------------------- 2 files changed, 6 insertions(+), 132 deletions(-) diff --git a/src/bandai/tools/__init__.py b/src/bandai/tools/__init__.py index 
06d89df..0437e50 100644 --- a/src/bandai/tools/__init__.py +++ b/src/bandai/tools/__init__.py @@ -1,3 +1,3 @@ -from .crawler_tools import TenderCrawlerTool, ContractDetailTool, ComplianceCheckerTool, ProposalWriterTool +from .crawler_tools import ProposalWriterTool, SinglePageLoaderTool, TendersOverviewExtractorTool -__all__ = ["TenderCrawlerTool", "ContractDetailTool", "ComplianceCheckerTool", "ProposalWriterTool"] +__all__ = ["ProposalWriterTool", "SinglePageLoaderTool", "TendersOverviewExtractorTool"] diff --git a/src/bandai/tools/crawler_tools.py b/src/bandai/tools/crawler_tools.py index 5d21eed..b57b2fb 100644 --- a/src/bandai/tools/crawler_tools.py +++ b/src/bandai/tools/crawler_tools.py @@ -1,22 +1,20 @@ from __future__ import annotations -import json import logging -import random import asyncio from pathlib import Path -from typing import Type, Literal, Optional +from typing import Optional from crewai.tools import BaseTool # type: ignore -from pydantic import BaseModel, HttpUrl, Field, field_validator +from pydantic import BaseModel, HttpUrl, Field from playwright.async_api import async_playwright, Page, TimeoutError as PlaywrightTimeout from bandai.config import BANDI_PORTALS from bandai.config.portals import ( PortalConfig, - SelectorIdentifier, ActionDescription, + SelectorIdentifier, DiscoveryProcess, ExtractionProcess ) from bandai.utils import html_to_markdown @@ -24,139 +22,15 @@ log = logging.getLogger(__name__) -# Input Schemas - - -class CrawlerInput(BaseModel): - portal_name: str = Field(..., description="Human-readable name of the portal to crawl.") - base_url: str = Field(..., description="Entry URL of the portal to crawl.") - keywords: list[str] = Field(..., description="List of keywords to search for in the portal.") - max_results: int = Field(10, description="Maximum number of contracts to return.") - - @field_validator("keywords", mode="before") - @classmethod - def parse_keywords(cls, v): - if isinstance(v, str): - try: - return 
json.loads(v) - except json.JSONDecodeError: - return [v] - return v - - -class ContractLookupInput(BaseModel): - contract_id: str = Field(..., description="Unique identifier of the contract to look up.") - portal_url: str = Field(..., description="Portal URL where the contract was found.") - - class DocumentGeneratorInput(BaseModel): title: str = Field(..., description="Title of the document to generate.") sections: dict[str, str] = Field(..., description="section_title to markdown_content mapping.") output_path: str = Field("output/proposal_output.md", description="File path for the generated document.") -class ComplianceInput(BaseModel): - tender_raw_text: str = Field(..., description="Raw text of the tender document to analyze.") - company_profile_json: str = Field(..., description="JSON string containing the company profile.") - - # Tools -class TenderCrawlerTool(BaseTool): - """Crawls a procurement portal and returns matching tender notices as JSON.""" - - name: str = "TenderCrawlerTool" - description: str = ( - "Crawls a given procurement portal and returns all matching tender notices " - "as a JSON array. Input: portal_name, base_url, keywords, " - "optional max_results (default 10). Output: JSON array of tender notices " - "with metadata and raw text." - ) - args_schema: Type[BaseModel] = CrawlerInput - - def _run(self, portal_name: str, base_url: str, keywords: list[str], max_results: int = 10) -> str: - log.info("TenderCrawlerTool: crawling %s (max_results=%d)", portal_name, max_results) - - # TODO: Implement actual crawling logic with already CrewAI tools like SeleniumTool or HTTPTool, and parse results into structured JSON. - # This stub returns mock data so the pipeline can be tested end-to-end - # before the real crawler is implemented. 
- mock_response = [ - { - "portal": portal_name, - "url": f"{base_url}/tender/{i}", - "title": f"Mock Tender {i} with keywords {', '.join(keywords)}", - "contract_id": f"tender-{random.randint(1000, 9999)}{i}", - "contracting_authority": f"Authority {i}", - "deadline": f"2024-0{(i % 9) + 1}-31", - "value_eur": float(random.randint(10000, 1000000)), - "cpv_codes": [], - "raw_text": f"This is the raw text of mock tender {i}, containing keywords {', '.join(keywords)}.", - } - for i in range(1, max_results + 1) - ] - return json.dumps(mock_response, ensure_ascii=False, indent=2) - - -class ContractDetailTool(BaseTool): - """Fetches full metadata of a single tender by Contract ID from the ANAC open API.""" - - name: str = "ContractDetailTool" - description: str = ( - "Given a contract ID and a portal URL, fetches the complete tender notice " - "including annexes and administrative documents. " - "Input: contract_id, portal_url. Output: JSON with all available metadata " - "and a completeness score." - ) - args_schema: Type[BaseModel] = ContractLookupInput - - def _run(self, contract_id: str, portal_url: str) -> str: - log.info("ContractDetailTool: looking up contract %s from %s", contract_id, portal_url) - - # TODO: Implement actual contract detail lookup - # (GET https://dati.anticorruzione.it/opendata/dataset/.../contract-id/{contract_id}). 
- mock_response = { - "contract_id": contract_id, - "source_url": portal_url, - "completeness_score": round(random.uniform(0.5, 1.0), 2), - "has_technical_spec": random.choice([True, False]), - "has_admin_clauses": random.choice([True, False]), - "has_award_criteria": random.choice([True, False]), - } - return json.dumps(mock_response, ensure_ascii=False, indent=2) - - -class ComplianceCheckerTool(BaseTool): - """Cross-references company certifications and financials against tender requirements.""" - - name: str = "ComplianceCheckerTool" - description: str = ( - "Given a tender's raw text and the company profile JSON, returns a " - "structured gap analysis: requirements met, missing, and uncertain. " - "Input: tender_raw_text, company_profile_json. Output: JSON with lists " - "of met, missing, and uncertain requirements." - ) - args_schema: Type[BaseModel] = ComplianceInput - - def _run(self, tender_raw_text: str, company_profile_json: str) -> str: - log.info("ComplianceCheckerTool: analyzing compliance for tender (%d chars)", len(tender_raw_text)) - - # TODO: Implement actual compliance checking logic (NLP extraction). - analysis = { - "met": [ - "Requirement A: Met", - "Requirement B: Met", - "Experience in past PA: 2 contracts in the last 3 years", - ], - "missing": ["Requirement C: Missing", "Certification X: Not held"], - "uncertain": [ - "Requirement D: Uncertain", - "Financial Stability: Needs Review", - ], - } - return json.dumps(analysis, ensure_ascii=False, indent=2) - - class ProposalWriterTool(BaseTool): """Writes a Markdown proposal document from structured sections to disk.""" @@ -167,7 +41,7 @@ class ProposalWriterTool(BaseTool): "specified path. Input: title, sections (dict), output_path. " "Output: Confirmation message with the file path." 
) - args_schema: Type[BaseModel] = DocumentGeneratorInput + args_schema: type[BaseModel] = DocumentGeneratorInput def _run(self, title: str, sections: dict[str, str], output_path: str) -> str: log.info("ProposalWriterTool: writing proposal to %s", output_path) From 3c9ac6d3bbf2d0f8fd2f1616523d4b6e2058bf07 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:36:42 +0200 Subject: [PATCH 39/52] feat: better contracts parsing in flow --- src/bandai/flow.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/bandai/flow.py b/src/bandai/flow.py index 0d373e6..b8df047 100644 --- a/src/bandai/flow.py +++ b/src/bandai/flow.py @@ -17,6 +17,14 @@ log = logging.getLogger("bandai.flow") + +def _normalize_resolved_contract_dict(contract: dict) -> dict: + normalized = dict(contract) + authority = normalized.get("contracting_authority") + if isinstance(authority, dict): + normalized["contracting_authority"] = authority.get("name") or "Unknown authority" + return normalized + # Human Input Callback HumanInputFn = Callable[ @@ -125,7 +133,10 @@ def run_scouting(self) -> None: raw = final_task.output.raw log.info("Raw scout output:\n%s", raw) try: - parsed = extract_json_array(raw) + parsed = [ + _normalize_resolved_contract_dict(contract) + for contract in extract_json_array(raw) + ] resolved = TypeAdapter(list[ResolvedContract]).validate_python(parsed) contracts = [contract.model_dump() for contract in resolved] except Exception: From 90a2cbf2b16371821cf3ab496108812161dfb554 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:37:06 +0200 Subject: [PATCH 40/52] feat: better json array validation in guardrails --- src/bandai/guardrails.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/bandai/guardrails.py b/src/bandai/guardrails.py index 4eefbb3..a34586f 100644 --- 
a/src/bandai/guardrails.py +++ b/src/bandai/guardrails.py @@ -10,6 +10,18 @@ log = logging.getLogger(__name__) +_PLACEHOLDER_VALUES = {"string", "code1", "code2", "code3", "code4", "code5", "code6"} + + +def _contains_placeholder(value) -> bool: + if isinstance(value, str): + return value.strip().lower() in _PLACEHOLDER_VALUES + if isinstance(value, list): + return any(_contains_placeholder(item) for item in value) + if isinstance(value, dict): + return any(_contains_placeholder(item) for item in value.values()) + return False + # JSON Array Validation @@ -124,10 +136,24 @@ def validate_resolved_contract_array( if not isinstance(contract.get("cpv_codes"), list): return (False, f"Item {index} field cpv_codes must be a list, never null.") + if isinstance(contract.get("contracting_authority"), dict): + return ( + False, + f"Item {index} field contracting_authority must be a string, not an object. " + "Use contracting_authority.name from TenderInfo.", + ) + if not isinstance(contract.get("contracting_authority"), str): + return (False, f"Item {index} field contracting_authority must be a string.") if not isinstance(contract.get("sources"), list) or not contract["sources"]: return (False, f"Item {index} field sources must be a non-empty list.") if not isinstance(contract.get("consensus_score"), (int, float)): return (False, f"Item {index} field consensus_score must be a number.") + if _contains_placeholder(contract): + return ( + False, + f"Item {index} contains placeholder/example values such as 'string' or 'code1'. 
" + "Use only real values extracted from TenderInfo context; if unavailable use the configured fallback values.", + ) return (True, payload) From 8c3e5a895542f67b9e0c0f335345adf1db21823e Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:37:26 +0200 Subject: [PATCH 41/52] feat: add contract loading from output --- src/bandai/io.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/bandai/io.py b/src/bandai/io.py index 5d2041b..1d26c65 100644 --- a/src/bandai/io.py +++ b/src/bandai/io.py @@ -3,11 +3,11 @@ import json import logging from pathlib import Path - -from bandai.utils import PROJECT_ROOT +from typing import Any log = logging.getLogger(__name__) +PROJECT_ROOT: Path = Path(__file__).resolve().parents[2] OUTPUT_DIR: Path = PROJECT_ROOT / "output" # Ensure the output directory exists at import time. @@ -23,3 +23,30 @@ def save_json(data: dict, filename: str) -> Path: ) log.info("Saved: %s", path) return path + + +def load_contract_from_outputs(contract_id: str, output_dir: Path | None = None) -> dict[str, Any] | None: + """Return a previously discovered contract from output artifacts.""" + search_dir = output_dir or OUTPUT_DIR + scout_path = search_dir / "01_scout_results.json" + + if not scout_path.exists(): + return None + + try: + data = json.loads(scout_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + log.warning("Could not parse scout output file: %s", scout_path) + return None + + contracts = data.get("contracts", []) + if not isinstance(contracts, list): + return None + + for contract in contracts: + if not isinstance(contract, dict): + continue + if contract.get("canonical_contract_id") == contract_id or contract.get("contract_id") == contract_id: + return contract + + return None From 092ce64b27bff5ef03ff2856e8dc01e06b86bd7c Mon Sep 17 00:00:00 2001 From: Alessandro Sidero 
<75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 17:38:01 +0200 Subject: [PATCH 42/52] feat: update main with contract loading --- src/bandai/main.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/bandai/main.py b/src/bandai/main.py index 949a101..f786941 100644 --- a/src/bandai/main.py +++ b/src/bandai/main.py @@ -10,6 +10,7 @@ from bandai.config import validate_config, get_active_provider, validate_portals from bandai.flow import BandAIFlow, BandAIState +from bandai.io import OUTPUT_DIR, load_contract_from_outputs logging.basicConfig( level=logging.INFO, @@ -70,21 +71,6 @@ def _startup_validation() -> None: log.info("Provider: %s (%s)", provider.name, provider.description) -def _build_stub_contract(contract_id: str) -> dict: - """Build a stub contract dict matching the ResolvedContract schema.""" - return { - "canonical_contract_id": contract_id, - "title": f"Contratto {contract_id} (manuale)", - "contracting_authority": "Da capitolato", - "deadline": "Da capitolato", - "value_eur": 0, - "cpv_codes": [], - "canonical_url": f"https://www.anticorruzione.it/contract/{contract_id}", - "sources": ["manual"], - "consensus_score": 1.0, - } - - def run() -> None: """Run the BandAI procurement pipeline via CrewAI Flow.""" args = _parse_args() @@ -107,7 +93,15 @@ def run() -> None: state = BandAIState(mode=args.mode) if args.mode == "propose": - state.contracts = [_build_stub_contract(args.contract)] + contract = load_contract_from_outputs(args.contract) + if contract is None: + log.error( + "Contract %s not found in %s. 
Run `bandai --mode scout` or `bandai --mode full` first.", + args.contract, + OUTPUT_DIR, + ) + sys.exit(1) + state.contracts = [contract] flow = BandAIFlow() flow.kickoff(inputs=state.model_dump()) From 2772b265c7432f0661786663f5482eb45eb425b8 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 18:08:03 +0200 Subject: [PATCH 43/52] fix: update implicit no go detection --- src/bandai/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bandai/utils.py b/src/bandai/utils.py index cf0f7a2..57bec13 100644 --- a/src/bandai/utils.py +++ b/src/bandai/utils.py @@ -148,7 +148,7 @@ def extract_json_array(raw: str) -> list: def is_implicit_no_go(text: str) -> bool: """Fast keyword check for abandonment language (pre-LLM, zero cost).""" lower = text.lower() - return any(kw in lower for kw in NO_GO_KEYWORDS) + return any(re.search(rf"(? Date: Wed, 27 May 2026 18:08:26 +0200 Subject: [PATCH 44/52] docs: update documentation --- docs/README.md | 2 +- docs/architecture/crews.md | 20 ++++---- docs/architecture/main_pipeline.md | 2 +- docs/architecture/tools.md | 82 ++++++++---------------------- docs/guides/customization.md | 2 +- docs/guides/getting_started.md | 2 +- docs/reference/cli_reference.md | 4 +- 7 files changed, 37 insertions(+), 77 deletions(-) diff --git a/docs/README.md b/docs/README.md index b2000a1..0cc07c8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,7 +20,7 @@ How the system is built. Start here for the structural overview. 
| [main_pipeline.md](architecture/main_pipeline.md) | Flow state machine, execution modes (`full`/`scout`/`propose`), entry points, human interaction points, output files, logging | | [crews.md](architecture/crews.md) | ScoutCrew, ComplianceCrew, ProposalCrew - agents, task chains, build signatures, shared config | | [models.md](architecture/models.md) | All 11 Pydantic models: knowledge models, scouting models, compliance models, proposal models | -| [tools.md](architecture/tools.md) | 4 custom CrewAI tools: TenderCrawlerTool, ContractDetailTool, ComplianceCheckerTool, ProposalWriterTool | +| [tools.md](architecture/tools.md) | Runtime CrewAI tools: Playwright discovery/detail loaders and ProposalWriterTool | | [configuration.md](architecture/configuration.md) | Environment variables, LLM providers, portal YAML, startup validation, NO-GO keywords | ## Guides diff --git a/docs/architecture/crews.md b/docs/architecture/crews.md index edf27e5..2bad826 100644 --- a/docs/architecture/crews.md +++ b/docs/architecture/crews.md @@ -7,25 +7,27 @@ BandAI has three crews, they all use `memory=get_memory()` for cross-session lea **File:** `crews/scout_crew.py` **Config:** `config/agents_scout.yaml`, `config/tasks_scout.yaml` -Discovers and deduplicates Italian public tenders across multiple procurement portals. The number of crawler agents is dynamic - one per portal defined in `config/portals.yaml`. +Discovers and deduplicates Italian public tenders across configured procurement portals. For each portal, Scout builds a discovery agent and an extraction agent. 
### Agents | Agent | LLM | Tools | Notes | | ------------------------ | ------- | -------------------------------- | ------------------------------ | -| Crawler Agent (×N) | fast | TenderCrawlerTool, ContractDetailTool | One per portal, async | -| Resolution Agent | main | ContractDetailTool | `inject_date=True` | +| Discovery Agent (×N) | fast | TendersOverviewExtractorTool | One per portal, async | +| Extraction Agent (×N) | fast | SinglePageLoaderTool | One per portal, reads discovery context | +| Resolution Agent | main | - | `inject_date=True` | | Preference Filter | main | - | `human_input=True` | ### Task Chain ```text -crawl_task (×N, async) -> resolution_task -> preference_filter_task +discovery_task (×N, async) -> extraction_task (×N) -> resolution_task -> preference_filter_task ``` -1. **crawl_task** - Each crawler searches one portal for tenders matching Italian ICT/cloud CPV codes. Returns `RawContract` JSON arrays. -2. **resolution_task** - Deduplicates across portals using Contract ID matching and +/-5% value tolerance. Applies weighted consensus polling using portal reliability scores. Returns `ResolvedContract` JSON array ranked by `value_eur` descending. Guardrail: output must be a raw JSON array starting with `[`. -3. **preference_filter_task** - Filters and re-ranks based on user preferences. Adds `fit_reason` to each entry. Guardrail: strips markdown fences, validates JSON array format. +1. **discovery_task** - Uses Playwright to collect listing-page Markdown and returns `TenderOverview` JSON arrays. +2. **extraction_task** - Loads tender detail pages and returns `TenderInfo` JSON arrays. +3. **resolution_task** - Maps `TenderInfo` records to `ResolvedContract`, deduplicates, and applies portal reliability weights. Guardrail rejects null required fields and placeholder values. +4. **preference_filter_task** - Filters and re-ranks based on user preferences. Adds `fit_reason` to each entry. 
### Build Signature @@ -50,8 +52,8 @@ Runs a structured advocate/auditor debate to produce a bid verdict for a single | Agent | LLM | Tools | Notes | |--------------------|-------|-----------------------|--------------------------------| -| Advocate | main | ComplianceCheckerTool | Optimistic bid manager | -| Auditor | main | ComplianceCheckerTool | Former ANAC inspector | +| Advocate | main | - | Optimistic bid manager | +| Auditor | main | - | Former ANAC inspector | | Compliance Officer | main | - | `reasoning=True` for synthesis | ### Task Chain (Initial) diff --git a/docs/architecture/main_pipeline.md b/docs/architecture/main_pipeline.md index 656ef48..a5771b8 100644 --- a/docs/architecture/main_pipeline.md +++ b/docs/architecture/main_pipeline.md @@ -16,7 +16,7 @@ Mode is set via `--mode` flag or by calling `BandAIState(mode=...)`. `main.py` is a thin wrapper. It validates configuration (provider, API key, knowledge file, portals), parses CLI args, prepares the initial `BandAIState`, then delegates to `BandAIFlow.kickoff(inputs=state.model_dump())`. -For `--mode propose`, the wrapper injects a stub contract into the state before kickoff so the flow can start directly from compliance without running scouting. +For `--mode propose`, the wrapper loads the selected contract from `output/01_scout_results.json` before kickoff so the flow can start directly from compliance without running scouting. ```text main.py -> _startup_validation() -> BandAIState(...) -> BandAIFlow().kickoff(inputs=state.model_dump()) diff --git a/docs/architecture/tools.md b/docs/architecture/tools.md index 88ec2e8..5c34aab 100644 --- a/docs/architecture/tools.md +++ b/docs/architecture/tools.md @@ -1,63 +1,34 @@ # Tools -Four custom CrewAI tools, all in `tools/crawler_tools.py`. All extend `BaseTool` and use Pydantic input schemas. +Runtime CrewAI tools live in `tools/crawler_tools.py`. All extend `BaseTool` and use Pydantic input schemas. 
-## TenderCrawlerTool +## TendersOverviewExtractorTool -Crawls a procurement portal for matching tenders. +Navigates a configured tender listing portal with Playwright and returns list-page content as Markdown for the Discovery Agent to parse into `TenderOverview` objects. -**Input:** `CrawlerInput` +**Input:** `TendersOverviewExtractorInput` ```text -portal_name: str - Human-readable portal name -base_url: str - Portal entry URL -keywords: list[str] - Search terms (accepts string or JSON array) -max_results: int - Limit, default 10 +portal_name: str +tenders_count: int ``` -**Output:** JSON array of tender notices with keys: `portal`, `url`, `title`, `contract_id`, `contracting_authority`, `deadline`, `value_euros`, `raw_text`. - -**Status:** Stub. Returns mock data with randomized values. The real implementation needs `httpx` + `BeautifulSoup` for portal-specific scraping. The mock is sufficient for end-to-end pipeline testing. - -**Used by:** Crawler Agent (Scout crew, one instance per portal). +**Used by:** Discovery Agent in `ScoutCrew`. --- -## ContractDetailTool +## SinglePageLoaderTool -Fetches full metadata for a single tender by Contract ID. +Loads a single tender detail page with Playwright and returns cleaned Markdown for the Extraction Agent to parse into `TenderInfo` objects. -**Input:** `ContractLookupInput` +**Input:** `SinglePageLoaderInput` ```text -contract_id: str - Unique tender identifier -portal_url: str - Source portal URL +portal_name: str +url: str ``` -**Output:** JSON with `contract_id`, `source_url`, `completeness_score`, and boolean flags for `has_technical_spec`, `has_admin_clauses`, `has_award_criteria`. - -**Status:** Stub. The real implementation should hit the ANAC open data API at `https://dati.anticorruzione.it/opendata/`. - -**Used by:** Crawler Agent and Resolution Agent (Scout crew). - ---- - -## ComplianceCheckerTool - -Cross-references company profile against tender requirements. 
- -**Input:** `ComplianceInput` - -```text -tender_raw_text: str - Full tender text -company_profile_json: str - Company profile as JSON string -``` - -**Output:** JSON with three lists: `met`, `missing`, `uncertain`. - -**Status:** Stub. Returns hardcoded example analysis. The real implementation needs NLP extraction to map tender requirements to company certifications, turnover thresholds, and past contract history. - -**Used by:** Advocate and Auditor agents (Compliance crew). +**Used by:** Extraction Agent in `ScoutCrew`. --- @@ -68,33 +39,20 @@ Writes a Markdown proposal document to disk. **Input:** `DocumentGeneratorInput` ```text -title: str - Document title -sections: dict[str, str] - Section name -> Markdown content -output_path: str - File path, default "output/proposal_output.md" +title: str +sections: dict[str, str] +output_path: str ``` **Output:** Confirmation string with the file path, or an error message. -**Behavior:** Creates parent directories automatically (`mkdir(parents=True)`). Overwrites existing files. Writes UTF-8 encoded Markdown. - -**Status:** Functional. This is the only tool with a real implementation. - -**Used by:** Proposal Architect (Proposal crew). - ---- - -## Input Schema Details - -All schemas use Pydantic `Field` descriptions for LLM-readable documentation: +**Behavior:** Creates parent directories automatically and writes UTF-8 Markdown. -- `CrawlerInput` - `keywords` field has a `@field_validator` that accepts either a JSON string or a raw string, converting single values to `["value"]`. -- `ContractLookupInput` - Straightforward key-value lookup. -- `DocumentGeneratorInput` - Default output path is `output/proposal_output.md`. -- `ComplianceInput` - Expects the full company profile as a serialized JSON string, which the agent constructs from knowledge. +**Used by:** Proposal Architect in `ProposalCrew`. ## Adding a New Tool 1. Define a Pydantic input schema in `tools/crawler_tools.py`. 2. 
Create a class extending `BaseTool` with `name`, `description`, `args_schema`, and `_run()`. -3. Import and assign to the relevant agent via the `tools=[]` parameter. -4. Add tests in `tests/test_config.py` or a new `tests/test_tools.py`. +3. Assign it to the relevant agent only when it provides real runtime behavior. +4. Add focused tests for the schema and behavior. diff --git a/docs/guides/customization.md b/docs/guides/customization.md index 22b50c3..f2b61d7 100644 --- a/docs/guides/customization.md +++ b/docs/guides/customization.md @@ -142,7 +142,7 @@ class RAGSearchTool(BaseTool): ```python agent = Agent( ..., - tools=[RAGSearchTool(), ContractDetailTool()], + tools=[RAGSearchTool()], ) ``` diff --git a/docs/guides/getting_started.md b/docs/guides/getting_started.md index 1f7255b..17d06ac 100644 --- a/docs/guides/getting_started.md +++ b/docs/guides/getting_started.md @@ -82,7 +82,7 @@ Discovers tenders and prints results. No compliance or proposal generation. bandai --mode propose --contract GD-2026-00123 ``` -Skips scouting. Creates a stub contract with the given ID and runs compliance + proposal phases. +Skips scouting. Loads the contract with the given ID from `output/01_scout_results.json` and runs compliance + proposal phases. ### Dry run diff --git a/docs/reference/cli_reference.md b/docs/reference/cli_reference.md index bb0652a..9e0d5c1 100644 --- a/docs/reference/cli_reference.md +++ b/docs/reference/cli_reference.md @@ -24,13 +24,13 @@ bandai --mode scout ### `bandai --mode propose --contract ` -Propose mode for a known contract. Skips scouting, creates a stub contract entry, and runs Compliance → Proposal. +Propose mode for a known contract. Skips scouting, loads the contract from `output/01_scout_results.json`, and runs Compliance → Proposal. ```bash bandai --mode propose --contract GD-2026-00123 ``` -`--mode propose` requires `--contract`. Without it, exits with an error. +`--mode propose` requires `--contract`. 
Without it, or if the contract ID is not present in the previous scout output, exits with an error. ### `bandai --dry-run` From 6120060da4b432a586b305ff53a835b60caf42b2 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 18:08:49 +0200 Subject: [PATCH 45/52] test: add new tests --- tests/test_config.py | 32 ++++++++++++++ tests/test_guardrails.py | 96 +++++++++++++++++++++++++++++++++++++++- tests/test_utils.py | 28 ++++++++++++ 3 files changed, 155 insertions(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 5a0c04f..4614af7 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import json import sys import tempfile import types @@ -602,6 +603,37 @@ def test_save_json_nested(self) -> None: assert path.exists() path.unlink() + def test_load_contract_from_outputs_reads_scout_results(self, tmp_path: Path) -> None: + from bandai.io import load_contract_from_outputs + + contract = { + "canonical_contract_id": "363340-2026", + "title": "Italy - Research services", + "contracting_authority": "European Commission", + "deadline": "2026-07-03T12:00:59", + "value_eur": 1100000.0, + "cpv_codes": ["73110000"], + "sources": ["TED"], + "consensus_score": 0.9, + "canonical_url": "https://ted.europa.eu/en/notice/-/detail/363340-2026", + } + (tmp_path / "01_scout_results.json").write_text( + json.dumps({"contracts": [contract]}), + encoding="utf-8", + ) + + assert load_contract_from_outputs("363340-2026", output_dir=tmp_path) == contract + + def test_load_contract_from_outputs_returns_none_when_missing(self, tmp_path: Path) -> None: + from bandai.io import load_contract_from_outputs + + (tmp_path / "01_scout_results.json").write_text( + json.dumps({"contracts": []}), + encoding="utf-8", + ) + + assert load_contract_from_outputs("missing", output_dir=tmp_path) is None + # Reload portals tests diff --git 
a/tests/test_guardrails.py b/tests/test_guardrails.py index 0e555c0..116f320 100644 --- a/tests/test_guardrails.py +++ b/tests/test_guardrails.py @@ -12,7 +12,12 @@ if not hasattr(sys.modules["crewai"], "TaskOutput"): sys.modules["crewai"].TaskOutput = MagicMock(name="TaskOutput") -from bandai.guardrails import validate_json_array, validate_compliance_verdict # noqa: E402 +from bandai.guardrails import ( # noqa: E402 + validate_compliance_verdict, + validate_json_array, + validate_resolved_contract_array, + validate_tender_info_array, +) def _make_task_output(raw: str) -> MagicMock: @@ -83,6 +88,30 @@ def test_strip_fences_returns_clean_json(self) -> None: assert ok is True assert "```" not in msg + def test_nested_array_is_not_truncated(self) -> None: + result = _make_task_output( + 'Final answer:\n[{"canonical_contract_id": "001", "cpv_codes": ["72000000"]}]' + ) + ok, msg = validate_json_array(result) + assert ok is True + assert msg == '[{"canonical_contract_id": "001", "cpv_codes": ["72000000"]}]' + + def test_known_wrapper_object_is_normalized_to_array(self) -> None: + result = _make_task_output( + '{"tenders": [{"title": "Tender", "cpv_codes": ["72000000"]}], "portal": "TED"}' + ) + ok, msg = validate_json_array(result) + assert ok is True + assert msg == '[{"title": "Tender", "cpv_codes": ["72000000"]}]' + + def test_tool_call_envelope_is_rejected_with_specific_message(self) -> None: + result = _make_task_output( + '{"name": "parse_markdown", "parameters": {"markdown": "## CIG 123"}}' + ) + ok, msg = validate_json_array(result) + assert ok is False + assert "Do not return tool-call JSON" in msg + class TestValidateComplianceVerdict: """Tests for validate_compliance_verdict().""" @@ -146,3 +175,68 @@ def test_unparseable_output(self) -> None: ok, msg = validate_compliance_verdict(out) assert ok is False assert "Could not parse" in msg + + +class TestValidateResolvedContractArray: + """Tests for validate_resolved_contract_array().""" + + def 
test_rejects_null_required_fields(self) -> None: + result = _make_task_output( + '[{"canonical_contract_id":"66516400","title":null,' + '"contracting_authority":null,"deadline":null,"value_eur":null,' + '"cpv_codes":null,"sources":[],"consensus_score":null,"canonical_url":""}]' + ) + ok, msg = validate_resolved_contract_array(result) + assert ok is False + assert "ResolvedContract fields cannot be null" in msg + + def test_accepts_valid_resolved_contract(self) -> None: + result = _make_task_output( + '[{"canonical_contract_id":"66516400","title":"Tender",' + '"contracting_authority":"BandAI","deadline":"2026-06-24T23:59:00",' + '"value_eur":1500000.0,"cpv_codes":["72000000"],' + '"sources":["TED","https://ted.europa.eu/notice"],' + '"consensus_score":0.9,"canonical_url":"https://ted.europa.eu/notice"}]' + ) + ok, msg = validate_resolved_contract_array(result) + assert ok is True + + def test_rejects_placeholder_values(self) -> None: + result = _make_task_output( + '[{"canonical_contract_id":"TED:123","title":"string",' + '"contracting_authority":{"name":"string","tax_code":"string"},' + '"deadline":"2023-02-01T23:59:59Z","value_eur":null,' + '"cpv_codes":["code1","code2"],"sources":["TED"],' + '"consensus_score":0.9,"canonical_url":"https://ted.europa.eu/notice"}]' + ) + ok, msg = validate_resolved_contract_array(result) + assert ok is False + assert "placeholder" in msg + + +class TestValidateTenderInfoArray: + """Tests for validate_tender_info_array().""" + + def test_rejects_prose_output(self) -> None: + result = _make_task_output("We have loaded all three tenders. 
Now we need to extract fields.") + ok, msg = validate_tender_info_array(result) + assert ok is False + assert "Do not describe your extraction steps" in msg + + def test_does_not_treat_nested_cpv_as_top_level_payload(self) -> None: + result = _make_task_output('Partial prose before malformed output {"cpv": ["66516400"]}') + ok, msg = validate_tender_info_array(result) + assert ok is False + assert "properly formatted JSON array" in msg + + def test_accepts_valid_tender_info(self) -> None: + result = _make_task_output( + '[{"cig":null,"cup":null,"title":"Tender","url":"https://ted.europa.eu/notice",' + '"cpv":["66516400"],"contract_type":"Services",' + '"contracting_authority":{"name":"Cogeser S.p.A.","tax_code":"08317570151"},' + '"base_amount":520000.0,"max_amount":null,"publication_date":"2026-05-27",' + '"deadline":null,"contract_duration_months":null,"url_docs":[],' + '"portal":"TED","status":"partial","error":null}]' + ) + ok, msg = validate_tender_info_array(result) + assert ok is True diff --git a/tests/test_utils.py b/tests/test_utils.py index fed6a92..ca28045 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -43,6 +43,21 @@ def test_empty_array(self) -> None: result = extract_json_array("[]") assert result == [] + def test_array_with_nested_cpv_array(self) -> None: + raw = 'Final answer:\n[{"id": 1, "cpv_codes": ["72000000", "72212517"]}]' + result = extract_json_array(raw) + assert result == [{"id": 1, "cpv_codes": ["72000000", "72212517"]}] + + def test_known_wrapper_object_extracts_array(self) -> None: + raw = '{"tenders": [{"id": 1}], "portal": "TED", "status": "success"}' + result = extract_json_array(raw) + assert result == [{"id": 1}] + + def test_ignores_nested_scalar_arrays(self) -> None: + raw = 'Reasoning with a nested CPV only: {"cpv": ["66516400"]}' + with pytest.raises(ValueError, match="No JSON array"): + extract_json_array(raw) + class TestIsImplicitNoGo: """Tests for is_implicit_no_go().""" @@ -59,6 +74,8 @@ def 
test_english_no_go(self) -> None: def test_positive_intent(self) -> None: assert not is_implicit_no_go("We have all certifications") assert not is_implicit_no_go("Posiamo procedere con la documentazione") + assert not is_implicit_no_go("Confermiamo di possedere tutti i requisiti vincolanti") + assert not is_implicit_no_go("Abbiamo formalizzato e sottoscritto un contratto di avvalimento") def test_empty_string(self) -> None: assert not is_implicit_no_go("") @@ -94,6 +111,17 @@ def test_missing_fields(self) -> None: assert "Minimal" in summary assert "N/A" in summary + def test_none_value_and_cpv_codes(self) -> None: + c = { + "title": "Nullable Tender", + "value_eur": None, + "cpv_codes": None, + } + summary = contract_to_summary(c) + assert "Nullable Tender" in summary + assert "EUR 0" in summary + assert "CPV" in summary + class TestLoadYamlConfig: """Tests for load_yaml_config().""" From 4c54632f7fdd0b348c3f1f8265e98b4c23938579 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Wed, 27 May 2026 18:28:52 +0200 Subject: [PATCH 46/52] update flow logging --- src/bandai/flow.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/bandai/flow.py b/src/bandai/flow.py index b8df047..fe63d62 100644 --- a/src/bandai/flow.py +++ b/src/bandai/flow.py @@ -25,6 +25,7 @@ def _normalize_resolved_contract_dict(contract: dict) -> dict: normalized["contracting_authority"] = authority.get("name") or "Unknown authority" return normalized + # Human Input Callback HumanInputFn = Callable[ @@ -131,12 +132,9 @@ def run_scouting(self) -> None: built_crew.kickoff() raw = final_task.output.raw - log.info("Raw scout output:\n%s", raw) + log.debug("Raw scout output:\n%s", raw) try: - parsed = [ - _normalize_resolved_contract_dict(contract) - for contract in extract_json_array(raw) - ] + parsed = [_normalize_resolved_contract_dict(contract) for contract in extract_json_array(raw)] resolved = 
TypeAdapter(list[ResolvedContract]).validate_python(parsed) contracts = [contract.model_dump() for contract in resolved] except Exception: From aa5a893ccd774c2cf2beb80ce8b43cd16872b0bd Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Thu, 28 May 2026 11:13:01 +0200 Subject: [PATCH 47/52] feat: update pyproject.toml with new scripts --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 015a95c..5a371cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,13 +23,13 @@ dev = ["pytest>=8.0.0", "pytest-cov>=5.0.0"] [project.scripts] bandai = "bandai.main:run" -run_crew = "bandai.main:run" +bandai_report = "bandai.main:report" +kickoff = "bandai.main:kickoff" +plot = "bandai.main:plot" install_chromium = "bandai.main:install_chromium" train = "bandai.main:train" replay = "bandai.main:replay" test = "bandai.main:test" -run_with_trigger = "bandai.main:run_with_trigger" -kickoff = "bandai.main:run" pytest_unit = "bandai.main:run_pytest" [build-system] From c8b28052ef0047f72a37c791ace4cfd41b70ea7b Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Thu, 28 May 2026 11:13:32 +0200 Subject: [PATCH 48/52] feat: add html report generation --- src/bandai/main.py | 29 +++- src/bandai/report.py | 320 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 345 insertions(+), 4 deletions(-) create mode 100644 src/bandai/report.py diff --git a/src/bandai/main.py b/src/bandai/main.py index f786941..b822df4 100644 --- a/src/bandai/main.py +++ b/src/bandai/main.py @@ -11,6 +11,7 @@ from bandai.config import validate_config, get_active_provider, validate_portals from bandai.flow import BandAIFlow, BandAIState from bandai.io import OUTPUT_DIR, load_contract_from_outputs +from bandai.report import write_report logging.basicConfig( level=logging.INFO, @@ -24,7 +25,7 @@ def _parse_args() -> 
argparse.Namespace: p = argparse.ArgumentParser(description="BandAI - Italian SME Procurement Agent") p.add_argument( "--mode", - choices=["full", "scout", "propose"], + choices=["full", "scout", "propose", "report"], default="full", help="Pipeline mode (default: full)", ) @@ -75,6 +76,12 @@ def run() -> None: """Run the BandAI procurement pipeline via CrewAI Flow.""" args = _parse_args() + if args.mode == "report": + path = write_report() + log.info("Report generated: %s", path) + print(f"Report generated: {path}") + return + # Always validate config, even in dry-run _startup_validation() @@ -113,6 +120,18 @@ def run() -> None: raise +def kickoff() -> None: + """CrewAI Flow-compatible kickoff entry point.""" + run() + + +def plot() -> None: + """Generate the CrewAI flow visualization.""" + flow = BandAIFlow() + flow.plot("bandai_flow") + print("Flow plot generated: bandai_flow.html") + + def train() -> None: """Run CrewAI training through the CLI.""" _run_command(["crewai", "train", *sys.argv[1:]]) @@ -138,9 +157,11 @@ def install_chromium() -> None: _run_command([sys.executable, "-m", "playwright", "install", "chromium"]) -def run_with_trigger() -> None: - """Run the BandAI pipeline from an external trigger (webhook/API).""" - run() +def report() -> None: + """Generate the stakeholder HTML report from output JSON files.""" + path = write_report() + log.info("Report generated: %s", path) + print(f"Report generated: {path}") if __name__ == "__main__": diff --git a/src/bandai/report.py b/src/bandai/report.py new file mode 100644 index 0000000..b6f9f0a --- /dev/null +++ b/src/bandai/report.py @@ -0,0 +1,320 @@ +from __future__ import annotations + +import html +import json +from datetime import datetime +from pathlib import Path +from typing import Any + +from bandai.io import OUTPUT_DIR + + +def _read_json(path: Path) -> dict[str, Any]: + try: + data = json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return {} + return data if 
isinstance(data, dict) else {} + + +def _as_list(value: Any) -> list: + return value if isinstance(value, list) else [] + + +def _contract_id(contract: dict[str, Any]) -> str: + return str(contract.get("canonical_contract_id") or contract.get("contract_id") or "N/A") + + +def _decision_counts(compliance_items: list[dict[str, Any]], no_go_items: list[dict[str, Any]]) -> dict[str, int]: + counts = {"go": 0, "conditional": 0, "no_go": len(no_go_items)} + for item in compliance_items: + decision = str(item.get("verdict", {}).get("bid_decision", "")).upper() + if decision == "GO": + counts["go"] += 1 + elif decision == "CONDITIONAL-GO": + counts["conditional"] += 1 + elif decision == "NO-GO": + counts["no_go"] += 1 + return counts + + +def _approved_contract_ids(compliance_items: list[dict[str, Any]]) -> set[str]: + ids: set[str] = set() + for item in compliance_items: + decision = str(item.get("verdict", {}).get("bid_decision", "")).upper() + if decision not in {"GO", "CONDITIONAL-GO"}: + continue + contract = item.get("contract", {}) + if isinstance(contract, dict): + ids.add(_contract_id(contract)) + return ids + + +def build_report_data(output_dir: Path = OUTPUT_DIR) -> dict[str, Any]: + """Collect pipeline output JSON files into a report-friendly structure.""" + scout_data = _read_json(output_dir / "01_scout_results.json") + contracts = _as_list(scout_data.get("contracts")) + + compliance: list[dict[str, Any]] = [] + for path in sorted(output_dir.glob("02_compliance_*.json")): + data = _read_json(path) + if data: + data["source_file"] = path.name + compliance.append(data) + + no_go_data = _read_json(output_dir / "02_no_go_review_required.json") + no_go = _as_list(no_go_data.get("no_go_contracts")) + approved_ids = _approved_contract_ids(compliance) + no_go = [item for item in no_go if _contract_id(item.get("contract", {}) if isinstance(item, dict) else {}) not in approved_ids] + + proposals: list[dict[str, Any]] = [] + for path in 
sorted(output_dir.glob("03_proposal_*.json")): + data = _read_json(path) + if data: + data["source_file"] = path.name + proposals.append(data) + + counts = _decision_counts(compliance, no_go) + return { + "summary": { + "contracts": len(contracts), + "go": counts["go"], + "conditional": counts["conditional"], + "no_go": counts["no_go"], + "proposals": len(proposals), + }, + "contracts": contracts, + "compliance": compliance, + "no_go": no_go, + "proposals": proposals, + "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M"), + } + + +def _e(value: Any) -> str: + return html.escape("" if value is None else str(value)) + + +def _money(value: Any) -> str: + try: + return f"EUR {float(value):,.0f}" + except (TypeError, ValueError): + return "N/A" + + +def _score(value: Any) -> str: + try: + return f"{float(value) * 100:.0f}%" + except (TypeError, ValueError): + return "N/A" + + +def _badge(decision: Any) -> str: + text = str(decision or "N/A").upper() + css = {"GO": "go", "CONDITIONAL-GO": "conditional", "NO-GO": "no-go"}.get(text, "neutral") + return f'{_e(text)}' + + +def _list(items: Any) -> str: + values = _as_list(items) + if not values: + return '

Nessun elemento.

' + return "
    " + "".join(f"
  • {_e(item)}
  • " for item in values) + "
" + + +def _render_summary(summary: dict[str, Any]) -> str: + cards = [ + ("Contratti", summary.get("contracts", 0)), + ("GO", summary.get("go", 0)), + ("Conditional", summary.get("conditional", 0)), + ("NO-GO", summary.get("no_go", 0)), + ("Proposte", summary.get("proposals", 0)), + ] + return "".join(f'
{_e(label)}{_e(value)}
' for label, value in cards) + + +def _render_contracts(contracts: list[dict[str, Any]]) -> str: + if not contracts: + return '

Nessun contratto trovato in output/01_scout_results.json.

' + rows = [] + for contract in contracts: + cpv = ", ".join(str(code) for code in _as_list(contract.get("cpv_codes"))) or "N/A" + url = contract.get("canonical_url") + link = f'Apri' if url else "N/A" + rows.append( + "" + f"{_e(_contract_id(contract))}" + f"{_e(contract.get('title', 'N/A'))}" + f"{_e(contract.get('contracting_authority', 'N/A'))}" + f"{_money(contract.get('value_eur'))}" + f"{_e(contract.get('deadline', 'N/A'))}" + f"{_e(cpv)}" + f"{link}" + "" + ) + return ( + '
' + "" + "".join(rows) + "
IDTitoloEnteValoreScadenzaCPVURL
" + ) + + +def _render_compliance(items: list[dict[str, Any]]) -> str: + if not items: + return '

Nessun esito compliance trovato.

' + blocks = [] + for item in items: + contract = item.get("contract", {}) + verdict = item.get("verdict", {}) + blocks.append( + '
' + '
' + f"

{_e(contract.get('title', 'Contratto'))}

{_e(_contract_id(contract))}

" + f"
{_badge(verdict.get('bid_decision'))}{_score(verdict.get('compliance_score'))}
" + "
" + f"

{_e(verdict.get('verdict_rationale', ''))}

" + '
' + f"

Punti di forza

{_list(verdict.get('key_strengths'))}
" + f"

Rischi

{_list(verdict.get('key_risks'))}
" + "
" + "
" + ) + return "".join(blocks) + + +def _render_no_go(items: list[dict[str, Any]]) -> str: + if not items: + return '

Nessun NO-GO in revisione.

' + blocks = [] + for item in items: + contract = item.get("contract", {}) + verdict = item.get("verdict", {}) + blocks.append( + '
' + '
' + f"

{_e(contract.get('title', 'Contratto'))}

{_e(contract.get('contract_id', 'N/A'))}

" + f"
{_badge(verdict.get('bid_decision'))}{_score(verdict.get('compliance_score'))}
" + "
" + f"

{_e(verdict.get('verdict_rationale', ''))}

" + f"

Rischi principali

{_list(verdict.get('key_risks'))}" + "
" + ) + return "".join(blocks) + + +def _render_proposals(items: list[dict[str, Any]]) -> str: + if not items: + return '

Nessuna proposta generata.

' + blocks = [] + for item in items: + sections = item.get("sections", {}) + section_names = ", ".join(sections.keys()) if isinstance(sections, dict) else "N/A" + blocks.append( + '
' + '
' + f"

{_e(item.get('tender_ref', 'Proposta'))}

{_e(section_names)}

" + f"
Quality {_score(item.get('quality_score'))}{_e(item.get('word_count', 0))} parole
" + "
" + f"

{_e(item.get('executive_summary', ''))}

" + "
" + ) + return "".join(blocks) + + +def generate_report_html(data: dict[str, Any]) -> str: + """Render a standalone stakeholder-ready HTML report.""" + summary = data.get("summary", {}) + return f""" + + + + + BandAI Report + + + + +
+
+

BandAI Report

+

Generato il {_e(data.get('generated_at', ''))} dagli output della pipeline.

+
+ +
+
+
{_render_summary(summary)}
+

Contratti individuati

{_render_contracts(_as_list(data.get('contracts')))}
+

Esiti compliance

{_render_compliance(_as_list(data.get('compliance')))}
+

NO-GO da rivedere

{_render_no_go(_as_list(data.get('no_go')))}
+

Proposte generate

{_render_proposals(_as_list(data.get('proposals')))}
+
+ + + +""" + + +def write_report(output_dir: Path = OUTPUT_DIR, filename: str = "report.html") -> Path: + """Generate and write the HTML report into the output directory.""" + output_dir.mkdir(parents=True, exist_ok=True) + data = build_report_data(output_dir) + path = output_dir / filename + path.write_text(generate_report_html(data), encoding="utf-8") + return path From 795ff5511cb264c027484dc29b3671d0eed67a1b Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Thu, 28 May 2026 11:14:03 +0200 Subject: [PATCH 49/52] docs: update documentation with new report generation command --- README.md | 7 ++++--- docs/architecture/main_pipeline.md | 4 ++++ docs/guides/customization.md | 6 +++--- docs/guides/getting_started.md | 10 ++++++++++ docs/reference/cli_reference.md | 20 ++++++++++++++++---- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index f4b2be1..d771bae 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Quick links: | [Characters](docs/guides/characters.md) | Agent personas, roles, behavioral traits | | [Flow State Machine](docs/reference/flow_state_machine.md) | Every transition mapped with routing logic | | [Knowledge System](docs/reference/knowledge_system.md) | StringKnowledgeSource, chunking, RAG pipeline | -| [CLI Reference](docs/reference/cli_reference.md) | `bandai`, `bandai --dry-run`, `run_with_trigger`, CrewAI utilities | +| [CLI Reference](docs/reference/cli_reference.md) | `bandai`, `bandai --mode report`, `bandai --dry-run`, CrewAI utilities | | [Testing](docs/testing.md) | Test strategy, layers, commands, mocking, fixtures | --- @@ -163,12 +163,13 @@ bandai --mode scout # Propose for a known contract (skip scouting) bandai --mode propose --contract GD-2026-00123 +# Generate stakeholder HTML report from output/ +bandai --mode report + # Validate config without LLM calls bandai --dry-run ``` -The same runtime is also exposed as `run_with_trigger` for 
webhook or API-driven execution. - ### Tests ```bash diff --git a/docs/architecture/main_pipeline.md b/docs/architecture/main_pipeline.md index a5771b8..b8fcefb 100644 --- a/docs/architecture/main_pipeline.md +++ b/docs/architecture/main_pipeline.md @@ -9,6 +9,7 @@ BandAI runs a three-phase procurement pipeline orchestrated by a CrewAI Flow. Th | `full` | Scout -> Compliance -> Proposal | Preferences + conditional review | Complete end-to-end run | | `scout` | Scout only | Preferences | Quick opportunity discovery | | `propose` | Compliance -> Proposal | Conditional review | Single known contract, skip scouting | +| `report` | Report generation only | None | Stakeholder review of existing outputs | Mode is set via `--mode` flag or by calling `BandAIState(mode=...)`. @@ -18,6 +19,8 @@ Mode is set via `--mode` flag or by calling `BandAIState(mode=...)`. For `--mode propose`, the wrapper loads the selected contract from `output/01_scout_results.json` before kickoff so the flow can start directly from compliance without running scouting. +For `--mode report`, the wrapper skips CrewAI entirely and generates `output/report.html` from the JSON artifacts already present in `output/`. + ```text main.py -> _startup_validation() -> BandAIState(...) -> BandAIFlow().kickoff(inputs=state.model_dump()) ``` @@ -101,6 +104,7 @@ All artifacts land in `output/`: | `02_compliance_{nn}_{id}.json` | Individual compliance verdicts | | `02_no_go_review_required.json` | Consolidated NO-GO contracts | | `03_proposal_{nn}_{id}.json` | Final proposal per contract | +| `report.html` | Stakeholder-ready HTML summary | ## Logging diff --git a/docs/guides/customization.md b/docs/guides/customization.md index f2b61d7..c9a7aa1 100644 --- a/docs/guides/customization.md +++ b/docs/guides/customization.md @@ -16,9 +16,9 @@ The Scout crew automatically creates a Crawler Agent for each portal. 
No code ch **Reliability guidelines:** -- 0.90–1.00 - Official national/EU sources (ANAC, TED) -- 0.75–0.89 - Regional procurement platforms (MePA, Sardegna CAT) -- 0.50–0.74 - Municipal or niche portals +- 0.90-1.00 - Official national/EU sources (ANAC, TED) +- 0.75-0.89 - Regional procurement platforms (MePA, Sardegna CAT) +- 0.50-0.74 - Municipal or niche portals The Resolution Agent uses reliability as a weighting factor in consensus polling. A portal with 0.70 contributes less to the canonical record than one with 1.00. diff --git a/docs/guides/getting_started.md b/docs/guides/getting_started.md index 17d06ac..b480d2d 100644 --- a/docs/guides/getting_started.md +++ b/docs/guides/getting_started.md @@ -84,6 +84,14 @@ bandai --mode propose --contract GD-2026-00123 Skips scouting. Loads the contract with the given ID from `output/01_scout_results.json` and runs compliance + proposal phases. +### Report mode + +```bash +bandai --mode report +``` + +Generates `output/report.html` from the current output JSON files. Use it after `full`, `scout`, or `propose` runs to review results in a browser. 
+ ### Dry run ```bash @@ -102,6 +110,7 @@ output/ ├── 02_compliance_01_GD-2026-00123.json ├── 02_no_go_review_required.json ├── 03_proposal_01_GD-2026-00123.json +├── report.html └── proposal_output.md ``` @@ -122,6 +131,7 @@ uv run pytest_unit | `bandai` | Run full pipeline | | `bandai --mode scout` | Scout-only mode | | `bandai --mode propose --contract ID` | Propose mode for known contract | +| `bandai --mode report` | Generate stakeholder HTML report | | `bandai --dry-run` | Validate config without LLM calls | | `crewai test` | Run CrewAI evaluation (2 iterations) | | `crewai train -n 5` | Train crew with 5 iterations | diff --git a/docs/reference/cli_reference.md b/docs/reference/cli_reference.md index 9e0d5c1..bda8800 100644 --- a/docs/reference/cli_reference.md +++ b/docs/reference/cli_reference.md @@ -32,6 +32,18 @@ bandai --mode propose --contract GD-2026-00123 `--mode propose` requires `--contract`. Without it, or if the contract ID is not present in the previous scout output, exits with an error. +### `bandai --mode report` + +Generates a stakeholder-ready HTML report from the JSON files already present in `output/`. + +```bash +bandai --mode report +# or +bandai_report +``` + +Writes `output/report.html`. This mode does not call LLMs and does not require provider/API validation. + ### `bandai --dry-run` Validates configuration without making LLM calls. Checks: @@ -51,13 +63,13 @@ Exits 0 on success, 1 on any validation failure. The following commands are declared in `pyproject.toml` and point to the same runtime entry points used by the CLI: -### `run_crew` +### `kickoff` -Alias of `bandai`. +CrewAI Flow-compatible alias for `bandai`. -### `run_with_trigger` +### `plot` -Calls the same runtime as `bandai`. Use this when the pipeline is launched by an external trigger such as a webhook. +CrewAI Flow-compatible script used by `crewai flow plot`. 
### `pytest_unit` From 34a7d24f1ff73160a836c9a466dc0f2befd31710 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Thu, 28 May 2026 11:14:21 +0200 Subject: [PATCH 50/52] test: add report generation unit tests --- tests/test_report.py | 111 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 tests/test_report.py diff --git a/tests/test_report.py b/tests/test_report.py new file mode 100644 index 0000000..4d6ba42 --- /dev/null +++ b/tests/test_report.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from bandai.report import build_report_data, generate_report_html, write_report + + +def test_build_report_data_collects_pipeline_outputs(tmp_path: Path) -> None: + contract = { + "canonical_contract_id": "363340-2026", + "title": "Italy - Research services", + "contracting_authority": "European Commission", + "deadline": "2026-07-03T12:00:59", + "value_eur": 1100000.0, + "cpv_codes": ["73110000"], + "sources": ["TED"], + "consensus_score": 0.9, + "canonical_url": "https://ted.europa.eu/en/notice/-/detail/363340-2026", + } + verdict = { + "bid_decision": "GO", + "conditions": [], + "key_risks": [], + "key_strengths": ["Relevant research experience"], + "compliance_score": 0.95, + "legal_flags": ["D.Lgs. 
36/2023"], + "verdict_rationale": "Ready to bid.", + } + proposal = { + "tender_ref": "363340-2026", + "executive_summary": "Executive summary", + "sections": {"Approach": "Research approach"}, + "appendices": [], + "compliance_declarations": ["Declaration"], + "word_count": 1200, + "quality_score": 0.91, + } + + (tmp_path / "01_scout_results.json").write_text(json.dumps({"contracts": [contract]}), encoding="utf-8") + (tmp_path / "02_compliance_01_363340-2026.json").write_text( + json.dumps({"contract": contract, "verdict": verdict}), + encoding="utf-8", + ) + (tmp_path / "03_proposal_01_363340-2026.json").write_text(json.dumps(proposal), encoding="utf-8") + + data = build_report_data(tmp_path) + + assert data["summary"]["contracts"] == 1 + assert data["summary"]["go"] == 1 + assert data["summary"]["proposals"] == 1 + assert data["contracts"][0]["canonical_contract_id"] == "363340-2026" + + +def test_build_report_data_hides_stale_no_go_when_contract_is_approved(tmp_path: Path) -> None: + contract = {"canonical_contract_id": "363340-2026", "title": "Research services"} + verdict = {"bid_decision": "GO", "compliance_score": 0.95, "key_strengths": [], "key_risks": []} + stale_no_go = { + "contract": {"contract_id": "363340-2026", "title": "Research services"}, + "verdict": {"bid_decision": "NO-GO", "compliance_score": 0.55}, + } + + (tmp_path / "01_scout_results.json").write_text(json.dumps({"contracts": [contract]}), encoding="utf-8") + (tmp_path / "02_compliance_01_363340-2026.json").write_text( + json.dumps({"contract": contract, "verdict": verdict}), + encoding="utf-8", + ) + (tmp_path / "02_no_go_review_required.json").write_text( + json.dumps({"no_go_contracts": [stale_no_go], "total": 1}), + encoding="utf-8", + ) + + data = build_report_data(tmp_path) + + assert data["summary"]["go"] == 1 + assert data["summary"]["no_go"] == 0 + assert data["no_go"] == [] + + +def test_generate_report_html_contains_stakeholder_sections(tmp_path: Path) -> None: + data = { + 
"summary": {"contracts": 1, "go": 1, "conditional": 0, "no_go": 0, "proposals": 1}, + "contracts": [{"canonical_contract_id": "363340-2026", "title": "Research services"}], + "compliance": [ + { + "contract": {"canonical_contract_id": "363340-2026", "title": "Research services"}, + "verdict": {"bid_decision": "GO", "compliance_score": 0.95, "key_strengths": [], "key_risks": []}, + "source_file": "02_compliance_01_363340-2026.json", + } + ], + "no_go": [], + "proposals": [{"tender_ref": "363340-2026", "quality_score": 0.91, "word_count": 1200, "sections": {}}], + "generated_at": "2026-05-28 10:00", + } + + html = generate_report_html(data) + + assert "BandAI Report" in html + assert "Research services" in html + assert "theme-toggle" in html + assert "GO" in html + + +def test_write_report_creates_html_file(tmp_path: Path) -> None: + (tmp_path / "01_scout_results.json").write_text(json.dumps({"contracts": []}), encoding="utf-8") + + path = write_report(output_dir=tmp_path) + + assert path == tmp_path / "report.html" + assert path.exists() + assert "BandAI Report" in path.read_text(encoding="utf-8") From 6a9b66a3bbce92124fbea6ee96bdc1582483fb16 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Thu, 28 May 2026 11:15:00 +0200 Subject: [PATCH 51/52] fix: update guardrails and utils --- src/bandai/guardrails.py | 12 ++++++------ src/bandai/utils.py | 4 ++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/bandai/guardrails.py b/src/bandai/guardrails.py index a34586f..beb21d3 100644 --- a/src/bandai/guardrails.py +++ b/src/bandai/guardrails.py @@ -134,6 +134,12 @@ def validate_resolved_contract_array( "sources=[portal and/or url]; consensus_score=portal reliability weight.", ) + if _contains_placeholder(contract): + return ( + False, + f"Item {index} contains placeholder/example values such as 'string' or 'code1'. 
" + "Use only real values extracted from TenderInfo context; if unavailable use the configured fallback values.", + ) if not isinstance(contract.get("cpv_codes"), list): return (False, f"Item {index} field cpv_codes must be a list, never null.") if isinstance(contract.get("contracting_authority"), dict): @@ -148,12 +154,6 @@ def validate_resolved_contract_array( return (False, f"Item {index} field sources must be a non-empty list.") if not isinstance(contract.get("consensus_score"), (int, float)): return (False, f"Item {index} field consensus_score must be a number.") - if _contains_placeholder(contract): - return ( - False, - f"Item {index} contains placeholder/example values such as 'string' or 'code1'. " - "Use only real values extracted from TenderInfo context; if unavailable use the configured fallback values.", - ) return (True, payload) diff --git a/src/bandai/utils.py b/src/bandai/utils.py index 57bec13..f3b3c56 100644 --- a/src/bandai/utils.py +++ b/src/bandai/utils.py @@ -90,6 +90,7 @@ def _extract_array_from_wrapper(value: Any) -> list | None: def extract_json_array_text(raw: str) -> str: """Extract the first complete JSON array text from a raw LLM output.""" decoder = json.JSONDecoder() + stripped = raw.strip() for start, char in enumerate(raw): if char not in "[{": @@ -104,6 +105,9 @@ def extract_json_array_text(raw: str) -> str: if array is not None: return json.dumps(array, ensure_ascii=False) + if stripped.startswith(("[", "{")): + json.loads(stripped) + raise ValueError("No JSON array found in the input.") From b47540b180ccb571998becf05f61a38eec96cce8 Mon Sep 17 00:00:00 2001 From: Alessandro Sidero <75628365+Alessandro624@users.noreply.github.com> Date: Thu, 28 May 2026 11:15:14 +0200 Subject: [PATCH 52/52] feat: update flow with specific typing --- src/bandai/flow.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/bandai/flow.py b/src/bandai/flow.py index fe63d62..7e800eb 100644 --- 
a/src/bandai/flow.py +++ b/src/bandai/flow.py @@ -2,7 +2,7 @@ import logging from datetime import datetime -from typing import Callable +from typing import Callable, Literal from pydantic import BaseModel, Field, TypeAdapter from crewai.flow.flow import Flow, listen, router, start # type: ignore @@ -116,7 +116,7 @@ def begin(self) -> None: self.state.user_preferences = "No specific preferences - use standard CPV filters." @router(begin) - def route_from_begin(self) -> str: + def route_from_begin(self) -> Literal["scout", "skip_scout"]: """Decide next step after scouting.""" return "scout" if self.state.mode in ("full", "scout") else "skip_scout" @@ -138,8 +138,14 @@ def run_scouting(self) -> None: resolved = TypeAdapter(list[ResolvedContract]).validate_python(parsed) contracts = [contract.model_dump() for contract in resolved] except Exception: - log.error("Scout output was not parseable. Raw:\n%s", raw) - contracts = [] + pydantic_output = getattr(final_task.output, "pydantic", None) + try: + parsed = [_normalize_resolved_contract_dict(contract if isinstance(contract, dict) else contract.model_dump()) for contract in (pydantic_output or [])] + resolved = TypeAdapter(list[ResolvedContract]).validate_python(parsed) + contracts = [contract.model_dump() for contract in resolved] + except Exception: + log.error("Scout output was not parseable. 
Raw:\n%s", raw) + contracts = [] self.state.contracts = contracts self.state.total_contracts = len(contracts) @@ -150,7 +156,7 @@ def run_scouting(self) -> None: self.state.contracts = [] @router(run_scouting) - def route_after_scout(self) -> str: + def route_after_scout(self) -> Literal["end_scout", "start_compliance"]: """Decide next step after scouting.""" return "end_scout" if self.state.mode == "scout" else "start_compliance" @@ -169,7 +175,7 @@ def skip_to_compliance(self) -> None: pass @router(skip_to_compliance) - def route_skip(self) -> str: + def route_skip(self) -> Literal["start_compliance"]: return "start_compliance" # Phase 2: Compliance @@ -184,7 +190,7 @@ def init_compliance(self) -> None: self.state.current_contract_index = 0 @router(init_compliance) - def route_after_init(self) -> str: + def route_after_init(self) -> Literal["process_next_contract"]: return "process_next_contract" @listen("process_next_contract") @@ -221,7 +227,7 @@ def process_next_contract_func(self) -> None: log.info(" [%d/%d] %s", idx + 1, len(contracts), contract.get("title", "?")) @router(process_next_contract_func) - def route_process_contract(self) -> str: + def route_process_contract(self) -> Literal["compliance_done", "run_compliance_crew"]: """All done, or run compliance on the next contract?""" if self.state.current_contract_index >= len(self.state.contracts): return "compliance_done" @@ -251,7 +257,7 @@ def run_compliance_crew_func(self) -> None: self.state.current_verdict = None @router(run_compliance_crew_func) - def route_verdict(self) -> str: + def route_verdict(self) -> Literal["handle_conditional_go", "process_next_contract"]: """Route based on the compliance verdict.""" if self.state.current_verdict is None: self.state.current_contract_index += 1 @@ -361,7 +367,7 @@ def handle_conditional_go_func(self) -> None: self.state.current_contract_index += 1 @router(handle_conditional_go_func) - def route_after_conditional(self) -> str: + def 
route_after_conditional(self) -> Literal["handle_conditional_go", "process_next_contract"]: """Loop back for another review iteration, or move on.""" if self.state.current_verdict is not None: verdict = ComplianceVerdict(**self.state.current_verdict) @@ -381,7 +387,7 @@ def after_compliance(self) -> None: ) @router(after_compliance) - def route_after_compliance(self) -> str: + def route_after_compliance(self) -> Literal["start_proposals"]: return "start_proposals" # Phase 3: Proposals