diff --git a/data/all.txt b/data/all.txt new file mode 100644 index 0000000..29d012a --- /dev/null +++ b/data/all.txt @@ -0,0 +1,50 @@ +Aktualisierte Sanktionsmeldung: Taliban | Thu, 12 Mar 2026 14:28:55 +0100 | https://www.finma.ch/en/news/2026/03/20260312-sr-946-231-07/ +Aktualisierte Sanktionsmeldung: Islamische Republik Iran | Tue, 10 Mar 2026 16:28:45 +0100 | https://www.finma.ch/en/news/2026/03/20260310-sr-946-231-143-6/ +Aktualisierte Sanktionsmeldung: ISIL (Da'esh) / Al-Kaida | Tue, 03 Mar 2026 16:17:52 +0100 | https://www.finma.ch/en/news/2026/03/20260303-sr-946-231-08/ +FINMA proceedings: MBaer Merchant Bank AG in liquidation | Fri, 27 Feb 2026 15:00:00 +0100 | https://www.finma.ch/en/news/2026/02/20260227-mm-mbaer-liquidation/ +Measures at MBaer Merchant Bank AG | Thu, 26 Feb 2026 15:30:00 +0100 | https://www.finma.ch/en/news/2026/02/20260226-mm-mbaer/ +Sanktionen: Russland und Belarus | Thu, 26 Feb 2026 14:47:26 +0100 | https://www.finma.ch/en/news/2026/02/20260226-sanktionsmeldung-russland-belarus/ +Aktualisierte Sanktionsmeldung: Sudan | Thu, 26 Feb 2026 14:41:17 +0100 | https://www.finma.ch/en/news/2026/02/20260226-sr-946-231-18/ +Aktualisierte Sanktionsmeldung: Russland | Wed, 25 Feb 2026 14:24:33 +0100 | https://www.finma.ch/en/news/2026/02/20260225_sr-946-231-176-72/ +Aktualisierte Sanktionsmeldung: Sudan | Thu, 19 Feb 2026 08:22:44 +0100 | https://www.finma.ch/en/news/2026/02/20260219_sr-946-231-18/ +Aktualisierte Sanktionsmeldung | Thu, 19 Feb 2026 08:17:08 +0100 | https://www.finma.ch/en/news/2026/02/20260219_sr-946-231-09/ +Alain Girard to assume leadership of FINMA’s Banks division | Wed, 18 Feb 2026 09:59:00 +0100 | https://www.finma.ch/en/news/2026/02/20260218-mm-leitung-gb-b/ +FINMA to relocate its Zurich office from the city centre to Oerlikon | Fri, 06 Feb 2026 11:00:00 +0100 | https://www.finma.ch/en/news/2026/02/20260206-meldung-standort-zh/ +Aktualisierte Sanktionsmeldung: Russland | Fri, 30 Jan 2026 15:58:54 +0100 | https://www.finma.ch/en/news/2026/01/20260130-sr-946-231-176-72/ +Aktualisierte Sanktionsmeldung: Haiti | Thu, 29 Jan 2026 08:28:55 +0100 | https://www.finma.ch/en/news/2026/01/20260129-sr-946-231-139-4/ +Aktualisierte Sanktionsmeldung: Guatemala | Tue, 27 Jan 2026 13:21:43 +0100 | https://www.finma.ch/en/news/2026/01/20260127-sr-946-231-137-6/ +Aktualisierte Sanktionsmeldung: Demokratische Republik Kongo | Tue, 13 Jan 2026 12:13:38 +0100 | https://www.finma.ch/en/news/2026/01/20260113-sr-946-231-12/ +Aktualisierte Sanktionsmeldung: Venezuela | Tue, 13 Jan 2026 12:09:30 +0100 | https://www.finma.ch/en/news/2026/01/20260113-sr-946-231-178-5/ +Aktualisierte Sanktionsmeldung: Russland | Tue, 13 Jan 2026 12:03:42 +0100 | https://www.finma.ch/en/news/2026/01/20260113-sr-946-231-176-72/ +FINMA publishes guidance on risks associated with the custody of cryptobased assets | Mon, 12 Jan 2026 09:59:00 +0100 | https://www.finma.ch/en/news/2026/01/20260112-mm-am-01-26/ +New ordinance on the freezing of assets in the context of Venezuela of 5 January 2026 | Mon, 05 Jan 2026 13:45:00 +0100 | https://www.finma.ch/en/news/2026/01/20260105-sr-196-127-85/ +FINMA launches consultation on partially revised Circular 2016/7 “Video and online identification” | Tue, 16 Dec 2025 09:59:00 +0100 | https://www.finma.ch/en/news/2025/12/20251216-mm-video-online-id/ +Totalrevision der Verordnung über Massnahmen gegenüber der Islamischen Republik Iran | Mon, 15 Dec 2025 10:32:39 +0100 | https://www.finma.ch/en/news/2025/12/20251215-sr-946-231-143-6/ +Sanktionen: Die Schweiz weitet ihre Sanktionslisten betreffend Russland und Belarus aus | Fri, 12 Dec 2025 16:39:27 +0100 | https://www.finma.ch/en/news/2025/12/20251212-sanktionsmeldung-russland-belarus/ +Aktualisierte Sanktionsmeldung: Republik Irak | Thu, 11 Dec 2025 09:04:35 +0100 | https://www.finma.ch/en/news/2025/12/20251211-sr-946-206/ +Aktualisierte Sanktionsmeldung: Sudan | Tue, 09 Dec 2025 13:24:29 +0100 | https://www.finma.ch/en/news/2025/12/20251209-sr-946-231-18/ +Hedwig Ulmer Busenhart joins FINMA’s Executive Board | Tue, 09 Dec 2025 09:00:00 +0100 | https://www.finma.ch/en/news/2025/12/20251209-mm-leitung-gb-v/ +Swiss Takeover Board: Beat Fellmann’s term of office extended by one year | Fri, 28 Nov 2025 09:58:00 +0100 | https://www.finma.ch/en/news/2025/11/20251128-mm-uek-verlaengerung/ +Ex-post evaluation of Circular 2019/2 “Interest rate risks – banks” | Wed, 26 Nov 2025 09:58:00 +0100 | https://www.finma.ch/en/news/2025/11/20251126-mm-expost-rs-19-2-zinsrisiken/ +Federal Council appoints new member to FINMA’s Board of Directors | Wed, 19 Nov 2025 06:58:10 +0100 | https://www.finma.ch/en/news/2025/11/20251119-mm-change-vr/ +FINMA Risk Monitor 2025: accentuated geopolitical risks – cyber and real estate risks continue to grow | Mon, 17 Nov 2025 09:00:00 +0100 | https://www.finma.ch/en/news/2025/11/20251117-mm-risikomonitor/ +Insurance intermediary certificates unlawfully obtained due to manipulation | Tue, 11 Nov 2025 08:59:00 +0100 | https://www.finma.ch/en/news/2025/11/20251111-mm-vbv/ +Aktualisierte Sanktionsmeldung: ISIL (Da'esh) / Al-Kaida | Fri, 07 Nov 2025 16:05:45 +0100 | https://www.finma.ch/en/news/2025/11/20251107-sr-946-231-08/ +Aktualisierte Sanktionsmeldung: Belarus | Fri, 31 Oct 2025 08:42:14 +0100 | https://www.finma.ch/en/news/2025/10/20251031-sr-946-231-116-9/ +Aktualisierte Sanktionsmeldung: Russland | Thu, 30 Oct 2025 14:03:08 +0100 | https://www.finma.ch/en/news/2025/10/20251030-sr-946-231-176-72/ +Aktualisierte Sanktionsmeldung: ISIL (Da'esh) / Al-Kaida | Wed, 22 Oct 2025 15:44:04 +0200 | https://www.finma.ch/en/news/2025/10/20251022-sr-946-231-08/ +Aktualisierte Sanktionsmeldung: Haiti | Tue, 21 Oct 2025 17:28:13 +0200 | https://www.finma.ch/en/news/2025/10/20251021-sr-946-231-139-4/ +Aktualisierte Sanktionsmeldung: Nicaragua | Tue, 21 Oct 2025 15:09:42 +0200 | https://www.finma.ch/en/news/2025/10/20251021-sr-946-231-158-5/ +Aktualisierte Sanktionsmeldung: Islamischen Republik Iran | Tue, 21 Oct 2025 15:02:15 +0200 | https://www.finma.ch/en/news/2025/10/20251021-sr-946-231-143-6/ +Aktualisierte Sanktionsmeldung: ISIL (Da'esh) / Al-Kaida | Fri, 17 Oct 2025 14:34:19 +0200 | https://www.finma.ch/en/news/2025/10/20251710-sr-946-231-08/ +FINMA to appeal partial decision of the Federal Administrative Court concerning AT1 | Wed, 15 Oct 2025 09:59:00 +0200 | https://www.finma.ch/en/news/2025/10/20251015-meldung-bvger-at1/ +Aktualisierte Sanktionsmeldung: ISIL (Da'esh) / Al-Kaida | Fri, 10 Oct 2025 12:08:58 +0200 | https://www.finma.ch/en/news/2025/10/20251010-sr-946-231-08/ +FINMA publishes guidance on the extension of the transitional period for exchange of collateral in certain OTC derivatives transactions | Thu, 09 Oct 2025 09:55:00 +0200 | https://www.finma.ch/en/news/2025/10/20251009-meldung-am-04-25/ +Aktualisierte Sanktionsmeldung: Burundi | Tue, 07 Oct 2025 14:16:00 +0200 | https://www.finma.ch/en/news/2025/10/20251007-sr-946-231-121-8/ +Sanktionsmeldung: Guinea-Bissau | Tue, 07 Oct 2025 14:10:54 +0200 | https://www.finma.ch/en/news/2025/10/20251007-sr-946-231-138-3/ +Aktualisierte Sanktionsmeldung: Syrien | Tue, 07 Oct 2025 14:04:54 +0200 | https://www.finma.ch/en/news/2025/10/20251007-rs-946-231-172-7/ +Aktualisierte Sanktionsmeldung: Russland | Wed, 01 Oct 2025 06:11:24 +0200 | https://www.finma.ch/en/news/2025/10/20251001-sr-946-231-176-72/ +UBS resolution report | Thu, 25 Sep 2025 08:00:00 +0200 | https://www.finma.ch/en/news/2025/09/20250925-mm-resolution-bericht-ubs/ +FINMA and the UK supervisory authorities strengthen cooperation in financial services | Mon, 22 Sep 2025 12:14:00 +0200 | https://www.finma.ch/en/news/2025/09/20250922-meldung-mou-london/ +FINMA publishes guidance on the disclosure of cryptobased assets in the annual financial statements of banks and securities firms | Fri, 05 Sep 2025 09:59:00 +0200 | https://www.finma.ch/en/news/2025/09/20250905-meldung-am-kryptovermoegenswerte/ +FINMA publishes consolidated ordinance covering the insolvency proceedings at financial market institutions | Thu, 04 Sep 2025 09:58:00 +0200 | https://www.finma.ch/en/news/2025/09/20250904-mm-insolvenzverordnung/ diff --git a/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/data_level0.bin b/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/data_level0.bin new file mode 100644 index 0000000..5efb1b9 Binary files /dev/null and b/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/data_level0.bin differ diff --git a/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/header.bin b/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/header.bin new file mode 100644 index 0000000..bb54792 Binary files /dev/null and b/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/header.bin differ diff --git a/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/length.bin b/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/length.bin new file mode 100644 index 0000000..821ccd9 Binary files /dev/null and b/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/length.bin differ diff --git a/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/link_lists.bin b/data/chroma/4e62d272-4649-4357-b7ab-0c5c9ee7a654/link_lists.bin new file mode 100644 index 0000000..e69de29 diff --git a/data/chroma/chroma.sqlite3 b/data/chroma/chroma.sqlite3 new file mode 100644 index 0000000..36d7bb4 Binary files /dev/null and b/data/chroma/chroma.sqlite3 differ diff --git a/data/input/BUA302_WEEK4_HW_IRIS.pdf b/data/input/BUA302_WEEK4_HW_IRIS.pdf deleted file mode 100644 index f118ffb..0000000 Binary files a/data/input/BUA302_WEEK4_HW_IRIS.pdf and /dev/null differ diff --git a/src/thomas/finma.py b/src/thomas/finma.py index 5493407..5b33a81 100644 --- a/src/thomas/finma.py +++ b/src/thomas/finma.py @@ -22,13 +22,15 @@ class RssItem: description: Optional[str] -def fetch_rss(url: str = RSS_URL, timeout: float = 20.0) -> bytes: +def fetch_rss(url: str = RSS_URL, timeout: float = 30.0) -> bytes: # Create SSL context that doesn't verify certificates (for macOS certificate issues) ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE - with urllib.request.urlopen(url, timeout=timeout, context=ssl_context) as resp: + with urllib.request.urlopen( + url, timeout=timeout, context=ssl_context + ) as resp: return resp.read() @@ -46,7 +48,8 @@ def parse_rss(xml_bytes: bytes) -> List[RssItem]: RssItem( title=_text(item.find("title")), link=_text(item.find("link")), - pubDate=_text(item.find("pubDate")) or _text(item.find("dc:date")), + pubDate=_text(item.find("pubDate")) + or _text(item.find("dc:date")), description=_text(item.find("description")), ) ) @@ -99,7 +102,9 @@ def _item_key(item: RssItem) -> str: ) if concat: return f"raw::{concat}" - return "json::" + json.dumps(asdict(item), sort_keys=True, ensure_ascii=False) + return "json::" + json.dumps( + asdict(item), sort_keys=True, ensure_ascii=False + ) def _load_existing_keys(path: Path) -> set[str]: @@ -171,7 +176,9 @@ def main(argv: Optional[List[str]] = None) -> int: help="Output file path (default: data/finma.txt)", ) parser.add_argument( - "--url", default=RSS_URL, help="RSS URL to fetch (default: FINMA news RSS)" + "--url", + default=RSS_URL, + help="RSS URL to fetch (default: FINMA news RSS)", ) args = parser.parse_args(argv) diff --git a/src/thomas/full_rag.py b/src/thomas/full_rag.py index 62e2562..7c6a39c 100644 --- a/src/thomas/full_rag.py +++ b/src/thomas/full_rag.py @@ -6,14 +6,16 @@ import http.client from urllib.parse import urlparse import http.client -from urllib.parse import urlparse # ---------------- ChromaDB configuration (persistent by default) ---------------- # Defaults align to project root data/chroma, with optional env overrides. _PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent -CHROMA_PERSIST_DIR = os.getenv("CHROMA_PATH", str((_PROJECT_ROOT / "data" / "chroma").resolve())) +CHROMA_PERSIST_DIR = os.getenv( + "CHROMA_PATH", str((_PROJECT_ROOT / "data" / "chroma").resolve()) +) CHROMA_COLLECTION_NAME = os.getenv("CHROMA_COLLECTION", "documents") + def _get_chroma_client_and_collection(): """ Initialize a persistent Chroma client and return (client, collection). @@ -30,28 +32,35 @@ def _get_chroma_client_and_collection(): Path(CHROMA_PERSIST_DIR).mkdir(parents=True, exist_ok=True) client = chromadb.PersistentClient( - path=CHROMA_PERSIST_DIR, - settings=Settings(anonymized_telemetry=False) + path=CHROMA_PERSIST_DIR, settings=Settings(anonymized_telemetry=False) ) # Prefer get_or_create_collection when available for robustness try: get_or_create = getattr(client, "get_or_create_collection") - collection = get_or_create(name=CHROMA_COLLECTION_NAME, metadata={"hnsw:space": "cosine"}) + collection = get_or_create( + name=CHROMA_COLLECTION_NAME, metadata={"hnsw:space": "cosine"} + ) except Exception: try: collection = client.get_collection(CHROMA_COLLECTION_NAME) except Exception: - collection = client.create_collection(name=CHROMA_COLLECTION_NAME, metadata={"hnsw:space": "cosine"}) + collection = client.create_collection( + name=CHROMA_COLLECTION_NAME, metadata={"hnsw:space": "cosine"} + ) return client, collection + # ---------------- Ollama configuration (no env vars) ---------------- OLLAMA_BASE_URL = "http://127.0.0.1:11434" # Use a valid published model tag. Common options: "llama3.1", "llama3.1:8b", "llama3.1:latest", # "mistral:7b", "qwen2.5:7b", "phi3:3.8b". -OLLAMA_MODEL = "llama3.1" # changed from "llama3.1:8b-instruct" which does not exist +OLLAMA_MODEL = ( + "llama3.1" # changed from "llama3.1:8b-instruct" which does not exist +) OLLAMA_ENABLED = False OLLAMA_TIMEOUT_SECONDS = 600 + def discover_source_files(root_path: str) -> List[str]: allowed_exts = {".doc", ".docx", ".pdf", ".txt", ".rtf"} @@ -129,9 +138,15 @@ def _default_input_dir() -> Path: project_root = module_dir.parent.parent return (project_root / "data" / "input").resolve() + def _clean_extracted_text(text: str, keep_punctuation: bool = True) -> str: t = text.replace("\r\n", "\n").replace("\r", "\n") - t = t.replace("\u00AD", "").replace("\u200B", "").replace("\u200C", "").replace("\u200D", "") + t = ( + t.replace("\u00ad", "") + .replace("\u200b", "") + .replace("\u200c", "") + .replace("\u200d", "") + ) t = re.sub(r"-\s*\n\s*", "", t) t = t.replace("\n\n", "

") t = re.sub(r"[ \t]*\n[ \t]*", " ", t) @@ -139,10 +154,24 @@ def _clean_extracted_text(text: str, keep_punctuation: bool = True) -> str: def _collapse_spaces(m: re.Match) -> str: return re.sub(r"\s+", "", m.group(0)) - t = re.sub(r"(h\s*t\s*t\s*p\s*s?\s*:\s*/\s*/[^ \t<>)]+)", _collapse_spaces, t, flags=re.IGNORECASE) - t = re.sub(r"(d\s*o\s*i\s*[:.]?\s*10\.[^ \t<>)]+)", _collapse_spaces, t, flags=re.IGNORECASE) + t = re.sub( + r"(h\s*t\s*t\s*p\s*s?\s*:\s*/\s*/[^ \t<>)]+)", + _collapse_spaces, + t, + flags=re.IGNORECASE, + ) + t = re.sub( + r"(d\s*o\s*i\s*[:.]?\s*10\.[^ \t<>)]+)", + _collapse_spaces, + t, + flags=re.IGNORECASE, + ) t = re.sub(r"(10\.\d{4,}/[^ \t<>)]+)", _collapse_spaces, t) - t = re.sub(r"([A-Za-z0-9-]+\s*\.\s*[A-Za-z0-9.-]+\s*/[^ \t<>)]*)", _collapse_spaces, t) + t = re.sub( + r"([A-Za-z0-9-]+\s*\.\s*[A-Za-z0-9.-]+\s*/[^ \t<>)]*)", + _collapse_spaces, + t, + ) t = re.sub(r"[ \t]+", " ", t).strip() # Replace preserved paragraph markers with a single space to remove all newlines @@ -156,12 +185,21 @@ def _collapse_spaces(m: re.Match) -> str: return t - -def _ollama_request(method: str, path: str, body: Optional[dict] = None) -> Tuple[int, str]: +def _ollama_request( + method: str, path: str, body: Optional[dict] = None +) -> Tuple[int, str]: """Minimal HTTP client for Ollama without external deps.""" url = urlparse(OLLAMA_BASE_URL) - conn_cls = http.client.HTTPSConnection if url.scheme == "https" else http.client.HTTPConnection - conn = conn_cls(url.hostname, url.port or (443 if url.scheme == "https" else 80), timeout=OLLAMA_TIMEOUT_SECONDS) + conn_cls = ( + http.client.HTTPSConnection + if url.scheme == "https" + else http.client.HTTPConnection + ) + conn = conn_cls( + url.hostname, + url.port or (443 if url.scheme == "https" else 80), + timeout=OLLAMA_TIMEOUT_SECONDS, + ) try: payload = None headers = {"Content-Type": "application/json"} @@ -174,11 +212,20 @@ def _ollama_request(method: str, path: str, body: Optional[dict] = None) -> Tupl finally: conn.close() + # Streamed pull to avoid timeouts on large downloads def _ollama_pull_stream(model: str) -> bool: url = urlparse(OLLAMA_BASE_URL) - conn_cls = http.client.HTTPSConnection if url.scheme == "https" else http.client.HTTPConnection - conn = conn_cls(url.hostname, url.port or (443 if url.scheme == "https" else 80), timeout=OLLAMA_TIMEOUT_SECONDS) + conn_cls = ( + http.client.HTTPSConnection + if url.scheme == "https" + else http.client.HTTPConnection + ) + conn = conn_cls( + url.hostname, + url.port or (443 if url.scheme == "https" else 80), + timeout=OLLAMA_TIMEOUT_SECONDS, + ) try: payload = json.dumps({"name": model, "stream": True}).encode("utf-8") headers = {"Content-Type": "application/json"} @@ -186,7 +233,9 @@ def _ollama_pull_stream(model: str) -> bool: resp = conn.getresponse() if resp.status not in (200, 206): data = resp.read().decode("utf-8", errors="replace") - print(f"[prepare_data] Ollama pull failed ({resp.status}): {data[:200]}") + print( + f"[prepare_data] Ollama pull failed ({resp.status}): {data[:200]}" + ) return False # Read streaming JSONL chunks until 'status'=='success' or 'completed' buf = b"" @@ -223,10 +272,13 @@ def _ollama_pull_stream(model: str) -> bool: finally: conn.close() + def _ollama_model_available(model: str) -> bool: status, data = _ollama_request("GET", "/api/tags") if status != 200: - print(f"[prepare_data] Ollama /api/tags returned {status}; response: {data[:200]}") + print( + f"[prepare_data] Ollama /api/tags returned {status}; response: {data[:200]}" + ) return False try: obj = json.loads(data) @@ -237,6 +289,7 @@ def _ollama_model_available(model: str) -> bool: print(f"[prepare_data] Failed to parse /api/tags: {ex}") return False + def _ollama_generate(prompt: str, model: str) -> str: """Call Ollama /api/generate with stream disabled and parse the response. @@ -272,16 +325,20 @@ def _ollama_generate(prompt: str, model: str) -> str: # As a last resort, return raw data return data + # Try pull with stream first, then non-stream as fallback def _ollama_try_pull_model(model: str) -> bool: if _ollama_pull_stream(model): return True - status, data = _ollama_request("POST", "/api/pull", {"name": model, "stream": False}) + status, data = _ollama_request( + "POST", "/api/pull", {"name": model, "stream": False} + ) if status == 200: return True print(f"[prepare_data] Ollama pull failed ({status}): {data[:200]}") return False + def _ollama_clean_text(raw_text: str) -> str: if not OLLAMA_ENABLED: return raw_text @@ -309,7 +366,10 @@ def _ollama_clean_text(raw_text: str) -> str: return raw_text -def load_source_data(data_files: Optional[List[str]] = None) -> List[Tuple[str, str]]: + +def load_source_data( + data_files: Optional[List[str]] = None, +) -> List[Tuple[str, str]]: """Load and parse source data files. - If data_files is None, discover files under default input dir. @@ -321,7 +381,9 @@ def load_source_data(data_files: Optional[List[str]] = None) -> List[Tuple[str, if data_files is None: input_dir = _default_input_dir() discovered = discover_source_files(str(input_dir)) - data_files = [p for p in discovered if Path(p).suffix.lower() in _supported] + data_files = [ + p for p in discovered if Path(p).suffix.lower() in _supported + ] results: List[Tuple[str, str]] = [] for file_path in data_files: @@ -343,7 +405,6 @@ def load_source_data(data_files: Optional[List[str]] = None) -> List[Tuple[str, return results - def _estimate_tokens(text: str) -> int: # Rough heuristic: ~4 chars per token for English return max(1, (len(text) + 3) // 4) @@ -352,10 +413,27 @@ def _estimate_tokens(text: str) -> int: def _split_into_sentences(text: str) -> List[str]: # Hint section headers to become boundaries section_words = [ - "ABSTRACT", "Abstract", "INTRODUCTION", "Introduction", "BACKGROUND", "Background", - "METHODS", "Methods", "MATERIALS", "Materials", "RESULTS", "Results", - "DISCUSSION", "Discussion", "CONCLUSION", "Conclusions", "Conclusion", - "ACKNOWLEDGEMENTS", "Acknowledgements", "REFERENCES", "References" + "ABSTRACT", + "Abstract", + "INTRODUCTION", + "Introduction", + "BACKGROUND", + "Background", + "METHODS", + "Methods", + "MATERIALS", + "Materials", + "RESULTS", + "Results", + "DISCUSSION", + "Discussion", + "CONCLUSION", + "Conclusions", + "Conclusion", + "ACKNOWLEDGEMENTS", + "Acknowledgements", + "REFERENCES", + "References", ] for w in section_words: text = re.sub(rf"\b{re.escape(w)}\b\s*:?\s*", f"{w}. ", text) @@ -368,8 +446,17 @@ def _split_into_sentences(text: str) -> List[str]: essential_section_titles = { - "abstract", "introduction", "background", "methods", "materials", "results", - "discussion", "conclusion", "conclusions", "acknowledgements", "references" + "abstract", + "introduction", + "background", + "methods", + "materials", + "results", + "discussion", + "conclusion", + "conclusions", + "acknowledgements", + "references", } @@ -408,13 +495,15 @@ def flush_current(): if s_tok >= max_tokens: # Split this long sentence by words words = s.split() - approx_words_per_chunk = max_tokens * 4 // 5 # ~80% of max in words heuristically + approx_words_per_chunk = ( + max_tokens * 4 // 5 + ) # ~80% of max in words heuristically if approx_words_per_chunk <= 0: approx_words_per_chunk = len(words) step = max(1, approx_words_per_chunk - max(5, overlap_tokens)) i = 0 while i < len(words): - piece_words = words[i:i + approx_words_per_chunk] + piece_words = words[i : i + approx_words_per_chunk] piece = " ".join(piece_words) if current_tok + _estimate_tokens(piece) > max_tokens: flush_current() @@ -436,8 +525,12 @@ def flush_current(): if _estimate_tokens(s) > max_tokens: # already handled by branch above, but keep guard words = s.split() - for i in range(0, len(words), max(1, max_tokens - overlap_tokens)): - piece = " ".join(words[i:i + max(1, max_tokens - overlap_tokens)]) + for i in range( + 0, len(words), max(1, max_tokens - overlap_tokens) + ): + piece = " ".join( + words[i : i + max(1, max_tokens - overlap_tokens)] + ) current.append(piece) current_tok += _estimate_tokens(piece) + 1 if current_tok >= target_tokens: @@ -491,7 +584,9 @@ def chunk_data() -> bool: except ValueError as e: print(f"[prepare_data] {e}") return False - source_files = [p for p in files if Path(p).suffix.lower() in {".pdf", ".txt"}] + source_files = [ + p for p in files if Path(p).suffix.lower() in {".pdf", ".txt"} + ] items = load_source_data(source_files) # Semantic chunking parameters (token heuristics) @@ -531,13 +626,15 @@ def chunk_data() -> bool: next_id_int += 1 ids.append(doc_id) documents.append(chunk_text) - metadatas.append({ - "source": path_str, - "chunk_index": chunk_idx, - "doc_index": doc_index, - "tokens": _estimate_tokens(chunk_text), - "filename": str(Path(path_str).name), - }) + metadatas.append( + { + "source": path_str, + "chunk_index": chunk_idx, + "doc_index": doc_index, + "tokens": _estimate_tokens(chunk_text), + "filename": str(Path(path_str).name), + } + ) # Write chunks to JSONL for downstream steps (e.g., generate_metadatas) try: @@ -567,10 +664,19 @@ def chunk_data() -> bool: client, collection = _get_chroma_client_and_collection() sentinel_id = "__init__" sentinel_doc = "chroma-initialization-sentinel" - sentinel_meta = {"type": "init", "note": "created from chunk_data when no documents"} + sentinel_meta = { + "type": "init", + "note": "created from chunk_data when no documents", + } try: - collection.add(ids=[sentinel_id], documents=[sentinel_doc], metadatas=[sentinel_meta]) - print(f"[prepare_data] Wrote sentinel to Chroma to force DB creation at {CHROMA_PERSIST_DIR}.") + collection.add( + ids=[sentinel_id], + documents=[sentinel_doc], + metadatas=[sentinel_meta], + ) + print( + f"[prepare_data] Wrote sentinel to Chroma to force DB creation at {CHROMA_PERSIST_DIR}." + ) except Exception as ex2: if "embedding" in str(ex2).lower(): collection.add( @@ -579,14 +685,20 @@ def chunk_data() -> bool: metadatas=[sentinel_meta], embeddings=[_cheap_embedding(sentinel_doc)], ) - print("[prepare_data] Wrote sentinel with fallback embedding to initialize DB.") + print( + "[prepare_data] Wrote sentinel with fallback embedding to initialize DB." + ) else: - print(f"[prepare_data] Failed to write sentinel to Chroma: {ex2}") + print( + f"[prepare_data] Failed to write sentinel to Chroma: {ex2}" + ) try: persist_fn = getattr(client, "persist", None) if callable(persist_fn): persist_fn() - print(f"[prepare_data] Chroma client.persist() called for path: {CHROMA_PERSIST_DIR}") + print( + f"[prepare_data] Chroma client.persist() called for path: {CHROMA_PERSIST_DIR}" + ) except Exception: pass except Exception: @@ -604,30 +716,38 @@ def chunk_data() -> bool: for i in range(0, len(documents), BATCH): try: collection.upsert( - ids=ids[i:i+BATCH], - documents=documents[i:i+BATCH], - metadatas=metadatas[i:i+BATCH], + ids=ids[i : i + BATCH], + documents=documents[i : i + BATCH], + metadatas=metadatas[i : i + BATCH], ) except Exception as batch_ex: # Provide fallback embeddings if Chroma requires them if "embedding" in str(batch_ex).lower(): - print("[prepare_data] Chroma requires embeddings for upsert; using fallback vectors.") - embeddings = [_cheap_embedding(t) for t in documents[i:i+BATCH]] + print( + "[prepare_data] Chroma requires embeddings for upsert; using fallback vectors." + ) + embeddings = [ + _cheap_embedding(t) for t in documents[i : i + BATCH] + ] collection.upsert( - ids=ids[i:i+BATCH], - documents=documents[i:i+BATCH], - metadatas=metadatas[i:i+BATCH], + ids=ids[i : i + BATCH], + documents=documents[i : i + BATCH], + metadatas=metadatas[i : i + BATCH], embeddings=embeddings, ) else: raise - print(f"[prepare_data] Persisted {len(documents)} semantic chunks to Chroma collection '{CHROMA_COLLECTION_NAME}' at {CHROMA_PERSIST_DIR}") + print( + f"[prepare_data] Persisted {len(documents)} semantic chunks to Chroma collection '{CHROMA_COLLECTION_NAME}' at {CHROMA_PERSIST_DIR}" + ) # Ensure persistence is flushed to disk if the client supports it try: persist_fn = getattr(client, "persist", None) if callable(persist_fn): persist_fn() - print(f"[prepare_data] Chroma client.persist() called for path: {CHROMA_PERSIST_DIR}") + print( + f"[prepare_data] Chroma client.persist() called for path: {CHROMA_PERSIST_DIR}" + ) except Exception: pass return True @@ -635,6 +755,7 @@ def chunk_data() -> bool: print(f"[prepare_data] Failed to upsert chunks to Chroma: {ex}") return False + def _ensure_chunks_file() -> Path: """Ensure chunks.jsonl exists by running chunk_data() if needed. @@ -651,12 +772,16 @@ def _ensure_chunks_file() -> Path: try: chunk_data() except Exception as ex: - print(f"[prepare_data] chunk_data failed while ensuring chunks file: {ex}") + print( + f"[prepare_data] chunk_data failed while ensuring chunks file: {ex}" + ) # Even if chunk_data failed, return the path; caller will handle absence return out_path -def _load_chunks_from_jsonl(chunks_path: Path) -> Tuple[List[str], List[str], List[dict]]: +def _load_chunks_from_jsonl( + chunks_path: Path, +) -> Tuple[List[str], List[str], List[dict]]: """Load chunk rows from JSONL and produce ids, documents, metadatas. Each JSONL line is expected to have: id, source, doc_index, chunk_index, text, tokens. @@ -681,18 +806,24 @@ def _load_chunks_from_jsonl(chunks_path: Path) -> Tuple[List[str], List[str], Li di = int(obj.get("doc_index", 0)) ci = int(obj.get("chunk_index", 0)) tok = int(obj.get("tokens", _estimate_tokens(text))) - cid = str(obj.get("id")) if obj.get("id") is not None else f"doc{di}_chunk{ci}" + cid = ( + str(obj.get("id")) + if obj.get("id") is not None + else f"doc{di}_chunk{ci}" + ) ids.append(cid) documents.append(text) # Minimal, Chroma-friendly metadata - metadatas.append({ - "source": src, - "doc_index": di, - "chunk_index": ci, - "tokens": tok, - # Shallow filename for convenience - "filename": str(Path(src).name) if src else "", - }) + metadatas.append( + { + "source": src, + "doc_index": di, + "chunk_index": ci, + "tokens": tok, + # Shallow filename for convenience + "filename": str(Path(src).name) if src else "", + } + ) return ids, documents, metadatas @@ -727,9 +858,13 @@ def generate_metadatas() -> bool: chunks_path = _ensure_chunks_file() ids, documents, metadatas = _load_chunks_from_jsonl(chunks_path) if not ids: - print("[prepare_data] No chunks found in JSONL. Run chunk_data() to generate.") + print( + "[prepare_data] No chunks found in JSONL. Run chunk_data() to generate." + ) else: - print(f"[prepare_data] Loaded {len(ids)} chunks from {chunks_path} (metadata only, no DB writes).") + print( + f"[prepare_data] Loaded {len(ids)} chunks from {chunks_path} (metadata only, no DB writes)." + ) return True except Exception as ex: print(f"[prepare_data] generate_metadatas failed: {ex}") @@ -743,6 +878,7 @@ def apply_embedding() -> bool: """ return True + def apply_retrieval(query: str) -> bool: """ Basic retrieval from persisted Chroma collection. @@ -791,12 +927,20 @@ def interactive_qa(top_k: int = 5, max_context_chars: int = 8000) -> None: # Query Chroma first; try text-based, then fallback to cheap embedding. try: - res = collection.query(query_texts=[question], n_results=top_k, include=["documents", "metadatas", "distances"]) # type: ignore + res = collection.query( + query_texts=[question], + n_results=top_k, + include=["documents", "metadatas", "distances"], + ) # type: ignore except Exception as ex: if "embedding" in str(ex).lower(): try: q_emb = [_cheap_embedding(question)] - res = collection.query(query_embeddings=q_emb, n_results=top_k, include=["documents", "metadatas", "distances"]) # type: ignore + res = collection.query( + query_embeddings=q_emb, + n_results=top_k, + include=["documents", "metadatas", "distances"], + ) # type: ignore except Exception as ex2: print(f"[qa] Retrieval failed: {ex2}") continue @@ -819,14 +963,18 @@ def interactive_qa(top_k: int = 5, max_context_chars: int = 8000) -> None: total = 0 for i in range(len(ids)): meta = metas[i] if i < len(metas) else {} - filename = (meta or {}).get("filename") or (meta or {}).get("source") or "" + filename = ( + (meta or {}).get("filename") + or (meta or {}).get("source") + or "" + ) chunk_index = (meta or {}).get("chunk_index") - doc_text = (docs[i] or "") + doc_text = docs[i] or "" snippet = doc_text.replace("\n", " ") # Truncate overly long snippets to keep prompt size reasonable if len(snippet) > 1200: snippet = snippet[:1200] + "..." - item = f"[#{i+1}] source={filename} chunk={chunk_index}\n{snippet}\n" + item = f"[#{i + 1}] source={filename} chunk={chunk_index}\n{snippet}\n" if total + len(item) <= max_context_chars: context_parts.append(item) total += len(item) @@ -868,9 +1016,11 @@ def interactive_qa(top_k: int = 5, max_context_chars: int = 8000) -> None: print("[qa] Bye.") + def send_to_llm() -> bool: return True + def cleanup_with_llm() -> bool: """Placeholder for future LLM-based cleanup. Currently a no-op for tests.""" return True @@ -880,6 +1030,7 @@ def apply_sentence_transformers() -> bool: """Placeholder for Sentence-Transformers embedding step. Currently a no-op for tests.""" return True + if __name__ == "__main__": input_dir = _default_input_dir() try: @@ -904,6 +1055,8 @@ def apply_sentence_transformers() -> bool: overlap_tokens=50, ) for idx, ch in enumerate(chunks): - print(f"[chunk {idx}] tokens={_estimate_tokens(ch)}\n{ch}\n---") + print( + f"[chunk {idx}] tokens={_estimate_tokens(ch)}\n{ch}\n---" + ) except Exception as ex: print(f"[prepare_data] Failed to generate/print semantic chunks: {ex}") diff --git a/src/thomas/vdb.py b/src/thomas/vdb.py index fdd6b09..c5ac731 100644 --- a/src/thomas/vdb.py +++ b/src/thomas/vdb.py @@ -1,21 +1,43 @@ import chromadb from chromadb.config import Settings +import argparse import json import os import http.client +import importlib +import sys +import hashlib from urllib.parse import urlparse from pathlib import Path -def add_sentences_to_chromadb(sentences, collection_name="sentences_collection"): - client = chromadb.Client( - Settings( +def _make_ids(sentences): + counts = {} + ids = [] + for sentence in sentences: + base = hashlib.sha256( + sentence.encode("utf-8", errors="ignore") + ).hexdigest() + n = counts.get(base, 0) + counts[base] = n + 1 + ids.append(base if n == 0 else f"{base}-{n}") + return ids + + +def add_sentences_to_chromadb( + sentences, + persist_path, + collection_name="sentences_collection", +): + client = chromadb.PersistentClient( + path=str(persist_path), + settings=Settings( anonymized_telemetry=False, - ) + ), ) collection = client.get_or_create_collection(name=collection_name) - ids = [f"sentence-{i}" for i in range(len(sentences))] - collection.add( + ids = _make_ids(sentences) + collection.upsert( documents=sentences, ids=ids, ) @@ -26,6 +48,13 @@ def add_sentences_to_chromadb(sentences, collection_name="sentences_collection") OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434") OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1") +# Add or remove RSS feeds here. +# Each URL in this list is fetched during refresh and appended into data/all.txt. +RSS_FEED_URLS = [ + "https://www.finma.ch/en/rss/news/", + # "https://example.org/feed.xml", +] + def _ollama_generate(prompt: str, model: str = OLLAMA_MODEL) -> str: url = urlparse(OLLAMA_BASE_URL) @@ -60,7 +89,9 @@ def _ollama_generate(prompt: str, model: str = OLLAMA_MODEL) -> str: resp = conn.getresponse() data = resp.read().decode("utf-8", errors="replace") if resp.status != 200: - raise RuntimeError(f"Ollama generate failed ({resp.status}): {data[:200]}") + raise RuntimeError( + f"Ollama generate failed ({resp.status}): {data[:200]}" + ) # Try parse single JSON first try: obj = json.loads(data) @@ -87,14 +118,61 @@ def _ollama_generate(prompt: str, model: str = OLLAMA_MODEL) -> str: if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run vdb demo with optional RSS refresh." + ) + parser.add_argument( + "--no-refresh", + action="store_true", + help="Skip downloading RSS feeds before loading data/all.txt", + ) + args = parser.parse_args() + # Use project-root relative path so running from any cwd works project_root = Path(__file__).resolve().parent.parent.parent + src_root = project_root / "src" + if str(src_root) not in sys.path: + sys.path.insert(0, str(src_root)) + + try: + finma_module = importlib.import_module("thomas.finma") + fetch_rss = finma_module.fetch_rss + parse_rss = finma_module.parse_rss + write_ndjson = finma_module.write_ndjson + except Exception as ex: + raise RuntimeError(f"Failed to import thomas.finma: {ex}") from ex + finma_path = project_root / "data" / "finma.txt" - if not finma_path.exists(): + all_path = finma_path.with_name("all.txt") + chroma_path = project_root / "data" / "chroma" + chroma_path.mkdir(parents=True, exist_ok=True) + + if args.no_refresh: + print("Skipping RSS refresh (--no-refresh)") + else: + total_appended = 0 + failures = 0 + for url in RSS_FEED_URLS: + try: + xml_bytes = fetch_rss(url=url) + rss_items = parse_rss(xml_bytes) + appended = write_ndjson(rss_items, all_path) + total_appended += appended + print(f"Feed refreshed: {url} (appended {appended} item(s))") + except Exception as ex: + failures += 1 + print(f"Feed refresh failed: {url} ({ex})") + print( + "RSS refresh complete: " + f"{len(RSS_FEED_URLS) - failures}/{len(RSS_FEED_URLS)} feeds succeeded, " + f"{total_appended} total new item(s) appended to {all_path}" + ) + + if not all_path.exists(): raise FileNotFoundError( - f"{finma_path} not found. Expected file at project root: {finma_path}" + f"{all_path} not found. Expected file at project root: {all_path}" ) - with finma_path.open("r", encoding="utf-8") as f: + with all_path.open("r", encoding="utf-8") as f: contents = f.read() contents = contents.split("\n") @@ -118,7 +196,8 @@ def _ollama_generate(prompt: str, model: str = OLLAMA_MODEL) -> str: sentence = " ".join(filter(None, (title, link, pubDate, description))) sentences.append(sentence) - collection = add_sentences_to_chromadb(sentences) + collection = add_sentences_to_chromadb(sentences, persist_path=chroma_path) + print(f"Chroma DB path: {chroma_path}") question = input("Ask a question: ") results = collection.query( query_texts=[question], @@ -128,7 +207,11 @@ def _ollama_generate(prompt: str, model: str = OLLAMA_MODEL) -> str: # Pick the highest-score item (first result for the single query) top_doc = None - if results and isinstance(results.get("documents"), list) and results["documents"]: + if ( + results + and isinstance(results.get("documents"), list) + and results["documents"] + ): # documents is List[List[str]] per query docs_for_query = results["documents"][0] if docs_for_query: diff --git a/uv.lock b/uv.lock index e2e5f2c..66585ed 100644 --- a/uv.lock +++ b/uv.lock @@ -1850,7 +1850,11 @@ dependencies = [ { name = "typing-extensions" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/1b/af5fccb50c341bd69dc016769503cb0857c1423fbe9343410dfeb65240f2/torch-2.10.0-1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7350f6652dfd761f11f9ecb590bfe95b573e2961f7a242eccb3c8e78348d26fe", size = 79498248, upload-time = "2026-02-06T17:37:31.982Z" }, + { url = "https://files.pythonhosted.org/packages/ec/23/2c9fe0c9c27f7f6cb865abcea8a4568f29f00acaeadfc6a37f6801f84cb4/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:e521c9f030a3774ed770a9c011751fb47c4d12029a3d6522116e48431f2ff89e", size = 79498254, upload-time = "2026-02-10T21:44:44.095Z" }, + { url = "https://files.pythonhosted.org/packages/ab/c6/4dfe238342ffdcec5aef1c96c457548762d33c40b45a1ab7033bb26d2ff2/torch-2.10.0-3-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:80b1b5bfe38eb0e9f5ff09f206dcac0a87aadd084230d4a36eea5ec5232c115b", size = 915627275, upload-time = "2026-03-11T14:16:11.325Z" }, + { url = "https://files.pythonhosted.org/packages/d8/f0/72bf18847f58f877a6a8acf60614b14935e2f156d942483af1ffc081aea0/torch-2.10.0-3-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:46b3574d93a2a8134b3f5475cfb98e2eb46771794c57015f6ad1fb795ec25e49", size = 915523474, upload-time = "2026-03-11T14:17:44.422Z" }, + { url = "https://files.pythonhosted.org/packages/f4/39/590742415c3030551944edc2ddc273ea1fdfe8ffb2780992e824f1ebee98/torch-2.10.0-3-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:b1d5e2aba4eb7f8e87fbe04f86442887f9167a35f092afe4c237dfcaaef6e328", size = 915632474, upload-time = "2026-03-11T14:15:13.666Z" }, + { url = "https://files.pythonhosted.org/packages/b6/8e/34949484f764dde5b222b7fe3fede43e4a6f0da9d7f8c370bb617d629ee2/torch-2.10.0-3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0228d20b06701c05a8f978357f657817a4a63984b0c90745def81c18aedfa591", size = 915523882, upload-time = "2026-03-11T14:14:46.311Z" }, { url = "https://files.pythonhosted.org/packages/c9/6f/f2e91e34e3fcba2e3fc8d8f74e7d6c22e74e480bbd1db7bc8900fdf3e95c/torch-2.10.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:5c4d217b14741e40776dd7074d9006fd28b8a97ef5654db959d8635b2fe5f29b", size = 146004247, upload-time = "2026-01-21T16:24:29.335Z" }, { url = "https://files.pythonhosted.org/packages/98/fb/5160261aeb5e1ee12ee95fe599d0541f7c976c3701d607d8fc29e623229f/torch-2.10.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6b71486353fce0f9714ca0c9ef1c850a2ae766b409808acd58e9678a3edb7738", size = 915716445, upload-time = "2026-01-21T16:22:45.353Z" }, { url = "https://files.pythonhosted.org/packages/6a/16/502fb1b41e6d868e8deb5b0e3ae926bbb36dab8ceb0d1b769b266ad7b0c3/torch-2.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:c2ee399c644dc92ef7bc0d4f7e74b5360c37cdbe7c5ba11318dda49ffac2bc57", size = 113757050, upload-time = "2026-01-21T16:24:19.204Z" }, @@ -1943,11 +1947,11 @@ wheels = [ [[package]] name = "urllib3" -version = "2.6.3" +version = "2.7.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, ] [[package]]