diff --git a/src/neurostack/ask.py b/src/neurostack/ask.py index 4bec66c..2463000 100644 --- a/src/neurostack/ask.py +++ b/src/neurostack/ask.py @@ -8,7 +8,7 @@ import httpx -from .config import get_config +from .config import _auth_headers, get_config from .search import hybrid_search ASK_PROMPT = """You are a knowledge assistant answering questions \ @@ -68,23 +68,22 @@ def ask_vault( sources_text = "\n\n---\n\n".join(source_blocks) prompt = ASK_PROMPT.format(sources=sources_text, question=question) - # Call Ollama LLM + # Call LLM (OpenAI-compatible endpoint) resp = httpx.post( - f"{llm_url}/api/generate", + f"{llm_url}/v1/chat/completions", + headers=_auth_headers(cfg.llm_api_key), json={ "model": llm_model, - "prompt": prompt, + "messages": [{"role": "user", "content": prompt}], "stream": False, - "options": { - "temperature": 0.3, - "num_predict": 500, - }, - "think": False, + "reasoning_effort": "none", + "temperature": 0.3, + "max_tokens": 500, }, timeout=180.0, ) resp.raise_for_status() - answer = resp.json().get("response", "").strip() + answer = resp.json()["choices"][0]["message"]["content"].strip() # Strip think tags if model includes them answer = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip() diff --git a/src/neurostack/cli.py b/src/neurostack/cli.py index af023d0..f7cf4cb 100644 --- a/src/neurostack/cli.py +++ b/src/neurostack/cli.py @@ -763,6 +763,12 @@ def _do_init(vault_root, cfg, profession_name=None, run_index=False): f'embed_url = "{cfg.embed_url}"\n' f'llm_url = "{cfg.llm_url}"\n' f'llm_model = "{cfg.llm_model}"\n' + ) + if cfg.llm_api_key: + config_text += f'llm_api_key = "{cfg.llm_api_key}"\n' + if cfg.embed_api_key: + config_text += f'embed_api_key = "{cfg.embed_api_key}"\n' + config_text += ( f'\n[writeback]\n' f'enabled = {wb_enabled}\n' f'path = "{cfg.writeback_path}"\n' @@ -836,8 +842,9 @@ def cmd_init(args): profession = _prompt("Profession pack", default="none", choices=prof_choices) # 3. 
LLM configuration - print("\n \033[1mOllama Configuration\033[0m") - print(" NeuroStack uses Ollama for embeddings and summaries.\n") + print("\n \033[1mLLM Configuration\033[0m") + print(" NeuroStack works with any OpenAI-compatible endpoint") + print(" (Ollama, vLLM, Together AI, Groq, OpenRouter, etc.)\n") embed_url = _prompt("Embedding endpoint", default=cfg.embed_url) llm_url = _prompt("LLM endpoint", default=cfg.llm_url) @@ -850,6 +857,22 @@ def cmd_init(args): ] llm_model = _prompt("LLM model for summaries", default=cfg.llm_model, choices=model_choices) + # 3b. API keys (optional — only needed for cloud providers) + llm_api_key = "" + embed_api_key = "" + is_local = any(h in llm_url for h in ("localhost", "127.0.0.1", "0.0.0.0")) + if not is_local: + print("\n \033[1mAPI Authentication\033[0m") + print(" Cloud providers require an API key.\n") + llm_api_key = _prompt("LLM API key", default="") + if embed_url != llm_url: + embed_api_key = _prompt("Embedding API key", default="") + else: + embed_api_key = llm_api_key + elif _confirm("\n Configure API keys? (only needed for cloud providers)", default=False): + llm_api_key = _prompt("LLM API key", default="") + embed_api_key = _prompt("Embedding API key", default=llm_api_key) + # 4. Write-back print( "\n Enable memory write-back? 
Memories will be" @@ -862,12 +885,14 @@ def cmd_init(args): # Show summary wb_label = "yes" if writeback else "no" + auth_label = "yes" if (llm_api_key or embed_api_key) else "no" print("\n \033[1m━━━ Summary ━━━\033[0m\n") print(f" Vault: {vault_root}") print(f" Profession: {profession}") print(f" Embed URL: {embed_url}") print(f" LLM URL: {llm_url}") print(f" LLM model: {llm_model}") + print(f" API auth: {auth_label}") print(f" Write-back: {wb_label}") print(f" Index now: {'yes' if run_index else 'no'}") @@ -880,6 +905,8 @@ def cmd_init(args): cfg.embed_url = embed_url cfg.llm_url = llm_url cfg.llm_model = llm_model + cfg.llm_api_key = llm_api_key + cfg.embed_api_key = embed_api_key cfg.writeback_enabled = writeback _do_init( diff --git a/src/neurostack/community.py b/src/neurostack/community.py index 8a2edfd..3383959 100644 --- a/src/neurostack/community.py +++ b/src/neurostack/community.py @@ -29,12 +29,13 @@ log = logging.getLogger("neurostack") -from .config import get_config +from .config import _auth_headers, get_config _cfg = get_config() SUMMARIZE_URL = _cfg.llm_url EMBED_URL = _cfg.embed_url SUMMARIZE_MODEL = _cfg.llm_model +_LLM_HEADERS = _auth_headers(_cfg.llm_api_key) COMMUNITY_PROMPT = ( "You are summarizing a cluster of thematically" @@ -134,29 +135,26 @@ def _generate_community_summary( note_summaries=notes_str or "(none)", ) - schema = { - "type": "object", - "properties": { - "title": {"type": "string"}, - "summary": {"type": "string"}, - }, - "required": ["title", "summary"], - } - resp = httpx.post( - f"{base_url}/api/generate", + f"{base_url}/v1/chat/completions", + headers=_LLM_HEADERS, json={ "model": model, - "prompt": prompt, + "messages": [{"role": "user", "content": prompt}], "stream": False, - "format": schema, - "options": {"temperature": 0.3, "num_predict": 512}, - "think": False, + "reasoning_effort": "none", + "temperature": 0.3, + "max_tokens": 512, }, timeout=120.0, ) resp.raise_for_status() - raw = resp.json().get("response", 
"").strip() + raw = resp.json()["choices"][0]["message"]["content"].strip() + # Strip markdown fences if present + if raw.startswith("```"): + import re + raw = re.sub(r"^```\w*\n?", "", raw) + raw = re.sub(r"\n?```$", "", raw).strip() try: parsed = json.loads(raw) diff --git a/src/neurostack/community_search.py b/src/neurostack/community_search.py index 6ab811f..fe5fce0 100644 --- a/src/neurostack/community_search.py +++ b/src/neurostack/community_search.py @@ -24,12 +24,13 @@ log = logging.getLogger("neurostack") -from .config import get_config +from .config import _auth_headers, get_config _cfg = get_config() SUMMARIZE_URL = _cfg.llm_url EMBED_URL = _cfg.embed_url SUMMARIZE_MODEL = _cfg.llm_model +_LLM_HEADERS = _auth_headers(_cfg.llm_api_key) _MAP_PROMPT = """You are analyzing a knowledge community summary to answer a question. @@ -197,18 +198,20 @@ def global_query( ) try: resp = httpx.post( - f"{summarize_url}/api/generate", + f"{summarize_url}/v1/chat/completions", + headers=_LLM_HEADERS, json={ "model": model, - "prompt": prompt, + "messages": [{"role": "user", "content": prompt}], "stream": False, - "options": {"temperature": 0.1, "num_predict": 256}, - "think": False, + "reasoning_effort": "none", + "temperature": 0.1, + "max_tokens": 256, }, timeout=60.0, ) resp.raise_for_status() - finding = resp.json().get("response", "").strip() + finding = resp.json()["choices"][0]["message"]["content"].strip() if finding: findings.append(f"[{hit['title']}]\n{finding}") except Exception as e: @@ -228,18 +231,19 @@ def global_query( ) try: resp = httpx.post( - f"{summarize_url}/api/generate", + f"{summarize_url}/v1/chat/completions", json={ "model": model, - "prompt": reduce_prompt, + "messages": [{"role": "user", "content": reduce_prompt}], "stream": False, - "options": {"temperature": 0.3, "num_predict": 1024}, - "think": False, + "reasoning_effort": "none", + "temperature": 0.3, + "max_tokens": 1024, }, timeout=120.0, ) resp.raise_for_status() - answer = 
resp.json().get("response", "").strip() + answer = resp.json()["choices"][0]["message"]["content"].strip() except Exception as e: log.warning(f"Reduce step failed: {e}") answer = "\n\n".join(findings) diff --git a/src/neurostack/config.py b/src/neurostack/config.py index fdc7a94..a7755ee 100644 --- a/src/neurostack/config.py +++ b/src/neurostack/config.py @@ -28,6 +28,8 @@ class Config: # NOTE: Verify the license of any model you configure here. # phi3.5 is MIT licensed. llm_model: str = "phi3.5" + llm_api_key: str = "" + embed_api_key: str = "" session_dir: Path = field(default_factory=lambda: Path.home() / ".claude" / "projects") api_host: str = "127.0.0.1" api_port: int = 8000 @@ -57,7 +59,7 @@ def load_config() -> Config: if key in data: setattr(cfg, key, Path(os.path.expanduser(data[key]))) for key in ("embed_url", "embed_model", "llm_url", "llm_model", - "api_host", "api_key"): + "llm_api_key", "embed_api_key", "api_host", "api_key"): if key in data: setattr(cfg, key, data[key]) if "embed_dim" in data: @@ -81,6 +83,8 @@ def load_config() -> Config: "NEUROSTACK_EMBED_DIM": ("embed_dim", int), "NEUROSTACK_LLM_URL": ("llm_url", str), "NEUROSTACK_LLM_MODEL": ("llm_model", str), + "NEUROSTACK_LLM_API_KEY": ("llm_api_key", str), + "NEUROSTACK_EMBED_API_KEY": ("embed_api_key", str), "NEUROSTACK_SESSION_DIR": ("session_dir", Path), "NEUROSTACK_API_HOST": ("api_host", str), "NEUROSTACK_API_PORT": ("api_port", int), @@ -102,6 +106,13 @@ def load_config() -> Config: return cfg +def _auth_headers(api_key: str) -> dict[str, str]: + """Build Authorization header dict if an API key is set.""" + if api_key: + return {"Authorization": f"Bearer {api_key}"} + return {} + + # Module-level singleton _config: Config | None = None diff --git a/src/neurostack/embedder.py b/src/neurostack/embedder.py index 299cf21..a188ff8 100644 --- a/src/neurostack/embedder.py +++ b/src/neurostack/embedder.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 2024-2026 Raphael 
Southall -"""Ollama embedding client.""" +"""Embedding client (OpenAI-compatible /v1/ endpoints).""" import json from typing import Optional @@ -13,12 +13,13 @@ except ImportError: HAS_NUMPY = False -from .config import get_config +from .config import _auth_headers, get_config _cfg = get_config() DEFAULT_EMBED_URL = _cfg.embed_url EMBED_MODEL = _cfg.embed_model EMBED_DIM = _cfg.embed_dim +_EMBED_HEADERS = _auth_headers(_cfg.embed_api_key) def get_embedding( @@ -33,13 +34,14 @@ def get_embedding( "Install with: pip install neurostack[full]" ) resp = httpx.post( - f"{base_url}/api/embed", + f"{base_url}/v1/embeddings", + headers=_EMBED_HEADERS, json={"model": model, "input": text}, timeout=30.0, ) resp.raise_for_status() data = resp.json() - return np.array(data["embeddings"][0], dtype=np.float32) + return np.array(data["data"][0]["embedding"], dtype=np.float32) def get_embeddings_batch( @@ -58,14 +60,15 @@ def get_embeddings_batch( for i in range(0, len(texts), batch_size): batch = texts[i : i + batch_size] resp = httpx.post( - f"{base_url}/api/embed", + f"{base_url}/v1/embeddings", + headers=_EMBED_HEADERS, json={"model": model, "input": batch}, timeout=60.0, ) resp.raise_for_status() data = resp.json() - for emb in data["embeddings"]: - all_embeddings.append(np.array(emb, dtype=np.float32)) + for item in data["data"]: + all_embeddings.append(np.array(item["embedding"], dtype=np.float32)) return all_embeddings diff --git a/src/neurostack/harvest.py b/src/neurostack/harvest.py index 9a623d4..c862dbf 100644 --- a/src/neurostack/harvest.py +++ b/src/neurostack/harvest.py @@ -215,19 +215,22 @@ def _llm_classify( ) try: + from .config import _auth_headers, get_config resp = httpx.post( - f"{llm_url}/api/generate", + f"{llm_url}/v1/chat/completions", + headers=_auth_headers(get_config().llm_api_key), json={ "model": llm_model, - "prompt": prompt, + "messages": [{"role": "user", "content": prompt}], "stream": False, - "options": {"temperature": 0.1, "num_predict": 500}, - 
"think": False, + "reasoning_effort": "none", + "temperature": 0.1, + "max_tokens": 500, }, timeout=60.0, ) resp.raise_for_status() - response = resp.json().get("response", "") + response = resp.json()["choices"][0]["message"]["content"] # Strip think tags if present response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL) except Exception as exc: diff --git a/src/neurostack/memories.py b/src/neurostack/memories.py index 9b7be97..97ee79c 100644 --- a/src/neurostack/memories.py +++ b/src/neurostack/memories.py @@ -1001,7 +1001,7 @@ def summarize_session( import httpx - from .config import get_config + from .config import _auth_headers, get_config cfg = get_config() llm_url = llm_url or cfg.llm_url @@ -1031,21 +1031,20 @@ def summarize_session( ) resp = httpx.post( - f"{llm_url}/api/generate", + f"{llm_url}/v1/chat/completions", + headers=_auth_headers(cfg.llm_api_key), json={ "model": llm_model, - "prompt": prompt, + "messages": [{"role": "user", "content": prompt}], "stream": False, - "options": { - "temperature": 0.3, - "num_predict": 200, - }, - "think": False, + "reasoning_effort": "none", + "temperature": 0.3, + "max_tokens": 200, }, timeout=120.0, ) resp.raise_for_status() - summary = resp.json().get("response", "").strip() + summary = resp.json()["choices"][0]["message"]["content"].strip() # Strip think tags if model includes them summary = re.sub( diff --git a/src/neurostack/summarizer.py b/src/neurostack/summarizer.py index 9a85536..79a41c4 100644 --- a/src/neurostack/summarizer.py +++ b/src/neurostack/summarizer.py @@ -1,14 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 2024-2026 Raphael Southall -"""Ollama summary client.""" +"""Summary client (OpenAI-compatible /v1/ endpoints).""" import httpx -from .config import get_config +from .config import _auth_headers, get_config _cfg = get_config() DEFAULT_SUMMARIZE_URL = _cfg.llm_url SUMMARIZE_MODEL = _cfg.llm_model +_LLM_HEADERS = _auth_headers(_cfg.llm_api_key) SUMMARY_PROMPT = """Summarize 
this note in 2-3 concise sentences. \ Focus on the key purpose, decisions, and actionable information. \ @@ -28,7 +29,7 @@ def summarize_note( base_url: str = DEFAULT_SUMMARIZE_URL, model: str = SUMMARIZE_MODEL, ) -> str: - """Generate a 2-3 sentence summary of a note using Ollama.""" + """Generate a 2-3 sentence summary of a note.""" # Truncate content to ~3000 chars to keep prompt reasonable if len(content) > 3000: content = content[:3000] + "\n[... truncated]" @@ -36,24 +37,23 @@ prompt = SUMMARY_PROMPT.format(title=title, content=content) resp = httpx.post( - f"{base_url}/api/generate", + f"{base_url}/v1/chat/completions", + headers=_LLM_HEADERS, json={ "model": model, - "prompt": prompt, + "messages": [{"role": "user", "content": prompt}], "stream": False, - "options": { - "temperature": 0.3, - "num_predict": 200, - }, - "think": False, + "reasoning_effort": "none", + "temperature": 0.3, + "max_tokens": 200, }, timeout=120.0, ) resp.raise_for_status() data = resp.json() - summary = data.get("response", "").strip() + summary = data["choices"][0]["message"]["content"].strip() - # Strip /think tags if the model includes them + # Strip think tags if model includes them despite reasoning_effort=none import re summary = re.sub(r"<think>.*?</think>", "", summary, flags=re.DOTALL).strip() @@ -109,22 +109,21 @@ def summarize_folder( prompt = FOLDER_SUMMARY_PROMPT.format(folder_path=folder_path, child_summaries=child_text) resp = httpx.post( - f"{base_url}/api/generate", + f"{base_url}/v1/chat/completions", + headers=_LLM_HEADERS, json={ "model": model, - "prompt": prompt, + "messages": [{"role": "user", "content": prompt}], "stream": False, - "options": { - "temperature": 0.3, - "num_predict": 200, - }, - "think": False, + "reasoning_effort": "none", + "temperature": 0.3, + "max_tokens": 200, }, timeout=120.0, ) resp.raise_for_status() data = resp.json() - summary = data.get("response", "").strip() + summary = data["choices"][0]["message"]["content"].strip() 
import re summary = re.sub(r"<think>.*?</think>", "", summary, flags=re.DOTALL).strip() diff --git a/src/neurostack/triples.py b/src/neurostack/triples.py index 5e18fb5..c38f0c3 100644 --- a/src/neurostack/triples.py +++ b/src/neurostack/triples.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 2024-2026 Raphael Southall -"""Knowledge graph triple extraction using Ollama. +"""Knowledge graph triple extraction (OpenAI-compatible /v1/ endpoints). Extracts Subject-Predicate-Object triples from vault notes for token-efficient structured retrieval (~10-20 tokens per fact vs @@ -9,16 +9,18 @@ import json import logging +import re import httpx -from .config import get_config +from .config import _auth_headers, get_config log = logging.getLogger("neurostack") _cfg = get_config() DEFAULT_SUMMARIZE_URL = _cfg.llm_url TRIPLE_MODEL = _cfg.llm_model +_LLM_HEADERS = _auth_headers(_cfg.llm_api_key) TRIPLE_PROMPT = """Extract knowledge graph triples from this note. \ Each triple is a (subject, predicate, object) fact. 
@@ -60,37 +62,26 @@ def extract_triples( prompt = TRIPLE_PROMPT.format(title=title, content=content) - triple_schema = { - "type": "array", - "items": { - "type": "object", - "properties": { - "s": {"type": "string"}, - "p": {"type": "string"}, - "o": {"type": "string"}, - }, - "required": ["s", "p", "o"], - }, - } - resp = httpx.post( - f"{base_url}/api/generate", + f"{base_url}/v1/chat/completions", + headers=_LLM_HEADERS, json={ "model": model, - "prompt": prompt, + "messages": [{"role": "user", "content": prompt}], "stream": False, - "format": triple_schema, - "options": { - "temperature": 0.2, - "num_predict": 2048, - }, - "think": False, + "reasoning_effort": "none", + "temperature": 0.2, + "max_tokens": 2048, }, timeout=180.0, ) resp.raise_for_status() data = resp.json() - raw = data.get("response", "").strip() + raw = data["choices"][0]["message"]["content"].strip() + # Strip markdown fences if present + if raw.startswith("```"): + raw = re.sub(r"^```\w*\n?", "", raw) + raw = re.sub(r"\n?```$", "", raw).strip() try: triples = json.loads(raw)