diff --git a/src/neurostack/ask.py b/src/neurostack/ask.py
index 4bec66c..2463000 100644
--- a/src/neurostack/ask.py
+++ b/src/neurostack/ask.py
@@ -8,7 +8,7 @@
import httpx
-from .config import get_config
+from .config import _auth_headers, get_config
from .search import hybrid_search
ASK_PROMPT = """You are a knowledge assistant answering questions \
@@ -68,23 +68,22 @@ def ask_vault(
sources_text = "\n\n---\n\n".join(source_blocks)
prompt = ASK_PROMPT.format(sources=sources_text, question=question)
- # Call Ollama LLM
+ # Call LLM (OpenAI-compatible endpoint)
resp = httpx.post(
- f"{llm_url}/api/generate",
+ f"{llm_url}/v1/chat/completions",
+ headers=_auth_headers(cfg.llm_api_key),
json={
"model": llm_model,
- "prompt": prompt,
+ "messages": [{"role": "user", "content": prompt}],
"stream": False,
- "options": {
- "temperature": 0.3,
- "num_predict": 500,
- },
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.3,
+ "max_tokens": 500,
},
timeout=180.0,
)
resp.raise_for_status()
- answer = resp.json().get("response", "").strip()
+ answer = resp.json()["choices"][0]["message"]["content"].strip()
# Strip think tags if model includes them
answer = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()
diff --git a/src/neurostack/cli.py b/src/neurostack/cli.py
index af023d0..f7cf4cb 100644
--- a/src/neurostack/cli.py
+++ b/src/neurostack/cli.py
@@ -763,6 +763,12 @@ def _do_init(vault_root, cfg, profession_name=None, run_index=False):
f'embed_url = "{cfg.embed_url}"\n'
f'llm_url = "{cfg.llm_url}"\n'
f'llm_model = "{cfg.llm_model}"\n'
+ )
+ if cfg.llm_api_key:
+ config_text += f'llm_api_key = "{cfg.llm_api_key}"\n'
+ if cfg.embed_api_key:
+ config_text += f'embed_api_key = "{cfg.embed_api_key}"\n'
+ config_text += (
f'\n[writeback]\n'
f'enabled = {wb_enabled}\n'
f'path = "{cfg.writeback_path}"\n'
@@ -836,8 +842,9 @@ def cmd_init(args):
profession = _prompt("Profession pack", default="none", choices=prof_choices)
# 3. LLM configuration
- print("\n \033[1mOllama Configuration\033[0m")
- print(" NeuroStack uses Ollama for embeddings and summaries.\n")
+ print("\n \033[1mLLM Configuration\033[0m")
+ print(" NeuroStack works with any OpenAI-compatible endpoint")
+ print(" (Ollama, vLLM, Together AI, Groq, OpenRouter, etc.)\n")
embed_url = _prompt("Embedding endpoint", default=cfg.embed_url)
llm_url = _prompt("LLM endpoint", default=cfg.llm_url)
@@ -850,6 +857,22 @@ def cmd_init(args):
]
llm_model = _prompt("LLM model for summaries", default=cfg.llm_model, choices=model_choices)
+ # 3b. API keys (optional — only needed for cloud providers)
+ llm_api_key = ""
+ embed_api_key = ""
+ is_local = any(h in llm_url for h in ("localhost", "127.0.0.1", "0.0.0.0"))
+ if not is_local:
+ print("\n \033[1mAPI Authentication\033[0m")
+ print(" Cloud providers require an API key.\n")
+ llm_api_key = _prompt("LLM API key", default="")
+ if embed_url != llm_url:
+ embed_api_key = _prompt("Embedding API key", default="")
+ else:
+ embed_api_key = llm_api_key
+ elif _confirm("\n Configure API keys? (only needed for cloud providers)", default=False):
+ llm_api_key = _prompt("LLM API key", default="")
+ embed_api_key = _prompt("Embedding API key", default=llm_api_key)
+
# 4. Write-back
print(
"\n Enable memory write-back? Memories will be"
@@ -862,12 +885,14 @@ def cmd_init(args):
# Show summary
wb_label = "yes" if writeback else "no"
+ auth_label = "yes" if (llm_api_key or embed_api_key) else "no"
print("\n \033[1m━━━ Summary ━━━\033[0m\n")
print(f" Vault: {vault_root}")
print(f" Profession: {profession}")
print(f" Embed URL: {embed_url}")
print(f" LLM URL: {llm_url}")
print(f" LLM model: {llm_model}")
+ print(f" API auth: {auth_label}")
print(f" Write-back: {wb_label}")
print(f" Index now: {'yes' if run_index else 'no'}")
@@ -880,6 +905,8 @@ def cmd_init(args):
cfg.embed_url = embed_url
cfg.llm_url = llm_url
cfg.llm_model = llm_model
+ cfg.llm_api_key = llm_api_key
+ cfg.embed_api_key = embed_api_key
cfg.writeback_enabled = writeback
_do_init(
diff --git a/src/neurostack/community.py b/src/neurostack/community.py
index 8a2edfd..3383959 100644
--- a/src/neurostack/community.py
+++ b/src/neurostack/community.py
@@ -29,12 +29,13 @@
log = logging.getLogger("neurostack")
-from .config import get_config
+from .config import _auth_headers, get_config
_cfg = get_config()
SUMMARIZE_URL = _cfg.llm_url
EMBED_URL = _cfg.embed_url
SUMMARIZE_MODEL = _cfg.llm_model
+_LLM_HEADERS = _auth_headers(_cfg.llm_api_key)
COMMUNITY_PROMPT = (
"You are summarizing a cluster of thematically"
@@ -134,29 +135,26 @@ def _generate_community_summary(
note_summaries=notes_str or "(none)",
)
- schema = {
- "type": "object",
- "properties": {
- "title": {"type": "string"},
- "summary": {"type": "string"},
- },
- "required": ["title", "summary"],
- }
-
resp = httpx.post(
- f"{base_url}/api/generate",
+ f"{base_url}/v1/chat/completions",
+ headers=_LLM_HEADERS,
json={
"model": model,
- "prompt": prompt,
+ "messages": [{"role": "user", "content": prompt}],
"stream": False,
- "format": schema,
- "options": {"temperature": 0.3, "num_predict": 512},
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.3,
+ "max_tokens": 512,
},
timeout=120.0,
)
resp.raise_for_status()
- raw = resp.json().get("response", "").strip()
+ raw = resp.json()["choices"][0]["message"]["content"].strip()
+ # Strip markdown fences if present
+ if raw.startswith("```"):
+ import re
+ raw = re.sub(r"^```\w*\n?", "", raw)
+ raw = re.sub(r"\n?```$", "", raw).strip()
try:
parsed = json.loads(raw)
diff --git a/src/neurostack/community_search.py b/src/neurostack/community_search.py
index 6ab811f..fe5fce0 100644
--- a/src/neurostack/community_search.py
+++ b/src/neurostack/community_search.py
@@ -24,12 +24,13 @@
log = logging.getLogger("neurostack")
-from .config import get_config
+from .config import _auth_headers, get_config
_cfg = get_config()
SUMMARIZE_URL = _cfg.llm_url
EMBED_URL = _cfg.embed_url
SUMMARIZE_MODEL = _cfg.llm_model
+_LLM_HEADERS = _auth_headers(_cfg.llm_api_key)
_MAP_PROMPT = """You are analyzing a knowledge community summary to answer a question.
@@ -197,18 +198,20 @@ def global_query(
)
try:
resp = httpx.post(
- f"{summarize_url}/api/generate",
+ f"{summarize_url}/v1/chat/completions",
+ headers=_LLM_HEADERS,
json={
"model": model,
- "prompt": prompt,
+ "messages": [{"role": "user", "content": prompt}],
"stream": False,
- "options": {"temperature": 0.1, "num_predict": 256},
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.1,
+ "max_tokens": 256,
},
timeout=60.0,
)
resp.raise_for_status()
- finding = resp.json().get("response", "").strip()
+ finding = resp.json()["choices"][0]["message"]["content"].strip()
if finding:
findings.append(f"[{hit['title']}]\n{finding}")
except Exception as e:
@@ -228,18 +231,20 @@ def global_query(
)
try:
resp = httpx.post(
- f"{summarize_url}/api/generate",
+ f"{summarize_url}/v1/chat/completions",
+ headers=_LLM_HEADERS,
json={
"model": model,
- "prompt": reduce_prompt,
+ "messages": [{"role": "user", "content": reduce_prompt}],
"stream": False,
- "options": {"temperature": 0.3, "num_predict": 1024},
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.3,
+ "max_tokens": 1024,
},
timeout=120.0,
)
resp.raise_for_status()
- answer = resp.json().get("response", "").strip()
+ answer = resp.json()["choices"][0]["message"]["content"].strip()
except Exception as e:
log.warning(f"Reduce step failed: {e}")
answer = "\n\n".join(findings)
diff --git a/src/neurostack/config.py b/src/neurostack/config.py
index fdc7a94..a7755ee 100644
--- a/src/neurostack/config.py
+++ b/src/neurostack/config.py
@@ -28,6 +28,8 @@ class Config:
# NOTE: Verify the license of any model you configure here.
# phi3.5 is MIT licensed.
llm_model: str = "phi3.5"
+ llm_api_key: str = ""
+ embed_api_key: str = ""
session_dir: Path = field(default_factory=lambda: Path.home() / ".claude" / "projects")
api_host: str = "127.0.0.1"
api_port: int = 8000
@@ -57,7 +59,7 @@ def load_config() -> Config:
if key in data:
setattr(cfg, key, Path(os.path.expanduser(data[key])))
for key in ("embed_url", "embed_model", "llm_url", "llm_model",
- "api_host", "api_key"):
+ "llm_api_key", "embed_api_key", "api_host", "api_key"):
if key in data:
setattr(cfg, key, data[key])
if "embed_dim" in data:
@@ -81,6 +83,8 @@ def load_config() -> Config:
"NEUROSTACK_EMBED_DIM": ("embed_dim", int),
"NEUROSTACK_LLM_URL": ("llm_url", str),
"NEUROSTACK_LLM_MODEL": ("llm_model", str),
+ "NEUROSTACK_LLM_API_KEY": ("llm_api_key", str),
+ "NEUROSTACK_EMBED_API_KEY": ("embed_api_key", str),
"NEUROSTACK_SESSION_DIR": ("session_dir", Path),
"NEUROSTACK_API_HOST": ("api_host", str),
"NEUROSTACK_API_PORT": ("api_port", int),
@@ -102,6 +106,13 @@ def load_config() -> Config:
return cfg
+def _auth_headers(api_key: str) -> dict[str, str]:
+ """Build Authorization header dict if an API key is set."""
+ if api_key:
+ return {"Authorization": f"Bearer {api_key}"}
+ return {}
+
+
# Module-level singleton
_config: Config | None = None
diff --git a/src/neurostack/embedder.py b/src/neurostack/embedder.py
index 299cf21..a188ff8 100644
--- a/src/neurostack/embedder.py
+++ b/src/neurostack/embedder.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024-2026 Raphael Southall
-"""Ollama embedding client."""
+"""Embedding client (OpenAI-compatible /v1/ endpoints)."""
import json
from typing import Optional
@@ -13,12 +13,13 @@
except ImportError:
HAS_NUMPY = False
-from .config import get_config
+from .config import _auth_headers, get_config
_cfg = get_config()
DEFAULT_EMBED_URL = _cfg.embed_url
EMBED_MODEL = _cfg.embed_model
EMBED_DIM = _cfg.embed_dim
+_EMBED_HEADERS = _auth_headers(_cfg.embed_api_key)
def get_embedding(
@@ -33,13 +34,14 @@ def get_embedding(
"Install with: pip install neurostack[full]"
)
resp = httpx.post(
- f"{base_url}/api/embed",
+ f"{base_url}/v1/embeddings",
+ headers=_EMBED_HEADERS,
json={"model": model, "input": text},
timeout=30.0,
)
resp.raise_for_status()
data = resp.json()
- return np.array(data["embeddings"][0], dtype=np.float32)
+ return np.array(data["data"][0]["embedding"], dtype=np.float32)
def get_embeddings_batch(
@@ -58,14 +60,15 @@ def get_embeddings_batch(
for i in range(0, len(texts), batch_size):
batch = texts[i : i + batch_size]
resp = httpx.post(
- f"{base_url}/api/embed",
+ f"{base_url}/v1/embeddings",
+ headers=_EMBED_HEADERS,
json={"model": model, "input": batch},
timeout=60.0,
)
resp.raise_for_status()
data = resp.json()
- for emb in data["embeddings"]:
- all_embeddings.append(np.array(emb, dtype=np.float32))
+ for item in data["data"]:
+ all_embeddings.append(np.array(item["embedding"], dtype=np.float32))
return all_embeddings
diff --git a/src/neurostack/harvest.py b/src/neurostack/harvest.py
index 9a623d4..c862dbf 100644
--- a/src/neurostack/harvest.py
+++ b/src/neurostack/harvest.py
@@ -215,19 +215,22 @@ def _llm_classify(
)
try:
+ from .config import _auth_headers, get_config
resp = httpx.post(
- f"{llm_url}/api/generate",
+ f"{llm_url}/v1/chat/completions",
+ headers=_auth_headers(get_config().llm_api_key),
json={
"model": llm_model,
- "prompt": prompt,
+ "messages": [{"role": "user", "content": prompt}],
"stream": False,
- "options": {"temperature": 0.1, "num_predict": 500},
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.1,
+ "max_tokens": 500,
},
timeout=60.0,
)
resp.raise_for_status()
- response = resp.json().get("response", "")
+ response = resp.json()["choices"][0]["message"]["content"]
# Strip think tags if present
response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
except Exception as exc:
diff --git a/src/neurostack/memories.py b/src/neurostack/memories.py
index 9b7be97..97ee79c 100644
--- a/src/neurostack/memories.py
+++ b/src/neurostack/memories.py
@@ -1001,7 +1001,7 @@ def summarize_session(
import httpx
- from .config import get_config
+ from .config import _auth_headers, get_config
cfg = get_config()
llm_url = llm_url or cfg.llm_url
@@ -1031,21 +1031,20 @@ def summarize_session(
)
resp = httpx.post(
- f"{llm_url}/api/generate",
+ f"{llm_url}/v1/chat/completions",
+ headers=_auth_headers(cfg.llm_api_key),
json={
"model": llm_model,
- "prompt": prompt,
+ "messages": [{"role": "user", "content": prompt}],
"stream": False,
- "options": {
- "temperature": 0.3,
- "num_predict": 200,
- },
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.3,
+ "max_tokens": 200,
},
timeout=120.0,
)
resp.raise_for_status()
- summary = resp.json().get("response", "").strip()
+ summary = resp.json()["choices"][0]["message"]["content"].strip()
# Strip think tags if model includes them
summary = re.sub(
diff --git a/src/neurostack/summarizer.py b/src/neurostack/summarizer.py
index 9a85536..79a41c4 100644
--- a/src/neurostack/summarizer.py
+++ b/src/neurostack/summarizer.py
@@ -1,14 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024-2026 Raphael Southall
-"""Ollama summary client."""
+"""Summary client (OpenAI-compatible /v1/ endpoints)."""
import httpx
-from .config import get_config
+from .config import _auth_headers, get_config
_cfg = get_config()
DEFAULT_SUMMARIZE_URL = _cfg.llm_url
SUMMARIZE_MODEL = _cfg.llm_model
+_LLM_HEADERS = _auth_headers(_cfg.llm_api_key)
SUMMARY_PROMPT = """Summarize this note in 2-3 concise sentences. \
Focus on the key purpose, decisions, and actionable information. \
@@ -28,7 +29,7 @@ def summarize_note(
base_url: str = DEFAULT_SUMMARIZE_URL,
model: str = SUMMARIZE_MODEL,
) -> str:
- """Generate a 2-3 sentence summary of a note using Ollama."""
+ """Generate a 2-3 sentence summary of a note."""
# Truncate content to ~3000 chars to keep prompt reasonable
if len(content) > 3000:
content = content[:3000] + "\n[... truncated]"
@@ -36,24 +37,23 @@ def summarize_note(
prompt = SUMMARY_PROMPT.format(title=title, content=content)
resp = httpx.post(
- f"{base_url}/api/generate",
+ f"{base_url}/v1/chat/completions",
+ headers=_LLM_HEADERS,
json={
"model": model,
- "prompt": prompt,
+ "messages": [{"role": "user", "content": prompt}],
"stream": False,
- "options": {
- "temperature": 0.3,
- "num_predict": 200,
- },
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.3,
+ "max_tokens": 200,
},
timeout=120.0,
)
resp.raise_for_status()
data = resp.json()
- summary = data.get("response", "").strip()
+ summary = data["choices"][0]["message"]["content"].strip()
- # Strip /think tags if the model includes them
+ # Strip think tags if model includes them despite reasoning_effort=none
import re
summary = re.sub(r"<think>.*?</think>", "", summary, flags=re.DOTALL).strip()
@@ -109,22 +109,21 @@ def summarize_folder(
prompt = FOLDER_SUMMARY_PROMPT.format(folder_path=folder_path, child_summaries=child_text)
resp = httpx.post(
- f"{base_url}/api/generate",
+ f"{base_url}/v1/chat/completions",
+ headers=_LLM_HEADERS,
json={
"model": model,
- "prompt": prompt,
+ "messages": [{"role": "user", "content": prompt}],
"stream": False,
- "options": {
- "temperature": 0.3,
- "num_predict": 200,
- },
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.3,
+ "max_tokens": 200,
},
timeout=120.0,
)
resp.raise_for_status()
data = resp.json()
- summary = data.get("response", "").strip()
+ summary = data["choices"][0]["message"]["content"].strip()
import re
summary = re.sub(r"<think>.*?</think>", "", summary, flags=re.DOTALL).strip()
diff --git a/src/neurostack/triples.py b/src/neurostack/triples.py
index 5e18fb5..c38f0c3 100644
--- a/src/neurostack/triples.py
+++ b/src/neurostack/triples.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024-2026 Raphael Southall
-"""Knowledge graph triple extraction using Ollama.
+"""Knowledge graph triple extraction (OpenAI-compatible /v1/ endpoints).
Extracts Subject-Predicate-Object triples from vault notes for
token-efficient structured retrieval (~10-20 tokens per fact vs
@@ -9,16 +9,18 @@
import json
import logging
+import re
import httpx
-from .config import get_config
+from .config import _auth_headers, get_config
log = logging.getLogger("neurostack")
_cfg = get_config()
DEFAULT_SUMMARIZE_URL = _cfg.llm_url
TRIPLE_MODEL = _cfg.llm_model
+_LLM_HEADERS = _auth_headers(_cfg.llm_api_key)
TRIPLE_PROMPT = """Extract knowledge graph triples from this note. \
Each triple is a (subject, predicate, object) fact.
@@ -60,37 +62,26 @@ def extract_triples(
prompt = TRIPLE_PROMPT.format(title=title, content=content)
- triple_schema = {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "s": {"type": "string"},
- "p": {"type": "string"},
- "o": {"type": "string"},
- },
- "required": ["s", "p", "o"],
- },
- }
-
resp = httpx.post(
- f"{base_url}/api/generate",
+ f"{base_url}/v1/chat/completions",
+ headers=_LLM_HEADERS,
json={
"model": model,
- "prompt": prompt,
+ "messages": [{"role": "user", "content": prompt}],
"stream": False,
- "format": triple_schema,
- "options": {
- "temperature": 0.2,
- "num_predict": 2048,
- },
- "think": False,
+ "reasoning_effort": "none",
+ "temperature": 0.2,
+ "max_tokens": 2048,
},
timeout=180.0,
)
resp.raise_for_status()
data = resp.json()
- raw = data.get("response", "").strip()
+ raw = data["choices"][0]["message"]["content"].strip()
+ # Strip markdown fences if present
+ if raw.startswith("```"):
+ raw = re.sub(r"^```\w*\n?", "", raw)
+ raw = re.sub(r"\n?```$", "", raw).strip()
try:
triples = json.loads(raw)