raphasouthall · raphasouthall · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/src/neurostack/ask.py b/src/neurostack/ask.py
@@ -8,7 +8,7 @@
 
 import httpx
 
-from .config import get_config
+from .config import _auth_headers, get_config
 from .search import hybrid_search
 
 ASK_PROMPT = """You are a knowledge assistant answering questions \
@@ -68,23 +68,23 @@ def ask_vault(
     sources_text = "\n\n---\n\n".join(source_blocks)
     prompt = ASK_PROMPT.format(sources=sources_text, question=question)
 
-    # Call Ollama LLM
+    # Call LLM (OpenAI-compatible endpoint)
     resp = httpx.post(
-        f"{llm_url}/api/generate",
+        f"{llm_url}/v1/chat/completions",
+        headers=_auth_headers(cfg.llm_api_key),
         json={
             "model": llm_model,
-            "prompt": prompt,
+            "messages": [{"role": "user", "content": prompt}],
             "stream": False,
-            "options": {
-                "temperature": 0.3,
-                "num_predict": 500,
-            },
-            "think": False,
+            "reasoning_effort": "none",
+            "temperature": 0.3,
+            "max_tokens": 500,
         },
         timeout=180.0,
     )
     resp.raise_for_status()
-    answer = resp.json().get("response", "").strip()
+    # "content" may be null in a chat completion; treat that as an empty answer
+    answer = (resp.json()["choices"][0]["message"]["content"] or "").strip()
 
     # Strip think tags if model includes them
     answer = re.sub(r"<think>.*?</think>", "", answer, flags=re.DOTALL).strip()

diff --git a/src/neurostack/cli.py b/src/neurostack/cli.py
@@ -763,6 +763,12 @@ def _do_init(vault_root, cfg, profession_name=None, run_index=False):
         f'embed_url = "{cfg.embed_url}"\n'
         f'llm_url = "{cfg.llm_url}"\n'
         f'llm_model = "{cfg.llm_model}"\n'
+    )
+    if cfg.llm_api_key:
+        config_text += f'llm_api_key = "{cfg.llm_api_key}"\n'
+    if cfg.embed_api_key:
+        config_text += f'embed_api_key = "{cfg.embed_api_key}"\n'
+    config_text += (
         f'\n[writeback]\n'
         f'enabled = {wb_enabled}\n'
         f'path = "{cfg.writeback_path}"\n'
@@ -836,8 +842,9 @@ def cmd_init(args):
     profession = _prompt("Profession pack", default="none", choices=prof_choices)
 
     # 3. LLM configuration
-    print("\n  \033[1mOllama Configuration\033[0m")
-    print("  NeuroStack uses Ollama for embeddings and summaries.\n")
+    print("\n  \033[1mLLM Configuration\033[0m")
+    print("  NeuroStack works with any OpenAI-compatible endpoint")
+    print("  (Ollama, vLLM, Together AI, Groq, OpenRouter, etc.)\n")
 
     embed_url = _prompt("Embedding endpoint", default=cfg.embed_url)
     llm_url = _prompt("LLM endpoint", default=cfg.llm_url)
@@ -850,6 +857,22 @@ def cmd_init(args):
     ]
     llm_model = _prompt("LLM model for summaries", default=cfg.llm_model, choices=model_choices)
 
+    # 3b. API keys (optional — only needed for cloud providers)
+    llm_api_key = ""
+    embed_api_key = ""
+    is_local = any(h in llm_url for h in ("localhost", "127.0.0.1", "0.0.0.0"))
+    if not is_local:
+        print("\n  \033[1mAPI Authentication\033[0m")
+        print("  Cloud providers require an API key.\n")
+        llm_api_key = _prompt("LLM API key", default="")
+        if embed_url != llm_url:
+            embed_api_key = _prompt("Embedding API key", default="")
+        else:
+            embed_api_key = llm_api_key
+    elif _confirm("\n  Configure API keys? (only needed for cloud providers)", default=False):
+        llm_api_key = _prompt("LLM API key", default="")
+        embed_api_key = _prompt("Embedding API key", default=llm_api_key)
+
     # 4. Write-back
     print(
         "\n  Enable memory write-back? Memories will be"
@@ -862,12 +885,14 @@ def cmd_init(args):
 
     # Show summary
     wb_label = "yes" if writeback else "no"
+    auth_label = "yes" if (llm_api_key or embed_api_key) else "no"
     print("\n  \033[1m━━━ Summary ━━━\033[0m\n")
     print(f"  Vault:      {vault_root}")
     print(f"  Profession: {profession}")
     print(f"  Embed URL:  {embed_url}")
     print(f"  LLM URL:    {llm_url}")
     print(f"  LLM model:  {llm_model}")
+    print(f"  API auth:   {auth_label}")
     print(f"  Write-back: {wb_label}")
     print(f"  Index now:  {'yes' if run_index else 'no'}")
 
@@ -880,6 +905,8 @@ def cmd_init(args):
     cfg.embed_url = embed_url
     cfg.llm_url = llm_url
     cfg.llm_model = llm_model
+    cfg.llm_api_key = llm_api_key
+    cfg.embed_api_key = embed_api_key
     cfg.writeback_enabled = writeback
 
     _do_init(

diff --git a/src/neurostack/community.py b/src/neurostack/community.py
@@ -29,12 +29,13 @@
 
 log = logging.getLogger("neurostack")
 
-from .config import get_config
+from .config import _auth_headers, get_config
 
 _cfg = get_config()
 SUMMARIZE_URL = _cfg.llm_url
 EMBED_URL = _cfg.embed_url
 SUMMARIZE_MODEL = _cfg.llm_model
+_LLM_HEADERS = _auth_headers(_cfg.llm_api_key)
 
 COMMUNITY_PROMPT = (
     "You are summarizing a cluster of thematically"
@@ -134,29 +135,27 @@ def _generate_community_summary(
         note_summaries=notes_str or "(none)",
     )
 
-    schema = {
-        "type": "object",
-        "properties": {
-            "title": {"type": "string"},
-            "summary": {"type": "string"},
-        },
-        "required": ["title", "summary"],
-    }
-
     resp = httpx.post(
-        f"{base_url}/api/generate",
+        f"{base_url}/v1/chat/completions",
+        headers=_LLM_HEADERS,
         json={
             "model": model,
-            "prompt": prompt,
+            "messages": [{"role": "user", "content": prompt}],
             "stream": False,
-            "format": schema,
-            "options": {"temperature": 0.3, "num_predict": 512},
-            "think": False,
+            "reasoning_effort": "none",
+            "temperature": 0.3,
+            "max_tokens": 512,
         },
         timeout=120.0,
     )
     resp.raise_for_status()
-    raw = resp.json().get("response", "").strip()
+    # "content" may be null; fall back to "" as the old .get("response", "") did
+    raw = (resp.json()["choices"][0]["message"]["content"] or "").strip()
+    # Strip markdown fences if present
+    if raw.startswith("```"):
+        import re
+        raw = re.sub(r"^```\w*\n?", "", raw)
+        raw = re.sub(r"\n?```$", "", raw).strip()
 
     try:
         parsed = json.loads(raw)

diff --git a/src/neurostack/community_search.py b/src/neurostack/community_search.py
@@ -24,12 +24,13 @@
 
 log = logging.getLogger("neurostack")
 
-from .config import get_config
+from .config import _auth_headers, get_config
 
 _cfg = get_config()
 SUMMARIZE_URL = _cfg.llm_url
 EMBED_URL = _cfg.embed_url
 SUMMARIZE_MODEL = _cfg.llm_model
+_LLM_HEADERS = _auth_headers(_cfg.llm_api_key)
 
 _MAP_PROMPT = """You are analyzing a knowledge community summary to answer a question.
 
@@ -197,18 +198,20 @@ def global_query(
         )
         try:
             resp = httpx.post(
-                f"{summarize_url}/api/generate",
+                f"{summarize_url}/v1/chat/completions",
+                headers=_LLM_HEADERS,
                 json={
                     "model": model,
-                    "prompt": prompt,
+                    "messages": [{"role": "user", "content": prompt}],
                     "stream": False,
-                    "options": {"temperature": 0.1, "num_predict": 256},
-                    "think": False,
+                    "reasoning_effort": "none",
+                    "temperature": 0.1,
+                    "max_tokens": 256,
                 },
                 timeout=60.0,
             )
             resp.raise_for_status()
-            finding = resp.json().get("response", "").strip()
+            finding = resp.json()["choices"][0]["message"]["content"].strip()
             if finding:
                 findings.append(f"[{hit['title']}]\n{finding}")
         except Exception as e:
@@ -228,18 +231,20 @@ def global_query(
     )
     try:
         resp = httpx.post(
-            f"{summarize_url}/api/generate",
+            f"{summarize_url}/v1/chat/completions",
+            headers=_LLM_HEADERS,
             json={
                 "model": model,
-                "prompt": reduce_prompt,
+                "messages": [{"role": "user", "content": reduce_prompt}],
                 "stream": False,
-                "options": {"temperature": 0.3, "num_predict": 1024},
-                "think": False,
+                "reasoning_effort": "none",
+                "temperature": 0.3,
+                "max_tokens": 1024,
             },
             timeout=120.0,
         )
         resp.raise_for_status()
-        answer = resp.json().get("response", "").strip()
+        answer = resp.json()["choices"][0]["message"]["content"].strip()
     except Exception as e:
         log.warning(f"Reduce step failed: {e}")
         answer = "\n\n".join(findings)

diff --git a/src/neurostack/config.py b/src/neurostack/config.py
@@ -28,6 +28,8 @@ class Config:
     # NOTE: Verify the license of any model you configure here.
     # phi3.5 is MIT licensed.
     llm_model: str = "phi3.5"
+    llm_api_key: str = ""
+    embed_api_key: str = ""
     session_dir: Path = field(default_factory=lambda: Path.home() / ".claude" / "projects")
     api_host: str = "127.0.0.1"
     api_port: int = 8000
@@ -57,7 +59,7 @@ def load_config() -> Config:
             if key in data:
                 setattr(cfg, key, Path(os.path.expanduser(data[key])))
         for key in ("embed_url", "embed_model", "llm_url", "llm_model",
-                    "api_host", "api_key"):
+                    "llm_api_key", "embed_api_key", "api_host", "api_key"):
             if key in data:
                 setattr(cfg, key, data[key])
         if "embed_dim" in data:
@@ -81,6 +83,8 @@ def load_config() -> Config:
         "NEUROSTACK_EMBED_DIM": ("embed_dim", int),
         "NEUROSTACK_LLM_URL": ("llm_url", str),
         "NEUROSTACK_LLM_MODEL": ("llm_model", str),
+        "NEUROSTACK_LLM_API_KEY": ("llm_api_key", str),
+        "NEUROSTACK_EMBED_API_KEY": ("embed_api_key", str),
         "NEUROSTACK_SESSION_DIR": ("session_dir", Path),
         "NEUROSTACK_API_HOST": ("api_host", str),
         "NEUROSTACK_API_PORT": ("api_port", int),
@@ -102,6 +106,13 @@ def load_config() -> Config:
     return cfg
 
 
+def _auth_headers(api_key: str) -> dict[str, str]:
+    """Return Bearer auth headers for *api_key*; an empty key yields ``{}``."""
+    if not api_key:
+        return {}
+    return {"Authorization": f"Bearer {api_key}"}
+
+
 # Module-level singleton
 _config: Config | None = None
 

diff --git a/src/neurostack/embedder.py b/src/neurostack/embedder.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright (c) 2024-2026 Raphael Southall
-"""Ollama embedding client."""
+"""Embedding client (OpenAI-compatible /v1/ endpoints)."""
 
 import json
 from typing import Optional
@@ -13,12 +13,13 @@
 except ImportError:
     HAS_NUMPY = False
 
-from .config import get_config
+from .config import _auth_headers, get_config
 
 _cfg = get_config()
 DEFAULT_EMBED_URL = _cfg.embed_url
 EMBED_MODEL = _cfg.embed_model
 EMBED_DIM = _cfg.embed_dim
+_EMBED_HEADERS = _auth_headers(_cfg.embed_api_key)
 
 
 def get_embedding(
@@ -33,13 +34,14 @@ def get_embedding(
             "Install with: pip install neurostack[full]"
         )
     resp = httpx.post(
-        f"{base_url}/api/embed",
+        f"{base_url}/v1/embeddings",
+        headers=_EMBED_HEADERS,
         json={"model": model, "input": text},
         timeout=30.0,
     )
     resp.raise_for_status()
     data = resp.json()
-    return np.array(data["embeddings"][0], dtype=np.float32)
+    return np.array(data["data"][0]["embedding"], dtype=np.float32)
 
 
 def get_embeddings_batch(
@@ -58,14 +60,15 @@ def get_embeddings_batch(
     for i in range(0, len(texts), batch_size):
         batch = texts[i : i + batch_size]
         resp = httpx.post(
-            f"{base_url}/api/embed",
+            f"{base_url}/v1/embeddings",
+            headers=_EMBED_HEADERS,
             json={"model": model, "input": batch},
             timeout=60.0,
         )
         resp.raise_for_status()
         data = resp.json()
-        for emb in data["embeddings"]:
-            all_embeddings.append(np.array(emb, dtype=np.float32))
+        for item in data["data"]:
+            all_embeddings.append(np.array(item["embedding"], dtype=np.float32))
     return all_embeddings
 
 

diff --git a/src/neurostack/harvest.py b/src/neurostack/harvest.py
@@ -215,19 +215,22 @@ def _llm_classify(
         )
 
         try:
+            from .config import _auth_headers, get_config
             resp = httpx.post(
-                f"{llm_url}/api/generate",
+                f"{llm_url}/v1/chat/completions",
+                headers=_auth_headers(get_config().llm_api_key),
                 json={
                     "model": llm_model,
-                    "prompt": prompt,
+                    "messages": [{"role": "user", "content": prompt}],
                     "stream": False,
-                    "options": {"temperature": 0.1, "num_predict": 500},
-                    "think": False,
+                    "reasoning_effort": "none",
+                    "temperature": 0.1,
+                    "max_tokens": 500,
                 },
                 timeout=60.0,
             )
             resp.raise_for_status()
-            response = resp.json().get("response", "")
+            response = resp.json()["choices"][0]["message"]["content"]
             # Strip think tags if present
             response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
         except Exception as exc: