From 052ae9265560204e03c83720c7f9c6c265ab2c6b Mon Sep 17 00:00:00 2001 From: DK09876 Date: Mon, 8 Jun 2026 16:13:14 -0700 Subject: [PATCH] docs(integrations): add Hindsight long-term memory integration Adds a docs page for hindsight-litellm, which wraps litellm.completion to inject relevant long-term memories before each call and store conversations back to Hindsight afterward. Works with any LiteLLM provider, against Hindsight Cloud or a self-hosted server. - New page docs/integrations/hindsight.md (modeled on the Letta page) - Sidebar entry under Agent SDKs, next to Letta - NavigationCard on the integrations index --- docs/integrations/hindsight.md | 209 +++++++++++++++++++++++++++++++++ docs/integrations/index.md | 6 + sidebars.js | 1 + 3 files changed, 216 insertions(+) create mode 100644 docs/integrations/hindsight.md diff --git a/docs/integrations/hindsight.md b/docs/integrations/hindsight.md new file mode 100644 index 000000000..c21ae9e89 --- /dev/null +++ b/docs/integrations/hindsight.md @@ -0,0 +1,209 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Hindsight Integration + +[Hindsight](https://github.com/vectorize-io/hindsight) is an open-source long-term memory engine for LLM applications. The `hindsight-litellm` package adds persistent, cross-session memory to any of LiteLLM's 100+ providers: relevant memories are injected into your prompts before each call, and conversations are stored back to Hindsight afterward — automatically. + +## What is Hindsight? + +Hindsight gives your LLM app durable memory across sessions. With the LiteLLM integration you get: + +- **Automatic memory injection** — relevant context is retrieved and added to the prompt before each `completion()` call. +- **Automatic conversation storage** — exchanges are stored back to Hindsight (async by default) so they can be recalled later. +- **Two retrieval modes** — `recall` (raw memories) or `reflect` (a synthesized, query-focused summary). 
+- **Works with every LiteLLM provider** — OpenAI, Anthropic, Azure, Bedrock, Vertex AI, Groq, and more. + +It works against [Hindsight Cloud](https://hindsight.vectorize.io) out of the box, or against a self-hosted Hindsight server. + +## Prerequisites + +```bash +pip install hindsight-litellm +``` + +Get a free API key from [Hindsight Cloud](https://hindsight.vectorize.io), or run Hindsight [locally](https://github.com/vectorize-io/hindsight) and point at `http://localhost:8888`. + +## Quick Start + +`hindsight-litellm` wraps `litellm.completion`, so once it's enabled you keep calling LiteLLM exactly as before — memory injection and storage happen transparently. + + + + +```python +import os +import litellm +import hindsight_litellm + +os.environ["OPENAI_API_KEY"] = "your-openai-key" + +# 1. Configure Hindsight (defaults to Hindsight Cloud) +hindsight_litellm.configure( + api_key=os.environ["HINDSIGHT_API_KEY"], # from https://hindsight.vectorize.io +) + +# 2. Set the memory bank to read/write +hindsight_litellm.set_defaults( + bank_id="my-agent", + use_reflect=True, # synthesize a focused context summary +) + +# 3. Enable the integration (patches litellm.completion / acompletion) +hindsight_litellm.enable() + +# 4. 
Use LiteLLM as normal — memory is injected and stored automatically +response = litellm.completion( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "What did we decide about the database?"}], + hindsight_query="database decisions", # what to search memory for +) + +print(response.choices[0].message.content) +``` + + + + +```python +import os +import litellm +import hindsight_litellm + +os.environ["OPENAI_API_KEY"] = "your-openai-key" + +# Point at your own Hindsight server instead of Hindsight Cloud +hindsight_litellm.configure( + hindsight_api_url="http://localhost:8888", +) + +hindsight_litellm.set_defaults( + bank_id="my-agent", + use_reflect=True, +) + +hindsight_litellm.enable() + +response = litellm.completion( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "What did we decide about the database?"}], + hindsight_query="database decisions", +) + +print(response.choices[0].message.content) +``` + + + + +:::info +When `inject_memories` is enabled (the default), pass `hindsight_query` on each call to tell Hindsight what to search memory for. This keeps retrieval intentional and focused. +::: + +## Memory Modes + +Choose how retrieved memory is added to the prompt with `use_reflect`: + + + + +`reflect` asks Hindsight to synthesize a single, query-focused summary from the +underlying memories — useful when you want compact, reasoned context. + +```python +hindsight_litellm.set_defaults( + bank_id="my-agent", + use_reflect=True, + reflect_context="I am a support agent helping a returning customer.", +) +``` + + + + +`recall` injects the raw matching memories — useful when you want the model to +see the individual facts. 
+ +```python +hindsight_litellm.set_defaults( + bank_id="my-agent", + use_reflect=False, + max_memories=10, + fact_types=["world", "opinion"], +) +``` + + + + +## Per-Call Overrides + +Any default set via `set_defaults()` can be overridden on an individual call +using `hindsight_*` keyword arguments: + +```python +response = litellm.completion( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Where is Alice now?"}], + hindsight_query="Alice's current location", # required when inject_memories=True + hindsight_reflect_context="The team is mid-incident", # per-call reflect context + hindsight_bank_id="ops-team", # override the bank for this call +) +``` + +## How It Works + +``` +completion(query) ──► recall/reflect from Hindsight ──► inject into prompt + │ + ▼ + LLM call (any LiteLLM provider) + │ + store conversation to Hindsight ◄─┘ + │ + ▼ + response returned +``` + +1. You call `litellm.completion(...)` with a `hindsight_query`. +2. Hindsight retrieves relevant memory (raw via `recall`, or synthesized via `reflect`). +3. The memory is injected into the prompt (system message by default). +4. The enriched request is sent to your chosen LiteLLM provider. +5. The conversation is stored back to Hindsight (async by default) for future recall. +6. You get the response back exactly as a normal LiteLLM call. + +## Configuration Reference + +| Function | Key settings | +| --- | --- | +| `configure()` | `hindsight_api_url`, `api_key`, `inject_memories`, `store_conversations`, `sync_storage`, `injection_mode`, `excluded_models`, `verbose` | +| `set_defaults()` | `bank_id` (required), `use_reflect`, `budget`, `fact_types`, `max_memories`, `max_memory_tokens`, `reflect_context`, `document_id` | +| Per-call `hindsight_*` kwargs | `hindsight_query` (required when injecting), plus overrides for any default above | + +See the [package README](https://github.com/vectorize-io/hindsight/tree/main/hindsight-integrations/litellm) for the full option list. 
+ +## Direct SDK without `enable()` + +If you prefer not to patch `litellm.completion` globally, call the wrapper +directly — it has the same signature as `litellm.completion`: + +```python +import os +import hindsight_litellm + +hindsight_litellm.configure(api_key=os.environ["HINDSIGHT_API_KEY"]) +hindsight_litellm.set_defaults(bank_id="my-agent", use_reflect=True) + +response = hindsight_litellm.completion( + model="gpt-4o-mini", + messages=[{"role": "user", "content": "Catch me up."}], + hindsight_query="recent project status", +) +``` + +## Resources + +- [Hindsight on GitHub](https://github.com/vectorize-io/hindsight) +- [hindsight-litellm package](https://github.com/vectorize-io/hindsight/tree/main/hindsight-integrations/litellm) +- [Hindsight Cloud](https://hindsight.vectorize.io) +- [LiteLLM SDK Documentation](/docs/#litellm-python-sdk) +- [Custom Callbacks](/docs/observability/custom_callback) diff --git a/docs/integrations/index.md b/docs/integrations/index.md index 0ad934d5b..9e47fe5ff 100644 --- a/docs/integrations/index.md +++ b/docs/integrations/index.md @@ -278,6 +278,12 @@ items={[ description: "Build stateful LLM agents with persistent memory.", to: "./letta", }, + { + icon: "🗂️", + title: "Hindsight", + description: "Long-term memory across sessions for any provider.", + to: "./hindsight", + }, { icon: "🎙️", title: "LiveKit", diff --git a/sidebars.js b/sidebars.js index 00929d41b..b5299eb2b 100644 --- a/sidebars.js +++ b/sidebars.js @@ -188,6 +188,7 @@ const sidebars = { "tutorials/google_genai_sdk", "tutorials/livekit_xai_realtime", "integrations/letta", + "integrations/hindsight", { type: "doc", id: "tutorials/scalekit_agentkit", label: "Scalekit with LiteLLM" }, { type: "doc", id: "tutorials/instructor", label: "Instructor with LiteLLM" }, { type: "doc", id: "langchain/langchain", label: "LangChain with LiteLLM" },