diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..cf2f5b8 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,33 @@ +name: Tests + +on: + pull_request: + push: + branches: + - main + +jobs: + unittest: + name: Run unittest suite + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6.2.0 + with: + python-version: '3.12' + cache: 'pip' + cache-dependency-path: | + pyproject.toml + uv.lock + + - name: Install project + run: | + python -m pip install --upgrade pip + pip install . + + - name: Run tests + run: python -m unittest discover -s tests -p 'test*.py' diff --git a/.gitignore b/.gitignore index 16484bb..ee0561d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ dist/ uv.lock -.venv/ \ No newline at end of file +.venv/ +*.pyc +__pycache__ +.vscode \ No newline at end of file diff --git a/POWER.md b/POWER.md new file mode 100644 index 0000000..205f1f7 --- /dev/null +++ b/POWER.md @@ -0,0 +1,44 @@ +--- +name: "atproto" +displayName: "AT Protocol Docs & Lexicons" +description: "Search AT Protocol docs, lexicons, Bluesky API docs, and cookbook examples with atproto-mcp" +keywords: ["atproto", "bluesky", "lexicon", "mcp", "api docs", "cookbook", "federation", "firehose"] +--- + +# Onboarding + +## Step 1: Verify runtime tools + +Before using this power, ensure one of these is available: + +- `uvx` (recommended): verify with `uvx --version` +- `python` + `pip`: verify with `python --version` + +## Step 2: Configure environment (optional) + +You can customize cache/index behavior with environment variables: + +- `ATPROTO_MCP_CACHE_DIR` +- `ATPROTO_MCP_REFRESH_HOURS` +- `ATPROTO_MCP_EMBEDDING_MODEL` + +## Best Practices + +- Start broad with `search_atproto_docs`, then narrow with `get_lexicon` and `search_bsky_api`. +- Use `list_lexicons` to discover valid NSIDs before requesting a full schema. 
+- Use `list_cookbook_examples` before `get_cookbook_example` when you need language-specific starter code. +- Use `refresh_sources` when you suspect upstream docs changed. + +## Suggested Workflows + +### Explore a namespace + +1. Run `list_lexicons` with a namespace prefix (for example `app.bsky.feed`). +2. Fetch target schemas with `get_lexicon`. +3. Cross-check implementation details with `search_atproto_docs`. + +### Build a feature + +1. Search concepts and endpoint behavior with `search_atproto_docs`. +2. Inspect canonical schemas with `get_lexicon`. +3. Find implementation references with cookbook tools. diff --git a/README.md b/README.md index 09016aa..d59a5f0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # atproto-mcp +[![Tests](https://github.com/Ashex/atproto-mcp/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/Ashex/atproto-mcp/actions/workflows/tests.yml) + MCP server providing a searchable knowledge base for the [AT Protocol](https://atproto.com/) ecosystem — protocol documentation, lexicon schemas, Bluesky developer API docs, and cookbook examples — powered by [txtai](https://github.com/neuml/txtai) semantic search. ## Data Sources @@ -76,6 +78,12 @@ Add to `.vscode/mcp.json` in your workspace: } ``` +### Kiro Power + +1. Open **Kiro → Powers** +2. Select **Import power from GitHub** +3. Enter `https://github.com/ashex/atproto-mcp` + ### Claude Desktop Add to `~/Library/Application Support/Claude/claude_desktop_config.json`: @@ -105,10 +113,10 @@ Add to `~/Library/Application Support/Claude/claude_desktop_config.json`: On first launch, the server: -1. **Clones** the 4 source repositories into `~/.cache/atproto-mcp/repos/` (shallow clones, ~minutes) -2. **Parses** MDX documentation, lexicon JSON schemas, and cookbook examples into text chunks -3. **Indexes** all chunks using txtai with the `all-MiniLM-L6-v2` sentence-transformer model (~80MB, runs locally on CPU) -4. 
**Persists** the index to `~/.cache/atproto-mcp/index/` for fast subsequent starts +1. Shallow clones the repos into `~/.cache/atproto-mcp/repos/` +2. Parses MDX docs, lexicon schemas, and cookbook examples into text chunks +3. Indexes the chunks using txtai with the `all-MiniLM-L6-v2` sentence-transformer model (~80MB, runs locally) +4. Persists the index to `~/.cache/atproto-mcp/index/` for subsequent starts On subsequent launches, the cached index loads in seconds. Repos older than 24 hours are automatically refreshed with `git pull`. diff --git a/mcp.json b/mcp.json new file mode 100644 index 0000000..b977824 --- /dev/null +++ b/mcp.json @@ -0,0 +1,15 @@ +{ + "mcpServers": { + "atproto": { + "command": "uvx", + "args": [ + "atproto-mcp" + ], + "env": { + "ATPROTO_MCP_CACHE_DIR": "${ATPROTO_MCP_CACHE_DIR}", + "ATPROTO_MCP_REFRESH_HOURS": "${ATPROTO_MCP_REFRESH_HOURS}", + "ATPROTO_MCP_EMBEDDING_MODEL": "${ATPROTO_MCP_EMBEDDING_MODEL}" + } + } + } +} diff --git a/pyproject.toml b/pyproject.toml index a726fdd..a01b164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "atproto-mcp" -version = "0.1.1" +version = "0.1.2" description = "MCP server providing AT Protocol documentation, lexicons, Bluesky API docs, and cookbook examples as a searchable knowledge base powered by txtai semantic search." 
readme = "README.md" requires-python = ">=3.10" diff --git a/src/atproto_mcp/indexer.py b/src/atproto_mcp/indexer.py index 1e1296c..01eeabe 100644 --- a/src/atproto_mcp/indexer.py +++ b/src/atproto_mcp/indexer.py @@ -139,18 +139,13 @@ def search( if not self._embeddings: return [] - if source: - results = self._embeddings.search( - f"select id, text, score from txtai where similar('{_escape_sql(query)}') and " - f"tags is not null limit {limit}" - ) - else: - results = self._embeddings.search(query, limit=limit) + fetch_limit = limit if not source else max(limit * 5, 50) + results = self._embeddings.search(query, limit=fetch_limit) return self._enrich_results( list(results) if isinstance(results, list) else [], # type: ignore[arg-type] source_filter=source, - ) + )[:limit] def search_lexicons(self, query: str, limit: int = 10) -> list[dict[str, object]]: """Semantic search specifically within lexicons.""" @@ -256,11 +251,6 @@ def lexicon_count(self) -> int: return len(self._lexicon_map) -def _escape_sql(value: str) -> str: - """Escape single quotes for txtai SQL queries.""" - return value.replace("'", "''") - - def build_knowledge_base(config: Config, chunks: list[ContentChunk]) -> KnowledgeBase: """Build a new knowledge base from parsed content chunks.""" kb = KnowledgeBase(config) diff --git a/tests/test_source_filtered_search_regression.py b/tests/test_source_filtered_search_regression.py new file mode 100644 index 0000000..caeeea9 --- /dev/null +++ b/tests/test_source_filtered_search_regression.py @@ -0,0 +1,88 @@ +"""Regression tests for source-filtered semantic search.""" + +from __future__ import annotations + +import unittest + +from atproto_mcp.config import Config +from atproto_mcp.indexer import KnowledgeBase +from atproto_mcp.parser import ContentChunk + + +class _FakeEmbeddings: + def __init__(self, rows: list[dict[str, object]]) -> None: + self._rows = rows + + def search(self, query: str, limit: int = 10) -> list[dict[str, object]]: + return 
self._rows[:limit] + + +class SourceFilteredSearchRegressionTests(unittest.TestCase): + def test_search_lexicons_returns_source_filtered_results(self) -> None: + kb = KnowledgeBase(Config()) + + kb._embeddings = _FakeEmbeddings( + [ + {"id": "a", "text": "lexicon schema", "score": 0.9}, + {"id": "b", "text": "lexicon schema", "score": 0.8}, + ] + ) + kb._chunks_by_uid = { + "a": ContentChunk( + text="", + source="bsky-docs", + file_path="docs/a.md", + title="A", + ), + "b": ContentChunk( + text="", + source="lexicons", + file_path="lexicons/b.json", + title="B", + nsid="com.atproto.lexicon.schema", + ), + } + + results = kb.search_lexicons("com.atproto.lexicon.schema record type", limit=1) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0]["source"], "lexicons") + self.assertEqual(results[0]["nsid"], "com.atproto.lexicon.schema") + + def test_search_atproto_website_source_filter_returns_matching_rows(self) -> None: + kb = KnowledgeBase(Config()) + + kb._embeddings = _FakeEmbeddings( + [ + {"id": "x", "text": "dns txt", "score": 0.7}, + {"id": "y", "text": "dns txt", "score": 0.6}, + ] + ) + kb._chunks_by_uid = { + "x": ContentChunk( + text="", + source="lexicons", + file_path="lexicons/x.json", + title="X", + ), + "y": ContentChunk( + text="", + source="atproto-website", + file_path="specs/y.mdx", + title="En > DNS TXT Method", + ), + } + + results = kb.search( + "lexicon schema record DNS TXT _lexicon authority resolution PDS serving", + source="atproto-website", + limit=1, + ) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0]["source"], "atproto-website") + self.assertEqual(results[0]["title"], "En > DNS TXT Method") + + +if __name__ == "__main__": + unittest.main()