Skip to content

Commit 9975e72

Browse files
committed
fix: improve search quality and query deduplication
- FTS5 multi-word queries now use AND logic instead of phrase matching, so "graph store" finds GraphStore and related nodes - callers_of/callees_of/inheritors_of deduplicate results by qualified name (multiple call-site edges no longer produce duplicate nodes) - Ambiguous bare-name queries auto-resolve to the production function when exactly one non-test candidate exists - Test functions receive a 0.5x score penalty in hybrid search so production code ranks higher - search_nodes now uses FTS5 as fast path, falling back to LIKE only when FTS5 returns no results - Add v6 migration with composite edge index for upsert_edge performance
1 parent afae1c4 commit 9975e72

4 files changed

Lines changed: 71 additions & 21 deletions

File tree

code_review_graph/graph.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -306,24 +306,43 @@ def get_all_files(self) -> list[str]:
306306
return [r["file_path"] for r in rows]
307307

308308
def search_nodes(self, query: str, limit: int = 20) -> list[GraphNode]:
309-
"""Keyword search across node names with multi-word AND logic.
309+
"""Keyword search across node names.
310310
311-
Each word in the query must match independently (case-insensitive)
312-
against the node name or qualified name. For example,
313-
``"firebase auth"`` matches ``verify_firebase_token`` and
314-
``FirebaseAuth`` but not ``get_user``.
311+
Tries FTS5 first (fast, tokenized matching), then falls back to
312+
LIKE-based substring search when FTS5 returns no results.
315313
"""
316-
words = query.lower().split()
314+
words = query.split()
317315
if not words:
318316
return []
319317

318+
# Phase 1: FTS5 search (uses the indexed nodes_fts table)
319+
try:
320+
if len(words) == 1:
321+
fts_query = '"' + query.replace('"', '""') + '"'
322+
else:
323+
fts_query = " AND ".join(
324+
'"' + w.replace('"', '""') + '"' for w in words
325+
)
326+
rows = self._conn.execute(
327+
"SELECT n.* FROM nodes_fts f "
328+
"JOIN nodes n ON f.rowid = n.id "
329+
"WHERE nodes_fts MATCH ? LIMIT ?",
330+
(fts_query, limit),
331+
).fetchall()
332+
if rows:
333+
return [self._row_to_node(r) for r in rows]
334+
except Exception:
335+
pass # FTS5 table may not exist on older schemas
336+
337+
# Phase 2: LIKE fallback (substring matching)
320338
conditions: list[str] = []
321339
params: list[str | int] = []
322340
for word in words:
341+
w = word.lower()
323342
conditions.append(
324343
"(LOWER(name) LIKE ? OR LOWER(qualified_name) LIKE ?)"
325344
)
326-
params.extend([f"%{word}%", f"%{word}%"])
345+
params.extend([f"%{w}%", f"%{w}%"])
327346

328347
where = " AND ".join(conditions)
329348
sql = f"SELECT * FROM nodes WHERE {where} LIMIT ?" # nosec B608

code_review_graph/migrations.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,15 @@ def _migrate_v5(conn: sqlite3.Connection) -> None:
156156
logger.info("Migration v5: created nodes_fts FTS5 virtual table")
157157

158158

159+
def _migrate_v6(conn: sqlite3.Connection) -> None:
160+
"""v6: Add composite index on edges for upsert_edge performance."""
161+
conn.execute("""
162+
CREATE INDEX IF NOT EXISTS idx_edges_composite
163+
ON edges(kind, source_qualified, target_qualified, file_path, line)
164+
""")
165+
logger.info("Migration v6: created composite edge index")
166+
167+
159168
# ---------------------------------------------------------------------------
160169
# Migration registry
161170
# ---------------------------------------------------------------------------
@@ -165,6 +174,7 @@ def _migrate_v5(conn: sqlite3.Connection) -> None:
165174
3: _migrate_v3,
166175
4: _migrate_v4,
167176
5: _migrate_v5,
177+
6: _migrate_v6,
168178
}
169179

170180
LATEST_VERSION = max(MIGRATIONS.keys())

code_review_graph/search.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,14 @@ def _fts_search(
143143
Returns list of ``(node_id, bm25_score)`` tuples. The BM25 score is
144144
negated so higher = better (FTS5 returns negative BM25).
145145
"""
146-
# Sanitize: wrap in double quotes to prevent FTS5 operator injection
147-
safe_query = '"' + query.replace('"', '""') + '"'
146+
# Split multi-word queries into AND-joined terms so "graph store" matches
147+
# both "GraphStore" and nodes containing both words (not just exact phrase).
148+
# Each term is quoted to prevent FTS5 operator injection.
149+
terms = query.split()
150+
if len(terms) <= 1:
151+
safe_query = '"' + query.replace('"', '""') + '"'
152+
else:
153+
safe_query = " AND ".join('"' + t.replace('"', '""') + '"' for t in terms)
148154

149155
try:
150156
rows = conn.execute(
@@ -357,6 +363,8 @@ def hybrid_search(
357363
boost *= kind_boosts["_qualified"]
358364
if context_set and file_path in context_set:
359365
boost *= 1.5
366+
if row["is_test"]:
367+
boost *= 0.5
360368

361369
boosted.append((node_id, score * boost))
362370

code_review_graph/tools/query.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -174,14 +174,20 @@ def query_graph(
174174
node = candidates[0]
175175
target = node.qualified_name
176176
elif len(candidates) > 1:
177-
return {
178-
"status": "ambiguous",
179-
"summary": (
180-
f"Multiple matches for '{target}'. "
181-
"Please use a qualified name."
182-
),
183-
"candidates": [node_to_dict(c) for c in candidates],
184-
}
177+
# Prefer non-test nodes when exactly one production candidate
178+
non_test = [c for c in candidates if not c.is_test]
179+
if len(non_test) == 1:
180+
node = non_test[0]
181+
target = node.qualified_name
182+
else:
183+
return {
184+
"status": "ambiguous",
185+
"summary": (
186+
f"Multiple matches for '{target}'. "
187+
"Please use a qualified name."
188+
),
189+
"candidates": [node_to_dict(c) for c in candidates],
190+
}
185191

186192
if not node and pattern != "file_summary":
187193
return {
@@ -192,10 +198,12 @@ def query_graph(
192198
qn = node.qualified_name if node else target
193199

194200
if pattern == "callers_of":
201+
seen_qn: set[str] = set()
195202
for e in store.get_edges_by_target(qn):
196203
if e.kind == "CALLS":
197204
caller = store.get_node(e.source_qualified)
198-
if caller:
205+
if caller and caller.qualified_name not in seen_qn:
206+
seen_qn.add(caller.qualified_name)
199207
results.append(node_to_dict(caller))
200208
edges_out.append(edge_to_dict(e))
201209
# Fallback: CALLS edges store unqualified target names
@@ -204,15 +212,18 @@ def query_graph(
204212
if not results and node:
205213
for e in store.search_edges_by_target_name(node.name):
206214
caller = store.get_node(e.source_qualified)
207-
if caller:
215+
if caller and caller.qualified_name not in seen_qn:
216+
seen_qn.add(caller.qualified_name)
208217
results.append(node_to_dict(caller))
209218
edges_out.append(edge_to_dict(e))
210219

211220
elif pattern == "callees_of":
221+
seen_qn: set[str] = set()
212222
for e in store.get_edges_by_source(qn):
213223
if e.kind == "CALLS":
214224
callee = store.get_node(e.target_qualified)
215-
if callee:
225+
if callee and callee.qualified_name not in seen_qn:
226+
seen_qn.add(callee.qualified_name)
216227
results.append(node_to_dict(callee))
217228
edges_out.append(edge_to_dict(e))
218229

@@ -261,10 +272,12 @@ def query_graph(
261272
results.append(node_to_dict(t))
262273

263274
elif pattern == "inheritors_of":
275+
seen_qn: set[str] = set()
264276
for e in store.get_edges_by_target(qn):
265277
if e.kind in ("INHERITS", "IMPLEMENTS"):
266278
child = store.get_node(e.source_qualified)
267-
if child:
279+
if child and child.qualified_name not in seen_qn:
280+
seen_qn.add(child.qualified_name)
268281
results.append(node_to_dict(child))
269282
edges_out.append(edge_to_dict(e))
270283

0 commit comments

Comments
 (0)