From f97286eef86741e189ccf5ddf198ce1befd7b4a5 Mon Sep 17 00:00:00 2001 From: justrach <54503978+justrach@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:04:38 +0800 Subject: [PATCH 1/4] =?UTF-8?q?perf(search):=20cut=20searchContent=20hot?= =?UTF-8?q?=20path=20~2-4x=20=E2=80=94=20line-offset=20cache,=20doc=5Fid?= =?UTF-8?q?=20grouping,=20packed-key=20sorts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier 0: per-file newline-offset cache (LineOffsetCache) replaces per-query line rescans; grouping keys postings by doc_id with a contiguous-run fast path (one map op per unique file, not per hit); candidate sort packs the (is_doc, defines, count, first_seen) comparator into one u64 key over (key, idx) pairs. renderPlainSearch precomputes path priors once per file instead of twice per comparison. Rerank memoizes per-path facts across consecutive results and switches both final sorts block -> pdq (total-order comparators, identical permutation). Tier 1 hits_per_file gets the same contiguity fast path. Benchmark harness gains opt-in CODEDB_BENCH_CALLOC=1 (production c_allocator instead of DebugAllocator) and CODEDB_BENCH_BREAKDOWN=1 (per-tier ns). codedb repo, 300 iters, c_allocator: middleware 14us, database 15us, error 28us (was 88/65/107us pre-round-1). 814/814 tests, e2e MCP 20/20. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- src/benchmark.zig | 8 +- src/explore.zig | 525 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 434 insertions(+), 99 deletions(-) diff --git a/src/benchmark.zig b/src/benchmark.zig index 1dda8631..2729dcd9 100644 --- a/src/benchmark.zig +++ b/src/benchmark.zig @@ -88,6 +88,12 @@ fn benchSearch(explorer: *Explorer, query: []const u8, n: usize, alloc: std.mem. for (r) |e| alloc.free(e.line_text); alloc.free(r); } + if (cio.posixGetenv("CODEDB_BENCH_BREAKDOWN") != null) { + const b = explorer.last_search_breakdown; + var buf: [512]u8 = undefined; + const msg = std.fmt.bufPrint(&buf, " breakdown[{s}]: t0={d}ns t05={d}ns t1={d}ns t2={d}ns rerank={d}ns tier_reached={d} cands={d} results={d}\n", .{ query, b.tier0_ns, b.tier05_ns, b.tier1_ns, b.tier2_ns, b.rerank_ns, b.tier_reached, b.candidate_count, b.result_count }) catch ""; + cio.File.stderr().writeAll(msg) catch {}; + } return .{ .name = query, .kind = "search", .hits = hits, .avg_ns = total / n }; } @@ -274,7 +280,7 @@ pub fn main(init: std.process.Init.Minimal) !void { cio.setProcessArgs(init.args.vector); var gpa: std.heap.DebugAllocator(.{}) = .init; defer _ = gpa.deinit(); - const alloc = gpa.allocator(); + const alloc = if (cio.posixGetenv("CODEDB_BENCH_CALLOC") != null) std.heap.c_allocator else gpa.allocator(); var threaded: std.Io.Threaded = .init(alloc, .{}); defer threaded.deinit(); diff --git a/src/explore.zig b/src/explore.zig index 30a20b82..9046bbdc 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -743,10 +743,142 @@ const LexFreqPenalty = struct { return 1.0 - self.amp * norm; } }; + +/// Per-file newline-offset tables so Tier 0's line-number → line-text lookups +/// skip rescanning file bytes on every query. Entries self-validate against +/// the content slice (ptr+len) they were built from and are invalidated on +/// reindex/remove. Guarded by its own mutex because searchContent runs under +/// the Explorer's SHARED lock — concurrent readers may build entries. +const LineOffsetCache = struct { + const Entry = struct { + content_ptr: usize, + content_len: usize, + offsets: []u32, + }; + pub const Span = struct { + line: u32, + start: usize, + end: usize, + }; + + map: std.StringHashMap(Entry), + mu: cio.Mutex = .{}, + total_bytes: usize = 0, + + const MAX_BYTES: usize = 16 * 1024 * 1024; + + fn init(allocator: std.mem.Allocator) LineOffsetCache { + return .{ .map = std.StringHashMap(Entry).init(allocator) }; + } + + fn deinit(self: *LineOffsetCache) void { + var iter = self.map.iterator(); + while (iter.next()) |e| { + self.map.allocator.free(e.key_ptr.*); + self.map.allocator.free(e.value_ptr.offsets); + } + self.map.deinit(); + } + + fn clearLocked(self: *LineOffsetCache) void { + var iter = self.map.iterator(); + while (iter.next()) |e| { + self.map.allocator.free(e.key_ptr.*); + self.map.allocator.free(e.value_ptr.offsets); + } + self.map.clearRetainingCapacity(); + self.total_bytes = 0; + } + + fn clear(self: *LineOffsetCache) void { + self.mu.lock(); + defer self.mu.unlock(); + self.clearLocked(); + } + + fn invalidate(self: *LineOffsetCache, path: []const u8) void { + self.mu.lock(); + defer self.mu.unlock(); + if (self.map.fetchRemove(path)) |kv| { + self.total_bytes -= kv.value.offsets.len * @sizeOf(u32); + self.map.allocator.free(kv.key); + self.map.allocator.free(kv.value.offsets); + } + } + + fn buildOffsets(allocator: std.mem.Allocator, content: []const u8) ?[]u32 { + var offsets: std.ArrayList(u32) = .empty; + offsets.ensureTotalCapacity(allocator, @max(16, content.len / 32)) catch return null; + offsets.appendAssumeCapacity(0); + var pos: usize = 0; + while (std.mem.indexOfScalarPos(u8, content, pos, '\n')) |nl| { + pos = nl + 1; + offsets.append(allocator, @intCast(pos)) catch { + offsets.deinit(allocator); + return null; + }; + } + return offsets.toOwnedSlice(allocator) catch { + offsets.deinit(allocator); + return null; + }; + } + + /// Resolve ascending 1-based `target_lines` to byte spans in `content`, + /// building (and caching) the offset table for `path` on first touch. + /// Span semantics match std.mem.splitScalar(content, '\n'): a line ends + /// before its '\n'; the final line ends at content.len. Returns the + /// number of spans filled, or null when the table cannot be built (OOM) + /// — the caller falls back to the scanning path. + fn lineSpans(self: *LineOffsetCache, path: []const u8, content: []const u8, target_lines: []const u32, spans: []Span) ?usize { + self.mu.lock(); + defer self.mu.unlock(); + var offsets: []const u32 = undefined; + if (self.map.getPtr(path)) |e| { + if (e.content_ptr == @intFromPtr(content.ptr) and e.content_len == content.len) { + offsets = e.offsets; + } else { + const fresh = buildOffsets(self.map.allocator, content) orelse return null; + self.total_bytes -= e.offsets.len * @sizeOf(u32); + self.map.allocator.free(e.offsets); + e.* = .{ .content_ptr = @intFromPtr(content.ptr), .content_len = content.len, .offsets = fresh }; + self.total_bytes += fresh.len * @sizeOf(u32); + offsets = fresh; + } + } else { + const fresh = buildOffsets(self.map.allocator, content) orelse return null; + const key = self.map.allocator.dupe(u8, path) catch { + self.map.allocator.free(fresh); + return null; + }; + self.map.put(key, .{ .content_ptr = @intFromPtr(content.ptr), .content_len = content.len, .offsets = fresh }) catch { + self.map.allocator.free(fresh); + self.map.allocator.free(key); + return null; + }; + self.total_bytes += fresh.len * @sizeOf(u32); + offsets = fresh; + } + + var n: usize = 0; + for (target_lines) |ln| { + if (n >= spans.len) break; + if (ln == 0 or ln > offsets.len) continue; + const start: usize = offsets[ln - 1]; + const end: usize = if (ln < offsets.len) offsets[ln] - 1 else content.len; + spans[n] = .{ .line = ln, .start = start, .end = end }; + n += 1; + } + + if (self.total_bytes > MAX_BYTES) self.clearLocked(); + return n; + } +}; pub const Explorer = struct { outlines: std.StringHashMap(FileOutline), dep_graph: DependencyGraph, contents: ContentCache, + line_offsets: LineOffsetCache, symbol_index: std.StringHashMap(std.ArrayList(SymbolLocation)), /// False after a snapshot fast-load until ensureSymbolIndex runs (#564). symbol_index_complete: bool, @@ -832,6 +964,7 @@ pub const Explorer = struct { .outlines = std.StringHashMap(FileOutline).init(allocator), .dep_graph = DependencyGraph.init(allocator), .contents = try ContentCache.initAlloc(allocator, content_cache_capacity), + .line_offsets = LineOffsetCache.init(allocator), .symbol_index = std.StringHashMap(std.ArrayList(SymbolLocation)).init(allocator), .symbol_index_complete = true, .word_index = WordIndex.init(allocator), @@ -859,6 +992,7 @@ pub const Explorer = struct { self.symbol_index.deinit(); self.contents.deinit(); + self.line_offsets.deinit(); if (self.call_centrality) |*c| c.deinit(); if (self.call_graph) |*cg| cg.deinit(self.allocator); if (self.co_change) |*cc| git.freeCoChange(cc, self.allocator); @@ -911,6 +1045,7 @@ pub const Explorer = struct { self.mu.lock(); defer self.mu.unlock(); self.contents.clear(); + self.line_offsets.clear(); } pub fn releaseSecondaryIndexes(self: *Explorer) void { @@ -1028,6 +1163,7 @@ pub const Explorer = struct { // Last fallible step: put frees the prior cache value in place, so it // must run only once nothing after it can still need prior_content. try self.contents.put(stable_path, content); + self.line_offsets.invalidate(stable_path); outline_gop.value_ptr.* = persistent_outline; if (prior_outline) |*old_outline| old_outline.deinit(); @@ -1627,6 +1763,7 @@ pub const Explorer = struct { self.removeSymbolIndexFor(path); _ = self.skip_trigram_files.remove(path); self.contents.remove(path); + self.line_offsets.invalidate(path); self.word_index.removeFile(path); self.trigram_index.removeFile(path); @@ -2520,77 +2657,140 @@ pub const Explorer = struct { if (word_hits.len > 0) { const Tier0File = struct { path: []const u8, + doc_id: u32, count: u32, first_seen: usize, + // One past the ordinal of this file's last hit. Together with + // first_seen it bounds the file's posting run, so per-file + // target-line collection slices word_hits[first_seen..hits_end] + // instead of filtering the whole hit list per file. The + // doc_id filter stays, so the bounds are correct even if a + // file's hits were ever non-contiguous. + hits_end: usize, is_doc: bool, defines: bool, }; - var tier0_files_by_path = std.StringHashMap(Tier0File).init(allocator); - defer tier0_files_by_path.deinit(); - + // Keyed by doc_id, not path: with the contiguity fast path below + // the map sees one getOrPut per UNIQUE file, and a u32 hash is + // several times cheaper than re-hashing a ~40-byte path string. + // hitPath only runs on first sight of a doc_id. Invalid postings + // (hitPath == "") stay in the map as empty-path tombstones and + // are skipped when the candidate list is built. + var tier0_files_by_doc = std.AutoHashMap(u32, Tier0File).init(allocator); + defer tier0_files_by_doc.deinit(); + // Pre-size for the unique-file count so high-frequency words + // (hundreds of files) don't pay a rehash cascade while inserting. + tier0_files_by_doc.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {}; + + // Postings for one file are appended contiguously (indexFile + // processes whole files), so consecutive hits almost always share + // a doc_id — the cached-entry fast path turns one hash getOrPut + // PER HIT (~30µs on 1800-hit words) into one per unique file. + // last_entry is only dereferenced immediately after it was + // (re)set with no intervening map mutation, so it cannot dangle + // across a rehash. + var last_doc_id: u32 = 0; + var last_entry: ?*Tier0File = null; for (word_hits, 0..) |hit, ordinal| { - const hit_path = self.word_index.hitPath(hit); - if (hit_path.len == 0) continue; - const gop = tier0_files_by_path.getOrPut(hit_path) catch continue; + if (last_entry) |entry| { + if (hit.doc_id == last_doc_id) { + entry.count +|= 1; + entry.hits_end = ordinal + 1; + continue; + } + } + const gop = tier0_files_by_doc.getOrPut(hit.doc_id) catch continue; if (!gop.found_existing) { - const is_doc = isDocLanguage(detectLanguage(hit_path)); - const defines = !is_doc and self.fileDefinesSymbol(hit_path, query); + const hit_path = self.word_index.hitPath(hit); + const is_doc = hit_path.len > 0 and isDocLanguage(detectLanguage(hit_path)); + const defines = hit_path.len > 0 and !is_doc and self.fileDefinesSymbol(hit_path, query); gop.value_ptr.* = .{ .path = hit_path, + .doc_id = hit.doc_id, .count = 0, .first_seen = ordinal, + .hits_end = ordinal + 1, .is_doc = is_doc, .defines = defines, }; } gop.value_ptr.count +|= 1; + gop.value_ptr.hits_end = ordinal + 1; + last_doc_id = hit.doc_id; + last_entry = gop.value_ptr; } - var tier0_files: std.ArrayList(Tier0File) = .empty; defer tier0_files.deinit(allocator); - try tier0_files.ensureTotalCapacity(allocator, tier0_files_by_path.count()); - var tier0_iter = tier0_files_by_path.valueIterator(); + try tier0_files.ensureTotalCapacity(allocator, tier0_files_by_doc.count()); + var tier0_iter = tier0_files_by_doc.valueIterator(); while (tier0_iter.next()) |stats| { + if (stats.path.len == 0) continue; tier0_files.appendAssumeCapacity(stats.*); } - if (tier0_files.items.len > 1) { - std.sort.block(Tier0File, tier0_files.items, {}, struct { - pub fn lessThan(_: void, a: Tier0File, b: Tier0File) bool { - if (a.is_doc != b.is_doc) return !a.is_doc; - if (a.defines != b.defines) return a.defines; - if (a.count != b.count) return a.count > b.count; - if (a.first_seen != b.first_seen) return a.first_seen < b.first_seen; - return std.mem.lessThan(u8, a.path, b.path); + // Sort 12-byte (key, index) pairs instead of the 48-byte structs. + // The old comparator (is_doc asc, defines desc, count desc, + // first_seen asc, path asc) packs losslessly into one u64: + // first_seen is the ordinal of a file's first hit, unique per + // file, so the path tiebreak was unreachable. count saturates at + // 2^30−1; beyond that ties fall to first_seen, same as before. + const Tier0Order = struct { key: u64, idx: u32 }; + var tier0_order: std.ArrayList(Tier0Order) = .empty; + defer tier0_order.deinit(allocator); + try tier0_order.ensureTotalCapacity(allocator, tier0_files.items.len); + for (tier0_files.items, 0..) |stats, i| { + const cnt: u64 = @min(stats.count, (1 << 30) - 1); + const key = (@as(u64, @intFromBool(stats.is_doc)) << 63) | + (@as(u64, @intFromBool(!stats.defines)) << 62) | + ((((1 << 30) - 1) - cnt) << 32) | + @as(u64, @as(u32, @truncate(stats.first_seen))); + tier0_order.appendAssumeCapacity(.{ .key = key, .idx = @intCast(i) }); + } + if (tier0_order.items.len > 1) { + std.sort.pdq(Tier0Order, tier0_order.items, {}, struct { + pub fn lessThan(_: void, a: Tier0Order, b: Tier0Order) bool { + return a.key < b.key; } }.lessThan); } - const tier0_per_file_cap: usize = if (tier0_files.items.len <= 1) max_results else @max(1, max_results / 5); var tier0_exact_capacity: usize = 0; - for (tier0_files.items) |stats| { - tier0_exact_capacity += @min(@as(usize, stats.count), tier0_per_file_cap); + for (tier0_order.items) |ord| { + tier0_exact_capacity += @min(@as(usize, tier0_files.items[ord.idx].count), tier0_per_file_cap); if (tier0_exact_capacity >= max_results) break; } const use_line_hits = tier0_exact_capacity >= max_results and tier0_per_file_cap <= 256; - for (tier0_files.items) |stats| { + for (tier0_order.items) |ord| { + const stats = tier0_files.items[ord.idx]; if (result_list.items.len >= max_results) break; const ref = self.readContentForSearch(stats.path, allocator) orelse continue; defer ref.deinit(); if (use_line_hits) { var target_lines: [256]u32 = undefined; var target_count: usize = 0; - for (word_hits) |hit| { + for (word_hits[stats.first_seen..stats.hits_end]) |hit| { if (target_count >= tier0_per_file_cap) break; - const hit_path = self.word_index.hitPath(hit); - if (!std.mem.eql(u8, hit_path, stats.path)) continue; + if (hit.doc_id != stats.doc_id) continue; if (target_count == 0 or target_lines[target_count - 1] != hit.line_num) { target_lines[target_count] = hit.line_num; target_count += 1; } } - try appendTargetLineHits(stats.path, ref.data, allocator, target_lines[0..target_count], max_results, &result_list); + var spans: [256]LineOffsetCache.Span = undefined; + if (self.line_offsets.lineSpans(stats.path, ref.data, target_lines[0..target_count], &spans)) |n_spans| { + result_list.ensureUnusedCapacity(allocator, @min(n_spans, max_results - result_list.items.len)) catch {}; + for (spans[0..n_spans]) |sp| { + if (result_list.items.len >= max_results) break; + const line_text = try allocator.dupe(u8, ref.data[sp.start..sp.end]); + errdefer allocator.free(line_text); + const path_copy = try allocator.dupe(u8, stats.path); + errdefer allocator.free(path_copy); + try result_list.append(allocator, .{ .path = path_copy, .line_num = sp.line, .line_text = line_text }); + } + } else { + try appendTargetLineHits(stats.path, ref.data, allocator, target_lines[0..target_count], max_results, &result_list); + } if (result_list.items.len < max_results) searched.put(stats.path, {}) catch {}; } else { searched.put(stats.path, {}) catch {}; @@ -2659,12 +2859,28 @@ pub const Explorer = struct { // file behind unrelated short files when max_per_file was 1. var hits_per_file = std.StringHashMap(u32).init(allocator); defer hits_per_file.deinit(); + hits_per_file.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {}; + // Same contiguous-posting fast path as Tier 0's grouping: + // consecutive hits share a doc_id, so the per-hit hitPath + + // string getOrPut collapses to once per unique file. The + // cached pointer is only dereferenced immediately after being + // (re)set, so it cannot dangle across a rehash. + var hpf_last_doc: u32 = 0; + var hpf_last: ?*u32 = null; for (word_hits) |hit| { + if (hpf_last) |cnt| { + if (hit.doc_id == hpf_last_doc) { + cnt.* += 1; + continue; + } + } const hp = self.word_index.hitPath(hit); if (hp.len == 0) continue; const gop_h = try hits_per_file.getOrPut(hp); if (!gop_h.found_existing) gop_h.value_ptr.* = 0; gop_h.value_ptr.* += 1; + hpf_last_doc = hit.doc_id; + hpf_last = gop_h.value_ptr; } const SortCtx = struct { contents: *ContentCache, @@ -2790,12 +3006,26 @@ pub const Explorer = struct { path: []const u8, count: u32, first_seen: usize, + // One past the ordinal of this file's last hit — bounds the + // file's posting run for the per-file target-line collection + // below (the doc_id filter stays, so the bounds are correct even + // if a file's hits were ever non-contiguous). + hits_end: usize, is_doc: bool, }; var tier0_files_buf: [512]Tier0File = undefined; var tier0_files_len: usize = 0; for (word_hits, 0..) |hit, ordinal| { + // Postings for one file are appended contiguously (indexFile + // processes whole files), so consecutive hits almost always share + // a doc_id — checking the newest entry first turns the O(hits × + // files) linear rescan below into one pass per unique file. + if (tier0_files_len > 0 and tier0_files_buf[tier0_files_len - 1].doc_id == hit.doc_id) { + tier0_files_buf[tier0_files_len - 1].count +|= 1; + tier0_files_buf[tier0_files_len - 1].hits_end = ordinal + 1; + continue; + } const hit_path = self.word_index.hitPath(hit); if (hit_path.len == 0) continue; @@ -2808,6 +3038,7 @@ pub const Explorer = struct { } if (found_i) |i| { tier0_files_buf[i].count +|= 1; + tier0_files_buf[i].hits_end = ordinal + 1; } else { if (tier0_files_len >= tier0_files_buf.len) return false; tier0_files_buf[tier0_files_len] = .{ @@ -2815,6 +3046,7 @@ pub const Explorer = struct { .path = hit_path, .count = 1, .first_seen = ordinal, + .hits_end = ordinal + 1, .is_doc = isDocLanguage(detectLanguage(hit_path)), }; tier0_files_len += 1; @@ -2825,11 +3057,15 @@ pub const Explorer = struct { const tier0_files = tier0_files_buf[0..tier0_files_len]; if (tier0_files.len > 1) { const RankCtx = struct { - query: []const u8, + priors: []const f32, + files: []const Tier0File, // Path-prior portion of rerankSignalScore: the canonical-file signals // (basename-stem match, path segment) and demotion penalties. Without it // this fast-path rendered in raw hit-count order, so a high-frequency // non-canonical file outranked the canonical basename match. + // Computed ONCE per file into `priors` — the old shape + // recomputed both priors (basename + ~10 path-segment scans) + // inside the comparator on every comparison. fn prior(path: []const u8, q: []const u8) f32 { const base = std.fs.path.basename(path); const stem_end = std.mem.indexOfScalar(u8, base, '.') orelse base.len; @@ -2848,17 +3084,31 @@ pub const Explorer = struct { pathHasSegment(path, "third_party")) s *= 0.4; return s; } - pub fn lessThan(ctx: @This(), a: Tier0File, b: Tier0File) bool { - const pa = prior(a.path, ctx.query); - const pb = prior(b.path, ctx.query); + pub fn lessThan(ctx: @This(), ai: u32, bi: u32) bool { + const pa = ctx.priors[ai]; + const pb = ctx.priors[bi]; if (pa != pb) return pa > pb; + const a = ctx.files[ai]; + const b = ctx.files[bi]; if (a.is_doc != b.is_doc) return !a.is_doc; if (a.count != b.count) return a.count > b.count; if (a.first_seen != b.first_seen) return a.first_seen < b.first_seen; return std.mem.lessThan(u8, a.path, b.path); } }; - std.sort.block(Tier0File, tier0_files, RankCtx{ .query = query }, RankCtx.lessThan); + var priors_buf: [512]f32 = undefined; + var order_buf: [512]u32 = undefined; + for (tier0_files, 0..) |stats, i| { + priors_buf[i] = RankCtx.prior(stats.path, query); + order_buf[i] = @intCast(i); + } + const order = order_buf[0..tier0_files.len]; + std.sort.pdq(u32, order, RankCtx{ .priors = priors_buf[0..tier0_files.len], .files = tier0_files }, RankCtx.lessThan); + // Apply the permutation back into the value buffer so the render + // loops below keep iterating tier0_files directly. + var sorted_buf: [512]Tier0File = undefined; + for (order, 0..) |src, dst| sorted_buf[dst] = tier0_files[src]; + @memcpy(tier0_files, sorted_buf[0..tier0_files.len]); } const tier0_per_file_cap: usize = if (tier0_files.len <= 1) max_results else @max(1, max_results / 5); @@ -2895,7 +3145,7 @@ pub const Explorer = struct { var target_lines: [256]u32 = undefined; var target_count: usize = 0; - for (word_hits) |hit| { + for (word_hits[stats.first_seen..stats.hits_end]) |hit| { if (target_count >= tier0_per_file_cap) break; if (hit.doc_id != stats.doc_id) continue; if (target_count == 0 or target_lines[target_count - 1] != hit.line_num) { @@ -2915,17 +3165,11 @@ pub const Explorer = struct { } const content = self.contents.get(stats.path) orelse return false; - var target_i: usize = 0; - var line_num: u32 = 0; - var lines = std.mem.splitScalar(u8, content, '\n'); - while (lines.next()) |line| { - line_num += 1; - while (target_i < target_count and target_lines[target_i] < line_num) { - target_i += 1; - } - if (target_i >= target_count) break; - if (target_lines[target_i] != line_num) continue; - target_i += 1; + var spans: [256]LineOffsetCache.Span = undefined; + // OOM building the offset table → bail to the full searchContent + // path (caller falls through), which renders the same results. + const n_spans = self.line_offsets.lineSpans(stats.path, content, target_lines[0..target_count], &spans) orelse return false; + for (spans[0..n_spans]) |line_span| { rendered += 1; var count_idx: ?usize = null; @@ -2948,7 +3192,7 @@ pub const Explorer = struct { } } else { shown += 1; - try w.print(" {s}:{d}: {s}\n", .{ stats.path, line_num, line }); + try w.print(" {s}:{d}: {s}\n", .{ stats.path, line_span.line, content[line_span.start..line_span.end] }); } if (rendered >= max_results) break; } @@ -2993,7 +3237,6 @@ pub const Explorer = struct { if (gop.value_ptr.* > max_file_hits) max_file_hits = gop.value_ptr.*; } } - // #550: a single-token query that exactly names a known symbol gets the // call-graph distance boost here too (the multi-word BM25 path applies // it in searchContentRanked). The symbol_index gate keeps plain word @@ -3012,28 +3255,99 @@ pub const Explorer = struct { graph_dist = self.queryGraphDistances(&gd_terms, ga); } - // #550 signal 2: git co-change. Seeds are the result files that - // DEFINE the queried symbol, so plain word queries never trigger the - // one-time `git log` shell-out. + // Per-unique-path rerank facts, computed ONCE per path instead of per + // result — every path-level signal (outline definition scan, path + // priors, boost multipliers) is identical for all hits in the same + // file. `defines` feeds the co-change seeds (#550 signal 2: seeds are + // the result files that DEFINE the queried symbol, so plain word + // queries never trigger the one-time `git log` shell-out). + // Results arrive grouped by file (every tier emits per-file), so a + // consecutive-path fast path resolves most results with one + // std.mem.eql instead of a string hash + probe. + var facts_by_path = std.StringHashMap(PathRerankFacts).init(allocator); + defer facts_by_path.deinit(); + { + var last_path: []const u8 = ""; + for (result_list.items) |r| { + if (last_path.len > 0 and std.mem.eql(u8, r.path, last_path)) continue; + last_path = r.path; + const gop = facts_by_path.getOrPut(r.path) catch continue; + if (gop.found_existing) continue; + gop.value_ptr.* = self.pathRerankFacts(r.path, query); + } + } + var cc_seeds = std.StringHashMap(void).init(allocator); defer cc_seeds.deinit(); if (cio.posixGetenv("CODEDB_NO_COCHANGE") == null) { - for (result_list.items) |r| { - if (cc_seeds.contains(r.path)) continue; - if (self.fileDefinesSymbol(r.path, query)) cc_seeds.put(r.path, {}) catch {}; + var facts_iter = facts_by_path.iterator(); + while (facts_iter.next()) |entry| { + if (entry.value_ptr.defines) cc_seeds.put(entry.key_ptr.*, {}) catch {}; } if (cc_seeds.count() > 0) self.ensureCoChange(); } + // The boost multipliers depend on graph_dist / cc_seeds / the hit + // tally, so they fill in a second pass over the deduped path set. + var boosts_iter = facts_by_path.iterator(); + while (boosts_iter.next()) |entry| { + const path = entry.key_ptr.*; + entry.value_ptr.gd = graphDistanceBoost(graph_dist, path); + entry.value_ptr.cc = self.coChangeBoost(&cc_seeds, path); + if (lfp.enabled) entry.value_ptr.lfp_mult = lfp.multiplier(file_hit_counts.get(path) orelse 1, max_file_hits); + if (sp.enabled) entry.value_ptr.sp_mult = sp.multiplier(self, path); + } + + // Same consecutive-path memoization as the facts pass: the facts are + // copied by VALUE, so later map lookups can never be invalidated (the + // map is no longer mutated here anyway). + var score_last_path: []const u8 = ""; + var score_last_facts: PathRerankFacts = .{}; for (result_list.items) |*r| { - r.score = self.rerankSignalScore(r.*, query); - r.score *= graphDistanceBoost(graph_dist, r.path); - r.score *= self.coChangeBoost(&cc_seeds, r.path); - if (lfp.enabled) r.score *= lfp.multiplier(file_hit_counts.get(r.path) orelse 1, max_file_hits); - if (sp.enabled) r.score *= sp.multiplier(self, r.path); + if (score_last_path.len == 0 or !std.mem.eql(u8, r.path, score_last_path)) { + score_last_facts = facts_by_path.get(r.path) orelse PathRerankFacts{}; + score_last_path = r.path; + } + const facts = score_last_facts; + const def_line_match = blk: { + for (facts.def_lines[0..facts.def_count]) |ln| { + if (ln == r.line_num) break :blk true; + } + if (facts.def_overflow) { + if (self.outlines.get(r.path)) |outline| { + for (outline.symbols.items) |sym| { + if (sym.line_start == r.line_num and asciiEqlIgnoreCase(sym.name, query)) break :blk true; + } + } + } + break :blk false; + }; + var score: f32 = countOccurrences(r.line_text, query); + if (facts.is_tooling) score = @min(score, 2.0); + if (def_line_match) score += 5.0; + score += facts.add_boost; + if (facts.is_test) score *= 0.6; + if (facts.is_example) score *= 0.6; + if (facts.is_tooling) score *= 0.5; + if (facts.is_vendor) score *= 0.4; + // Doc-language penalty: markdown / data files (CHANGELOG.md, design + // docs, benchmark logs) often mention an identifier many times in a + // single line, which lets per-line frequency dwarf code call sites. + // For doc files, more mentions don't reflect more code-relevance — + // they reflect prose density. Cap at 1.0 then halve so any code hit + // (score >= 1) outranks any doc hit. Symmetric with path-prior. + if (facts.is_doc) score = @min(score, 1.0) * 0.5; + score *= facts.gd; + score *= facts.cc; + score *= facts.lfp_mult; + score *= facts.sp_mult; + r.score = score; } if (result_list.items.len > 1) { - std.sort.block(SearchResult, result_list.items, {}, struct { + // pdq, not block: (score, path, line_num) is a total order, so an + // unstable sort yields the identical permutation while moving the + // fat SearchResult structs far less. + std.sort.pdq(SearchResult, result_list.items, {}, struct { pub fn lessThan(_: void, a: SearchResult, b: SearchResult) bool { const sa = if (a.score == a.score) a.score else 0; const sb = if (b.score == b.score) b.score else 0; @@ -3048,71 +3362,86 @@ pub const Explorer = struct { return result_list.toOwnedSlice(allocator); } - /// Compose the rerank signals for one search hit (issue #429). - fn rerankSignalScore(self: *const Explorer, r: SearchResult, query: []const u8) f32 { - var score: f32 = countOccurrences(r.line_text, query); + /// Path-level rerank signals (issue #429), computed once per UNIQUE path + /// by rerankAndFinalize — every field here is identical for all hits in + /// the same file, so recomputing per result (the pre-fix shape) only + /// burned time. Per-result composition stays in rerankAndFinalize. + const PathRerankFacts = struct { + defines: bool = false, + def_lines: [16]u32 = undefined, + def_count: u8 = 0, + def_overflow: bool = false, + is_tooling: bool = false, + is_test: bool = false, + is_example: bool = false, + is_vendor: bool = false, + is_doc: bool = false, + add_boost: f32 = 0, + gd: f32 = 1, + cc: f32 = 1, + lfp_mult: f32 = 1, + sp_mult: f32 = 1, + }; - // #598: mention-dense tooling files (a bench script repeating the term - // six times per line) saturate the per-line count and shrug off the - // ×0.5 path prior below. Cap the occurrence BASE for tooling paths - // before the stem/symbol boosts so density cannot dominate, while an - // eponymous lookup (query 'install' → install/install.sh) still wins - // through its +15 stem boost. - const is_tooling_path = pathHasSegment(r.path, "bench") or pathHasSegment(r.path, "benchmarks") or - pathHasSegment(r.path, "scripts") or pathHasSegment(r.path, "website") or - pathHasSegment(r.path, "install"); - if (is_tooling_path) score = @min(score, 2.0); - - if (self.outlines.get(r.path)) |outline| { + fn pathRerankFacts(self: *const Explorer, path: []const u8, query: []const u8) PathRerankFacts { + var facts: PathRerankFacts = .{}; + + // Symbol-definition facts from the outline: which lines start a + // symbol named exactly like the query (case-insensitive). Feeds the + // +5 definition-line boost and the co-change seed set (#550). + if (self.outlines.get(path)) |outline| { for (outline.symbols.items) |sym| { - if (sym.line_start == r.line_num and asciiEqlIgnoreCase(sym.name, query)) { - score += 5.0; - break; + if (!asciiEqlIgnoreCase(sym.name, query)) continue; + facts.defines = true; + if (facts.def_count < facts.def_lines.len) { + facts.def_lines[facts.def_count] = sym.line_start; + facts.def_count += 1; + } else { + facts.def_overflow = true; } } } - const basename = std.fs.path.basename(r.path); + // #598: mention-dense tooling files (a bench script repeating the term + // six times per line) saturate the per-line count and shrug off the + // ×0.5 path prior below. The occurrence BASE is capped for tooling + // paths before the stem/symbol boosts so density cannot dominate, + // while an eponymous lookup (query 'install' → install/install.sh) + // still wins through its +15 stem boost. + facts.is_tooling = pathHasSegment(path, "bench") or pathHasSegment(path, "benchmarks") or + pathHasSegment(path, "scripts") or pathHasSegment(path, "website") or + pathHasSegment(path, "install"); + + const basename = std.fs.path.basename(path); const stem_end = std.mem.indexOfScalar(u8, basename, '.') orelse basename.len; const stem = basename[0..stem_end]; const stem_contains_query = asciiContainsIgnoreCase(stem, query); const query_contains_stem = asciiContainsIgnoreCase(query, stem); const stem_related_to_query = stem_contains_query or query_contains_stem; if (asciiEqlIgnoreCase(stem, query)) { - score += 15.0; + facts.add_boost += 15.0; } else if (stem_related_to_query) { - score += 8.0; + facts.add_boost += 8.0; } // Path-segment match boost: query matches a directory segment in // the path (e.g. query="parser" boosts src/parser/foo.zig). Weaker // than basename match because the file's own name is a stronger // intent signal than the directory it lives in. Skip when basename // already matched to avoid double-counting. - if (!stem_related_to_query and pathHasSegmentIgnoreCase(r.path, query)) { - score += 6.0; + if (!stem_related_to_query and pathHasSegmentIgnoreCase(path, query)) { + facts.add_boost += 6.0; } // #580: match BM25's pathRelevanceMultiplier — test files identified by // BASENAME (tests.zig, test_*.zig, *_tests.zig) are tests even without // a test/ directory segment. - const is_test_file = pathHasSegment(r.path, "tests") or pathHasSegment(r.path, "test") or + facts.is_test = pathHasSegment(path, "tests") or pathHasSegment(path, "test") or std.mem.startsWith(u8, basename, "test") or std.mem.indexOf(u8, basename, "_test") != null; - if (is_test_file) score *= 0.6; - if (pathHasSegment(r.path, "examples") or pathHasSegment(r.path, "example")) score *= 0.6; - if (is_tooling_path) score *= 0.5; - if (pathHasSegment(r.path, "vendor") or pathHasSegment(r.path, "node_modules") or - pathHasSegment(r.path, "third_party")) score *= 0.4; - // Doc-language penalty: markdown / data files (CHANGELOG.md, design - // docs, benchmark logs) often mention an identifier many times in a - // single line, which lets per-line frequency dwarf code call sites. - // For doc files, more mentions don't reflect more code-relevance — - // they reflect prose density. Cap at 1.0 then halve so any code hit - // (score >= 1) outranks any doc hit. Symmetric with path-prior. - if (isDocLanguage(detectLanguage(r.path))) { - score = @min(score, 1.0) * 0.5; - } - - return score; + facts.is_example = pathHasSegment(path, "examples") or pathHasSegment(path, "example"); + facts.is_vendor = pathHasSegment(path, "vendor") or pathHasSegment(path, "node_modules") or + pathHasSegment(path, "third_party"); + facts.is_doc = isDocLanguage(detectLanguage(path)); + return facts; } /// Append one JSON line per searchContent invocation. v0 logger for the From 38325f0f755697c6cfc8af0aaed8567c39008f68 Mon Sep 17 00:00:00 2001 From: justrach <54503978+justrach@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:17:45 +0800 Subject: [PATCH 2/4] perf(search): direct-address doc slots, symbol-length masks, init-time path classification Tier 0 grouping (searchContent + renderPlainSearch) dedupes doc_ids through a direct-address slot array when the doc table is small (or the query heavy enough to amortize the memset), falling back to the previous map/rescan path otherwise; candidates now append straight into the array, dropping the map-to-array copy pass. The candidate order sorts plain u64 keys - entries land in first-seen order, so the array index doubles as the unique tiebreak and the post-sort address. FileOutline gains name_len_mask (bitmask of symbol-name lengths, a conservative superset maintained at all three append sites including snapshot load) so fileDefinesSymbol / pathRerankFacts skip whole symbol scans when no name can match the query length, and path_class (query-independent #598/#580 tooling/test/example/vendor priors) computed once at init instead of ~10 path tokenizations per unique path per rerank. codedb repo, 300-500 iters, c_allocator: middleware 12.4us, database 13us, error 27us. 814/814 tests, e2e MCP 20/20. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- src/explore.zig | 287 +++++++++++++++++++++++++++++++---------------- src/snapshot.zig | 1 + 2 files changed, 190 insertions(+), 98 deletions(-) diff --git a/src/explore.zig b/src/explore.zig index 9046bbdc..f74948cd 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -55,6 +55,41 @@ pub const Symbol = struct { detail: ?[]const u8 = null, }; +/// Query-independent path-prior flags shared by the rerankers — a pure +/// function of the path, see classifyPath. +pub const PathClass = struct { + is_tooling: bool = false, + is_test: bool = false, + is_example: bool = false, + is_vendor: bool = false, +}; + +/// Classify `path` for the rerank path priors. Pure — the result is cached +/// on FileOutline at init; callers without an outline compute it directly. +pub fn classifyPath(path: []const u8) PathClass { + var c: PathClass = .{}; + // One tokenize pass over the path instead of one per segment keyword. + var iter = std.mem.tokenizeAny(u8, path, "/\\"); + while (iter.next()) |seg| { + // #598: mention-dense tooling files (a bench script repeating the + // term six times per line) saturate the per-line count and shrug off + // the ×0.5 path prior — the occurrence BASE is capped for tooling. + if (std.mem.eql(u8, seg, "bench") or std.mem.eql(u8, seg, "benchmarks") or + std.mem.eql(u8, seg, "scripts") or std.mem.eql(u8, seg, "website") or + std.mem.eql(u8, seg, "install")) c.is_tooling = true; + if (std.mem.eql(u8, seg, "tests") or std.mem.eql(u8, seg, "test")) c.is_test = true; + if (std.mem.eql(u8, seg, "examples") or std.mem.eql(u8, seg, "example")) c.is_example = true; + if (std.mem.eql(u8, seg, "vendor") or std.mem.eql(u8, seg, "node_modules") or + std.mem.eql(u8, seg, "third_party")) c.is_vendor = true; + } + // #580: match BM25's pathRelevanceMultiplier — test files identified by + // BASENAME (tests.zig, test_*.zig, *_tests.zig) are tests even without + // a test/ directory segment. + const basename = std.fs.path.basename(path); + if (std.mem.startsWith(u8, basename, "test") or std.mem.indexOf(u8, basename, "_test") != null) c.is_test = true; + return c; +} + pub const FileOutline = struct { path: []const u8, language: Language, @@ -69,6 +104,20 @@ pub const FileOutline = struct { /// section, retained by the Explorer) rather than individual allocations, /// so deinit must not free them. The ArrayLists themselves are still owned. borrows_strings: bool = false, + /// Bitmask of symbol-name lengths present (bit min(len, 63)). A + /// conservative superset — never cleared on symbol removal — that lets + /// per-query definition scans (fileDefinesSymbol, pathRerankFacts) skip + /// the whole symbol list when no name could match the query's length. + /// Every site that appends to `symbols` must OR in the new name's bit. + name_len_mask: u64 = 0, + /// Query-independent path-prior classification (issue #429 signals), + /// computed once at init — pure function of `path`, so reranks can read + /// it instead of re-tokenizing the path on every query. + path_class: PathClass = .{}, + + pub fn nameLenBit(len: usize) u64 { + return @as(u64, 1) << @as(u6, @intCast(@min(len, 63))); + } pub fn init(allocator: std.mem.Allocator, path: []const u8) FileOutline { return .{ @@ -77,6 +126,7 @@ pub const FileOutline = struct { .line_count = 0, .byte_size = 0, .allocator = allocator, + .path_class = classifyPath(path), }; } pub fn deinit(self: *FileOutline) void { @@ -2028,6 +2078,7 @@ pub const Explorer = struct { .line_end = sym.line_end, .detail = copied_detail, }); + dst.name_len_mask |= FileOutline.nameLenBit(copied_name.len); } for (src.imports.items) |imp| { const copied_import = try allocator.dupe(u8, imp); @@ -2671,72 +2722,97 @@ pub const Explorer = struct { defines: bool, }; - // Keyed by doc_id, not path: with the contiguity fast path below - // the map sees one getOrPut per UNIQUE file, and a u32 hash is - // several times cheaper than re-hashing a ~40-byte path string. - // hitPath only runs on first sight of a doc_id. Invalid postings - // (hitPath == "") stay in the map as empty-path tombstones and - // are skipped when the candidate list is built. - var tier0_files_by_doc = std.AutoHashMap(u32, Tier0File).init(allocator); - defer tier0_files_by_doc.deinit(); - // Pre-size for the unique-file count so high-frequency words - // (hundreds of files) don't pay a rehash cascade while inserting. - tier0_files_by_doc.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {}; + // Candidates append straight into this array; deduplication maps + // doc_id → array index. Small doc tables get a direct-address + // slot array (one indexed load per lookup, no hashing at all); + // big tables — or queries too small to amortize the memset — + // fall back to a u32-keyed hash map. SLOT_NONE = unseen, + // SLOT_INVALID = doc checked and skipped (freed doc_id slot). + var tier0_files: std.ArrayList(Tier0File) = .empty; + defer tier0_files.deinit(allocator); + try tier0_files.ensureTotalCapacity(allocator, @min(word_hits.len, 1024)); + + const SLOT_NONE = std.math.maxInt(u32); + const SLOT_INVALID = SLOT_NONE - 1; + const ndocs = self.word_index.id_to_path.items.len; + const use_slots = ndocs > 0 and (ndocs <= 4096 or (ndocs <= 65536 and word_hits.len >= 512)); + var slots: []u32 = &.{}; + defer if (slots.len > 0) allocator.free(slots); + var idx_by_doc = std.AutoHashMap(u32, u32).init(allocator); + defer idx_by_doc.deinit(); + if (use_slots) { + slots = try allocator.alloc(u32, ndocs); + @memset(slots, SLOT_NONE); + } else { + idx_by_doc.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {}; + } // Postings for one file are appended contiguously (indexFile // processes whole files), so consecutive hits almost always share - // a doc_id — the cached-entry fast path turns one hash getOrPut - // PER HIT (~30µs on 1800-hit words) into one per unique file. - // last_entry is only dereferenced immediately after it was - // (re)set with no intervening map mutation, so it cannot dangle - // across a rehash. + // a doc_id — the cached-index fast path resolves them without + // touching the slot table at all. Indices (not pointers) into + // tier0_files stay valid across array growth. var last_doc_id: u32 = 0; - var last_entry: ?*Tier0File = null; + var last_cur: u32 = SLOT_NONE; for (word_hits, 0..) |hit, ordinal| { - if (last_entry) |entry| { - if (hit.doc_id == last_doc_id) { - entry.count +|= 1; - entry.hits_end = ordinal + 1; - continue; + if (last_cur != SLOT_NONE and hit.doc_id == last_doc_id) { + if (last_cur != SLOT_INVALID) { + const e = &tier0_files.items[last_cur]; + e.count +|= 1; + e.hits_end = ordinal + 1; } + continue; } - const gop = tier0_files_by_doc.getOrPut(hit.doc_id) catch continue; - if (!gop.found_existing) { + var cur: u32 = blk: { + if (use_slots) { + if (hit.doc_id >= ndocs) break :blk SLOT_INVALID; + break :blk slots[hit.doc_id]; + } + break :blk idx_by_doc.get(hit.doc_id) orelse SLOT_NONE; + }; + if (cur == SLOT_NONE) { const hit_path = self.word_index.hitPath(hit); - const is_doc = hit_path.len > 0 and isDocLanguage(detectLanguage(hit_path)); - const defines = hit_path.len > 0 and !is_doc and self.fileDefinesSymbol(hit_path, query); - gop.value_ptr.* = .{ - .path = hit_path, - .doc_id = hit.doc_id, - .count = 0, - .first_seen = ordinal, - .hits_end = ordinal + 1, - .is_doc = is_doc, - .defines = defines, - }; + if (hit_path.len == 0) { + cur = SLOT_INVALID; + } else { + const is_doc = isDocLanguage(detectLanguage(hit_path)); + const defines = !is_doc and self.fileDefinesSymbol(hit_path, query); + cur = @intCast(tier0_files.items.len); + tier0_files.append(allocator, .{ + .path = hit_path, + .doc_id = hit.doc_id, + .count = 0, + .first_seen = ordinal, + .hits_end = ordinal + 1, + .is_doc = is_doc, + .defines = defines, + }) catch { + cur = SLOT_INVALID; + }; + } + if (use_slots) { + if (hit.doc_id < ndocs) slots[hit.doc_id] = cur; + } else { + idx_by_doc.put(hit.doc_id, cur) catch {}; + } + } + if (cur != SLOT_INVALID) { + const e = &tier0_files.items[cur]; + e.count +|= 1; + e.hits_end = ordinal + 1; } - gop.value_ptr.count +|= 1; - gop.value_ptr.hits_end = ordinal + 1; last_doc_id = hit.doc_id; - last_entry = gop.value_ptr; + last_cur = cur; } - var tier0_files: std.ArrayList(Tier0File) = .empty; - defer tier0_files.deinit(allocator); - try tier0_files.ensureTotalCapacity(allocator, tier0_files_by_doc.count()); - var tier0_iter = tier0_files_by_doc.valueIterator(); - while (tier0_iter.next()) |stats| { - if (stats.path.len == 0) continue; - tier0_files.appendAssumeCapacity(stats.*); - } - - // Sort 12-byte (key, index) pairs instead of the 48-byte structs. - // The old comparator (is_doc asc, defines desc, count desc, - // first_seen asc, path asc) packs losslessly into one u64: - // first_seen is the ordinal of a file's first hit, unique per - // file, so the path tiebreak was unreachable. count saturates at + + // Sort plain u64 keys instead of the 48-byte structs. The old + // comparator (is_doc asc, defines desc, count desc, first_seen + // asc, path asc) packs losslessly: entries were appended in + // first-seen order, so the array index doubles as the first_seen + // tiebreak (unique per file — the path tiebreak was unreachable) + // AND addresses the entry after sorting. count saturates at // 2^30−1; beyond that ties fall to first_seen, same as before. - const Tier0Order = struct { key: u64, idx: u32 }; - var tier0_order: std.ArrayList(Tier0Order) = .empty; + var tier0_order: std.ArrayList(u64) = .empty; defer tier0_order.deinit(allocator); try tier0_order.ensureTotalCapacity(allocator, tier0_files.items.len); for (tier0_files.items, 0..) |stats, i| { @@ -2744,25 +2820,21 @@ pub const Explorer = struct { const key = (@as(u64, @intFromBool(stats.is_doc)) << 63) | (@as(u64, @intFromBool(!stats.defines)) << 62) | ((((1 << 30) - 1) - cnt) << 32) | - @as(u64, @as(u32, @truncate(stats.first_seen))); - tier0_order.appendAssumeCapacity(.{ .key = key, .idx = @intCast(i) }); + @as(u64, @as(u32, @intCast(i))); + tier0_order.appendAssumeCapacity(key); } if (tier0_order.items.len > 1) { - std.sort.pdq(Tier0Order, tier0_order.items, {}, struct { - pub fn lessThan(_: void, a: Tier0Order, b: Tier0Order) bool { - return a.key < b.key; - } - }.lessThan); + std.sort.pdq(u64, tier0_order.items, {}, std.sort.asc(u64)); } const tier0_per_file_cap: usize = if (tier0_files.items.len <= 1) max_results else @max(1, max_results / 5); var tier0_exact_capacity: usize = 0; - for (tier0_order.items) |ord| { - tier0_exact_capacity += @min(@as(usize, tier0_files.items[ord.idx].count), tier0_per_file_cap); + for (tier0_order.items) |key| { + tier0_exact_capacity += @min(@as(usize, tier0_files.items[@as(u32, @truncate(key))].count), tier0_per_file_cap); if (tier0_exact_capacity >= max_results) break; } const use_line_hits = tier0_exact_capacity >= max_results and tier0_per_file_cap <= 256; - for (tier0_order.items) |ord| { - const stats = tier0_files.items[ord.idx]; + for (tier0_order.items) |key| { + const stats = tier0_files.items[@as(u32, @truncate(key))]; if (result_list.items.len >= max_results) break; const ref = self.readContentForSearch(stats.path, allocator) orelse continue; defer ref.deinit(); @@ -3016,11 +3088,24 @@ pub const Explorer = struct { var tier0_files_buf: [512]Tier0File = undefined; var tier0_files_len: usize = 0; + // Direct-address doc_id → entry-index slots replace the linear rescan + // (O(unique files²) on high-frequency words). Same gating as + // searchContent's grouping; when the table is too big to amortize the + // memset, the rescan path below still handles dedup. + const SLOT_NONE = std.math.maxInt(u32); + const ndocs = self.word_index.id_to_path.items.len; + const use_slots = ndocs > 0 and (ndocs <= 4096 or (ndocs <= 65536 and word_hits.len >= 512)); + var slots: []u32 = &.{}; + defer if (slots.len > 0) allocator.free(slots); + if (use_slots) { + slots = allocator.alloc(u32, ndocs) catch &.{}; + if (slots.len > 0) @memset(slots, SLOT_NONE); + } for (word_hits, 0..) |hit, ordinal| { // Postings for one file are appended contiguously (indexFile // processes whole files), so consecutive hits almost always share - // a doc_id — checking the newest entry first turns the O(hits × - // files) linear rescan below into one pass per unique file. + // a doc_id — checking the newest entry first resolves them + // without touching the slots or the rescan at all. if (tier0_files_len > 0 and tier0_files_buf[tier0_files_len - 1].doc_id == hit.doc_id) { tier0_files_buf[tier0_files_len - 1].count +|= 1; tier0_files_buf[tier0_files_len - 1].hits_end = ordinal + 1; @@ -3030,10 +3115,14 @@ pub const Explorer = struct { if (hit_path.len == 0) continue; var found_i: ?usize = null; - for (tier0_files_buf[0..tier0_files_len], 0..) |stats, i| { - if (stats.doc_id == hit.doc_id) { - found_i = i; - break; + if (slots.len > 0) { + if (hit.doc_id < slots.len and slots[hit.doc_id] != SLOT_NONE) found_i = slots[hit.doc_id]; + } else { + for (tier0_files_buf[0..tier0_files_len], 0..) |stats, i| { + if (stats.doc_id == hit.doc_id) { + found_i = i; + break; + } } } if (found_i) |i| { @@ -3041,6 +3130,7 @@ pub const Explorer = struct { tier0_files_buf[i].hits_end = ordinal + 1; } else { if (tier0_files_len >= tier0_files_buf.len) return false; + if (slots.len > 0 and hit.doc_id < slots.len) slots[hit.doc_id] = @intCast(tier0_files_len); tier0_files_buf[tier0_files_len] = .{ .doc_id = hit.doc_id, .path = hit_path, @@ -3389,28 +3479,36 @@ pub const Explorer = struct { // Symbol-definition facts from the outline: which lines start a // symbol named exactly like the query (case-insensitive). Feeds the // +5 definition-line boost and the co-change seed set (#550). + // Path-prior classification (#598 tooling cap, #580 test detection, + // example/vendor demotion) is query-independent — read it from the + // outline, where it was computed once at init; only paths with no + // outline (not indexed) classify on the fly. + var class: PathClass = .{}; + var is_doc_lang: bool = undefined; if (self.outlines.get(path)) |outline| { - for (outline.symbols.items) |sym| { - if (!asciiEqlIgnoreCase(sym.name, query)) continue; - facts.defines = true; - if (facts.def_count < facts.def_lines.len) { - facts.def_lines[facts.def_count] = sym.line_start; - facts.def_count += 1; - } else { - facts.def_overflow = true; + class = outline.path_class; + is_doc_lang = isDocLanguage(outline.language); + if (outline.name_len_mask & FileOutline.nameLenBit(query.len) != 0) { + for (outline.symbols.items) |sym| { + if (!asciiEqlIgnoreCase(sym.name, query)) continue; + facts.defines = true; + if (facts.def_count < facts.def_lines.len) { + facts.def_lines[facts.def_count] = sym.line_start; + facts.def_count += 1; + } else { + facts.def_overflow = true; + } } } + } else { + class = classifyPath(path); + is_doc_lang = isDocLanguage(detectLanguage(path)); } - - // #598: mention-dense tooling files (a bench script repeating the term - // six times per line) saturate the per-line count and shrug off the - // ×0.5 path prior below. The occurrence BASE is capped for tooling - // paths before the stem/symbol boosts so density cannot dominate, - // while an eponymous lookup (query 'install' → install/install.sh) - // still wins through its +15 stem boost. - facts.is_tooling = pathHasSegment(path, "bench") or pathHasSegment(path, "benchmarks") or - pathHasSegment(path, "scripts") or pathHasSegment(path, "website") or - pathHasSegment(path, "install"); + facts.is_tooling = class.is_tooling; + facts.is_test = class.is_test; + facts.is_example = class.is_example; + facts.is_vendor = class.is_vendor; + facts.is_doc = is_doc_lang; const basename = std.fs.path.basename(path); const stem_end = std.mem.indexOfScalar(u8, basename, '.') orelse basename.len; @@ -3432,15 +3530,6 @@ pub const Explorer = struct { facts.add_boost += 6.0; } - // #580: match BM25's pathRelevanceMultiplier — test files identified by - // BASENAME (tests.zig, test_*.zig, *_tests.zig) are tests even without - // a test/ directory segment. - facts.is_test = pathHasSegment(path, "tests") or pathHasSegment(path, "test") or - std.mem.startsWith(u8, basename, "test") or std.mem.indexOf(u8, basename, "_test") != null; - facts.is_example = pathHasSegment(path, "examples") or pathHasSegment(path, "example"); - facts.is_vendor = pathHasSegment(path, "vendor") or pathHasSegment(path, "node_modules") or - pathHasSegment(path, "third_party"); - facts.is_doc = isDocLanguage(detectLanguage(path)); return facts; } @@ -3661,6 +3750,7 @@ pub const Explorer = struct { /// where symbol_index is deferred (#564). fn fileDefinesSymbol(self: *const Explorer, path: []const u8, name: []const u8) bool { const outline = self.outlines.get(path) orelse return false; + if (outline.name_len_mask & FileOutline.nameLenBit(name.len) == 0) return false; for (outline.symbols.items) |sym| { if (asciiEqlIgnoreCase(sym.name, name)) return true; } @@ -6567,6 +6657,7 @@ fn appendOutlineSymbol( .line_end = line_num, .detail = detail_copy, }); + outline.name_len_mask |= FileOutline.nameLenBit(name_copy.len); } inline fn resIsIdentStart(c: u8) bool { diff --git a/src/snapshot.zig b/src/snapshot.zig index 40d93338..eb45782f 100644 --- a/src/snapshot.zig +++ b/src/snapshot.zig @@ -754,6 +754,7 @@ fn loadOutlineStateMap(io: std.Io, snapshot_path: []const u8, allocator: std.mem .line_end = line_end, .detail = detail, }); + outline.name_len_mask |= explore_mod.FileOutline.nameLenBit(name.len); } try result.put(path, outline); From 3332a8103eb9f91726e3d68244c793910c35007f Mon Sep 17 00:00:00 2001 From: justrach <54503978+justrach@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:46:15 +0800 Subject: [PATCH 3/4] perf(search): rare-byte scan anchors, keyed final sort, pointer facts memoization Sampling profile (10s @ 200k iters) showed the rerank final sort, the per-result ~120-byte facts copy, and case-insensitive content scanning as the remaining hot spots. - indexOfCaseInsensitive and searchInContent's SIMD loop now anchor on the needle's RAREST byte (static code-frequency table) instead of byte 0, so common-first-letter words (authentication, error) stop verifying at every 'a'/'e'; searchInContent also widens to 32-byte vectors. Anchor choice never affects which matches are found, only the candidate rate. - The rerank final sort compares one precomputed u64 key per result (score as order-isomorphic descending bits, path as its lexicographic rank among unique result paths, line_num as in-comparator tiebreak) over u32 indices, then applies the permutation in one scratch pass - no string compares or 40-byte struct moves inside the sort. NaN and -0.0 collapse exactly like the float comparator it replaces. - The score loop holds a pointer to the memoized facts instead of copying the struct per result; the lfp file-hit tally gets the same consecutive-path fast path as the other per-result maps. codedb repo, c_allocator, min of 6 runs under load: authentication 31us (was 50), database 8.4us (was 13), middleware 11.8us, error 27us. 814/814 tests, e2e MCP 20/20. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- src/explore.zig | 273 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 194 insertions(+), 79 deletions(-) diff --git a/src/explore.zig b/src/explore.zig index f74948cd..30b4b3fa 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -3321,10 +3321,25 @@ pub const Explorer = struct { defer file_hit_counts.deinit(); var max_file_hits: u32 = 0; if (lfp.enabled) { + // Results arrive grouped by file, so the consecutive-path fast + // path turns one string getOrPut per result into one per unique + // file. The cached pointer is only dereferenced immediately after + // being (re)set, so it cannot dangle across a rehash. + var last_path: []const u8 = ""; + var last_count: ?*u32 = null; for (result_list.items) |r| { + if (last_count) |cnt| { + if (std.mem.eql(u8, r.path, last_path)) { + cnt.* += 1; + if (cnt.* > max_file_hits) max_file_hits = cnt.*; + continue; + } + } const gop = try file_hit_counts.getOrPut(r.path); gop.value_ptr.* = if (gop.found_existing) gop.value_ptr.* + 1 else 1; if (gop.value_ptr.* > max_file_hits) max_file_hits = gop.value_ptr.*; + last_path = r.path; + last_count = gop.value_ptr; } } // #550: a single-token query that exactly names a known symbol gets the @@ -3388,14 +3403,15 @@ pub const Explorer = struct { if (sp.enabled) entry.value_ptr.sp_mult = sp.multiplier(self, path); } - // Same consecutive-path memoization as the facts pass: the facts are - // copied by VALUE, so later map lookups can never be invalidated (the - // map is no longer mutated here anyway). + // Same consecutive-path memoization as the facts pass. Holding a + // pointer (instead of copying the ~120-byte facts struct per result) + // is safe: the map is not mutated anywhere in this loop. + const no_facts = PathRerankFacts{}; var score_last_path: []const u8 = ""; - var score_last_facts: PathRerankFacts = .{}; + var score_last_facts: *const PathRerankFacts = &no_facts; for (result_list.items) |*r| { if (score_last_path.len == 0 or !std.mem.eql(u8, r.path, score_last_path)) { - score_last_facts = facts_by_path.get(r.path) orelse PathRerankFacts{}; + score_last_facts = facts_by_path.getPtr(r.path) orelse &no_facts; score_last_path = r.path; } const facts = score_last_facts; @@ -3434,19 +3450,55 @@ pub const Explorer = struct { r.score = score; } if (result_list.items.len > 1) { - // pdq, not block: (score, path, line_num) is a total order, so an - // unstable sort yields the identical permutation while moving the - // fat SearchResult structs far less. - std.sort.pdq(SearchResult, result_list.items, {}, struct { - pub fn lessThan(_: void, a: SearchResult, b: SearchResult) bool { - const sa = if (a.score == a.score) a.score else 0; - const sb = if (b.score == b.score) b.score else 0; - if (sa != sb) return sa > sb; - const ord = std.mem.order(u8, a.path, b.path); - if (ord != .eq) return ord == .lt; - return a.line_num < b.line_num; + // The (score desc, path asc, line asc) order sorts via one + // precomputed u64 key per result — score as order-isomorphic + // descending bits, path as its lexicographic rank among the + // unique result paths — with line_num as the in-comparator + // tiebreak. No string compares or 40-byte struct moves inside + // the sort loop; the permutation applies in one scratch pass. + var unique_paths: std.ArrayList([]const u8) = .empty; + defer unique_paths.deinit(allocator); + try unique_paths.ensureTotalCapacity(allocator, facts_by_path.count()); + var path_iter = facts_by_path.keyIterator(); + while (path_iter.next()) |k| unique_paths.appendAssumeCapacity(k.*); + std.sort.pdq([]const u8, unique_paths.items, {}, struct { + pub fn lessThan(_: void, a: []const u8, b: []const u8) bool { + return std.mem.lessThan(u8, a, b); } }.lessThan); + for (unique_paths.items, 0..) |p, rank| { + if (facts_by_path.getPtr(p)) |f| f.path_rank = @intCast(rank); + } + + const keys = try allocator.alloc(u64, result_list.items.len); + defer allocator.free(keys); + const order = try allocator.alloc(u32, result_list.items.len); + defer allocator.free(order); + { + var lp: []const u8 = ""; + var lrank: u32 = std.math.maxInt(u32); + for (result_list.items, 0..) |r, ri| { + if (lp.len == 0 or !std.mem.eql(u8, r.path, lp)) { + lrank = if (facts_by_path.getPtr(r.path)) |f| f.path_rank else std.math.maxInt(u32); + lp = r.path; + } + keys[ri] = (@as(u64, scoreDescBits(r.score)) << 32) | lrank; + order[ri] = @intCast(ri); + } + } + const SortCtx = struct { + keys: []const u64, + items: []const SearchResult, + pub fn lessThan(ctx: @This(), a: u32, b: u32) bool { + if (ctx.keys[a] != ctx.keys[b]) return ctx.keys[a] < ctx.keys[b]; + return ctx.items[a].line_num < ctx.items[b].line_num; + } + }; + std.sort.pdq(u32, order, SortCtx{ .keys = keys, .items = result_list.items }, SortCtx.lessThan); + const scratch = try allocator.alloc(SearchResult, result_list.items.len); + defer allocator.free(scratch); + @memcpy(scratch, result_list.items); + for (order, 0..) |src, dst| result_list.items[dst] = scratch[src]; } self.appendRerankTrace(query, result_list.items); return result_list.toOwnedSlice(allocator); @@ -3471,8 +3523,24 @@ pub const Explorer = struct { cc: f32 = 1, lfp_mult: f32 = 1, sp_mult: f32 = 1, + /// Lexicographic rank of this path among the result set's unique + /// paths — assigned by rerankAndFinalize just before the final sort + /// so the sort key replaces per-comparison string compares. + path_rank: u32 = 0, }; + /// Map a score to bits whose UNSIGNED ascending order equals descending + /// float order — the standard sign-flip trick, with NaN collapsed to 0 + /// and -0.0 to +0.0 so it ties exactly like the float comparator it + /// replaces (`if (sa != sb) return sa > sb` with NaN already mapped). + fn scoreDescBits(score: f32) u32 { + var v: f32 = if (score == score) score else 0; + if (v == 0) v = 0; + const b: u32 = @bitCast(v); + const asc: u32 = if (b & 0x8000_0000 != 0) ~b else b | 0x8000_0000; + return ~asc; + } + fn pathRerankFacts(self: *const Explorer, path: []const u8, query: []const u8) PathRerankFacts { var facts: PathRerankFacts = .{}; @@ -6334,53 +6402,68 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all query_lower_buf[i] = if (c >= 'A' and c <= 'Z') c + 32 else c; } const query_lower = query_lower_buf[0..query.len]; - const first_lower: u8 = query_lower[0]; - const first_upper: u8 = if (first_lower >= 'a' and first_lower <= 'z') first_lower - 32 else first_lower; + // Anchor the scan on the needle's RAREST byte (see code_char_freq) — for + // common-first-letter words this cuts verify calls by an order of + // magnitude versus always anchoring on byte 0. A match starting at s has + // its anchor at s + anchor, so candidate positions live in + // [anchor, end + anchor). + var anchor: usize = 0; + var anchor_rarity: u8 = std.math.maxInt(u8); + for (query_lower, 0..) |c, j| { + if (code_char_freq[c] < anchor_rarity) { + anchor_rarity = code_char_freq[c]; + anchor = j; + } + } + const anchor_lower: u8 = query_lower[anchor]; + const anchor_upper: u8 = if (anchor_lower >= 'a' and anchor_lower <= 'z') anchor_lower - 32 else anchor_lower; var file_hits: usize = 0; - var pos: usize = 0; const end = content.len - query.len + 1; + const scan_end = end + anchor; + var pos: usize = anchor; // Track line number incrementally. var current_line: u32 = 1; var current_line_start: usize = 0; - // SIMD constants — 16-byte NEON/SSE vectors. - const VW = 16; + // SIMD constants — 32-byte vectors (2x NEON / 1x AVX2 per compare). + const VW = 32; const Vec = @Vector(VW, u8); - const splat_lo: Vec = @splat(first_lower); - const splat_hi: Vec = @splat(first_upper); + const splat_lo: Vec = @splat(anchor_lower); + const splat_hi: Vec = @splat(anchor_upper); - scan: while (pos < end) { - // ── SIMD path: process full 16-byte chunks ── - if (pos + VW <= end) { + scan: while (pos < scan_end) { + // ── SIMD path: process full chunks ── + if (pos + VW <= scan_end) { const chunk: Vec = content[pos..][0..VW].*; const eq_lo: @Vector(VW, u1) = @bitCast(chunk == splat_lo); const eq_hi: @Vector(VW, u1) = @bitCast(chunk == splat_hi); - var mask: u16 = @bitCast(eq_lo | eq_hi); + var mask: u32 = @bitCast(eq_lo | eq_hi); if (mask == 0) { pos += VW; continue; } - // Process ALL first-byte candidates in this chunk without reloading. + // Process ALL anchor candidates in this chunk without reloading. while (mask != 0) { const offset: usize = @ctz(mask); const cand = pos + offset; - if (cand >= end) break; + if (cand >= scan_end) break; + const start = cand - anchor; - if (matchAtCaseInsensitive(content, cand, query_lower)) { + if (matchAtCaseInsensitive(content, start, query_lower)) { // ── Match found ── - while (current_line_start < cand) { + while (current_line_start < start) { if (simdIndexOfNewline(content, current_line_start)) |nl| { - if (nl < cand) { + if (nl < start) { current_line += 1; current_line_start = nl + 1; } else break; } else break; } const line_start = current_line_start; - const line_end = simdIndexOfNewline(content, cand) orelse content.len; + const line_end = simdIndexOfNewline(content, start) orelse content.len; const line_text = try allocator.dupe(u8, content[line_start..line_end]); errdefer allocator.free(line_text); @@ -6392,8 +6475,10 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all current_line += 1; current_line_start = line_end + 1; - pos = line_end + 1; - if (pos >= end) return; + // One result per line: the next match must START after + // the line, so its anchor sits at least `anchor` later. + pos = line_end + 1 + anchor; + if (pos >= scan_end) return; continue :scan; } mask &= mask - 1; // clear lowest bit, try next candidate in chunk @@ -6402,32 +6487,35 @@ fn searchInContent(path: []const u8, content: []const u8, query: []const u8, all continue; } - // ── Scalar tail for last <16 bytes ── + // ── Scalar tail for the last = max_per_file or result_list.items.len >= max_results) return; + const line_text = try allocator.dupe(u8, content[line_start..line_end]); + errdefer allocator.free(line_text); + const path_copy = try allocator.dupe(u8, path); + errdefer allocator.free(path_copy); + try result_list.append(allocator, .{ .path = path_copy, .line_num = current_line, .line_text = line_text }); + file_hits += 1; + if (file_hits >= max_per_file or result_list.items.len >= max_results) return; - current_line += 1; - current_line_start = line_end + 1; - pos = line_end + 1; - continue; + current_line += 1; + current_line_start = line_end + 1; + pos = line_end + 1 + anchor; + continue; + } } pos += 1; } @@ -6512,43 +6600,70 @@ pub fn regexMatch(haystack: []const u8, pattern: []const u8) bool { return false; } +/// Rough frequency of each lowercase byte in source code, used only to pick +/// the SIMD anchor inside indexOfCaseInsensitive — lower is rarer. Anchor +/// choice never affects correctness, only how often candidates verify. +const code_char_freq: [256]u8 = blk: { + var t = [_]u8{3} ** 256; + const ranks = "zqjxkvbywgpfmucdlhrsnioate"; + for (ranks, 0..) |c, i| t[c] = @intCast(i + 4); + for ('0'..'9' + 1) |c| t[c] = 5; + t['_'] = 20; + t['.'] = 14; + break :blk t; +}; + fn indexOfCaseInsensitive(haystack: []const u8, needle: []const u8) ?usize { if (needle.len == 0) return 0; if (needle.len > haystack.len) return null; - // Pre-compute lowered first byte + second byte for fast skip. - const first_lower: u8 = if (needle[0] >= 'A' and needle[0] <= 'Z') needle[0] + 32 else needle[0]; - const first_upper: u8 = if (needle[0] >= 'a' and needle[0] <= 'z') needle[0] - 32 else needle[0]; const end = haystack.len - needle.len + 1; if (needle.len == 1) { - // Single-char: use std.mem.indexOfAny for speed. - const chars = [2]u8{ first_lower, first_upper }; + const c = needle[0]; + const lower: u8 = if (c >= 'A' and c <= 'Z') c + 32 else c; + const upper: u8 = if (lower >= 'a' and lower <= 'z') lower - 32 else lower; + const chars = [2]u8{ lower, upper }; return std.mem.indexOfAny(u8, haystack, &chars); } - const second_lower: u8 = if (needle[1] >= 'A' and needle[1] <= 'Z') needle[1] + 32 else needle[1]; - - var i: usize = 0; - while (i < end) : (i += 1) { - // Fast reject: check first byte, then second byte before full compare. - const c0 = haystack[i]; - if (c0 != first_lower and c0 != first_upper) continue; - const c1 = haystack[i + 1]; - const c1_lower = if (c1 >= 'A' and c1 <= 'Z') c1 + 32 else c1; - if (c1_lower != second_lower) continue; - - // First two bytes match — verify the rest. + // Jump between candidates of the needle's RAREST byte with the + // vectorized indexOfAnyPos instead of walking byte-by-byte — content + // scans (searchInContent, Tier 1 candidate verification) spend most of + // their time here, and anchoring on a rare letter (a 'k' or 'x') rather + // than position 0 keeps the verify rate low for common-first-letter words. + var anchor: usize = 0; + var anchor_freq: u8 = std.math.maxInt(u8); + for (needle, 0..) |c, j| { + const cl: u8 = if (c >= 'A' and c <= 'Z') c + 32 else c; + if (code_char_freq[cl] < anchor_freq) { + anchor_freq = code_char_freq[cl]; + anchor = j; + } + } + const ac = needle[anchor]; + const anchor_lower: u8 = if (ac >= 'A' and ac <= 'Z') ac + 32 else ac; + const anchor_upper: u8 = if (anchor_lower >= 'a' and anchor_lower <= 'z') anchor_lower - 32 else anchor_lower; + const anchor_chars = [2]u8{ anchor_lower, anchor_upper }; + + // A match starting at s puts the anchor at s + anchor, so anchor + // candidates live in [anchor, end + anchor). + const scan = haystack[0 .. end - 1 + anchor + 1]; + var i: usize = anchor; + while (std.mem.indexOfAnyPos(u8, scan, i, &anchor_chars)) |pos| { + i = pos + 1; + const start = pos - anchor; var match = true; - for (2..needle.len) |j| { - const hc = if (haystack[i + j] >= 'A' and haystack[i + j] <= 'Z') haystack[i + j] + 32 else haystack[i + j]; - const nc = if (needle[j] >= 'A' and needle[j] <= 'Z') needle[j] + 32 else needle[j]; - if (hc != nc) { + for (needle, 0..) |nc0, j| { + const hc = haystack[start + j]; + const hl: u8 = if (hc >= 'A' and hc <= 'Z') hc + 32 else hc; + const nl: u8 = if (nc0 >= 'A' and nc0 <= 'Z') nc0 + 32 else nc0; + if (hl != nl) { match = false; break; } } - if (match) return i; + if (match) return start; } return null; } From 6b17e4ee5edc6f482a9a2932fcb92fc17836831f Mon Sep 17 00:00:00 2001 From: justrach <54503978+justrach@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:55:38 +0800 Subject: [PATCH 4/4] perf(search): run-at-a-time posting grouping, single outline fetch per candidate The Tier 0 hit list decomposes into contiguous runs of one doc_id each, so all three grouping passes (searchContent, renderPlainSearch, Tier 1's hits_per_file tally) now scan to each run boundary first and touch the slot table / hash map / entry ONCE per run instead of once per hit - the per-hit work drops to a doc_id compare. searchContent's candidate metadata also collapses to a single outlines.get per unique file: language comes from the outline (computed at init via the same detectLanguage) and the defines scan runs inline behind the name_len_mask gate, where the old shape hashed the path twice (detectLanguage + fileDefinesSymbol's own lookup). codedb repo, c_allocator, min of 5 runs: error 19.6us (was 27), middleware 10.2us, database 7.4us, authentication 28.7us, webhook 16.6us. 814/814 tests, e2e MCP 20/20. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- src/explore.zig | 145 ++++++++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 67 deletions(-) diff --git a/src/explore.zig b/src/explore.zig index 30b4b3fa..ee1226b5 100644 --- a/src/explore.zig +++ b/src/explore.zig @@ -2748,42 +2748,59 @@ pub const Explorer = struct { } // Postings for one file are appended contiguously (indexFile - // processes whole files), so consecutive hits almost always share - // a doc_id — the cached-index fast path resolves them without - // touching the slot table at all. Indices (not pointers) into + // processes whole files), so the hit list decomposes into runs of + // one doc_id each — group run-at-a-time: scan to the run boundary + // first, then touch the slot table and the entry ONCE per run + // instead of once per hit. Indices (not pointers) into // tier0_files stay valid across array growth. - var last_doc_id: u32 = 0; - var last_cur: u32 = SLOT_NONE; - for (word_hits, 0..) |hit, ordinal| { - if (last_cur != SLOT_NONE and hit.doc_id == last_doc_id) { - if (last_cur != SLOT_INVALID) { - const e = &tier0_files.items[last_cur]; - e.count +|= 1; - e.hits_end = ordinal + 1; - } - continue; - } + var run_start: usize = 0; + while (run_start < word_hits.len) { + const doc_id = word_hits[run_start].doc_id; + var run_end = run_start + 1; + while (run_end < word_hits.len and word_hits[run_end].doc_id == doc_id) run_end += 1; + defer run_start = run_end; + var cur: u32 = blk: { if (use_slots) { - if (hit.doc_id >= ndocs) break :blk SLOT_INVALID; - break :blk slots[hit.doc_id]; + if (doc_id >= ndocs) break :blk SLOT_INVALID; + break :blk slots[doc_id]; } - break :blk idx_by_doc.get(hit.doc_id) orelse SLOT_NONE; + break :blk idx_by_doc.get(doc_id) orelse SLOT_NONE; }; if (cur == SLOT_NONE) { - const hit_path = self.word_index.hitPath(hit); + const hit_path = self.word_index.hitPath(word_hits[run_start]); if (hit_path.len == 0) { cur = SLOT_INVALID; } else { - const is_doc = isDocLanguage(detectLanguage(hit_path)); - const defines = !is_doc and self.fileDefinesSymbol(hit_path, query); + // One outline fetch serves both signals: language is + // detectLanguage(path) computed at outline init, and + // the defines scan is gated by the symbol-name-length + // mask — the old shape hashed the path twice + // (detectLanguage + fileDefinesSymbol's own get). + // Files with no outline never define (same as + // fileDefinesSymbol's `orelse return false`). + var is_doc = false; + var defines = false; + if (self.outlines.get(hit_path)) |o| { + is_doc = isDocLanguage(o.language); + if (!is_doc and o.name_len_mask & FileOutline.nameLenBit(query.len) != 0) { + for (o.symbols.items) |sym| { + if (asciiEqlIgnoreCase(sym.name, query)) { + defines = true; + break; + } + } + } + } else { + is_doc = isDocLanguage(detectLanguage(hit_path)); + } cur = @intCast(tier0_files.items.len); tier0_files.append(allocator, .{ .path = hit_path, - .doc_id = hit.doc_id, + .doc_id = doc_id, .count = 0, - .first_seen = ordinal, - .hits_end = ordinal + 1, + .first_seen = run_start, + .hits_end = run_end, .is_doc = is_doc, .defines = defines, }) catch { @@ -2791,18 +2808,16 @@ pub const Explorer = struct { }; } if (use_slots) { - if (hit.doc_id < ndocs) slots[hit.doc_id] = cur; + if (doc_id < ndocs) slots[doc_id] = cur; } else { - idx_by_doc.put(hit.doc_id, cur) catch {}; + idx_by_doc.put(doc_id, cur) catch {}; } } if (cur != SLOT_INVALID) { const e = &tier0_files.items[cur]; - e.count +|= 1; - e.hits_end = ordinal + 1; + e.count +|= @intCast(@min(run_end - run_start, std.math.maxInt(u32))); + e.hits_end = run_end; } - last_doc_id = hit.doc_id; - last_cur = cur; } // Sort plain u64 keys instead of the 48-byte structs. The old @@ -2932,27 +2947,21 @@ pub const Explorer = struct { var hits_per_file = std.StringHashMap(u32).init(allocator); defer hits_per_file.deinit(); hits_per_file.ensureTotalCapacity(@intCast(@min(word_hits.len, 1024))) catch {}; - // Same contiguous-posting fast path as Tier 0's grouping: - // consecutive hits share a doc_id, so the per-hit hitPath + - // string getOrPut collapses to once per unique file. The - // cached pointer is only dereferenced immediately after being - // (re)set, so it cannot dangle across a rehash. - var hpf_last_doc: u32 = 0; - var hpf_last: ?*u32 = null; - for (word_hits) |hit| { - if (hpf_last) |cnt| { - if (hit.doc_id == hpf_last_doc) { - cnt.* += 1; - continue; - } - } - const hp = self.word_index.hitPath(hit); + // Same contiguous-posting decomposition as Tier 0's grouping: + // the hit list is runs of one doc_id each, so hitPath + the + // string getOrPut run once per file run, not per hit. + var hpf_run_start: usize = 0; + while (hpf_run_start < word_hits.len) { + const hpf_doc = word_hits[hpf_run_start].doc_id; + var hpf_run_end = hpf_run_start + 1; + while (hpf_run_end < word_hits.len and word_hits[hpf_run_end].doc_id == hpf_doc) hpf_run_end += 1; + defer hpf_run_start = hpf_run_end; + + const hp = self.word_index.hitPath(word_hits[hpf_run_start]); if (hp.len == 0) continue; const gop_h = try hits_per_file.getOrPut(hp); if (!gop_h.found_existing) gop_h.value_ptr.* = 0; - gop_h.value_ptr.* += 1; - hpf_last_doc = hit.doc_id; - hpf_last = gop_h.value_ptr; + gop_h.value_ptr.* += @intCast(@min(hpf_run_end - hpf_run_start, std.math.maxInt(u32))); } const SortCtx = struct { contents: *ContentCache, @@ -3101,42 +3110,44 @@ pub const Explorer = struct { slots = allocator.alloc(u32, ndocs) catch &.{}; if (slots.len > 0) @memset(slots, SLOT_NONE); } - for (word_hits, 0..) |hit, ordinal| { - // Postings for one file are appended contiguously (indexFile - // processes whole files), so consecutive hits almost always share - // a doc_id — checking the newest entry first resolves them - // without touching the slots or the rescan at all. - if (tier0_files_len > 0 and tier0_files_buf[tier0_files_len - 1].doc_id == hit.doc_id) { - tier0_files_buf[tier0_files_len - 1].count +|= 1; - tier0_files_buf[tier0_files_len - 1].hits_end = ordinal + 1; - continue; - } - const hit_path = self.word_index.hitPath(hit); + // Postings for one file are appended contiguously (indexFile + // processes whole files), so the hit list decomposes into runs of one + // doc_id each — group run-at-a-time: scan to the run boundary first, + // then touch the slots/rescan and the entry ONCE per run. + var run_start: usize = 0; + while (run_start < word_hits.len) { + const doc_id = word_hits[run_start].doc_id; + var run_end = run_start + 1; + while (run_end < word_hits.len and word_hits[run_end].doc_id == doc_id) run_end += 1; + defer run_start = run_end; + + const hit_path = self.word_index.hitPath(word_hits[run_start]); if (hit_path.len == 0) continue; var found_i: ?usize = null; if (slots.len > 0) { - if (hit.doc_id < slots.len and slots[hit.doc_id] != SLOT_NONE) found_i = slots[hit.doc_id]; + if (doc_id < slots.len and slots[doc_id] != SLOT_NONE) found_i = slots[doc_id]; } else { for (tier0_files_buf[0..tier0_files_len], 0..) |stats, i| { - if (stats.doc_id == hit.doc_id) { + if (stats.doc_id == doc_id) { found_i = i; break; } } } + const run_count: u32 = @intCast(@min(run_end - run_start, std.math.maxInt(u32))); if (found_i) |i| { - tier0_files_buf[i].count +|= 1; - tier0_files_buf[i].hits_end = ordinal + 1; + tier0_files_buf[i].count +|= run_count; + tier0_files_buf[i].hits_end = run_end; } else { if (tier0_files_len >= tier0_files_buf.len) return false; - if (slots.len > 0 and hit.doc_id < slots.len) slots[hit.doc_id] = @intCast(tier0_files_len); + if (slots.len > 0 and doc_id < slots.len) slots[doc_id] = @intCast(tier0_files_len); tier0_files_buf[tier0_files_len] = .{ - .doc_id = hit.doc_id, + .doc_id = doc_id, .path = hit_path, - .count = 1, - .first_seen = ordinal, - .hits_end = ordinal + 1, + .count = run_count, + .first_seen = run_start, + .hits_end = run_end, .is_doc = isDocLanguage(detectLanguage(hit_path)), }; tier0_files_len += 1;