Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 101 additions & 12 deletions src/explore.zig
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ const MmapTrigramIndex = idx.MmapTrigramIndex;
const AnyTrigramIndex = idx.AnyTrigramIndex;
const SparseNgramIndex = idx.SparseNgramIndex;
const codegraph = @import("codegraph.zig");
const git = @import("git.zig");

/// Fast hash context for u32-keyed maps on hot paths (ranked search aggregation).
/// Zig's AutoHashMap runs the 4 key bytes through Wyhash even for an integer key;
Expand Down Expand Up @@ -775,6 +776,16 @@ pub const Explorer = struct {
call_graph: ?CallGraph = null,
centrality_build_mu: cio.Mutex = .{},
root_dir: ?std.Io.Dir = null,
/// Absolute project root path (duped in setRoot) — needed to shell out to
/// git for the co-change map (#550).
root_path: ?[]const u8 = null,
/// file → strongest git co-change partners (#550). Built lazily from
/// `git log --name-only`; null until built (or unavailable: no root, not
/// a repo). Guarded by cochange_build_mu; attempted-flag stops re-shelling
/// when git is absent.
co_change: ?std.StringHashMap([]git.CoChangePartner) = null,
co_change_attempted: bool = false,
cochange_build_mu: cio.Mutex = .{},
io: ?std.Io = null,
/// When non-null, append one JSON line per searchContent invocation
/// to this path (v0 rerank-trace experiment). Borrowed; caller owns
Expand Down Expand Up @@ -807,6 +818,8 @@ pub const Explorer = struct {
/// Bind the explorer to a project root.
///
/// Stores `io`, opens `root_path` as `root_dir` (left null when the open
/// fails) and keeps an allocator-owned copy of the path in `self.root_path`
/// (left null when the copy cannot be allocated). Any previously stored copy
/// is released.
pub fn setRoot(self: *Explorer, io: std.Io, root_path: []const u8) void {
    self.io = io;
    self.root_dir = std.Io.Dir.cwd().openDir(io, root_path, .{}) catch null;

    // Duplicate BEFORE freeing the old copy: `root_path` may alias the slice
    // we already own (e.g. a caller re-applying the current root), and
    // freeing first would make the dupe read released memory.
    const fresh: ?[]const u8 = self.allocator.dupe(u8, root_path) catch null;
    if (self.root_path) |old| self.allocator.free(old);
    self.root_path = fresh;
}

pub fn init(allocator: std.mem.Allocator, content_cache_capacity: u32) Explorer {
Expand Down Expand Up @@ -848,6 +861,8 @@ pub const Explorer = struct {
self.contents.deinit();
if (self.call_centrality) |*c| c.deinit();
if (self.call_graph) |*cg| cg.deinit(self.allocator);
if (self.co_change) |*cc| git.freeCoChange(cc, self.allocator);
if (self.root_path) |p| self.allocator.free(p);

self.word_index.deinit();
self.trigram_index.deinit();
Expand Down Expand Up @@ -2520,17 +2535,7 @@ pub const Explorer = struct {
const gop = tier0_files_by_path.getOrPut(hit_path) catch continue;
if (!gop.found_existing) {
const is_doc = isDocLanguage(detectLanguage(hit_path));
var defines = false;
if (!is_doc) {
if (self.outlines.get(hit_path)) |outline| {
for (outline.symbols.items) |sym| {
if (asciiEqlIgnoreCase(sym.name, query)) {
defines = true;
break;
}
}
}
}
const defines = !is_doc and self.fileDefinesSymbol(hit_path, query);
gop.value_ptr.* = .{
.path = hit_path,
.count = 0,
Expand Down Expand Up @@ -3007,9 +3012,23 @@ pub const Explorer = struct {
graph_dist = self.queryGraphDistances(&gd_terms, ga);
}

// #550 signal 2: git co-change. Seeds are the result files that
// DEFINE the queried symbol, so plain word queries never trigger the
// one-time `git log` shell-out.
var cc_seeds = std.StringHashMap(void).init(allocator);
defer cc_seeds.deinit();
if (cio.posixGetenv("CODEDB_NO_COCHANGE") == null) {
for (result_list.items) |r| {
if (cc_seeds.contains(r.path)) continue;
if (self.fileDefinesSymbol(r.path, query)) cc_seeds.put(r.path, {}) catch {};
}
if (cc_seeds.count() > 0) self.ensureCoChange();
}

for (result_list.items) |*r| {
r.score = self.rerankSignalScore(r.*, query);
r.score *= graphDistanceBoost(graph_dist, r.path);
r.score *= self.coChangeBoost(&cc_seeds, r.path);
if (lfp.enabled) r.score *= lfp.multiplier(file_hit_counts.get(r.path) orelse 1, max_file_hits);
if (sp.enabled) r.score *= sp.multiplier(self, r.path);
}
Expand Down Expand Up @@ -3308,6 +3327,54 @@ pub const Explorer = struct {
return false;
}

/// Reports whether the outline stored for `path` contains a symbol whose
/// name equals `name` under ASCII case-insensitive comparison. Symbol kind
/// is not inspected. Returns false when no outline is stored for `path`.
/// Works from `outlines` only, so it does not depend on symbol_index (#564).
fn fileDefinesSymbol(self: *const Explorer, path: []const u8, name: []const u8) bool {
    if (self.outlines.get(path)) |file_outline| {
        return for (file_outline.symbols.items) |symbol| {
            if (asciiEqlIgnoreCase(symbol.name, name)) break true;
        } else false;
    }
    return false;
}

/// Build the git co-change map once (#550): `git log --name-only` over
/// the last 500 commits, mega-commits (>32 files) skipped, top 8 partners
/// per file. Needs root_path (setRoot); silently unavailable outside a
/// git repo — the attempted flag stops re-shelling after one try.
/// Call while holding at least a shared lock on `mu`.
///
/// All reads and writes of `co_change` / `co_change_attempted` happen under
/// `cochange_build_mu`. There is deliberately no unlocked fast path: callers
/// that arrive while another thread is building wait for it, so that the
/// `co_change` they read afterwards is fully written rather than observed
/// mid-assignment.
fn ensureCoChange(self: *Explorer) void {
    self.cochange_build_mu.lock();
    defer self.cochange_build_mu.unlock();

    if (self.co_change != null or self.co_change_attempted) return;

    // Without a root there is nothing to shell out to, so do not latch the
    // attempted flag: a query that runs before setRoot must not disable
    // co-change for the rest of the process.
    const root = self.root_path orelse return;

    // Latch before building so a failed build (not a repo, git missing) is
    // never retried.
    self.co_change_attempted = true;
    self.co_change = git.buildCoChange(self.allocator, root, 500, 32, 8);
}

/// Score multiplier for `path` based on git co-change with the seed files
/// (the files defining the queried symbol) — the temporal counterpart of
/// graphDistanceBoost (#550).
///
/// Returns 1.0 (neutral) when the co-change map is absent, when there are no
/// seeds, when `path` is itself a seed, or when the strongest shared-commit
/// count between `path` and any seed is below 2. Otherwise returns
/// 1.0 + 0.25 * min(count / 8, 1), i.e. at most 1.25. Never below 1.0, so it
/// can only promote, not filter.
fn coChangeBoost(self: *const Explorer, seeds: *const std.StringHashMap(void), path: []const u8) f32 {
    const neutral: f32 = 1.0;

    const map = self.co_change orelse return neutral;
    if (seeds.count() == 0 or seeds.contains(path)) return neutral;

    // Strongest shared-commit count between `path` and any seed.
    var strongest: u32 = 0;
    var seed_it = seeds.keyIterator();
    while (seed_it.next()) |seed| {
        const partners = map.get(seed.*) orelse continue;
        for (partners) |partner| {
            if (!std.mem.eql(u8, partner.path, path)) continue;
            strongest = @max(strongest, partner.count);
        }
    }

    // A single shared commit is treated as noise.
    if (strongest < 2) return neutral;

    const ratio = @as(f32, @floatFromInt(strongest)) / 8.0;
    const strength = @min(ratio, 1.0);
    return 1.0 + 0.25 * strength;
}

/// Public, lock-acquiring entry point for single-threaded callers (the
/// index/scan path) to pre-build call_centrality before persisting a snapshot,
/// so a later load can restore it instead of paying the lazy first-query build.
Expand Down Expand Up @@ -3694,6 +3761,27 @@ pub const Explorer = struct {
}
if (per_doc.count() == 0) return try allocator.alloc(SearchResult, 0);

// #550 signal 2: git co-change. Seeds are the candidate files that
// DEFINE a query-named symbol; without seeds the one-time `git log`
// shell-out never happens.
var cc_seeds = std.StringHashMap(void).init(ta);
if (cio.posixGetenv("CODEDB_NO_COCHANGE") == null) {
var seed_iter = per_doc.iterator();
while (seed_iter.next()) |entry| {
const doc_id = entry.key_ptr.*;
const p = if (doc_id < self.word_index.id_to_path.items.len) self.word_index.id_to_path.items[doc_id] else "";
if (p.len == 0 or cc_seeds.contains(p)) continue;
var t_it = terms_set.keyIterator();
while (t_it.next()) |t| {
if (self.fileDefinesSymbol(p, t.*)) {
cc_seeds.put(p, {}) catch {};
break;
}
}
}
if (cc_seeds.count() > 0) self.ensureCoChange();
}

const Cand = struct { doc_id: u32, score: f32, best_line: u32 };
var cands: std.ArrayList(Cand) = .empty;
defer cands.deinit(ta);
Expand All @@ -3707,7 +3795,8 @@ pub const Explorer = struct {
.score = entry.value_ptr.score *
pathRelevanceMultiplier(cand_path, &terms_set) *
self.centralityBoost(cand_path) *
graphDistanceBoost(graph_dist, cand_path),
graphDistanceBoost(graph_dist, cand_path) *
self.coChangeBoost(&cc_seeds, cand_path),
.best_line = entry.value_ptr.best_line,
});
}
Expand Down
143 changes: 143 additions & 0 deletions src/git.zig
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,146 @@ pub fn getGitHead(root: []const u8, allocator: std.mem.Allocator) !?[40]u8 {
@memcpy(&out, trimmed[0..40]);
return out;
}

/// One co-change partner of a file: another file that appeared in the same
/// commits, together with how many commits they shared.
pub const CoChangePartner = struct {
    /// Path of the partner file, as it appeared in the git log text.
    path: []const u8,
    /// Number of commits in which both files were listed.
    count: u32,
};

/// True when `line` looks like a full git object name as printed by `%H`:
/// entirely hexadecimal and either 40 characters (SHA-1 repositories) or
/// 64 characters (SHA-256 repositories). Anything else — including the empty
/// string and abbreviated hashes — is not a commit line.
fn isCommitSha(line: []const u8) bool {
    // 40 = SHA-1 object name, 64 = SHA-256 object name.
    if (line.len != 40 and line.len != 64) return false;
    for (line) |c| {
        if (!std.ascii.isHex(c)) return false;
    }
    return true;
}

/// Turn the text of `git log --name-only --pretty=format:%H` into a
/// co-change map: file → its strongest partners, ranked by the number of
/// commits the two files share (ties broken by partner path, ascending).
///
/// Pure over `log_text`. A line accepted by isCommitSha starts a new commit;
/// every other non-empty line is a file path, except lines beginning with
/// `"` (git-quoted names), which are dropped. Commits listing fewer than two
/// or more than `max_files_per_commit` files contribute nothing. At most
/// `max_partners` partners are kept per file.
///
/// All keys, partner slices and partner paths in the returned map are
/// allocated with `allocator`; the caller releases them with freeCoChange.
/// Fails only on allocation failure, in which case nothing is leaked.
pub fn parseCoChange(
    allocator: std.mem.Allocator,
    log_text: []const u8,
    max_files_per_commit: usize,
    max_partners: usize,
) !std.StringHashMap([]CoChangePartner) {
    // Orders partners strongest-first, then by path so the result is stable.
    const Ranking = struct {
        fn strongerFirst(_: void, x: CoChangePartner, y: CoChangePartner) bool {
            if (x.count == y.count) return std.mem.lessThan(u8, x.path, y.path);
            return x.count > y.count;
        }
    };

    // Scratch arena: pair keys, tallies and per-file lists all die with it.
    var scratch_state = std.heap.ArenaAllocator.init(allocator);
    defer scratch_state.deinit();
    const scratch = scratch_state.allocator();

    // Pass 1: count shared commits per unordered file pair, keyed "lo\x00hi".
    var tallies = std.StringHashMap(u32).init(scratch);
    var batch: std.ArrayList([]const u8) = .empty;

    var line_it = std.mem.splitScalar(u8, log_text, '\n');
    while (true) {
        const next_line = line_it.next();
        const at_end = next_line == null;
        const text = std.mem.trimEnd(u8, next_line orelse "", "\r");

        if (!at_end and !isCommitSha(text)) {
            // Ordinary line: collect it as a file of the current commit.
            const skip = text.len == 0 or text[0] == '"';
            if (!skip) try batch.append(scratch, text);
            continue;
        }

        // Commit boundary (or end of input): flush the files gathered so far.
        const touched = batch.items;
        if (touched.len >= 2 and touched.len <= max_files_per_commit) {
            for (touched, 0..) |first, i| {
                for (touched[i + 1 ..]) |second| {
                    if (std.mem.eql(u8, first, second)) continue;
                    const first_is_lo = std.mem.lessThan(u8, first, second);
                    const lo = if (first_is_lo) first else second;
                    const hi = if (first_is_lo) second else first;
                    const key = try std.fmt.allocPrint(scratch, "{s}\x00{s}", .{ lo, hi });
                    const slot = try tallies.getOrPut(key);
                    if (slot.found_existing) {
                        slot.value_ptr.* += 1;
                    } else {
                        slot.value_ptr.* = 1;
                    }
                }
            }
        }
        batch.clearRetainingCapacity();
        if (at_end) break;
    }

    // Pass 2: fan each pair out into both files' partner lists.
    var by_file = std.StringHashMap(std.ArrayList(CoChangePartner)).init(scratch);
    var tally_it = tallies.iterator();
    while (tally_it.next()) |tally| {
        const key = tally.key_ptr.*;
        const nul = std.mem.indexOfScalar(u8, key, 0) orelse continue;
        const left = key[0..nul];
        const right = key[nul + 1 ..];
        const shared = tally.value_ptr.*;

        const directed = [2][2][]const u8{
            [2][]const u8{ left, right },
            [2][]const u8{ right, left },
        };
        for (directed) |edge| {
            const slot = try by_file.getOrPut(edge[0]);
            if (!slot.found_existing) slot.value_ptr.* = .empty;
            try slot.value_ptr.append(scratch, .{ .path = edge[1], .count = shared });
        }
    }

    // Pass 3: rank, truncate, and copy out of the arena into caller-owned memory.
    var result = std.StringHashMap([]CoChangePartner).init(allocator);
    errdefer freeCoChange(&result, allocator);

    var file_it = by_file.iterator();
    while (file_it.next()) |file_entry| {
        const ranked = file_entry.value_ptr.items;
        std.mem.sort(CoChangePartner, ranked, {}, Ranking.strongerFirst);

        const keep = @min(ranked.len, max_partners);
        const owned = try allocator.alloc(CoChangePartner, keep);
        var copied: usize = 0;
        // Until the slice is handed to `result`, this iteration owns it.
        errdefer {
            for (owned[0..copied]) |partner| allocator.free(partner.path);
            allocator.free(owned);
        }
        while (copied < keep) : (copied += 1) {
            const src = ranked[copied];
            owned[copied] = .{ .path = try allocator.dupe(u8, src.path), .count = src.count };
        }

        const owned_key = try allocator.dupe(u8, file_entry.key_ptr.*);
        errdefer allocator.free(owned_key);
        try result.put(owned_key, owned);
    }
    return result;
}

/// Release a co-change map: frees every partner path, every partner slice
/// and every key with `allocator`, then deinitializes the map itself.
/// `allocator` must be the one the entries were allocated with.
pub fn freeCoChange(map: *std.StringHashMap([]CoChangePartner), allocator: std.mem.Allocator) void {
    var entries = map.iterator();
    while (entries.next()) |kv| {
        const partners = kv.value_ptr.*;
        const owner = kv.key_ptr.*;
        for (partners) |partner| {
            allocator.free(partner.path);
        }
        allocator.free(partners);
        allocator.free(owner);
    }
    map.deinit();
}

/// Shell out to git log in `root` and build the co-change map (#550).
///
/// Runs `git log --name-only --relative --no-merges --pretty=format:%H -n
/// <max_commits>` with `root` as the working directory and feeds the output
/// to parseCoChange. `--relative` makes git print paths relative to `root`
/// (and drop files outside it) instead of relative to the repository's top
/// level, so the map keys stay comparable with root-relative paths even when
/// `root` is a subdirectory of the repository; at the top level it changes
/// nothing.
///
/// Returns null on any failure: the command cannot be run or its output
/// exceeds 8 MiB, git exits non-zero or is killed, parsing runs out of
/// memory, or the resulting map is empty. On success the caller owns the map
/// and frees it with freeCoChange.
pub fn buildCoChange(
    allocator: std.mem.Allocator,
    root: []const u8,
    max_commits: u32,
    max_files_per_commit: usize,
    max_partners: usize,
) ?std.StringHashMap([]CoChangePartner) {
    // A u32 needs at most 10 decimal digits, so 16 bytes always suffice.
    var nbuf: [16]u8 = undefined;
    const nstr = std.fmt.bufPrint(&nbuf, "{d}", .{max_commits}) catch return null;

    const result = cio.runCapture(.{
        .allocator = allocator,
        .argv = &.{ "git", "log", "--name-only", "--relative", "--no-merges", "--pretty=format:%H", "-n", nstr },
        .cwd = root,
        .max_output_bytes = 8 * 1024 * 1024,
    }) catch return null;
    defer allocator.free(result.stdout);
    defer allocator.free(result.stderr);

    // Only a clean exit carries usable output.
    switch (result.term) {
        .Exited => |code| if (code != 0) return null,
        else => return null,
    }

    var map = parseCoChange(allocator, result.stdout, max_files_per_commit, max_partners) catch return null;
    if (map.count() == 0) {
        // Empty history: report "unavailable" rather than an empty map.
        freeCoChange(&map, allocator);
        return null;
    }
    return map;
}
Loading
Loading