From 3e98586d7001ad70f6af1fcf1e87b773a59494b1 Mon Sep 17 00:00:00 2001 From: Rahul Rajaram Date: Thu, 26 Mar 2026 11:16:22 -0400 Subject: [PATCH] refactor: decompose main.rs into 15 focused modules Break a monolithic 13,241-line main.rs into cohesive modules: - cli.rs: Clap CLI struct definitions (1173 lines) - types.rs: all shared type definitions (978 lines) - config.rs: config loading and resolution (107 lines) - search.rs: BM25, SimHash, MinHash primitives (317 lines) - util.rs: shared utilities and index loaders (579 lines) - index.rs: index building (639 lines) - assemble.rs: assembly engine and cross-refs (1570 lines) - mcp.rs: MCP protocol handlers (767 lines) - commands_query.rs: query, similar, dupes, diff (1243 lines) - commands_text.rs: stats, repl, vocabulary, eval (840 lines) - commands_links.rs: check-links, policy, fix-links (1377 lines) - commands_graph.rs: mv, paths, graph, stale, health (877 lines) - commands_audit.rs: orphans, canonicality, suggest (498 lines) - tests_main.rs: unit tests via #[path] (1939 lines) main.rs is now 495 lines (module wiring + dispatch). All 125 tests pass, zero clippy warnings. 
---
 src/assemble.rs       |  1570 +++++
 src/cli.rs            |  1173 ++++
 src/commands_audit.rs |   498 ++
 src/commands_graph.rs |   877 +++
 src/commands_links.rs |  1377 ++++
 src/commands_query.rs |  1243 ++++
 src/commands_text.rs  |   840 +++
 src/config.rs         |   107 +
 src/index.rs          |   639 ++
 src/main.rs           | 13602 ++--------------------------------------
 src/mcp.rs            |   767 +++
 src/search.rs         |   317 +
 src/tests_main.rs     |  1939 ++++++
 src/types.rs          |   978 +++
 src/util.rs           |   579 ++
 15 files changed, 13332 insertions(+), 13174 deletions(-)
 create mode 100644 src/assemble.rs
 create mode 100644 src/cli.rs
 create mode 100644 src/commands_audit.rs
 create mode 100644 src/commands_graph.rs
 create mode 100644 src/commands_links.rs
 create mode 100644 src/commands_query.rs
 create mode 100644 src/commands_text.rs
 create mode 100644 src/config.rs
 create mode 100644 src/index.rs
 create mode 100644 src/mcp.rs
 create mode 100644 src/search.rs
 create mode 100644 src/tests_main.rs
 create mode 100644 src/types.rs
 create mode 100644 src/util.rs

diff --git a/src/assemble.rs b/src/assemble.rs
new file mode 100644
index 0000000..d940fd7
--- /dev/null
+++ b/src/assemble.rs
@@ -0,0 +1,1570 @@
+use regex::Regex;
+use std::collections::{HashMap, HashSet};
+use std::fs;
+use std::path::{Path, PathBuf};
+
+use crate::search::*;
+use crate::types::*;
+use crate::util::*;
+
+/// Find the sections most relevant to `query`.
+///
+/// Documents are ranked by BM25 and only those scoring above 0.01 are kept.
+/// The top 20 are read back through `read_indexed_doc` and split along their
+/// indexed section fingerprints; a document without fingerprints becomes a
+/// single "Full Document" section. Documents that cannot be read are skipped.
+/// Every section currently carries its document's BM25 score.
+///
+/// Returns at most `max_sections` matches ordered by
+/// `compare_sections_by_relevance`, or an empty vector when the query yields
+/// no terms.
+pub(crate) fn search_relevant_sections(
+    query: &str,
+    index: &ForwardIndex,
+    max_sections: usize,
+) -> Vec<SectionMatch> {
+    let query_terms = parse_query_terms(query, true);
+    if query_terms.is_empty() {
+        return Vec::new();
+    }
+
+    let mut all_sections: Vec<SectionMatch> = Vec::new();
+
+    // First, get top documents by BM25
+    let mut doc_scores: Vec<(&String, &FileEntry, f64)> = index
+        .files
+        .iter()
+        .map(|(path, entry)| {
+            let score = bm25_score(&query_terms, entry, index.avg_doc_length, &index.idf_map);
+            (path, entry, score)
+        })
+        .filter(|(_, _, score)| *score > 0.01)
+        .collect();
+
+    // Highest score first; equal scores fall back to path order so the
+    // top-20 cut does not depend on the iteration order of `index.files`.
+    doc_scores.sort_by(|a, b| {
+        b.2.partial_cmp(&a.2)
+            .unwrap_or(std::cmp::Ordering::Equal)
+            .then_with(|| a.0.cmp(b.0))
+    });
+
+    // Take top 20 documents
+    for (doc_path, entry, doc_score) in doc_scores.iter().take(20) {
+        let canonicality = score_canonicality(doc_path, entry);
+
+        // Split document into sections based on section_fingerprints
+        if !entry.section_fingerprints.is_empty() {
+            if let Ok(content) = read_indexed_doc(index, doc_path, entry) {
+                let lines: Vec<&str> = content.lines().collect();
+
+                // Use indexed sections
+                for section in &entry.section_fingerprints {
+                    // line_start is used as 1-based here; clamp the end so a
+                    // range past the end of the document cannot panic.
+                    let start = section.line_start.saturating_sub(1);
+                    let end = section.line_end.min(lines.len());
+
+                    if start < end {
+                        let section_content = lines[start..end].join("\n");
+
+                        all_sections.push(SectionMatch {
+                            doc_path: (*doc_path).to_string(),
+                            heading: section.heading.clone(),
+                            line_start: section.line_start,
+                            line_end: section.line_end,
+                            bm25_score: *doc_score, // Use doc-level score for now
+                            content: section_content,
+                            canonicality,
+                        });
+                    }
+                }
+            }
+        } else {
+            // Fallback: treat whole doc as one section
+            if let Ok(content) = read_indexed_doc(index, doc_path, entry) {
+                all_sections.push(SectionMatch {
+                    doc_path: (*doc_path).to_string(),
+                    heading: "Full Document".to_string(),
+                    line_start: 1,
+                    line_end: content.lines().count(),
+                    bm25_score: *doc_score,
+                    content,
+                    canonicality,
+                });
+            }
+        }
+    }
+
+    // Sort by combined score with deterministic tie-breaks.
+    all_sections.sort_by(compare_sections_by_relevance);
+
+    // Take top N sections
+    all_sections.into_iter().take(max_sections).collect()
+}
+
+/// Score document canonicality from its path and filename patterns.
+///
+/// Starts from a 0.5 baseline, adds boosts for ADR/architecture/index
+/// directories and for readme/index/guide/runbook/plan filenames, subtracts
+/// penalties for scratch/archive/old/deprecated/backup paths, and clamps the
+/// result to `[0.0, 1.0]`. Matching is case-insensitive. `_entry` is not
+/// consulted yet; recency is not part of the score.
+pub(crate) fn score_canonicality(doc_path: &str, _entry: &FileEntry) -> f64 {
+    let mut score: f64 = 0.5; // baseline
+
+    let path_lower = doc_path.to_lowercase();
+
+    // Path-based boosts
+    if path_lower.contains("docs/adr/") || path_lower.contains("docs/architecture/") {
+        score += 0.2;
+    }
+    if path_lower.contains("docs/index/") {
+        score += 0.15;
+    }
+    // NOTE(review): these are plain substring tests, so "old" also matches
+    // paths such as "folder" or "threshold" -- confirm that is intended.
+    if path_lower.contains("scratch")
+        || path_lower.contains("archive")
+        || path_lower.contains("old")
+    {
+        score -= 0.3;
+    }
+    if path_lower.contains("deprecated") || path_lower.contains("backup") {
+        score -= 0.25;
+    }
+
+    // Filename patterns
+    let filename = Path::new(doc_path)
+        .file_name()
+        .and_then(|s| s.to_str())
+        .unwrap_or("")
+        .to_lowercase();
+
+    if filename.contains("readme") || filename.contains("index") {
+        score += 0.1;
+    }
+    if filename.contains("guide") || filename.contains("runbook") || filename.contains("plan") {
+        score += 0.1;
+    }
+
+    // Recency (approximate - we don't have mtime in index yet)
+    // For now, we'll just use this as a placeholder
+    // In future: add last_modified to FileEntry
+
+    // Clamp to [0.0, 1.0]
+    score.clamp(0.0, 1.0)
+}
+
+/// Distill sections into a markdown digest within a token budget.
+///
+/// The digest consists of a header, a ranked list of up to 10 source
+/// documents, the section bodies in the order given, and a metadata footer.
+/// Token usage is tracked with [`estimate_tokens`]. Once the next section
+/// would exceed `max_tokens`, a truncated copy is emitted if more than 200
+/// bytes of budget remain, and no further sections are added. Truncation
+/// never splits a UTF-8 character.
+pub(crate) fn distill_to_markdown(
+    sections: &[SectionMatch],
+    query: &str,
+    max_tokens: usize,
+) -> String {
+    let mut output = String::new();
+    let mut used_tokens = 0;
+
+    // Header
+    let header = format!(
+        "# Context Digest for: \"{}\"\n\n\
+         **Generated:** {}\n\
+         **Token Budget:** {}\n\
+         **Documents Scanned:** N/A\n\
+         **Sections Selected:** {}\n\n\
+         ---\n\n",
+        query,
+        chrono_now(),
+        max_tokens,
+        sections.len()
+    );
+    output.push_str(&header);
+    used_tokens += estimate_tokens(&header);
+
+    // Group sections by document
+    let mut doc_groups: HashMap<String, Vec<&SectionMatch>> = HashMap::new();
+    for section in sections {
+        doc_groups
+            .entry(section.doc_path.clone())
+            .or_default()
+            .push(section);
+    }
+
+    // Top Relevant Documents section
+    output.push_str("## Top Relevant Documents\n\n");
+    used_tokens += 10;
+
+    // Rank documents by the combined score of their first section. HashMap
+    // iteration order is unspecified, so ties are broken by path to keep the
+    // digest reproducible.
+    let mut ranked_docs: Vec<_> = doc_groups.iter().collect();
+    ranked_docs.sort_by(|a, b| {
+        let score_a = a.1[0].bm25_score * 0.7 + a.1[0].canonicality * 0.3;
+        let score_b = b.1[0].bm25_score * 0.7 + b.1[0].canonicality * 0.3;
+        score_b
+            .partial_cmp(&score_a)
+            .unwrap_or(std::cmp::Ordering::Equal)
+            .then_with(|| a.0.cmp(b.0))
+    });
+
+    for (idx, (doc_path, doc_sections)) in ranked_docs.iter().enumerate().take(10) {
+        let section = doc_sections[0];
+        let combined_score = section.bm25_score * 0.7 + section.canonicality * 0.3;
+        let doc_line = format!(
+            "{}. **{}** (score: {:.2}, canonical: {:.2})\n - Sections included: {}\n\n",
+            idx + 1,
+            doc_path,
+            combined_score,
+            section.canonicality,
+            doc_sections.len()
+        );
+        output.push_str(&doc_line);
+        used_tokens += estimate_tokens(&doc_line);
+    }
+
+    output.push_str("---\n\n## Distilled Content\n\n");
+    used_tokens += 10;
+
+    // Add sections
+    for section in sections {
+        if used_tokens >= max_tokens {
+            output.push_str("\n\n*[Content truncated due to token budget]*\n");
+            break;
+        }
+
+        let section_header = format!(
+            "### {} (from {})\n\n**Source:** {}:{}-{} (canonical: {:.2})\n\n",
+            section.heading,
+            section.doc_path,
+            section.doc_path,
+            section.line_start,
+            section.line_end,
+            section.canonicality
+        );
+
+        // Estimate how much space we need
+        let section_tokens = estimate_tokens(&section_header) + estimate_tokens(&section.content);
+
+        if used_tokens + section_tokens > max_tokens {
+            // Try to fit a truncated version. The guard at the top of the
+            // loop guarantees used_tokens < max_tokens here.
+            let remaining_tokens = max_tokens - used_tokens;
+            let chars_to_include = remaining_tokens.saturating_mul(4); // rough approximation
+
+            if chars_to_include > 200 {
+                // Slicing a str at an arbitrary byte offset panics when the
+                // offset falls inside a multi-byte character, so back off to
+                // the nearest boundary (offset 0 is always a boundary).
+                let mut cut = chars_to_include.min(section.content.len());
+                while !section.content.is_char_boundary(cut) {
+                    cut -= 1;
+                }
+
+                output.push_str(&section_header);
+                output.push_str(&section.content[..cut]);
+                output.push_str("\n\n*[Section truncated]*\n");
+            }
+            break;
+        }
+
+        output.push_str(&section_header);
+        output.push_str(&section.content);
+        output.push_str("\n\n---\n\n");
+
+        used_tokens += section_tokens;
+    }
+
+    // Metadata footer
+    let footer = format!(
+        "\n## Metadata\n\n\
+         **Canonicality Scores:**\n\
+         - 0.90+: Authoritative source, prefer over other docs\n\
+         - 0.70-0.89: Reliable, current documentation\n\
+         - 0.50-0.69: Secondary or supporting documentation\n\
+         - <0.50: Potentially stale, use with caution\n\n\
+         **Actual Tokens Used:** ~{used_tokens}\n\n\
+         ---\n\n\
+         ## Usage with LLM\n\n\
+         Paste this digest into your LLM conversation, then ask:\n\n\
+         > Using only the information in the context above, answer: \"{query}\"\n\
+         > Be explicit when something is not documented in the context.\n"
+    );
+
+    output.push_str(&footer);
+
+    output
+}
+
+/// Estimate token count (rough approximation: 1 token ≈ 4 bytes of UTF-8).
+///
+/// Uses `str::len`, i.e. the byte length, so non-ASCII text counts for more
+/// than its character count.
+pub(crate) fn estimate_tokens(text: &str) -> usize {
+    text.len() / 4
+}
+
+/// Extract all deterministic relation edges from a forward index.
+/// Produces document-level links, section-level links, and ADR reference edges.
+///
+/// External URLs, pure in-page anchors, self-links, and link targets that are
+/// not present in the index are skipped. The returned edges are sorted and
+/// de-duplicated.
+pub fn extract_relations(forward_index: &ForwardIndex) -> RelationIndex {
+    // Build normalized-path-to-key map (sorted iteration for determinism).
+    // The key type is whatever `normalize_path` returns; when two index keys
+    // normalize to the same path, the lexicographically first one wins.
+    let mut norm_to_key: HashMap<_, String> = HashMap::new();
+    let mut sorted_keys: Vec<&String> = forward_index.files.keys().collect();
+    sorted_keys.sort();
+    for key in &sorted_keys {
+        let normalized = normalize_path(Path::new(key));
+        norm_to_key
+            .entry(normalized)
+            .or_insert_with(|| (*key).clone());
+    }
+
+    let adr_index = build_adr_index(forward_index);
+    let mut edges: Vec<RelationEdge> = Vec::new();
+
+    for source_key in &sorted_keys {
+        let entry = &forward_index.files[*source_key];
+        let source_base = Path::new(source_key.as_str());
+
+        // Document & section edges from links
+        for link in &entry.links {
+            let target = &link.target;
+
+            // Skip external links
+            if target.starts_with("http://")
+                || target.starts_with("https://")
+                || target.starts_with("mailto:")
+                || target.starts_with("ftp://")
+            {
+                continue;
+            }
+
+            // Split off anchor
+            let (link_path, anchor) = if let Some(idx) = target.find('#') {
+                (
+                    target[..idx].to_string(),
+                    Some(target[idx + 1..].to_string()),
+                )
+            } else {
+                (target.clone(), None)
+            };
+
+            // An empty path means the link was only "#anchor" (same document).
+            if link_path.is_empty() {
+                continue;
+            }
+
+            // Resolve the link relative to the directory of the source file.
+            let resolved = if let Some(parent) = source_base.parent() {
+                parent.join(&link_path).to_string_lossy().to_string()
+            } else {
+                link_path.clone()
+            };
+            let normalized = normalize_path(Path::new(&resolved));
+
+            // Targets that are not indexed produce no edge.
+            let target_key = match norm_to_key.get(&normalized) {
+                Some(k) => k.clone(),
+                None => continue,
+            };
+
+            // Skip self-links
+            if &target_key == *source_key {
+                continue;
+            }
+
+            // Document-level LinksTo edge
+            edges.push(RelationEdge {
+                source: (*source_key).clone(),
+                target: target_key.clone(),
+                kind: RelationKind::LinksTo,
+                anchor: anchor.clone(),
+                source_section: None,
+                target_section: None,
+                raw_text: None,
+            });
+
+            // Section-level edge: only emitted when the link's line falls
+            // inside a known section of the source document.
+            let source_section = find_containing_section(&entry.section_fingerprints, link.line);
+            if source_section.is_some() {
+                let target_section = anchor.as_deref().and_then(|a| {
+                    forward_index
+                        .files
+                        .get(&target_key)
+                        .and_then(|te| resolve_anchor_to_section(te, a))
+                });
+
+                edges.push(RelationEdge {
+                    source: (*source_key).clone(),
+                    target: target_key.clone(),
+                    kind: RelationKind::SectionLinksTo,
+                    anchor: anchor.clone(),
+                    source_section,
+                    target_section,
+                    raw_text: None,
+                });
+            }
+        }
+
+        // ADR reference edges
+        for adr_ref in &entry.adr_references {
+            if let Some(target_path) = adr_index.get(&adr_ref.normalized_id) {
+                // Skip self-links
+                if target_path == *source_key {
+                    continue;
+                }
+
+                let source_section =
+                    find_containing_section(&entry.section_fingerprints, adr_ref.line);
+
+                edges.push(RelationEdge {
+                    source: (*source_key).clone(),
+                    target: target_path.clone(),
+                    kind: RelationKind::AdrReference,
+                    anchor: None,
+                    source_section,
+                    target_section: None,
+                    raw_text: Some(adr_ref.raw_text.clone()),
+                });
+            }
+        }
+    }
+
+    // Sorting then dedup removes repeated edges (e.g. the same link written
+    // twice in one document) and fixes the output order.
+    edges.sort();
+    edges.dedup();
+
+    RelationIndex {
+        version: 1,
+        indexed_at: chrono_now(),
+        total_edges: edges.len(),
+        edges,
+    }
+}
+
+/// Parse markdown links from a section's content
+pub fn parse_markdown_links(section: &SectionMatch, origin_dir: &Path) -> Vec {
+    let mut refs = Vec::new();
+
+    // Regex: [text](target) - we'll filter out ![image] manually
+    let link_regex = Regex::new(r"(!?)\[(?P