From feb600e2ea34592cf81dc12b3c51a4a83caab4f4 Mon Sep 17 00:00:00 2001 From: Bruno Melo Date: Thu, 16 Apr 2026 22:40:56 -0300 Subject: [PATCH 1/3] feat: add --body-regex filter and --help-llm compact cheatsheet Third PR in the LLM-friendly series. Two small additions that complete the filter/help surface area for agents. - --body-regex REGEX: regex variant of --body-grep, mirroring how --url pairs with --url-regex. Compiled at CLI parse time so bad patterns error with exit code 2 before any file is read. Supports (?i) for case-insensitive. Composes with --body-grep and all other filters as AND. - --help-llm: prints a compact flag reference and exits. 1566 bytes vs 3511 for clap's default --help (-55%). Tuned for LLM consumption: one line per flag group, no examples, exit codes documented. Lets an agent discover flags on-demand for ~400 tokens instead of carrying a 1k+ token cheatsheet in every system prompt. 129 tests pass (55 unit + 74 integration). Clippy clean, fmt clean. --- README.md | 2 + src/filter.rs | 23 +++++++++++ src/main.rs | 53 ++++++++++++++++++++++++++ tests/integration.rs | 90 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+) diff --git a/README.md b/README.md index 5f96cd5..7932557 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ Reads from stdin if no file is given. | `--mime ` | `--mime application/json` (matches `application/json; charset=utf-8` too) | | `--min-time ` | `--min-time 500` | | `--body-grep ` | Match against request or response body text (case-sensitive). | +| `--body-regex ` | Regex match against request or response body text. Use `(?i)pattern` for case-insensitive. | Filters combine with AND logic. @@ -84,6 +85,7 @@ Static-asset response bodies (images, fonts, CSS, JS, WASM, video, audio) are st |------|-------------| | `--validate` | Validate HAR only, no query | | `-v`, `--verbose` | Print parsing info to stderr | +| `--help-llm` | Print a compact, LLM-tuned cheatsheet of every flag (~1.5 KB vs ~3.5 KB for `--help`) and exit. | ### Exit codes diff --git a/src/filter.rs b/src/filter.rs index cc770ab..695e7d2 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -110,6 +110,9 @@ pub struct FilterOptions { /// Matches if either contains the pattern. Agents fall through to /// `grep`/`rg` on raw HAR otherwise, which is noisy and unreliable. pub body_grep: Option, + /// Regex variant of `body_grep`. Compiled at CLI parse time (invalid + /// patterns error with exit code 2 before any file is read). + pub body_regex: Option, } /// Filter entries against the provided options, preserving each entry's @@ -174,9 +177,29 @@ fn matches_all(entry: &Entry, opts: &FilterOptions) -> bool { { return false; } + if let Some(ref re) = opts.body_regex + && !body_matches_regex(entry, re) + { + return false; + } true } +fn body_matches_regex(entry: &Entry, re: &Regex) -> bool { + if let Some(resp_text) = entry.response.content.text.as_deref() + && re.is_match(resp_text) + { + return true; + } + if let Some(post_data) = &entry.request.post_data + && let Some(req_text) = post_data.text.as_deref() + && re.is_match(req_text) + { + return true; + } + false +} + fn body_contains(entry: &Entry, pat: &str) -> bool { if let Some(resp_text) = entry.response.content.text.as_deref() && resp_text.contains(pat) diff --git a/src/main.rs b/src/main.rs index 829b432..fb008da 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,6 +56,11 @@ struct Cli { #[arg(long)] body_grep: Option, + /// Filter by regex match against request or response body text. + /// Use `(?i)pattern` for case-insensitive matching. + #[arg(long)] + body_regex: Option, + /// Output format #[arg(long, value_enum, default_value_t = OutputFormat::Json, conflicts_with = "count")] output: OutputFormat, @@ -131,6 +136,12 @@ struct Cli { #[arg(long)] validate: bool, + /// Print a compact, LLM-tuned cheatsheet of every flag and exit. Unlike + /// `--help`, this omits clap's formatting and examples so an agent pays + /// a few hundred tokens instead of a few thousand for the reference. + #[arg(long)] + help_llm: bool, + /// Show parsing info on stderr #[arg(short, long)] verbose: bool, @@ -139,7 +150,48 @@ struct Cli { file: Option, } +const HELP_LLM: &str = "\ +hargrep — HAR query CLI. Reads FILE (or stdin). + +FILTERS (AND-combined): + --method GET|POST|... --status CODE + --status-range 4xx|200-299 --url SUBSTR + --url-regex REGEX --header 'NAME[:VALUE]' + --mime SUBSTR --min-time MS + --body-grep SUBSTR --body-regex REGEX + +OUTPUT (mutually exclusive): + (default) Filtered entries as JSON (pretty in TTY, compact when piped). + --output json|jsonl|summary + --fields F,F,... id,url,method,status,status-text,time,mime-type,started-date-time + --count Matching entry count. + --overview {entries,status,methods,mime_types,top_domains,total_body_size_bytes,total_time_ms} + --domains [{domain,count}] sorted by count desc. + --size-by-type [{mime_type,total_bytes,count}] sorted by total_bytes desc. + --redirects [{id,url,status,location}] for every 3xx. + --entry N One entry by id (original 0-indexed HAR position). + +BODY: + (default) Keep JSON/HTML/XML/text; strip CSS/JS/images/fonts/WASM. + --no-body Strip ALL body text. + --include-all-bodies Keep ALL bodies, including static assets. + +UTIL: --validate -v/--verbose --help --help-llm --version + +Every entry output includes `id` (stable across filters). Agent flow: + hargrep --overview FILE + hargrep --status-range 5xx --fields id,url,status FILE # list + hargrep --entry N FILE # drill in + +EXIT: 0=matches 1=no matches 2=error (bad args, invalid HAR, IO). +"; + fn run(cli: Cli) -> Result { + if cli.help_llm { + print!("{HELP_LLM}"); + return Ok(0); + } + let raw = input::read_input(cli.file.as_deref())?; let har: har::Har = serde_json::from_str(&raw).map_err(|e| { @@ -189,6 +241,7 @@ fn run(cli: Cli) -> Result { mime: cli.mime, min_time: cli.min_time, body_grep: cli.body_grep, + body_regex: cli.body_regex, }; let filtered = filter::filter_entries(har.log.entries, &filter_opts); diff --git a/tests/integration.rs b/tests/integration.rs index 378794d..e672130 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -438,6 +438,96 @@ fn test_body_grep_composes_with_other_filters() { assert_eq!(parsed[0]["request"]["method"], "POST"); } +#[test] +fn test_body_regex_matches_response_body() { + let (stdout, _, _) = hargrep(&[ + "--body-regex", + r#""name":\s*"Al\w+""#, + "tests/fixtures/valid.har", + ]); + let parsed: Vec = serde_json::from_str(&stdout).unwrap(); + assert_eq!(parsed.len(), 1); + assert_eq!(parsed[0]["id"], 1); +} + +#[test] +fn test_body_regex_matches_request_post_body() { + let (stdout, _, _) = hargrep(&["--body-regex", "Al.ce", "tests/fixtures/valid.har"]); + let parsed: Vec = serde_json::from_str(&stdout).unwrap(); + assert!(parsed.iter().any(|e| e["id"] == 1)); +} + +#[test] +fn test_body_regex_invalid_pattern_errors_at_parse() { + let (_, stderr, code) = hargrep(&["--body-regex", "[unclosed", "tests/fixtures/valid.har"]); + assert_eq!(code, 2); + assert!( + stderr.to_lowercase().contains("body-regex"), + "expected body-regex error, got: {stderr}" + ); +} + +#[test] +fn test_body_regex_composes_with_body_grep_as_and() { + // Both flags set: entry must match BOTH (AND, like all other filters). + let (_, _, code) = hargrep(&[ + "--body-grep", + "Alice", + "--body-regex", + "^no_match_$", + "tests/fixtures/valid.har", + ]); + assert_eq!(code, 1); +} + +// --- --help-llm --- + +#[test] +fn test_help_llm_emits_compact_cheatsheet() { + let (stdout, _, code) = hargrep(&["--help-llm"]); + assert_eq!(code, 0); + // Must fit in roughly one screen; serves LLM agents, not humans. + assert!( + stdout.len() < 2000, + "--help-llm output should be compact (<2KB); got {} bytes", + stdout.len() + ); + // Sanity: lists every top-level flag category we want an agent to know. + for needle in [ + "--method", + "--status", + "--status-range", + "--url", + "--mime", + "--body-grep", + "--body-regex", + "--count", + "--overview", + "--domains", + "--size-by-type", + "--redirects", + "--entry", + "--fields", + "--output", + "--no-body", + "--include-all-bodies", + ] { + assert!( + stdout.contains(needle), + "--help-llm missing {needle:?}; output:\n{stdout}" + ); + } + // Exit codes should be documented. + assert!(stdout.contains('0') && stdout.contains('1') && stdout.contains('2')); +} + +#[test] +fn test_help_llm_does_not_require_a_file() { + // --help-llm is a self-contained info flag, like --help. + let (_, _, code) = hargrep(&["--help-llm"]); + assert_eq!(code, 0); +} + // --- --overview dashboard --- #[test] From 7b90748affb9525530c4e4cf8614cf4b265cc2ee Mon Sep 17 00:00:00 2001 From: Bruno Melo Date: Thu, 16 Apr 2026 22:42:26 -0300 Subject: [PATCH 2/3] fix: --entry also conflicts with --body-grep and --body-regex Codex flagged on PR #3 that --body-grep wasn't in --entry's conflict set, so `hargrep --entry N --body-grep foo` silently ignored the filter. Same class of bug as the earlier --entry fix; body-grep was just added later and missed the sweep. Adding body_grep + the newly-introduced body_regex to the list. Extended the existing conflict test to cover both. --- src/main.rs | 2 +- tests/integration.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index fb008da..592dc31 100644 --- a/src/main.rs +++ b/src/main.rs @@ -117,7 +117,7 @@ struct Cli { conflicts_with_all = [ "count", "fields", "output", "method", "status", "status_range", "url", "url_regex", - "header", "mime", "min_time", + "header", "mime", "min_time", "body_grep", "body_regex", ] )] entry: Option, diff --git a/tests/integration.rs b/tests/integration.rs index e672130..70355f5 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -758,6 +758,8 @@ fn test_entry_flag_conflicts_with_filter_flags() { &["--mime", "json"], &["--min-time", "100"], &["--header", "Authorization"], + &["--body-grep", "Alice"], + &["--body-regex", "Al.ce"], ]; for filter_args in cases { let mut args = vec!["--entry", "0"]; From 9b136e36c1b0c953b3ddca346383ef01fe87bafc Mon Sep 17 00:00:00 2001 From: Bruno Melo Date: Thu, 16 Apr 2026 22:43:50 -0300 Subject: [PATCH 3/3] docs: note --entry conflicts with filters; add --body-regex + --help-llm examples --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7932557..42952f1 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Filters combine with AND logic. | `--domains` | Emit `[{domain, count}]` sorted by count desc. Respects filters. | | `--size-by-type` | Emit `[{mime_type, total_bytes, count}]` sorted by total_bytes desc. Respects filters. | | `--redirects` | Emit `[{id, url, status, location}]` for every 3xx entry. Respects filters. | -| `--entry ` | Fetch a single entry by id (its original 0-indexed position in the HAR). Returns a JSON object, not an array. | +| `--entry ` | Fetch a single entry by id (its original 0-indexed position in the HAR). Returns a JSON object, not an array. As a direct lookup, `--entry` conflicts with every filter flag; combine them and the command errors at parse time. | | `--no-body` | Exclude all request/response body text. | | `--include-all-bodies` | Include bodies for static-asset MIME types (CSS/JS/images/fonts/WASM). By default those are stripped to save tokens. | @@ -122,6 +122,10 @@ hargrep --redirects recording.har # all 3xx + Location h # Body search that actually knows about HAR schema hargrep --body-grep 'session expired' --fields id,url,status recording.har +hargrep --body-regex '(?i)timeout|deadline' --status-range 5xx recording.har + +# Compact flag reference for LLM agents (~1.5 KB vs ~3.5 KB for --help) +hargrep --help-llm # Validate before processing hargrep --validate untrusted.har