diff --git a/README.md b/README.md index 5f96cd5..42952f1 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ Reads from stdin if no file is given. | `--mime ` | `--mime application/json` (matches `application/json; charset=utf-8` too) | | `--min-time ` | `--min-time 500` | | `--body-grep ` | Match against request or response body text (case-sensitive). | +| `--body-regex ` | Regex match against request or response body text. Use `(?i)pattern` for case-insensitive. | Filters combine with AND logic. @@ -70,7 +71,7 @@ Filters combine with AND logic. | `--domains` | Emit `[{domain, count}]` sorted by count desc. Respects filters. | | `--size-by-type` | Emit `[{mime_type, total_bytes, count}]` sorted by total_bytes desc. Respects filters. | | `--redirects` | Emit `[{id, url, status, location}]` for every 3xx entry. Respects filters. | -| `--entry ` | Fetch a single entry by id (its original 0-indexed position in the HAR). Returns a JSON object, not an array. | +| `--entry ` | Fetch a single entry by id (its original 0-indexed position in the HAR). Returns a JSON object, not an array. As a direct lookup, `--entry` conflicts with every filter flag; combine them and the command errors at parse time. | | `--no-body` | Exclude all request/response body text. | | `--include-all-bodies` | Include bodies for static-asset MIME types (CSS/JS/images/fonts/WASM). By default those are stripped to save tokens. | @@ -84,6 +85,7 @@ Static-asset response bodies (images, fonts, CSS, JS, WASM, video, audio) are st |------|-------------| | `--validate` | Validate HAR only, no query | | `-v`, `--verbose` | Print parsing info to stderr | +| `--help-llm` | Print a compact, LLM-tuned cheatsheet of every flag (~1.5 KB vs ~3.5 KB for `--help`) and exit. | ### Exit codes @@ -120,6 +122,10 @@ hargrep --redirects recording.har # all 3xx + Location h # Body search that actually knows about HAR schema hargrep --body-grep 'session expired' --fields id,url,status recording.har +hargrep --body-regex '(?i)timeout|deadline' --status-range 5xx recording.har + +# Compact flag reference for LLM agents (~1.5 KB vs ~3.5 KB for --help) +hargrep --help-llm # Validate before processing hargrep --validate untrusted.har diff --git a/src/filter.rs b/src/filter.rs index cc770ab..695e7d2 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -110,6 +110,9 @@ pub struct FilterOptions { /// Matches if either contains the pattern. Agents fall through to /// `grep`/`rg` on raw HAR otherwise, which is noisy and unreliable. pub body_grep: Option, + /// Regex variant of `body_grep`. Compiled at CLI parse time (invalid + /// patterns error with exit code 2 before any file is read). + pub body_regex: Option, } /// Filter entries against the provided options, preserving each entry's @@ -174,9 +177,29 @@ fn matches_all(entry: &Entry, opts: &FilterOptions) -> bool { { return false; } + if let Some(ref re) = opts.body_regex + && !body_matches_regex(entry, re) + { + return false; + } true } +fn body_matches_regex(entry: &Entry, re: &Regex) -> bool { + if let Some(resp_text) = entry.response.content.text.as_deref() + && re.is_match(resp_text) + { + return true; + } + if let Some(post_data) = &entry.request.post_data + && let Some(req_text) = post_data.text.as_deref() + && re.is_match(req_text) + { + return true; + } + false +} + fn body_contains(entry: &Entry, pat: &str) -> bool { if let Some(resp_text) = entry.response.content.text.as_deref() && resp_text.contains(pat) diff --git a/src/main.rs b/src/main.rs index 829b432..592dc31 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,6 +56,11 @@ struct Cli { #[arg(long)] body_grep: Option, + /// Filter by regex match against request or response body text. + /// Use `(?i)pattern` for case-insensitive matching. + #[arg(long)] + body_regex: Option, + /// Output format #[arg(long, value_enum, default_value_t = OutputFormat::Json, conflicts_with = "count")] output: OutputFormat, @@ -112,7 +117,7 @@ struct Cli { conflicts_with_all = [ "count", "fields", "output", "method", "status", "status_range", "url", "url_regex", - "header", "mime", "min_time", + "header", "mime", "min_time", "body_grep", "body_regex", ] )] entry: Option, @@ -131,6 +136,12 @@ struct Cli { #[arg(long)] validate: bool, + /// Print a compact, LLM-tuned cheatsheet of every flag and exit. Unlike + /// `--help`, this omits clap's formatting and examples so an agent pays + /// a few hundred tokens instead of a few thousand for the reference. + #[arg(long)] + help_llm: bool, + /// Show parsing info on stderr #[arg(short, long)] verbose: bool, @@ -139,7 +150,48 @@ struct Cli { file: Option, } +const HELP_LLM: &str = "\ +hargrep — HAR query CLI. Reads FILE (or stdin). + +FILTERS (AND-combined): + --method GET|POST|... --status CODE + --status-range 4xx|200-299 --url SUBSTR + --url-regex REGEX --header 'NAME[:VALUE]' + --mime SUBSTR --min-time MS + --body-grep SUBSTR --body-regex REGEX + +OUTPUT (mutually exclusive): + (default) Filtered entries as JSON (pretty in TTY, compact when piped). + --output json|jsonl|summary + --fields F,F,... id,url,method,status,status-text,time,mime-type,started-date-time + --count Matching entry count. + --overview {entries,status,methods,mime_types,top_domains,total_body_size_bytes,total_time_ms} + --domains [{domain,count}] sorted by count desc. + --size-by-type [{mime_type,total_bytes,count}] sorted by total_bytes desc. + --redirects [{id,url,status,location}] for every 3xx. + --entry N One entry by id (original 0-indexed HAR position). + +BODY: + (default) Keep JSON/HTML/XML/text; strip CSS/JS/images/fonts/WASM. + --no-body Strip ALL body text. + --include-all-bodies Keep ALL bodies, including static assets. + +UTIL: --validate -v/--verbose --help --help-llm --version + +Every entry output includes `id` (stable across filters). Agent flow: + hargrep --overview FILE + hargrep --status-range 5xx --fields id,url,status FILE # list + hargrep --entry N FILE # drill in + +EXIT: 0=matches 1=no matches 2=error (bad args, invalid HAR, IO). +"; + fn run(cli: Cli) -> Result { + if cli.help_llm { + print!("{HELP_LLM}"); + return Ok(0); + } + let raw = input::read_input(cli.file.as_deref())?; let har: har::Har = serde_json::from_str(&raw).map_err(|e| { @@ -189,6 +241,7 @@ fn run(cli: Cli) -> Result { mime: cli.mime, min_time: cli.min_time, body_grep: cli.body_grep, + body_regex: cli.body_regex, }; let filtered = filter::filter_entries(har.log.entries, &filter_opts); diff --git a/tests/integration.rs b/tests/integration.rs index 378794d..70355f5 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -438,6 +438,96 @@ fn test_body_grep_composes_with_other_filters() { assert_eq!(parsed[0]["request"]["method"], "POST"); } +#[test] +fn test_body_regex_matches_response_body() { + let (stdout, _, _) = hargrep(&[ + "--body-regex", + r#""name":\s*"Al\w+""#, + "tests/fixtures/valid.har", + ]); + let parsed: Vec = serde_json::from_str(&stdout).unwrap(); + assert_eq!(parsed.len(), 1); + assert_eq!(parsed[0]["id"], 1); +} + +#[test] +fn test_body_regex_matches_request_post_body() { + let (stdout, _, _) = hargrep(&["--body-regex", "Al.ce", "tests/fixtures/valid.har"]); + let parsed: Vec = serde_json::from_str(&stdout).unwrap(); + assert!(parsed.iter().any(|e| e["id"] == 1)); +} + +#[test] +fn test_body_regex_invalid_pattern_errors_at_parse() { + let (_, stderr, code) = hargrep(&["--body-regex", "[unclosed", "tests/fixtures/valid.har"]); + assert_eq!(code, 2); + assert!( + stderr.to_lowercase().contains("body-regex"), + "expected body-regex error, got: {stderr}" + ); +} + +#[test] +fn test_body_regex_composes_with_body_grep_as_and() { + // Both flags set: entry must match BOTH (AND, like all other filters). + let (_, _, code) = hargrep(&[ + "--body-grep", + "Alice", + "--body-regex", + "^no_match_$", + "tests/fixtures/valid.har", + ]); + assert_eq!(code, 1); +} + +// --- --help-llm --- + +#[test] +fn test_help_llm_emits_compact_cheatsheet() { + let (stdout, _, code) = hargrep(&["--help-llm"]); + assert_eq!(code, 0); + // Must fit in roughly one screen; serves LLM agents, not humans. + assert!( + stdout.len() < 2000, + "--help-llm output should be compact (<2KB); got {} bytes", + stdout.len() + ); + // Sanity: lists every top-level flag category we want an agent to know. + for needle in [ + "--method", + "--status", + "--status-range", + "--url", + "--mime", + "--body-grep", + "--body-regex", + "--count", + "--overview", + "--domains", + "--size-by-type", + "--redirects", + "--entry", + "--fields", + "--output", + "--no-body", + "--include-all-bodies", + ] { + assert!( + stdout.contains(needle), + "--help-llm missing {needle:?}; output:\n{stdout}" + ); + } + // Exit codes should be documented. + assert!(stdout.contains('0') && stdout.contains('1') && stdout.contains('2')); +} + +#[test] +fn test_help_llm_does_not_require_a_file() { + // --help-llm is a self-contained info flag, like --help. + let (_, _, code) = hargrep(&["--help-llm"]); + assert_eq!(code, 0); +} + // --- --overview dashboard --- #[test] @@ -668,6 +758,8 @@ fn test_entry_flag_conflicts_with_filter_flags() { &["--mime", "json"], &["--min-time", "100"], &["--header", "Authorization"], + &["--body-grep", "Alice"], + &["--body-regex", "Al.ce"], ]; for filter_args in cases { let mut args = vec!["--entry", "0"];