diff --git a/README.md b/README.md index 42952f1..fb9e06c 100644 --- a/README.md +++ b/README.md @@ -65,11 +65,12 @@ Filters combine with AND logic. | Flag | Description | |------|-------------| | `--output ` | `json` (default, pretty in a TTY, compact when piped), `jsonl`, or `summary`. | -| `--fields ` | Comma-separated. Valid names: `id`, `url`, `method`, `status`, `status-text`, `time`, `mime-type`, `started-date-time`. CLI names are kebab-case; emitted JSON keys preserve HAR camelCase (`statusText`, `mimeType`). Unknown names error at parse time. | +| `--fields ` | Comma-separated. Valid names: `id`, `url`, `method`, `status`, `status-text`, `time`, `mime-type`, `started-date-time`, `content-size`. CLI names are kebab-case; emitted JSON keys preserve HAR camelCase (`statusText`, `mimeType`, `contentSize`). Unknown names error at parse time. | | `--count` | Print only the count of matching entries. Conflicts with `--fields`, `--no-body`, `--output`. | | `--overview` | Print a single JSON dashboard of the filtered HAR: entry count, status/method/MIME histograms, top 10 domains, total body size, total time. Replaces a cascade of exploratory queries with one call. | | `--domains` | Emit `[{domain, count}]` sorted by count desc. Respects filters. | | `--size-by-type` | Emit `[{mime_type, total_bytes, count}]` sorted by total_bytes desc. Respects filters. | +| `--largest-bodies[=N]` | Emit the top-N entries by response body size: `[{id, url, mime_type, content_size}]` sorted desc. Defaults to N=10. Pass as `--largest-bodies=5`. Entries whose HAR logger recorded `content.size = -1` (unknown) sort to the bottom. | | `--redirects` | Emit `[{id, url, status, location}]` for every 3xx entry. Respects filters. | | `--entry ` | Fetch a single entry by id (its original 0-indexed position in the HAR). Returns a JSON object, not an array. As a direct lookup, `--entry` conflicts with every filter flag; combine them and the command errors at parse time. 
| | `--no-body` | Exclude all request/response body text. | @@ -117,7 +118,9 @@ hargrep --entry 42 recording.har # Aggregate views — one call each hargrep --domains recording.har # which hosts? -hargrep --size-by-type recording.har # where's the bandwidth going? +hargrep --size-by-type recording.har # where's the bandwidth going? (by MIME) +hargrep --largest-bodies recording.har # which URLs had the largest bodies? +hargrep --largest-bodies=3 recording.har # top 3 only hargrep --redirects recording.har # all 3xx + Location headers # Body search that actually knows about HAR schema diff --git a/src/aggregates.rs b/src/aggregates.rs index 676e761..d195c6e 100644 --- a/src/aggregates.rs +++ b/src/aggregates.rs @@ -60,6 +60,33 @@ pub fn size_by_type(entries: &[(usize, Entry)]) -> Value { ) } +/// `--largest-bodies[=N]`: top-N entries by response body size, descending. +/// Each row: {id, url, mime_type, content_size}. Answers "which URL returned +/// the largest body?" without forcing the agent to extract `content.size` +/// from every entry and sort client-side. +/// +/// Entries whose HAR logger recorded `content.size = -1` (unknown) sort to +/// the bottom of the desc order — -1 is below every real size. Sort is +/// stable, so among equal-size rows the original HAR order is preserved. +/// `limit = 0` is accepted and yields an empty array. +pub fn largest_bodies(entries: &[(usize, Entry)], limit: usize) -> Value { + let mut rows: Vec<&(usize, Entry)> = entries.iter().collect(); + rows.sort_by_key(|row| std::cmp::Reverse(row.1.response.content.size)); + rows.truncate(limit); + Value::Array( + rows.into_iter() + .map(|(id, entry)| { + json!({ + "id": id, + "url": entry.request.url, + "mime_type": entry.response.content.mime_type.clone().unwrap_or_default(), + "content_size": entry.response.content.size, + }) + }) + .collect(), + ) +} + /// `--redirects`: flat list of 3xx entries with their Location header. /// Each row: {id, url, status, location}. 
Chain reconstruction is left to the /// caller — the raw pairs are enough information and the format stays simple. @@ -196,6 +223,74 @@ mod tests { assert_eq!(rows.as_array().unwrap()[0]["total_bytes"], 0); } + #[test] + fn largest_bodies_sorts_desc_by_content_size() { + let rows = largest_bodies( + &indexed(vec![ + make_entry("GET", "https://x/a", 200, "application/json", 50), + make_entry("GET", "https://x/b", 200, "image/png", 5000), + make_entry("GET", "https://x/c", 200, "text/html", 800), + ]), + 10, + ); + let arr = rows.as_array().unwrap(); + assert_eq!(arr.len(), 3); + // The 5000-byte PNG wins; id references the original HAR index (1 here). + assert_eq!(arr[0]["id"], 1); + assert_eq!(arr[0]["url"], "https://x/b"); + assert_eq!(arr[0]["content_size"], 5000); + assert_eq!(arr[0]["mime_type"], "image/png"); + assert_eq!(arr[1]["content_size"], 800); + assert_eq!(arr[2]["content_size"], 50); + } + + #[test] + fn largest_bodies_truncates_to_limit() { + let entries = (0..20) + .map(|i| make_entry("GET", "u", 200, "application/json", (i * 10) as i64)) + .collect::<Vec<_>>(); + let rows = largest_bodies(&indexed(entries), 3); + let arr = rows.as_array().unwrap(); + assert_eq!(arr.len(), 3); + // Top three by size, descending: 190, 180, 170. + assert_eq!(arr[0]["content_size"], 190); + assert_eq!(arr[1]["content_size"], 180); + assert_eq!(arr[2]["content_size"], 170); + } + + #[test] + fn largest_bodies_limit_zero_yields_empty_array() { + let rows = largest_bodies( + &indexed(vec![make_entry("GET", "u", 200, "application/json", 100)]), + 0, + ); + assert!(rows.as_array().unwrap().is_empty()); + } + + #[test] + fn largest_bodies_sinks_unknown_size_entries() { + // HAR records content.size = -1 when the logger didn't measure it. + // Desc sort treats -1 as smaller than real sizes, so these sort last. 
+ let rows = largest_bodies( + &indexed(vec![ + make_entry("GET", "https://x/a", 200, "application/json", -1), + make_entry("GET", "https://x/b", 200, "image/png", 2000), + make_entry("GET", "https://x/c", 200, "application/json", -1), + make_entry("GET", "https://x/d", 200, "text/html", 100), + ]), + 10, + ); + let arr = rows.as_array().unwrap(); + assert_eq!(arr[0]["content_size"], 2000); + assert_eq!(arr[1]["content_size"], 100); + // Both -1 rows come last; stable sort preserves their original order + // so a comes before c. + assert_eq!(arr[2]["content_size"], -1); + assert_eq!(arr[2]["id"], 0); + assert_eq!(arr[3]["content_size"], -1); + assert_eq!(arr[3]["id"], 2); + } + #[test] fn redirects_only_includes_3xx() { let entries = vec![ diff --git a/src/main.rs b/src/main.rs index 592dc31..f2795ad 100644 --- a/src/main.rs +++ b/src/main.rs @@ -79,7 +79,7 @@ struct Cli { /// Replaces a cascade of exploratory queries with one call. #[arg( long, - conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "domains", "size_by_type", "redirects"] + conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "domains", "size_by_type", "redirects", "largest_bodies"] )] overview: bool, @@ -87,7 +87,7 @@ struct Cli { /// Respects filters. #[arg( long, - conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "size_by_type", "redirects"] + conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "size_by_type", "redirects", "largest_bodies"] )] domains: bool, @@ -95,7 +95,7 @@ struct Cli { /// sorted by total_bytes desc. Respects filters. 
#[arg( long, - conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "redirects"] + conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "redirects", "largest_bodies"] )] size_by_type: bool, @@ -103,10 +103,23 @@ struct Cli { /// Respects filters. #[arg( long, - conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "size_by_type"] + conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "size_by_type", "largest_bodies"] )] redirects: bool, + /// Top-N entries by response body size, desc: [{id, url, mime_type, content_size}]. + /// Default N = 10. Pass a number with `--largest-bodies=N` to override. + /// Respects filters. + #[arg( + long, + value_name = "N", + num_args = 0..=1, + require_equals = true, + default_missing_value = "10", + conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "size_by_type", "redirects"] + )] + largest_bodies: Option<usize>, + /// Fetch a single entry by id (the original 0-indexed position in the HAR). /// Returns a JSON object, not an array. Useful after listing entries with /// `--fields id,url,status` and then zeroing in on one. `--entry` is a @@ -163,11 +176,12 @@ FILTERS (AND-combined): OUTPUT (mutually exclusive): (default) Filtered entries as JSON (pretty in TTY, compact when piped). --output json|jsonl|summary - --fields F,F,... id,url,method,status,status-text,time,mime-type,started-date-time + --fields F,F,... id,url,method,status,status-text,time,mime-type,started-date-time,content-size --count Matching entry count. --overview {entries,status,methods,mime_types,top_domains,total_body_size_bytes,total_time_ms} --domains [{domain,count}] sorted by count desc. 
--size-by-type [{mime_type,total_bytes,count}] sorted by total_bytes desc. + --largest-bodies[=N] [{id,url,mime_type,content_size}] top-N by content_size desc (default N=10). --redirects [{id,url,status,location}] for every 3xx. --entry N One entry by id (original 0-indexed HAR position). @@ -274,6 +288,12 @@ fn run(cli: Cli) -> Result { return Ok(aggregate_exit_code(&doc)); } + if let Some(limit) = cli.largest_bodies { + let doc = aggregates::largest_bodies(&filtered, limit); + emit_json_doc(&doc)?; + return Ok(aggregate_exit_code(&doc)); + } + let mode = if cli.count { OutputMode::Count } else { @@ -291,9 +311,9 @@ fn run(cli: Cli) -> Result { } /// Exit 1 when the aggregate document has nothing to report, 0 otherwise. -/// Array documents (`--domains`, `--size-by-type`, `--redirects`) are empty -/// when the array has no rows. The overview object is empty when its -/// `entries` count is zero. +/// Array documents (`--domains`, `--size-by-type`, `--redirects`, +/// `--largest-bodies`) are empty when the array has no rows. The overview +/// object is empty when its `entries` count is zero. fn aggregate_exit_code(doc: &serde_json::Value) -> i32 { let is_empty = match doc { serde_json::Value::Array(rows) => rows.is_empty(), diff --git a/src/output.rs b/src/output.rs index e0aee50..fa927da 100644 --- a/src/output.rs +++ b/src/output.rs @@ -29,6 +29,7 @@ pub enum Field { Time, MimeType, StartedDateTime, + ContentSize, } impl Field { @@ -42,6 +43,7 @@ impl Field { Field::Time => "time", Field::MimeType => "mimeType", Field::StartedDateTime => "startedDateTime", + Field::ContentSize => "contentSize", } } @@ -57,6 +59,9 @@ impl Field { Value::String(entry.response.content.mime_type.clone().unwrap_or_default()) } Field::StartedDateTime => Value::String(entry.started_date_time.clone()), + // HAR's content.size can be -1 when unknown; surface the raw value + // so callers can filter it themselves rather than guessing. 
+ Field::ContentSize => Value::Number(entry.response.content.size.into()), }) } } diff --git a/tests/integration.rs b/tests/integration.rs index 70355f5..c564654 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -480,6 +480,117 @@ fn test_body_regex_composes_with_body_grep_as_and() { assert_eq!(code, 1); } +// --- content-size field + --largest-bodies --- + +#[test] +fn test_fields_includes_content_size() { + let (stdout, _, _) = hargrep(&["--fields", "url,content-size", "tests/fixtures/valid.har"]); + let parsed: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap(); + assert_eq!(parsed.len(), 4); + // Values must match the fixture, not just "some non-negative integer" — + // otherwise a bug reading the wrong numeric field wouldn't surface here. + let sizes: Vec<i64> = parsed + .iter() + .map(|e| e["contentSize"].as_i64().unwrap()) + .collect(); + // Fixture exact values: [123, 45, 20, 50000]. Pinning every value + // catches a projection that reads a different numeric field. + assert_eq!(sizes, vec![123, 45, 20, 50000]); + for entry in &parsed { + assert!(entry.get("url").is_some()); + } +} + +#[test] +fn test_largest_bodies_default_returns_top_10_sorted_desc() { + let (stdout, _, code) = hargrep(&["--largest-bodies", "tests/fixtures/valid.har"]); + assert_eq!(code, 0); + let parsed: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap(); + // valid.har has 4 entries → capped at 4. + assert_eq!(parsed.len(), 4); + // Pin the expected winner: entry 3 is the PNG image with the largest body. + // Asserting identity (not just sortedness) catches regressions where the + // sort key is swapped to a different numeric field. 
+ assert_eq!(parsed[0]["id"], 3); + assert!( + parsed[0]["url"].as_str().unwrap().contains("image.png"), + "expected image URL at rank 0, got {}", + parsed[0]["url"] + ); + let top_size = parsed[0]["content_size"].as_i64().unwrap(); + assert!(top_size > 1000, "PNG body should be >1KB, got {top_size}"); + // And every later row is smaller-or-equal (non-increasing order). + let sizes: Vec<i64> = parsed + .iter() + .map(|e| e["content_size"].as_i64().unwrap()) + .collect(); + let mut sorted = sizes.clone(); + sorted.sort_by(|a, b| b.cmp(a)); + assert_eq!(sizes, sorted); + // Schema shape on every row. + for entry in &parsed { + assert!(entry.get("id").is_some()); + assert!(entry.get("url").is_some()); + assert!(entry.get("content_size").is_some()); + assert!(entry.get("mime_type").is_some()); + } +} + +#[test] +fn test_largest_bodies_honors_limit() { + // `--largest-bodies=N` (equals) — bare `--largest-bodies N` is ambiguous + // with the FILE positional, so clap's require_equals keeps things clear. + let (stdout, _, _) = hargrep(&["--largest-bodies=2", "tests/fixtures/valid.har"]); + let parsed: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap(); + assert_eq!(parsed.len(), 2); + // Top-2 must be the PNG (id 3) followed by whichever JSON body is next largest. 
+ assert_eq!(parsed[0]["id"], 3); +} + +#[test] +fn test_largest_bodies_limit_one() { + let (stdout, _, _) = hargrep(&["--largest-bodies=1", "tests/fixtures/valid.har"]); + let parsed: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap(); + assert_eq!(parsed.len(), 1); + assert_eq!(parsed[0]["id"], 3); +} + +#[test] +fn test_largest_bodies_respects_filter() { + let (stdout, _, _) = hargrep(&[ + "--largest-bodies", + "--method", + "POST", + "tests/fixtures/valid.har", + ]); + let parsed: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap(); + assert_eq!(parsed.len(), 1); +} + +#[test] +fn test_largest_bodies_exits_1_when_empty() { + let (_, _, code) = hargrep(&[ + "--largest-bodies", + "--status", + "999", + "tests/fixtures/valid.har", + ]); + assert_eq!(code, 1); +} + +#[test] +fn test_largest_bodies_conflicts_with_other_views() { + // Must conflict with every other aggregate view flag — if clap's conflict + // list drops one of them, this test catches it. + for other in ["--overview", "--domains", "--size-by-type", "--redirects"] { + let (_, stderr, code) = hargrep(&["--largest-bodies", other, "tests/fixtures/valid.har"]); + assert_eq!( + code, 2, + "--largest-bodies with {other} should exit 2; stderr: {stderr}" + ); + } +} + // --- --help-llm --- #[test]