Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,12 @@ Filters combine with AND logic.
| Flag | Description |
|------|-------------|
| `--output <FORMAT>` | `json` (default, pretty in a TTY, compact when piped), `jsonl`, or `summary`. |
| `--fields <FIELDS>` | Comma-separated. Valid names: `id`, `url`, `method`, `status`, `status-text`, `time`, `mime-type`, `started-date-time`. CLI names are kebab-case; emitted JSON keys preserve HAR camelCase (`statusText`, `mimeType`). Unknown names error at parse time. |
| `--fields <FIELDS>` | Comma-separated. Valid names: `id`, `url`, `method`, `status`, `status-text`, `time`, `mime-type`, `started-date-time`, `content-size`. CLI names are kebab-case; emitted JSON keys preserve HAR camelCase (`statusText`, `mimeType`, `contentSize`). Unknown names error at parse time. |
| `--count` | Print only the count of matching entries. Conflicts with `--fields`, `--no-body`, `--output`. |
| `--overview` | Print a single JSON dashboard of the filtered HAR: entry count, status/method/MIME histograms, top 10 domains, total body size, total time. Replaces a cascade of exploratory queries with one call. |
| `--domains` | Emit `[{domain, count}]` sorted by count desc. Respects filters. |
| `--size-by-type` | Emit `[{mime_type, total_bytes, count}]` sorted by total_bytes desc. Respects filters. |
| `--largest-bodies[=N]` | Emit the top-N entries by response body size: `[{id, url, mime_type, content_size}]` sorted desc. Defaults to N=10. Pass as `--largest-bodies=5`. Entries whose HAR logger recorded `content.size = -1` (unknown) sort to the bottom. Respects filters. |
| `--redirects` | Emit `[{id, url, status, location}]` for every 3xx entry. Respects filters. |
| `--entry <N>` | Fetch a single entry by id (its original 0-indexed position in the HAR). Returns a JSON object, not an array. As a direct lookup, `--entry` conflicts with every filter flag; combine them and the command errors at parse time. |
| `--no-body` | Exclude all request/response body text. |
Expand Down Expand Up @@ -117,7 +118,9 @@ hargrep --entry 42 recording.har

# Aggregate views — one call each
hargrep --domains recording.har # which hosts?
hargrep --size-by-type recording.har # where's the bandwidth going?
hargrep --size-by-type recording.har # where's the bandwidth going? (by MIME)
hargrep --largest-bodies recording.har # which URLs had the largest bodies?
hargrep --largest-bodies=3 recording.har # top 3 only
hargrep --redirects recording.har # all 3xx + Location headers

# Body search that actually knows about HAR schema
Expand Down
95 changes: 95 additions & 0 deletions src/aggregates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,33 @@ pub fn size_by_type(entries: &[(usize, Entry)]) -> Value {
)
}

/// `--largest-bodies[=N]`: the `limit` biggest response bodies, largest first.
///
/// Produces a JSON array of `{id, url, mime_type, content_size}` rows so the
/// caller can see which URLs returned the heaviest bodies without pulling
/// `content.size` out of every entry and ranking them itself.
///
/// Ordering notes:
/// - A size of `-1` (the HAR logger did not record one) is compared as a plain
///   signed value, so such rows land after every measured size.
/// - The sort is stable: rows with equal sizes keep their input order.
/// - `limit = 0` is valid and produces an empty array.
/// - A missing MIME type is reported as an empty string.
pub fn largest_bodies(entries: &[(usize, Entry)], limit: usize) -> Value {
    let mut ranked: Vec<&(usize, Entry)> = entries.iter().collect();
    // Comparing `b` against `a` flips the order to descending, while the
    // stable sort keeps ties in input order.
    ranked.sort_by(|a, b| {
        let (size_a, size_b) = (a.1.response.content.size, b.1.response.content.size);
        size_b.cmp(&size_a)
    });

    let top: Vec<Value> = ranked
        .into_iter()
        .take(limit)
        .map(|row| {
            let (id, entry) = row;
            json!({
                "id": id,
                "url": entry.request.url,
                "mime_type": entry.response.content.mime_type.as_deref().unwrap_or_default(),
                "content_size": entry.response.content.size,
            })
        })
        .collect();
    Value::Array(top)
}

/// `--redirects`: flat list of 3xx entries with their Location header.
/// Each row: {id, url, status, location}. Chain reconstruction is left to the
/// caller — the raw pairs are enough information and the format stays simple.
Expand Down Expand Up @@ -196,6 +223,74 @@ mod tests {
assert_eq!(rows.as_array().unwrap()[0]["total_bytes"], 0);
}

#[test]
fn largest_bodies_sorts_desc_by_content_size() {
    // Three bodies with distinct sizes, deliberately not in size order.
    let entries = indexed(vec![
        make_entry("GET", "https://x/a", 200, "application/json", 50),
        make_entry("GET", "https://x/b", 200, "image/png", 5000),
        make_entry("GET", "https://x/c", 200, "text/html", 800),
    ]);
    let doc = largest_bodies(&entries, 10);
    let ranked = doc.as_array().unwrap();
    assert_eq!(ranked.len(), 3);

    // Rank 0 is the 5000-byte PNG; its id is its position in the input (1),
    // not its rank in the output.
    let winner = &ranked[0];
    assert_eq!(winner["id"], 1);
    assert_eq!(winner["url"], "https://x/b");
    assert_eq!(winner["content_size"], 5000);
    assert_eq!(winner["mime_type"], "image/png");

    // The remaining rows follow in descending size order.
    assert_eq!(ranked[1]["content_size"], 800);
    assert_eq!(ranked[2]["content_size"], 50);
}

#[test]
fn largest_bodies_truncates_to_limit() {
    // Twenty entries sized 0, 10, 20, ..., 190.
    let mut entries = Vec::new();
    for i in 0..20 {
        entries.push(make_entry("GET", "u", 200, "application/json", (i * 10) as i64));
    }

    let doc = largest_bodies(&indexed(entries), 3);
    let top = doc.as_array().unwrap();
    assert_eq!(top.len(), 3);

    // Only the three largest survive, largest first.
    let sizes: Vec<i64> = top
        .iter()
        .map(|row| row["content_size"].as_i64().unwrap())
        .collect();
    assert_eq!(sizes, vec![190, 180, 170]);
}

#[test]
fn largest_bodies_limit_zero_yields_empty_array() {
    // A limit of zero is accepted and simply drops every row.
    let single = indexed(vec![make_entry("GET", "u", 200, "application/json", 100)]);
    let doc = largest_bodies(&single, 0);
    assert_eq!(doc.as_array().unwrap().len(), 0);
}

#[test]
fn largest_bodies_sinks_unknown_size_entries() {
    // A content.size of -1 means the HAR logger did not measure the body.
    // In a descending sort -1 is below every real size, so those rows end up
    // at the tail of the result.
    let entries = indexed(vec![
        make_entry("GET", "https://x/a", 200, "application/json", -1),
        make_entry("GET", "https://x/b", 200, "image/png", 2000),
        make_entry("GET", "https://x/c", 200, "application/json", -1),
        make_entry("GET", "https://x/d", 200, "text/html", 100),
    ]);
    let doc = largest_bodies(&entries, 10);
    let ranked = doc.as_array().unwrap();

    // Measured sizes first, in descending order.
    assert_eq!(ranked[0]["content_size"], 2000);
    assert_eq!(ranked[1]["content_size"], 100);

    // Then the two unknown-size rows. The sort is stable, so they keep their
    // input order: entry a (id 0) ahead of entry c (id 2).
    let (first_unknown, second_unknown) = (&ranked[2], &ranked[3]);
    assert_eq!(first_unknown["content_size"], -1);
    assert_eq!(first_unknown["id"], 0);
    assert_eq!(second_unknown["content_size"], -1);
    assert_eq!(second_unknown["id"], 2);
}

#[test]
fn redirects_only_includes_3xx() {
let entries = vec![
Expand Down
36 changes: 28 additions & 8 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,34 +79,47 @@ struct Cli {
/// Replaces a cascade of exploratory queries with one call.
#[arg(
long,
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "domains", "size_by_type", "redirects"]
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "domains", "size_by_type", "redirects", "largest_bodies"]
)]
overview: bool,

/// List unique request domains with per-domain request counts, sorted desc.
/// Respects filters.
#[arg(
long,
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "size_by_type", "redirects"]
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "size_by_type", "redirects", "largest_bodies"]
)]
domains: bool,

/// Breakdown of response body size by MIME type: [{mime_type, total_bytes, count}]
/// sorted by total_bytes desc. Respects filters.
#[arg(
long,
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "redirects"]
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "redirects", "largest_bodies"]
)]
size_by_type: bool,

/// List 3xx entries with their Location header: [{id, url, status, location}].
/// Respects filters.
#[arg(
long,
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "size_by_type"]
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "size_by_type", "largest_bodies"]
)]
redirects: bool,

/// Top-N entries by response body size, desc: [{id, url, mime_type, content_size}].
/// Default N = 10. Pass a number with `--largest-bodies=N` to override.
/// Respects filters.
#[arg(
long,
value_name = "N",
num_args = 0..=1,
require_equals = true,
default_missing_value = "10",
conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output", "overview", "domains", "size_by_type", "redirects"]
)]
largest_bodies: Option<usize>,

/// Fetch a single entry by id (the original 0-indexed position in the HAR).
/// Returns a JSON object, not an array. Useful after listing entries with
/// `--fields id,url,status` and then zeroing in on one. `--entry` is a
Expand Down Expand Up @@ -163,11 +176,12 @@ FILTERS (AND-combined):
OUTPUT (mutually exclusive):
(default) Filtered entries as JSON (pretty in TTY, compact when piped).
--output json|jsonl|summary
--fields F,F,... id,url,method,status,status-text,time,mime-type,started-date-time
--fields F,F,... id,url,method,status,status-text,time,mime-type,started-date-time,content-size
--count Matching entry count.
--overview {entries,status,methods,mime_types,top_domains,total_body_size_bytes,total_time_ms}
--domains [{domain,count}] sorted by count desc.
--size-by-type [{mime_type,total_bytes,count}] sorted by total_bytes desc.
--largest-bodies[=N] [{id,url,mime_type,content_size}] top-N by content_size desc (default N=10).
--redirects [{id,url,status,location}] for every 3xx.
--entry N One entry by id (original 0-indexed HAR position).

Expand Down Expand Up @@ -274,6 +288,12 @@ fn run(cli: Cli) -> Result<i32> {
return Ok(aggregate_exit_code(&doc));
}

if let Some(limit) = cli.largest_bodies {
let doc = aggregates::largest_bodies(&filtered, limit);
emit_json_doc(&doc)?;
return Ok(aggregate_exit_code(&doc));
}

let mode = if cli.count {
OutputMode::Count
} else {
Expand All @@ -291,9 +311,9 @@ fn run(cli: Cli) -> Result<i32> {
}

/// Exit 1 when the aggregate document has nothing to report, 0 otherwise.
/// Array documents (`--domains`, `--size-by-type`, `--redirects`) are empty
/// when the array has no rows. The overview object is empty when its
/// `entries` count is zero.
/// Array documents (`--domains`, `--size-by-type`, `--redirects`,
/// `--largest-bodies`) are empty when the array has no rows. The overview
/// object is empty when its `entries` count is zero.
fn aggregate_exit_code(doc: &serde_json::Value) -> i32 {
let is_empty = match doc {
serde_json::Value::Array(rows) => rows.is_empty(),
Expand Down
5 changes: 5 additions & 0 deletions src/output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ pub enum Field {
Time,
MimeType,
StartedDateTime,
ContentSize,
}

impl Field {
Expand All @@ -42,6 +43,7 @@ impl Field {
Field::Time => "time",
Field::MimeType => "mimeType",
Field::StartedDateTime => "startedDateTime",
Field::ContentSize => "contentSize",
}
}

Expand All @@ -57,6 +59,9 @@ impl Field {
Value::String(entry.response.content.mime_type.clone().unwrap_or_default())
}
Field::StartedDateTime => Value::String(entry.started_date_time.clone()),
// HAR's content.size can be -1 when unknown; surface the raw value
// so callers can filter it themselves rather than guessing.
Field::ContentSize => Value::Number(entry.response.content.size.into()),
})
}
}
Expand Down
111 changes: 111 additions & 0 deletions tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,117 @@ fn test_body_regex_composes_with_body_grep_as_and() {
assert_eq!(code, 1);
}

// --- content-size field + --largest-bodies ---

#[test]
fn test_fields_includes_content_size() {
    let (stdout, _, _) = hargrep(&["--fields", "url,content-size", "tests/fixtures/valid.har"]);
    let rows: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap();
    assert_eq!(rows.len(), 4);

    // The CLI name is kebab-case (`content-size`) but the emitted key is
    // camelCase (`contentSize`). Pin the exact fixture values rather than
    // "some integer", so projecting the wrong numeric field is caught.
    let mut sizes = Vec::with_capacity(rows.len());
    for row in &rows {
        sizes.push(row["contentSize"].as_i64().unwrap());
    }
    assert_eq!(sizes, vec![123, 45, 20, 50000]);

    // The other requested field must be present on every row as well.
    for row in &rows {
        assert!(row.get("url").is_some());
    }
}

#[test]
fn test_largest_bodies_default_returns_top_10_sorted_desc() {
    let (stdout, _, code) = hargrep(&["--largest-bodies", "tests/fixtures/valid.har"]);
    assert_eq!(code, 0);
    let rows: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap();
    // The default limit is 10, but the fixture only holds 4 entries.
    assert_eq!(rows.len(), 4);

    // Identity check, not just an ordering check: entry 3 (the PNG) has the
    // largest body in the fixture. This catches a sort on the wrong field.
    let winner = &rows[0];
    assert_eq!(winner["id"], 3);
    assert!(
        winner["url"].as_str().unwrap().contains("image.png"),
        "expected image URL at rank 0, got {}",
        parsed_url(winner)
    );
    let top_size = winner["content_size"].as_i64().unwrap();
    assert!(top_size > 1000, "PNG body should be >1KB, got {top_size}");

    // The full size column must already be in descending order.
    let sizes: Vec<i64> = rows
        .iter()
        .map(|row| row["content_size"].as_i64().unwrap())
        .collect();
    let mut descending = sizes.clone();
    descending.sort_by(|a, b| b.cmp(a));
    assert_eq!(sizes, descending);

    // Every row carries the full four-key schema.
    for row in &rows {
        for key in ["id", "url", "content_size", "mime_type"] {
            assert!(row.get(key).is_some());
        }
    }

    // Local helper: the raw JSON `url` value, used only in the failure message.
    fn parsed_url(row: &serde_json::Value) -> &serde_json::Value {
        &row["url"]
    }
}

#[test]
fn test_largest_bodies_honors_limit() {
    // The limit must be attached with `=` (`--largest-bodies=N`). A bare
    // `--largest-bodies N` would be ambiguous with the FILE positional, which
    // is why the flag is declared with clap's `require_equals`.
    let (stdout, _, code) = hargrep(&["--largest-bodies=2", "tests/fixtures/valid.har"]);
    // A non-empty aggregate result exits 0.
    assert_eq!(code, 0);
    let parsed: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap();
    assert_eq!(parsed.len(), 2);

    // Fixture sizes by id are [123, 45, 20, 50000], so the top two are the
    // PNG (id 3) followed by the 123-byte body (id 0). Pinning both ranks
    // ensures the limit keeps the *largest* rows, not just the right count.
    assert_eq!(parsed[0]["id"], 3);
    assert_eq!(parsed[1]["id"], 0);
    assert_eq!(parsed[1]["content_size"], 123);
}

#[test]
fn test_largest_bodies_limit_one() {
    // With N=1 only the single largest body (the PNG, id 3) is reported.
    let args = ["--largest-bodies=1", "tests/fixtures/valid.har"];
    let (stdout, _, _) = hargrep(&args);
    let rows: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap();
    assert_eq!(rows.len(), 1);
    let only = &rows[0];
    assert_eq!(only["id"], 3);
}

#[test]
fn test_largest_bodies_respects_filter() {
    // Filters run before ranking: with `--method POST` exactly one fixture
    // entry is left to rank.
    let args = ["--largest-bodies", "--method", "POST", "tests/fixtures/valid.har"];
    let (stdout, _, _) = hargrep(&args);
    let rows: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap();
    assert_eq!(rows.len(), 1);
}

#[test]
fn test_largest_bodies_exits_1_when_empty() {
    // No fixture entry has status 999, so the filtered set is empty and the
    // aggregate view reports "nothing to show" with exit code 1.
    let args = ["--largest-bodies", "--status", "999", "tests/fixtures/valid.har"];
    let (_, _, code) = hargrep(&args);
    assert_eq!(code, 1);
}

#[test]
fn test_largest_bodies_conflicts_with_other_views() {
    // Each aggregate view is mutually exclusive with `--largest-bodies`.
    // Combining them must exit with code 2; a missing entry in the conflict
    // list would show up here as a different exit code.
    let other_views = ["--overview", "--domains", "--size-by-type", "--redirects"];
    for other in other_views {
        let args = ["--largest-bodies", other, "tests/fixtures/valid.har"];
        let (_, stderr, code) = hargrep(&args);
        assert_eq!(
            code, 2,
            "--largest-bodies with {other} should exit 2; stderr: {stderr}"
        );
    }
}

// --- --help-llm ---

#[test]
Expand Down
Loading