diff --git a/README.md b/README.md index df786e8..10ef1cb 100644 --- a/README.md +++ b/README.md @@ -62,10 +62,17 @@ Filters combine with AND logic. | Flag | Description | |------|-------------| -| `--output ` | `json` (default), `jsonl`, or `summary` | -| `--fields ` | Comma-separated. Valid names: `url`, `method`, `status`, `status-text`, `time`, `mime-type`, `started-date-time`. CLI names are kebab-case; emitted JSON keys preserve HAR camelCase (`statusText`, `mimeType`). Unknown names error at parse time. | +| `--output ` | `json` (default, pretty in a TTY, compact when piped), `jsonl`, or `summary`. | +| `--fields ` | Comma-separated. Valid names: `id`, `url`, `method`, `status`, `status-text`, `time`, `mime-type`, `started-date-time`. CLI names are kebab-case; emitted JSON keys preserve HAR camelCase (`statusText`, `mimeType`). Unknown names error at parse time. | | `--count` | Print only the count of matching entries. Conflicts with `--fields`, `--no-body`, `--output`. | -| `--no-body` | Exclude request/response bodies | +| `--overview` | Print a single JSON dashboard of the filtered HAR: entry count, status/method/MIME histograms, top 10 domains, total body size, total time. Replaces a cascade of exploratory queries with one call. | +| `--entry ` | Fetch a single entry by id (its original 0-indexed position in the HAR). Returns a JSON object, not an array. | +| `--no-body` | Exclude all request/response body text. | +| `--include-all-bodies` | Include bodies for static-asset MIME types (CSS/JS/images/fonts/WASM). By default those are stripped to save tokens. | + +Every output entry includes an `id` field — the entry's original 0-indexed position in the HAR. IDs are stable across filter changes, so you can list matches with `--fields id,url,status` and then drill into a specific one with `--entry N`. 
+ +Static-asset response bodies (images, fonts, CSS, JS, WASM, video, audio) are stripped by default, since they dominate HAR size but rarely help debug API behaviour. Use `--include-all-bodies` to keep them, or `--no-body` to strip everything. ### Utility @@ -95,6 +102,13 @@ hargrep --method POST --count session.har # LLM-friendly: just URLs, statuses, timings, no bodies hargrep --fields url,status,time --no-body --output jsonl recording.har +# One-shot overview of a HAR: entry count, histograms, top domains, totals +hargrep --overview recording.har + +# Narrow with filters, list IDs, then fetch one entry in full +hargrep --status-range 5xx --fields id,url,status --output jsonl recording.har +hargrep --entry 42 recording.har + # Validate before processing hargrep --validate untrusted.har @@ -109,20 +123,22 @@ hargrep --header 'Authorization:Bearer' --fields url,status recording.har `hargrep` is designed to fit into agent pipelines: -- **Predictable schema** — every output mode produces deterministic, well-formed JSON or compact text -- **`--fields`** — request only the columns you need so the output stays small -- **`--no-body`** — strip base64 images and large response bodies -- **`--count`** — check scope cheaply before committing context to a full query -- **`--jsonl`** — stream one entry per line, easy to chunk +- **Predictable schema** — every output mode produces deterministic, well-formed JSON or compact text. +- **Stable entry IDs** — every entry includes an `id` field (its original HAR index). List matches cheaply, then fetch specific entries with `--entry N`. +- **`--overview`** — one call returns a dashboard of the (optionally filtered) HAR. Replaces several exploratory queries. +- **`--fields`** — request only the columns you need so the output stays small. +- **Asset bodies stripped by default** — CSS/JS/images/fonts/WASM response bodies are dropped automatically since they dominate HAR size. 
`--include-all-bodies` disables this; `--no-body` strips everything. +- **`--count`** — check scope cheaply before committing context to a full query. +- **`--output jsonl`** — stream one entry per line, easy to chunk. Default JSON is compact when piped and pretty in a TTY. - **Fails fast** — CLI arguments are validated before any file is read. Unknown `--fields` names, invalid `--status-range`, bad `--url-regex`, and conflicting flags (e.g. `--count --fields`) all error with exit code 2 and a descriptive message on stderr. Typos surface immediately instead of producing empty results. -Typical agent flow: validate → count → filter narrowly → read specific entries. +Typical agent flow: overview → filter → fetch specific entries. ```bash -hargrep --validate recording.har # check it parses -hargrep --count --status-range 5xx recording.har # probe the scope -hargrep --status-range 5xx --fields url,status,time \ - --output jsonl recording.har # pull just what's needed +hargrep --overview recording.har # shape + scope in one call +hargrep --status-range 5xx --fields id,url,status \ + --output jsonl recording.har # list candidates +hargrep --entry 42 recording.har # pull the full entry for one id ``` ## HAR format diff --git a/src/filter.rs b/src/filter.rs index c198b28..3c57c25 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -108,10 +108,15 @@ pub struct FilterOptions { pub min_time: Option, } -pub fn filter_entries(entries: Vec, opts: &FilterOptions) -> Vec { +/// Filter entries against the provided options, preserving each entry's +/// original index in the HAR. Downstream formatters emit this index as `id`, +/// which lets an LLM agent list entries and then fetch one by id stably even +/// after the filter set changes. 
+pub fn filter_entries(entries: Vec, opts: &FilterOptions) -> Vec<(usize, Entry)> { entries .into_iter() - .filter(|entry| matches_all(entry, opts)) + .enumerate() + .filter(|(_, entry)| matches_all(entry, opts)) .collect() } @@ -273,7 +278,7 @@ mod tests { }; let result = filter_entries(entries, &opts); assert_eq!(result.len(), 1); - assert_eq!(result[0].request.method, "POST"); + assert_eq!(result[0].1.request.method, "POST"); } #[test] @@ -296,7 +301,7 @@ mod tests { }; let result = filter_entries(entries, &opts); assert_eq!(result.len(), 1); - assert_eq!(result[0].response.status, 404); + assert_eq!(result[0].1.response.status, 404); } #[test] @@ -308,7 +313,7 @@ mod tests { }; let result = filter_entries(entries, &opts); assert_eq!(result.len(), 1); - assert_eq!(result[0].response.status, 404); + assert_eq!(result[0].1.response.status, 404); } #[test] @@ -342,7 +347,7 @@ mod tests { }; let result = filter_entries(entries, &opts); assert_eq!(result.len(), 1); - assert!(result[0].request.url.contains("/users/999")); + assert!(result[0].1.request.url.contains("/users/999")); } #[test] @@ -354,7 +359,7 @@ mod tests { }; let result = filter_entries(entries, &opts); assert_eq!(result.len(), 1); - assert_eq!(result[0].request.method, "POST"); + assert_eq!(result[0].1.request.method, "POST"); } #[test] @@ -366,7 +371,7 @@ mod tests { }; let result = filter_entries(entries, &opts); assert_eq!(result.len(), 1); - assert_eq!(result[0].request.method, "POST"); + assert_eq!(result[0].1.request.method, "POST"); } #[test] @@ -389,7 +394,7 @@ mod tests { }; let result = filter_entries(entries, &opts); assert_eq!(result.len(), 1); - assert!(result[0].request.url.contains("image.png")); + assert!(result[0].1.request.url.contains("image.png")); } #[test] diff --git a/src/main.rs b/src/main.rs index 97e02a2..c8dcb30 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,11 +2,12 @@ mod filter; mod har; mod input; mod output; +mod overview; use anyhow::Result; use clap::Parser; use 
filter::{FilterOptions, HeaderFilter, StatusRange}; -use output::{Field, OutputFormat, OutputMode}; +use output::{BodyMode, Field, OutputFormat, OutputMode}; use regex::Regex; use std::path::PathBuf; use std::process; @@ -62,10 +63,40 @@ struct Cli { #[arg(long)] count: bool, + /// Print a single-shot JSON dashboard of the filtered HAR: entry count, + /// status/method/MIME histograms, top domains, total body size, total time. + /// Replaces a cascade of exploratory queries with one call. + #[arg( + long, + conflicts_with_all = ["count", "fields", "entry", "no_body", "include_all_bodies", "output"] + )] + overview: bool, + + /// Fetch a single entry by id (the original 0-indexed position in the HAR). + /// Returns a JSON object, not an array. Useful after listing entries with + /// `--fields id,url,status` and then zeroing in on one. `--entry` is a + /// direct lookup, not a filter operation — it conflicts with filter flags + /// so an agent can't accidentally combine them and get surprising results. + #[arg( + long, + conflicts_with_all = [ + "count", "fields", "output", + "method", "status", "status_range", "url", "url_regex", + "header", "mime", "min_time", + ] + )] + entry: Option, + /// Exclude request/response bodies from output - #[arg(long, conflicts_with = "count")] + #[arg(long, conflicts_with_all = ["count", "include_all_bodies"])] no_body: bool, + /// Include bodies for static-asset MIME types (CSS/JS/images/fonts/WASM) + /// that would otherwise be stripped by default. Use when you actually need + /// to inspect an asset payload. 
+ #[arg(long, conflicts_with = "count")] + include_all_bodies: bool, + /// Validate HAR only, don't query #[arg(long)] validate: bool, @@ -100,6 +131,24 @@ fn run(cli: Cli) -> Result { return Ok(0); } + let body_mode = if cli.no_body { + BodyMode::StripAll + } else if cli.include_all_bodies { + BodyMode::IncludeAll + } else { + BodyMode::SkipAssets + }; + + if let Some(id) = cli.entry { + let total = har.log.entries.len(); + let entry = har.log.entries.into_iter().nth(id).ok_or_else(|| { + anyhow::anyhow!("entry id {id} out of range (HAR has {total} entries)") + })?; + let output = output::format_single_entry(id, &entry, body_mode)?; + print!("{output}"); + return Ok(0); + } + let filter_opts = FilterOptions { method: cli.method, status: cli.status, @@ -114,13 +163,25 @@ fn run(cli: Cli) -> Result { let filtered = filter::filter_entries(har.log.entries, &filter_opts); let exit_code = if filtered.is_empty() { 1 } else { 0 }; + if cli.overview { + let doc = overview::build_overview(&filtered); + let serialized = if std::io::IsTerminal::is_terminal(&std::io::stdout()) { + serde_json::to_string_pretty(&doc)? + } else { + serde_json::to_string(&doc)? + }; + println!("{serialized}"); + // Keep grep-like exit semantics: empty filtered set → exit 1. 
/// Policy deciding how much request/response body text reaches the output.
///
/// The default is [`BodyMode::SkipAssets`]: response bodies of static-asset
/// MIME types (CSS, JS, images, fonts, video, audio, WASM) are removed, while
/// API-style payloads such as JSON and HTML are left in place. Asset bodies
/// tend to be large and frequently base64-encoded, so dropping them keeps the
/// output small without hiding the payloads a debugging session cares about.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BodyMode {
    /// Remove every body, request and response alike. Selected by `--no-body`.
    StripAll,
    /// Remove only the response bodies of static assets; keep everything else.
    #[default]
    SkipAssets,
    /// Leave all bodies untouched. Selected by `--include-all-bodies`.
    IncludeAll,
}
Triggered by `--include-all-bodies`. + IncludeAll, +} + /// What to emit for the filtered entries. Modeled as a sum type so illegal /// combinations (e.g. `--count` with `--fields`) are unrepresentable at the /// boundary between main and output logic. @@ -65,48 +86,119 @@ pub enum OutputMode { Formatted { format: OutputFormat, fields: Vec, - no_body: bool, + body: BodyMode, }, } -pub fn format_output(entries: &[Entry], mode: &OutputMode) -> anyhow::Result { +pub fn format_output(entries: &[(usize, Entry)], mode: &OutputMode) -> anyhow::Result { match mode { OutputMode::Count => Ok(format!("{}\n", entries.len())), OutputMode::Formatted { format, fields, - no_body, + body, } => match format { - OutputFormat::Json => format_json(entries, fields, *no_body), - OutputFormat::Jsonl => format_jsonl(entries, fields, *no_body), + OutputFormat::Json => format_json(entries, fields, *body), + OutputFormat::Jsonl => format_jsonl(entries, fields, *body), OutputFormat::Summary => Ok(format_summary(entries)), }, } } -fn entry_to_value(entry: &Entry, fields: &[Field], no_body: bool) -> anyhow::Result { +/// Emit a single entry as a JSON object (not wrapped in an array). Used by +/// `--entry N` — the object always includes an `id` field and respects +/// the body mode, so an agent can request the full entry or a stripped view. +pub fn format_single_entry(id: usize, entry: &Entry, body: BodyMode) -> anyhow::Result { + let value = entry_to_value(id, entry, &[], body)?; + let serialized = if std::io::stdout().is_terminal() { + serde_json::to_string_pretty(&value)? + } else { + serde_json::to_string(&value)? 
+ }; + Ok(serialized + "\n") +} + +fn entry_to_value( + id: usize, + entry: &Entry, + fields: &[Field], + body: BodyMode, +) -> anyhow::Result { if !fields.is_empty() { let mut map = serde_json::Map::with_capacity(fields.len()); for field in fields { - map.insert(field.json_key().to_string(), field.value_for(entry)?); + map.insert(field.json_key().to_string(), field.value_for(id, entry)?); } return Ok(Value::Object(map)); } let mut value = serde_json::to_value(entry)?; - if no_body { - strip_bodies(&mut value); + if let Some(obj) = value.as_object_mut() { + obj.insert("id".to_string(), Value::Number(id.into())); + } + match body { + BodyMode::StripAll => strip_all_bodies(&mut value), + BodyMode::SkipAssets => { + if is_asset_response(entry) { + strip_response_text(&mut value); + } + // Request post bodies are usually small and always interesting; keep them. + } + BodyMode::IncludeAll => {} } Ok(value) } -fn strip_bodies(value: &mut Value) { +/// True if the response content-type is a static asset that rarely carries +/// information useful for debugging API behaviour (and is often base64-encoded). 
/// Report whether `mime` names a static-asset content type: images, fonts,
/// video, audio, stylesheets, scripts or WebAssembly.
///
/// Only the media type itself is considered. Parameters after `;` (for example
/// `charset=utf-8`) are ignored, surrounding whitespace is trimmed, and the
/// comparison is ASCII case-insensitive. An empty string is not an asset.
///
/// Besides the registered `image/`, `font/`, `video/` and `audio/` families,
/// the list covers the legacy font and script types that servers still send
/// (`application/x-font-woff`, `application/vnd.ms-fontobject`,
/// `text/ecmascript`, ...), so those payloads are classified as assets too.
pub fn is_asset_mime(mime: &str) -> bool {
    /// Media-type families that are assets regardless of subtype.
    const ASSET_PREFIXES: [&str; 4] = ["image/", "font/", "video/", "audio/"];

    let essence = mime
        .split(';')
        .next()
        .unwrap_or("")
        .trim()
        .to_ascii_lowercase();

    if ASSET_PREFIXES
        .iter()
        .any(|prefix| essence.starts_with(prefix))
    {
        return true;
    }

    matches!(
        essence.as_str(),
        // Scripts
        "application/javascript"
            | "application/x-javascript"
            | "application/ecmascript"
            | "text/javascript"
            | "text/ecmascript"
            // Stylesheets
            | "text/css"
            // WebAssembly
            | "application/wasm"
            // Fonts served under pre-`font/*` media types
            | "application/font-woff"
            | "application/font-woff2"
            | "application/font-sfnt"
            | "application/x-font-woff"
            | "application/x-font-ttf"
            | "application/x-font-otf"
            | "application/x-font-truetype"
            | "application/x-font-opentype"
            | "application/vnd.ms-fontobject"
    )
}
+ }; + Ok(serialized + "\n") } -fn format_jsonl(entries: &[Entry], fields: &[Field], no_body: bool) -> anyhow::Result { +fn format_jsonl( + entries: &[(usize, Entry)], + fields: &[Field], + body: BodyMode, +) -> anyhow::Result { let mut output = String::new(); - for entry in entries { - let value = entry_to_value(entry, fields, no_body)?; + for (id, entry) in entries { + let value = entry_to_value(*id, entry, fields, body)?; output.push_str(&serde_json::to_string(&value)?); output.push('\n'); } Ok(output) } -fn format_summary(entries: &[Entry]) -> String { +fn format_summary(entries: &[(usize, Entry)]) -> String { let mut output = String::new(); - for entry in entries { + for (id, entry) in entries { output.push_str(&format!( - "{:<6} {:<4} {:<6} {}\n", + "{:<4} {:<6} {:<4} {:<6} {}\n", + id, entry.request.method, entry.response.status, format!("{}ms", entry.time as i64), @@ -152,25 +258,28 @@ mod tests { use super::*; use crate::har::Har; - fn load_entries() -> Vec { + fn load_entries() -> Vec<(usize, Entry)> { let json = include_str!("../tests/fixtures/valid.har"); let har: Har = serde_json::from_str(json).unwrap(); - har.log.entries + har.log.entries.into_iter().enumerate().collect() } - fn formatted(format: OutputFormat, fields: Vec, no_body: bool) -> OutputMode { + fn formatted(format: OutputFormat, fields: Vec, body: BodyMode) -> OutputMode { OutputMode::Formatted { format, fields, - no_body, + body, } } #[test] fn test_json_output() { let entries = load_entries(); - let output = - format_output(&entries, &formatted(OutputFormat::Json, vec![], false)).unwrap(); + let output = format_output( + &entries, + &formatted(OutputFormat::Json, vec![], BodyMode::IncludeAll), + ) + .unwrap(); let parsed: Vec = serde_json::from_str(&output).unwrap(); assert_eq!(parsed.len(), 4); } @@ -178,8 +287,11 @@ mod tests { #[test] fn test_jsonl_output() { let entries = load_entries(); - let output = - format_output(&entries, &formatted(OutputFormat::Jsonl, vec![], 
false)).unwrap(); + let output = format_output( + &entries, + &formatted(OutputFormat::Jsonl, vec![], BodyMode::IncludeAll), + ) + .unwrap(); let lines: Vec<&str> = output.trim().lines().collect(); assert_eq!(lines.len(), 4); for line in &lines { @@ -190,8 +302,11 @@ mod tests { #[test] fn test_summary_output() { let entries = load_entries(); - let output = - format_output(&entries, &formatted(OutputFormat::Summary, vec![], false)).unwrap(); + let output = format_output( + &entries, + &formatted(OutputFormat::Summary, vec![], BodyMode::IncludeAll), + ) + .unwrap(); let lines: Vec<&str> = output.trim().lines().collect(); assert_eq!(lines.len(), 4); assert!(lines[0].contains("GET")); @@ -210,8 +325,11 @@ mod tests { fn test_fields_selection_basic() { let entries = load_entries(); let fields = vec![Field::Url, Field::Status, Field::Time]; - let output = - format_output(&entries, &formatted(OutputFormat::Json, fields, false)).unwrap(); + let output = format_output( + &entries, + &formatted(OutputFormat::Json, fields, BodyMode::IncludeAll), + ) + .unwrap(); let parsed: Vec = serde_json::from_str(&output).unwrap(); let first = &parsed[0]; assert!(first.get("url").is_some()); @@ -224,8 +342,11 @@ mod tests { fn test_fields_preserves_camelcase_json_keys() { let entries = load_entries(); let fields = vec![Field::StatusText, Field::MimeType, Field::StartedDateTime]; - let output = - format_output(&entries, &formatted(OutputFormat::Json, fields, false)).unwrap(); + let output = format_output( + &entries, + &formatted(OutputFormat::Json, fields, BodyMode::IncludeAll), + ) + .unwrap(); let parsed: Vec = serde_json::from_str(&output).unwrap(); let first = &parsed[0]; assert_eq!(first["statusText"], "OK"); @@ -236,7 +357,11 @@ mod tests { #[test] fn test_no_body_strips_response_text() { let entries = load_entries(); - let output = format_output(&entries, &formatted(OutputFormat::Json, vec![], true)).unwrap(); + let output = format_output( + &entries, + 
&formatted(OutputFormat::Json, vec![], BodyMode::StripAll), + ) + .unwrap(); let parsed: Vec = serde_json::from_str(&output).unwrap(); for (i, entry) in parsed.iter().enumerate() { assert!( @@ -250,7 +375,11 @@ mod tests { fn test_no_body_strips_post_data_text() { // Second entry in fixture (index 1) is the POST with postData. let entries = load_entries(); - let output = format_output(&entries, &formatted(OutputFormat::Json, vec![], true)).unwrap(); + let output = format_output( + &entries, + &formatted(OutputFormat::Json, vec![], BodyMode::StripAll), + ) + .unwrap(); let parsed: Vec = serde_json::from_str(&output).unwrap(); let post_entry = &parsed[1]; assert!( @@ -265,15 +394,18 @@ mod tests { #[test] fn test_empty_entries_json() { - let entries: Vec = vec![]; - let output = - format_output(&entries, &formatted(OutputFormat::Json, vec![], false)).unwrap(); + let entries: Vec<(usize, Entry)> = vec![]; + let output = format_output( + &entries, + &formatted(OutputFormat::Json, vec![], BodyMode::IncludeAll), + ) + .unwrap(); assert_eq!(output.trim(), "[]"); } #[test] fn test_empty_entries_count() { - let entries: Vec = vec![]; + let entries: Vec<(usize, Entry)> = vec![]; let output = format_output(&entries, &OutputMode::Count).unwrap(); assert_eq!(output.trim(), "0"); } diff --git a/src/overview.rs b/src/overview.rs new file mode 100644 index 0000000..c07bc6b --- /dev/null +++ b/src/overview.rs @@ -0,0 +1,185 @@ +//! Single-shot HAR overview for LLM agents. +//! +//! `hargrep --overview ` replaces a sequence of small exploratory queries +//! (count, domains, methods, status ranges, size) with one compact JSON +//! document. Cheap to produce, cheap to read. + +use crate::har::Entry; +use serde_json::{Map, Value, json}; +use std::collections::BTreeMap; + +/// Maximum number of domains to list in the `top_domains` section. An agent +/// drilling into long-tail traffic should filter by `--url` rather than walk a +/// giant histogram. 
+const TOP_DOMAINS_LIMIT: usize = 10; + +/// Build the overview document for the given (already-filtered) entries. +pub fn build_overview(entries: &[(usize, Entry)]) -> Value { + let mut status_buckets = [0u64; 6]; // 1xx..5xx indexed by leading digit; slot 0 unused + let mut methods: BTreeMap = BTreeMap::new(); + let mut mime_types: BTreeMap = BTreeMap::new(); + let mut domains: BTreeMap = BTreeMap::new(); + let mut total_body_size: i64 = 0; + let mut total_time_ms: f64 = 0.0; + + for (_, entry) in entries { + let status_digit = (entry.response.status / 100) as usize; + if status_digit < status_buckets.len() { + status_buckets[status_digit] += 1; + } + + *methods.entry(entry.request.method.clone()).or_insert(0) += 1; + + if let Some(mime) = &entry.response.content.mime_type { + let normalized = mime + .split(';') + .next() + .unwrap_or("") + .trim() + .to_ascii_lowercase(); + if !normalized.is_empty() { + *mime_types.entry(normalized).or_insert(0) += 1; + } + } + + if let Some(host) = extract_host(&entry.request.url) { + *domains.entry(host).or_insert(0) += 1; + } + + // `content.size` can be -1 in HAR when unknown; treat those as 0. 
+ if entry.response.content.size > 0 { + total_body_size += entry.response.content.size; + } + total_time_ms += entry.time; + } + + let mut status = Map::new(); + for (i, bucket_name) in ["", "1xx", "2xx", "3xx", "4xx", "5xx"].iter().enumerate() { + if i == 0 { + continue; + } + if status_buckets[i] > 0 { + status.insert((*bucket_name).to_string(), json!(status_buckets[i])); + } + } + + let mut top_domains: Vec<(String, u64)> = domains.into_iter().collect(); + top_domains.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); + top_domains.truncate(TOP_DOMAINS_LIMIT); + let top_domains_value: Vec = top_domains + .into_iter() + .map(|(host, count)| json!({ "domain": host, "count": count })) + .collect(); + + json!({ + "entries": entries.len(), + "status": Value::Object(status), + "methods": Value::Object(methods_to_map(methods)), + "mime_types": Value::Object(methods_to_map(mime_types)), + "top_domains": top_domains_value, + "total_body_size_bytes": total_body_size, + "total_time_ms": total_time_ms, + }) +} + +fn methods_to_map(bt: BTreeMap) -> Map { + let mut m = Map::new(); + for (k, v) in bt { + m.insert(k, json!(v)); + } + m +} + +/// Extract the host portion of a URL without pulling in a full URL parser. +/// Tolerates missing schemes and malformed input — returns None rather than +/// erroring, so one weird URL doesn't break the whole overview. +fn extract_host(url: &str) -> Option { + let after_scheme = url.split_once("://").map(|(_, rest)| rest).unwrap_or(url); + let host = after_scheme + .split(['/', '?', '#']) + .next()? 
/// Extract the lowercase host of `url` using plain string slicing, without a
/// full URL parser.
///
/// Handling, in order:
///
/// 1. Everything up to and including `://` is dropped; input without a scheme
///    is treated as starting directly at the authority.
/// 2. The authority ends at the first `/`, `?` or `#`.
/// 3. Userinfo (`user:pass@`) is removed.
/// 4. A bracketed IPv6 literal is returned with its brackets (`[::1]`); any
///    port after the closing bracket is ignored.
/// 5. Otherwise a trailing `:port` is removed. The port must be empty or all
///    digits; anything else means the text was not `host:port` (for example
///    `data:image/png` or `about:blank`) and yields `None`.
///
/// Returns `None` rather than an error for input with no usable host, so one
/// odd URL cannot break a whole overview.
fn extract_host(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://").map_or(url, |(_, rest)| rest);
    let authority = after_scheme.split(['/', '?', '#']).next()?;
    // The last '@' separates userinfo from the host, even if the password
    // itself contains one.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, rest)| rest);

    // IPv6 literals contain ':' themselves, so they must be recognised before
    // any port splitting happens.
    if let Some(rest) = host_port.strip_prefix('[') {
        let (address, _port) = rest.split_once(']')?;
        if address.is_empty() {
            return None;
        }
        return Some(format!("[{}]", address.to_ascii_lowercase()));
    }

    let host = match host_port.split_once(':') {
        Some((name, port)) if port.bytes().all(|b| b.is_ascii_digit()) => name,
        // Text after ':' that is not a port: this was never host:port.
        Some(_) => return None,
        None => host_port,
    };

    if host.is_empty() {
        None
    } else {
        Some(host.to_ascii_lowercase())
    }
}
/// `--overview` on the fixture exits 0 and prints one JSON object carrying
/// every documented section with the expected JSON type.
#[test]
fn test_overview_emits_json_object_with_expected_shape() {
    let (stdout, _, code) = hargrep(&["--overview", "tests/fixtures/valid.har"]);
    assert_eq!(code, 0);

    let doc: serde_json::Value = serde_json::from_str(&stdout).unwrap();
    assert!(doc.is_object());
    assert_eq!(doc["entries"], 4);

    // Histogram sections are objects keyed by bucket name.
    for key in ["status", "methods", "mime_types"] {
        assert!(doc[key].is_object());
    }
    assert!(doc["top_domains"].is_array());
    // Totals are plain numbers.
    for key in ["total_body_size_bytes", "total_time_ms"] {
        assert!(doc[key].is_number());
    }
}
/// The harness captures stdout through a pipe, so hargrep does not see a
/// terminal and the default `json` format must come out compact: a single
/// line with at most one trailing newline.
#[test]
fn test_json_output_is_compact_when_piped() {
    let (stdout, _, _) = hargrep(&["tests/fixtures/valid.har"]);

    // Pretty-printed JSON would contain many newlines.
    let newline_total = stdout.matches('\n').count();
    let first_line = stdout.split('\n').next().unwrap_or("");
    assert!(
        newline_total <= 1,
        "compact JSON should have at most one trailing newline; got {} newlines.\nfirst line: {}",
        newline_total,
        first_line
    );

    // Compact or not, the payload still has to parse as a JSON array.
    let _: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap();

    // Smoke check: a near-empty output must not satisfy the newline rule.
    let payload_chars = stdout.chars().filter(|&c| c != '\n').count();
    assert!(payload_chars > 100);
}
/// `--no-body` removes body text from both sides of every entry: response
/// `content.text` and request `postData.text`.
///
/// (`--no-body` and `--include-all-bodies` are declared as conflicting flags,
/// so there is no precedence between them to test; this covers `--no-body`
/// on its own.)
#[test]
fn test_no_body_still_strips_everything() {
    let (stdout, _, code) = hargrep(&["--no-body", "tests/fixtures/valid.har"]);
    assert_eq!(code, 0);

    let parsed: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap();
    // Guard against the loop below passing vacuously on empty output.
    assert_eq!(parsed.len(), 4, "fixture has four entries");

    for entry in &parsed {
        assert!(
            entry["response"]["content"].get("text").is_none(),
            "response body text should be stripped; entry id: {}",
            entry["id"]
        );
        // Entries without postData index to `null`, for which `get` is `None`.
        assert!(
            entry["request"]["postData"].get("text").is_none(),
            "request body text should be stripped; entry id: {}",
            entry["id"]
        );
    }
}
+ for (i, entry) in parsed.iter().enumerate() { + assert_eq!(entry["id"], i, "entry at position {i} should have id={i}"); + } +} + +#[test] +fn test_entries_include_id_field_in_jsonl() { + let (stdout, _, _) = hargrep(&["--output", "jsonl", "tests/fixtures/valid.har"]); + for (i, line) in stdout.trim().lines().enumerate() { + let entry: serde_json::Value = serde_json::from_str(line).unwrap(); + assert_eq!(entry["id"], i); + } +} + +#[test] +fn test_ids_are_stable_after_filter() { + // Filter to 4xx only; original index for the 404 entry is 2 (index-2 in valid.har). + let (stdout, _, _) = hargrep(&["--status", "404", "tests/fixtures/valid.har"]); + let parsed: Vec = serde_json::from_str(&stdout).unwrap(); + assert_eq!(parsed.len(), 1); + assert_eq!(parsed[0]["id"], 2); +} + +#[test] +fn test_fields_can_include_id() { + let (stdout, _, _) = hargrep(&["--fields", "id,url", "tests/fixtures/valid.har"]); + let parsed: Vec = serde_json::from_str(&stdout).unwrap(); + for (i, entry) in parsed.iter().enumerate() { + assert_eq!(entry["id"], i); + assert!(entry.get("url").is_some()); + assert!(entry.get("request").is_none()); + } +} + +#[test] +fn test_entry_flag_fetches_single_entry_by_id() { + let (stdout, _, code) = hargrep(&["--entry", "1", "tests/fixtures/valid.har"]); + let parsed: serde_json::Value = serde_json::from_str(&stdout).unwrap(); + assert!( + parsed.is_object(), + "--entry returns a single object, not array" + ); + assert_eq!(parsed["id"], 1); + assert_eq!(parsed["request"]["method"], "POST"); // index 1 is the POST + assert_eq!(code, 0); +} + +#[test] +fn test_entry_flag_out_of_range_errors() { + let (_, stderr, code) = hargrep(&["--entry", "999", "tests/fixtures/valid.har"]); + assert_eq!(code, 2); + assert!( + stderr.to_lowercase().contains("entry") || stderr.to_lowercase().contains("range"), + "expected out-of-range error, got: {stderr}" + ); +} + +#[test] +fn test_entry_flag_conflicts_with_count() { + let (_, _, code) = hargrep(&["--entry", "0", 
"--count", "tests/fixtures/valid.har"]); + assert_eq!(code, 2); +} + +#[test] +fn test_entry_flag_conflicts_with_filter_flags() { + // --entry is a direct lookup; combining with filters would silently ignore + // the predicates and mislead automation. + let cases: &[&[&str]] = &[ + &["--status", "500"], + &["--method", "GET"], + &["--url", "/users"], + &["--status-range", "5xx"], + &["--mime", "json"], + &["--min-time", "100"], + &["--header", "Authorization"], + ]; + for filter_args in cases { + let mut args = vec!["--entry", "0"]; + args.extend_from_slice(filter_args); + args.push("tests/fixtures/valid.har"); + let (_, stderr, code) = hargrep(&args); + assert_eq!( + code, 2, + "--entry with {filter_args:?} should exit 2; stderr: {stderr}" + ); + } +} + +#[test] +fn test_overview_exits_1_when_filter_produces_no_matches() { + // Grep-like exit contract: empty result → exit 1. + let (stdout, _, code) = hargrep(&["--overview", "--status", "999", "tests/fixtures/valid.har"]); + assert_eq!(code, 1); + // Body still emitted — empty overview is informative. + let parsed: serde_json::Value = serde_json::from_str(&stdout).unwrap(); + assert_eq!(parsed["entries"], 0); +} + +#[test] +fn test_overview_exits_0_when_there_are_matches() { + let (_, _, code) = hargrep(&["--overview", "tests/fixtures/valid.har"]); + assert_eq!(code, 0); +} + // --- CLI argument validation (parse-time errors) --- #[test]