From 791028326ac7cf766a02760a5519c166b330a56a Mon Sep 17 00:00:00 2001 From: us Date: Fri, 29 May 2026 12:42:57 +0300 Subject: [PATCH 1/2] fix(browse): harden outbound URL handling --- Cargo.lock | 1 + crates/crw-browse/src/main.rs | 6 + crates/crw-browse/src/server.rs | 5 + crates/crw-browse/src/session.rs | 19 +++ crates/crw-browse/src/tools/goto.rs | 90 +++++++++- crates/crw-browse/src/tools/screenshot.rs | 195 +++++++++++++++++++--- crates/crw-cli/src/commands/browse.rs | 6 + crates/crw-core/Cargo.toml | 1 + crates/crw-core/src/url_safety.rs | 99 ++++++++++- crates/crw-crawl/src/crawl.rs | 25 ++- crates/crw-server/src/routes/crawl.rs | 4 +- crates/crw-server/src/routes/map.rs | 4 +- crates/crw-server/src/routes/mcp.rs | 10 +- crates/crw-server/src/routes/scrape.rs | 4 +- crates/crw-server/src/routes/search.rs | 5 +- 15 files changed, 423 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 193e6bb..711d0d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -755,6 +755,7 @@ dependencies = [ "serde", "serde_json", "thiserror 2.0.18", + "tokio", "toml 0.8.23", "tracing", "url", diff --git a/crates/crw-browse/src/main.rs b/crates/crw-browse/src/main.rs index 277bfa5..23daf80 100644 --- a/crates/crw-browse/src/main.rs +++ b/crates/crw-browse/src/main.rs @@ -24,6 +24,11 @@ struct Cli { /// those tools return `NOT_IMPLEMENTED`. #[arg(long, env = "CRW_BROWSE_CHROME_WS_URL")] chrome_ws_url: Option, + + /// Directory where screenshot(path=...) may create image artifacts. + /// If unset, screenshot output must be returned inline as base64. + #[arg(long, env = "CRW_BROWSE_SCREENSHOT_DIR")] + screenshot_dir: Option, } #[tokio::main] @@ -41,6 +46,7 @@ async fn main() -> Result<()> { ws_url: cli.ws_url, page_timeout: Duration::from_millis(cli.page_timeout_ms), chrome_ws_url: cli.chrome_ws_url, + screenshot_dir: cli.screenshot_dir, }; tracing::info!(ws_url = %config.ws_url, "starting crw-browse"); diff --git a/crates/crw-browse/src/server.rs b/crates/crw-browse/src/server.rs index fa1a194..bdfffde 100644 --- a/crates/crw-browse/src/server.rs +++ b/crates/crw-browse/src/server.rs @@ -7,6 +7,7 @@ //! tool call. Multi-session + `session.new`/`session.close` tools land later //! in Phase 2 (see ROADMAP). +use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; @@ -34,6 +35,9 @@ pub struct BrowseConfig { /// v0.2.9). Tools that require Chrome return `NOT_IMPLEMENTED` when /// this is `None`. pub chrome_ws_url: Option, + /// Optional directory where screenshot `path` outputs may be created. + /// When unset, screenshots must be returned inline as base64. + pub screenshot_dir: Option, } impl Default for BrowseConfig { @@ -42,6 +46,7 @@ impl Default for BrowseConfig { ws_url: "ws://localhost:9222".to_string(), page_timeout: Duration::from_secs(30), chrome_ws_url: None, + screenshot_dir: None, } } } diff --git a/crates/crw-browse/src/session.rs b/crates/crw-browse/src/session.rs index 2d3a779..e97717f 100644 --- a/crates/crw-browse/src/session.rs +++ b/crates/crw-browse/src/session.rs @@ -132,6 +132,10 @@ pub struct BrowserSession { /// Listener task handle — aborted on session close so it can't outlive the /// connection. `None` until `ensure_attached` runs for the first time. event_listener: Mutex>>, + /// CDP Fetch guard task. It must live for the full session lifetime because + /// navigations can be triggered by clicks, scripts, redirects, and SPA code + /// after the original `goto` has returned. + outbound_guard: Mutex>>, } impl BrowserSession { @@ -155,6 +159,7 @@ impl BrowserSession { network_buffer: Arc::new(Mutex::new(VecDeque::with_capacity(NETWORK_BUFFER_CAP))), network_event_count: Arc::new(AtomicU64::new(0)), event_listener: Mutex::new(None), + outbound_guard: Mutex::new(None), } } @@ -377,6 +382,17 @@ impl BrowserSession { ) .await?; } + crate::tools::goto::enable_outbound_guard(&self.conn, &cdp_session_id, timeout).await?; + let mut outbound_guard = self.outbound_guard.lock().await; + if outbound_guard.as_ref().is_none_or(|h| h.is_finished()) { + let rx = self.conn.subscribe(); + let conn = self.conn.clone(); + let sid = cdp_session_id.clone(); + *outbound_guard = Some(tokio::spawn(async move { + crate::tools::goto::run_outbound_guard(conn, rx, &sid).await; + })); + } + drop(outbound_guard); *self.target_id.write().await = Some(target_id); *self.cdp_session_id.write().await = Some(cdp_session_id.clone()); @@ -403,6 +419,9 @@ impl BrowserSession { if let Some(h) = self.event_listener.lock().await.take() { h.abort(); } + if let Some(h) = self.outbound_guard.lock().await.take() { + h.abort(); + } self.conn.close().await; } } diff --git a/crates/crw-browse/src/tools/goto.rs b/crates/crw-browse/src/tools/goto.rs index 19f6801..f3f5658 100644 --- a/crates/crw-browse/src/tools/goto.rs +++ b/crates/crw-browse/src/tools/goto.rs @@ -68,7 +68,6 @@ pub async fn handle(server: &CrwBrowse, input: GotoInput) -> Result Result<(), String> { "scheme {scheme:?} not allowed — goto accepts http or https only" )); } + crw_core::url_safety::validate_safe_url(&parsed)?; Ok(()) } +async fn validate_goto_url_resolved(url: &str) -> Result<(), String> { + validate_goto_url(url)?; + let parsed = url::Url::parse(url).map_err(|e| format!("invalid url: {e}"))?; + tokio::time::timeout( + Duration::from_secs(2), + crw_core::url_safety::validate_safe_url_resolved(&parsed), + ) + .await + .map_err(|_| "DNS validation timed out".to_string())? +} + +pub(crate) async fn enable_outbound_guard( + conn: &crw_renderer::cdp_conn::CdpConnection, + cdp_session_id: &str, + timeout: Duration, +) -> crw_core::error::CrwResult<()> { + conn.send_recv( + "Fetch.enable", + serde_json::json!({ + "patterns": [ + { "urlPattern": "*", "requestStage": "Request" } + ] + }), + Some(cdp_session_id), + timeout, + ) + .await + .map(|_| ()) +} + +pub(crate) async fn run_outbound_guard( + conn: std::sync::Arc, + mut events: tokio::sync::broadcast::Receiver, + cdp_session_id: &str, +) { + use tokio::sync::broadcast::error::RecvError; + let cmd_timeout = Duration::from_secs(2); + loop { + let ev = match events.recv().await { + Ok(ev) => ev, + Err(RecvError::Closed) => return, + Err(RecvError::Lagged(_)) => continue, + }; + if ev.session_id.as_deref() != Some(cdp_session_id) || ev.method != "Fetch.requestPaused" { + continue; + } + let request_id = ev + .params + .get("requestId") + .and_then(|v| v.as_str()) + .unwrap_or(""); + if request_id.is_empty() { + continue; + } + let req_url = ev + .params + .get("request") + .and_then(|r| r.get("url")) + .and_then(|v| v.as_str()) + .unwrap_or(""); + let method = if validate_goto_url_resolved(req_url).await.is_ok() { + "Fetch.continueRequest" + } else { + "Fetch.failRequest" + }; + let params = if method == "Fetch.continueRequest" { + serde_json::json!({ "requestId": request_id }) + } else { + serde_json::json!({ "requestId": request_id, "errorReason": "BlockedByClient" }) + }; + let _ = conn + .send_recv(method, params, Some(cdp_session_id), cmd_timeout) + .await; + } +} + /// Waits until either `Page.loadEventFired` arrives or `timeout` elapses, and /// returns the HTTP status from the first `Network.responseReceived` event /// with `type: "Document"` that matches our session. @@ -248,6 +324,18 @@ mod tests { assert!(validate_goto_url("ht%74ps://example.com").is_err()); } + #[test] + fn validate_goto_url_rejects_internal_networks() { + for bad in [ + "http://127.0.0.1", + "http://10.0.0.1", + "http://169.254.169.254/latest/meta-data/", + "http://[::1]/", + ] { + assert!(validate_goto_url(bad).is_err(), "{bad} should be rejected"); + } + } + #[test] fn validate_goto_url_rejects_malformed() { assert!(validate_goto_url("not a url").is_err()); diff --git a/crates/crw-browse/src/tools/screenshot.rs b/crates/crw-browse/src/tools/screenshot.rs index 8379d5f..3ad0a6c 100644 --- a/crates/crw-browse/src/tools/screenshot.rs +++ b/crates/crw-browse/src/tools/screenshot.rs @@ -13,11 +13,13 @@ //! useful for visual regression and structural debugging, not for capturing //! the post-interaction state of a Lightpanda session. +use std::path::{Component, Path, PathBuf}; use std::time::{Duration, Instant}; use base64::Engine as _; use rmcp::{ErrorData as McpError, model::CallToolResult, schemars}; use serde::{Deserialize, Serialize}; +use tokio::io::AsyncWriteExt; use crate::errors::{ErrorCode, ErrorResponse}; use crate::response::ToolResponse; @@ -160,16 +162,31 @@ pub async fn handle( }; if let Some(path) = input.path { - // `tokio::fs` keeps the runtime non-blocking — `std::fs::write` would - // park the executor thread on a network-mounted or slow disk and stall - // every other in-flight tool call on the same runtime. - if let Err(e) = tokio::fs::write(&path, &bytes).await { + let safe_path = match resolve_screenshot_path(server, &path, &format).await { + Ok(path) => path, + Err(e) => return Ok(err_result(&e)), + }; + let mut file = match tokio::fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&safe_path) + .await + { + Ok(file) => file, + Err(e) => { + return Ok(err_result(&ErrorResponse::new( + ErrorCode::Internal, + format!("failed to create {}: {e}", safe_path.display()), + ))); + } + }; + if let Err(e) = file.write_all(&bytes).await { return Ok(err_result(&ErrorResponse::new( ErrorCode::Internal, - format!("failed to write {path}: {e}"), + format!("failed to write {}: {e}", safe_path.display()), ))); } - payload_data.path = Some(path); + payload_data.path = Some(safe_path.display().to_string()); } else { payload_data.base64 = Some(base64::engine::general_purpose::STANDARD.encode(&bytes)); } @@ -184,12 +201,83 @@ pub async fn handle( Ok(ok_result(&payload)) } +async fn resolve_screenshot_path( + server: &CrwBrowse, + requested: &str, + format: &str, +) -> Result { + let root = server.config().screenshot_dir.as_ref().ok_or_else(|| { + ErrorResponse::new( + ErrorCode::InvalidArgs, + "screenshot path output requires --screenshot-dir; omit path to receive base64", + ) + })?; + let rel = Path::new(requested); + if rel.is_absolute() { + return Err(ErrorResponse::new( + ErrorCode::InvalidArgs, + "screenshot path must be relative to the configured screenshot directory", + )); + } + let mut normal_components = 0; + for component in rel.components() { + match component { + Component::Normal(_) => normal_components += 1, + _ => { + return Err(ErrorResponse::new( + ErrorCode::InvalidArgs, + "screenshot path must not contain traversal or special components", + )); + } + } + } + if normal_components != 1 { + return Err(ErrorResponse::new( + ErrorCode::InvalidArgs, + "screenshot path must be a single file name inside the screenshot directory", + )); + } + let ext = rel + .extension() + .and_then(|e| e.to_str()) + .map(|e| e.to_ascii_lowercase()); + let expected = if format == "jpeg" { "jpg" } else { "png" }; + if !matches!(ext.as_deref(), Some("png" | "jpg" | "jpeg")) { + return Err(ErrorResponse::new( + ErrorCode::InvalidArgs, + "screenshot path extension must be .png, .jpg, or .jpeg", + )); + } + if format == "png" && ext.as_deref() != Some("png") { + return Err(ErrorResponse::new( + ErrorCode::InvalidArgs, + "png screenshots must use a .png path", + )); + } + if format == "jpeg" && !matches!(ext.as_deref(), Some("jpg" | "jpeg")) { + return Err(ErrorResponse::new( + ErrorCode::InvalidArgs, + format!("jpeg screenshots must use a .{expected} or .jpeg path"), + )); + } + tokio::fs::create_dir_all(root).await.map_err(|e| { + ErrorResponse::new( + ErrorCode::Internal, + format!( + "failed to create screenshot directory {}: {e}", + root.display() + ), + ) + })?; + Ok(root.join(rel)) +} + /// Open a fresh Chrome target, navigate it to `url`, capture the screenshot, /// then close the target. We don't keep targets around between calls because /// the caller usually just wants a one-shot snapshot — keeping them open /// would slow down the next call (state to reset) without useful sharing. async fn capture_via_chrome( - conn: &crw_renderer::cdp_conn::CdpConnection, + conn: &std::sync::Arc, url: &str, format: &str, timeout: Duration, @@ -243,7 +331,7 @@ async fn capture_via_chrome( } async fn capture_inner( - conn: &crw_renderer::cdp_conn::CdpConnection, + conn: &std::sync::Arc, target_id: &str, url: &str, format: &str, @@ -281,35 +369,53 @@ async fn capture_inner( ErrorResponse::new(ErrorCode::CdpError, format!("Chrome {method} failed: {e}")) })?; } + crate::tools::goto::enable_outbound_guard(conn, &sid, timeout) + .await + .map_err(|e| { + ErrorResponse::new( + ErrorCode::CdpError, + format!("Chrome Fetch.enable failed: {e}"), + ) + })?; // Subscribe before navigate so we don't miss the load event. let mut events = conn.subscribe(); + let guard_rx = conn.subscribe(); + let guard_conn = conn.clone(); + let guard_sid = sid.clone(); + let guard = tokio::spawn(async move { + crate::tools::goto::run_outbound_guard(guard_conn, guard_rx, &guard_sid).await; + }); - conn.send_recv( - "Page.navigate", - serde_json::json!({ "url": url }), - Some(&sid), - timeout, - ) - .await - .map_err(|e| { - ErrorResponse::new( + let navigate = conn + .send_recv( + "Page.navigate", + serde_json::json!({ "url": url }), + Some(&sid), + timeout, + ) + .await; + if let Err(e) = navigate { + guard.abort(); + return Err(ErrorResponse::new( ErrorCode::NavBlocked, format!("Chrome Page.navigate failed: {e}"), - ) - })?; + )); + } // Wait for load. Reuse the broadcast pattern from goto::wait_for_load. let deadline = tokio::time::Instant::now() + timeout; loop { match tokio::time::timeout_at(deadline, events.recv()).await { Err(_) => { + guard.abort(); return Err(ErrorResponse::new( ErrorCode::Timeout, "Chrome Page.loadEventFired did not arrive", )); } Ok(Err(tokio::sync::broadcast::error::RecvError::Closed)) => { + guard.abort(); return Err(ErrorResponse::new( ErrorCode::CdpError, "Chrome event channel closed", @@ -324,7 +430,7 @@ async fn capture_inner( } } - let resp = conn + let resp = match conn .send_recv( "Page.captureScreenshot", serde_json::json!({ "format": format }), @@ -332,12 +438,17 @@ async fn capture_inner( timeout, ) .await - .map_err(|e| { - ErrorResponse::new( + { + Ok(resp) => resp, + Err(e) => { + guard.abort(); + return Err(ErrorResponse::new( ErrorCode::CdpError, format!("Chrome Page.captureScreenshot failed: {e}"), - ) - })?; + )); + } + }; + guard.abort(); let b64 = resp.get("data").and_then(|v| v.as_str()).ok_or_else(|| { ErrorResponse::new( ErrorCode::CdpError, @@ -382,4 +493,40 @@ mod tests { assert!(!is_likely_stub_screenshot(67)); assert!(!is_likely_stub_screenshot(2_048)); } + + #[tokio::test] + async fn path_output_requires_configured_directory() { + let server = CrwBrowse::new(crate::server::BrowseConfig::default()); + let err = resolve_screenshot_path(&server, "shot.png", "png") + .await + .expect_err("path output without screenshot_dir must fail"); + assert_eq!(err.code, ErrorCode::InvalidArgs); + } + + #[tokio::test] + async fn path_output_rejects_absolute_traversal_and_extension_mismatch() { + let dir = tempfile::tempdir().unwrap(); + let config = crate::server::BrowseConfig { + screenshot_dir: Some(dir.path().to_path_buf()), + ..crate::server::BrowseConfig::default() + }; + let server = CrwBrowse::new(config); + + for bad in [ + "/tmp/shot.png", + "../shot.png", + "nested/shot.png", + "shot.jpg", + ] { + let err = resolve_screenshot_path(&server, bad, "png") + .await + .expect_err(bad); + assert_eq!(err.code, ErrorCode::InvalidArgs); + } + + let ok = resolve_screenshot_path(&server, "shot.png", "png") + .await + .expect("single filename inside screenshot_dir"); + assert_eq!(ok, dir.path().join("shot.png")); + } } diff --git a/crates/crw-cli/src/commands/browse.rs b/crates/crw-cli/src/commands/browse.rs index d20b05d..9b27193 100644 --- a/crates/crw-cli/src/commands/browse.rs +++ b/crates/crw-cli/src/commands/browse.rs @@ -19,6 +19,11 @@ pub struct BrowseArgs { /// those tools return `NOT_IMPLEMENTED`. #[arg(long, env = "CRW_BROWSE_CHROME_WS_URL")] pub chrome_ws_url: Option, + + /// Directory where screenshot(path=...) may create image artifacts. + /// If unset, screenshot output must be returned inline as base64. + #[arg(long, env = "CRW_BROWSE_SCREENSHOT_DIR")] + pub screenshot_dir: Option, } pub async fn run(args: BrowseArgs) -> anyhow::Result<()> { @@ -34,6 +39,7 @@ pub async fn run(args: BrowseArgs) -> anyhow::Result<()> { ws_url: args.ws_url.clone(), page_timeout: Duration::from_millis(args.page_timeout_ms), chrome_ws_url: args.chrome_ws_url, + screenshot_dir: args.screenshot_dir, }; tracing::info!(ws_url = %config.ws_url, "starting crw browse"); diff --git a/crates/crw-core/Cargo.toml b/crates/crw-core/Cargo.toml index 5346a5b..13ead64 100644 --- a/crates/crw-core/Cargo.toml +++ b/crates/crw-core/Cargo.toml @@ -30,6 +30,7 @@ uuid = { workspace = true } tracing = { workspace = true } reqwest = { workspace = true } prometheus = { workspace = true } +tokio = { workspace = true } [dev-dependencies] serde_json = { workspace = true } diff --git a/crates/crw-core/src/url_safety.rs b/crates/crw-core/src/url_safety.rs index d07e66a..70cabc3 100644 --- a/crates/crw-core/src/url_safety.rs +++ b/crates/crw-core/src/url_safety.rs @@ -54,10 +54,8 @@ pub fn validate_safe_url(url: &url::Url) -> Result<(), String> { .host_str() .ok_or_else(|| "URL has no host".to_string())?; - // Block localhost by name - let host_lower = host.to_lowercase(); - if !test_allow_loopback && (host_lower == "localhost" || host_lower.ends_with(".localhost")) { - return Err("Localhost URLs are not allowed".into()); + if !test_allow_loopback && is_blocked_host_name(host) { + return Err("host is not allowed".into()); } // Parse as IP if possible and check ranges @@ -80,12 +78,77 @@ pub fn validate_safe_url(url: &url::Url) -> Result<(), String> { Ok(()) } +/// Validate the URL and resolve DNS for hostname targets, rejecting any private, +/// loopback, link-local, or otherwise non-public address. This is intentionally +/// fail-closed: if DNS cannot be resolved, callers cannot prove the destination +/// is public and must not fetch it. +pub async fn validate_safe_url_resolved(url: &url::Url) -> Result<(), String> { + validate_safe_url(url)?; + + let test_allow_loopback = std::env::var("CRW_ALLOW_LOOPBACK_FOR_TESTS").as_deref() == Ok("1"); + if test_allow_loopback { + return Ok(()); + } + + let host = url + .host_str() + .ok_or_else(|| "URL has no host".to_string())?; + let stripped = host.trim_start_matches('[').trim_end_matches(']'); + if stripped.parse::().is_ok() { + return Ok(()); + } + let port = url + .port_or_known_default() + .ok_or_else(|| "URL has no resolvable port".to_string())?; + let addrs = tokio::net::lookup_host((stripped, port)) + .await + .map_err(|_| "DNS resolution failed".to_string())?; + validate_resolved_ips(addrs.map(|addr| addr.ip())) +} + +pub fn validate_resolved_ips(ips: I) -> Result<(), String> +where + I: IntoIterator, +{ + let mut saw_ip = false; + for ip in ips { + saw_ip = true; + if is_blocked_ip(&ip) { + return Err(format!("Access to {ip} is not allowed")); + } + } + if !saw_ip { + return Err("DNS resolution returned no addresses".into()); + } + Ok(()) +} + +fn is_blocked_host_name(host: &str) -> bool { + let host_lower = host.to_lowercase(); + let host_lower = host_lower.trim_end_matches('.'); + host_lower == "localhost" + || host_lower == "metadata.google.internal" + || host_lower.ends_with(".localhost") + || host_lower.ends_with(".localtest.me") + || host_lower.ends_with(".lvh.me") + || host_lower.ends_with(".nip.io") + || host_lower.ends_with(".xip.io") + || host_lower.ends_with(".sslip.io") +} + fn is_blocked_ipv4(v4: &std::net::Ipv4Addr) -> bool { + let [a, b, _, _] = v4.octets(); v4.is_loopback() // 127.0.0.0/8 - || v4.is_private() // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 - || v4.is_link_local() // 169.254.0.0/16 (AWS metadata) - || v4.is_unspecified() // 0.0.0.0 - || v4.is_broadcast() // 255.255.255.255 + || v4.is_private() // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 + || v4.is_link_local() // 169.254.0.0/16 (metadata) + || v4.is_unspecified() // 0.0.0.0 + || v4.is_broadcast() // 255.255.255.255 + || (a == 100 && (64..=127).contains(&b)) // carrier-grade NAT + || a == 0 + || a >= 224 // multicast/reserved + || (a == 192 && b == 0) + || (a == 198 && (b == 18 || b == 19 || b == 51)) + || (a == 203 && b == 0) } fn is_blocked_ip(ip: &IpAddr) -> bool { @@ -100,6 +163,9 @@ fn is_blocked_ip(ip: &IpAddr) -> bool { || (v6.segments()[0] & 0xffc0) == 0xfe80 // IPv6 unique-local / ULA (fc00::/7) — private network equivalent || (v6.segments()[0] & 0xfe00) == 0xfc00 + // IPv6 multicast and special/reserved ranges. + || (v6.segments()[0] & 0xff00) == 0xff00 + || (v6.segments()[0] & 0xff00) == 0x0200 } } } @@ -130,6 +196,8 @@ mod tests { assert!(validate_safe_url(&url("http://localhost:8080")).is_err()); assert!(validate_safe_url(&url("http://127.0.0.1")).is_err()); assert!(validate_safe_url(&url("http://127.0.0.1:9999")).is_err()); + assert!(validate_safe_url(&url("http://metadata.google.internal")).is_err()); + assert!(validate_safe_url(&url("http://127.0.0.1.nip.io")).is_err()); } #[test] @@ -147,6 +215,8 @@ mod tests { #[test] fn blocks_zero_ip() { assert!(validate_safe_url(&url("http://0.0.0.0")).is_err()); + assert!(validate_safe_url(&url("http://100.64.0.1")).is_err()); + assert!(validate_safe_url(&url("http://224.0.0.1")).is_err()); } #[test] @@ -193,4 +263,17 @@ mod tests { // is tested via integration tests with actual HTTP redirects). let _policy = super::safe_redirect_policy(); } + + #[test] + fn resolved_ips_fail_closed_for_private_or_empty_answers() { + assert!(validate_resolved_ips([IpAddr::from([93, 184, 216, 34])]).is_ok()); + assert!( + validate_resolved_ips([ + IpAddr::from([93, 184, 216, 34]), + IpAddr::from([10, 0, 0, 1]) + ]) + .is_err() + ); + assert!(validate_resolved_ips([]).is_err()); + } } diff --git a/crates/crw-crawl/src/crawl.rs b/crates/crw-crawl/src/crawl.rs index 39106d6..ad7e01e 100644 --- a/crates/crw-crawl/src/crawl.rs +++ b/crates/crw-crawl/src/crawl.rs @@ -131,10 +131,12 @@ async fn run_crawl_inner(opts: CrawlOptions<'_>) { }; let base_url = match url::Url::parse(&req.url) { - Ok(u) if is_safe_url(&u) => u, - Ok(_) => { - send_failed(id, &state_tx, "Only http/https URLs are allowed".into()); - return; + Ok(u) => { + if let Err(e) = crw_core::url_safety::validate_safe_url_resolved(&u).await { + send_failed(id, &state_tx, e); + return; + } + u } Err(e) => { send_failed(id, &state_tx, format!("Invalid URL: {e}")); @@ -433,11 +435,9 @@ pub async fn discover_urls(opts: DiscoverOptions<'_>) -> CrwResult) -> CrwResult break, }; // Per-host limiter handled in FallbackRenderer (see run_crawl note). + if let Ok(parsed) = url::Url::parse(&url) + && crw_core::url_safety::validate_safe_url_resolved(&parsed) + .await + .is_err() + { + continue; + } let discover_deadline = crw_core::Deadline::from_request_ms(deadline_ms_per_page); let fetch = renderer diff --git a/crates/crw-server/src/routes/crawl.rs b/crates/crw-server/src/routes/crawl.rs index e008fad..a473faa 100644 --- a/crates/crw-server/src/routes/crawl.rs +++ b/crates/crw-server/src/routes/crawl.rs @@ -17,7 +17,9 @@ pub async fn start_crawl( let Json(req) = body.map_err(AppError::from)?; let parsed_url = url::Url::parse(&req.url) .map_err(|e| CrwError::InvalidRequest(format!("Invalid URL: {e}")))?; - crw_core::url_safety::validate_safe_url(&parsed_url).map_err(CrwError::InvalidRequest)?; + crw_core::url_safety::validate_safe_url_resolved(&parsed_url) + .await + .map_err(CrwError::InvalidRequest)?; validate_crawl_renderer(&req, &state)?; diff --git a/crates/crw-server/src/routes/map.rs b/crates/crw-server/src/routes/map.rs index c55d12d..e040801 100644 --- a/crates/crw-server/src/routes/map.rs +++ b/crates/crw-server/src/routes/map.rs @@ -70,7 +70,9 @@ pub async fn map( let Json(req) = body.map_err(AppError::from)?; let parsed_url = url::Url::parse(&req.url) .map_err(|e| CrwError::InvalidRequest(format!("Invalid URL: {e}")))?; - crw_core::url_safety::validate_safe_url(&parsed_url).map_err(CrwError::InvalidRequest)?; + crw_core::url_safety::validate_safe_url_resolved(&parsed_url) + .await + .map_err(CrwError::InvalidRequest)?; check_cap("extra_tracking_params", &req.extra_tracking_params)?; check_cap("extra_action_params", &req.extra_action_params)?; diff --git a/crates/crw-server/src/routes/mcp.rs b/crates/crw-server/src/routes/mcp.rs index 81da3ca..54ce11c 100644 --- a/crates/crw-server/src/routes/mcp.rs +++ b/crates/crw-server/src/routes/mcp.rs @@ -16,9 +16,9 @@ const SERVER_NAME: &str = "crw"; const SERVER_VERSION: &str = env!("CARGO_PKG_VERSION"); /// Validate URL safety for MCP tool calls (same checks as REST API routes). -pub fn validate_url(url: &str) -> Result<(), String> { +pub async fn validate_url(url: &str) -> Result<(), String> { let parsed = url::Url::parse(url).map_err(|e| format!("Invalid URL: {e}"))?; - crw_core::url_safety::validate_safe_url(&parsed) + crw_core::url_safety::validate_safe_url_resolved(&parsed).await } pub async fn call_tool(state: &AppState, tool_name: &str, args: Value) -> Result { @@ -26,7 +26,7 @@ pub async fn call_tool(state: &AppState, tool_name: &str, args: Value) -> Result "crw_scrape" => { let req: ScrapeRequest = serde_json::from_value(args).map_err(|e| format!("invalid arguments: {e}"))?; - validate_url(&req.url)?; + validate_url(&req.url).await?; let llm_config = state.config.extraction.llm.as_ref(); let user_agent = &state.config.crawler.user_agent; let default_stealth = @@ -53,7 +53,7 @@ pub async fn call_tool(state: &AppState, tool_name: &str, args: Value) -> Result "crw_crawl" => { let req: CrawlRequest = serde_json::from_value(args).map_err(|e| format!("invalid arguments: {e}"))?; - validate_url(&req.url)?; + validate_url(&req.url).await?; validate_crawl_renderer(&req, state).map_err(|e| format!("{e}"))?; let id = state.start_crawl_job(req).await; Ok(json!({"success": true, "id": id.to_string()})) @@ -74,7 +74,7 @@ pub async fn call_tool(state: &AppState, tool_name: &str, args: Value) -> Result "crw_map" => { let req: MapRequest = serde_json::from_value(args).map_err(|e| format!("invalid arguments: {e}"))?; - validate_url(&req.url)?; + validate_url(&req.url).await?; let max_depth = req .max_depth .unwrap_or(state.config.crawler.default_max_depth); diff --git a/crates/crw-server/src/routes/scrape.rs b/crates/crw-server/src/routes/scrape.rs index 6d40f75..a392536 100644 --- a/crates/crw-server/src/routes/scrape.rs +++ b/crates/crw-server/src/routes/scrape.rs @@ -16,7 +16,9 @@ pub async fn scrape( let Json(req) = body.map_err(AppError::from)?; let parsed_url = url::Url::parse(&req.url) .map_err(|e| CrwError::InvalidRequest(format!("Invalid URL: {e}")))?; - crw_core::url_safety::validate_safe_url(&parsed_url).map_err(CrwError::InvalidRequest)?; + crw_core::url_safety::validate_safe_url_resolved(&parsed_url) + .await + .map_err(CrwError::InvalidRequest)?; validate_renderer_pin(req.renderer, req.render_js, &state)?; let llm_config = state.config.extraction.llm.as_ref(); diff --git a/crates/crw-server/src/routes/search.rs b/crates/crw-server/src/routes/search.rs index 1e4e798..3cdf0bf 100644 --- a/crates/crw-server/src/routes/search.rs +++ b/crates/crw-server/src/routes/search.rs @@ -393,7 +393,10 @@ async fn enrich_with_scrape( Ok(u) => u, Err(_) => continue, }; - if crw_core::url_safety::validate_safe_url(&parsed).is_err() { + if crw_core::url_safety::validate_safe_url_resolved(&parsed) + .await + .is_err() + { continue; } jobs.push((idx, r.url.clone())); From b08ead2a4f27f8297976d65ce656df06d84efb10 Mon Sep 17 00:00:00 2001 From: us Date: Sat, 30 May 2026 15:33:25 +0300 Subject: [PATCH 2/2] fix(browse): address outbound hardening review feedback --- crates/crw-browse/src/tools/goto.rs | 51 +++++++++++++++------ crates/crw-browse/src/tools/screenshot.rs | 56 ++++++++++++++++++++++- crates/crw-core/src/url_safety.rs | 38 ++++++++++++++- 3 files changed, 129 insertions(+), 16 deletions(-) diff --git a/crates/crw-browse/src/tools/goto.rs b/crates/crw-browse/src/tools/goto.rs index f3f5658..cb50712 100644 --- a/crates/crw-browse/src/tools/goto.rs +++ b/crates/crw-browse/src/tools/goto.rs @@ -173,6 +173,7 @@ pub(crate) async fn run_outbound_guard( cdp_session_id: &str, ) { use tokio::sync::broadcast::error::RecvError; + let concurrency = std::sync::Arc::new(tokio::sync::Semaphore::new(32)); let cmd_timeout = Duration::from_secs(2); loop { let ev = match events.recv().await { @@ -191,25 +192,49 @@ pub(crate) async fn run_outbound_guard( if request_id.is_empty() { continue; } + let permit = match concurrency.clone().try_acquire_owned() { + Ok(permit) => permit, + Err(_) => { + let _ = conn + .send_recv( + "Fetch.failRequest", + serde_json::json!({ + "requestId": request_id, + "errorReason": "BlockedByClient", + }), + Some(cdp_session_id), + cmd_timeout, + ) + .await; + continue; + } + }; let req_url = ev .params .get("request") .and_then(|r| r.get("url")) .and_then(|v| v.as_str()) .unwrap_or(""); - let method = if validate_goto_url_resolved(req_url).await.is_ok() { - "Fetch.continueRequest" - } else { - "Fetch.failRequest" - }; - let params = if method == "Fetch.continueRequest" { - serde_json::json!({ "requestId": request_id }) - } else { - serde_json::json!({ "requestId": request_id, "errorReason": "BlockedByClient" }) - }; - let _ = conn - .send_recv(method, params, Some(cdp_session_id), cmd_timeout) - .await; + let request_id = request_id.to_string(); + let req_url = req_url.to_string(); + let conn = conn.clone(); + let cdp_session_id = cdp_session_id.to_string(); + tokio::spawn(async move { + let _permit = permit; + let method = if validate_goto_url_resolved(&req_url).await.is_ok() { + "Fetch.continueRequest" + } else { + "Fetch.failRequest" + }; + let params = if method == "Fetch.continueRequest" { + serde_json::json!({ "requestId": request_id }) + } else { + serde_json::json!({ "requestId": request_id, "errorReason": "BlockedByClient" }) + }; + let _ = conn + .send_recv(method, params, Some(&cdp_session_id), cmd_timeout) + .await; + }); } } diff --git a/crates/crw-browse/src/tools/screenshot.rs b/crates/crw-browse/src/tools/screenshot.rs index 3ad0a6c..05ed6c3 100644 --- a/crates/crw-browse/src/tools/screenshot.rs +++ b/crates/crw-browse/src/tools/screenshot.rs @@ -260,6 +260,14 @@ async fn resolve_screenshot_path( format!("jpeg screenshots must use a .{expected} or .jpeg path"), )); } + if let Ok(meta) = tokio::fs::symlink_metadata(root).await + && meta.file_type().is_symlink() + { + return Err(ErrorResponse::new( + ErrorCode::InvalidArgs, + "screenshot directory must not be a symlink", + )); + } tokio::fs::create_dir_all(root).await.map_err(|e| { ErrorResponse::new( ErrorCode::Internal, @@ -269,6 +277,30 @@ async fn resolve_screenshot_path( ), ) })?; + let meta = tokio::fs::symlink_metadata(root).await.map_err(|e| { + ErrorResponse::new( + ErrorCode::Internal, + format!( + "failed to inspect screenshot directory {}: {e}", + root.display() + ), + ) + })?; + if meta.file_type().is_symlink() || !meta.is_dir() { + return Err(ErrorResponse::new( + ErrorCode::InvalidArgs, + "screenshot directory must be a real directory", + )); + } + let root = tokio::fs::canonicalize(root).await.map_err(|e| { + ErrorResponse::new( + ErrorCode::Internal, + format!( + "failed to canonicalize screenshot directory {}: {e}", + root.display() + ), + ) + })?; Ok(root.join(rel)) } @@ -527,6 +559,28 @@ mod tests { let ok = resolve_screenshot_path(&server, "shot.png", "png") .await .expect("single filename inside screenshot_dir"); - assert_eq!(ok, dir.path().join("shot.png")); + assert_eq!(ok, dir.path().canonicalize().unwrap().join("shot.png")); + } + + #[tokio::test] + async fn path_output_rejects_symlinked_directory() { + let dir = tempfile::tempdir().unwrap(); + let target = dir.path().join("target"); + let link = dir.path().join("link"); + tokio::fs::create_dir(&target).await.unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink(&target, &link).unwrap(); + #[cfg(windows)] + std::os::windows::fs::symlink_dir(&target, &link).unwrap(); + + let config = crate::server::BrowseConfig { + screenshot_dir: Some(link), + ..crate::server::BrowseConfig::default() + }; + let server = CrwBrowse::new(config); + let err = resolve_screenshot_path(&server, "shot.png", "png") + .await + .expect_err("symlinked screenshot_dir must fail"); + assert_eq!(err.code, ErrorCode::InvalidArgs); } } diff --git a/crates/crw-core/src/url_safety.rs b/crates/crw-core/src/url_safety.rs index 70cabc3..fbd89ec 100644 --- a/crates/crw-core/src/url_safety.rs +++ b/crates/crw-core/src/url_safety.rs @@ -1,4 +1,4 @@ -use std::net::IpAddr; +use std::net::{IpAddr, ToSocketAddrs}; /// Build a reqwest redirect policy that validates each redirect target /// against the SSRF safety checks. This prevents attackers from using @@ -7,7 +7,7 @@ pub fn safe_redirect_policy() -> reqwest::redirect::Policy { reqwest::redirect::Policy::custom(|attempt| { if attempt.previous().len() >= 10 { attempt.error("too many redirects") - } else if let Err(e) = validate_safe_url(attempt.url()) { + } else if let Err(e) = validate_safe_url_blocking_resolved(attempt.url()) { attempt.error(format!("redirect blocked: {e}")) } else { attempt.follow() @@ -106,6 +106,33 @@ pub async fn validate_safe_url_resolved(url: &url::Url) -> Result<(), String> { validate_resolved_ips(addrs.map(|addr| addr.ip())) } +/// Synchronous resolved validation for places that cannot await, notably +/// reqwest's redirect-policy callback. Keep use narrow; it performs DNS on the +/// current thread and intentionally fails closed. +pub fn validate_safe_url_blocking_resolved(url: &url::Url) -> Result<(), String> { + validate_safe_url(url)?; + + let test_allow_loopback = std::env::var("CRW_ALLOW_LOOPBACK_FOR_TESTS").as_deref() == Ok("1"); + if test_allow_loopback { + return Ok(()); + } + + let host = url + .host_str() + .ok_or_else(|| "URL has no host".to_string())?; + let stripped = host.trim_start_matches('[').trim_end_matches(']'); + if stripped.parse::().is_ok() { + return Ok(()); + } + let port = url + .port_or_known_default() + .ok_or_else(|| "URL has no resolvable port".to_string())?; + (stripped, port) + .to_socket_addrs() + .map_err(|_| "DNS resolution failed".to_string()) + .and_then(|addrs| validate_resolved_ips(addrs.map(|addr| addr.ip()))) +} + pub fn validate_resolved_ips(ips: I) -> Result<(), String> where I: IntoIterator, @@ -276,4 +303,11 @@ mod tests { ); assert!(validate_resolved_ips([]).is_err()); } + + #[test] + fn blocking_resolved_validation_rejects_denied_literal_ips() { + assert!(validate_safe_url_blocking_resolved(&url("http://169.254.169.254")).is_err()); + assert!(validate_safe_url_blocking_resolved(&url("http://127.0.0.1")).is_err()); + assert!(validate_safe_url_blocking_resolved(&url("http://[::1]")).is_err()); + } }