-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathwebfetch.yaml.example
More file actions
131 lines (118 loc) · 4.84 KB
/
webfetch.yaml.example
File metadata and controls
131 lines (118 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# WebFetch MCP — YAML configuration file
#
# Usage: set the environment variable WEBFETCH_CONFIG to the absolute path
# of this file (after copying it to webfetch.yaml and editing it):
#
# WEBFETCH_CONFIG=/absolute/path/to/webfetch.yaml
#
# When WEBFETCH_CONFIG is set, this file is used instead of the legacy
# WEBFETCH_HEADERS / WEBFETCH_OUTPUT environment variables.
#
# All fields under "global" and each domain entry are optional.
# Domain keys support suffix matching: "example.com" matches both
# "example.com" and "www.example.com".
# The most specific (longest) matching domain wins on conflicts.
# ---------------------------------------------------------------------------
# Global defaults — applied to every request unless overridden by a domain
# ---------------------------------------------------------------------------
global:
  # Every setting below must stay indented under `global:`. At column 0 these
  # keys would become unrelated top-level keys and `global` itself would be
  # null.

  # HTTP headers injected into every outbound request
  headers:
    User-Agent: "MyBot/1.0"

  # Default response output format.
  # Values: "raw" | "markdown" | "trafilatura" | "json"
  #   raw         — return body as-is (default)
  #   markdown    — convert HTML to Markdown (markdownify)
  #   trafilatura — extract main content as Markdown (trafilatura)
  #   json        — pretty-print JSON body (auto-detected when Content-Type
  #                 is application/json even if not set explicitly)
  output_format: raw

  # Request timeout in seconds (default: 30)
  timeout: 30

  # Retry on HTTP 5xx responses or transient network errors.
  #   attempts: total number of tries (1 = no retry, default)
  #   backoff:  delay multiplier between retries (delay starts at 1s)
  #             e.g. backoff=2.0 → waits 1s, 2s, 4s, …
  retry:
    attempts: 1
    backoff: 2.0

  # Outbound HTTP proxy URL (null = no proxy, default)
  proxy: null

  # TLS configuration — useful in enterprise environments with custom CAs or
  # TLS-intercepting proxies.
  #
  #   tls_verify:      Set to false ONLY in development environments to skip
  #                    cert verification. Always true in production.
  #                    Default: true.
  #   tls_ca_bundle:   Absolute path to a PEM CA bundle. Use when your network
  #                    uses a corporate CA not in the system trust store (e.g.
  #                    Zscaler, Palo Alto SSL inspection).
  #   tls_min_version: Minimum TLS version: "1.2" or "1.3". Default: system
  #                    default (varies by OS/Python version).
  #
  # Uncomment to enable (keep the two-space indent so the keys remain under
  # `global:`):
  #
  # tls_verify: true
  # tls_ca_bundle: "/etc/ssl/certs/corporate-ca.pem"
  # tls_min_version: "1.2"

  # SSRF / egress filtering — optional but recommended in enterprise
  # deployments.
  #
  #   allowed_domains: if non-empty, ONLY these domains (and their subdomains)
  #                    may be fetched. All other hosts are blocked with a
  #                    ValueError.
  #   denied_domains:  if non-empty, requests to these domains (and their
  #                    subdomains) are always blocked, regardless of
  #                    allowed_domains.
  #
  # Both lists use suffix matching: "example.com" matches "example.com" and
  # "api.example.com" but NOT "notexample.com".
  #
  # Note: loopback (127.x, localhost), link-local (169.254.x — AWS metadata),
  # and all RFC-1918 private ranges are ALWAYS blocked regardless of these
  # lists.
  #
  # allowed_domains:
  #   - "api.mycompany.com"
  #   - "docs.mycompany.com"
  # denied_domains:
  #   - "internal.corp"

  # Headless browser rendering — executes JavaScript before returning HTML.
  #
  # When true, pages are fetched using Playwright (headless Chromium) instead
  # of plain HTTP. The browser executes all JavaScript, waits for the network
  # to be idle, then returns the fully rendered HTML. All output format
  # options (trafilatura, markdown, css_selector, etc.) still apply to the
  # rendered HTML.
  #
  # Default: false (use plain HTTP via httpx — no JavaScript execution)
  #
  # Requires: pip install playwright && playwright install chromium
  #
  # render_js: false
# ---------------------------------------------------------------------------
# Per-domain overrides
# All fields are optional — only the fields present override the global value.
# ---------------------------------------------------------------------------
domains:
  # Each key under `domains:` is a domain name; its settings must be indented
  # beneath that key. Keys use suffix matching, and when several keys match a
  # host the longest (most specific) one wins.
  #
  # Header values below are placeholders. Replace them in your private copy
  # and avoid committing real credentials to version control.

  # example.com and any subdomain (e.g. www.example.com)
  example.com:
    headers:
      X-Auth-Token: "token-for-example"
    output_format: trafilatura
    timeout: 60
    retry:
      attempts: 3
      backoff: 2.0

  # News sites — convert to clean Markdown
  news.com:
    output_format: markdown

  # A domain behind a corporate proxy
  internal.corp:
    proxy: "http://proxy.corp:8080"
    headers:
      Authorization: "Bearer my-internal-token"

  # A JSON API endpoint — pretty-print output.
  # Longer than "example.com", so this entry is the one that wins for
  # api.example.com.
  api.example.com:
    output_format: json
    timeout: 10
    retry:
      attempts: 5
      backoff: 1.5

  # A JavaScript-heavy SPA — use headless browser to render before extracting
  spa.example.com:
    render_js: true
    output_format: trafilatura
    timeout: 60