From baed9e36f1fce152d045de0f68d3f0389e692cf1 Mon Sep 17 00:00:00 2001 From: Dimitrie Stefanescu Date: Mon, 22 Jun 2026 20:09:38 +0100 Subject: [PATCH] feat: send cookies with every request during a clone Add a --cookie flag to `kage clone` that attaches cookies to all four request paths a clone makes, so a site behind a login or a region/cookie wall can be mirrored: - Chrome page navigations (seeded into the browser before first load) - asset downloads - the robots.txt fetch - the sitemap.xml fetch The flag takes a single name=value pair, repeats for several cookies, or accepts a whole Cookie header in one value ("a=1; b=2") pasted straight from devtools. Browser cookies are scoped to the seed host and its subdomains; the HTTP fetches send the same header. Off-domain assets are left on the live web by default, so the cookies are not leaked to third-party hosts. Tests cover the header serialisation, the downloader/sitemap sending the cookie, the Chrome cookie-param building, and the CLI flag parsing (including malformed input). README, CLI reference, and CHANGELOG updated. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 7 ++++ README.md | 1 + asset/download.go | 8 +++-- asset/download_test.go | 38 ++++++++++++++++++++ browser/cookie_test.go | 26 ++++++++++++++ browser/pool.go | 48 +++++++++++++++++++++++++ cli/clone.go | 31 ++++++++++++++++ cli/clone_test.go | 67 +++++++++++++++++++++++++++++++++++ clone/cloner.go | 24 +++++++++++-- clone/config.go | 31 +++++++++++++++- clone/cookie_test.go | 46 ++++++++++++++++++++++++ clone/sitemap.go | 9 +++-- docs/content/reference/cli.md | 1 + 13 files changed, 329 insertions(+), 8 deletions(-) create mode 100644 browser/cookie_test.go create mode 100644 cli/clone_test.go create mode 100644 clone/cookie_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 0246fe2..714f26e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,13 @@ All notable changes to kage are recorded here. 
The format follows ## [Unreleased] +### Added + +- `kage clone --cookie` sends cookies with every request the crawl makes, so a site behind a login or a region/cookie wall can be cloned. + The cookies are attached to all four request paths: the Chrome page navigations (seeded into the browser before the first load), the asset downloads, and the `robots.txt` and `sitemap.xml` fetches — so the authenticated session is consistent across the whole run rather than only the pages. + Pass a single `name=value` pair, repeat the flag for several cookies, or paste a whole `Cookie` header straight from the browser's devtools in one value (`--cookie "session=abc; theme=dark"`). + Browser cookies are scoped to the seed host and its subdomains; the HTTP fetches send the same header, and because off-domain assets are left on the live web by default the cookies are not leaked to third-party hosts. + ## [0.3.6] - 2026-06-19 ### Fixed diff --git a/README.md b/README.md index 76195d2..488dddf 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,7 @@ The flags you'll actually reach for: | `--workers` | `4` | How many pages to render at once | | `--no-robots` | `false` | Ignore `robots.txt` (be nice) | | `--crawl-delay` | `0s` | Override robots.txt `Crawl-delay` between page starts | +| `--cookie` | | Cookie sent with every request, as `name=value` (repeatable, or a whole `"a=1; b=2"` header) — for login- or region-gated sites | | `-f, --force` | `false` | Delete any existing mirror for the host first | | `--chrome` | | Path to the Chrome/Chromium binary | diff --git a/asset/download.go b/asset/download.go index b8c2112..7ac62c8 100644 --- a/asset/download.go +++ b/asset/download.go @@ -18,8 +18,9 @@ import ( type Downloader struct { Client *http.Client UserAgent string - MaxBytes int64 // per-asset cap; 0 = unlimited - Retries int // extra attempts for a transient failure (0 = try once) + Cookie string // value for the Cookie request header; empty sends none + MaxBytes int64 // 
per-asset cap; 0 = unlimited + Retries int // extra attempts for a transient failure (0 = try once) } // NewDownloader builds a Downloader with a sane client and the given timeout. @@ -105,6 +106,9 @@ func (d *Downloader) try(ctx context.Context, u *url.URL, referer string) (*Resu if d.UserAgent != "" { req.Header.Set("User-Agent", d.UserAgent) } + if d.Cookie != "" { + req.Header.Set("Cookie", d.Cookie) + } if referer != "" { req.Header.Set("Referer", referer) } diff --git a/asset/download_test.go b/asset/download_test.go index 83fd70d..866e97a 100644 --- a/asset/download_test.go +++ b/asset/download_test.go @@ -51,6 +51,44 @@ func TestGetRetriesTransientThenSucceeds(t *testing.T) { } } +func TestGetSendsCookieHeader(t *testing.T) { + var got string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + got = r.Header.Get("Cookie") + w.Header().Set("Content-Type", "text/css") + _, _ = w.Write([]byte("body{}")) + })) + defer srv.Close() + + d := NewDownloader("kage-test", 5*time.Second, 0) + d.Cookie = "session=abc; theme=dark" + u, _ := url.Parse(srv.URL + "/style.css") + if _, err := d.Get(context.Background(), u, ""); err != nil { + t.Fatalf("Get: %v", err) + } + if got != "session=abc; theme=dark" { + t.Errorf("Cookie header = %q; want %q", got, "session=abc; theme=dark") + } +} + +func TestGetSendsNoCookieByDefault(t *testing.T) { + var got string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + got = r.Header.Get("Cookie") + _, _ = w.Write([]byte("x")) + })) + defer srv.Close() + + d := NewDownloader("kage-test", 5*time.Second, 0) + u, _ := url.Parse(srv.URL + "/x.png") + if _, err := d.Get(context.Background(), u, ""); err != nil { + t.Fatalf("Get: %v", err) + } + if got != "" { + t.Errorf("expected no Cookie header, got %q", got) + } +} + func TestGetDoesNotRetryPermanent(t *testing.T) { var hits int32 srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r 
*http.Request) { diff --git a/browser/cookie_test.go b/browser/cookie_test.go new file mode 100644 index 0000000..8d2d7cb --- /dev/null +++ b/browser/cookie_test.go @@ -0,0 +1,26 @@ +package browser + +import "testing" + +func TestCookieParams(t *testing.T) { + got := cookieParams([]Cookie{ + {Name: "session", Value: "abc", Domain: "example.com"}, + {Name: "", Value: "skipme", Domain: "example.com"}, // dropped: empty name + {Name: "theme", Value: "dark", Domain: "example.com"}, + }) + if len(got) != 2 { + t.Fatalf("got %d params; want 2", len(got)) + } + if got[0].Name != "session" || got[0].Value != "abc" || got[0].Domain != "example.com" || got[0].Path != "/" { + t.Errorf("param[0] = %+v; want session/abc/example.com//", got[0]) + } + if got[1].Name != "theme" { + t.Errorf("param[1].Name = %q; want theme", got[1].Name) + } +} + +func TestCookieParamsEmpty(t *testing.T) { + if got := cookieParams(nil); got != nil && len(got) != 0 { + t.Errorf("cookieParams(nil) = %v; want empty", got) + } +} diff --git a/browser/pool.go b/browser/pool.go index 5a8d289..3bd93d7 100644 --- a/browser/pool.go +++ b/browser/pool.go @@ -20,6 +20,14 @@ import ( "github.com/go-rod/stealth" ) +// Cookie is a cookie injected into Chrome before any navigation, so login- or +// region-gated pages render the way the authenticated user would see them. +type Cookie struct { + Name string + Value string + Domain string // host the cookie applies to; subdomains are covered too +} + // Options configure a Pool. type Options struct { Headless bool // run Chrome without a window @@ -29,6 +37,7 @@ type Options struct { Scroll bool // auto-scroll to trigger lazy-loaded media ChromeBin string // explicit binary; empty = autodetect ControlURL string // attach to an existing Chrome instead of launching + Cookies []Cookie // cookies seeded into Chrome before navigation } // DefaultOptions returns the baseline render settings. 
@@ -207,6 +216,13 @@ func (p *Pool) getBrowser() (*rod.Browser, error) { return nil, fmt.Errorf("connect Chrome: %w", err) } + // Seed any configured cookies before the first navigation so a login- or + // region-gated site renders as the authenticated user, not the logged-out + // wall. Scoped to the seed host by the caller. + if err := setCookies(b, p.opts.Cookies); err != nil { + return nil, fmt.Errorf("set cookies: %w", err) + } + // kage never wants Chrome to write a file to disk. Every asset is fetched // through kage's own downloader, which applies the size and media policy, so a // Chrome-initiated download is only ever an accident: navigating an link @@ -225,6 +241,38 @@ func (p *Pool) getBrowser() (*rod.Browser, error) { return b, nil } +// setCookies seeds Chrome with the given cookies so the very first navigation is +// already authenticated. An empty list is a no-op (and never clears Chrome's +// cookies — rod treats a nil slice as "clear all", which is not what an absent +// --cookie flag should mean). Each cookie is scoped to its Domain with a root +// path so every page under that host receives it. +func setCookies(b *rod.Browser, cookies []Cookie) error { + params := cookieParams(cookies) + if len(params) == 0 { + return nil + } + return b.SetCookies(params) +} + +// cookieParams builds the Chrome cookie parameters for the given cookies, +// scoping each to its Domain with a root path so every page under that host +// receives it. Entries with an empty name are dropped. +func cookieParams(cookies []Cookie) []*proto.NetworkCookieParam { + params := make([]*proto.NetworkCookieParam, 0, len(cookies)) + for _, ck := range cookies { + if ck.Name == "" { + continue + } + params = append(params, &proto.NetworkCookieParam{ + Name: ck.Name, + Value: ck.Value, + Domain: ck.Domain, + Path: "/", + }) + } + return params +} + // Close shuts down the managed Chrome process. 
func (p *Pool) Close() error { p.mu.Lock() diff --git a/cli/clone.go b/cli/clone.go index a3b402a..1fd55d1 100644 --- a/cli/clone.go +++ b/cli/clone.go @@ -33,6 +33,7 @@ type cloneFlags struct { renderTO time.Duration scroll bool userAgent string + cookies []string subdomains bool scopePrefix string exclude []string @@ -81,6 +82,7 @@ func newCloneCmd() *cobra.Command { fs.DurationVar(&f.renderTO, "render-timeout", 30*time.Second, "hard cap per page render") fs.BoolVar(&f.scroll, "scroll", false, "auto-scroll each page to trigger lazy loading") fs.StringVar(&f.userAgent, "user-agent", clone.DefaultUserAgent, "User-Agent for asset and robots fetches") + fs.StringArrayVar(&f.cookies, "cookie", nil, "cookie to send with every request, as name=value; pass a whole header (\"a=1; b=2\") or repeat the flag") fs.BoolVar(&f.subdomains, "subdomains", false, "treat subdomains of the seed host as in scope") fs.StringVar(&f.scopePrefix, "scope-prefix", "", "only crawl pages whose path starts with this prefix") fs.StringSliceVar(&f.exclude, "exclude", nil, "path prefixes to skip (repeatable)") @@ -140,6 +142,11 @@ func runClone(ctx context.Context, arg string, f *cloneFlags) error { cfg.RenderTimeout = f.renderTO cfg.Scroll = f.scroll cfg.UserAgent = f.userAgent + cookies, err := parseCookies(f.cookies) + if err != nil { + return err + } + cfg.Cookies = cookies cfg.IncludeSubdomains = f.subdomains cfg.ScopePrefix = f.scopePrefix cfg.ExcludePaths = f.exclude @@ -205,6 +212,30 @@ func runClone(ctx context.Context, arg string, f *cloneFlags) error { return nil } +// parseCookies turns the repeated --cookie flag values into clone.Cookie pairs. +// Each value is either a single "name=value" pair or a whole Cookie header +// ("a=1; b=2"), so a user can paste one straight from the browser's devtools or +// build the list up flag by flag. Whitespace around names and separators is +// trimmed; an entry with no "=" or an empty name is a usage error. 
+func parseCookies(values []string) ([]clone.Cookie, error) { + var out []clone.Cookie + for _, v := range values { + for _, pair := range strings.Split(v, ";") { + pair = strings.TrimSpace(pair) + if pair == "" { + continue + } + name, val, ok := strings.Cut(pair, "=") + name = strings.TrimSpace(name) + if !ok || name == "" { + return nil, fmt.Errorf("invalid --cookie %q: want name=value", pair) + } + out = append(out, clone.Cookie{Name: name, Value: strings.TrimSpace(val)}) + } + } + return out, nil +} + // progressLine renders the single-line live counter. "pages" is the count of // real pages (distinct paths); when a faceted site spawns query-string variants // they are shown separately so the page number stays easy to read. diff --git a/cli/clone_test.go b/cli/clone_test.go new file mode 100644 index 0000000..6266ed6 --- /dev/null +++ b/cli/clone_test.go @@ -0,0 +1,67 @@ +package cli + +import ( + "testing" + + "github.com/tamnd/kage/clone" +) + +func TestParseCookies(t *testing.T) { + cases := []struct { + name string + in []string + want []clone.Cookie + }{ + {"nil", nil, nil}, + {"single pair", []string{"session=abc"}, []clone.Cookie{{Name: "session", Value: "abc"}}}, + { + "header form splits on semicolons", + []string{"a=1; b=2"}, + []clone.Cookie{{Name: "a", Value: "1"}, {Name: "b", Value: "2"}}, + }, + { + "repeated flag accumulates", + []string{"a=1", "b=2"}, + []clone.Cookie{{Name: "a", Value: "1"}, {Name: "b", Value: "2"}}, + }, + {"value may contain equals", []string{"token=ab=cd"}, []clone.Cookie{{Name: "token", Value: "ab=cd"}}}, + {"trims whitespace", []string{" a = 1 "}, []clone.Cookie{{Name: "a", Value: "1"}}}, + {"empty value allowed", []string{"a="}, []clone.Cookie{{Name: "a", Value: ""}}}, + {"trailing semicolon ignored", []string{"a=1;"}, []clone.Cookie{{Name: "a", Value: "1"}}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := parseCookies(tc.in) + if err != nil { + t.Fatalf("parseCookies(%q): %v", 
tc.in, err) + } + if !equalCookies(got, tc.want) { + t.Errorf("parseCookies(%q) = %v; want %v", tc.in, got, tc.want) + } + }) + } +} + +func TestParseCookiesErrors(t *testing.T) { + for _, in := range [][]string{ + {"noequalssign"}, + {"=novalue"}, + {"a=1; =bad"}, + } { + if _, err := parseCookies(in); err == nil { + t.Errorf("parseCookies(%q) = nil error; want error", in) + } + } +} + +func equalCookies(a, b []clone.Cookie) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} diff --git a/clone/cloner.go b/clone/cloner.go index 2940e77..02b5ae5 100644 --- a/clone/cloner.go +++ b/clone/cloner.go @@ -73,13 +73,15 @@ func New(seed *url.URL, cfg Config, logf Logf) *Cloner { } host := seed.Hostname() outRoot := cfg.HostDir(host) + dl := asset.NewDownloader(cfg.UserAgent, cfg.Timeout, cfg.MaxAssetBytes) + dl.Cookie = cfg.CookieHeader() return &Cloner{ cfg: cfg, seed: seed, seedHost: host, outRoot: outRoot, statePth: filepath.Join(outRoot, cfg.Reserved, "state.json"), - dl: asset.NewDownloader(cfg.UserAgent, cfg.Timeout, cfg.MaxAssetBytes), + dl: dl, httpC: &http.Client{Timeout: cfg.Timeout}, robots: robots.AllowAll(), front: newFrontier(), @@ -144,6 +146,7 @@ func (c *Cloner) Run(ctx context.Context) (Result, error) { Scroll: c.cfg.Scroll, ChromeBin: c.cfg.ChromeBin, ControlURL: c.cfg.ControlURL, + Cookies: c.browserCookies(), }) defer func() { _ = c.pool.Close() }() @@ -208,6 +211,9 @@ func (c *Cloner) loadRobots(ctx context.Context) { return } req.Header.Set("User-Agent", c.cfg.UserAgent) + if h := c.cfg.CookieHeader(); h != "" { + req.Header.Set("Cookie", h) + } resp, err := c.httpC.Do(req) if err != nil { return @@ -241,7 +247,7 @@ func (c *Cloner) setupCrawlDelayLimiter() { func (c *Cloner) seedSitemaps(ctx context.Context) { seeds := append([]string{}, c.robots.Sitemaps...) 
seeds = append(seeds, c.seed.Scheme+"://"+c.seed.Host+"/sitemap.xml") - locs := collectSitemaps(ctx, c.httpC, c.cfg.UserAgent, seeds) + locs := collectSitemaps(ctx, c.httpC, c.cfg.UserAgent, c.cfg.CookieHeader(), seeds) added := 0 for _, loc := range locs { u, err := urlx.Normalize(c.seed, loc) @@ -468,6 +474,20 @@ func (c *Cloner) enqueuePage(ctx context.Context, u *url.URL, depth int) bool { return true } +// browserCookies maps the configured cookies to Chrome cookies scoped to the +// seed host (and its subdomains), so gated pages render as the authenticated +// user. It returns nil when no cookies are set, leaving Chrome's jar untouched. +func (c *Cloner) browserCookies() []browser.Cookie { + if len(c.cfg.Cookies) == 0 { + return nil + } + out := make([]browser.Cookie, 0, len(c.cfg.Cookies)) + for _, ck := range c.cfg.Cookies { + out = append(out, browser.Cookie{Name: ck.Name, Value: ck.Value, Domain: c.seedHost}) + } + return out +} + // wantAsset reports whether an asset should be downloaded and localized, or left // pointing at its live URL. kage skips two classes by default: assets on a host // outside the seed's registrable domain (a third-party tracker, an unrelated diff --git a/clone/config.go b/clone/config.go index 97d7eed..d8a4cd8 100644 --- a/clone/config.go +++ b/clone/config.go @@ -6,11 +6,21 @@ package clone import ( "os" "path/filepath" + "strings" "time" "github.com/tamnd/kage/urlx" ) +// Cookie is a single name=value cookie kage sends with every request it makes +// during a clone — the Chrome page navigations, the asset downloads, and the +// robots.txt and sitemap fetches — so a site behind a login or a cookie wall +// can still be mirrored. +type Cookie struct { + Name string + Value string +} + // DefaultOutDir is where mirrors land unless --out overrides it: a per-user data // directory ($HOME/data/kage) so clones from anywhere collect in one place, // falling back to a local kage-out when the home directory cannot be resolved. 
@@ -51,7 +61,12 @@ type Config struct { RenderTimeout time.Duration // hard cap per page render Scroll bool - UserAgent string + UserAgent string + // Cookies are sent with every request kage makes during the run (the Chrome + // page renders, the asset downloads, and the robots.txt and sitemap fetches), + // scoped to the seed host and its subdomains, so a login- or region-gated site + // can be cloned. They are empty by default. + Cookies []Cookie IncludeSubdomains bool ScopePrefix string ExcludePaths []string @@ -131,6 +146,20 @@ func DefaultConfig() Config { } } +// CookieHeader serialises the configured cookies into a value for the Cookie +// request header ("a=1; b=2"), or "" when none are set. Empty-named entries are +// skipped so a stray pair never produces a malformed header. +func (c Config) CookieHeader() string { + parts := make([]string, 0, len(c.Cookies)) + for _, ck := range c.Cookies { + if ck.Name == "" { + continue + } + parts = append(parts, ck.Name+"="+ck.Value) + } + return strings.Join(parts, "; ") +} + // HostDir returns the mirror root for a seed host: /. 
func (c Config) HostDir(host string) string { return filepath.Join(c.OutDir, host) diff --git a/clone/cookie_test.go b/clone/cookie_test.go new file mode 100644 index 0000000..31ed231 --- /dev/null +++ b/clone/cookie_test.go @@ -0,0 +1,46 @@ +package clone + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" +) + +func TestConfigCookieHeader(t *testing.T) { + cases := []struct { + name string + cookies []Cookie + want string + }{ + {"none", nil, ""}, + {"one", []Cookie{{Name: "session", Value: "abc"}}, "session=abc"}, + {"many", []Cookie{{Name: "a", Value: "1"}, {Name: "b", Value: "2"}}, "a=1; b=2"}, + {"skips empty name", []Cookie{{Name: "", Value: "x"}, {Name: "a", Value: "1"}}, "a=1"}, + {"empty value kept", []Cookie{{Name: "a", Value: ""}}, "a="}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := (Config{Cookies: tc.cookies}).CookieHeader(); got != tc.want { + t.Errorf("CookieHeader() = %q; want %q", got, tc.want) + } + }) + } +} + +func TestFetchSitemapSendsCookie(t *testing.T) { + var got string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + got = r.Header.Get("Cookie") + _, _ = w.Write([]byte(`https://x/`)) + })) + defer srv.Close() + + locs, _ := fetchSitemap(context.Background(), srv.Client(), "kage-test", "session=abc", srv.URL+"/sitemap.xml") + if got != "session=abc" { + t.Errorf("Cookie header = %q; want %q", got, "session=abc") + } + if len(locs) != 1 || locs[0] != "https://x/" { + t.Errorf("locs = %v; want [https://x/]", locs) + } +} diff --git a/clone/sitemap.go b/clone/sitemap.go index d736248..22176a0 100644 --- a/clone/sitemap.go +++ b/clone/sitemap.go @@ -21,7 +21,7 @@ type sitemapEntry struct { // fetchSitemap downloads and parses one sitemap, returning page locations and, // for a sitemap index, the child sitemap URLs. 
-func fetchSitemap(ctx context.Context, client *http.Client, ua, sitemapURL string) (locs, children []string) { +func fetchSitemap(ctx context.Context, client *http.Client, ua, cookie, sitemapURL string) (locs, children []string) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, sitemapURL, nil) if err != nil { return nil, nil @@ -29,6 +29,9 @@ func fetchSitemap(ctx context.Context, client *http.Client, ua, sitemapURL strin if ua != "" { req.Header.Set("User-Agent", ua) } + if cookie != "" { + req.Header.Set("Cookie", cookie) + } resp, err := client.Do(req) if err != nil { return nil, nil @@ -57,7 +60,7 @@ func fetchSitemap(ctx context.Context, client *http.Client, ua, sitemapURL strin // collectSitemaps walks a set of seed sitemap URLs (following index files one // level deep) and returns all discovered page locations, bounded by a deadline. -func collectSitemaps(ctx context.Context, client *http.Client, ua string, seeds []string) []string { +func collectSitemaps(ctx context.Context, client *http.Client, ua, cookie string, seeds []string) []string { ctx, cancel := context.WithTimeout(ctx, 20*time.Second) defer cancel() @@ -71,7 +74,7 @@ func collectSitemaps(ctx context.Context, client *http.Client, ua string, seeds continue } seen[sm] = true - locs, children := fetchSitemap(ctx, client, ua, sm) + locs, children := fetchSitemap(ctx, client, ua, cookie, sm) out = append(out, locs...) for _, c := range children { if !seen[c] { diff --git a/docs/content/reference/cli.md b/docs/content/reference/cli.md index c3b6f4c..7b9400a 100644 --- a/docs/content/reference/cli.md +++ b/docs/content/reference/cli.md @@ -50,6 +50,7 @@ images, and fonts, and writes a browsable mirror to `//`. 
| `--crawl-delay` | `0s` | Override robots.txt `Crawl-delay` between page starts (0 = use robots.txt) | | `--no-sitemap` | `false` | Do not seed URLs from `sitemap.xml` | | `--user-agent` | Chrome UA | User-Agent for asset and robots fetches | +| `--cookie` | none | Cookie sent with every request, as `name=value`; pass a whole header (`"a=1; b=2"`) or repeat the flag. Scoped to the seed host and its subdomains — useful for login- or region-gated sites | ### Rendering