Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ All notable changes to kage are recorded here. The format follows

## [Unreleased]

### Added

- `kage clone --cookie` sends cookies with every request the crawl makes, so a site behind a login or a region/cookie wall can be cloned.
The cookies are attached to all four request paths: the Chrome page navigations (seeded into the browser before the first load), the asset downloads, and the `robots.txt` and `sitemap.xml` fetches — so the authenticated session is consistent across the whole run rather than only the pages.
Pass a single `name=value` pair, repeat the flag for several cookies, or paste a whole `Cookie` header straight from the browser's devtools in one value (`--cookie "session=abc; theme=dark"`).
Browser cookies are scoped to the seed host and its subdomains; the HTTP fetches send the same header, and because off-domain assets are left on the live web by default, the cookies are not leaked to third-party hosts.

## [0.3.6] - 2026-06-19

### Fixed
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ The flags you'll actually reach for:
| `--workers` | `4` | How many pages to render at once |
| `--no-robots` | `false` | Ignore `robots.txt` (be nice) |
| `--crawl-delay` | `0s` | Override robots.txt `Crawl-delay` between page starts |
| `--cookie` | | Cookie sent with every request, as `name=value` (repeatable, or a whole `"a=1; b=2"` header) — for login- or region-gated sites |
| `-f, --force` | `false` | Delete any existing mirror for the host first |
| `--chrome` | | Path to the Chrome/Chromium binary |

Expand Down
8 changes: 6 additions & 2 deletions asset/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ import (
type Downloader struct {
Client *http.Client
UserAgent string
MaxBytes int64 // per-asset cap; 0 = unlimited
Retries int // extra attempts for a transient failure (0 = try once)
Cookie string // value for the Cookie request header; empty sends none
MaxBytes int64 // per-asset cap; 0 = unlimited
Retries int // extra attempts for a transient failure (0 = try once)
}

// NewDownloader builds a Downloader with a sane client and the given timeout.
Expand Down Expand Up @@ -105,6 +106,9 @@ func (d *Downloader) try(ctx context.Context, u *url.URL, referer string) (*Resu
if d.UserAgent != "" {
req.Header.Set("User-Agent", d.UserAgent)
}
if d.Cookie != "" {
req.Header.Set("Cookie", d.Cookie)
}
if referer != "" {
req.Header.Set("Referer", referer)
}
Expand Down
38 changes: 38 additions & 0 deletions asset/download_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,44 @@ func TestGetRetriesTransientThenSucceeds(t *testing.T) {
}
}

// TestGetSendsCookieHeader checks that a Downloader whose Cookie field is set
// sends exactly that value in the Cookie header of the request it issues.
func TestGetSendsCookieHeader(t *testing.T) {
	const want = "session=abc; theme=dark"

	// The handler records the Cookie header it received so the test can
	// inspect it once Get has returned.
	var seen string
	handler := func(w http.ResponseWriter, r *http.Request) {
		seen = r.Header.Get("Cookie")
		w.Header().Set("Content-Type", "text/css")
		_, _ = w.Write([]byte("body{}"))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	dl := NewDownloader("kage-test", 5*time.Second, 0)
	dl.Cookie = want
	target, _ := url.Parse(srv.URL + "/style.css")
	_, err := dl.Get(context.Background(), target, "")
	if err != nil {
		t.Fatalf("Get: %v", err)
	}
	if seen != want {
		t.Errorf("Cookie header = %q; want %q", seen, want)
	}
}

// TestGetSendsNoCookieByDefault checks that a Downloader whose Cookie field was
// never set issues its request without a Cookie header.
func TestGetSendsNoCookieByDefault(t *testing.T) {
	// The handler records whatever Cookie header arrives (expected: none).
	var seen string
	handler := func(w http.ResponseWriter, r *http.Request) {
		seen = r.Header.Get("Cookie")
		_, _ = w.Write([]byte("x"))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	dl := NewDownloader("kage-test", 5*time.Second, 0)
	target, _ := url.Parse(srv.URL + "/x.png")
	_, err := dl.Get(context.Background(), target, "")
	if err != nil {
		t.Fatalf("Get: %v", err)
	}
	if seen == "" {
		return
	}
	t.Errorf("expected no Cookie header, got %q", seen)
}

func TestGetDoesNotRetryPermanent(t *testing.T) {
var hits int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down
26 changes: 26 additions & 0 deletions browser/cookie_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package browser

import "testing"

func TestCookieParams(t *testing.T) {
got := cookieParams([]Cookie{
{Name: "session", Value: "abc", Domain: "example.com"},
{Name: "", Value: "skipme", Domain: "example.com"}, // dropped: empty name
{Name: "theme", Value: "dark", Domain: "example.com"},
})
if len(got) != 2 {
t.Fatalf("got %d params; want 2", len(got))
}
if got[0].Name != "session" || got[0].Value != "abc" || got[0].Domain != "example.com" || got[0].Path != "/" {
t.Errorf("param[0] = %+v; want session/abc/example.com//", got[0])
}
if got[1].Name != "theme" {
t.Errorf("param[1].Name = %q; want theme", got[1].Name)
}
}

// TestCookieParamsEmpty checks that cookieParams yields no parameters for a nil
// input. Either a nil or an empty non-nil slice is acceptable, so only the
// length is inspected (len of a nil slice is 0, making a separate nil check
// redundant).
func TestCookieParamsEmpty(t *testing.T) {
	if got := cookieParams(nil); len(got) != 0 {
		t.Errorf("cookieParams(nil) = %v; want empty", got)
	}
}
48 changes: 48 additions & 0 deletions browser/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ import (
"github.com/go-rod/stealth"
)

// Cookie is a cookie injected into Chrome before any navigation, so login- or
// region-gated pages render the way the authenticated user would see them.
type Cookie struct {
Name string // cookie name; cookieParams drops entries whose name is empty
Value string // cookie value, handed to Chrome as-is
Domain string // host the cookie applies to; subdomains are covered too
}

// Options configure a Pool.
type Options struct {
Headless bool // run Chrome without a window
Expand All @@ -29,6 +37,7 @@ type Options struct {
Scroll bool // auto-scroll to trigger lazy-loaded media
ChromeBin string // explicit binary; empty = autodetect
ControlURL string // attach to an existing Chrome instead of launching
Cookies []Cookie // cookies seeded into Chrome before navigation
}

// DefaultOptions returns the baseline render settings.
Expand Down Expand Up @@ -207,6 +216,13 @@ func (p *Pool) getBrowser() (*rod.Browser, error) {
return nil, fmt.Errorf("connect Chrome: %w", err)
}

// Seed any configured cookies before the first navigation so a login- or
// region-gated site renders as the authenticated user, not the logged-out
// wall. Scoped to the seed host by the caller.
if err := setCookies(b, p.opts.Cookies); err != nil {
return nil, fmt.Errorf("set cookies: %w", err)
}

// kage never wants Chrome to write a file to disk. Every asset is fetched
// through kage's own downloader, which applies the size and media policy, so a
// Chrome-initiated download is only ever an accident: navigating an <a> link
Expand All @@ -225,6 +241,38 @@ func (p *Pool) getBrowser() (*rod.Browser, error) {
return b, nil
}

// setCookies installs the given cookies in the browser so that the first
// navigation already carries them. Nothing is sent to Chrome when the list
// yields no usable parameters: rod treats a nil slice as "clear all", and an
// absent --cookie flag must never wipe Chrome's existing cookies. Parameter
// construction (domain scoping, root path) is delegated to cookieParams.
func setCookies(b *rod.Browser, cookies []Cookie) error {
	if params := cookieParams(cookies); len(params) > 0 {
		return b.SetCookies(params)
	}
	return nil
}

// cookieParams converts cookies into Chrome cookie parameters. Every parameter
// keeps the cookie's name, value and Domain and is given the root path "/".
// Cookies with an empty name are skipped; input order is otherwise preserved.
// The result is always a non-nil slice, possibly of length zero.
func cookieParams(cookies []Cookie) []*proto.NetworkCookieParam {
	out := make([]*proto.NetworkCookieParam, 0, len(cookies))
	for i := range cookies {
		c := cookies[i]
		if c.Name != "" {
			param := &proto.NetworkCookieParam{
				Name:   c.Name,
				Value:  c.Value,
				Domain: c.Domain,
				Path:   "/",
			}
			out = append(out, param)
		}
	}
	return out
}

// Close shuts down the managed Chrome process.
func (p *Pool) Close() error {
p.mu.Lock()
Expand Down
31 changes: 31 additions & 0 deletions cli/clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type cloneFlags struct {
renderTO time.Duration
scroll bool
userAgent string
cookies []string
subdomains bool
scopePrefix string
exclude []string
Expand Down Expand Up @@ -81,6 +82,7 @@ func newCloneCmd() *cobra.Command {
fs.DurationVar(&f.renderTO, "render-timeout", 30*time.Second, "hard cap per page render")
fs.BoolVar(&f.scroll, "scroll", false, "auto-scroll each page to trigger lazy loading")
fs.StringVar(&f.userAgent, "user-agent", clone.DefaultUserAgent, "User-Agent for asset and robots fetches")
fs.StringArrayVar(&f.cookies, "cookie", nil, "cookie to send with every request, as name=value; pass a whole header (\"a=1; b=2\") or repeat the flag")
fs.BoolVar(&f.subdomains, "subdomains", false, "treat subdomains of the seed host as in scope")
fs.StringVar(&f.scopePrefix, "scope-prefix", "", "only crawl pages whose path starts with this prefix")
fs.StringSliceVar(&f.exclude, "exclude", nil, "path prefixes to skip (repeatable)")
Expand Down Expand Up @@ -140,6 +142,11 @@ func runClone(ctx context.Context, arg string, f *cloneFlags) error {
cfg.RenderTimeout = f.renderTO
cfg.Scroll = f.scroll
cfg.UserAgent = f.userAgent
cookies, err := parseCookies(f.cookies)
if err != nil {
return err
}
cfg.Cookies = cookies
cfg.IncludeSubdomains = f.subdomains
cfg.ScopePrefix = f.scopePrefix
cfg.ExcludePaths = f.exclude
Expand Down Expand Up @@ -205,6 +212,30 @@ func runClone(ctx context.Context, arg string, f *cloneFlags) error {
return nil
}

// parseCookies converts the values collected from the repeatable --cookie flag
// into clone.Cookie pairs. A value may hold one "name=value" pair or a complete
// Cookie header ("a=1; b=2"); each value is split on ";" and blank segments
// (such as the one after a trailing semicolon) are ignored. Names and values
// are whitespace-trimmed, and only the first "=" separates name from value, so
// a value may itself contain "=". A segment without "=" or with an empty name
// is reported as a usage error and no cookies are returned.
func parseCookies(values []string) ([]clone.Cookie, error) {
	var cookies []clone.Cookie
	for _, raw := range values {
		segments := strings.Split(raw, ";")
		for i := range segments {
			entry := strings.TrimSpace(segments[i])
			if entry == "" {
				continue
			}
			// Split at the first "=" only; everything after it is the value.
			eq := strings.IndexByte(entry, '=')
			if eq < 0 || strings.TrimSpace(entry[:eq]) == "" {
				return nil, fmt.Errorf("invalid --cookie %q: want name=value", entry)
			}
			cookies = append(cookies, clone.Cookie{
				Name:  strings.TrimSpace(entry[:eq]),
				Value: strings.TrimSpace(entry[eq+1:]),
			})
		}
	}
	return cookies, nil
}

// progressLine renders the single-line live counter. "pages" is the count of
// real pages (distinct paths); when a faceted site spawns query-string variants
// they are shown separately so the page number stays easy to read.
Expand Down
67 changes: 67 additions & 0 deletions cli/clone_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package cli

import (
"testing"

"github.com/tamnd/kage/clone"
)

// TestParseCookies runs parseCookies over a table of accepted --cookie inputs
// and compares the parsed pairs, in order, with the expected ones.
func TestParseCookies(t *testing.T) {
	type testCase struct {
		label  string
		flags  []string
		expect []clone.Cookie
	}
	table := []testCase{
		{label: "nil"},
		{
			label:  "single pair",
			flags:  []string{"session=abc"},
			expect: []clone.Cookie{{Name: "session", Value: "abc"}},
		},
		{
			label:  "header form splits on semicolons",
			flags:  []string{"a=1; b=2"},
			expect: []clone.Cookie{{Name: "a", Value: "1"}, {Name: "b", Value: "2"}},
		},
		{
			label:  "repeated flag accumulates",
			flags:  []string{"a=1", "b=2"},
			expect: []clone.Cookie{{Name: "a", Value: "1"}, {Name: "b", Value: "2"}},
		},
		{
			label:  "value may contain equals",
			flags:  []string{"token=ab=cd"},
			expect: []clone.Cookie{{Name: "token", Value: "ab=cd"}},
		},
		{
			label:  "trims whitespace",
			flags:  []string{" a = 1 "},
			expect: []clone.Cookie{{Name: "a", Value: "1"}},
		},
		{
			label:  "empty value allowed",
			flags:  []string{"a="},
			expect: []clone.Cookie{{Name: "a", Value: ""}},
		},
		{
			label:  "trailing semicolon ignored",
			flags:  []string{"a=1;"},
			expect: []clone.Cookie{{Name: "a", Value: "1"}},
		},
	}

	for i := range table {
		tc := table[i]
		t.Run(tc.label, func(t *testing.T) {
			got, err := parseCookies(tc.flags)
			if err != nil {
				t.Fatalf("parseCookies(%q): %v", tc.flags, err)
			}
			if equalCookies(got, tc.expect) {
				return
			}
			t.Errorf("parseCookies(%q) = %v; want %v", tc.flags, got, tc.expect)
		})
	}
}

// TestParseCookiesErrors checks that parseCookies rejects malformed input: a
// segment with no "=", a segment with an empty name, and a header in which a
// later segment has an empty name.
func TestParseCookiesErrors(t *testing.T) {
	rejected := [][]string{
		{"noequalssign"},
		{"=novalue"},
		{"a=1; =bad"},
	}
	for _, in := range rejected {
		_, err := parseCookies(in)
		if err != nil {
			continue
		}
		t.Errorf("parseCookies(%q) = nil error; want error", in)
	}
}

// equalCookies reports whether a and b have the same length and hold equal
// cookies at every index. A nil slice and an empty slice compare as equal.
func equalCookies(a, b []clone.Cookie) bool {
	if len(a) != len(b) {
		return false
	}
	for i, ck := range a {
		if ck != b[i] {
			return false
		}
	}
	return true
}
24 changes: 22 additions & 2 deletions clone/cloner.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,15 @@ func New(seed *url.URL, cfg Config, logf Logf) *Cloner {
}
host := seed.Hostname()
outRoot := cfg.HostDir(host)
dl := asset.NewDownloader(cfg.UserAgent, cfg.Timeout, cfg.MaxAssetBytes)
dl.Cookie = cfg.CookieHeader()
return &Cloner{
cfg: cfg,
seed: seed,
seedHost: host,
outRoot: outRoot,
statePth: filepath.Join(outRoot, cfg.Reserved, "state.json"),
dl: asset.NewDownloader(cfg.UserAgent, cfg.Timeout, cfg.MaxAssetBytes),
dl: dl,
httpC: &http.Client{Timeout: cfg.Timeout},
robots: robots.AllowAll(),
front: newFrontier(),
Expand Down Expand Up @@ -144,6 +146,7 @@ func (c *Cloner) Run(ctx context.Context) (Result, error) {
Scroll: c.cfg.Scroll,
ChromeBin: c.cfg.ChromeBin,
ControlURL: c.cfg.ControlURL,
Cookies: c.browserCookies(),
})
defer func() { _ = c.pool.Close() }()

Expand Down Expand Up @@ -208,6 +211,9 @@ func (c *Cloner) loadRobots(ctx context.Context) {
return
}
req.Header.Set("User-Agent", c.cfg.UserAgent)
if h := c.cfg.CookieHeader(); h != "" {
req.Header.Set("Cookie", h)
}
resp, err := c.httpC.Do(req)
if err != nil {
return
Expand Down Expand Up @@ -241,7 +247,7 @@ func (c *Cloner) setupCrawlDelayLimiter() {
func (c *Cloner) seedSitemaps(ctx context.Context) {
seeds := append([]string{}, c.robots.Sitemaps...)
seeds = append(seeds, c.seed.Scheme+"://"+c.seed.Host+"/sitemap.xml")
locs := collectSitemaps(ctx, c.httpC, c.cfg.UserAgent, seeds)
locs := collectSitemaps(ctx, c.httpC, c.cfg.UserAgent, c.cfg.CookieHeader(), seeds)
added := 0
for _, loc := range locs {
u, err := urlx.Normalize(c.seed, loc)
Expand Down Expand Up @@ -468,6 +474,20 @@ func (c *Cloner) enqueuePage(ctx context.Context, u *url.URL, depth int) bool {
return true
}

// browserCookies converts the configured cookies into browser.Cookie values,
// copying each name and value and setting Domain to the seed hostname. With no
// cookies configured it returns nil, so the browser pool receives an empty
// cookie list and leaves Chrome's jar untouched.
func (c *Cloner) browserCookies() []browser.Cookie {
	configured := c.cfg.Cookies
	if len(configured) == 0 {
		return nil
	}
	scoped := make([]browser.Cookie, len(configured))
	for i, ck := range configured {
		scoped[i] = browser.Cookie{
			Name:   ck.Name,
			Value:  ck.Value,
			Domain: c.seedHost,
		}
	}
	return scoped
}

// wantAsset reports whether an asset should be downloaded and localized, or left
// pointing at its live URL. kage skips two classes by default: assets on a host
// outside the seed's registrable domain (a third-party tracker, an unrelated
Expand Down
Loading