Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ The flags you'll actually reach for:
| `--scroll` | `false` | Auto-scroll each page to trigger lazy loading |
| `--workers` | `4` | How many pages to render at once |
| `--no-robots` | `false` | Ignore `robots.txt` (be nice) |
| `--crawl-delay` | `0s` | Override robots.txt `Crawl-delay` between page starts (0 = use robots.txt) |
| `-f, --force` | `false` | Delete any existing mirror for the host first |
| `--chrome` | | Path to the Chrome/Chromium binary |

Expand Down
6 changes: 6 additions & 0 deletions cli/clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type cloneFlags struct {
scopePrefix string
exclude []string
noRobots bool
crawlDelay time.Duration
noSitemap bool
headful bool
keepNoscript bool
Expand Down Expand Up @@ -84,6 +85,7 @@ func newCloneCmd() *cobra.Command {
fs.StringVar(&f.scopePrefix, "scope-prefix", "", "only crawl pages whose path starts with this prefix")
fs.StringSliceVar(&f.exclude, "exclude", nil, "path prefixes to skip (repeatable)")
fs.BoolVar(&f.noRobots, "no-robots", false, "ignore robots.txt (be careful and polite)")
fs.DurationVar(&f.crawlDelay, "crawl-delay", 0, "override robots.txt Crawl-delay between page starts (0 = use robots.txt)")
fs.BoolVar(&f.noSitemap, "no-sitemap", false, "do not seed URLs from sitemap.xml")
fs.BoolVar(&f.headful, "headful", false, "run Chrome with a visible window (debugging)")
fs.BoolVar(&f.keepNoscript, "keep-noscript", false, "unwrap <noscript> content instead of dropping it")
Expand All @@ -102,6 +104,9 @@ func runClone(ctx context.Context, arg string, f *cloneFlags) error {
if err != nil {
return fmt.Errorf("invalid url %q: %w", arg, err)
}
if f.crawlDelay < 0 {
return fmt.Errorf("--crawl-delay must be >= 0")
}

cfg := clone.DefaultConfig()
cfg.OutDir = f.out
Expand Down Expand Up @@ -139,6 +144,7 @@ func runClone(ctx context.Context, arg string, f *cloneFlags) error {
cfg.ScopePrefix = f.scopePrefix
cfg.ExcludePaths = f.exclude
cfg.RespectRobots = !f.noRobots
cfg.CrawlDelay = f.crawlDelay
cfg.FollowSitemap = !f.noSitemap
cfg.Headless = !f.headful
cfg.KeepNoscript = f.keepNoscript
Expand Down
36 changes: 33 additions & 3 deletions clone/cloner.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/tamnd/kage/sanitize"
"github.com/tamnd/kage/urlx"
"golang.org/x/net/html"
"golang.org/x/time/rate"
)

// Logf is an optional sink for human-readable progress lines.
Expand All @@ -43,9 +44,12 @@ type Cloner struct {
mu sync.Mutex
seenAssets map[string]bool
enqueued int // pages offered to the queue
wg sync.WaitGroup
pageJobs chan pageItem
assetJobs chan assetItem

crawlLimiter *rate.Limiter

wg sync.WaitGroup
pageJobs chan pageItem
assetJobs chan assetItem

muContent sync.Mutex
seenContent map[string]string // sha-256 of page bytes -> first path written
Expand Down Expand Up @@ -144,6 +148,7 @@ func (c *Cloner) Run(ctx context.Context) (Result, error) {
defer func() { _ = c.pool.Close() }()

c.loadRobots(ctx)
c.setupCrawlDelayLimiter()

// Start workers.
var workers sync.WaitGroup
Expand Down Expand Up @@ -218,6 +223,19 @@ func (c *Cloner) loadRobots(ctx context.Context) {
c.robots = robots.Parse(string(data), "kage")
}

// setupCrawlDelayLimiter chooses the delay between page starts and stores the
// matching limiter in c.crawlLimiter.
//
// A positive c.cfg.CrawlDelay is used as-is. Otherwise, when robots.txt is
// respected and a robots matcher is present, the matcher's CrawlDelay is used.
// If the chosen delay is not positive, c.crawlLimiter is set to nil, which
// waitForCrawlDelay treats as "no waiting".
func (c *Cloner) setupCrawlDelayLimiter() {
	interval := c.cfg.CrawlDelay

	robotsUsable := c.cfg.RespectRobots && c.robots != nil
	if interval <= 0 && robotsUsable {
		interval = c.robots.CrawlDelay
	}

	var limiter *rate.Limiter
	if interval > 0 {
		// One event per interval with a burst size of 1.
		limiter = rate.NewLimiter(rate.Every(interval), 1)
	}
	c.crawlLimiter = limiter
}

// seedSitemaps adds in-scope sitemap URLs (from robots and the default path) to
// the frontier.
func (c *Cloner) seedSitemaps(ctx context.Context) {
Expand Down Expand Up @@ -252,6 +270,9 @@ func (c *Cloner) processPage(ctx context.Context, j pageItem) {
c.stats.skipped.Add(1)
return
}
if !c.waitForCrawlDelay(ctx) {
return
}

res, err := c.pool.Render(ctx, j.u.String())
if err != nil {
Expand Down Expand Up @@ -324,6 +345,15 @@ func (c *Cloner) processPage(ctx context.Context, j pageItem) {
c.stats.recordPage(c.pagePathKey(j.u), deduped)
}

// waitForCrawlDelay spaces page render starts by waiting on the crawl limiter.
//
// It returns true when the caller may proceed: either no limiter is configured
// or the limiter's Wait returned without error. It returns false when Wait
// returns an error.
func (c *Cloner) waitForCrawlDelay(ctx context.Context) bool {
	limiter := c.crawlLimiter
	if limiter == nil {
		return true
	}

	if err := limiter.Wait(ctx); err != nil {
		return false
	}
	return true
}

// processAsset downloads one asset, rewriting CSS references on the way, and
// writes it to its deterministic local path.
func (c *Cloner) processAsset(ctx context.Context, j assetItem) {
Expand Down
51 changes: 51 additions & 0 deletions clone/cloner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"time"

"github.com/tamnd/kage/browser"
"github.com/tamnd/kage/robots"
"github.com/tamnd/kage/urlx"
)

Expand Down Expand Up @@ -178,6 +179,56 @@ func TestPageKeyCollapsesDuplicates(t *testing.T) {
}
}

// TestCrawlDelaySpacesPageStarts sets only the robots matcher's CrawlDelay
// (cfg.CrawlDelay is left as DefaultConfig provides it) and checks that the
// second wait is held back by roughly that delay.
func TestCrawlDelaySpacesPageStarts(t *testing.T) {
	const (
		robotsDelay = 20 * time.Millisecond
		// Slightly below robotsDelay to leave some slack in the measurement.
		minGap = 15 * time.Millisecond
	)

	seed, _ := urlx.ParseSeed("https://ex.com")
	cfg := DefaultConfig()
	cfg.RespectRobots = true

	cloner := New(seed, cfg, nil)
	cloner.robots = &robots.Matcher{CrawlDelay: robotsDelay}
	cloner.setupCrawlDelayLimiter()

	ctx := context.Background()

	// The first wait is not timed; it only has to succeed.
	if ok := cloner.waitForCrawlDelay(ctx); !ok {
		t.Fatal("first crawl-delay wait returned false")
	}

	began := time.Now()
	if ok := cloner.waitForCrawlDelay(ctx); !ok {
		t.Fatal("second crawl-delay wait returned false")
	}
	if gap := time.Since(began); gap < minGap {
		t.Fatalf("second crawl-delay wait = %v, want at least 15ms", gap)
	}
}

// TestCrawlDelayFlagOverridesRobots sets a short cfg.CrawlDelay alongside a
// one-minute robots CrawlDelay and checks that the second wait takes roughly
// the configured delay, not the robots one.
func TestCrawlDelayFlagOverridesRobots(t *testing.T) {
	const (
		flagDelay   = 20 * time.Millisecond
		robotsDelay = time.Minute
		lowerBound  = 15 * time.Millisecond
		upperBound  = 150 * time.Millisecond
		deadline    = 200 * time.Millisecond
	)

	seed, _ := urlx.ParseSeed("https://ex.com")
	cfg := DefaultConfig()
	cfg.RespectRobots = true
	cfg.CrawlDelay = flagDelay

	cloner := New(seed, cfg, nil)
	cloner.robots = &robots.Matcher{CrawlDelay: robotsDelay}
	cloner.setupCrawlDelayLimiter()

	// The context deadline bounds how long either wait can block.
	ctx, cancel := context.WithTimeout(context.Background(), deadline)
	defer cancel()

	if ok := cloner.waitForCrawlDelay(ctx); !ok {
		t.Fatal("first crawl-delay wait returned false")
	}

	began := time.Now()
	if ok := cloner.waitForCrawlDelay(ctx); !ok {
		t.Fatal("second crawl-delay wait returned false")
	}

	switch elapsed := time.Since(began); {
	case elapsed < lowerBound:
		t.Fatalf("second crawl-delay wait = %v, want at least 15ms", elapsed)
	case elapsed > upperBound:
		t.Fatalf("second crawl-delay wait = %v, override likely ignored", elapsed)
	}
}

func mustURL(t *testing.T, raw string) *url.URL {
t.Helper()
u, err := url.Parse(raw)
Expand Down
1 change: 1 addition & 0 deletions clone/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type Config struct {
ExcludePaths []string

RespectRobots bool
CrawlDelay time.Duration // override robots.txt Crawl-delay when > 0
FollowSitemap bool
Headless bool
KeepNoscript bool
Expand Down
1 change: 1 addition & 0 deletions docs/content/reference/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ images, and fonts, and writes a browsable mirror to `<out>/<host>/`.
| Flag | Default | Meaning |
|------|---------|---------|
| `--no-robots` | `false` | Ignore `robots.txt` |
| `--crawl-delay` | `0s` | Override robots.txt `Crawl-delay` between page starts (0 = use robots.txt) |
| `--no-sitemap` | `false` | Do not seed URLs from `sitemap.xml` |
| `--user-agent` | Chrome UA | User-Agent for asset and robots fetches |

Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ require (
github.com/webview/webview_go v0.0.0-20240831120633-6173450d4dd6
golang.org/x/image v0.42.0
golang.org/x/net v0.56.0
golang.org/x/time v0.15.0
)

require (
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw=
golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE=
golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4=
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
Expand Down