diff --git a/README.md b/README.md index d6d0dc3..76195d2 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ The flags you'll actually reach for: | `--scroll` | `false` | Auto-scroll each page to trigger lazy loading | | `--workers` | `4` | How many pages to render at once | | `--no-robots` | `false` | Ignore `robots.txt` (be nice) | +| `--crawl-delay` | `0s` | Override the `robots.txt` `Crawl-delay` between page starts (`0s` = use the `robots.txt` value) | | `-f, --force` | `false` | Delete any existing mirror for the host first | | `--chrome` | | Path to the Chrome/Chromium binary | diff --git a/cli/clone.go b/cli/clone.go index 47ce799..a3b402a 100644 --- a/cli/clone.go +++ b/cli/clone.go @@ -37,6 +37,7 @@ type cloneFlags struct { scopePrefix string exclude []string noRobots bool + crawlDelay time.Duration noSitemap bool headful bool keepNoscript bool @@ -84,6 +85,7 @@ func newCloneCmd() *cobra.Command { fs.StringVar(&f.scopePrefix, "scope-prefix", "", "only crawl pages whose path starts with this prefix") fs.StringSliceVar(&f.exclude, "exclude", nil, "path prefixes to skip (repeatable)") fs.BoolVar(&f.noRobots, "no-robots", false, "ignore robots.txt (be careful and polite)") + fs.DurationVar(&f.crawlDelay, "crawl-delay", 0, "override robots.txt Crawl-delay between page starts (0 = use robots.txt)") fs.BoolVar(&f.noSitemap, "no-sitemap", false, "do not seed URLs from sitemap.xml") fs.BoolVar(&f.headful, "headful", false, "run Chrome with a visible window (debugging)") fs.BoolVar(&f.keepNoscript, "keep-noscript", false, "unwrap