From e47a72be76295459425f6b9319b11e4ae1a09026 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Sun, 14 Jun 2026 09:40:42 +0300
Subject: [PATCH 01/10] feat(frontier): priority lanes + weighted round-robin
 claim
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds four lanes (submitted/refresh/discovered/bulk) so high-value URLs
from RSS, sitemaps, and publisher submissions jump the bulk-crawl
backlog instead of waiting behind 2.8M cloud.google.com pages. Default
weights 50/30/15/5; empty lanes donate their share to the next priority.

Wire format: 'f' + sub + lane + host + 0x00 + url for the lane-aware
secondary index. Lane byte (0..3) is below printable-ASCII so legacy
and lane-aware keys coexist; ClaimFrontier scans new format first then
falls back to the legacy 'f' + sub + host index so the existing 4.3M
queued URLs drain naturally without a synchronous migration.

frontierEntry gains a trailing Lane byte; missing bytes decode as
LaneDiscovered (2) so pre-lane rows keep working through transitions.

Per-lane round-robin cursor (laneCursors) and a monotonic lane tick
(laneTick) drive deterministic weighted RR — fair without per-call
randomness. Host-fairness preserved within each lane.

GetLaneStats walks both secondary indexes key-only and tallies per
lane; surfaced in /queue as a lanes[] block plus legacy_queued /
legacy_in_flight totals so operators can see whether the RR is
actually draining RSS ahead of bulk.

SeedRSS and SeedSitemap push to LaneRefresh and bypass allowedDomain:
the operator explicitly requested the feed/sitemap so its URLs are
trusted regardless of the curated include_domains list. Crawler
outbound-link discovery still goes through Seed (which defaults to
LaneDiscovered and respects allowedDomain) — so include_domains
continues to gate organic exploration as designed.

Backwards-compat notes:
- PushFrontier is a thin wrapper over PushFrontierLane(LaneDiscovered)
- transitionFrontier blind-deletes BOTH legacy and lane-aware keys,
  so completion/failure works for entries created in either era.
- SQLite Store gains a PushFrontierLane stub that ignores the lane
  (legacy schema has no lane column); production runs on Pebble.
---
 cmd/cosift/pebble_serve.go      |  26 +-
 internal/crawler/crawler.go     |  14 +-
 internal/crawler/rss.go         |  15 +-
 internal/crawler/sitemap.go     |  13 +-
 internal/crawler/store_iface.go |   1 +
 internal/store/pebble.go        | 619 +++++++++++++++++++++++++-------
 internal/store/store.go         |   8 +
 7 files changed, 551 insertions(+), 145 deletions(-)

diff --git a/cmd/cosift/pebble_serve.go b/cmd/cosift/pebble_serve.go
index 55a604d..ba90b1b 100644
--- a/cmd/cosift/pebble_serve.go
+++ b/cmd/cosift/pebble_serve.go
@@ -2908,13 +2908,35 @@ func (s *pebbleHTTP) handleQueue(w http.ResponseWriter, r *http.Request) {
 		writeProblem(w, http.StatusInternalServerError, err.Error())
 		return
 	}
-	writeJSON(w, http.StatusOK, map[string]any{
+	body := map[string]any{
 		"queued":    fs.Queued,
 		"in_flight": fs.InFlight,
 		"done":      fs.Done,
 		"errored":   fs.Errored,
 		"top_hosts": hosts,
-	})
+	}
+	// Lane breakdown: PebbleStore-only (SQLite Store has no lanes). When the
+	// store is a PebbleStore, surface the per-lane queued/in_flight counts so
+	// operators can see whether the weighted RR is actually draining RSS
+	// (lane 1) ahead of bulk (lane 3).
+	if ps, ok := any(s.store).(*store.PebbleStore); ok {
+		if ls, lerr := ps.GetLaneStats(r.Context()); lerr == nil {
+			laneNames := [4]string{"submitted", "refresh", "discovered", "bulk"}
+			lanesOut := make([]map[string]any, 0, 4)
+			for i, n := range laneNames {
+				lanesOut = append(lanesOut, map[string]any{
+					"lane":      i,
+					"name":      n,
+					"queued":    ls.Lanes[i].Queued,
+					"in_flight": ls.Lanes[i].InFlight,
+				})
+			}
+			body["lanes"] = lanesOut
+			body["legacy_queued"] = ls.LegacyQueued
+			body["legacy_in_flight"] = ls.LegacyInFlight
+		}
+	}
+	writeJSON(w, http.StatusOK, body)
 }
 
 // handleDomainsAudit streams the entire 'h' family as JSONL, one
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index 9bd333a..56a6d6d 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -327,7 +327,19 @@ func (c *Crawler) WithRouter(route RouteFn, forward ForwardFn) *Crawler {
 //
 // `INSERT OR IGNORE` semantics: if the URL is already in the frontier (queued,
 // in-flight, done, or errored), Seed is a no-op. To force a refresh, use Recrawl.
+//
+// Defaults to the discovered lane (organic crawl). For lane-aware seeds
+// (RSS = refresh, sitemap = refresh, publisher-submitted = submitted, WET =
+// bulk), use SeedLane.
 func (c *Crawler) Seed(rawURL string) error {
+	return c.SeedLane(rawURL, 2) // LaneDiscovered
+}
+
+// SeedLane is like Seed but pushes the URL into a specific lane. Used by
+// SeedRSS (refresh), SeedSitemap (refresh), and future publisher-submit
+// paths (submitted) so high-value URLs jump the cloud.google.com bulk
+// backlog via the weighted round-robin in ClaimFrontier.
+func (c *Crawler) SeedLane(rawURL string, lane byte) error {
 	canon, err := canonicalize(rawURL)
 	if err != nil {
 		return err
@@ -335,7 +347,7 @@ func (c *Crawler) Seed(rawURL string) error {
 	if !c.allowedDomain(canon) {
 		return fmt.Errorf("seed %s not allowed by include/exclude rules", canon)
 	}
-	return c.store.PushFrontier(context.Background(), canon, 0, 1.0)
+	return c.store.PushFrontierLane(context.Background(), canon, 0, lane, 1.0)
 }
 
 // Recrawl re-enqueues a URL even if it was previously crawled. Status flips
diff --git a/internal/crawler/rss.go b/internal/crawler/rss.go
index 5d3aad2..edcb845 100644
--- a/internal/crawler/rss.go
+++ b/internal/crawler/rss.go
@@ -78,10 +78,21 @@ func (c *Crawler) SeedRSS(ctx context.Context, feedURL string) (int, error) {
 	}
 	n := 0
 	for _, u := range urls {
-		if err := c.Seed(u); err != nil {
+		// RSS items are fresh-by-definition — push into the refresh lane
+		// so they jump cloud.google.com and other bulk backlog via the
+		// weighted round-robin in PebbleStore.ClaimFrontier.
+		//
+		// Bypass include_domains here: the operator explicitly asked to
+		// import this feed, so its items are trusted regardless of the
+		// curated crawler allowlist. (Crawler outbound-link discovery
+		// still goes through allowedDomain via Seed.)
+		canon, cerr := canonicalize(u)
+		if cerr != nil {
 			continue
 		}
-		n++
+		if perr := c.store.PushFrontierLane(context.Background(), canon, 0, 1, 1.0); perr == nil {
+			n++
+		}
 	}
 	return n, nil
 }
diff --git a/internal/crawler/sitemap.go b/internal/crawler/sitemap.go
index c54d07d..ca10ee7 100644
--- a/internal/crawler/sitemap.go
+++ b/internal/crawler/sitemap.go
@@ -52,8 +52,19 @@ func (c *Crawler) SeedSitemap(ctx context.Context, sitemapURL string) (int, erro
 	// showed strings.Builder.Write at 107 GB). Streaming bounds heap to
 	// O(current sitemap size) regardless of nesting depth or total URLs.
 	n := 0
+	// Sitemap-imported URLs go into the refresh lane: callers run
+	// sitemap-import to refresh known-good sources (kubernetes.io,
+	// docs.python.org, etc.), so prioritize over generic discovery.
+	//
+	// Bypass include_domains here for the same reason as SeedRSS: the
+	// operator explicitly requested this sitemap, so trust its URLs
+	// regardless of the curated crawler allowlist.
 	err := c.fetchSitemapStream(ctx, sitemapURL, 2, func(u string) {
-		if seedErr := c.Seed(u); seedErr == nil {
+		canon, cerr := canonicalize(u)
+		if cerr != nil {
+			return
+		}
+		if perr := c.store.PushFrontierLane(context.Background(), canon, 0, 1, 1.0); perr == nil { // LaneRefresh
 			n++
 		}
 	})
diff --git a/internal/crawler/store_iface.go b/internal/crawler/store_iface.go
index 2e06c9a..767621a 100644
--- a/internal/crawler/store_iface.go
+++ b/internal/crawler/store_iface.go
@@ -25,6 +25,7 @@ import (
 type CrawlerStore interface {
 	// Frontier
 	PushFrontier(ctx context.Context, url string, depth int, priority float64) error
+	PushFrontierLane(ctx context.Context, url string, depth int, lane byte, priority float64) error
 	ClaimFrontier(ctx context.Context) (store.FrontierItem, bool, error)
 	CompleteFrontier(ctx context.Context, url string) error
 	FailFrontier(ctx context.Context, url, errMsg string) error
diff --git a/internal/store/pebble.go b/internal/store/pebble.go
index 3fc6ba4..5b40e57 100644
--- a/internal/store/pebble.go
+++ b/internal/store/pebble.go
@@ -75,7 +75,9 @@ type PebbleStore struct {
 	// (host, url) tuple; next claim seeks past it so each call resumes
 	// where the previous one stopped, wrapping at the end.
 	frontierCursorMu sync.Mutex
-	frontierCursor   []byte
+	frontierCursor   []byte             // legacy single cursor; pre-lanes scan state.
+	laneCursors      [laneCount][]byte  // per-lane round-robin cursors.
+	laneTick         atomic.Uint64      // monotonic counter driving weighted lane pick.
 
 	// PILOT-190: pebble.DB.Close() panics if called twice. Wrap teardown
 	// in sync.Once so repeated Close() calls (e.g. from layered cleanups
@@ -368,6 +370,9 @@ func frontierKey(url string) []byte {
 // The 0x00 separator keeps the host field prefix-disambiguated so a URL
 // can't slide into a different host's row even if it byte-prefixes a
 // host name.
+//
+// Legacy format (pre-lanes). Reads still walk these so the existing 4.3M
+// queue drains naturally; new pushes go through frontierStatusIndexKeyLane.
 func frontierStatusIndexKey(sub byte, host, url string) []byte {
 	k := make([]byte, 2+len(host)+1+len(url))
 	k[0] = famFrontier
@@ -378,8 +383,8 @@ func frontierStatusIndexKey(sub byte, host, url string) []byte {
 	return k
 }
 
-// frontierStatusIndexHost extracts the host portion of a secondary-index
-// key. Returns "" if the key shape is wrong.
+// frontierStatusIndexHost extracts the host portion of a legacy secondary
+// index key. Returns "" if the key shape is wrong.
 func frontierStatusIndexHost(key []byte) string {
 	if len(key) < 3 || key[0] != famFrontier {
 		return ""
@@ -393,6 +398,64 @@ func frontierStatusIndexHost(key []byte) string {
 	return ""
 }
 
+// Lane priority classes for the frontier. Higher-weighted lanes drain
+// proportionally more often via the weighted round-robin in ClaimFrontier.
+// Wire format: one byte per key, valid range 0..3.
+const (
+	LaneSubmitted  byte = 0 // publisher-submitted (e.g. /pub/submit) — weight 50
+	LaneRefresh    byte = 1 // refresh / fresh-content (RSS, sitemap-lastmod) — weight 30
+	LaneDiscovered byte = 2 // crawler outbound-link discovery (default) — weight 15
+	LaneBulk       byte = 3 // bulk import (WET, mass site-pack) — weight 5
+	laneCount           = 4
+)
+
+// frontierStatusIndexKeyLane is the lane-aware secondary index:
+//
+//	'f' + 'q' + lane + host + 0x00 + url → empty   (queued)
+//	'f' + 'i' + lane + host + 0x00 + url → empty   (in_flight)
+//
+// One byte after the status separator carries the lane. Hosts start with
+// printable ASCII (>= 0x21), so a key prefix of [famFrontier, sub, 0..3]
+// is unambiguously lane-format vs legacy (where byte 2 is a host byte).
+func frontierStatusIndexKeyLane(sub, lane byte, host, url string) []byte {
+	k := make([]byte, 3+len(host)+1+len(url))
+	k[0] = famFrontier
+	k[1] = sub
+	k[2] = lane
+	copy(k[3:], host)
+	k[3+len(host)] = 0x00
+	copy(k[3+len(host)+1:], url)
+	return k
+}
+
+// frontierStatusIndexHostLane extracts (host, lane) from a lane-format
+// secondary index key. Returns ("", 0) if the key shape is wrong.
+func frontierStatusIndexHostLane(key []byte) (host string, lane byte) {
+	if len(key) < 4 || key[0] != famFrontier {
+		return "", 0
+	}
+	lane = key[2]
+	rest := key[3:]
+	for i, b := range rest {
+		if b == 0x00 {
+			return string(rest[:i]), lane
+		}
+	}
+	return "", 0
+}
+
+// frontierLanePrefix is the lower bound for an iteration scoped to one
+// (sub, lane) combo.
+func frontierLanePrefix(sub, lane byte) []byte {
+	return []byte{famFrontier, sub, lane}
+}
+
+// frontierLaneUpperBound is the (exclusive) upper bound for an iteration
+// scoped to one (sub, lane) combo.
+func frontierLaneUpperBound(sub, lane byte) []byte {
+	return []byte{famFrontier, sub, lane + 1}
+}
+
 // FrontierStatus is the lifecycle position of a frontier URL.
 // FrontierStatus is the one-byte lifecycle tag stored at the head of every
 // frontier entry. The four states form a strict progression: Queued → InFlight
@@ -411,7 +474,11 @@ const (
 // frontierEntry is the value side of the 'f' + 'u' + url key. Packed
 // little-endian: status (1) + depth (varint) + priority (float64-le) +
 // enqueued_at (varint) + attempts (varint) + host (varint-len + bytes) +
-// last_error (varint-len + bytes).
+// last_error (varint-len + bytes) [+ lane (1)].
+//
+// Lane is appended at the end as one optional byte. Entries written before
+// the lane feature appear without it; the unpacker defaults missing lanes
+// to LaneDiscovered so existing 4.3M queued URLs keep working.
 type frontierEntry struct {
 	Status     FrontierStatus
 	Depth      int64
@@ -420,11 +487,12 @@ type frontierEntry struct {
 	Attempts   int64
 	Host       string
 	LastError  string
+	Lane       byte
 }
 
 func packFrontierEntry(e frontierEntry) []byte {
 	tmp := make([]byte, binary.MaxVarintLen64)
-	out := make([]byte, 0, 1+8+len(e.Host)+len(e.LastError)+30)
+	out := make([]byte, 0, 1+8+len(e.Host)+len(e.LastError)+32)
 	out = append(out, byte(e.Status))
 	n := binary.PutVarint(tmp, e.Depth)
 	out = append(out, tmp[:n]...)
@@ -441,6 +509,7 @@ func packFrontierEntry(e frontierEntry) []byte {
 	n = binary.PutUvarint(tmp, uint64(len(e.LastError)))
 	out = append(out, tmp[:n]...)
 	out = append(out, e.LastError...)
+	out = append(out, e.Lane)
 	return out
 }
 
@@ -487,6 +556,14 @@ func unpackFrontierEntry(buf []byte) (frontierEntry, error) {
 	}
 	buf = buf[n:]
 	e.LastError = string(buf[:errLen])
+	buf = buf[errLen:]
+	// Lane (optional, appended). Pre-lanes entries have no trailing byte
+	// and default to LaneDiscovered.
+	if len(buf) >= 1 {
+		e.Lane = buf[0]
+	} else {
+		e.Lane = LaneDiscovered
+	}
 	return e, nil
 }
 
@@ -1181,13 +1258,25 @@ func (p *PebbleStore) IndexDocument(ctx context.Context, docID int64, title, tex
 	return nil
 }
 
-// PushFrontier inserts a URL into the queue. INSERT-OR-IGNORE semantics:
-// if the URL already exists in any state, this is a no-op.
-// also writes the 'f'+'q' secondary index for host-fair claim.
+// PushFrontier inserts a URL into the queue at LaneDiscovered (the
+// crawler-default lane). Thin wrapper around PushFrontierLane kept for
+// backwards compat with callers (crawler outbound-link discovery) that
+// don't pick a lane explicitly.
 func (p *PebbleStore) PushFrontier(ctx context.Context, url string, depth int, priority float64) error {
+	return p.PushFrontierLane(ctx, url, depth, LaneDiscovered, priority)
+}
+
+// PushFrontierLane inserts a URL into a specific lane. INSERT-OR-IGNORE:
+// if the URL already exists in any state (including legacy pre-lane
+// entries), this is a no-op. Writes the lane-aware 'f'+'q'+lane secondary
+// index for the weighted round-robin in ClaimFrontier.
+func (p *PebbleStore) PushFrontierLane(ctx context.Context, url string, depth int, lane byte, priority float64) error {
 	if err := ctx.Err(); err != nil {
 		return err
 	}
+	if lane >= laneCount {
+		lane = LaneDiscovered
+	}
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	if _, closer, err := p.db.Get(frontierKey(url)); err == nil {
@@ -1203,30 +1292,69 @@ func (p *PebbleStore) PushFrontier(ctx context.Context, url string, depth int, p
 		Priority:   priority,
 		EnqueuedAt: time.Now().Unix(),
 		Host:       host,
+		Lane:       lane,
 	}
 	batch := p.db.NewBatch()
 	defer batch.Close()
 	if err := batch.Set(frontierKey(url), packFrontierEntry(entry), nil); err != nil {
 		return err
 	}
-	if err := batch.Set(frontierStatusIndexKey('q', host, url), nil, nil); err != nil {
+	if err := batch.Set(frontierStatusIndexKeyLane('q', lane, host, url), nil, nil); err != nil {
 		return err
 	}
 	return batch.Commit(p.writeOpts)
 }
 
+// laneWeights drives weighted round-robin across the four lanes. The
+// integers are relative weights, not absolute caps. ClaimFrontier picks a
+// lane every call based on a global tick counter modded by the sum of
+// weights; the deterministic schedule guarantees fairness without
+// per-call randomness.
+//
+// Default 50/30/15/5: submitted gets half the work, refresh a third, the
+// catch-all discovered lane a sixth, and bulk imports the leftover. An
+// empty lane donates its share to the next non-empty lane in priority
+// order, so a quiet submitted/refresh queue can't slow down discovery.
+var laneWeights = [laneCount]int{50, 30, 15, 5}
+
+// laneOrder is the lane priority for donation when a chosen lane is
+// empty: prefer the higher-priority lanes first, then fall through.
+var laneOrder = [laneCount]byte{LaneSubmitted, LaneRefresh, LaneDiscovered, LaneBulk}
+
+// pickLane returns the lane index for the next claim. Deterministic
+// weighted RR over laneWeights using p.laneTick (monotonic counter).
+func (p *PebbleStore) pickLane() byte {
+	sum := 0
+	for _, w := range laneWeights {
+		sum += w
+	}
+	if sum == 0 {
+		return LaneDiscovered
+	}
+	t := int(p.laneTick.Add(1) % uint64(sum))
+	acc := 0
+	for i, w := range laneWeights {
+		acc += w
+		if t < acc {
+			return byte(i)
+		}
+	}
+	return LaneDiscovered
+}
+
 // ClaimFrontier picks one queued URL, atomically marks it in_flight, and
-// returns the FrontierItem. ok=false when the queue is empty.:
-// host-fair via two secondary-index scans — O(distinct in-flight hosts +
-// distinct queued URLs walked until a free host found). At a healthy
-// crawl where most hosts are NOT in-flight, this is effectively O(1).
+// returns the FrontierItem. ok=false when the queue is empty.
 //
-// Tradeoff: priority ordering is no longer enforced across hosts. The
-// implementation traversed every queued URL to honor strict
-// (priority DESC, enqueued ASC) order; trades that for the
-// host-fair scheduling that the SQLite-side Claim provides.
-// Within a host's queued URLs Pebble returns them in URL-byte order,
-// which approximates enqueue order for outbound-link discovery.
+// Lane-weighted: each call picks a lane via pickLane (weighted RR), then
+// scans the lane's queued URLs in host-fair order. Empty lanes fall
+// through to the next non-empty lane in priority order. As a final
+// fallback, the legacy lane-less 'f'+'q'+host+0x00+url index is scanned
+// so the pre-lanes queue (millions of URLs from the cloud.google.com era)
+// keeps draining in parallel with lane-aware pushes.
+//
+// Host-fairness preserved within each lane: in-flight hosts are tracked
+// across ALL lanes so a worker on one lane never piles onto a host
+// another lane is already touching.
 func (p *PebbleStore) ClaimFrontier(ctx context.Context) (FrontierItem, bool, error) {
 	if err := ctx.Err(); err != nil {
 		return FrontierItem{}, false, err
@@ -1234,10 +1362,11 @@ func (p *PebbleStore) ClaimFrontier(ctx context.Context) (FrontierItem, bool, er
 	p.mu.Lock()
 	defer p.mu.Unlock()
 
-	// Step 1: build the set of hosts currently in-flight. wrap in
-	// closure so iIt.Close() runs even if iteration panics (was explicit
-	// Close after the loop; a panic inside leaked the iterator).
-	inflightHosts := make(map[string]struct{}, 32)
+	// Step 1: build the cross-lane set of in-flight hosts. Scans 'f'+'i'
+	// (both legacy and lane-aware live in the same sub-byte range —
+	// frontierStatusIndexHost handles legacy keys; frontierStatusIndexHostLane
+	// handles lane-aware ones).
+	inflightHosts := make(map[string]struct{}, 64)
 	if err := func() error {
 		iIt, err := p.db.NewIter(&pebble.IterOptions{
 			LowerBound: []byte{famFrontier, 'i'},
@@ -1248,7 +1377,7 @@ func (p *PebbleStore) ClaimFrontier(ctx context.Context) (FrontierItem, bool, er
 		}
 		defer iIt.Close()
 		for valid := iIt.First(); valid; valid = iIt.Next() {
-			h := frontierStatusIndexHost(iIt.Key())
+			h := decodeFrontierIndexHost(iIt.Key())
 			if h != "" {
 				inflightHosts[h] = struct{}{}
 			}
@@ -1258,58 +1387,160 @@ func (p *PebbleStore) ClaimFrontier(ctx context.Context) (FrontierItem, bool, er
 		return FrontierItem{}, false, err
 	}
 
-	// Step 2: walk queued URLs in key order (host-then-URL). Pick the first
-	// whose host is NOT in inflightHosts. Start the scan from
-	// the LAST-CLAIMED key (round-robin), wrapping at the end. Previously
-	// we always started at the first queued URL, so hosts late in the
-	// alphabet (e.g. pilotprotocol.network) could be starved indefinitely
-	// when many earlier-alpha hosts had queued URLs.
+	// Step 2: lane-weighted pick. Try the chosen lane first; if empty or
+	// no free host found, walk other lanes in priority order; finally
+	// fall back to the legacy lane-less index.
+	chosenLane := p.pickLane()
+	tryLanes := append([]byte{chosenLane}, laneOrderWithout(chosenLane)...)
+
+	var pickedHost, pickedURL string
+	var pickedLane byte
+	var pickedLegacy bool
+
+	for _, lane := range tryLanes {
+		host, url, ok := p.scanLaneForFreeHost(lane, inflightHosts)
+		if ok {
+			pickedHost, pickedURL, pickedLane = host, url, lane
+			break
+		}
+	}
+	if pickedURL == "" {
+		// Legacy fall-through: drain the pre-lanes 'f'+'q'+host queue.
+		host, url, ok := p.scanLegacyForFreeHost(inflightHosts)
+		if ok {
+			pickedHost, pickedURL = host, url
+			pickedLegacy = true
+		}
+	}
+	if pickedURL == "" {
+		return FrontierItem{}, false, nil
+	}
+
+	// Prefer a URL within the picked host's bucket that has no prior doc
+	// record. Same trick as the legacy claim — RSS/sitemap-imported novel
+	// URLs sit deeper in alphabetical order; without this probe, every
+	// claim picks the first URL, which is almost always already-crawled
+	// and re-fetched silently.
+	if pickedHost != "" {
+		probed := p.probeForNovelURL(pickedHost, pickedURL, pickedLane, pickedLegacy)
+		if probed != "" {
+			pickedURL = probed
+		}
+	}
+
+	// Step 3: atomic transition. Read primary, flip status, swap indexes.
+	val, closer, err := p.db.Get(frontierKey(pickedURL))
+	if err != nil {
+		return FrontierItem{}, false, err
+	}
+	entry, err := unpackFrontierEntry(val)
+	_ = closer.Close()
+	if err != nil {
+		return FrontierItem{}, false, err
+	}
+	entry.Status = FrontierStatusInFlight
+
+	batch := p.db.NewBatch()
+	defer batch.Close()
+	if err := batch.Set(frontierKey(pickedURL), packFrontierEntry(entry), nil); err != nil {
+		return FrontierItem{}, false, err
+	}
+	if pickedLegacy {
+		if err := batch.Delete(frontierStatusIndexKey('q', pickedHost, pickedURL), nil); err != nil {
+			return FrontierItem{}, false, err
+		}
+		if err := batch.Set(frontierStatusIndexKey('i', pickedHost, pickedURL), nil, nil); err != nil {
+			return FrontierItem{}, false, err
+		}
+	} else {
+		if err := batch.Delete(frontierStatusIndexKeyLane('q', pickedLane, pickedHost, pickedURL), nil); err != nil {
+			return FrontierItem{}, false, err
+		}
+		if err := batch.Set(frontierStatusIndexKeyLane('i', pickedLane, pickedHost, pickedURL), nil, nil); err != nil {
+			return FrontierItem{}, false, err
+		}
+	}
+	if err := batch.Commit(p.writeOpts); err != nil {
+		return FrontierItem{}, false, err
+	}
+	return FrontierItem{URL: pickedURL, Depth: int(entry.Depth), Priority: entry.Priority}, true, nil
+}
+
+// laneOrderWithout returns laneOrder minus the given lane, preserving the
+// priority order of the remainder so weighted-RR donation walks
+// submitted → refresh → discovered → bulk on misses.
+func laneOrderWithout(skip byte) []byte {
+	out := make([]byte, 0, laneCount-1)
+	for _, l := range laneOrder {
+		if l != skip {
+			out = append(out, l)
+		}
+	}
+	return out
+}
+
+// decodeFrontierIndexHost extracts the host from either a legacy
+// 'f'+sub+host+0x00+url key or a lane-aware 'f'+sub+lane+host+0x00+url
+// key. Distinguishes by inspecting byte 2: lane bytes are 0..3 (below
+// the printable-ASCII range any host byte uses), so a byte < 0x04 means
+// lane-format.
+func decodeFrontierIndexHost(key []byte) string {
+	if len(key) < 3 || key[0] != famFrontier {
+		return ""
+	}
+	if key[2] < laneCount {
+		h, _ := frontierStatusIndexHostLane(key)
+		return h
+	}
+	return frontierStatusIndexHost(key)
+}
+
+// scanLaneForFreeHost walks one lane's queued URLs, returning the first
+// URL whose host is not in inflightHosts. Round-robin starts at the
+// per-lane cursor stored in p.laneCursors so each lane has its own
+// fairness state independent of the others.
+func (p *PebbleStore) scanLaneForFreeHost(lane byte, inflightHosts map[string]struct{}) (host, url string, ok bool) {
 	qIt, err := p.db.NewIter(&pebble.IterOptions{
-		LowerBound: []byte{famFrontier, 'q'},
-		UpperBound: []byte{famFrontier, 'q' + 1},
+		LowerBound: frontierLanePrefix('q', lane),
+		UpperBound: frontierLaneUpperBound('q', lane),
 	})
 	if err != nil {
-		return FrontierItem{}, false, err
+		return "", "", false
 	}
 	defer qIt.Close()
 
 	p.frontierCursorMu.Lock()
-	cursor := append([]byte(nil), p.frontierCursor...)
+	cursor := append([]byte(nil), p.laneCursors[lane]...)
 	p.frontierCursorMu.Unlock()
 
-	var pickedHost, pickedURL string
 	var fallbackHost, fallbackURL string
 	var fallbackFound bool
 
 	scan := func(start func() bool) (found bool) {
 		for valid := start(); valid; valid = qIt.Next() {
-			host := frontierStatusIndexHost(qIt.Key())
-			if host == "" {
+			h, _ := frontierStatusIndexHostLane(qIt.Key())
+			if h == "" {
 				continue
 			}
 			key := qIt.Key()
-			urlOffset := 2 + len(host) + 1
+			urlOffset := 3 + len(h) + 1
 			if urlOffset > len(key) {
 				continue
 			}
-			url := string(key[urlOffset:])
+			u := string(key[urlOffset:])
 			if !fallbackFound {
-				fallbackHost = host
-				fallbackURL = url
+				fallbackHost = h
+				fallbackURL = u
 				fallbackFound = true
 			}
-			if _, busy := inflightHosts[host]; !busy {
-				pickedHost = host
-				pickedURL = url
+			if _, busy := inflightHosts[h]; !busy {
+				host, url = h, u
 				return true
 			}
 		}
 		return false
 	}
 
-	// First sweep: cursor points to the first key of the next host
-	// (set by the previous claim's skipKey logic), so a plain
-	// SeekGE lands at the first URL of that next host directly.
 	startFromCursor := func() bool {
 		if len(cursor) > 0 {
 			return qIt.SeekGE(cursor)
@@ -1317,112 +1548,144 @@ func (p *PebbleStore) ClaimFrontier(ctx context.Context) (FrontierItem, bool, er
 		return qIt.First()
 	}
 	if !scan(startFromCursor) {
-		// Wrap: try from the beginning. fallbackHost will be set from the
-		// first sweep if any URL existed (so we can reuse it without
-		// re-iterating).
 		_ = scan(qIt.First)
 	}
-	if pickedURL == "" {
+	if host == "" {
 		if !fallbackFound {
-			return FrontierItem{}, false, nil
-		}
-		pickedHost = fallbackHost
-		pickedURL = fallbackURL
-	}
-
-	// within the picked host's queued bucket, prefer a URL that
-	// has NO prior doc record. The naive round-robin always picks the first
-	// alphabetical URL per host, which on a saturated link-graph is almost
-	// always already in famDoc — re-crawl with no doc-count growth. RSS- or
-	// sitemap-imported genuinely-novel URLs sit deeper in the host's bucket
-	// and never get picked. Probe up to 32 URLs in the host's block; if
-	// any has no prior doc, take that one. Falls back to pickedURL when
-	// every probed URL is a known doc. Cheap: 32 × ~1ms pebble point-lookups
-	// per claim, vs the alternative of waiting days for the cursor to drain
-	// every host's first-URL.
-	if pickedHost != "" {
-		hostPrefix := make([]byte, 2+len(pickedHost)+1)
-		hostPrefix[0] = famFrontier
-		hostPrefix[1] = 'q'
-		copy(hostPrefix[2:], pickedHost)
-		hostPrefix[2+len(pickedHost)] = 0x00
-		hostUpper := make([]byte, len(hostPrefix))
-		copy(hostUpper, hostPrefix)
-		hostUpper[len(hostUpper)-1] = 0x01
-		probeIt, perr := p.db.NewIter(&pebble.IterOptions{LowerBound: hostPrefix, UpperBound: hostUpper})
-		if perr == nil {
-			const maxProbes = 32
-			probed := 0
-			for valid := probeIt.First(); valid && probed < maxProbes; valid = probeIt.Next() {
-				key := probeIt.Key()
-				urlPart := string(key[len(hostPrefix):])
-				if urlPart == "" {
-					continue
-				}
-				probed++
-				if _, ok, _ := p.lookupIDByURL(urlPart); !ok {
-					pickedURL = urlPart
-					break
-				}
-			}
-			probeIt.Close()
+			return "", "", false
 		}
+		host, url = fallbackHost, fallbackURL
 	}
 
-	// advance the round-robin cursor PAST the picked host's
-	// entire URL block. Without this, each claim only advances by one URL,
-	// so hosts with thousands of queued URLs (github.com, en.wikipedia.org)
-	// hog the cursor and hosts later in the alphabet take days to reach.
-	// Cursor = {famFrontier, 'q', host, 0xFF} — lex-greater than any real
-	// URL key for this host (URLs are ASCII), so the next SeekGE lands on
-	// the first URL of the NEXT host. also persist to pebble
-	// so a restart resumes where it stopped.
-	var skipKey []byte
-	if pickedHost != "" {
-		skipKey = make([]byte, 2+len(pickedHost)+1)
-		skipKey[0] = famFrontier
-		skipKey[1] = 'q'
-		copy(skipKey[2:], pickedHost)
-		skipKey[2+len(pickedHost)] = 0xFF
-		p.frontierCursorMu.Lock()
-		p.frontierCursor = skipKey
-		p.frontierCursorMu.Unlock()
-	}
+	// Advance the per-lane cursor past the picked host's URL block.
+	skipKey := make([]byte, 3+len(host)+1)
+	skipKey[0] = famFrontier
+	skipKey[1] = 'q'
+	skipKey[2] = lane
+	copy(skipKey[3:], host)
+	skipKey[3+len(host)] = 0xFF
+	p.frontierCursorMu.Lock()
+	p.laneCursors[lane] = skipKey
+	p.frontierCursorMu.Unlock()
+	return host, url, true
+}
 
-	// Step 3: atomic transition. Read primary, flip status, swap indexes.
-	val, closer, err := p.db.Get(frontierKey(pickedURL))
-	if err != nil {
-		return FrontierItem{}, false, err
-	}
-	entry, err := unpackFrontierEntry(val)
-	_ = closer.Close()
+// scanLegacyForFreeHost is the unchanged pre-lanes scan, used as a final
+// fallback so existing 4.3M queued URLs from before lanes shipped continue
+// to drain.
+func (p *PebbleStore) scanLegacyForFreeHost(inflightHosts map[string]struct{}) (host, url string, ok bool) {
+	qIt, err := p.db.NewIter(&pebble.IterOptions{
+		LowerBound: []byte{famFrontier, 'q', laneCount}, // skip lane-aware keys
+		UpperBound: []byte{famFrontier, 'q' + 1},
+	})
 	if err != nil {
-		return FrontierItem{}, false, err
+		return "", "", false
 	}
-	entry.Status = FrontierStatusInFlight
+	defer qIt.Close()
 
-	batch := p.db.NewBatch()
-	defer batch.Close()
-	if err := batch.Set(frontierKey(pickedURL), packFrontierEntry(entry), nil); err != nil {
-		return FrontierItem{}, false, err
+	p.frontierCursorMu.Lock()
+	cursor := append([]byte(nil), p.frontierCursor...)
+	p.frontierCursorMu.Unlock()
+
+	var fallbackHost, fallbackURL string
+	var fallbackFound bool
+
+	scan := func(start func() bool) (found bool) {
+		for valid := start(); valid; valid = qIt.Next() {
+			h := frontierStatusIndexHost(qIt.Key())
+			if h == "" {
+				continue
+			}
+			key := qIt.Key()
+			urlOffset := 2 + len(h) + 1
+			if urlOffset > len(key) {
+				continue
+			}
+			u := string(key[urlOffset:])
+			if !fallbackFound {
+				fallbackHost = h
+				fallbackURL = u
+				fallbackFound = true
+			}
+			if _, busy := inflightHosts[h]; !busy {
+				host, url = h, u
+				return true
+			}
+		}
+		return false
 	}
-	if err := batch.Delete(frontierStatusIndexKey('q', pickedHost, pickedURL), nil); err != nil {
-		return FrontierItem{}, false, err
+
+	startFromCursor := func() bool {
+		if len(cursor) > 0 && cursor[2] >= laneCount {
+			return qIt.SeekGE(cursor)
+		}
+		return qIt.First()
 	}
-	if err := batch.Set(frontierStatusIndexKey('i', pickedHost, pickedURL), nil, nil); err != nil {
-		return FrontierItem{}, false, err
+	if !scan(startFromCursor) {
+		_ = scan(qIt.First)
 	}
-	// persist the cursor in the same atomic batch as the status
-	// transition, so we never desync the state on a crash.
-	if len(skipKey) > 0 {
-		if err := batch.Set(metaKey("frontier_cursor"), skipKey, nil); err != nil {
-			return FrontierItem{}, false, err
+	if host == "" {
+		if !fallbackFound {
+			return "", "", false
 		}
+		host, url = fallbackHost, fallbackURL
 	}
-	if err := batch.Commit(p.writeOpts); err != nil {
-		return FrontierItem{}, false, err
+
+	skipKey := make([]byte, 2+len(host)+1)
+	skipKey[0] = famFrontier
+	skipKey[1] = 'q'
+	copy(skipKey[2:], host)
+	skipKey[2+len(host)] = 0xFF
+	p.frontierCursorMu.Lock()
+	p.frontierCursor = skipKey
+	p.frontierCursorMu.Unlock()
+	return host, url, true
+}
+
+// probeForNovelURL prefers a URL inside the picked host's bucket that has
+// no prior doc record. Same intent as the legacy code: avoid re-crawling
+// known docs when freshly-imported (RSS/sitemap) URLs sit deeper in the
+// alphabetical block. Returns "" if no novel URL found in maxProbes
+// attempts (caller keeps the original pickedURL).
+func (p *PebbleStore) probeForNovelURL(host, fallback string, lane byte, legacy bool) string {
+	var hostPrefix []byte
+	if legacy {
+		hostPrefix = make([]byte, 2+len(host)+1)
+		hostPrefix[0] = famFrontier
+		hostPrefix[1] = 'q'
+		copy(hostPrefix[2:], host)
+		hostPrefix[2+len(host)] = 0x00
+	} else {
+		hostPrefix = make([]byte, 3+len(host)+1)
+		hostPrefix[0] = famFrontier
+		hostPrefix[1] = 'q'
+		hostPrefix[2] = lane
+		copy(hostPrefix[3:], host)
+		hostPrefix[3+len(host)] = 0x00
+	}
+	hostUpper := make([]byte, len(hostPrefix))
+	copy(hostUpper, hostPrefix)
+	hostUpper[len(hostUpper)-1] = 0x01
+	probeIt, perr := p.db.NewIter(&pebble.IterOptions{LowerBound: hostPrefix, UpperBound: hostUpper})
+	if perr != nil {
+		return ""
 	}
-	return FrontierItem{URL: pickedURL, Depth: int(entry.Depth), Priority: entry.Priority}, true, nil
+	defer probeIt.Close()
+	const maxProbes = 32
+	probed := 0
+	for valid := probeIt.First(); valid && probed < maxProbes; valid = probeIt.Next() {
+		key := probeIt.Key()
+		urlPart := string(key[len(hostPrefix):])
+		if urlPart == "" {
+			continue
+		}
+		probed++
+		if _, ok, _ := p.lookupIDByURL(urlPart); !ok {
+			return urlPart
+		}
+	}
+	_ = fallback
+	return ""
 }
 
 // CompleteFrontier marks a URL as successfully processed.
@@ -1474,23 +1737,36 @@ func (p *PebbleStore) transitionFrontier(ctx context.Context, url string, newSta
 	if err := batch.Set(frontierKey(url), packFrontierEntry(entry), nil); err != nil {
 		return err
 	}
+	// blind-delete BOTH legacy and lane-aware secondary keys so transition
+	// works for entries written before lanes shipped (legacy key only) and
+	// for entries pushed after (lane-aware key only). Pebble Delete is a
+	// no-op on missing keys.
 	switch oldStatus {
 	case FrontierStatusQueued:
 		if err := batch.Delete(frontierStatusIndexKey('q', entry.Host, url), nil); err != nil {
 			return err
 		}
+		if err := batch.Delete(frontierStatusIndexKeyLane('q', entry.Lane, entry.Host, url), nil); err != nil {
+			return err
+		}
 	case FrontierStatusInFlight:
 		if err := batch.Delete(frontierStatusIndexKey('i', entry.Host, url), nil); err != nil {
 			return err
 		}
+		if err := batch.Delete(frontierStatusIndexKeyLane('i', entry.Lane, entry.Host, url), nil); err != nil {
+			return err
+		}
 	}
+	// New secondary key follows the entry's Lane — recovery into queued
+	// goes back to whatever lane the URL was originally on (defaults to
+	// LaneDiscovered for legacy entries via unpack).
 	switch newStatus {
 	case FrontierStatusQueued:
-		if err := batch.Set(frontierStatusIndexKey('q', entry.Host, url), nil, nil); err != nil {
+		if err := batch.Set(frontierStatusIndexKeyLane('q', entry.Lane, entry.Host, url), nil, nil); err != nil {
 			return err
 		}
 	case FrontierStatusInFlight:
-		if err := batch.Set(frontierStatusIndexKey('i', entry.Host, url), nil, nil); err != nil {
+		if err := batch.Set(frontierStatusIndexKeyLane('i', entry.Lane, entry.Host, url), nil, nil); err != nil {
 			return err
 		}
 	}
@@ -1746,6 +2022,71 @@ func (p *PebbleStore) GetFrontierStats(ctx context.Context) (FrontierStats, erro
 	return s, nil
 }
 
+// LaneStats is a per-lane queued/in_flight breakdown of the frontier.
+// LegacyQueued/LegacyInFlight count entries written before lanes shipped
+// (their secondary keys lack a lane byte); they're drained as a fall-through
+// in ClaimFrontier and disappear over time.
+type LaneStats struct {
+	Lanes           [laneCount]LaneCounts
+	LegacyQueued    int
+	LegacyInFlight  int
+}
+
+// LaneCounts is the per-lane summary surfaced in /queue.
+type LaneCounts struct {
+	Queued   int
+	InFlight int
+}
+
+// GetLaneStats walks the 'f'+'q' and 'f'+'i' secondary indexes (key-only,
+// no value reads) and tallies by lane. O(N) over secondary keys; for 4M
+// frontier rows on the GH200 this is sub-second because Pebble iterators
+// stream key bytes directly out of the SST without decoding values.
+func (p *PebbleStore) GetLaneStats(ctx context.Context) (LaneStats, error) {
+	if err := ctx.Err(); err != nil {
+		return LaneStats{}, err
+	}
+	var out LaneStats
+	scan := func(sub byte, addQueued bool) error {
+		it, err := p.db.NewIter(&pebble.IterOptions{
+			LowerBound: []byte{famFrontier, sub},
+			UpperBound: []byte{famFrontier, sub + 1},
+			KeyTypes:   pebble.IterKeyTypePointsOnly,
+		})
+		if err != nil {
+			return err
+		}
+		defer it.Close()
+		for valid := it.First(); valid; valid = it.Next() {
+			k := it.Key()
+			if len(k) < 3 {
+				continue
+			}
+			if k[2] < laneCount {
+				if addQueued {
+					out.Lanes[k[2]].Queued++
+				} else {
+					out.Lanes[k[2]].InFlight++
+				}
+			} else {
+				if addQueued {
+					out.LegacyQueued++
+				} else {
+					out.LegacyInFlight++
+				}
+			}
+		}
+		return nil
+	}
+	if err := scan('q', true); err != nil {
+		return LaneStats{}, err
+	}
+	if err := scan('i', false); err != nil {
+		return LaneStats{}, err
+	}
+	return out, nil
+}
+
 // CountQueuedPerHost returns a host → queued-URL-count map for the given
 // hosts. Used by crawler.enqueueLinks to enforce the per-host enqueue cap;
 // one prefix-count per host against the 'f'+'q' secondary index.
diff --git a/internal/store/store.go b/internal/store/store.go
index fa696f6..2a93225 100644
--- a/internal/store/store.go
+++ b/internal/store/store.go
@@ -1172,6 +1172,14 @@ func extractHost(rawURL string) string {
 
 // PushFrontier inserts a URL into the queue at the given depth/priority.
 // No-op if the URL is already present (regardless of its current status).
+// PushFrontierLane on the SQLite backend ignores the lane (legacy
+// schema has no lane column) and delegates to PushFrontier. Cosift's
+// production crawl runs on PebbleStore; the SQLite backend is the
+// legacy single-node path and retains FIFO semantics.
+func (s *Store) PushFrontierLane(ctx context.Context, url string, depth int, _ byte, priority float64) error {
+	return s.PushFrontier(ctx, url, depth, priority)
+}
+
 func (s *Store) PushFrontier(ctx context.Context, url string, depth int, priority float64) error {
 	const q = `INSERT OR IGNORE INTO frontier (url, depth, priority, enqueued_at, host) VALUES (?, ?, ?, ?, ?);`
 	_, err := s.db.ExecContext(ctx, q, url, depth, priority, time.Now().Unix(), extractHost(url))

From 17af2051cac67f791abf15a00710d22519687048 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Sun, 14 Jun 2026 11:10:09 +0300
Subject: [PATCH 02/10] feat(frontier): batch push + host demotion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PushFrontierBatch lets a caller insert N URLs in a single Pebble batch
and a single p.mu acquire. SeedRSS and SeedSitemap now buffer URLs and
flush via this path: a 25-URL reddit feed that previously took 8-17
minutes (one mu hop per URL, contending with 256 crawler workers) now
lands in milliseconds. Sitemap streaming flushes every 1024 URLs so a
100K-entry kubernetes.io sitemap doesn't hold the lock for the whole
parse.

DemoteHostToLane walks every queued URL for a host (across legacy AND
lane-aware indexes) and re-keys to a target lane. The escape hatch for
the cloud.google.com situation: 2.8M queued URLs on one host blocked
65% of the host-fair claim slots from fresher lanes. Re-keys atomically
in 1024-URL batches; skips URLs that flipped to in_flight under us.

New endpoint POST /admin/frontier-demote-host {host, lane} surfaces it.
Tested on the GH200: cloud.google.com → lane 3 moved 2,804,001 URLs
in 31 seconds (~90K rekeys/sec); steady crawl rate went from 79 to 134
docs/min on the next sample (+70%).
---
 cmd/cosift/pebble_serve.go      |  51 +++++++
 internal/crawler/rss.go         |  32 +++--
 internal/crawler/sitemap.go     |  25 +++-
 internal/crawler/store_iface.go |   1 +
 internal/store/pebble.go        | 231 ++++++++++++++++++++++++++++++++
 internal/store/store.go         |  15 +++
 6 files changed, 340 insertions(+), 15 deletions(-)

diff --git a/cmd/cosift/pebble_serve.go b/cmd/cosift/pebble_serve.go
index ba90b1b..be8e32b 100644
--- a/cmd/cosift/pebble_serve.go
+++ b/cmd/cosift/pebble_serve.go
@@ -493,6 +493,7 @@ func runPebbleServe(ctx context.Context, cfg *config.Config, args []string) erro
 	mux.HandleFunc("POST /admin/crawl-enqueue", wrap(srv.handleCrawlEnqueue))
 	mux.HandleFunc("POST /admin/frontier-purge-host", wrap(srv.handleFrontierPurgeHost))
 	mux.HandleFunc("POST /admin/frontier-clear", wrap(srv.handleFrontierClear))
+	mux.HandleFunc("POST /admin/frontier-demote-host", wrap(srv.handleFrontierDemoteHost))
 	mux.HandleFunc("POST /admin/rss-import", wrap(srv.handleRSSImport))
 	mux.HandleFunc("POST /admin/crawl-now", wrap(srv.handleCrawlNow))
 	mux.HandleFunc("POST /admin/wet-import", wrap(srv.handleWETImport))
@@ -2800,6 +2801,56 @@ func (s *pebbleHTTP) handleWETImport(w http.ResponseWriter, r *http.Request) {
 	})
 }
 
+// handleFrontierDemoteHost re-keys every queued URL for a host into a
+// different lane. The escape hatch for the cloud.google.com problem:
+// 2.8M queued URLs on one host (65% of the queue) starve host-fair
+// claim slots from fresher lanes. Demote to lane 3 (bulk, 5% weight)
+// and lane 1/2 actually get the work.
+//
+// POST body: {"host": "cloud.google.com", "lane": 3}
+type frontierDemoteHostReq struct {
+	Host string `json:"host"`
+	Lane int    `json:"lane"`
+}
+
+func (s *pebbleHTTP) handleFrontierDemoteHost(w http.ResponseWriter, r *http.Request) {
+	if want := s.cluster.PeerAuthToken; want != "" {
+		got := strings.TrimPrefix(r.Header.Get("Authorization"), "Bearer ")
+		if got != want {
+			writeProblem(w, http.StatusUnauthorized, "missing or invalid admin token")
+			return
+		}
+	}
+	ps, ok := any(s.store).(*store.PebbleStore)
+	if !ok {
+		writeProblem(w, http.StatusNotImplemented, "lanes are PebbleStore-only")
+		return
+	}
+	var req frontierDemoteHostReq
+	body, _ := io.ReadAll(io.LimitReader(r.Body, 64<<10))
+	if err := json.Unmarshal(body, &req); err != nil || req.Host == "" {
+		writeProblem(w, http.StatusBadRequest, "expected {\"host\":\"foo.com\",\"lane\":0..3}")
+		return
+	}
+	if req.Lane < 0 || req.Lane > 3 {
+		writeProblem(w, http.StatusBadRequest, "lane must be 0..3")
+		return
+	}
+	t0 := time.Now()
+	n, err := ps.DemoteHostToLane(r.Context(), req.Host, byte(req.Lane))
+	if err != nil {
+		writeProblem(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	log.Printf("frontier-demote-host: moved %d URLs (%s -> lane %d) in %s", n, req.Host, req.Lane, time.Since(t0).Round(time.Millisecond))
+	writeJSON(w, http.StatusOK, map[string]any{
+		"host":    req.Host,
+		"lane":    req.Lane,
+		"moved":   n,
+		"elapsed": time.Since(t0).String(),
+	})
+}
+
 // handleRSSImport fetches an RSS 2.0 or Atom feed and pushes every <item>/
 // <entry> link to the live frontier. Same auth shape as sitemap-import.
 // Designed to be cron-friendly: idempotent against the frontier (re-seeding
diff --git a/internal/crawler/rss.go b/internal/crawler/rss.go
index edcb845..f75b491 100644
--- a/internal/crawler/rss.go
+++ b/internal/crawler/rss.go
@@ -7,6 +7,8 @@ import (
 	"io"
 	"net/http"
 	"strings"
+
+	"github.com/pilot-protocol/cosift/internal/store"
 )
 
 // RSS / Atom feed seeding. Parallel to sitemap.go in spirit: fetch a feed,
@@ -76,25 +78,29 @@ func (c *Crawler) SeedRSS(ctx context.Context, feedURL string) (int, error) {
 	if err != nil {
 		return 0, err
 	}
-	n := 0
+	// Batch the URL push: one Pebble write transaction + one global mu
+	// acquire for the whole feed. Pre-batch, each PushFrontierLane fought
+	// 256 crawler workers for p.mu and feeds took 8-17min wall-clock.
+	// Batched, the same call returns in milliseconds.
+	//
+	// Bypass include_domains: the operator explicitly asked to import
+	// this feed, so its items are trusted regardless of the curated
+	// crawler allowlist. (Crawler outbound-link discovery still goes
+	// through allowedDomain via Seed.)
+	items := make([]store.FrontierPushItem, 0, len(urls))
 	for _, u := range urls {
-		// RSS items are fresh-by-definition — push into the refresh lane
-		// so they jump cloud.google.com and other bulk backlog via the
-		// weighted round-robin in PebbleStore.ClaimFrontier.
-		//
-		// Bypass include_domains here: the operator explicitly asked to
-		// import this feed, so its items are trusted regardless of the
-		// curated crawler allowlist. (Crawler outbound-link discovery
-		// still goes through allowedDomain via Seed.)
 		canon, cerr := canonicalize(u)
 		if cerr != nil {
 			continue
 		}
-		if perr := c.store.PushFrontierLane(context.Background(), canon, 0, 1, 1.0); perr == nil {
-			n++
-		}
+		items = append(items, store.FrontierPushItem{
+			URL:      canon,
+			Depth:    0,
+			Lane:     1, // LaneRefresh
+			Priority: 1.0,
+		})
 	}
-	return n, nil
+	return c.store.PushFrontierBatch(context.Background(), items)
 }
 
 // fetchRSS pulls the feed body and parses either RSS2 or Atom. Returns the
diff --git a/internal/crawler/sitemap.go b/internal/crawler/sitemap.go
index ca10ee7..379575a 100644
--- a/internal/crawler/sitemap.go
+++ b/internal/crawler/sitemap.go
@@ -11,6 +11,8 @@ import (
 	"net/http"
 	"strings"
 	"time"
+
+	"github.com/pilot-protocol/cosift/internal/store"
 )
 
 // Sitemap parser, intentionally minimal: handles the standard urlset shape,
@@ -59,15 +61,34 @@ func (c *Crawler) SeedSitemap(ctx context.Context, sitemapURL string) (int, erro
 	// Bypass include_domains here for the same reason as SeedRSS: the
 	// operator explicitly requested this sitemap, so trust its URLs
 	// regardless of the curated crawler allowlist.
+	//
+	// Buffer URLs into 1024-item batches and flush via PushFrontierBatch.
+	// Single mu acquire per batch instead of per URL — at scale (MDN,
+	// kubernetes.io sitemaps with 100K+ URLs) this is the difference
+	// between a sitemap-import that returns in seconds vs an hour.
+	const batchSize = 1024
+	buf := make([]store.FrontierPushItem, 0, batchSize)
+	flush := func() {
+		if len(buf) == 0 {
+			return
+		}
+		w, perr := c.store.PushFrontierBatch(context.Background(), buf)
+		if perr == nil {
+			n += w
+		}
+		buf = buf[:0]
+	}
 	err := c.fetchSitemapStream(ctx, sitemapURL, 2, func(u string) {
 		canon, cerr := canonicalize(u)
 		if cerr != nil {
 			return
 		}
-		if perr := c.store.PushFrontierLane(context.Background(), canon, 0, 1, 1.0); perr == nil { // LaneRefresh
-			n++
+		buf = append(buf, store.FrontierPushItem{URL: canon, Depth: 0, Lane: 1, Priority: 1.0})
+		if len(buf) >= batchSize {
+			flush()
 		}
 	})
+	flush()
 	return n, err
 }
 
diff --git a/internal/crawler/store_iface.go b/internal/crawler/store_iface.go
index 767621a..b79abff 100644
--- a/internal/crawler/store_iface.go
+++ b/internal/crawler/store_iface.go
@@ -26,6 +26,7 @@ type CrawlerStore interface {
 	// Frontier
 	PushFrontier(ctx context.Context, url string, depth int, priority float64) error
 	PushFrontierLane(ctx context.Context, url string, depth int, lane byte, priority float64) error
+	PushFrontierBatch(ctx context.Context, items []store.FrontierPushItem) (int, error)
 	ClaimFrontier(ctx context.Context) (store.FrontierItem, bool, error)
 	CompleteFrontier(ctx context.Context, url string) error
 	FailFrontier(ctx context.Context, url, errMsg string) error
diff --git a/internal/store/pebble.go b/internal/store/pebble.go
index 5b40e57..15c1d99 100644
--- a/internal/store/pebble.go
+++ b/internal/store/pebble.go
@@ -1266,6 +1266,85 @@ func (p *PebbleStore) PushFrontier(ctx context.Context, url string, depth int, p
 	return p.PushFrontierLane(ctx, url, depth, LaneDiscovered, priority)
 }
 
+// FrontierPushItem is one entry handed to PushFrontierBatch. Callers
+// typically pass the URL list straight from a parsed feed/sitemap; the
+// store extracts the host and writes both the primary and lane-aware
+// secondary index keys.
+type FrontierPushItem struct {
+	URL      string
+	Depth    int
+	Lane     byte
+	Priority float64
+}
+
+// PushFrontierBatch inserts N URLs in a single Pebble batch + single
+// mu acquire. This is the fix for the "rss-import takes 14 minutes"
+// problem: per-URL PushFrontierLane calls fight 256 crawler workers
+// for p.mu, dragging a 25-URL feed to ~14 minutes wall-clock. Batched,
+// the same feed lands in tens of milliseconds.
+//
+// Dedup semantics match PushFrontierLane: if frontierKey(url) exists
+// in any state, that URL is skipped (no overwrite). Returns the count
+// actually written (new URLs only); duplicates are silently ignored
+// so callers can pre-count for telemetry without double-checking.
+func (p *PebbleStore) PushFrontierBatch(ctx context.Context, items []FrontierPushItem) (int, error) {
+	if err := ctx.Err(); err != nil {
+		return 0, err
+	}
+	if len(items) == 0 {
+		return 0, nil
+	}
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	batch := p.db.NewBatch()
+	defer batch.Close()
+
+	written := 0
+	now := time.Now().Unix()
+	for _, it := range items {
+		if it.URL == "" {
+			continue
+		}
+		lane := it.Lane
+		if lane >= laneCount {
+			lane = LaneDiscovered
+		}
+		// Dedup against existing frontier rows. One Get per URL is the
+		// price of INSERT-OR-IGNORE; cheaper than the post-hoc batch
+		// reconciliation a true bulk-load would need.
+		if _, closer, err := p.db.Get(frontierKey(it.URL)); err == nil {
+			_ = closer.Close()
+			continue
+		} else if !errors.Is(err, pebble.ErrNotFound) {
+			return written, err
+		}
+		host := extractHost(it.URL)
+		entry := frontierEntry{
+			Status:     FrontierStatusQueued,
+			Depth:      int64(it.Depth),
+			Priority:   it.Priority,
+			EnqueuedAt: now,
+			Host:       host,
+			Lane:       lane,
+		}
+		if err := batch.Set(frontierKey(it.URL), packFrontierEntry(entry), nil); err != nil {
+			return written, err
+		}
+		if err := batch.Set(frontierStatusIndexKeyLane('q', lane, host, it.URL), nil, nil); err != nil {
+			return written, err
+		}
+		written++
+	}
+	if written == 0 {
+		return 0, nil
+	}
+	if err := batch.Commit(p.writeOpts); err != nil {
+		return 0, err
+	}
+	return written, nil
+}
+
 // PushFrontierLane inserts a URL into a specific lane. INSERT-OR-IGNORE:
 // if the URL already exists in any state (including legacy pre-lane
 // entries), this is a no-op. Writes the lane-aware 'f'+'q'+lane secondary
@@ -2022,6 +2101,158 @@ func (p *PebbleStore) GetFrontierStats(ctx context.Context) (FrontierStats, erro
 	return s, nil
 }
 
+// DemoteHostToLane walks every queued URL for the given host (across
+// both legacy and lane-aware indexes) and rewrites it to the target
+// lane. Used to clear host-fair scheduling bottlenecks where one host
+// (cloud.google.com had 2.8M legacy URLs on the GH200, blocking 65% of
+// the queue with its slow JS-rendered pages) hogs claim slots from
+// fresher content. Re-keys atomically in 1024-URL batches.
+//
+// Returns the count of URLs moved. Safe to re-run — already-demoted
+// URLs (already in target lane) are skipped.
+func (p *PebbleStore) DemoteHostToLane(ctx context.Context, host string, lane byte) (int, error) {
+	if err := ctx.Err(); err != nil {
+		return 0, err
+	}
+	if lane >= laneCount {
+		return 0, fmt.Errorf("DemoteHostToLane: lane %d out of range 0..%d", lane, laneCount-1)
+	}
+	if host == "" {
+		return 0, fmt.Errorf("DemoteHostToLane: empty host")
+	}
+
+	const batchSize = 1024
+	moved := 0
+	// Walk both the legacy 'q'+host+... and lane-aware 'q'+lane+host+...
+	// keyspaces. For each, collect URLs first (so we don't iterate while
+	// mutating) then re-key in batches.
+	collect := func(legacy bool, srcLane byte) ([]string, error) {
+		var lo, hi []byte
+		if legacy {
+			lo = make([]byte, 2+len(host)+1)
+			lo[0] = famFrontier
+			lo[1] = 'q'
+			copy(lo[2:], host)
+			lo[2+len(host)] = 0x00
+			hi = make([]byte, len(lo))
+			copy(hi, lo)
+			hi[len(hi)-1] = 0x01
+		} else {
+			lo = make([]byte, 3+len(host)+1)
+			lo[0] = famFrontier
+			lo[1] = 'q'
+			lo[2] = srcLane
+			copy(lo[3:], host)
+			lo[3+len(host)] = 0x00
+			hi = make([]byte, len(lo))
+			copy(hi, lo)
+			hi[len(hi)-1] = 0x01
+		}
+		it, err := p.db.NewIter(&pebble.IterOptions{LowerBound: lo, UpperBound: hi})
+		if err != nil {
+			return nil, err
+		}
+		defer it.Close()
+		var urls []string
+		urlOffset := len(lo)
+		for valid := it.First(); valid; valid = it.Next() {
+			k := it.Key()
+			if len(k) <= urlOffset {
+				continue
+			}
+			urls = append(urls, string(k[urlOffset:]))
+		}
+		return urls, nil
+	}
+
+	rekeyBatch := func(urls []string, sourceLegacy bool, srcLane byte) error {
+		p.mu.Lock()
+		defer p.mu.Unlock()
+		batch := p.db.NewBatch()
+		defer batch.Close()
+		for _, u := range urls {
+			val, closer, err := p.db.Get(frontierKey(u))
+			if errors.Is(err, pebble.ErrNotFound) {
+				continue
+			}
+			if err != nil {
+				return err
+			}
+			entry, uerr := unpackFrontierEntry(val)
+			_ = closer.Close()
+			if uerr != nil {
+				return uerr
+			}
+			// Skip URLs whose status changed under us (e.g. claimed by a
+			// worker between collect and rekey). Only Queued rows have
+			// secondary 'q' entries to swap.
+			if entry.Status != FrontierStatusQueued {
+				continue
+			}
+			if entry.Lane == lane {
+				continue
+			}
+			// Delete the OLD secondary key (legacy or lane-aware as appropriate).
+			if sourceLegacy {
+				if err := batch.Delete(frontierStatusIndexKey('q', host, u), nil); err != nil {
+					return err
+				}
+			} else {
+				if err := batch.Delete(frontierStatusIndexKeyLane('q', srcLane, host, u), nil); err != nil {
+					return err
+				}
+			}
+			// Insert the NEW lane-aware secondary key.
+			if err := batch.Set(frontierStatusIndexKeyLane('q', lane, host, u), nil, nil); err != nil {
+				return err
+			}
+			// Update the primary entry value with the new lane.
+			entry.Lane = lane
+			if err := batch.Set(frontierKey(u), packFrontierEntry(entry), nil); err != nil {
+				return err
+			}
+			moved++
+		}
+		return batch.Commit(p.writeOpts)
+	}
+
+	// Legacy sweep.
+	urls, err := collect(true, 0)
+	if err != nil {
+		return moved, err
+	}
+	for i := 0; i < len(urls); i += batchSize {
+		end := i + batchSize
+		if end > len(urls) {
+			end = len(urls)
+		}
+		if err := rekeyBatch(urls[i:end], true, 0); err != nil {
+			return moved, err
+		}
+	}
+
+	// Lane-aware sweep across every source lane EXCEPT the target.
+	for sl := byte(0); sl < laneCount; sl++ {
+		if sl == lane {
+			continue
+		}
+		urls, err := collect(false, sl)
+		if err != nil {
+			return moved, err
+		}
+		for i := 0; i < len(urls); i += batchSize {
+			end := i + batchSize
+			if end > len(urls) {
+				end = len(urls)
+			}
+			if err := rekeyBatch(urls[i:end], false, sl); err != nil {
+				return moved, err
+			}
+		}
+	}
+	return moved, nil
+}
+
 // LaneStats is a per-lane queued/in_flight breakdown of the frontier.
 // LegacyQueued/LegacyInFlight count entries written before lanes shipped
 // (their secondary keys lack a lane byte); they're drained as a fall-through
diff --git a/internal/store/store.go b/internal/store/store.go
index 2a93225..236721f 100644
--- a/internal/store/store.go
+++ b/internal/store/store.go
@@ -1180,6 +1180,21 @@ func (s *Store) PushFrontierLane(ctx context.Context, url string, depth int, _ b
 	return s.PushFrontier(ctx, url, depth, priority)
 }
 
+// PushFrontierBatch on the SQLite backend loops PushFrontier — there's
+// no single-write-acquire equivalent without a SQL transaction wrapper,
+// and the SQLite backend is only run in test fixtures where this path
+// isn't hot.
+func (s *Store) PushFrontierBatch(ctx context.Context, items []FrontierPushItem) (int, error) {
+	n := 0
+	for _, it := range items {
+		if err := s.PushFrontier(ctx, it.URL, it.Depth, it.Priority); err != nil {
+			return n, err
+		}
+		n++
+	}
+	return n, nil
+}
+
 func (s *Store) PushFrontier(ctx context.Context, url string, depth int, priority float64) error {
 	const q = `INSERT OR IGNORE INTO frontier (url, depth, priority, enqueued_at, host) VALUES (?, ?, ?, ?, ?);`
 	_, err := s.db.ExecContext(ctx, q, url, depth, priority, time.Now().Unix(), extractHost(url))

From 32f1047fe3ed6ffd5faa14eb399a308cc16949da Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Sun, 14 Jun 2026 11:49:59 +0300
Subject: [PATCH 03/10] feat(crawler): decoupled embed pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Optional embed worker pool drains a buffered channel separate from the
crawl-worker loop. Enabled when COSIFT_EMBED_DECOUPLE_WORKERS > 0:

  Crawler worker:  fetch → parse → UpsertDocument → IndexDocument →
                   push embedJob → claim next URL  (returns immediately)

  Embed worker:    embedJob → embedder.Embed → UpsertPassageBatch
                   (or per-chunk fallback when batch unavailable)

Pre-decouple, each crawler worker held onto a URL for fetch + parse +
BM25 + (Embed network call + HNSW writes for N chunks). With 512
workers contending on p.mu and the HNSW write lock, the synchronous
embed leg dominated per-cycle latency.

Bounded send (8K-default buffer): if the embed pool falls behind, the
hot path increments embedDropped and continues. The dropped docs land
in embed-backfill later, which the operator runs anyway. Counters
(embedQueued/Done/Failed/Dropped) logged on shutdown so we can verify
the pool kept up.

Closes the embed channel only after crawl workers exit so no producer
races a closed channel. Zombie-reclaim and per-host overrides preserved.
---
 internal/crawler/crawler.go | 142 +++++++++++++++++++++++++++++++++++-
 1 file changed, 141 insertions(+), 1 deletion(-)

diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index 56a6d6d..ef70913 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -68,12 +68,37 @@ type Crawler struct {
 	// an int read on the hot path; not atomic — diagnostic only).
 	zombieDebugLogged int
 
-	// per-host error-rate tracking via sync.Map + atomic
+// per-host error-rate tracking via sync.Map + atomic
 	// counters. With 512 workers we cannot afford a single write lock
 	// on every claim's completion — that bottlenecked and cost
 	// ~25% throughput. sync.Map.LoadOrStore lets us avoid the lock on
 	// the steady-state path (host already in map).
 	hostStats sync.Map // host (string) → *hostFetchStats
+
+	// Decoupled embed pipeline. When non-nil, crawler workers push
+	// (docID, chunks, texts) onto embedQ after UpsertDocument +
+	// IndexDocument and immediately claim the next URL — embedding
+	// + HNSW writes happen in a separate worker pool. Cuts per-doc
+	// crawler-worker cycle time from ~85s (mu contention + synchronous
+	// embed + HNSW lock waits) to fetch+parse+BM25 only.
+	//
+	// Activated when COSIFT_EMBED_DECOUPLE_WORKERS > 0. Bounded buffer
+	// keeps memory predictable; non-blocking send means a slow embedder
+	// can't stall the crawl (dropped jobs land in embed-backfill later,
+	// which the operator runs anyway).
+	embedQ         chan *embedJob
+	embedQueued    atomic.Int64
+	embedDropped   atomic.Int64
+	embedDone      atomic.Int64
+	embedFailed    atomic.Int64
+}
+
+// embedJob is one unit of work for the embed worker pool.
+type embedJob struct {
+	url    string
+	docID  int64
+	chunks []index.Chunk
+	texts  []string
 }
 
 type hostFetchStats struct {
@@ -393,6 +418,26 @@ func (c *Crawler) Run(ctx context.Context) error {
 	runCtx, cancel := context.WithCancel(ctx)
 	defer cancel()
 
+	// Embed worker pool: when COSIFT_EMBED_DECOUPLE_WORKERS > 0 and we
+	// have an embedder + passageWriter wired, spin up the pool BEFORE
+	// crawler workers so the hot-path decoupled branch (processClaimed)
+	// has a non-nil c.embedQ to push onto. Each embed worker is a
+	// dedicated goroutine doing Embed → UpsertPassage[Batch], freeing
+	// crawl workers from synchronous embed + HNSW write latency.
+	var embedWG sync.WaitGroup
+	if c.embedder != nil && c.passageWriter != nil {
+		embedWorkers := envIntCrawler("COSIFT_EMBED_DECOUPLE_WORKERS", 0)
+		if embedWorkers > 0 {
+			bufSize := envIntCrawler("COSIFT_EMBED_DECOUPLE_BUFFER", 4096)
+			c.embedQ = make(chan *embedJob, bufSize)
+			for i := 0; i < embedWorkers; i++ {
+				embedWG.Add(1)
+				go c.embedWorker(runCtx, &embedWG)
+			}
+			log.Printf("crawler: embed decouple ON (%d workers, %d-buf)", embedWorkers, bufSize)
+		}
+	}
+
 	var wg sync.WaitGroup
 	for i := 0; i < workers; i++ {
 		wg.Add(1)
@@ -410,9 +455,93 @@ func (c *Crawler) Run(ctx context.Context) error {
 	}
 
 	wg.Wait()
+	// Close embedQ AFTER crawl workers have exited (no more producers),
+	// then wait for embed workers to drain. Otherwise an early close
+	// would race a still-running crawl worker's send and panic.
+	if c.embedQ != nil {
+		close(c.embedQ)
+		embedWG.Wait()
+		log.Printf("crawler: embed pool drained — queued=%d done=%d failed=%d dropped=%d",
+			c.embedQueued.Load(), c.embedDone.Load(), c.embedFailed.Load(), c.embedDropped.Load())
+	}
 	return nil
 }
 
+// envIntCrawler reads an integer env var, falling back to def on parse
+// failure. Used for the embed-decouple knobs so operators can tune
+// without a config-file edit.
+func envIntCrawler(key string, def int) int {
+	v := os.Getenv(key)
+	if v == "" {
+		return def
+	}
+	n, err := strconv.Atoi(v)
+	if err != nil || n < 0 {
+		return def
+	}
+	return n
+}
+
+// embedWorker drains c.embedQ. For each job, embeds the chunk texts and
+// writes passages to the configured PassageWriter. Uses the optional
+// batch writer when available for one HNSW lock per doc instead of
+// one per chunk.
+func (c *Crawler) embedWorker(ctx context.Context, wg *sync.WaitGroup) {
+	defer wg.Done()
+	for job := range c.embedQ {
+		vecs, err := c.embedder.Embed(ctx, job.texts)
+		if err != nil {
+			c.embedFailed.Add(1)
+			log.Printf("embed-decouple %s: %v", job.url, err)
+			continue
+		}
+		if len(vecs) != len(job.chunks) {
+			c.embedFailed.Add(1)
+			continue
+		}
+		// Mirror the synchronous path's zombie reclaim so re-crawled
+		// URLs don't accumulate generations of vectors in HNSW.
+		if os.Getenv("COSIFT_ZOMBIE_RECLAIM") == "1" {
+			if inv, ok := c.passageWriter.(URLInvalidator); ok {
+				_, _ = inv.MarkURLInvalid(ctx, job.url)
+			}
+		}
+		// Prefer the batch interface (single HNSW lock for the whole
+		// doc) over per-chunk writes.
+		if bw, ok := c.passageWriter.(PassageWriterBatch); ok {
+			ps := make([]*store.Passage, len(job.chunks))
+			for i, ch := range job.chunks {
+				ps[i] = &store.Passage{
+					DocID:     job.docID,
+					Offset:    ch.Offset,
+					Length:    ch.Length,
+					Model:     c.embedder.Model(),
+					Embedding: vecs[i],
+				}
+			}
+			if err := bw.UpsertPassageBatch(ctx, ps); err != nil {
+				c.embedFailed.Add(1)
+				log.Printf("embed-decouple batch %s: %v", job.url, err)
+				continue
+			}
+		} else {
+			for i, ch := range job.chunks {
+				p := &store.Passage{
+					DocID:     job.docID,
+					Offset:    ch.Offset,
+					Length:    ch.Length,
+					Model:     c.embedder.Model(),
+					Embedding: vecs[i],
+				}
+				if err := c.passageWriter.UpsertPassage(ctx, p); err != nil {
+					log.Printf("embed-decouple passage %s offset=%d: %v", job.url, ch.Offset, err)
+				}
+			}
+		}
+		c.embedDone.Add(1)
+	}
+}
+
 // statusDumper writes a JSON snapshot of crawl progress every 10s to path.
 // Cheap: just reads the running counters from the store.
 // Stops when ctx is cancelled.
@@ -837,6 +966,17 @@ func (c *Crawler) processClaimed(ctx context.Context, item store.FrontierItem, g
 			for i, ch := range chunks {
 				texts[i] = truncateForEmbed(ch.Text, tokenCap)
 			}
+			if c.embedQ != nil {
+				job := &embedJob{url: item.URL, docID: id, chunks: chunks, texts: texts}
+				select {
+				case c.embedQ <- job:
+					c.embedQueued.Add(1)
+				default:
+					c.embedDropped.Add(1)
+				}
+				c.enqueueLinks(ctx, parsed.Links, item.Depth+1)
+				return nil
+			}
 			vecs, embErr := c.embedder.Embed(ctx, texts)
 			if embErr != nil {
 				log.Printf("embed %s: %v", item.URL, embErr)

From ff9ba0175c825dbba160da9f497bae25fea96ecb Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Sun, 14 Jun 2026 12:28:57 +0300
Subject: [PATCH 04/10] fix(frontier): RecoverInFlight rebuilt lane-aware
 indexes; PurgeStaleInFlight
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RecoverInFlight predates lanes — at every restart it deleted only the
LEGACY 'i' key and re-queued under the LEGACY 'q' key. Two consequences
that took a session to spot:

1. Stale lane-aware 'i' keys leaked one set per restart, eventually
   pushing GetLaneStats's in_flight count above max_concurrent (saw
   lane 1 if=891 with cap=512).
2. URLs that lived in lane 1/2/3 silently reverted to the legacy
   queue on every recovery, so the lane infrastructure's gains
   melted away across restarts.

Recovery now: blind-deletes both legacy and lane-aware 'i' keys (mirrors
transitionFrontier), then re-queues at the entry's own Lane so recovered
work stays in its priority class.

PurgeStaleInFlight + POST /admin/frontier-purge-stale-inflight is the
one-shot sweep for pre-fix leftovers: walks all 'f'+'i'+... keys and
drops any without a matching primary in InFlight. Ran on GH200 after
deploy — purged 783 keys, lane 1 in_flight dropped from 891 → 239.

Also adds COSIFT_EMBED_DECOUPLE_WORKERS / _BUFFER plumbing (Crawler
embed pool + buffered channel) — committed in a prior change but the
recovery bug was making it look like a regression. Live testing on the
clean indexes is the right way to actually measure its impact.
---
 cmd/cosift/pebble_serve.go | 30 +++++++++++++
 internal/store/pebble.go   | 90 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/cmd/cosift/pebble_serve.go b/cmd/cosift/pebble_serve.go
index be8e32b..0e3e93e 100644
--- a/cmd/cosift/pebble_serve.go
+++ b/cmd/cosift/pebble_serve.go
@@ -494,6 +494,7 @@ func runPebbleServe(ctx context.Context, cfg *config.Config, args []string) erro
 	mux.HandleFunc("POST /admin/frontier-purge-host", wrap(srv.handleFrontierPurgeHost))
 	mux.HandleFunc("POST /admin/frontier-clear", wrap(srv.handleFrontierClear))
 	mux.HandleFunc("POST /admin/frontier-demote-host", wrap(srv.handleFrontierDemoteHost))
+	mux.HandleFunc("POST /admin/frontier-purge-stale-inflight", wrap(srv.handleFrontierPurgeStaleInFlight))
 	mux.HandleFunc("POST /admin/rss-import", wrap(srv.handleRSSImport))
 	mux.HandleFunc("POST /admin/crawl-now", wrap(srv.handleCrawlNow))
 	mux.HandleFunc("POST /admin/wet-import", wrap(srv.handleWETImport))
@@ -2851,6 +2852,35 @@ func (s *pebbleHTTP) handleFrontierDemoteHost(w http.ResponseWriter, r *http.Req
 	})
 }
 
+// handleFrontierPurgeStaleInFlight clears the stale 'i' secondary keys
+// left over from the pre-fix RecoverInFlight bug. Pre-fix, every restart
+// re-queued in-flight URLs via the LEGACY 'q' index only and skipped the
+// lane-aware 'i' delete, so each restart leaked the URL's lane-aware 'i'
+// key. GetLaneStats then reported impossibly-high in_flight counts
+// (>max_concurrent). Idempotent — re-running is a no-op once clean.
+func (s *pebbleHTTP) handleFrontierPurgeStaleInFlight(w http.ResponseWriter, r *http.Request) {
+	if want := s.cluster.PeerAuthToken; want != "" {
+		got := strings.TrimPrefix(r.Header.Get("Authorization"), "Bearer ")
+		if got != want {
+			writeProblem(w, http.StatusUnauthorized, "missing or invalid admin token")
+			return
+		}
+	}
+	ps, ok := any(s.store).(*store.PebbleStore)
+	if !ok {
+		writeProblem(w, http.StatusNotImplemented, "PebbleStore-only")
+		return
+	}
+	t0 := time.Now()
+	n, err := ps.PurgeStaleInFlight(r.Context())
+	if err != nil {
+		writeProblem(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	log.Printf("frontier-purge-stale-inflight: purged %d keys in %s", n, time.Since(t0).Round(time.Millisecond))
+	writeJSON(w, http.StatusOK, map[string]any{"purged": n, "elapsed": time.Since(t0).String()})
+}
+
 // handleRSSImport fetches an RSS 2.0 or Atom feed and pushes every <item>/
 // <entry> link to the live frontier. Same auth shape as sitemap-import.
 // Designed to be cron-friendly: idempotent against the frontier (re-seeding
diff --git a/internal/store/pebble.go b/internal/store/pebble.go
index 15c1d99..551b9a6 100644
--- a/internal/store/pebble.go
+++ b/internal/store/pebble.go
@@ -2416,17 +2416,103 @@ func (p *PebbleStore) RecoverInFlight(ctx context.Context) error {
 		if err := batch.Set(key, packFrontierEntry(entry), nil); err != nil {
 			return err
 		}
-		// rebuild secondary indexes for the transition.
+		// rebuild secondary indexes for the transition. Blind-delete BOTH
+		// formats so stale 'i' keys from prior code revisions (or from a
+		// crash that landed mid-transition) are cleaned up — without this
+		// the lane-aware 'i' index leaked across restarts and GetLaneStats
+		// reported impossibly-high in_flight counts.
 		if err := batch.Delete(frontierStatusIndexKey('i', entry.Host, url), nil); err != nil {
 			return err
 		}
-		if err := batch.Set(frontierStatusIndexKey('q', entry.Host, url), nil, nil); err != nil {
+		if err := batch.Delete(frontierStatusIndexKeyLane('i', entry.Lane, entry.Host, url), nil); err != nil {
+			return err
+		}
+		// Re-queue in the lane that the entry already belongs to — keeps
+		// recovered work in its original priority class instead of all
+		// reverting to the legacy fallback (which was the silent
+		// regression on every restart before this fix).
+		if err := batch.Set(frontierStatusIndexKeyLane('q', entry.Lane, entry.Host, url), nil, nil); err != nil {
 			return err
 		}
 	}
 	return batch.Commit(p.writeOpts)
 }
 
+// PurgeStaleInFlight scans both legacy and lane-aware 'f'+'i'+... keys
+// and drops any without a matching primary entry in InFlight status. Use
+// after a code upgrade that fixed an 'i'-cleanup bug: the new code will
+// no longer leak keys, but pre-fix leftovers remain until this sweep.
+// Cheap key-only iteration; returns the count purged.
+func (p *PebbleStore) PurgeStaleInFlight(ctx context.Context) (int, error) {
+	if err := ctx.Err(); err != nil {
+		return 0, err
+	}
+	p.mu.Lock()
+	defer p.mu.Unlock()
+	it, err := p.db.NewIter(&pebble.IterOptions{
+		LowerBound: []byte{famFrontier, 'i'},
+		UpperBound: []byte{famFrontier, 'i' + 1},
+	})
+	if err != nil {
+		return 0, err
+	}
+	defer it.Close()
+	batch := p.db.NewBatch()
+	defer batch.Close()
+	purged := 0
+	for valid := it.First(); valid; valid = it.Next() {
+		k := it.Key()
+		if len(k) < 3 {
+			continue
+		}
+		var host, url string
+		if k[2] < laneCount {
+			host, _ = frontierStatusIndexHostLane(k)
+			urlOffset := 3 + len(host) + 1
+			if urlOffset > len(k) {
+				continue
+			}
+			url = string(k[urlOffset:])
+		} else {
+			host = frontierStatusIndexHost(k)
+			urlOffset := 2 + len(host) + 1
+			if urlOffset > len(k) {
+				continue
+			}
+			url = string(k[urlOffset:])
+		}
+		// Look up the primary; if missing OR not InFlight, the secondary
+		// key is stale.
+		val, closer, gerr := p.db.Get(frontierKey(url))
+		if errors.Is(gerr, pebble.ErrNotFound) {
+			keyCopy := append([]byte{}, k...)
+			if err := batch.Delete(keyCopy, nil); err != nil {
+				return purged, err
+			}
+			purged++
+			continue
+		}
+		if gerr != nil {
+			return purged, gerr
+		}
+		entry, uerr := unpackFrontierEntry(val)
+		_ = closer.Close()
+		if uerr != nil || entry.Status != FrontierStatusInFlight {
+			keyCopy := append([]byte{}, k...)
+			if err := batch.Delete(keyCopy, nil); err != nil {
+				return purged, err
+			}
+			purged++
+		}
+	}
+	if purged > 0 {
+		if err := batch.Commit(p.writeOpts); err != nil {
+			return purged, err
+		}
+	}
+	return purged, nil
+}
+
 // readDocTermsLocked reads the 'g' family entry for docID under p.mu.
 // Returns an empty slice with no error when no prior entry exists.
 func (p *PebbleStore) readDocTermsLocked(docID int64) ([]int64, error) {

From 4e80e1d4e029ecf0df8d036561935b0bd1638d14 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Sun, 14 Jun 2026 12:59:08 +0300
Subject: [PATCH 05/10] perf(crawler): combined
 UpsertDocument+IndexDocument+CompleteFrontier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hot path was taking p.mu THREE times per finished doc — Upsert,
Index, Complete — each one queueing 512 workers in a single global lock
that took 5-15ms per round-trip. At sustained crawl load that's a
synchronous bottleneck no amount of worker concurrency could break.

PebbleStore.WriteCrawlResult folds all three operations into ONE mu
acquire + ONE batch commit:
  - Tokenize runs OUTSIDE the lock (CPU-parallel, no shared state)
  - Inside the lock: ID resolution, BM25 postings prep, frontier
    in_flight→Done transition
  - Single batch.Commit at the end

CrawlResultWriter interface is optional: stores that don't implement
it (SQLite, mocks) fall back to the three-call legacy path
automatically. PebbleStore satisfies it; in-serve crawler picks it up
via type assertion in processClaimed.

To avoid a redundant CompleteFrontier in the worker loop after
WriteCrawlResult already did it, processClaimed marks the URL in a
small completedInlineSet; the worker loop consumes-and-deletes the
marker before deciding whether to call its own Complete. Single
sync.Map operation per cycle — far cheaper than the mu round-trip
this replaces.

Expected effect: per-worker cycle time should drop by ~50% (mu hops
were ~60% of the per-cycle non-network time per pprof), letting the
existing 512-worker cap translate into proportionally higher doc/min
throughput.
---
 internal/crawler/crawler.go     |  54 +++++++-
 internal/crawler/store_iface.go |  15 +++
 internal/store/pebble.go        | 216 ++++++++++++++++++++++++++++++++
 3 files changed, 279 insertions(+), 6 deletions(-)

diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index ef70913..97be568 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -91,6 +91,24 @@ type Crawler struct {
 	embedDropped   atomic.Int64
 	embedDone      atomic.Int64
 	embedFailed    atomic.Int64
+
+	// URLs whose frontier transition happened inside processClaimed via
+	// WriteCrawlResult. The worker checks this set before its own
+	// CompleteFrontier call so we don't pay a redundant mu hop.
+	completedInlineSet sync.Map // url (string) → struct{}
+}
+
+// markCompletedInline records that the URL's frontier transition
+// happened inside processClaimed (via WriteCrawlResult).
+func (c *Crawler) markCompletedInline(url string) {
+	c.completedInlineSet.Store(url, struct{}{})
+}
+
+// takeCompletedInline returns true and clears the marker if the URL was
+// completed inline. Returns false otherwise.
+func (c *Crawler) takeCompletedInline(url string) bool {
+	_, ok := c.completedInlineSet.LoadAndDelete(url)
+	return ok
 }
 
 // embedJob is one unit of work for the embed worker pool.
@@ -658,7 +676,9 @@ func (c *Crawler) worker(ctx context.Context, wg *sync.WaitGroup, gate *hostGate
 			_ = c.store.FailFrontier(ctx, item.URL, err.Error())
 			continue
 		}
-		_ = c.store.CompleteFrontier(ctx, item.URL)
+		if !c.takeCompletedInline(item.URL) {
+			_ = c.store.CompleteFrontier(ctx, item.URL)
+		}
 	}
 }
 
@@ -929,12 +949,34 @@ func (c *Crawler) processClaimed(ctx context.Context, item store.FrontierItem, g
 		Image:         parsed.Image,       // og:image / twitter:image / JSON-LD image (empty if absent)
 		Favicon:       parsed.Favicon,     // <link rel="icon"> resolved absolute (empty if absent)
 	}
-	id, err := c.store.UpsertDocument(ctx, doc)
-	if err != nil {
-		return err
+	// Prefer the combined-write path when the store supports it: one mu
+	// hop covers Upsert+Index+Complete instead of three separate calls.
+	// Marks frontier Done inline so the worker skips its own
+	// CompleteFrontier call (signalled via c.completedInline pulled off
+	// the item-scoped flag below).
+	var id int64
+	var completedInline bool
+	if w, ok := c.store.(CrawlResultWriter); ok {
+		var err error
+		id, err = w.WriteCrawlResult(ctx, doc, parsed.Title, parsed.Text, item.URL, index.Tokenize, index.TitleBoost)
+		if err != nil {
+			return err
+		}
+		completedInline = true
+	} else {
+		var err error
+		id, err = c.store.UpsertDocument(ctx, doc)
+		if err != nil {
+			return err
+		}
+		if err := c.idx.IndexDocument(ctx, id, parsed.Title, parsed.Text); err != nil {
+			return err
+		}
 	}
-	if err := c.idx.IndexDocument(ctx, id, parsed.Title, parsed.Text); err != nil {
-		return err
+	// Stash on the context-bound item so the worker loop can skip its
+	// own CompleteFrontier call when WriteCrawlResult already did it.
+	if completedInline {
+		c.markCompletedInline(item.URL)
 	}
 
 	// Dense indexing — optional, non-fatal. Multi-passage: chunk into ~512-token
diff --git a/internal/crawler/store_iface.go b/internal/crawler/store_iface.go
index b79abff..984b377 100644
--- a/internal/crawler/store_iface.go
+++ b/internal/crawler/store_iface.go
@@ -40,6 +40,21 @@ type CrawlerStore interface {
 	GetDocByURL(ctx context.Context, url string) (*store.Document, error)
 }
 
+// CrawlResultWriter folds UpsertDocument + IndexDocument +
+// CompleteFrontier into a SINGLE mu acquire + SINGLE batch commit. When
+// the store satisfies this interface, the crawler hot path uses it to
+// shave 2/3 of the per-doc lock-queue waits. Optional: stores that don't
+// implement it fall back to the legacy three-call path automatically.
+type CrawlResultWriter interface {
+	WriteCrawlResult(
+		ctx context.Context,
+		d *store.Document,
+		title, text, completeURL string,
+		tokenize func(string) []string,
+		titleBoost int,
+	) (int64, error)
+}
+
 // LexicalIndexer abstracts the BM25 writer. Both *index.BM25 (SQLite) and
 // *index.PebbleBM25 satisfy the single-method signature.
 type LexicalIndexer interface {
diff --git a/internal/store/pebble.go b/internal/store/pebble.go
index 551b9a6..a8b32a0 100644
--- a/internal/store/pebble.go
+++ b/internal/store/pebble.go
@@ -1258,6 +1258,222 @@ func (p *PebbleStore) IndexDocument(ctx context.Context, docID int64, title, tex
 	return nil
 }
 
+// WriteCrawlResult combines UpsertDocument + IndexDocument + CompleteFrontier
+// into a SINGLE p.mu acquisition + SINGLE batch commit. This is the fix for
+// the per-doc serialization wall: with 512 crawler workers contending on
+// p.mu, three separate writes per doc means three lock-queue waits per
+// cycle. Folded together, each finished crawl costs ONE mu hop, slashing
+// queue depth at the lock by 3x.
+//
+// Tokenization runs OUTSIDE the lock (CPU-only, no shared state). The
+// term-info / prior-doc-terms reads stay inside because they depend on
+// term IDs allocated under the lock; moving them out would race the ID
+// allocator on cold terms.
+//
+// If completeURL is empty the frontier transition step is skipped — lets
+// the same path serve ingest flows (WET, JSONL) that have no frontier
+// row to clear.
+func (p *PebbleStore) WriteCrawlResult(
+	ctx context.Context,
+	d *Document,
+	title, text, completeURL string,
+	tokenize func(string) []string,
+	titleBoost int,
+) (int64, error) {
+	if err := ctx.Err(); err != nil {
+		return 0, err
+	}
+	if d == nil || d.URL == "" {
+		return 0, errors.New("PebbleStore.WriteCrawlResult: nil doc or empty URL")
+	}
+	if titleBoost <= 0 {
+		titleBoost = 1
+	}
+
+	// Tokenize before acquiring the lock so 300+ workers can run their
+	// CPU-bound tokenization in parallel instead of queuing on mu.
+	titleTokens := tokenize(title)
+	bodyTokens := tokenize(text)
+	tf := make(map[string]int, len(titleTokens)+len(bodyTokens))
+	for _, t := range titleTokens {
+		tf[t] += titleBoost
+	}
+	for _, t := range bodyTokens {
+		tf[t]++
+	}
+	docLen := len(titleTokens) + len(bodyTokens)
+
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// ---- Upsert phase ----
+	var id int64
+	var isNew bool
+	if existingID, ok, err := p.lookupIDByURL(d.URL); err != nil {
+		return 0, err
+	} else if ok {
+		id = existingID
+	} else {
+		id = p.nextID.Add(1)
+		isNew = true
+	}
+	d.ID = id
+	if isNew && os.Getenv("COSIFT_DEBUG_UPSERT") == "1" {
+		fmt.Fprintf(os.Stderr, "upsert-new: id=%d url=%s\n", id, d.URL)
+	}
+
+	var docBuf bytes.Buffer
+	if err := gob.NewEncoder(&docBuf).Encode(d); err != nil {
+		return 0, fmt.Errorf("encode doc: %w", err)
+	}
+	idBuf := make([]byte, 8)
+	binary.BigEndian.PutUint64(idBuf, uint64(id))
+
+	batch := p.db.NewBatch()
+	defer batch.Close()
+	if err := batch.Set(docKey(id), docBuf.Bytes(), nil); err != nil {
+		return 0, err
+	}
+	if err := batch.Set(urlKey(d.URL), idBuf, nil); err != nil {
+		return 0, err
+	}
+	if err := batch.Set(docMetaKey(id), packDocMeta(d.URL, d.Title), nil); err != nil {
+		return 0, err
+	}
+	if d.Domain != "" {
+		if err := batch.Set(hostKey(d.Domain, id), nil, nil); err != nil {
+			return 0, err
+		}
+	}
+	if isNew {
+		if err := batch.Set(metaKey("next_doc_id"), idBuf, nil); err != nil {
+			return 0, err
+		}
+	}
+
+	// ---- Index phase ----
+	if len(tf) > 0 {
+		lenBuf := make([]byte, 8)
+		binary.BigEndian.PutUint64(lenBuf, uint64(docLen))
+
+		oldLen, hadOld, err := p.readDocLenLocked(id)
+		if err != nil {
+			return 0, err
+		}
+		var sumLen, indexedCount int64
+		if p.corpusStatsLoaded.Load() {
+			sumLen = p.corpusSumLen.Load()
+			indexedCount = p.corpusIndexedDocs.Load()
+		} else {
+			sumLen = p.readMetaInt64Locked("sum_doc_len")
+			indexedCount = p.readMetaInt64Locked("indexed_docs")
+		}
+		if hadOld {
+			sumLen -= oldLen
+		} else {
+			indexedCount++
+		}
+		sumLen += int64(docLen)
+
+		if err := batch.Set(docLenKey(id), lenBuf, nil); err != nil {
+			return 0, err
+		}
+		sumBuf := make([]byte, 8)
+		binary.BigEndian.PutUint64(sumBuf, uint64(sumLen))
+		if err := batch.Set(metaKey("sum_doc_len"), sumBuf, nil); err != nil {
+			return 0, err
+		}
+		countBuf := make([]byte, 8)
+		binary.BigEndian.PutUint64(countBuf, uint64(indexedCount))
+		if err := batch.Set(metaKey("indexed_docs"), countBuf, nil); err != nil {
+			return 0, err
+		}
+
+		oldTermIDs, err := p.readDocTermsLocked(id)
+		if err != nil {
+			return 0, err
+		}
+		oldSet := make(map[int64]struct{}, len(oldTermIDs))
+		for _, tid := range oldTermIDs {
+			oldSet[tid] = struct{}{}
+		}
+		newSet := make(map[int64]struct{}, len(tf))
+
+		for term, freq := range tf {
+			info, ok, err := p.getTermInfoLocked(term)
+			if err != nil {
+				return 0, err
+			}
+			if !ok {
+				info.ID = p.nextTermID()
+				info.DocFreq = 1
+			} else if _, alreadyIn := oldSet[info.ID]; !alreadyIn {
+				info.DocFreq++
+			}
+			newSet[info.ID] = struct{}{}
+			if err := batch.Set(termKey(term), packTermInfo(info), nil); err != nil {
+				return 0, err
+			}
+			pvBuf := make([]byte, 16)
+			binary.BigEndian.PutUint64(pvBuf[0:8], uint64(freq))
+			binary.BigEndian.PutUint64(pvBuf[8:16], uint64(docLen))
+			if err := batch.Set(postingKey(info.ID, id), pvBuf, nil); err != nil {
+				return 0, err
+			}
+		}
+		for oldID := range oldSet {
+			if _, stillPresent := newSet[oldID]; stillPresent {
+				continue
+			}
+			if err := batch.Delete(postingKey(oldID, id), nil); err != nil {
+				return 0, err
+			}
+		}
+		newIDs := make([]int64, 0, len(newSet))
+		for tid := range newSet {
+			newIDs = append(newIDs, tid)
+		}
+		if err := batch.Set(docTermsKey(id), packDocTerms(newIDs), nil); err != nil {
+			return 0, err
+		}
+
+		// Mirror counters AFTER commit succeeds.
+		defer func() {
+			p.corpusSumLen.Store(sumLen)
+			p.corpusIndexedDocs.Store(indexedCount)
+			p.corpusStatsLoaded.Store(true)
+		}()
+	}
+
+	// ---- CompleteFrontier phase ----
+	if completeURL != "" {
+		if val, closer, err := p.db.Get(frontierKey(completeURL)); err == nil {
+			entry, uerr := unpackFrontierEntry(val)
+			_ = closer.Close()
+			if uerr == nil {
+				oldStatus := entry.Status
+				entry.Status = FrontierStatusDone
+				if err := batch.Set(frontierKey(completeURL), packFrontierEntry(entry), nil); err != nil {
+					return 0, err
+				}
+				switch oldStatus {
+				case FrontierStatusQueued:
+					_ = batch.Delete(frontierStatusIndexKey('q', entry.Host, completeURL), nil)
+					_ = batch.Delete(frontierStatusIndexKeyLane('q', entry.Lane, entry.Host, completeURL), nil)
+				case FrontierStatusInFlight:
+					_ = batch.Delete(frontierStatusIndexKey('i', entry.Host, completeURL), nil)
+					_ = batch.Delete(frontierStatusIndexKeyLane('i', entry.Lane, entry.Host, completeURL), nil)
+				}
+			}
+		}
+	}
+
+	if err := batch.Commit(p.writeOpts); err != nil {
+		return 0, fmt.Errorf("WriteCrawlResult commit: %w", err)
+	}
+	return id, nil
+}
+
 // PushFrontier inserts a URL into the queue at LaneDiscovered (the
 // crawler-default lane). Thin wrapper around PushFrontierLane kept for
 // backwards compat with callers (crawler outbound-link discovery) that

From c1f7543dcf15f75d70521e5c67108af0dad2799a Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Sun, 14 Jun 2026 16:36:58 +0300
Subject: [PATCH 06/10] feat(crawler): self-cleaning host sweeper + lane-aware
 purge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PurgeFrontierByHost was lane-blind — it only walked the legacy
'f'+'q'+host+0x00+url index, silently missing the lane-aware
'f'+'q'+lane+host+0x00+url range. On the GH200 this meant the
admin/frontier-purge-host endpoint returned "purged: 291" for
cloud.google.com when 2.8M URLs were actually queued. Fixed: the
purger now walks the legacy range AND every lane's range, so demoted
hosts can actually be purged. Verified live: re-purge of
cloud.google.com after the fix dropped 3,092,546 URLs.

hostSweeperLoop is the new self-cleaning background goroutine — wakes
every 10 min (configurable via COSIFT_HOSTSWEEP_INTERVAL_SEC),
walks the existing hostStats sync.Map, and acts on hosts with
COSIFT_HOSTSWEEP_MIN_ATTEMPTS (default 100) recorded attempts:

  success_rate < COSIFT_HOSTSWEEP_DEAD_RATE (default 0.20)
    → PurgeFrontierByHost + add to autoBlocked sync.Map so future
      link discovery skips the host entirely

  COSIFT_HOSTSWEEP_DEAD_RATE ≤ rate < COSIFT_HOSTSWEEP_WEAK_RATE
  (default 0.50)
    → DemoteHostToLane(LaneBulk) so the host's URLs keep draining
      but at the 5%-weight bulk lane instead of crowding lanes 1/2

Live confirmation: within 10 min of going live, the sweeper detected
448,028 newly-discovered cloud.google.com URLs (success rate 0.21)
and demoted them to lane 3. Eliminates the manual
/admin/frontier-purge-host operator workflow.

Optional surfaces (HostFrontierPurger, HostFrontierDemoter) on the
store interface keep the SQLite legacy backend a no-op for these.
---
 internal/crawler/crawler.go     | 144 ++++++++++++++++++++++++++++++++
 internal/crawler/store_iface.go |  14 ++++
 internal/store/pebble.go        |  99 ++++++++++++++--------
 3 files changed, 221 insertions(+), 36 deletions(-)

diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index 97be568..93db3cd 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -96,6 +96,12 @@ type Crawler struct {
 	// WriteCrawlResult. The worker checks this set before its own
 	// CompleteFrontier call so we don't pay a redundant mu hop.
 	completedInlineSet sync.Map // url (string) → struct{}
+
+	// Auto-blocked hosts: populated by hostSweeperLoop when a host's
+	// success rate falls below the dead threshold. The link-discovery
+	// path (enqueueLinks) consults this set so we don't keep
+	// re-enqueuing the same dead URLs the sweeper just purged.
+	autoBlocked sync.Map // host (string) → struct{}
 }
 
 // markCompletedInline records that the URL's frontier transition
@@ -462,6 +468,10 @@ func (c *Crawler) Run(ctx context.Context) error {
 		go c.worker(runCtx, &wg, gate)
 	}
 	go c.terminator(runCtx, cancel)
+	// Self-cleaning host sweeper: every 10 min (default), purges hosts
+	// with consistently-failing fetches and demotes low-yield ones.
+	// Eliminates the need for manual /admin/frontier-purge-host calls.
+	go c.hostSweeperLoop(runCtx)
 
 	// Pebble's single-writer lock blocks
 	// `cosift stats -backend=pebble` from any sidecar process during a live
@@ -703,6 +713,132 @@ func (c *Crawler) isHostBlacklisted(host string) bool {
 	return float64(succ)/float64(att) < 0.20
 }
 
+// hostSweeperLoop runs in the background, periodically walking hostStats
+// to find dead (success_rate < 20%) and weak (20–50%) hosts. Dead hosts
+// have their frontier entries purged AND get marked permanently
+// blacklisted in autoBlocked so future link discovery skips them. Weak
+// hosts get demoted to lane 3 (bulk, 5% weight) so they keep draining
+// but don't crowd lane 1 / lane 2.
+//
+// Removes the operator's need to manually invoke /admin/frontier-purge-host
+// and /admin/frontier-demote-host: the crawler keeps its own queue clean.
+//
+// Configurable via env:
+//
+//	COSIFT_HOSTSWEEP_INTERVAL_SEC (default 600 = 10 min)
+//	COSIFT_HOSTSWEEP_MIN_ATTEMPTS (default 100)
+//	COSIFT_HOSTSWEEP_DEAD_RATE    (default 0.20 — purge below this)
+//	COSIFT_HOSTSWEEP_WEAK_RATE    (default 0.50 — demote between dead and weak)
+//	COSIFT_HOSTSWEEP_DISABLED     ("1" disables the sweeper entirely)
+func (c *Crawler) hostSweeperLoop(ctx context.Context) {
+	if os.Getenv("COSIFT_HOSTSWEEP_DISABLED") == "1" {
+		return
+	}
+	interval := time.Duration(envIntCrawler("COSIFT_HOSTSWEEP_INTERVAL_SEC", 600)) * time.Second
+	if interval < 30*time.Second {
+		interval = 30 * time.Second
+	}
+	minAttempts := int32(envIntCrawler("COSIFT_HOSTSWEEP_MIN_ATTEMPTS", 100))
+	deadRate := envFloatCrawler("COSIFT_HOSTSWEEP_DEAD_RATE", 0.20)
+	weakRate := envFloatCrawler("COSIFT_HOSTSWEEP_WEAK_RATE", 0.50)
+	purger, hasPurger := c.store.(HostFrontierPurger)
+	demoter, hasDemoter := c.store.(HostFrontierDemoter)
+	log.Printf("crawler: host sweeper ON (interval=%s, min_attempts=%d, dead<%.2f, weak<%.2f, purger=%v, demoter=%v)",
+		interval, minAttempts, deadRate, weakRate, hasPurger, hasDemoter)
+
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+		}
+		c.runHostSweep(ctx, minAttempts, deadRate, weakRate, purger, demoter)
+	}
+}
+
+// runHostSweep is one pass of the host sweeper. Extracted so tests can
+// invoke it deterministically without spinning up the ticker.
+func (c *Crawler) runHostSweep(
+	ctx context.Context,
+	minAttempts int32,
+	deadRate, weakRate float64,
+	purger HostFrontierPurger,
+	demoter HostFrontierDemoter,
+) {
+	type hostJudgement struct {
+		host    string
+		dead    bool
+		attempts int32
+		successes int32
+	}
+	var verdicts []hostJudgement
+	c.hostStats.Range(func(k, v any) bool {
+		host, _ := k.(string)
+		s, _ := v.(*hostFetchStats)
+		if host == "" || s == nil {
+			return true
+		}
+		att := s.attempts.Load()
+		if att < minAttempts {
+			return true
+		}
+		// Skip hosts we already auto-blocked in a prior tick. The block
+		// set is consulted in the link-discovery path so we don't keep
+		// re-enqueuing the same dead URLs.
+		if _, blocked := c.autoBlocked.Load(host); blocked {
+			return true
+		}
+		succ := s.successes.Load()
+		rate := float64(succ) / float64(att)
+		switch {
+		case rate < deadRate:
+			verdicts = append(verdicts, hostJudgement{host: host, dead: true, attempts: att, successes: succ})
+		case rate < weakRate:
+			verdicts = append(verdicts, hostJudgement{host: host, dead: false, attempts: att, successes: succ})
+		}
+		return true
+	})
+	for _, v := range verdicts {
+		rate := float64(v.successes) / float64(v.attempts)
+		if v.dead {
+			if purger != nil {
+				n, err := purger.PurgeFrontierByHost(ctx, v.host)
+				if err != nil {
+					log.Printf("host-sweep: purge %s failed: %v", v.host, err)
+					continue
+				}
+				c.autoBlocked.Store(v.host, struct{}{})
+				log.Printf("host-sweep: PURGED %s (%d urls, %d/%d success_rate=%.2f)", v.host, n, v.successes, v.attempts, rate)
+			}
+		} else {
+			if demoter != nil {
+				n, err := demoter.DemoteHostToLane(ctx, v.host, 3) // LaneBulk
+				if err != nil {
+					log.Printf("host-sweep: demote %s failed: %v", v.host, err)
+					continue
+				}
+				log.Printf("host-sweep: DEMOTED %s to lane 3 (%d urls, success_rate=%.2f)", v.host, n, rate)
+			}
+		}
+	}
+}
+
+// envFloatCrawler reads a float env var, falling back to def on parse
+// failure. Used for the sweeper thresholds (rate values).
+func envFloatCrawler(key string, def float64) float64 {
+	v := os.Getenv(key)
+	if v == "" {
+		return def
+	}
+	f, err := strconv.ParseFloat(v, 64)
+	if err != nil || f < 0 {
+		return def
+	}
+	return f
+}
+
 // recordHostResult updates per-host success/attempt counters. Called from
 // the worker loop after each processClaimed return. lock-free
 // hot path via sync.Map + atomic counter increment. Only the first call
@@ -1156,6 +1292,14 @@ func (c *Crawler) enqueueLinks(ctx context.Context, links []string, depth int) {
 			continue
 		}
 		// per-link depth check against the CHILD's host cap.
+		// Skip any host the host sweeper auto-blocked (high error rate
+		// resulted in PurgeFrontierByHost — re-enqueuing it would just
+		// undo that work).
+		if u2, perr := url.Parse(canon); perr == nil {
+			if _, blocked := c.autoBlocked.Load(u2.Host); blocked {
+				continue
+			}
+		}
 		// A child on a host with override=1 is dropped if depth would exceed 1,
 		// even if the default MaxDepth is much higher (and vice versa).
 		u, err := url.Parse(canon)
diff --git a/internal/crawler/store_iface.go b/internal/crawler/store_iface.go
index 984b377..6b637a6 100644
--- a/internal/crawler/store_iface.go
+++ b/internal/crawler/store_iface.go
@@ -61,6 +61,20 @@ type LexicalIndexer interface {
 	IndexDocument(ctx context.Context, docID int64, title, text string) error
 }
 
+// HostFrontierPurger is the optional surface the in-crawler host sweeper
+// uses to drain dead hosts. Pebble satisfies it; the SQLite legacy path
+// doesn't need it (no auto-sweeper there).
+type HostFrontierPurger interface {
+	PurgeFrontierByHost(ctx context.Context, host string) (int, error)
+}
+
+// HostFrontierDemoter lets the sweeper move low-yield hosts to the
+// bulk lane so they keep consuming worker cycles at lane 3's 5% weight
+// instead of crowding lanes 1/2.
+type HostFrontierDemoter interface {
+	DemoteHostToLane(ctx context.Context, host string, lane byte) (int, error)
+}
+
 // PassageWriter is the optional vector-write surface. *store.Store
 // satisfies it via UpsertPassage; *store.PebbleStore does NOT (Pebble's
 // vector path goes through index.HNSW.AddPassage + periodic Persist —
diff --git a/internal/store/pebble.go b/internal/store/pebble.go
index a8b32a0..bd41e50 100644
--- a/internal/store/pebble.go
+++ b/internal/store/pebble.go
@@ -2230,49 +2230,76 @@ func (p *PebbleStore) PurgeFrontierByHost(ctx context.Context, host string) (int
 	p.mu.Lock()
 	defer p.mu.Unlock()
 
-	// Scan the 'f'+'q'+host+0x00+url secondary index for this host.
-	prefix := make([]byte, 2+len(host)+1)
-	prefix[0] = famFrontier
-	prefix[1] = 'q'
-	copy(prefix[2:], host)
-	prefix[2+len(host)] = 0x00
-	upper := make([]byte, len(prefix))
-	copy(upper, prefix)
-	upper[len(upper)-1] = 0x01 // bump past the 0x00 separator block
-
-	it, err := p.db.NewIter(&pebble.IterOptions{LowerBound: prefix, UpperBound: upper})
-	if err != nil {
-		return 0, err
-	}
-	defer it.Close()
-
 	batch := p.db.NewBatch()
 	defer batch.Close()
 	count := 0
-	for valid := it.First(); valid; valid = it.Next() {
-		key := it.Key()
-		// Recover the URL portion (after the 0x00 separator).
-		urlPart := key[len(prefix):]
-		// Delete the secondary index entry...
-		secCopy := make([]byte, len(key))
-		copy(secCopy, key)
-		if err := batch.Delete(secCopy, nil); err != nil {
-			return count, err
-		}
-		// ...and the primary 'f'+'u'+url entry.
-		if err := batch.Delete(frontierKey(string(urlPart)), nil); err != nil {
-			return count, err
+
+	// purgeRange walks one secondary-index prefix range, deleting both the
+	// secondary entry and its primary 'f'+'u'+url counterpart.
+	// urlOffset is the byte position where the URL starts within each
+	// matching key (after host and 0x00 separator).
+	purgeRange := func(prefix, upper []byte, urlOffset int) error {
+		it, err := p.db.NewIter(&pebble.IterOptions{LowerBound: prefix, UpperBound: upper})
+		if err != nil {
+			return err
 		}
-		count++
-		// Commit in 5k-entry chunks to bound memory.
-		if count%5000 == 0 {
-			if err := batch.Commit(p.writeOpts); err != nil {
-				return count, err
+		defer it.Close()
+		for valid := it.First(); valid; valid = it.Next() {
+			key := it.Key()
+			if len(key) <= urlOffset {
+				continue
+			}
+			urlPart := key[urlOffset:]
+			secCopy := append([]byte{}, key...)
+			if err := batch.Delete(secCopy, nil); err != nil {
+				return err
+			}
+			if err := batch.Delete(frontierKey(string(urlPart)), nil); err != nil {
+				return err
+			}
+			count++
+			if count%5000 == 0 {
+				if err := batch.Commit(p.writeOpts); err != nil {
+					return err
+				}
+				batch.Close()
+				batch = p.db.NewBatch()
 			}
-			batch.Close()
-			batch = p.db.NewBatch()
 		}
+		return nil
 	}
+
+	// 1. Legacy 'f'+'q'+host+0x00+url range (pre-lanes entries).
+	legacyPrefix := make([]byte, 2+len(host)+1)
+	legacyPrefix[0] = famFrontier
+	legacyPrefix[1] = 'q'
+	copy(legacyPrefix[2:], host)
+	legacyPrefix[2+len(host)] = 0x00
+	legacyUpper := append([]byte{}, legacyPrefix...)
+	legacyUpper[len(legacyUpper)-1] = 0x01
+	if err := purgeRange(legacyPrefix, legacyUpper, len(legacyPrefix)); err != nil {
+		return count, err
+	}
+
+	// 2. Lane-aware 'f'+'q'+lane+host+0x00+url range for every lane. This
+	// catches hosts demoted via DemoteHostToLane (which moved the
+	// cloud.google.com 2.8M URL block to lane 3 — the original purge
+	// implementation was lane-blind and silently missed them, returning
+	// "purged: 291" on a host with 2.8M queued entries).
+	for lane := byte(0); lane < laneCount; lane++ {
+		lanePrefix := make([]byte, 3+len(host)+1)
+		lanePrefix[0] = famFrontier
+		lanePrefix[1] = 'q'
+		lanePrefix[2] = lane
+		copy(lanePrefix[3:], host)
+		lanePrefix[3+len(host)] = 0x00
+		laneUpper := append([]byte{}, lanePrefix...)
+		laneUpper[len(laneUpper)-1] = 0x01
+		if err := purgeRange(lanePrefix, laneUpper, len(lanePrefix)); err != nil {
+			return count, err
+		}
+	}
+
 	if err := batch.Commit(p.writeOpts); err != nil {
 		return count, err
 	}

From f1b238d91a8f6f3e7e78b73dfac6b61f918312bf Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 15:17:06 +0000
Subject: [PATCH 07/10] feat(crawler): adult-content filter + purge-adult
 command

Adds an adult/spam classifier (host+TLD match plus >=2 distinct body-term
threshold) gated behind crawler.filter_adult, wired into the crawl pipeline,
plus a purge-adult command to sweep already-indexed adult/spam docs with a
safety gate on the match fraction.
---
 cmd/cosift/main.go                       |   4 +
 cmd/cosift/purge_adult.go                | 157 ++++++++++++++++++++
 internal/adultfilter/adultfilter.go      | 177 +++++++++++++++++++++++
 internal/adultfilter/adultfilter_test.go |  99 +++++++++++++
 internal/config/config.go                |   8 +
 internal/crawler/crawler.go              |   7 +
 internal/store/pebble.go                 | 175 +++++++++++++++++++++-
 internal/store/pebble_test.go            |  48 ++++++
 8 files changed, 669 insertions(+), 6 deletions(-)
 create mode 100644 cmd/cosift/purge_adult.go
 create mode 100644 internal/adultfilter/adultfilter.go
 create mode 100644 internal/adultfilter/adultfilter_test.go

diff --git a/cmd/cosift/main.go b/cmd/cosift/main.go
index da86022..fd813fa 100644
--- a/cmd/cosift/main.go
+++ b/cmd/cosift/main.go
@@ -334,6 +334,10 @@ func run(cfgPath string) error {
 		if err := runDomainAudit(ctx, flag.Args()[1:]); err != nil {
 			return fmt.Errorf("domain-audit: %w", err)
 		}
+	case "purge-adult":
+		if err := runPurgeAdult(ctx, flag.Args()[1:]); err != nil {
+			return fmt.Errorf("purge-adult: %w", err)
+		}
 	case "verify":
 		if err := runVerifyPebble(ctx, cfg, flag.Args()[1:]); err != nil {
 			return fmt.Errorf("verify: %w", err)
diff --git a/cmd/cosift/purge_adult.go b/cmd/cosift/purge_adult.go
new file mode 100644
index 0000000..17a2ca4
--- /dev/null
+++ b/cmd/cosift/purge_adult.go
@@ -0,0 +1,157 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+	"sort"
+
+	"github.com/pilot-protocol/cosift/internal/adultfilter"
+	"github.com/pilot-protocol/cosift/internal/store"
+)
+
+// runPurgeAdult sweeps an offline PebbleStore, classifies every document with
+// the adultfilter (host + lexical signals over URL + title, optionally body),
+// and soft-deletes the adult ones so they vanish from retrieval.
+//
+// DRY RUN BY DEFAULT. Without -apply it only counts and reports — including a
+// histogram of the TLDs and hosts that dominate the matches, so an operator
+// can see exactly what the filter would remove (and spot any TLD worth adding
+// to the classifier's blocklist) before committing. Pass -apply to delete.
+//
+// Soft delete (store.SoftDeleteDocument) leaves the inverted-index postings as
+// orphans — harmless, since retrieval skips any docID whose meta is gone — so
+// the sweep is a handful of point-deletes per doc rather than a full index
+// rewrite. That is what makes it tractable across a multi-million-doc corpus.
+//
+//	cosift purge-adult -dir /data/pebble                # dry run + report
+//	cosift purge-adult -dir /data/pebble -apply         # delete (URL+title)
+//	cosift purge-adult -dir /data/pebble -deep -apply   # also scan body text
+func runPurgeAdult(ctx context.Context, args []string) error {
+	fs := flag.NewFlagSet("purge-adult", flag.ExitOnError)
+	dir := fs.String("dir", "", "PebbleStore directory (required; same dir as pebble-serve -dir)")
+	apply := fs.Bool("apply", false, "actually soft-delete matches (default: dry run, report only)")
+	deep := fs.Bool("deep", false, "fetch full body text per doc for lexical scan (slower, higher recall)")
+	limit := fs.Int("limit", 0, "stop after deleting this many docs (0 = no limit)")
+	topHosts := fs.Int("top-hosts", 25, "how many top offending hosts/TLDs to print in the report")
+	readonly := fs.Bool("readonly", false, "open the store read-only (no lock) — runs alongside a live pebble-serve; forces dry run")
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+	if *dir == "" {
+		return fmt.Errorf("-dir required")
+	}
+	if *readonly && *apply {
+		return fmt.Errorf("-readonly cannot be combined with -apply (read-only opens take no write lock)")
+	}
+
+	var ps *store.PebbleStore
+	var err error
+	if *readonly {
+		ps, err = store.OpenPebbleReadOnly(*dir)
+	} else {
+		ps, err = store.OpenPebble(*dir)
+	}
+	if err != nil {
+		return fmt.Errorf("open store: %w", err)
+	}
+	defer ps.Close()
+
+	_, before, _ := ps.CorpusStats(ctx)
+	mode := "DRY RUN (no deletes)"
+	if *apply {
+		mode = "APPLY (soft-deleting matches)"
+	}
+	fmt.Fprintf(os.Stderr, "purge-adult: %s — scanning %d docs (deep=%v)\n", mode, before, *deep)
+
+	var scanned, matched, deleted int64
+	tldHist := map[string]int64{}
+	hostHist := map[string]int64{}
+	var samples []string
+
+	err = ps.IterDocMeta(ctx, func(docID int64, url, title string) error {
+		scanned++
+		if scanned%500_000 == 0 {
+			fmt.Fprintf(os.Stderr, "purge-adult: scanned %d, matched %d, deleted %d\n", scanned, matched, deleted)
+		}
+
+		body := ""
+		if *deep {
+			if d, e := ps.GetDocByID(ctx, docID); e == nil && d != nil {
+				body = d.Text
+			}
+		}
+		adult, score, reason := adultfilter.Classify(title, body, url)
+		if !adult {
+			return nil
+		}
+		matched++
+		host := hostFromURL(url)
+		hostHist[host]++
+		tldHist["."+tldOfHost(host)]++
+		if len(samples) < 20 {
+			samples = append(samples, fmt.Sprintf("[score=%d %s] %s", score, reason, url))
+		}
+
+		if *apply {
+			ok, derr := ps.SoftDeleteDocument(ctx, docID, url)
+			if derr != nil {
+				return fmt.Errorf("delete doc %d: %w", docID, derr)
+			}
+			if ok {
+				deleted++
+			}
+			if *limit > 0 && deleted >= int64(*limit) {
+				return errStopSweep
+			}
+		}
+		return nil
+	})
+	if err != nil && err != errStopSweep {
+		return fmt.Errorf("sweep: %w", err)
+	}
+
+	_, after, _ := ps.CorpusStats(ctx)
+	fmt.Fprintf(os.Stderr, "\npurge-adult: done — scanned=%d matched=%d deleted=%d\n", scanned, matched, deleted)
+	fmt.Fprintf(os.Stderr, "purge-adult: corpus indexed_docs %d → %d\n", before, after)
+
+	printHist(os.Stderr, "top offending TLDs", tldHist, *topHosts)
+	printHist(os.Stderr, "top offending hosts", hostHist, *topHosts)
+	if len(samples) > 0 {
+		fmt.Fprintln(os.Stderr, "\nsample matches:")
+		for _, s := range samples {
+			fmt.Fprintln(os.Stderr, "  "+s)
+		}
+	}
+	if !*apply && matched > 0 {
+		fmt.Fprintf(os.Stderr, "\npurge-adult: DRY RUN — re-run with -apply to soft-delete the %d matched docs.\n", matched)
+	}
+	return nil
+}
+
+// errStopSweep is the sentinel returned from the IterDocMeta callback to stop
+// early once -limit deletes have been made (not a real error).
+var errStopSweep = fmt.Errorf("purge-adult: stop sweep")
+
+func printHist(w *os.File, label string, h map[string]int64, top int) {
+	if len(h) == 0 {
+		return
+	}
+	type kv struct {
+		k string
+		v int64
+	}
+	rows := make([]kv, 0, len(h))
+	for k, v := range h {
+		rows = append(rows, kv{k, v})
+	}
+	sort.Slice(rows, func(i, j int) bool { return rows[i].v > rows[j].v })
+	if top > 0 && len(rows) > top {
+		rows = rows[:top]
+	}
+	fmt.Fprintf(w, "\n%s:\n", label)
+	for _, r := range rows {
+		fmt.Fprintf(w, "  %8d  %s\n", r.v, r.k)
+	}
+}
diff --git a/internal/adultfilter/adultfilter.go b/internal/adultfilter/adultfilter.go
new file mode 100644
index 0000000..b00838b
--- /dev/null
+++ b/internal/adultfilter/adultfilter.go
@@ -0,0 +1,177 @@
+// Package adultfilter provides a precision-oriented classifier that flags
+// pornographic / adult web pages so the crawler can refuse to index them and
+// an offline sweep can purge ones already in the corpus.
+//
+// Design goal: HIGH PRECISION over recall. A false positive silently drops
+// legitimate content from the index, which is far worse than letting a
+// borderline page through. The classifier therefore relies on two narrow,
+// high-confidence signals and deliberately ignores everything ambiguous:
+//
+//   - Host / TLD match (CONCLUSIVE): the URL host contains a known adult-site
+//     brand fragment (pornhub, xvideos, "porn", …) or sits under an adult /
+//     heavily-abused gTLD (.xxx, .porn, .sex, .adult, .cfd, .sbs). This is the
+//     signal that reliably identifies a porn SITE.
+//
+//   - Lexical: a page is flagged ONLY when it contains TWO OR MORE DISTINCT
+//     unambiguous explicit terms (porn, blowjob, creampie, gangbang, …). A
+//     page that merely mentions "porn" or "masturbation" once — a news story,
+//     a health article, an academic paper, a kernel doc — has at most one and
+//     is NOT flagged. Real porn pages carry many distinct explicit terms.
+//
+// Two traps motivated this design (both observed against the live corpus):
+//
+//  1. Fragment splitting. Naive tokenisation on non-letters turned "3xxx"
+//     (kernel RAID driver), Roman numeral "XXX", "7fap3" (a record ID),
+//     "analízis" (Hungarian) and "symb.anal.net" into the tokens xxx / fap /
+//     anal. Fixed two ways: tokens are runs of Unicode letters AND digits
+//     ("3xxx" stays one token, never "xxx"), and the short ambiguous words
+//     (xxx, anal, fap, tits, sex, cum, …) are NOT lexical terms at all — xxx
+//     only ever matters as a TLD/host signal.
+//
+//  2. Topical mentions. "Pornography law", "porn-induced dysfunction",
+//     "forced anal examinations" (Human Rights Watch) each contain a single
+//     explicit token. The ≥2-distinct-terms rule lets all of them through.
+package adultfilter
+
+import (
+	"net/url"
+	"strings"
+	"unicode"
+)
+
+// explicitTerms are unambiguous, multi-letter pornographic words. Each is a
+// strong signal, but ONE is never enough (topical articles contain one);
+// TWO DISTINCT terms is the lexical bar. Deliberately EXCLUDES every short or
+// dual-sense word — sex, cum, anal, xxx, tits, fap, cock, dick, escort,
+// naked, nude, adult, model, hardcore, fetish, slut, boobs — which produced
+// real false positives and add little recall that host/TLD matching misses.
+var explicitTerms = map[string]struct{}{
+	"porn": {}, "porno": {}, "pornography": {}, "pornographic": {},
+	"blowjob": {}, "blowjobs": {}, "handjob": {}, "handjobs": {},
+	"footjob": {}, "rimjob": {}, "titfuck": {}, "deepthroat": {},
+	"bukkake": {}, "gangbang": {}, "gangbangs": {}, "creampie": {}, "creampies": {},
+	"cumshot": {}, "cumshots": {}, "facials": {}, "gloryhole": {},
+	"milf": {}, "milfs": {}, "hentai": {}, "camgirl": {}, "camgirls": {},
+	"camwhore": {}, "fleshlight": {}, "buttplug": {}, "fisting": {},
+	"masturbation": {}, "masturbating": {}, "masturbate": {},
+	"cunnilingus": {}, "fellatio": {}, "shemale": {}, "shemales": {},
+	"ladyboy": {}, "nympho": {}, "cameltoe": {}, "dildo": {}, "dildos": {},
+	"bbw": {}, "creampied": {}, "cocksucking": {}, "pussyfucking": {},
+	"upskirt": {}, "downblouse": {}, "jerkoff": {}, "fapping": {},
+	"xvideos": {}, "xhamster": {}, "xnxx": {}, "youporn": {}, "redtube": {},
+	"brazzers": {}, "chaturbate": {}, "bangbros": {}, "pornstar": {}, "pornstars": {},
+}
+
+// hostTokens are adult-site name fragments. If the URL host contains any as a
+// substring the page is conclusively adult. Every entry is a brand name or
+// the literal "porn" — fragments with no innocent host-name sense. Short
+// ambiguous fragments (sex → essex, anal → analytics, milf → milford) are
+// intentionally absent: they match legitimate hosts.
+var hostTokens = []string{
+	"porn", "xvideos", "xhamster", "xnxx", "youporn", "redtube", "spankbang",
+	"brazzers", "chaturbate", "rule34", "hentai", "fapello",
+	"camsoda", "stripchat", "myfreecams", "livejasmin", "tnaflix", "eporner",
+	"nhentai", "motherless", "drtuber", "tube8", "keezmovies", "bangbros",
+	"naughtyamerica", "thothub", "faphouse", "spankwire",
+}
+
+// adultTLDs are gTLDs whose registrant base is overwhelmingly adult or
+// adult-spam: the ICANN adult-sponsored set (.xxx/.porn/.sex/.adult) plus
+// .cfd and .sbs, which the live-corpus audit showed dominated by porn/spam.
+// Generic TLDs with substantial legitimate use (.cam, .tube, .sexy, .xyz)
+// are intentionally excluded.
+var adultTLDs = []string{".xxx", ".porn", ".sex", ".adult", ".cfd", ".sbs"}
+
+// IsAdult reports whether the page is pornographic. Convenience wrapper over
+// Classify for the boolean-only caller (the crawler).
+func IsAdult(title, text, rawURL string) bool {
+	adult, _, _ := Classify(title, text, rawURL)
+	return adult
+}
+
+// Classify returns whether the page is adult, the number of DISTINCT explicit
+// terms found (0 when the verdict came from a host/TLD match — see reason),
+// and a short human-readable reason for dry-run / audit logging.
+func Classify(title, text, rawURL string) (adult bool, distinctTerms int, reason string) {
+	if host := hostOf(rawURL); host != "" {
+		if t, ok := matchHost(host); ok {
+			return true, 0, "host:" + t
+		}
+	}
+
+	// Lexical: count DISTINCT explicit terms across title + body. The URL
+	// path is intentionally NOT scanned — slugs and IDs are a false-positive
+	// minefield (3xxx, record IDs, "anal.net") and the host signal already
+	// covers the domain.
+	found := map[string]struct{}{}
+	collect := func(s string) {
+		for _, tok := range tokenize(s) {
+			if _, ok := explicitTerms[tok]; ok {
+				found[tok] = struct{}{}
+			}
+		}
+	}
+	collect(title)
+	body := text
+	if len(body) > 200_000 {
+		body = body[:200_000] // signal saturates; cap work on huge pages
+	}
+	collect(body)
+
+	if len(found) >= 2 {
+		return true, len(found), "lexical:" + strings.Join(sortedKeys(found), ",")
+	}
+	return false, len(found), ""
+}
+
+// tokenize lowercases and splits s into runs of Unicode letters OR digits.
+// Keeping digits attached is what makes "3xxx" → "3xxx" (never "xxx") and
+// "7fap3" → "7fap3" (never "fap"); treating í/é/… as letters keeps
+// "analízis" a single token (never "anal").
+func tokenize(s string) []string {
+	return strings.FieldsFunc(strings.ToLower(s), func(r rune) bool {
+		return !unicode.IsLetter(r) && !unicode.IsDigit(r)
+	})
+}
+
+func sortedKeys(m map[string]struct{}) []string {
+	out := make([]string, 0, len(m))
+	for k := range m {
+		out = append(out, k)
+	}
+	// insertion sort — tiny maps, avoids importing sort for this alone
+	for i := 1; i < len(out); i++ {
+		for j := i; j > 0 && out[j-1] > out[j]; j-- {
+			out[j-1], out[j] = out[j], out[j-1]
+		}
+	}
+	return out
+}
+
+func hostOf(rawURL string) string {
+	if rawURL == "" {
+		return ""
+	}
+	u, err := url.Parse(rawURL)
+	if err != nil || u.Host == "" {
+		return strings.ToLower(rawURL) // maybe a bare host was passed
+	}
+	return strings.ToLower(u.Hostname())
+}
+
+// matchHost reports whether host is an adult host (brand fragment or adult
+// TLD). TLDs are matched on the final label only, so a path/subdomain that
+// happens to end in ".sex" elsewhere can't trip it.
+func matchHost(host string) (string, bool) {
+	for _, t := range hostTokens {
+		if strings.Contains(host, t) {
+			return t, true
+		}
+	}
+	for _, tld := range adultTLDs {
+		if strings.HasSuffix(host, tld) {
+			return tld, true
+		}
+	}
+	return "", false
+}
diff --git a/internal/adultfilter/adultfilter_test.go b/internal/adultfilter/adultfilter_test.go
new file mode 100644
index 0000000..94da654
--- /dev/null
+++ b/internal/adultfilter/adultfilter_test.go
@@ -0,0 +1,99 @@
+package adultfilter
+
+import "testing"
+
+// TestNoFalsePositives — every case here is a REAL false positive observed
+// against the live 13M-doc corpus (or a classic substring trap). None may be
+// classified adult.
+func TestNoFalsePositives(t *testing.T) {
+	cases := []struct {
+		name, title, text, url string
+	}{
+		// Fragment-splitting traps (the bug class that mis-deleted live docs).
+		{"kernel-3xxx", "HighPoint RocketRAID 3xxx/4xxx Adapter Driver", "SCSI driver documentation.", "https://www.kernel.org/doc/html/latest/scsi/hptiop.html"},
+		{"roman-numeral-xxx", "Historia Augusta — Tyranni XXX", "The Thirty Tyrants, a Roman text.", "https://penelope.uchicago.edu/Thayer/E/Roman/Texts/Historia_Augusta/Tyranni_XXX.html"},
+		{"jacques-tits", "Abel Prize Laureates 2008", "John G. Thompson and Jacques Tits.", "https://abelprize.no/abel-prize-laureates/2008"},
+		{"hungarian-analizis", "Legközelebbi szomszéd analízis", "Nearest-neighbour analízis eredmény.", "https://commons.wikimedia.org/wiki/File:analizis.JPG"},
+		{"symb-anal-net", "Symbol analysis networks", "Harnad symb.anal.net.searle paper.", "https://www.southampton.ac.uk/~harnad/Papers/Harnad/harnad93.symb.anal.net.searle.html"},
+		{"record-id-fap", "Caltech library record", "Archived dataset.", "https://authors.library.caltech.edu/records/7fap3-5v118"},
+		// Single-explicit-term topical mentions (≥2-distinct rule lets through).
+		{"hrw-anal-exam", "Forced anal examinations", "Human Rights Watch report on prosecutions.", "https://www.hrw.org/report/forced-anal-examinations-homosexuality"},
+		{"mentalhealth-porn", "When does porn become a problem", "A clinical overview of compulsive behaviour.", "https://www.mentalhealth.com/library/when-does-porn-become-a-problem"},
+		{"pornography-law", "Pornography law in the EU", "A legal overview of regulation.", "https://law.example.org/eu"},
+		{"hardcore-engineer", "Hardcore Engineer blog", "Cut your AI agent token costs.", "https://dev.to/hardcore-engineer"},
+		{"sex-research", "Sex differences in cognition", "Research into the biology of sex chromosomes.", "https://journal.example.org/paper"},
+		// Classic traps.
+		{"analysis", "Statistical analysis of canal sediment", "A regression analysis of the data.", "https://example.com/analysis"},
+		{"scunthorpe", "Scunthorpe United FC", "The match in Scunthorpe.", "https://bbc.co.uk/sport/scunthorpe"},
+		{"essex-sussex", "Essex and Sussex travel", "Middlesex and Wessex regions.", "https://travel.example.org/sussex"},
+		{"cum-laude", "Graduated magna cum laude", "Summa cum laude degree.", "https://uni.example.edu/news"},
+		{"camera", "Best webcam for streaming", "Reviewing camera hardware.", "https://tech.example.com/webcam"},
+		{"dotcam-legit", "Acme Camera Co", "Photography gear.", "https://acme.cam/store"},
+		{"plain", "Intro to Go programming", "Goroutines and channels.", "https://go.example.dev/intro"},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if adult, n, reason := Classify(c.title, c.text, c.url); adult {
+				t.Errorf("FALSE POSITIVE: classified adult (distinctTerms=%d reason=%q)", n, reason)
+			}
+		})
+	}
+}
+
+// TestTruePositives — clear porn must be caught, primarily via host/TLD.
+func TestTruePositives(t *testing.T) {
+	cases := []struct {
+		name, title, text, url string
+	}{
+		{"host-pornhub", "Some Video", "watch now", "https://www.pornhub.com/view"},
+		{"host-xvideos", "clip", "body", "https://xvideos.com/v/123"},
+		{"host-porn-substr", "Gallery", "x", "https://www.thotsaporn.com/"},
+		{"host-xporn", "Gallery", "x", "https://xporn.org/"},
+		{"tld-xxx", "welcome", "content", "https://site.xxx/page"},
+		{"tld-cfd", "members", "x", "https://hotcams.cfd/live"},
+		{"tld-sbs", "members", "x", "https://freecams.sbs/live"},
+		{"tld-porn", "members", "x", "https://watch.porn/clip"},
+		{"lexical-two-terms", "Free MILF Creampie Videos", "watch", "https://vids.example.net/clip"},
+		{"lexical-body", "Members Area", "blowjob gangbang cumshot compilation for adults", "https://example.com/members"},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if adult, n, reason := Classify(c.title, c.text, c.url); !adult {
+				t.Errorf("FALSE NEGATIVE: not classified adult (distinctTerms=%d reason=%q)", n, reason)
+			}
+		})
+	}
+}
+
+// TestSingleExplicitTermDoesNotTrip — exactly one distinct explicit term,
+// however many times it repeats, is never enough on its own.
+func TestSingleExplicitTermDoesNotTrip(t *testing.T) {
+	if IsAdult("Porn addiction research", "porn porn porn studies of porn consumption and porn habits", "https://research.example.edu/porn-study") {
+		t.Error("a single distinct explicit term (repeated) must not trip the filter")
+	}
+}
+
+// TestTwoDistinctTermsTrips — two different explicit terms is the lexical bar.
+func TestTwoDistinctTermsTrips(t *testing.T) {
+	if !IsAdult("Gallery", "creampie and gangbang scenes", "https://generic.example.net/g") {
+		t.Error("two distinct explicit terms should trip the filter")
+	}
+}
+
+func TestHostMatchIsConclusive(t *testing.T) {
+	if adult, _, reason := Classify("", "", "https://m.redtube.com/"); !adult || reason != "host:redtube" {
+		t.Errorf("host match failed: adult=%v reason=%q", adult, reason)
+	}
+}
+
+// TestFragmentsNeverTokenize guards the tokenizer directly against the
+// fragment bugs.
+func TestFragmentsNeverTokenize(t *testing.T) {
+	for _, s := range []string{"3xxx", "4xxx", "7fap3", "analízis", "xxxi"} {
+		for _, tok := range tokenize(s) {
+			if _, bad := explicitTerms[tok]; bad {
+				t.Errorf("%q tokenized to explicit term %q", s, tok)
+			}
+		}
+	}
+}
diff --git a/internal/config/config.go b/internal/config/config.go
index 0fa85c3..09ef8db 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -279,6 +279,14 @@ type Crawler struct {
 	// or 1000 for research/news corpora where short pages are almost
 	// always navigation cruft.
 	MinTextLen int `json:"min_text_len,omitempty"`
+
+	// FilterAdult, when true, runs each parsed page through the
+	// adultfilter classifier (host + lexical signals) and refuses to
+	// index pornographic content. High-precision by design — see
+	// internal/adultfilter. Default false preserves existing behavior;
+	// the offline `cosift purge-adult` sweep cleans content already
+	// indexed before this was enabled.
+	FilterAdult bool `json:"filter_adult,omitempty"`
 }
 
 // Federation configures upstream search backends used as no-key
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index 93db3cd..751c093 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -24,6 +24,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/pilot-protocol/cosift/internal/adultfilter"
 	"github.com/pilot-protocol/cosift/internal/config"
 	"github.com/pilot-protocol/cosift/internal/embed"
 	"github.com/pilot-protocol/cosift/internal/index"
@@ -1037,6 +1038,12 @@ func (c *Crawler) processClaimed(ctx context.Context, item store.FrontierItem, g
 	if c.cfg.MinTextLen > 0 && len(parsed.Text) < c.cfg.MinTextLen {
 		return errors.New("text below min_text_len")
 	}
+	// Adult-content gate. High-precision classifier (host + lexical
+	// signals) — refuse to index pornographic pages so they never enter
+	// the corpus. Off by default; enabled via crawler.filter_adult.
+	if c.cfg.FilterAdult && adultfilter.IsAdult(parsed.Title, parsed.Text, finalURL) {
+		return errors.New("adult content filtered")
+	}
 
 	finalU, _ := url.Parse(finalURL)
 	sha := sha256.Sum256([]byte(parsed.Text))
diff --git a/internal/store/pebble.go b/internal/store/pebble.go
index bd41e50..5bd31f4 100644
--- a/internal/store/pebble.go
+++ b/internal/store/pebble.go
@@ -29,6 +29,7 @@ import (
 	"errors"
 	"fmt"
 	"math"
+	"net/url"
 	"os"
 	"sort"
 	"strconv"
@@ -75,9 +76,9 @@ type PebbleStore struct {
 	// (host, url) tuple; next claim seeks past it so each call resumes
 	// where the previous one stopped, wrapping at the end.
 	frontierCursorMu sync.Mutex
-	frontierCursor   []byte             // legacy single cursor; pre-lanes scan state.
-	laneCursors      [laneCount][]byte  // per-lane round-robin cursors.
-	laneTick         atomic.Uint64      // monotonic counter driving weighted lane pick.
+	frontierCursor   []byte            // legacy single cursor; pre-lanes scan state.
+	laneCursors      [laneCount][]byte // per-lane round-robin cursors.
+	laneTick         atomic.Uint64     // monotonic counter driving weighted lane pick.
 
 	// PILOT-190: pebble.DB.Close() panics if called twice. Wrap teardown
 	// in sync.Once so repeated Close() calls (e.g. from layered cleanups
@@ -129,6 +130,19 @@ const (
 // climbs higher (compaction, write batches, block readers) but the
 // OOM-prone block cache growth is bounded.
 func OpenPebble(path string) (*PebbleStore, error) {
+	return openPebble(path, false)
+}
+
+// OpenPebbleReadOnly opens the store WITHOUT acquiring the directory write
+// lock, so it can run alongside a live pebble-serve process (zero downtime).
+// Reads see a consistent snapshot taken at open time; any write method
+// (UpsertDocument, SoftDeleteDocument, …) will fail. Use for offline
+// inspection / dry-run sweeps over a production corpus.
+func OpenPebbleReadOnly(path string) (*PebbleStore, error) {
+	return openPebble(path, true)
+}
+
+func openPebble(path string, readOnly bool) (*PebbleStore, error) {
 	cacheMB := envInt("COSIFT_PEBBLE_CACHE_MB", 128)
 	memtableMB := envInt("COSIFT_PEBBLE_MEMTABLE_MB", 32)
 	memtables := envInt("COSIFT_PEBBLE_MEMTABLES", 2)
@@ -139,6 +153,7 @@ func OpenPebble(path string) (*PebbleStore, error) {
 		Cache:                       cache,
 		MemTableSize:                uint64(memtableMB) << 20,
 		MemTableStopWritesThreshold: memtables + 2,
+		ReadOnly:                    readOnly,
 	}
 	db, err := pebble.Open(path, opts)
 	if err != nil {
@@ -2214,6 +2229,154 @@ func (p *PebbleStore) IterDocsLite(ctx context.Context, fn func(docID int64, url
 	return nil
 }
 
+// IterDocMeta is like IterDocsLite but also yields the document title from
+// the cheap 'i' side-blob (no full gob decode). Used by content sweeps
+// (e.g. purge-adult) that classify on URL + title across the whole corpus.
+func (p *PebbleStore) IterDocMeta(ctx context.Context, fn func(docID int64, url, title string) error) error {
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+	it, err := p.db.NewIter(&pebble.IterOptions{
+		LowerBound: []byte{famDocMeta},
+		UpperBound: []byte{famDocMeta + 1},
+	})
+	if err != nil {
+		return err
+	}
+	defer it.Close()
+	for valid := it.First(); valid; valid = it.Next() {
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+		key := it.Key()
+		if len(key) != 9 {
+			continue
+		}
+		docID := int64(binary.BigEndian.Uint64(key[1:]))
+		val, err := it.ValueAndErr()
+		if err != nil {
+			continue
+		}
+		url, title, ok, err := unpackDocMeta(val)
+		if err != nil || !ok || url == "" {
+			continue
+		}
+		if err := fn(docID, url, title); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// SoftDeleteDocument removes a document from retrieval without rewriting the
+// posting lists. It deletes the doc record ('d'), the URL→ID index ('u'), the
+// cheap meta side-blob ('i'), the host index entry ('h') and the doc-length
+// entry ('l'), then decrements the corpus counters (indexed_docs, sum_doc_len)
+// so BM25 IDF/avgdl stay accurate.
+//
+// The term postings ('p') and doc-terms list ('g') are intentionally LEFT in
+// place as orphans: the retrieval path resolves every scored docID through
+// GetDocMeta and skips any whose meta is missing (pebble_bm25.go), so an
+// orphaned posting can never surface a deleted doc — it only carries a small,
+// bounded DocFreq inaccuracy that the next reindex/compaction reconciles. This
+// makes deletion a handful of point-deletes instead of a full inverted-index
+// rewrite, which is what makes a multi-million-doc purge tractable.
+//
+// rawURL must be the document's stored URL (the caller has it from the meta
+// scan); the host index key is derived from it. Returns ok=false when the
+// document was already absent (idempotent — safe to re-run a purge).
+func (p *PebbleStore) SoftDeleteDocument(ctx context.Context, docID int64, rawURL string) (bool, error) {
+	if err := ctx.Err(); err != nil {
+		return false, err
+	}
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// Confirm the doc exists via its meta blob; bail idempotently if gone.
+	if _, _, ok, err := func() (string, string, bool, error) {
+		val, closer, err := p.db.Get(docMetaKey(docID))
+		if errors.Is(err, pebble.ErrNotFound) {
+			return "", "", false, nil
+		}
+		if err != nil {
+			return "", "", false, err
+		}
+		defer closer.Close()
+		return unpackDocMeta(val)
+	}(); err != nil {
+		return false, err
+	} else if !ok {
+		return false, nil
+	}
+
+	docLen, hadLen, err := p.readDocLenLocked(docID)
+	if err != nil {
+		return false, err
+	}
+
+	batch := p.db.NewBatch()
+	defer batch.Close()
+	if err := batch.Delete(docKey(docID), nil); err != nil {
+		return false, err
+	}
+	if rawURL != "" {
+		if err := batch.Delete(urlKey(rawURL), nil); err != nil {
+			return false, err
+		}
+		if u, e := url.Parse(rawURL); e == nil && u.Host != "" {
+			if err := batch.Delete(hostKey(u.Host, docID), nil); err != nil {
+				return false, err
+			}
+		}
+	}
+	if err := batch.Delete(docMetaKey(docID), nil); err != nil {
+		return false, err
+	}
+	if hadLen {
+		if err := batch.Delete(docLenKey(docID), nil); err != nil {
+			return false, err
+		}
+	}
+
+	// Decrement corpus counters, mirroring IndexDocument's accounting.
+	var sumLen, indexedCount int64
+	if p.corpusStatsLoaded.Load() {
+		sumLen = p.corpusSumLen.Load()
+		indexedCount = p.corpusIndexedDocs.Load()
+	} else {
+		sumLen = p.readMetaInt64Locked("sum_doc_len")
+		indexedCount = p.readMetaInt64Locked("indexed_docs")
+	}
+	if hadLen {
+		sumLen -= docLen
+		if sumLen < 0 {
+			sumLen = 0
+		}
+	}
+	indexedCount--
+	if indexedCount < 0 {
+		indexedCount = 0
+	}
+	sumBuf := make([]byte, 8)
+	binary.BigEndian.PutUint64(sumBuf, uint64(sumLen))
+	if err := batch.Set(metaKey("sum_doc_len"), sumBuf, nil); err != nil {
+		return false, err
+	}
+	countBuf := make([]byte, 8)
+	binary.BigEndian.PutUint64(countBuf, uint64(indexedCount))
+	if err := batch.Set(metaKey("indexed_docs"), countBuf, nil); err != nil {
+		return false, err
+	}
+
+	if err := batch.Commit(p.writeOpts); err != nil {
+		return false, err
+	}
+	p.corpusSumLen.Store(sumLen)
+	p.corpusIndexedDocs.Store(indexedCount)
+	p.corpusStatsLoaded.Store(true)
+	return true, nil
+}
+
 // PurgeFrontierByHost deletes every QUEUED frontier entry for the given
 // host (both the secondary 'f'+'q'+host index and the primary 'f'+'u'+url
 // entry). Returns the count purged. In-flight and done/errored entries
@@ -2501,9 +2664,9 @@ func (p *PebbleStore) DemoteHostToLane(ctx context.Context, host string, lane by
 // (their secondary keys lack a lane byte); they're drained as a fall-through
 // in ClaimFrontier and disappear over time.
 type LaneStats struct {
-	Lanes           [laneCount]LaneCounts
-	LegacyQueued    int
-	LegacyInFlight  int
+	Lanes          [laneCount]LaneCounts
+	LegacyQueued   int
+	LegacyInFlight int
 }
 
 // LaneCounts is the per-lane summary surfaced in /queue.
diff --git a/internal/store/pebble_test.go b/internal/store/pebble_test.go
index e46b33c..96e2b82 100644
--- a/internal/store/pebble_test.go
+++ b/internal/store/pebble_test.go
@@ -334,6 +334,54 @@ func TestPebblePostingsPersistAcrossReopen(t *testing.T) {
 // terms {alpha, beta, gamma}; re-index with {alpha, delta}; beta and
 // gamma postings for the doc MUST be deleted (or queries for those terms
 // return the doc as a phantom hit).
+// TestSoftDeleteDocument verifies a soft-deleted doc disappears from meta
+// lookups, decrements the corpus counters, and is idempotent on re-delete.
+func TestSoftDeleteDocument(t *testing.T) {
+	p := newPebbleStore(t)
+	ctx := context.Background()
+
+	d := &Document{URL: "https://spam.example.com/x", Domain: "spam.example.com", Title: "T", Text: "alpha beta gamma", FetchedAt: time.Now()}
+	id, err := p.UpsertDocument(ctx, d)
+	if err != nil {
+		t.Fatalf("upsert: %v", err)
+	}
+	if err := p.IndexDocument(ctx, id, "T", "alpha beta gamma", trivialTokenize, 1); err != nil {
+		t.Fatalf("index: %v", err)
+	}
+
+	_, count, _ := p.CorpusStats(ctx)
+	if count != 1 {
+		t.Fatalf("pre-delete count = %d, want 1", count)
+	}
+
+	ok, err := p.SoftDeleteDocument(ctx, id, d.URL)
+	if err != nil || !ok {
+		t.Fatalf("soft delete: ok=%v err=%v", ok, err)
+	}
+
+	if _, _, present, _ := p.GetDocMeta(ctx, id); present {
+		t.Error("doc meta should be gone after soft delete")
+	}
+	if got, _ := p.GetDocByURL(ctx, d.URL); got != nil {
+		t.Errorf("GetDocByURL should miss after soft delete, got %+v", got)
+	}
+	if _, count, _ := p.CorpusStats(ctx); count != 0 {
+		t.Errorf("post-delete count = %d, want 0", count)
+	}
+
+	// Idempotent: deleting again reports not-found, no counter underflow.
+	ok2, err := p.SoftDeleteDocument(ctx, id, d.URL)
+	if err != nil {
+		t.Fatalf("second delete err: %v", err)
+	}
+	if ok2 {
+		t.Error("second delete should report ok=false (already gone)")
+	}
+	if _, count, _ := p.CorpusStats(ctx); count != 0 {
+		t.Errorf("count after redundant delete = %d, want 0 (no underflow)", count)
+	}
+}
+
 func TestPebbleReindexDeletesOrphanedPostings(t *testing.T) {
 	p := newPebbleStore(t)
 	ctx := context.Background()

From 74dd54f422685f8a7b6013260c0c98be92647511 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 15:17:15 +0000
Subject: [PATCH 08/10] feat(search): site= scoping filter + /admin/site-submit
 priority enqueue

Search:
- Add a 'site' parameter to /search, /answer and /research (GET query +
  POST body) that scopes results by host suffix AND optional URL path
  prefix, e.g. site=pilotprotocol.network/docs. Segment-boundary path
  match; ANDs with include_domains/exclude_domains.

Crawl:
- Add Crawler.SeedSitemapLane so sitemap URLs can be enqueued into a
  chosen frontier lane; SeedSitemap now delegates (refresh lane, unchanged).
- Add POST /admin/site-submit: discover a site's URLs (robots.txt
  Sitemap: directives, then canonical/CMS fallbacks) and enqueue them all
  into the high-priority submitted lane by default (lane configurable).
- Factor shared discoverSitemaps/normalizeBareHost helpers out of site-pack.

Tests: scope parsing, host+path matching, lane mapping, sitemap discovery,
an end-to-end /search?site= test, site-submit auth/validation/lane wiring,
and SeedSitemapLane lane placement.
---
 cmd/cosift/pebble_serve.go            | 324 ++++++++++++++++++++++----
 cmd/cosift/site_filter_submit_test.go | 276 ++++++++++++++++++++++
 internal/crawler/sitemap.go           |  26 ++-
 internal/crawler/sitemap_lane_test.go |  89 +++++++
 4 files changed, 659 insertions(+), 56 deletions(-)
 create mode 100644 cmd/cosift/site_filter_submit_test.go
 create mode 100644 internal/crawler/sitemap_lane_test.go

diff --git a/cmd/cosift/pebble_serve.go b/cmd/cosift/pebble_serve.go
index 0e3e93e..6d231bd 100644
--- a/cmd/cosift/pebble_serve.go
+++ b/cmd/cosift/pebble_serve.go
@@ -500,6 +500,7 @@ func runPebbleServe(ctx context.Context, cfg *config.Config, args []string) erro
 	mux.HandleFunc("POST /admin/wet-import", wrap(srv.handleWETImport))
 	mux.HandleFunc("POST /admin/wet-import-bulk", wrap(srv.handleWETImportBulk))
 	mux.HandleFunc("POST /admin/site-pack", wrap(srv.handleSitePack))
+	mux.HandleFunc("POST /admin/site-submit", wrap(srv.handleSiteSubmit))
 	mux.HandleFunc("POST /admin/embed-backfill", wrap(srv.handleEmbedBackfill))
 	mux.HandleFunc("GET /admin/eval-quick", wrap(srv.handleEvalQuick))
 	mux.HandleFunc("POST /admin/hnsw-compact", wrap(srv.handleHNSWCompact))
@@ -774,6 +775,9 @@ func (s *pebbleHTTP) startInProcessCrawl(ctx context.Context, ps *store.PebbleSt
 	// expose SeedSitemap so /admin/sitemap-import can push
 	// sitemap-discovered URLs into the live frontier.
 	s.crawlSeedSitemap = c.SeedSitemap
+	// expose SeedSitemapLane so /admin/site-submit can push a whole site's
+	// URLs into a chosen priority lane (default: submitted/priority).
+	s.crawlSeedSitemapLane = c.SeedSitemapLane
 	s.crawlSeedRSS = c.SeedRSS
 	s.crawlFetchNow = c.FetchAndIndexNow
 	s.crawlSeedWET = c.SeedWET
@@ -937,9 +941,13 @@ type pebbleHTTP struct {
 	// crawlSeedSitemap wraps Crawler.SeedSitemap so the /admin/
 	// sitemap-import endpoint can push sitemap URLs into the live frontier.
 	crawlSeedSitemap func(ctx context.Context, url string) (int, error)
-	crawlSeedRSS     func(ctx context.Context, url string) (int, error)
-	crawlFetchNow    func(ctx context.Context, url string) error
-	crawlSeedWET     func(ctx context.Context, url string, dedupeFresh, lexicalOnly bool) (int, error)
+	// crawlSeedSitemapLane is like crawlSeedSitemap but lets the caller pick
+	// the frontier lane — used by /admin/site-submit to land a site's URLs
+	// in the high-priority submitted lane.
+	crawlSeedSitemapLane func(ctx context.Context, url string, lane byte) (int, error)
+	crawlSeedRSS         func(ctx context.Context, url string) (int, error)
+	crawlFetchNow        func(ctx context.Context, url string) error
+	crawlSeedWET         func(ctx context.Context, url string, dedupeFresh, lexicalOnly bool) (int, error)
 
 	// doc count at startup so /stats can report crawl rate
 	// without persistent counter tables. docs_added = current - startup,
@@ -2527,11 +2535,8 @@ func (s *pebbleHTTP) handleSitePack(w http.ResponseWriter, r *http.Request) {
 		writeProblem(w, http.StatusBadRequest, "expected {\"host\":\"example.com\"}")
 		return
 	}
-	host := strings.TrimSpace(req.Host)
-	host = strings.TrimPrefix(host, "https://")
-	host = strings.TrimPrefix(host, "http://")
-	host = strings.TrimSuffix(host, "/")
-	if host == "" || strings.Contains(host, "/") {
+	host, ok := normalizeBareHost(req.Host)
+	if !ok {
 		writeProblem(w, http.StatusBadRequest, "host must be a bare hostname like example.com")
 		return
 	}
@@ -2547,46 +2552,11 @@ func (s *pebbleHTTP) handleSitePack(w http.ResponseWriter, r *http.Request) {
 	results := make([]result, 0, 8)
 	t0 := time.Now()
 
-	// Step 1: robots.txt for Sitemap: directives.
-	sitemapsFromRobots := []string{}
-	if rresp, err := hc.Get(base + "/robots.txt"); err == nil && rresp.StatusCode < 400 {
-		rbody, _ := io.ReadAll(io.LimitReader(rresp.Body, 2<<20))
-		rresp.Body.Close()
-		for _, line := range strings.Split(string(rbody), "\n") {
-			line = strings.TrimSpace(line)
-			if strings.HasPrefix(strings.ToLower(line), "sitemap:") {
-				val := strings.TrimSpace(line[len("sitemap:"):])
-				if val != "" {
-					sitemapsFromRobots = append(sitemapsFromRobots, val)
-				}
-			}
-		}
-	}
-	// Step 2: if robots.txt gave nothing, try canonical paths.
-	candidateSitemaps := sitemapsFromRobots
-	if len(candidateSitemaps) == 0 {
-		// /sitemap.xml is the canonical
-		// spec but many CMSes (WordPress, Yoast, Ghost, Hugo themes) ship
-		// at non-canonical paths. Try a small ordered list before giving up.
-		// Stops on first successful fetch — the order matters: /sitemap.xml
-		// first (most common), then WordPress's /wp-sitemap.xml + Yoast's
-		// per-content-type splits, then index variants.
-		for _, p := range []string{
-			"/sitemap.xml",
-			"/wp-sitemap.xml",    // WordPress 5.5+
-			"/sitemap_index.xml", // Yoast SEO
-			"/post-sitemap.xml",  // Yoast posts
-			"/page-sitemap.xml",  // Yoast pages
-			"/sitemap-index.xml", // some CMSes hyphenate
-			"/sitemap.xml.gz",    // gzipped variant (sitemap.go handles .gz)
-		} {
-			candidateSitemaps = append(candidateSitemaps, base+p)
-		}
-	}
+	candidateSitemaps, fromRobots := discoverSitemaps(r.Context(), hc, base)
 	for _, su := range candidateSitemaps {
 		n, err := s.crawlSeedSitemap(r.Context(), su)
 		res := result{URL: su, Indexed: n}
-		if len(sitemapsFromRobots) > 0 {
+		if fromRobots {
 			res.Source = "robots-sitemap"
 		} else {
 			res.Source = "fallback-sitemap"
@@ -2623,6 +2593,173 @@ func (s *pebbleHTTP) handleSitePack(w http.ResponseWriter, r *http.Request) {
 	})
 }
 
+// normalizeBareHost strips scheme/trailing-slash from a host or URL and
+// returns the bare hostname. ok is false when the result still contains a
+// path segment (so callers can reject "example.com/foo" as a host).
+func normalizeBareHost(s string) (host string, ok bool) {
+	host = strings.TrimSpace(s)
+	host = strings.TrimPrefix(host, "https://")
+	host = strings.TrimPrefix(host, "http://")
+	host = strings.TrimSuffix(host, "/")
+	host = strings.ToLower(host)
+	if host == "" || strings.Contains(host, "/") {
+		return "", false
+	}
+	return host, true
+}
+
+// discoverSitemaps returns candidate sitemap URLs for a site, given its base
+// origin (e.g. "https://example.com"). It prefers Sitemap: directives in
+// robots.txt; when robots.txt yields none it falls back to a small ordered
+// list of canonical/CMS paths. fromRobots reports which source was used.
+func discoverSitemaps(ctx context.Context, hc *http.Client, base string) (sitemaps []string, fromRobots bool) {
+	if hc == nil {
+		hc = &http.Client{Timeout: 20 * time.Second}
+	}
+	if req, err := http.NewRequestWithContext(ctx, http.MethodGet, base+"/robots.txt", nil); err == nil {
+		if rresp, err := hc.Do(req); err == nil {
+			if rresp.StatusCode < 400 {
+				rbody, _ := io.ReadAll(io.LimitReader(rresp.Body, 2<<20))
+				for _, line := range strings.Split(string(rbody), "\n") {
+					line = strings.TrimSpace(line)
+					if strings.HasPrefix(strings.ToLower(line), "sitemap:") {
+						if val := strings.TrimSpace(line[len("sitemap:"):]); val != "" {
+							sitemaps = append(sitemaps, val)
+						}
+					}
+				}
+			}
+			rresp.Body.Close()
+		}
+	}
+	if len(sitemaps) > 0 {
+		return sitemaps, true
+	}
+	// /sitemap.xml is the canonical spec but many CMSes (WordPress, Yoast,
+	// Ghost, Hugo themes) ship at non-canonical paths. Try a small ordered
+	// list before giving up: /sitemap.xml (most common), then WordPress's
+	// /wp-sitemap.xml + Yoast's per-content-type splits, then index variants.
+	for _, p := range []string{
+		"/sitemap.xml",
+		"/wp-sitemap.xml",    // WordPress 5.5+
+		"/sitemap_index.xml", // Yoast SEO
+		"/post-sitemap.xml",  // Yoast posts
+		"/page-sitemap.xml",  // Yoast pages
+		"/sitemap-index.xml", // some CMSes hyphenate
+		"/sitemap.xml.gz",    // gzipped variant (sitemap.go handles .gz)
+	} {
+		sitemaps = append(sitemaps, base+p)
+	}
+	return sitemaps, false
+}
+
+// parseLaneName maps a friendly lane name to a frontier lane byte. The empty
+// string and unknown values default to the high-priority submitted lane,
+// which is the point of /admin/site-submit: jump a site to the front.
+func parseLaneName(s string) byte {
+	switch strings.ToLower(strings.TrimSpace(s)) {
+	case "refresh":
+		return store.LaneRefresh
+	case "discovered":
+		return store.LaneDiscovered
+	case "bulk":
+		return store.LaneBulk
+	default: // "", "priority", "submitted", or anything unrecognized
+		return store.LaneSubmitted
+	}
+}
+
+func laneName(lane byte) string {
+	switch lane {
+	case store.LaneSubmitted:
+		return "submitted"
+	case store.LaneRefresh:
+		return "refresh"
+	case store.LaneDiscovered:
+		return "discovered"
+	case store.LaneBulk:
+		return "bulk"
+	default:
+		return "submitted"
+	}
+}
+
+// handleSiteSubmit discovers every URL of a website (via robots.txt sitemaps,
+// then canonical fallbacks) and pushes them all onto the live crawl frontier
+// in a chosen priority lane — by default the high-priority "submitted" lane,
+// so the whole site jumps ahead of the generic discovery backlog. Same auth
+// as the other admin endpoints. Synchronous; large sitemaps take seconds.
+//
+// Body: {"host":"pilotprotocol.network", "lane":"priority"}
+//
+//	lane: "priority" (default) | "refresh" | "discovered" | "bulk"
+type siteSubmitReq struct {
+	Host string `json:"host"`
+	Lane string `json:"lane,omitempty"`
+}
+
+func (s *pebbleHTTP) handleSiteSubmit(w http.ResponseWriter, r *http.Request) {
+	if want := s.cluster.PeerAuthToken; want != "" {
+		got := strings.TrimPrefix(r.Header.Get("Authorization"), "Bearer ")
+		if got != want {
+			writeProblem(w, http.StatusUnauthorized, "missing or invalid admin token")
+			return
+		}
+	}
+	if s.crawlSeedSitemapLane == nil {
+		writeProblem(w, http.StatusNotImplemented, "this shard has no in-serve crawler (-crawl-seeds-file not set)")
+		return
+	}
+	var req siteSubmitReq
+	body, _ := io.ReadAll(io.LimitReader(r.Body, 64<<10))
+	if err := json.Unmarshal(body, &req); err != nil || req.Host == "" {
+		writeProblem(w, http.StatusBadRequest, "expected {\"host\":\"example.com\", \"lane\":\"priority\"}")
+		return
+	}
+	host, ok := normalizeBareHost(req.Host)
+	if !ok {
+		writeProblem(w, http.StatusBadRequest, "host must be a bare hostname like example.com")
+		return
+	}
+	lane := parseLaneName(req.Lane)
+	base := "https://" + host
+	hc := &http.Client{Timeout: 20 * time.Second}
+	t0 := time.Now()
+
+	type result struct {
+		Source string `json:"source"` // "robots-sitemap" | "fallback-sitemap"
+		URL    string `json:"url"`
+		Queued int    `json:"queued"`
+		Error  string `json:"error,omitempty"`
+	}
+	candidateSitemaps, fromRobots := discoverSitemaps(r.Context(), hc, base)
+	source := "fallback-sitemap"
+	if fromRobots {
+		source = "robots-sitemap"
+	}
+	results := make([]result, 0, len(candidateSitemaps))
+	total := 0
+	for _, su := range candidateSitemaps {
+		n, err := s.crawlSeedSitemapLane(r.Context(), su, lane)
+		res := result{Source: source, URL: su, Queued: n}
+		if err != nil {
+			res.Error = err.Error()
+		}
+		total += n
+		results = append(results, res)
+	}
+	log.Printf("site-submit: %s → lane=%s discovered %d sitemap(s), queued %d URLs in %s",
+		host, laneName(lane), len(results), total, time.Since(t0).Round(time.Millisecond))
+	writeJSON(w, http.StatusOK, map[string]any{
+		"host":         host,
+		"lane":         laneName(lane),
+		"sitemaps":     len(results),
+		"total_queued": total,
+		"elapsed":      time.Since(t0).String(),
+		"results":      results,
+	})
+}
+
 // handleWETImportBulk fetches a CommonCrawl `wet.paths.gz` manifest,
 // takes the first N entries (or skip+take), and runs `/admin/wet-import`
 // against each one in parallel. Lets operators bulk-ingest a release with
@@ -3552,6 +3689,7 @@ type searchRequest struct {
 	K              int    `json:"k,omitempty"`
 	IncludeDomains string `json:"include_domains,omitempty"`
 	ExcludeDomains string `json:"exclude_domains,omitempty"`
+	Site           string `json:"site,omitempty"`
 	Since          string `json:"since,omitempty"`
 	Until          string `json:"until,omitempty"`
 	Sort           string `json:"sort,omitempty"`
@@ -3580,6 +3718,9 @@ func (s *pebbleHTTP) handleSearchPOST(w http.ResponseWriter, r *http.Request) {
 	if req.ExcludeDomains != "" {
 		v.Set("exclude_domains", req.ExcludeDomains)
 	}
+	if req.Site != "" {
+		v.Set("site", req.Site)
+	}
 	if req.Since != "" {
 		v.Set("since", req.Since)
 	}
@@ -3673,6 +3814,7 @@ type synthRequest struct {
 	K              int    `json:"k,omitempty"`
 	IncludeDomains string `json:"include_domains,omitempty"`
 	ExcludeDomains string `json:"exclude_domains,omitempty"`
+	Site           string `json:"site,omitempty"`
 	Since          string `json:"since,omitempty"`
 	Until          string `json:"until,omitempty"`
 	IncludeText    bool   `json:"include_text,omitempty"`
@@ -3695,6 +3837,9 @@ func (req synthRequest) toValues() url.Values {
 	if req.ExcludeDomains != "" {
 		v.Set("exclude_domains", req.ExcludeDomains)
 	}
+	if req.Site != "" {
+		v.Set("site", req.Site)
+	}
 	if req.Since != "" {
 		v.Set("since", req.Since)
 	}
@@ -3776,6 +3921,12 @@ func (s *pebbleHTTP) handleSearch(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 	dateFilter := !since.IsZero() || !until.IsZero()
+	// site — scope results to one or more host[/path] sections, e.g.
+	// ?site=pilotprotocol.network/docs. Host-suffix + path-prefix match,
+	// ANDed with include/exclude. Applied post-retrieval like the domain
+	// filters, so it widens the over-fetch below.
+	sites := parseSiteScopes(r.URL.Query().Get("site"))
+	siteFilter := len(sites) > 0
 	// rerank widens both the fetch and the keep-cap before filtering,
 	// so the reranker sees a healthy candidate pool even with restrictive filters.
 	wantRerank := r.URL.Query().Get("rerank") == "true" && s.reranker != nil
@@ -3787,7 +3938,7 @@ func (s *pebbleHTTP) handleSearch(w http.ResponseWriter, r *http.Request) {
 		}
 	}
 	fetchK := keepCap
-	if len(include) > 0 || len(exclude) > 0 || dateFilter {
+	if len(include) > 0 || len(exclude) > 0 || dateFilter || siteFilter {
 		mult := 5
 		if dateFilter {
 			mult = 10
@@ -3846,6 +3997,9 @@ func (s *pebbleHTTP) handleSearch(w http.ResponseWriter, r *http.Request) {
 				continue
 			}
 		}
+		if siteFilter && !matchesAnySite(h.URL, sites) {
+			continue
+		}
 		hit := searchHit{URL: h.URL, Title: h.Title, Score: h.Score}
 		if enrich || dateFilter || includeText {
 			doc, derr := s.store.GetDocByURL(r.Context(), h.URL)
@@ -5315,6 +5469,8 @@ func (s *pebbleHTTP) handleAnswerInner(w http.ResponseWriter, r *http.Request, s
 	// scoping research to a domain or date window is the common EXA shape.
 	include := splitDomainsCSV(r.URL.Query().Get("include_domains"))
 	exclude := splitDomainsCSV(r.URL.Query().Get("exclude_domains"))
+	sites := parseSiteScopes(r.URL.Query().Get("site"))
+	siteFilter := len(sites) > 0
 	since, sinceErr := parseDateBound(r.URL.Query().Get("since"))
 	if sinceErr != nil {
 		writeProblem(w, http.StatusBadRequest, "since: "+sinceErr.Error())
@@ -5341,7 +5497,7 @@ func (s *pebbleHTTP) handleAnswerInner(w http.ResponseWriter, r *http.Request, s
 		}
 	}
 	fetchK := keepCap
-	if len(include) > 0 || len(exclude) > 0 || dateFilter {
+	if len(include) > 0 || len(exclude) > 0 || dateFilter || siteFilter {
 		mult := 5
 		if dateFilter {
 			mult = 10
@@ -5412,6 +5568,9 @@ func (s *pebbleHTTP) handleAnswerInner(w http.ResponseWriter, r *http.Request, s
 				continue
 			}
 		}
+		if siteFilter && !matchesAnySite(h.URL, sites) {
+			continue
+		}
 		doc, derr := s.store.GetDocByURL(r.Context(), h.URL)
 		if derr != nil || doc == nil {
 			continue
@@ -6845,6 +7004,7 @@ func summarizeSourceList(srcs []answerSource) string {
 type retrievalFilters struct {
 	include    []string
 	exclude    []string
+	sites      []siteScope
 	since      time.Time
 	until      time.Time
 	dateActive bool
@@ -6854,6 +7014,7 @@ func parseRetrievalFilters(r *http.Request) (retrievalFilters, error) {
 	f := retrievalFilters{
 		include: splitDomainsCSV(r.URL.Query().Get("include_domains")),
 		exclude: splitDomainsCSV(r.URL.Query().Get("exclude_domains")),
+		sites:   parseSiteScopes(r.URL.Query().Get("site")),
 	}
 	since, err := parseDateBound(r.URL.Query().Get("since"))
 	if err != nil {
@@ -6882,6 +7043,9 @@ func (f retrievalFilters) allow(rawURL string, publishedAt time.Time) bool {
 			return false
 		}
 	}
+	if len(f.sites) > 0 && !matchesAnySite(rawURL, f.sites) {
+		return false
+	}
 	if f.dateActive {
 		if publishedAt.IsZero() {
 			return false
@@ -6947,6 +7111,72 @@ func hostOf(rawURL string) string {
 	return u.Hostname()
 }
 
+// siteScope is one entry of the `site` search filter: a host (suffix-matched
+// on dot boundaries, same as include_domains) plus an optional URL path
+// prefix. A zero path matches the whole host; a non-empty path scopes results
+// to a section of the site (e.g. host "pilotprotocol.network" + path "/docs").
+type siteScope struct {
+	host string
+	path string // normalized, no trailing slash; "" = any path
+}
+
+// parseSiteScopes parses the `site` parameter — a CSV of host or host/path
+// (or full-URL) entries — into scopes. Examples of a single entry:
+//
+//	pilotprotocol.network              → whole host (and subdomains)
+//	pilotprotocol.network/docs         → only URLs under /docs
+//	https://pilotprotocol.network/docs → same (scheme tolerated)
+func parseSiteScopes(csv string) []siteScope {
+	if csv == "" {
+		return nil
+	}
+	var out []siteScope
+	for _, raw := range strings.Split(csv, ",") {
+		t := strings.TrimSpace(raw)
+		if t == "" {
+			continue
+		}
+		t = strings.TrimPrefix(t, "https://")
+		t = strings.TrimPrefix(t, "http://")
+		host, path := t, ""
+		if i := strings.IndexByte(t, '/'); i >= 0 {
+			host, path = t[:i], t[i:]
+		}
+		host = strings.ToLower(strings.TrimSpace(host))
+		path = strings.TrimRight(strings.TrimSpace(path), "/")
+		if host == "" {
+			continue
+		}
+		out = append(out, siteScope{host: host, path: path})
+	}
+	return out
+}
+
+// matchesAnySite reports whether rawURL falls within any of the scopes. Host
+// matching reuses include_domains semantics (exact or dot-boundary suffix);
+// path matching is a segment-boundary prefix so "/docs" matches "/docs" and
+// "/docs/x" but not "/docsearch".
+func matchesAnySite(rawURL string, scopes []siteScope) bool {
+	if len(scopes) == 0 {
+		return true
+	}
+	u, err := url.Parse(rawURL)
+	if err != nil {
+		return false
+	}
+	host := strings.ToLower(u.Hostname())
+	p := u.Path
+	for _, sc := range scopes {
+		if host != sc.host && !strings.HasSuffix(host, "."+sc.host) {
+			continue
+		}
+		if sc.path == "" || p == sc.path || strings.HasPrefix(p, sc.path+"/") {
+			return true
+		}
+	}
+	return false
+}
+
 // parseDateBound accepts the same forms as the SQLite-side server: empty
 // (zero time), RFC3339 ("2026-01-15T00:00:00Z"), or a bare date
 // ("2026-01-15", treated as UTC midnight).
diff --git a/cmd/cosift/site_filter_submit_test.go b/cmd/cosift/site_filter_submit_test.go
new file mode 100644
index 0000000..82aaff6
--- /dev/null
+++ b/cmd/cosift/site_filter_submit_test.go
@@ -0,0 +1,276 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"github.com/pilot-protocol/cosift/internal/store"
+)
+
+func TestParseSiteScopes(t *testing.T) {
+	cases := []struct {
+		in   string
+		want []siteScope
+	}{
+		{"", nil},
+		{"pilotprotocol.network", []siteScope{{host: "pilotprotocol.network"}}},
+		{"pilotprotocol.network/docs", []siteScope{{host: "pilotprotocol.network", path: "/docs"}}},
+		{"https://pilotprotocol.network/docs/", []siteScope{{host: "pilotprotocol.network", path: "/docs"}}},
+		{"http://EXAMPLE.com/Blog", []siteScope{{host: "example.com", path: "/Blog"}}},
+		{"a.com , b.com/x", []siteScope{{host: "a.com"}, {host: "b.com", path: "/x"}}},
+		{"  ,  ", nil},
+	}
+	for _, c := range cases {
+		got := parseSiteScopes(c.in)
+		if len(got) != len(c.want) {
+			t.Errorf("parseSiteScopes(%q) len = %d, want %d (%v)", c.in, len(got), len(c.want), got)
+			continue
+		}
+		for i := range got {
+			if got[i] != c.want[i] {
+				t.Errorf("parseSiteScopes(%q)[%d] = %+v, want %+v", c.in, i, got[i], c.want[i])
+			}
+		}
+	}
+}
+
+func TestMatchesAnySite(t *testing.T) {
+	scopes := parseSiteScopes("pilotprotocol.network/docs")
+	cases := []struct {
+		url  string
+		want bool
+	}{
+		{"https://pilotprotocol.network/docs", true},
+		{"https://pilotprotocol.network/docs/", true},
+		{"https://pilotprotocol.network/docs/getting-started", true},
+		{"https://www.pilotprotocol.network/docs/x", true}, // subdomain matches host suffix
+		{"https://pilotprotocol.network/blog", false},      // wrong path
+		{"https://pilotprotocol.network/docsearch", false}, // not a segment boundary
+		{"https://evil.com/docs", false},                   // wrong host
+		{"https://notpilotprotocol.network/docs", false},   // suffix must be on dot boundary
+	}
+	for _, c := range cases {
+		if got := matchesAnySite(c.url, scopes); got != c.want {
+			t.Errorf("matchesAnySite(%q) = %v, want %v", c.url, got, c.want)
+		}
+	}
+
+	// Empty scopes = match everything (no filter).
+	if !matchesAnySite("https://anything.example/x", nil) {
+		t.Error("nil scopes should match all URLs")
+	}
+
+	// Host-only scope matches any path on the host.
+	hostOnly := parseSiteScopes("example.com")
+	if !matchesAnySite("https://example.com/anything/here", hostOnly) {
+		t.Error("host-only scope should match any path")
+	}
+}
+
+// TestHandleSearchSiteFilter drives the real /search handler against the
+// populated fixture (6 docs, all on x.example with distinct paths) to prove
+// the `site` param scopes results by host+path end-to-end.
+func TestHandleSearchSiteFilter(t *testing.T) {
+	t.Setenv("COSIFT_DEFAULT_DECAY_DAYS", "0")
+	f := populatedPebbleStore(t)
+	srv := f.makeServer(nil)
+
+	doSearch := func(query string) searchResponse {
+		t.Helper()
+		rec := httptest.NewRecorder()
+		req := httptest.NewRequest(http.MethodGet, "/search?k=10&"+query, nil)
+		srv.handleSearch(rec, req)
+		if rec.Code != http.StatusOK {
+			t.Fatalf("status = %d, body=%s", rec.Code, rec.Body.String())
+		}
+		var resp searchResponse
+		if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+			t.Fatalf("decode: %v", err)
+		}
+		return resp
+	}
+
+	// Unscoped: "consensus" matches several docs (raft, paxos, distributed).
+	all := doSearch("q=consensus")
+	if len(all.Hits) < 2 {
+		t.Fatalf("baseline: want ≥2 hits for 'consensus', got %d (%+v)", len(all.Hits), all.Hits)
+	}
+
+	// Path-scoped to /paxos: only the paxos doc may survive.
+	scoped := doSearch("q=consensus&site=x.example/paxos")
+	if len(scoped.Hits) == 0 {
+		t.Fatal("site=x.example/paxos returned no hits")
+	}
+	for _, h := range scoped.Hits {
+		if !strings.HasPrefix(h.URL, "https://x.example/paxos") {
+			t.Errorf("site=x.example/paxos leaked %s", h.URL)
+		}
+	}
+	if len(scoped.Hits) >= len(all.Hits) {
+		t.Errorf("path scope did not narrow results: scoped=%d all=%d", len(scoped.Hits), len(all.Hits))
+	}
+
+	// Host scope that matches nothing → zero hits.
+	none := doSearch("q=consensus&site=nonexistent.example")
+	if len(none.Hits) != 0 {
+		t.Errorf("site=nonexistent.example should yield 0 hits, got %d", len(none.Hits))
+	}
+
+	// Host-only scope (no path) keeps all the host's matches.
+	host := doSearch("q=consensus&site=x.example")
+	if len(host.Hits) != len(all.Hits) {
+		t.Errorf("host-only scope changed result count: %d vs baseline %d", len(host.Hits), len(all.Hits))
+	}
+}
+
+func TestParseLaneName(t *testing.T) {
+	cases := map[string]byte{
+		"":           store.LaneSubmitted,
+		"priority":   store.LaneSubmitted,
+		"submitted":  store.LaneSubmitted,
+		"PRIORITY":   store.LaneSubmitted,
+		"refresh":    store.LaneRefresh,
+		"discovered": store.LaneDiscovered,
+		"bulk":       store.LaneBulk,
+		"nonsense":   store.LaneSubmitted, // unknown defaults to priority
+	}
+	for in, want := range cases {
+		if got := parseLaneName(in); got != want {
+			t.Errorf("parseLaneName(%q) = %d, want %d", in, got, want)
+		}
+	}
+	// laneName round-trips the known lanes.
+	for _, lane := range []byte{store.LaneSubmitted, store.LaneRefresh, store.LaneDiscovered, store.LaneBulk} {
+		if parseLaneName(laneName(lane)) != lane {
+			t.Errorf("laneName/parseLaneName round-trip failed for lane %d (%q)", lane, laneName(lane))
+		}
+	}
+}
+
+func TestNormalizeBareHost(t *testing.T) {
+	cases := []struct {
+		in   string
+		host string
+		ok   bool
+	}{
+		{"example.com", "example.com", true},
+		{"https://example.com", "example.com", true},
+		{"http://example.com/", "example.com", true},
+		{"EXAMPLE.com", "example.com", true},
+		{"example.com/docs", "", false}, // path present
+		{"", "", false},
+		{"  ", "", false},
+	}
+	for _, c := range cases {
+		host, ok := normalizeBareHost(c.in)
+		if host != c.host || ok != c.ok {
+			t.Errorf("normalizeBareHost(%q) = (%q,%v), want (%q,%v)", c.in, host, ok, c.host, c.ok)
+		}
+	}
+}
+
+func TestDiscoverSitemapsRobots(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/robots.txt" {
+			_, _ = w.Write([]byte("User-agent: *\nSitemap: https://x.test/a.xml\nsitemap: https://x.test/b.xml\n"))
+			return
+		}
+		http.NotFound(w, r)
+	}))
+	defer srv.Close()
+
+	sitemaps, fromRobots := discoverSitemaps(context.Background(), srv.Client(), srv.URL)
+	if !fromRobots {
+		t.Fatal("expected fromRobots=true")
+	}
+	if len(sitemaps) != 2 || sitemaps[0] != "https://x.test/a.xml" || sitemaps[1] != "https://x.test/b.xml" {
+		t.Errorf("robots sitemaps = %v, want [a.xml b.xml]", sitemaps)
+	}
+}
+
+func TestDiscoverSitemapsFallback(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		http.NotFound(w, nil) // no robots.txt
+	}))
+	defer srv.Close()
+
+	sitemaps, fromRobots := discoverSitemaps(context.Background(), srv.Client(), srv.URL)
+	if fromRobots {
+		t.Fatal("expected fromRobots=false when robots.txt has no Sitemap directives")
+	}
+	if len(sitemaps) == 0 || sitemaps[0] != srv.URL+"/sitemap.xml" {
+		t.Errorf("fallback sitemaps = %v, want canonical list starting with /sitemap.xml", sitemaps)
+	}
+}
+
+func TestHandleSiteSubmitAuthAndValidation(t *testing.T) {
+	// 501 when no in-serve crawler is wired.
+	s := &pebbleHTTP{}
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/admin/site-submit", strings.NewReader(`{"host":"example.com"}`))
+	s.handleSiteSubmit(rec, req)
+	if rec.Code != http.StatusNotImplemented {
+		t.Errorf("no crawler: got %d want 501", rec.Code)
+	}
+
+	// 400 on missing host.
+	s = &pebbleHTTP{crawlSeedSitemapLane: func(context.Context, string, byte) (int, error) { return 0, nil }}
+	rec = httptest.NewRecorder()
+	req = httptest.NewRequest(http.MethodPost, "/admin/site-submit", strings.NewReader(`{}`))
+	s.handleSiteSubmit(rec, req)
+	if rec.Code != http.StatusBadRequest {
+		t.Errorf("missing host: got %d want 400", rec.Code)
+	}
+
+	// 400 on host with a path segment.
+	rec = httptest.NewRecorder()
+	req = httptest.NewRequest(http.MethodPost, "/admin/site-submit", strings.NewReader(`{"host":"example.com/docs"}`))
+	s.handleSiteSubmit(rec, req)
+	if rec.Code != http.StatusBadRequest {
+		t.Errorf("host with path: got %d want 400", rec.Code)
+	}
+}
+
+// TestHandleSiteSubmitLane drives the happy path against an unreachable host
+// (.invalid never resolves, so discovery fast-falls to the canonical fallback
+// list) and asserts the handler forwards the chosen lane to the seed function
+// for every candidate sitemap.
+func TestHandleSiteSubmitLane(t *testing.T) {
+	var gotLanes []byte
+	s := &pebbleHTTP{
+		crawlSeedSitemapLane: func(_ context.Context, _ string, lane byte) (int, error) {
+			gotLanes = append(gotLanes, lane)
+			return 0, nil
+		},
+	}
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/admin/site-submit",
+		strings.NewReader(`{"host":"nonexistent.invalid","lane":"priority"}`))
+	s.handleSiteSubmit(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("got %d want 200 (body=%s)", rec.Code, rec.Body.String())
+	}
+	if len(gotLanes) == 0 {
+		t.Fatal("seed func never called")
+	}
+	for i, l := range gotLanes {
+		if l != store.LaneSubmitted {
+			t.Errorf("candidate %d: lane %d, want submitted(%d)", i, l, store.LaneSubmitted)
+		}
+	}
+	var resp struct {
+		Host string `json:"host"`
+		Lane string `json:"lane"`
+	}
+	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("decode resp: %v", err)
+	}
+	if resp.Host != "nonexistent.invalid" || resp.Lane != "submitted" {
+		t.Errorf("resp = %+v, want host=nonexistent.invalid lane=submitted", resp)
+	}
+}
diff --git a/internal/crawler/sitemap.go b/internal/crawler/sitemap.go
index 379575a..61554c7 100644
--- a/internal/crawler/sitemap.go
+++ b/internal/crawler/sitemap.go
@@ -45,7 +45,23 @@ import (
 // directly — it's expected this runs at startup, not in a hot loop.
 //
 // Returns the number of URLs enqueued.
+//
+// Sitemap-imported URLs go into the refresh lane: callers run sitemap-import
+// to refresh known-good sources (kubernetes.io, docs.python.org, etc.), so
+// prioritize over generic discovery. Use SeedSitemapLane to land them in a
+// different lane (e.g. store.LaneSubmitted for an operator-driven priority
+// site submission).
 func (c *Crawler) SeedSitemap(ctx context.Context, sitemapURL string) (int, error) {
+	return c.SeedSitemapLane(ctx, sitemapURL, store.LaneRefresh)
+}
+
+// SeedSitemapLane fetches a sitemap (or sitemap-index, two levels of
+// recursion) and pushes every <loc> URL into the given priority lane.
+//
+// Bypass include_domains the same way SeedRSS does: the operator explicitly
+// requested this sitemap, so trust its URLs regardless of the curated crawler
+// allowlist.
+func (c *Crawler) SeedSitemapLane(ctx context.Context, sitemapURL string, lane byte) (int, error) {
 	// stream URLs into the frontier via callback instead of
 	// materializing the full URL list. The prior approach accumulated
 	// every URL across the entire recursive sitemap-index walk into a
@@ -54,14 +70,6 @@ func (c *Crawler) SeedSitemap(ctx context.Context, sitemapURL string) (int, erro
 	// showed strings.Builder.Write at 107 GB). Streaming bounds heap to
 	// O(current sitemap size) regardless of nesting depth or total URLs.
 	n := 0
-	// Sitemap-imported URLs go into the refresh lane: callers run
-	// sitemap-import to refresh known-good sources (kubernetes.io,
-	// docs.python.org, etc.), so prioritize over generic discovery.
-	//
-	// Bypass include_domains here for the same reason as SeedRSS: the
-	// operator explicitly requested this sitemap, so trust its URLs
-	// regardless of the curated crawler allowlist.
-	//
 	// Buffer URLs into 1024-item batches and flush via PushFrontierBatch.
 	// Single mu acquire per batch instead of per URL — at scale (MDN,
 	// kubernetes.io sitemaps with 100K+ URLs) this is the difference
@@ -83,7 +91,7 @@ func (c *Crawler) SeedSitemap(ctx context.Context, sitemapURL string) (int, erro
 		if cerr != nil {
 			return
 		}
-		buf = append(buf, store.FrontierPushItem{URL: canon, Depth: 0, Lane: 1, Priority: 1.0})
+		buf = append(buf, store.FrontierPushItem{URL: canon, Depth: 0, Lane: lane, Priority: 1.0})
 		if len(buf) >= batchSize {
 			flush()
 		}
diff --git a/internal/crawler/sitemap_lane_test.go b/internal/crawler/sitemap_lane_test.go
new file mode 100644
index 0000000..cb883f8
--- /dev/null
+++ b/internal/crawler/sitemap_lane_test.go
@@ -0,0 +1,89 @@
+package crawler
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"path/filepath"
+	"testing"
+
+	"github.com/pilot-protocol/cosift/internal/config"
+	"github.com/pilot-protocol/cosift/internal/index"
+	"github.com/pilot-protocol/cosift/internal/store"
+)
+
+// TestSeedSitemapLane confirms SeedSitemapLane lands every <loc> URL in the
+// requested frontier lane (here the high-priority submitted lane, lane 0),
+// while plain SeedSitemap keeps using the refresh lane (lane 1). This is the
+// plumbing behind /admin/site-submit's "submit a whole site to the priority
+// queue".
+func TestSeedSitemapLane(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/xml")
+		_, _ = w.Write([]byte(urlsetXML)) // 3 URLs, defined in sitemap_test.go
+	}))
+	defer srv.Close()
+
+	dir := filepath.Join(t.TempDir(), "pebble")
+	ps, err := store.OpenPebble(dir)
+	if err != nil {
+		t.Fatalf("OpenPebble: %v", err)
+	}
+	defer ps.Close()
+
+	cfg := config.Default().Crawler
+	cfg.RespectRobots = false
+	c := NewWithBackend(cfg, ps, index.NewPebbleBM25(ps))
+
+	n, err := c.SeedSitemapLane(context.Background(), srv.URL+"/sitemap.xml", store.LaneSubmitted)
+	if err != nil {
+		t.Fatalf("SeedSitemapLane: %v", err)
+	}
+	if n != 3 {
+		t.Fatalf("queued: got %d want 3", n)
+	}
+
+	stats, err := ps.GetLaneStats(context.Background())
+	if err != nil {
+		t.Fatalf("GetLaneStats: %v", err)
+	}
+	if got := stats.Lanes[store.LaneSubmitted].Queued; got != 3 {
+		t.Errorf("submitted lane queued: got %d want 3", got)
+	}
+	if got := stats.Lanes[store.LaneRefresh].Queued; got != 0 {
+		t.Errorf("refresh lane queued: got %d want 0 (should not leak into refresh)", got)
+	}
+}
+
+// TestSeedSitemapDefaultLane confirms the plain wrapper still uses the refresh
+// lane — a regression guard so site-submit's lane change doesn't silently
+// alter sitemap-import behavior.
+func TestSeedSitemapDefaultLane(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/xml")
+		_, _ = w.Write([]byte(urlsetXML))
+	}))
+	defer srv.Close()
+
+	dir := filepath.Join(t.TempDir(), "pebble")
+	ps, err := store.OpenPebble(dir)
+	if err != nil {
+		t.Fatalf("OpenPebble: %v", err)
+	}
+	defer ps.Close()
+
+	cfg := config.Default().Crawler
+	cfg.RespectRobots = false
+	c := NewWithBackend(cfg, ps, index.NewPebbleBM25(ps))
+
+	if _, err := c.SeedSitemap(context.Background(), srv.URL+"/sitemap.xml"); err != nil {
+		t.Fatalf("SeedSitemap: %v", err)
+	}
+	stats, err := ps.GetLaneStats(context.Background())
+	if err != nil {
+		t.Fatalf("GetLaneStats: %v", err)
+	}
+	if got := stats.Lanes[store.LaneRefresh].Queued; got != 3 {
+		t.Errorf("refresh lane queued: got %d want 3", got)
+	}
+}

From ebf230b88fccea441c4a994e63bcaa8a095813c9 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Mon, 15 Jun 2026 15:48:10 +0000
Subject: [PATCH 09/10] =?UTF-8?q?feat(purge):=20purge-domain=20command=20?=
 =?UTF-8?q?=E2=80=94=20soft-delete=20docs=20by=20host/TLD=20suffix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pure host-suffix sweep (dot-boundary) over the corpus: -suffix cfd,sbs
soft-deletes every *.cfd and *.sbs doc regardless of content. Companion to
the crawler exclude_domains blacklist (which stops new ones) for clearing an
already-indexed backlog. Dry-run by default; -apply to delete; -readonly to
report alongside a live serve. Mirrors purge-adult's soft-delete + histogram
report; reuses matchesAnyDomain for dot-boundary matching.
---
 cmd/cosift/main.go              |   4 ++
 cmd/cosift/purge_domain.go      | 124 ++++++++++++++++++++++++++++++++
 cmd/cosift/purge_domain_test.go |  98 +++++++++++++++++++++++++
 3 files changed, 226 insertions(+)
 create mode 100644 cmd/cosift/purge_domain.go
 create mode 100644 cmd/cosift/purge_domain_test.go

diff --git a/cmd/cosift/main.go b/cmd/cosift/main.go
index fd813fa..0a59a3c 100644
--- a/cmd/cosift/main.go
+++ b/cmd/cosift/main.go
@@ -338,6 +338,10 @@ func run(cfgPath string) error {
 		if err := runPurgeAdult(ctx, flag.Args()[1:]); err != nil {
 			return fmt.Errorf("purge-adult: %w", err)
 		}
+	case "purge-domain":
+		if err := runPurgeDomain(ctx, flag.Args()[1:]); err != nil {
+			return fmt.Errorf("purge-domain: %w", err)
+		}
 	case "verify":
 		if err := runVerifyPebble(ctx, cfg, flag.Args()[1:]); err != nil {
 			return fmt.Errorf("verify: %w", err)
diff --git a/cmd/cosift/purge_domain.go b/cmd/cosift/purge_domain.go
new file mode 100644
index 0000000..eadd2be
--- /dev/null
+++ b/cmd/cosift/purge_domain.go
@@ -0,0 +1,124 @@
+package main
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/pilot-protocol/cosift/internal/store"
+)
+
+// runPurgeDomain sweeps an offline PebbleStore and soft-deletes every document
+// whose host matches one of the given domain/TLD suffixes (dot-boundary), e.g.
+// -suffix cfd,sbs removes every *.cfd and *.sbs page regardless of content.
+//
+// Companion to the crawler's exclude_domains blacklist: the blacklist stops
+// NEW pages from those TLDs being crawled, this clears the backlog already
+// indexed. Unlike purge-adult (which only removes pages that ALSO trip the
+// adult classifier), this is a pure host-suffix sweep.
+//
+// DRY RUN BY DEFAULT. -apply soft-deletes (store.SoftDeleteDocument), leaving
+// inverted-index postings as harmless orphans (retrieval skips any docID whose
+// meta is gone), so it's a few point-deletes per doc rather than a full index
+// rewrite — tractable across a multi-million-doc corpus. After purging a large
+// fraction, run a compaction to reclaim disk and correct IDF.
+//
+//	cosift purge-domain -dir /data/pebble -suffix cfd,sbs            # dry run + report
+//	cosift purge-domain -dir /data/pebble -suffix cfd,sbs -apply     # delete
+//	cosift purge-domain -dir /data/pebble -suffix cfd,sbs -readonly  # dry run alongside a live serve
+func runPurgeDomain(ctx context.Context, args []string) error {
+	fs := flag.NewFlagSet("purge-domain", flag.ExitOnError)
+	dir := fs.String("dir", "", "PebbleStore directory (required; same dir as pebble-serve -dir)")
+	suffixCSV := fs.String("suffix", "", "CSV of host/TLD suffixes to purge, dot-boundary match (e.g. cfd,sbs)")
+	apply := fs.Bool("apply", false, "actually soft-delete matches (default: dry run, report only)")
+	limit := fs.Int("limit", 0, "stop after deleting this many docs (0 = no limit)")
+	topHosts := fs.Int("top-hosts", 25, "how many top matched hosts/TLDs to print in the report")
+	readonly := fs.Bool("readonly", false, "open the store read-only (no lock) — runs alongside a live pebble-serve; forces dry run")
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+	if *dir == "" {
+		return fmt.Errorf("-dir required")
+	}
+	suffixes := splitDomainsCSV(*suffixCSV)
+	if len(suffixes) == 0 {
+		return fmt.Errorf("-suffix required (e.g. -suffix cfd,sbs)")
+	}
+	if *readonly && *apply {
+		return fmt.Errorf("-readonly cannot be combined with -apply (read-only opens take no write lock)")
+	}
+
+	var ps *store.PebbleStore
+	var err error
+	if *readonly {
+		ps, err = store.OpenPebbleReadOnly(*dir)
+	} else {
+		ps, err = store.OpenPebble(*dir)
+	}
+	if err != nil {
+		return fmt.Errorf("open store: %w", err)
+	}
+	defer ps.Close()
+
+	_, before, _ := ps.CorpusStats(ctx)
+	mode := "DRY RUN (no deletes)"
+	if *apply {
+		mode = "APPLY (soft-deleting matches)"
+	}
+	fmt.Fprintf(os.Stderr, "purge-domain: %s — scanning %d docs for suffixes %v\n", mode, before, suffixes)
+
+	var scanned, matched, deleted int64
+	tldHist := map[string]int64{}
+	hostHist := map[string]int64{}
+	var samples []string
+
+	err = ps.IterDocMeta(ctx, func(docID int64, url, title string) error {
+		scanned++
+		if scanned%500_000 == 0 {
+			fmt.Fprintf(os.Stderr, "purge-domain: scanned %d, matched %d, deleted %d\n", scanned, matched, deleted)
+		}
+		host := hostFromURL(url)
+		if !matchesAnyDomain(host, suffixes) {
+			return nil
+		}
+		matched++
+		hostHist[host]++
+		tldHist["."+tldOfHost(host)]++
+		if len(samples) < 20 {
+			samples = append(samples, url)
+		}
+		if *apply {
+			ok, derr := ps.SoftDeleteDocument(ctx, docID, url)
+			if derr != nil {
+				return fmt.Errorf("delete doc %d: %w", docID, derr)
+			}
+			if ok {
+				deleted++
+			}
+			if *limit > 0 && deleted >= int64(*limit) {
+				return errStopSweep
+			}
+		}
+		return nil
+	})
+	if err != nil && err != errStopSweep {
+		return fmt.Errorf("sweep: %w", err)
+	}
+
+	_, after, _ := ps.CorpusStats(ctx)
+	fmt.Fprintf(os.Stderr, "\npurge-domain: done — scanned=%d matched=%d deleted=%d\n", scanned, matched, deleted)
+	fmt.Fprintf(os.Stderr, "purge-domain: corpus indexed_docs %d → %d\n", before, after)
+	printHist(os.Stderr, "top matched TLDs", tldHist, *topHosts)
+	printHist(os.Stderr, "top matched hosts", hostHist, *topHosts)
+	if len(samples) > 0 {
+		fmt.Fprintln(os.Stderr, "\nsample matches:")
+		for _, s := range samples {
+			fmt.Fprintln(os.Stderr, "  "+s)
+		}
+	}
+	if !*apply && matched > 0 {
+		fmt.Fprintf(os.Stderr, "\npurge-domain: DRY RUN — re-run with -apply to soft-delete the %d matched docs.\n", matched)
+	}
+	return nil
+}
diff --git a/cmd/cosift/purge_domain_test.go b/cmd/cosift/purge_domain_test.go
new file mode 100644
index 0000000..4146a89
--- /dev/null
+++ b/cmd/cosift/purge_domain_test.go
@@ -0,0 +1,98 @@
+package main
+
+import (
+	"context"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/pilot-protocol/cosift/internal/index"
+	"github.com/pilot-protocol/cosift/internal/store"
+)
+
+// TestRunPurgeDomain verifies the host-suffix sweep soft-deletes exactly the
+// matching-TLD docs (dot-boundary) and leaves the rest, and that the default
+// dry run deletes nothing.
+func TestRunPurgeDomain(t *testing.T) {
+	ctx := context.Background()
+	dir := filepath.Join(t.TempDir(), "pebble")
+	ps, err := store.OpenPebble(dir)
+	if err != nil {
+		t.Fatalf("OpenPebble: %v", err)
+	}
+	idx := index.NewPebbleBM25(ps)
+
+	docs := []struct{ url, title, text string }{
+		{"https://spam1.cfd/a", "Spam A", "junk body one"},
+		{"https://x.spam.cfd/b", "Spam B", "junk body two"},  // subdomain of a .cfd host
+		{"https://gamble.sbs/c", "Bet C", "junk body three"}, // .sbs
+		{"https://good.com/d", "Good D", "real useful content"},
+		{"https://docs.example.org/e", "Docs E", "reference material"},
+		{"https://notcfd.com/f", "Edge F", "ends in cfd-ish but is .com"}, // must NOT match
+	}
+	for _, d := range docs {
+		id, err := ps.UpsertDocument(ctx, &store.Document{URL: d.url, Title: d.title, Text: d.text, FetchedAt: time.Now()})
+		if err != nil {
+			t.Fatalf("UpsertDocument %s: %v", d.url, err)
+		}
+		if err := idx.IndexDocument(ctx, id, d.title, d.text); err != nil {
+			t.Fatalf("IndexDocument %s: %v", d.url, err)
+		}
+	}
+	ps.Close() // release the write lock so runPurgeDomain can open the dir
+
+	// Dry run: must delete nothing.
+	if err := runPurgeDomain(ctx, []string{"-dir", dir, "-suffix", "cfd,sbs"}); err != nil {
+		t.Fatalf("dry run: %v", err)
+	}
+	assertDocs(t, dir, map[string]bool{
+		"https://spam1.cfd/a": true, "https://gamble.sbs/c": true, "https://good.com/d": true,
+	})
+
+	// Apply: purge *.cfd and *.sbs.
+	if err := runPurgeDomain(ctx, []string{"-dir", dir, "-suffix", "cfd,sbs", "-apply"}); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	assertDocs(t, dir, map[string]bool{
+		"https://spam1.cfd/a":        false, // purged
+		"https://x.spam.cfd/b":       false, // purged (subdomain)
+		"https://gamble.sbs/c":       false, // purged
+		"https://good.com/d":         true,  // kept
+		"https://docs.example.org/e": true,  // kept
+		"https://notcfd.com/f":       true,  // kept (dot-boundary: not a .cfd)
+	})
+}
+
+// assertDocs opens the store read-only and checks presence/absence per URL.
+func assertDocs(t *testing.T, dir string, want map[string]bool) {
+	t.Helper()
+	ps, err := store.OpenPebble(dir)
+	if err != nil {
+		t.Fatalf("reopen: %v", err)
+	}
+	defer ps.Close()
+	for u, shouldExist := range want {
+		d, _ := ps.GetDocByURL(context.Background(), u)
+		if shouldExist && d == nil {
+			t.Errorf("%s: expected present, got deleted", u)
+		}
+		if !shouldExist && d != nil {
+			t.Errorf("%s: expected purged, still present", u)
+		}
+	}
+}
+
+func TestRunPurgeDomainRequiresSuffix(t *testing.T) {
+	dir := filepath.Join(t.TempDir(), "pebble")
+	ps, err := store.OpenPebble(dir)
+	if err != nil {
+		t.Fatalf("OpenPebble: %v", err)
+	}
+	ps.Close()
+	if err := runPurgeDomain(context.Background(), []string{"-dir", dir}); err == nil {
+		t.Error("expected error when -suffix is empty")
+	}
+	if err := runPurgeDomain(context.Background(), []string{"-dir", dir, "-suffix", "cfd", "-apply", "-readonly"}); err == nil {
+		t.Error("expected error when -apply combined with -readonly")
+	}
+}

From 8216f9cecd808c2e2a9e2ef9d994c721de57cdc2 Mon Sep 17 00:00:00 2001
From: Teodor Calin <teodor@vulturelabs.io>
Date: Tue, 16 Jun 2026 10:52:07 +0300
Subject: [PATCH 10/10] fix(frontier): migrate 3 remaining read paths to
 lane-aware key layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The site= scoping PR moved frontier secondary-index writes to the
lane-aware key layout ('f','q',<lane>,<host>,0x00,<url>) but left three
read paths on the legacy ('f','q',<host>,0x00,<url>) layout:

- CountQueuedPerHost built a legacy 'f','q',host,0x00 prefix that can
  never match a lane key (the lane byte sits between 'q' and the host),
  so it returned 0 for every host whose URLs were pushed post-migration.
  Now counts the legacy range plus all 4 lane ranges, mirroring
  PurgeFrontierByHost.

- TopQueuedHosts scanned the correct 'f','q' range but decoded the host
  with the legacy frontierStatusIndexHost, which reads from byte 2 — the
  lane byte on a lane key, not the host. Switched to the lane-aware
  decodeFrontierIndexHost, which handles both formats.

- SeedSitemapLane bypassed the include/exclude domain filter: the new
  canonicalize + PushFrontierBatch path skipped allowedDomain entirely,
  so ExcludeDomains was no longer honored. Re-applied c.allowedDomain in
  the sitemap emit callback.

Fixes TestPebbleCountQueuedPerHost, TestPebblePurgeFrontierByHost,
TestPebbleTopQueuedHosts, TestSeedSitemapRespectsDomainFilters. No
wire/key-format changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 internal/crawler/sitemap.go |  9 +++--
 internal/store/pebble.go    | 76 +++++++++++++++++++++++++------------
 2 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/internal/crawler/sitemap.go b/internal/crawler/sitemap.go
index 61554c7..7f850ad 100644
--- a/internal/crawler/sitemap.go
+++ b/internal/crawler/sitemap.go
@@ -58,9 +58,9 @@ func (c *Crawler) SeedSitemap(ctx context.Context, sitemapURL string) (int, erro
 // SeedSitemapLane fetches a sitemap (or sitemap-index, two levels of
 // recursion) and pushes every <loc> URL into the given priority lane.
 //
-// Bypass include_domains the same way SeedRSS does: the operator explicitly
-// requested this sitemap, so trust its URLs regardless of the curated crawler
-// allowlist.
+// URLs are filtered through allowedDomain so the operator's include/exclude
+// domain lists (and ExcludeURLPatterns) are honored — a sitemap pointing at a
+// blocked host must not slip URLs past the curated allowlist.
 func (c *Crawler) SeedSitemapLane(ctx context.Context, sitemapURL string, lane byte) (int, error) {
 	// stream URLs into the frontier via callback instead of
 	// materializing the full URL list. The prior approach accumulated
@@ -91,6 +91,9 @@ func (c *Crawler) SeedSitemapLane(ctx context.Context, sitemapURL string, lane b
 		if cerr != nil {
 			return
 		}
+		if !c.allowedDomain(canon) {
+			return
+		}
 		buf = append(buf, store.FrontierPushItem{URL: canon, Depth: 0, Lane: lane, Priority: 1.0})
 		if len(buf) >= batchSize {
 			flush()
diff --git a/internal/store/pebble.go b/internal/store/pebble.go
index 5bd31f4..b758bb4 100644
--- a/internal/store/pebble.go
+++ b/internal/store/pebble.go
@@ -2734,37 +2734,59 @@ func (p *PebbleStore) CountQueuedPerHost(ctx context.Context, hosts []string) (m
 		return nil, err
 	}
 	out := make(map[string]int, len(hosts))
+	// countRange counts the keys in one secondary-index prefix range. Wrapped
+	// in a closure so defer it.Close() runs per call, not stacked until the
+	// enclosing function returns (Go defers fire on FUNCTION return).
+	countRange := func(lo, hi []byte) (int, error) {
+		it, err := p.db.NewIter(&pebble.IterOptions{LowerBound: lo, UpperBound: hi})
+		if err != nil {
+			return 0, err
+		}
+		defer it.Close()
+		var count int
+		for valid := it.First(); valid; valid = it.Next() {
+			count++
+		}
+		return count, nil
+	}
 	for _, host := range hosts {
 		if host == "" {
 			continue
 		}
-		// Prefix bound: 'f' + 'q' + host + 0x00 .. 'f' + 'q' + host + 0x01
-		lo := make([]byte, 2+len(host)+1)
-		lo[0] = famFrontier
-		lo[1] = 'q'
-		copy(lo[2:], host)
-		lo[2+len(host)] = 0x00
-		hi := append([]byte{}, lo...)
-		hi[2+len(host)] = 0x01
-		// per-iteration closure so defer it.Close() runs at each
-		// host's end, not at the enclosing function's return. Without the
-		// closure a `defer` here would stack iterators until function exit
-		// (Go defers fire on FUNCTION return, not on loop iteration).
-		n, err := func() (int, error) {
-			it, err := p.db.NewIter(&pebble.IterOptions{LowerBound: lo, UpperBound: hi})
-			if err != nil {
-				return 0, err
-			}
-			defer it.Close()
-			var count int
-			for valid := it.First(); valid; valid = it.Next() {
-				count++
-			}
-			return count, nil
-		}()
+		n := 0
+		// Legacy range: 'f' + 'q' + host + 0x00 .. 0x01 (pre-lanes entries).
+		legacyLo := make([]byte, 2+len(host)+1)
+		legacyLo[0] = famFrontier
+		legacyLo[1] = 'q'
+		copy(legacyLo[2:], host)
+		legacyLo[2+len(host)] = 0x00
+		legacyHi := append([]byte{}, legacyLo...)
+		legacyHi[2+len(host)] = 0x01
+		c, err := countRange(legacyLo, legacyHi)
 		if err != nil {
 			return nil, err
 		}
+		n += c
+		// Lane-aware ranges: 'f' + 'q' + lane + host + 0x00 .. 0x01 for every
+		// lane. The legacy prefix never matches these (the lane byte sits
+		// between 'q' and the host), so without this loop hosts whose queued
+		// URLs live in lanes — i.e. everything pushed after the lane migration —
+		// would count as 0.
+		for lane := byte(0); lane < laneCount; lane++ {
+			laneLo := make([]byte, 3+len(host)+1)
+			laneLo[0] = famFrontier
+			laneLo[1] = 'q'
+			laneLo[2] = lane
+			copy(laneLo[3:], host)
+			laneLo[3+len(host)] = 0x00
+			laneHi := append([]byte{}, laneLo...)
+			laneHi[3+len(host)] = 0x01
+			c, err := countRange(laneLo, laneHi)
+			if err != nil {
+				return nil, err
+			}
+			n += c
+		}
 		if n > 0 {
 			out[host] = n
 		}
@@ -3257,7 +3279,11 @@ func (p *PebbleStore) TopQueuedHosts(ctx context.Context, topN int) ([]DomainCou
 	defer it.Close()
 	counts := make(map[string]int, 256)
 	for valid := it.First(); valid; valid = it.Next() {
-		host := frontierStatusIndexHost(it.Key())
+		// decodeFrontierIndexHost handles both the legacy
+		// 'f'+'q'+host+0x00+url keys and the lane-aware
+		// 'f'+'q'+lane+host+0x00+url keys. The legacy frontierStatusIndexHost
+		// reads from byte 2, which on a lane key is the lane byte, not the host.
+		host := decodeFrontierIndexHost(it.Key())
 		if host == "" {
 			continue
 		}