Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,13 +203,16 @@ To replay: process segments in ascending run id; each line with status `present`

```toml
[volumes.pictures]
path = "~/Pictures"
sync_to = ["nas", "offsite"]
offload_requires = ["nas", "offsite"]
path = "~/Pictures"
sync_to = ["nas", "offsite"]
offload_requires = ["nas", "offsite"]
offload_max_evidence_age = "720h" # optional; default disabled
```

`offload_requires` is the explicit per-volume policy: every named target's recorded durability must cover a file's content before its bytes may go, and a volume without the key refuses to offload entirely. The names share the flat destination/node namespace that `sync_to` uses. They may also name targets only a *peer* pushes to: evidence about those arrives through the peer durability pull (`squirrel peer-sync pull-durability`), and a name with no recorded evidence simply keeps the gate closed.

`offload_max_evidence_age` is an optional, opt-in defence-in-depth knob: when set, a target whose durability evidence was last *re-verified* longer ago than this (in wall-clock time) is treated as stale and refuses the offload, even when its version-vector coverage is sound. This stops the gate from trusting a destination that has been dead or unreachable for months on the strength of a claim never since re-confirmed. It is fail-closed — evidence with an unknown verification time (a row migrated from before the column existed, or one only ever touched without re-verification) is refused too. Re-verification (a fresh push, or a durability pull from the asserting peer) re-stamps the evidence and clears the staleness. The default is disabled (no maximum age), so existing configs are unaffected; only an equal-value re-confirmation that carries no verification never refreshes the clock, so the age tracks genuine checks rather than no-op touches.

```
squirrel offload pictures 2019/ # a subtree
squirrel offload pictures --older-than 90d # by age (indexed mtime)
Expand Down
11 changes: 6 additions & 5 deletions cmd/squirrel/offload.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,12 @@ func runOffload(cmd *cobra.Command, volumeName string, paths []string, olderThan
defer s.Close()

rep, err := offload.Offload(cmd.Context(), s, vol.Path, offload.Options{
Name: volumeName,
Paths: paths,
OlderThan: olderThan,
Require: vol.OffloadRequires,
DryRun: dryRun,
Name: volumeName,
Paths: paths,
OlderThan: olderThan,
Require: vol.OffloadRequires,
MaxEvidenceAge: vol.OffloadMaxEvidenceAge,
DryRun: dryRun,
})
printOffloadReport(cmd.OutOrStdout(), cmd.ErrOrStderr(), rep, dryRun)
if err != nil {
Expand Down
62 changes: 49 additions & 13 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ type Volume struct {
// durability pull about targets only that peer reaches; a name with
// no recorded evidence keeps the gate closed.
OffloadRequires []string
// OffloadMaxEvidenceAge bounds how old a required target's durability
// evidence may be (in wall-clock time since it was last re-verified)
// before `squirrel offload` refuses to delete on its strength. Zero —
// the default — disables the time-based staleness policy, leaving the
// version-vector coverage gate as the sole durability check, so the
// knob is opt-in and existing configs are unaffected. When set,
// evidence whose last verification is unknown or older than this is
// refused (fail-closed); it pairs with periodic re-verification
// (`squirrel verify`, scrub) that re-stamps fresh evidence.
OffloadMaxEvidenceAge time.Duration
// SyncEvery is the agent-scheduler cadence for full syncs of this
// volume. Zero means "no scheduled sync" — the agent never auto-
// triggers a sync for this volume; manual `squirrel sync` still
Expand Down Expand Up @@ -266,12 +276,13 @@ type rawConfig struct {
}

type rawVolume struct {
Path string `toml:"path"`
SyncTo []string `toml:"sync_to"`
OffloadRequires []string `toml:"offload_requires"`
SyncEvery string `toml:"sync_every"`
IndexEvery string `toml:"index_every"`
Hook *rawVolumeHook `toml:"hook"`
Path string `toml:"path"`
SyncTo []string `toml:"sync_to"`
OffloadRequires []string `toml:"offload_requires"`
OffloadMaxEvidenceAge string `toml:"offload_max_evidence_age"`
SyncEvery string `toml:"sync_every"`
IndexEvery string `toml:"index_every"`
Hook *rawVolumeHook `toml:"hook"`
}

type rawVolumeHook struct {
Expand Down Expand Up @@ -365,6 +376,10 @@ func resolveVolume(name string, raw rawVolume, dests map[string]*Destination, no
if err := validateOffloadRequires(raw.OffloadRequires); err != nil {
return nil, err
}
offloadMaxEvidenceAge, err := parseOffloadMaxEvidenceAge(raw.OffloadMaxEvidenceAge)
if err != nil {
return nil, err
}
syncEvery, err := parseVolumeCadence("sync_every", raw.SyncEvery)
if err != nil {
return nil, err
Expand All @@ -385,16 +400,37 @@ func resolveVolume(name string, raw rawVolume, dests map[string]*Destination, no
return nil, err
}
return &Volume{
Name: name,
Path: abs,
SyncTo: raw.SyncTo,
OffloadRequires: raw.OffloadRequires,
SyncEvery: syncEvery,
IndexEvery: indexEvery,
Hook: hook,
Name: name,
Path: abs,
SyncTo: raw.SyncTo,
OffloadRequires: raw.OffloadRequires,
OffloadMaxEvidenceAge: offloadMaxEvidenceAge,
SyncEvery: syncEvery,
IndexEvery: indexEvery,
Hook: hook,
}, nil
}

// parseOffloadMaxEvidenceAge parses the optional offload_max_evidence_age
// knob. Empty stays zero — the time-based staleness policy is disabled,
// its opt-in default. Non-empty must parse as a strictly positive
// time.Duration; a sub-second value is almost certainly a missing unit
// suffix (e.g. `30` where `720h` or `30d`-style was meant) for a knob
// whose sensible values are days to months, so it is rejected.
func parseOffloadMaxEvidenceAge(raw string) (time.Duration, error) {
if raw == "" {
return 0, nil
}
dur, err := time.ParseDuration(raw)
if err != nil {
return 0, fmt.Errorf("offload_max_evidence_age %q: %w", raw, err)
}
if dur < time.Second {
return 0, fmt.Errorf("offload_max_evidence_age must be at least %s, got %s", time.Second, raw)
}
return dur, nil
}

// validateOffloadRequires checks the offload policy entries
// syntactically: each must be a well-formed target name and appear
// once. Membership in this config's destinations/nodes is deliberately
Expand Down
46 changes: 46 additions & 0 deletions config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,52 @@ offload_requires = ["nas", "nas"]
}
}

// TestLoadOffloadMaxEvidenceAge: the optional staleness knob parses as a
// duration; absent it defaults to zero (the time-based policy disabled).
func TestLoadOffloadMaxEvidenceAge(t *testing.T) {
p := writeConfig(t, `
[volumes.pictures]
path = "/tmp/pictures"
offload_requires = ["nas"]
offload_max_evidence_age = "720h"

[volumes.docs]
path = "/tmp/docs"
offload_requires = ["nas"]
`)
cfg, err := Load(p)
if err != nil {
t.Fatalf("Load: %v", err)
}
if got := cfg.Volumes["pictures"].OffloadMaxEvidenceAge; got != 720*time.Hour {
t.Fatalf("OffloadMaxEvidenceAge = %s, want 720h", got)
}
if got := cfg.Volumes["docs"].OffloadMaxEvidenceAge; got != 0 {
t.Fatalf("OffloadMaxEvidenceAge = %s, want 0 (disabled by default)", got)
}
}

func TestLoadRejectsBadOffloadMaxEvidenceAge(t *testing.T) {
cases := []struct{ name, value string }{
{"garbage", "soon"},
{"sub_second", "500ms"},
{"unitless", "30"},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
p := writeConfig(t, `
[volumes.pictures]
path = "/tmp/pictures"
offload_requires = ["nas"]
offload_max_evidence_age = "`+c.value+`"
`)
if _, err := Load(p); err == nil || !strings.Contains(err.Error(), "offload_max_evidence_age") {
t.Fatalf("expected offload_max_evidence_age error, got %v", err)
}
})
}
}

func TestLoadRejectsInvalidName(t *testing.T) {
// Names that wouldn't survive being a filesystem subfolder or an
// rclone.conf section are rejected at load time.
Expand Down
82 changes: 69 additions & 13 deletions offload/gate.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"database/sql"
"errors"
"fmt"
"time"

"github.com/mbertschler/squirrel/store"
)
Expand All @@ -16,11 +17,14 @@ import (
// behind it; source is the asserting peer's node id when the component
// was last advanced by a durability pull, and invalid (NULL) for the
// locally-verified class — so the gate's decision names which peer's
// evidence backed it.
// evidence backed it. verifiedAt is the wall-clock of the last write
// backed by genuine re-verification (NULL when unknown), the basis for
// the time-based staleness policy.
type component struct {
coveredRun int64
method string
source sql.NullInt64
verifiedAt sql.NullInt64
}

// gate is the offline durability evidence for one invocation: the self
Expand All @@ -41,22 +45,33 @@ type gate struct {
lastPush map[string]int64 // target → last whole-volume push run (local space)
freshness map[string]map[int64]int64 // target → origin node id → pulled push-freshness origin run
nodeNames map[int64]string
// nowNs is the invocation's wall-clock, captured once so every
// candidate's staleness is judged against one instant and tests can
// inject a fixed time.
nowNs int64
// maxEvidenceAge, when positive, refuses a component whose
// verified_at_ns is older than nowNs − maxEvidenceAge (or unknown).
// Zero disables the time-based policy — the opt-in default, so the
// version-vector gate behaves exactly as before.
maxEvidenceAge time.Duration
}

func loadGate(ctx context.Context, s *store.Store, volumeID int64, require []string) (*gate, error) {
func loadGate(ctx context.Context, s *store.Store, volumeID int64, require []string, nowNs int64, maxEvidenceAge time.Duration) (*gate, error) {
self, err := s.GetSelfNode(ctx)
if err != nil {
return nil, fmt.Errorf("lookup self node: %w", err)
}
g := &gate{
store: s,
volumeID: volumeID,
self: self,
require: require,
vectors: make(map[string]map[int64]component, len(require)),
lastPush: make(map[string]int64, len(require)),
freshness: make(map[string]map[int64]int64, len(require)),
nodeNames: map[int64]string{self.ID: self.Name},
store: s,
volumeID: volumeID,
self: self,
require: require,
vectors: make(map[string]map[int64]component, len(require)),
lastPush: make(map[string]int64, len(require)),
freshness: make(map[string]map[int64]int64, len(require)),
nodeNames: map[int64]string{self.ID: self.Name},
nowNs: nowNs,
maxEvidenceAge: maxEvidenceAge,
}
for _, target := range require {
components, err := s.ListDestinationRunIDs(ctx, volumeID, target)
Expand All @@ -65,7 +80,7 @@ func loadGate(ctx context.Context, s *store.Store, volumeID int64, require []str
}
vector := make(map[int64]component, len(components))
for _, c := range components {
vector[c.OriginNodeID] = component{coveredRun: c.OriginRunID, method: c.VerifyMethod, source: c.SourceNodeID}
vector[c.OriginNodeID] = component{coveredRun: c.OriginRunID, method: c.VerifyMethod, source: c.SourceNodeID, verifiedAt: c.VerifiedAtNs}
}
g.vectors[target] = vector

Expand All @@ -89,9 +104,13 @@ func loadGate(ctx context.Context, s *store.Store, volumeID int64, require []str
}

// check evaluates the gate for one present row. Content with origin
// (N, r) is durable on a target only when all three conditions hold:
// (N, r) is durable on a target only when all four conditions hold:
//
// - origin vector: the target's component for N covers r;
// - evidence age: when a max evidence age is configured, the
// component's verified_at_ns is within it — defence-in-depth against
// a destination that was once durable but has not had its coverage
// re-confirmed in wall-clock time;
// - freshness: a successful whole-volume push covers the run in which
// the path last became present, so a path re-acquired after the last
// push is held until a fresh push covers it. For a target this node
Expand All @@ -102,7 +121,7 @@ func loadGate(ctx context.Context, s *store.Store, volumeID int64, require []str
// only content-addressed component — a verified scan-back
// fingerprint backs the gated object.
//
// The file passes only when every required target satisfies all three.
// The file passes only when every required target satisfies all four.
// The returned failures name each failing target and reason; an empty
// slice means the gate passed.
func (g *gate) check(ctx context.Context, row store.FileRow) ([]string, error) {
Expand All @@ -123,6 +142,10 @@ func (g *gate) check(ctx context.Context, row store.FileRow) ([]string, error) {
fmt.Sprintf("%s: stale: have %d need %d (origin %s, %s)", target, comp.coveredRun, originRun, g.nodeName(ctx, originNode), g.provenance(ctx, comp)))
continue
}
if reason := g.staleEvidenceFailure(ctx, target, comp); reason != "" {
failures = append(failures, reason)
continue
}
if reason := g.freshnessFailure(ctx, target, row, originNode, originRun); reason != "" {
failures = append(failures, reason)
continue
Expand All @@ -139,6 +162,39 @@ func (g *gate) check(ctx context.Context, row store.FileRow) ([]string, error) {
return failures, nil
}

// staleEvidenceFailure refuses the target when a max evidence age is
// configured and the component's verification is older than that age in
// wall-clock time — the time-based staleness policy (issue #131), a
// complement to the version-vector coverage check above. It is
// fail-closed: a component whose verified_at_ns is unknown (NULL — a
// pre-v23 row, or one only ever advanced methodlessly at its recorded
// run) is treated as infinitely stale and refused, since the gate cannot
// show the coverage was recently re-confirmed.
//
// For a peer-asserted component (a durability pull tagged it) the
// verified_at_ns this node holds is when it last pulled a fresh
// assertion from that peer — the peer's own verification instant never
// travels the wire — so the policy bounds how long this node trusts
// relayed evidence without hearing from the peer again, the dead-peer
// defence the issue calls for. A zero maxEvidenceAge disables the policy
// entirely.
func (g *gate) staleEvidenceFailure(ctx context.Context, target string, comp component) string {
if g.maxEvidenceAge <= 0 {
return ""
}
cutoff := g.nowNs - int64(g.maxEvidenceAge)
if !comp.verifiedAt.Valid {
return fmt.Sprintf("%s: stale evidence: never re-verified (max age %s, %s)",
target, g.maxEvidenceAge, g.provenance(ctx, comp))
}
if comp.verifiedAt.Int64 < cutoff {
age := time.Duration(g.nowNs - comp.verifiedAt.Int64)
return fmt.Sprintf("%s: stale evidence: last verified %s ago > max age %s (%s)",
target, age.Round(time.Second), g.maxEvidenceAge, g.provenance(ctx, comp))
}
return ""
}

// freshnessFailure refuses the target when no successful whole-volume
// push covers the run in which the path last became present, closing the
// re-acquisition hole: a path deleted, re-introduced, and re-indexed must
Expand Down
15 changes: 11 additions & 4 deletions offload/offload.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ type Options struct {
// policy is an explicit precondition, there is no default target
// set.
Require []string
// MaxEvidenceAge, when positive, refuses any required target whose
// durability evidence was last re-verified longer ago than this in
// wall-clock time (issue #131). Zero disables the time-based
// staleness policy — the opt-in default, so the version-vector gate
// behaves exactly as before.
MaxEvidenceAge time.Duration
// DryRun evaluates and reports the per-file gate decisions from the
// index alone: no runs row, no file reads, no deletions, no status
// flips. Disk-drift checks only happen on a real run, immediately
Expand Down Expand Up @@ -129,15 +135,16 @@ func Offload(ctx context.Context, s *store.Store, root string, opts Options) (re
if err != nil {
return Report{}, err
}
g, err := loadGate(ctx, s, vol.ID, opts.Require)
now := time.Now()
g, err := loadGate(ctx, s, vol.ID, opts.Require, now.UnixNano(), opts.MaxEvidenceAge)
if err != nil {
return Report{}, err
}
rows, err := s.LoadVolumeIndex(ctx, vol.ID)
if err != nil {
return Report{}, err
}
candidates, misses := selectCandidates(rows, selectors, opts.OlderThan)
candidates, misses := selectCandidates(rows, selectors, opts.OlderThan, now)
report.SelectorMisses = misses

if opts.DryRun {
Expand Down Expand Up @@ -248,11 +255,11 @@ func underReservedSubtree(p string) bool {
// is applied after selector-hit tracking so a selector that only
// matched younger files still counts as matched. Candidates come back
// in path order for deterministic reports.
func selectCandidates(rows map[string]store.FileRow, selectors []string, olderThan time.Duration) ([]store.FileRow, []string) {
func selectCandidates(rows map[string]store.FileRow, selectors []string, olderThan time.Duration, now time.Time) ([]store.FileRow, []string) {
ageFiltered := olderThan > 0
var cutoffNs int64
if ageFiltered {
cutoffNs = time.Now().Add(-olderThan).UnixNano()
cutoffNs = now.Add(-olderThan).UnixNano()
}
hit := make(map[string]bool, len(selectors))
var out []store.FileRow
Expand Down
Loading
Loading