Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/bradfitz/go-tool-cache

go 1.24.0
go 1.25.0

require (
github.com/bradfitz/parentdeath v0.0.0-20260315043412-764506aeb900
Expand All @@ -10,7 +10,7 @@ require (
github.com/pierrec/lz4/v4 v4.1.25
github.com/prometheus/client_golang v1.23.0
github.com/prometheus/client_model v0.6.2
modernc.org/sqlite v1.45.0
modernc.org/sqlite v1.51.0
)

require (
Expand All @@ -24,10 +24,9 @@ require (
github.com/prometheus/common v0.65.0 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect
golang.org/x/sys v0.37.0 // indirect
golang.org/x/sys v0.42.0 // indirect
google.golang.org/protobuf v1.36.6 // indirect
modernc.org/libc v1.67.6 // indirect
modernc.org/libc v1.72.3 // indirect
modernc.org/mathutil v1.7.1 // indirect
modernc.org/memory v1.11.0 // indirect
)
46 changes: 22 additions & 24 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -48,45 +48,43 @@ github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOf
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY=
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70=
golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA=
golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w=
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
modernc.org/ccgo/v4 v4.30.1 h1:4r4U1J6Fhj98NKfSjnPUN7Ze2c6MnAdL0hWw6+LrJpc=
modernc.org/ccgo/v4 v4.30.1/go.mod h1:bIOeI1JL54Utlxn+LwrFyjCx2n2RDiYEaJVSrgdrRfM=
modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA=
modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc=
modernc.org/cc/v4 v4.28.2 h1:3tQ0lf2ADtoby2EtSP+J7IE2SHwEJdP8ioR59wx7XpY=
modernc.org/cc/v4 v4.28.2/go.mod h1:OnovgIhbbMXMu1aISnJ0wvVD1KnW+cAUJkIrAWh+kVI=
modernc.org/ccgo/v4 v4.34.0 h1:yRLPFZieg532OT4rp4JFNIVcquwalMX26G95WQDqwCQ=
modernc.org/ccgo/v4 v4.34.0/go.mod h1:AS5WYMyBakQ+fhsHhtP8mWB82KTGPkNNJDGfGQCe0/A=
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
modernc.org/gc/v3 v3.1.1 h1:k8T3gkXWY9sEiytKhcgyiZ2L0DTyCQ/nvX+LoCljoRE=
modernc.org/gc/v3 v3.1.1/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
modernc.org/libc v1.67.6 h1:eVOQvpModVLKOdT+LvBPjdQqfrZq+pC39BygcT+E7OI=
modernc.org/libc v1.67.6/go.mod h1:JAhxUVlolfYDErnwiqaLvUqc8nfb2r6S6slAgZOnaiE=
modernc.org/libc v1.72.3 h1:ZnDF4tXn4NBXFutMMQC4vtbTFSXhhKzR73fv0beZEAU=
modernc.org/libc v1.72.3/go.mod h1:dn0dZNnnn1clLyvRxLxYExxiKRZIRENOfqQ8XEeg4Qs=
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
modernc.org/opt v0.2.0 h1:tGyef5ApycA7FSEOMraay9SaTk5zmbx7Tu+cJs4QKZg=
modernc.org/opt v0.2.0/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
modernc.org/sqlite v1.45.0 h1:r51cSGzKpbptxnby+EIIz5fop4VuE4qFoVEjNvWoObs=
modernc.org/sqlite v1.45.0/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
modernc.org/sqlite v1.51.0 h1:aH/MMSoayAIhozZ7uJbVTT9QO/VhzBf0J9tymmmuC/U=
modernc.org/sqlite v1.51.0/go.mod h1:tcNzv5p84E0skkmJn038y+hWJbLQXQqEnQfeh5r2JLM=
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
Expand Down
143 changes: 142 additions & 1 deletion gocached/gocached.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,27 @@ const (

const schemaVersion = 4

// Tunables that keep the SQLite write-ahead log (WAL) bounded on disk and
// control how often its size is sampled.
const (
	// walJournalSizeLimit is the value passed to SQLite's journal_size_limit
	// PRAGMA on every connection. It caps how large the WAL file may remain
	// on disk after a successful checkpoint, so that even the built-in
	// PASSIVE autocheckpoint (which would otherwise leave the file at its
	// high-water mark) truncates the WAL down to this size. Without the
	// limit, continuous write traffic can leave the WAL growing without
	// bound and never shrinking, even though its frames are being
	// checkpointed in place.
	walJournalSizeLimit = 1 << 30 // 1 GiB

	// checkpointInterval is the period between the background TRUNCATE
	// checkpoints issued by [Server.runCheckpointLoop]. Autocheckpoints are
	// PASSIVE only: they reuse WAL space in place but never shrink the file
	// on disk below walJournalSizeLimit. The periodic explicit TRUNCATE is
	// what actually keeps the file small in steady state.
	checkpointInterval = time.Minute

	// dbSizeMetricsInterval is the period at which
	// [Server.runDBSizeMetricsLoop] re-stats the SQLite files to refresh the
	// size gauges. It is deliberately shorter than checkpointInterval so the
	// WAL gauge shows growth between checkpoints rather than only the
	// post-truncate value once a minute.
	dbSizeMetricsInterval = 15 * time.Second
)

const schema = `
PRAGMA journal_mode=WAL;
CREATE TABLE IF NOT EXISTS Actions (
Expand Down Expand Up @@ -146,7 +167,9 @@ func openDB(dbDir string) (*sql.DB, error) {
}
}

db, err := sql.Open("sqlite", "file:"+dbPath+"?_pragma=busy_timeout(5000)")
dsn := fmt.Sprintf("file:%s?_pragma=busy_timeout(5000)&_pragma=journal_size_limit(%d)",
dbPath, walJournalSizeLimit)
db, err := sql.Open("sqlite", dsn)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -250,6 +273,18 @@ func (srv *Server) start() error {
}
srv.db = db

// Run a TRUNCATE checkpoint up front, before any other reader can pin a
// snapshot. If the WAL on disk is large (e.g. from a prior version of
// gocached that lacked the periodic checkpointer), this is what actually
// shrinks it.
ckCtx, ckCancel := context.WithTimeout(srv.shutdownCtx, 2*time.Minute)
if busy, log, ckpt, err := srv.checkpointTruncate(ckCtx); err != nil {
srv.logf("startup wal_checkpoint(TRUNCATE) error: %v", err)
} else {
srv.logf("startup wal_checkpoint(TRUNCATE): busy=%d log=%d ckpt=%d", busy, log, ckpt)
}
ckCancel()

reg := prometheus.NewRegistry()
reg.MustRegister(
collectors.NewGoCollector(),
Expand Down Expand Up @@ -293,6 +328,8 @@ func (srv *Server) start() error {
}

go srv.runCleanLoop()
go srv.runCheckpointLoop()
go srv.runDBSizeMetricsLoop()

return nil
}
Expand Down Expand Up @@ -457,6 +494,15 @@ func (srv *Server) Close() error {
err = errors.Join(err, srv.writeConn.Close())
}

// Final TRUNCATE checkpoint so the WAL doesn't linger on disk past
// shutdown. Use context.Background because srv.shutdownCtx has already
// been canceled.
ckCtx, ckCancel := context.WithTimeout(context.Background(), 2*time.Minute)
if _, _, _, ckErr := srv.checkpointTruncate(ckCtx); ckErr != nil {
err = errors.Join(err, fmt.Errorf("final wal_checkpoint: %w", ckErr))
}
ckCancel()

return errors.Join(err, srv.db.Close())
}

Expand Down Expand Up @@ -517,6 +563,9 @@ type Server struct {
Sessions expvar.Int `type:"gauge" name:"sessions" help:"number of active authenticated sessions"`
Auths expvar.Int `type:"counter" name:"auth_attempts" help:"number of successful token exchanges"`
AuthErrs expvar.Int `type:"counter" name:"auth_errs" help:"number of failed token exchanges"`

SQLiteDataBytes expvar.Int `type:"gauge" name:"sqlite_data_bytes" help:"size in bytes of the SQLite main database file on disk"`
SQLiteWALBytes expvar.Int `type:"gauge" name:"sqlite_wal_bytes" help:"size in bytes of the SQLite WAL file on disk; should stay bounded near walJournalSizeLimit"`
}
}

Expand Down Expand Up @@ -1587,6 +1636,98 @@ func (srv *Server) cleanOldObjects(us *usageStats) (countAndSize, error) {
return ret, nil
}

// checkpointTruncate issues PRAGMA wal_checkpoint(TRUNCATE) on srv.db and
// reports the three integer columns of the single result row, in order:
// busy, logFrames and ckptFrames.
//
// A checkpoint that was applied in full yields busy == 0 with
// logFrames == ckptFrames. Any other combination means frames were left
// behind in the WAL, because a concurrent reader is still pinning an older
// snapshot.
//
// ctx bounds the query. A non-nil err is whatever the query or the row scan
// returned, and the three counts should be ignored in that case.
func (srv *Server) checkpointTruncate(ctx context.Context) (busy, logFrames, ckptFrames int, err error) {
	row := srv.db.QueryRowContext(ctx, "PRAGMA wal_checkpoint(TRUNCATE)")
	err = row.Scan(&busy, &logFrames, &ckptFrames)
	return busy, logFrames, ckptFrames, err
}

// runCheckpointLoop issues a TRUNCATE checkpoint every checkpointInterval
// until srv.shutdownCtx is canceled, so that the WAL stays bounded on disk.
// SQLite's autocheckpoint is PASSIVE only: it recycles WAL space in place but
// never shrinks the file, so without this loop the WAL can grow without bound
// under continuous traffic.
//
// It is meant to run in its own goroutine and returns only on shutdown.
func (srv *Server) runCheckpointLoop() {
	ticker := time.NewTicker(checkpointInterval)
	defer ticker.Stop()

	for {
		// Block until either shutdown or the next tick.
		select {
		case <-srv.shutdownCtx.Done():
			return
		case <-ticker.C:
		}

		// Each attempt gets its own deadline, derived from the shutdown
		// context so that shutdown also aborts an in-flight checkpoint.
		ckCtx, ckCancel := context.WithTimeout(srv.shutdownCtx, 2*time.Minute)
		busy, logFrames, ckptFrames, err := srv.checkpointTruncate(ckCtx)
		ckCancel()

		switch {
		case err != nil && errors.Is(err, context.Canceled):
			// Shutdown raced with the checkpoint; exit quietly.
			return
		case err != nil:
			// Skip the gauge refresh below and retry on the next tick.
			srv.logf("wal_checkpoint(TRUNCATE) error: %v", err)
			continue
		case busy != 0 || logFrames != ckptFrames:
			// A reader is pinning frames; the next tick will catch up. This
			// is logged unconditionally because persistent partial
			// checkpoints mean walJournalSizeLimit is the only thing keeping
			// the file bounded, which would warrant investigation.
			srv.logf("wal_checkpoint(TRUNCATE) partial: busy=%d log=%d ckpt=%d", busy, logFrames, ckptFrames)
		case srv.verbose:
			srv.logf("wal_checkpoint(TRUNCATE): log=%d ckpt=%d", logFrames, ckptFrames)
		}

		// Publish the post-truncate sizes right away instead of waiting for
		// the size sampler's next tick.
		srv.updateDBSizeMetrics()
	}
}

// dbPath reports where the SQLite main database file lives on disk: a file
// named after schemaVersion inside srv.dir. The matching WAL file is this
// path with "-wal" appended.
func (srv *Server) dbPath() string {
	fileName := fmt.Sprintf("gocached-v%d.db", schemaVersion)
	return filepath.Join(srv.dir, fileName)
}

// updateDBSizeMetrics stats the SQLite main database file and its WAL file
// and stores their sizes in the SQLiteDataBytes and SQLiteWALBytes gauges.
//
// A WAL file that does not exist (for example on a fresh DB before the first
// write has been flushed) is recorded as zero bytes. Any other stat failure
// is logged and the affected gauge is left untouched, so a transient
// filesystem hiccup keeps the last-known value visible.
func (srv *Server) updateDBSizeMetrics() {
	mainPath := srv.dbPath()
	if info, err := os.Stat(mainPath); err != nil {
		srv.logf("stat %s: %v", mainPath, err)
	} else {
		srv.m.SQLiteDataBytes.Set(info.Size())
	}

	walPath := mainPath + "-wal"
	info, err := os.Stat(walPath)
	if err == nil {
		srv.m.SQLiteWALBytes.Set(info.Size())
		return
	}
	if errors.Is(err, os.ErrNotExist) {
		// No WAL on disk is a normal state, not an error.
		srv.m.SQLiteWALBytes.Set(0)
		return
	}
	srv.logf("stat %s: %v", walPath, err)
}

// runDBSizeMetricsLoop refreshes the SQLite size gauges once immediately and
// then every dbSizeMetricsInterval until srv.shutdownCtx is canceled. It
// samples more often than the checkpoint loop runs so that the WAL gauge
// reflects growth between checkpoints, not only the post-truncate values.
//
// It is meant to run in its own goroutine and returns only on shutdown.
func (srv *Server) runDBSizeMetricsLoop() {
	ticker := time.NewTicker(dbSizeMetricsInterval)
	defer ticker.Stop()

	for {
		// The first pass through the loop provides the startup sample.
		srv.updateDBSizeMetrics()

		select {
		case <-srv.shutdownCtx.Done():
			return
		case <-ticker.C:
		}
	}
}

func (srv *Server) runCleanLoop() {
for {
select {
Expand Down
61 changes: 61 additions & 0 deletions gocached/gocached_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,67 @@ func TestLZ4Storage(t *testing.T) {
}
}

// TestWALCheckpoint verifies that checkpointTruncate shrinks the on-disk WAL
// to zero bytes after write traffic, and that updateDBSizeMetrics reports the
// file sizes both before and after the checkpoint.
func TestWALCheckpoint(t *testing.T) {
	st := newServerTester(t)
	ctx := context.Background()

	walPath := filepath.Join(st.srv.dir, fmt.Sprintf("gocached-v%d.db-wal", schemaVersion))

	// Produce enough writes for the WAL to span a few pages. The precise
	// count is unimportant; it only has to be large enough that "shrank to
	// nearly empty" is a meaningful observation.
	const insertAction = `INSERT INTO Actions (NamespaceID, ActionID, BlobID, AltOutputID, CreateTime, AccessTime) VALUES (0, ?, 0, '', 0, 0)`
	for i := 0; i < 500; i++ {
		actionID := fmt.Sprintf("%032x", i)
		if _, err := st.srv.db.ExecContext(ctx, insertAction, actionID); err != nil {
			t.Fatalf("insert %d: %v", i, err)
		}
	}

	walBefore, err := os.Stat(walPath)
	if err != nil {
		t.Fatalf("stat WAL: %v", err)
	}
	sizeBefore := walBefore.Size()
	t.Logf("WAL before checkpoint: %d bytes", sizeBefore)
	if sizeBefore < 4096 {
		// 500 row inserts should leave at least one full WAL page behind,
		// even after the periodic autocheckpoint reuses space in place. A
		// tiny WAL here means the truncation path is not being exercised.
		t.Fatalf("WAL only %d bytes before checkpoint; expected meaningful traffic", sizeBefore)
	}

	// The gauges must mirror what is on disk before the checkpoint.
	st.srv.updateDBSizeMetrics()
	if walGauge := st.srv.m.SQLiteWALBytes.Value(); walGauge != sizeBefore {
		t.Errorf("sqlite_wal_bytes gauge before checkpoint: got %d, want %d", walGauge, sizeBefore)
	}
	if dataGauge := st.srv.m.SQLiteDataBytes.Value(); dataGauge <= 0 {
		t.Errorf("sqlite_data_bytes gauge: got %d, want > 0", dataGauge)
	}

	busy, logFrames, ckptFrames, err := st.srv.checkpointTruncate(ctx)
	if err != nil {
		t.Fatalf("checkpointTruncate: %v", err)
	}
	if fullyApplied := busy == 0 && logFrames == ckptFrames; !fullyApplied {
		t.Errorf("checkpoint not fully applied: busy=%d log=%d ckpt=%d", busy, logFrames, ckptFrames)
	}

	walAfter, err := os.Stat(walPath)
	if err != nil {
		t.Fatalf("stat WAL after checkpoint: %v", err)
	}
	sizeAfter := walAfter.Size()
	t.Logf("WAL after checkpoint: %d bytes", sizeAfter)
	// With no concurrent readers, a TRUNCATE checkpoint leaves the WAL empty.
	if sizeAfter != 0 {
		t.Errorf("WAL not truncated: got %d bytes, want 0", sizeAfter)
	}

	// The WAL gauge must follow the file back down to zero.
	st.srv.updateDBSizeMetrics()
	if walGauge := st.srv.m.SQLiteWALBytes.Value(); walGauge != 0 {
		t.Errorf("sqlite_wal_bytes gauge after checkpoint: got %d, want 0", walGauge)
	}
}

func TestClientConnReuse(t *testing.T) {
st := newServerTester(t)

Expand Down
Loading