From 0e689a2387a9dc854fc240788029b4be21bc280c Mon Sep 17 00:00:00 2001 From: tongjichao Date: Tue, 19 May 2026 18:37:01 +0800 Subject: [PATCH 1/2] fix(file): self-heal MinIO bucket policy on every process start (#77) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ensureBucket 在 BucketExists=true 时不再早 return,仍会 SetBucketPolicy - 加 bucketReady sync.Map 做进程内 memo,热上传路径无额外 round-trip - 集成测试 TestEnsureBucket_SelfHealsPolicyOnExistingBucket 覆盖 #77 复现 deployer 用 minio-init / mc mb 预创建 bucket 时,旧版 ensureBucket 跳过 SetBucketPolicy,导致 group 这类 bucket 永远无匿名读策略,群头像 403。 现在首次 ensureBucket 调用一定会落 policy;后续调用通过 bucketReady memo 短路,进程重启即自动修复。 OSS 后端 (modules/file/service_oss.go:52) 是同形 bug 但本次 out-of-scope, 另开 #86 跟踪。 --- modules/file/service_minio.go | 67 +++++++++++++++---- .../file/service_minio_integration_test.go | 63 +++++++++++++++++ 2 files changed, 116 insertions(+), 14 deletions(-) diff --git a/modules/file/service_minio.go b/modules/file/service_minio.go index 42993773..0ff6514a 100644 --- a/modules/file/service_minio.go +++ b/modules/file/service_minio.go @@ -62,6 +62,17 @@ type ServiceMinio struct { // itself is never deleted from — bucket count is bounded by the // allow-list, so growth is O(allowed buckets). bucketLocks sync.Map + + // bucketReady records buckets whose policy has been successfully + // applied within this process lifetime. ensureBucket consults it + // inside `bucketLocks` so the first caller after process start runs + // the full BucketExists / MakeBucket / SetBucketPolicy sequence and + // every subsequent caller short-circuits. Together with bucketLocks + // this means: one bootstrap round-trip per bucket per process, + // regardless of upload concurrency. The set never shrinks — drift + // recovery only happens on process restart, which matches the + // `minio-init`-pre-creates-bucket failure mode that motivated #77. + bucketReady sync.Map } // NewServiceMinio NewServiceMinio @@ -82,40 +93,68 @@ func NewServiceMinio(ctx *config.Context) *ServiceMinio { // race the policy update. The `BucketAlreadyOwnedByYou` S3 response is // swallowed as a benign no-op for the case where another process (or another // node sharing these credentials) won the create race. +// +// The full bootstrap (BucketExists + optional MakeBucket + SetBucketPolicy) +// runs once per bucket per process. Deployers frequently pre-provision +// buckets via `mc mb` / `minio-init` containers before this process starts; +// the original implementation skipped SetBucketPolicy when BucketExists +// returned true, so those pre-existing buckets stayed policy-less and any +// browser-direct GET (e.g. group avatars) returned 403 (#77). The fix is to +// apply the policy on the first ensureBucket call of each bucket regardless +// of who created it, then memoize via `bucketReady` so the hot upload path +// is not paying an extra HTTP round-trip per request. The memo is process- +// local: a restart re-runs the bootstrap, which is exactly the self-healing +// behaviour wanted for the minio-init-pre-creates-bucket failure mode. func (sm *ServiceMinio) ensureBucket(ctx context.Context, client *minio.Client, bucket string) error { + if _, ok := sm.bucketReady.Load(bucket); ok { + return nil + } + mtxIface, _ := sm.bucketLocks.LoadOrStore(bucket, &sync.Mutex{}) mtx := mtxIface.(*sync.Mutex) mtx.Lock() defer mtx.Unlock() + // Re-check inside the lock: a parallel cold-start caller may have + // finished the bootstrap while we were waiting on the mutex. + if _, ok := sm.bucketReady.Load(bucket); ok { + return nil + } + exists, err := client.BucketExists(ctx, bucket) if err != nil { sm.Error(fmt.Sprintf("检测 %s目录是否存在错误", bucket), zap.Error(err)) return err } - if exists { - return nil - } - if err := client.MakeBucket(ctx, bucket, minio.MakeBucketOptions{Region: minioDefaultRegion}); err != nil { - // Another caller (different process / different node sharing the - // same credentials) may have created the bucket between our - // BucketExists call and our MakeBucket call. Treat that specific - // S3 response as a no-op rather than a hard failure. - if minio.ToErrorResponse(err).Code == minioBucketAlreadyOwnedByYou { - sm.Info("bucket already owned by us, skipping create", zap.String("bucket", bucket)) - } else { - sm.Error(fmt.Sprintf("创建 %s目录失败", bucket), zap.Error(err)) - return err + if !exists { + if err := client.MakeBucket(ctx, bucket, minio.MakeBucketOptions{Region: minioDefaultRegion}); err != nil { + // Another caller (different process / different node sharing + // the same credentials) may have created the bucket between + // our BucketExists call and our MakeBucket call. Treat that + // specific S3 response as a no-op rather than a hard failure + // and fall through to SetBucketPolicy — the race winner may + // not have applied the policy yet, and a redundant Set is + // cheap. + if minio.ToErrorResponse(err).Code == minioBucketAlreadyOwnedByYou { + sm.Info("bucket already owned by us, skipping create", zap.String("bucket", bucket)) + } else { + sm.Error(fmt.Sprintf("创建 %s目录失败", bucket), zap.Error(err)) + return err + } } } // Read-only public policy: allow anonymous download only. Upload and - // delete go through authenticated server-side credentials. + // delete go through authenticated server-side credentials. Applied + // whether the bucket was just created or pre-existed — see function + // doc for the self-healing rationale. if err := client.SetBucketPolicy(ctx, bucket, fmt.Sprintf(readOnlyAnonymousPolicy, bucket)); err != nil { sm.Error("设置minio文件读写权限错误", zap.Error(err)) return err } + + sm.bucketReady.Store(bucket, struct{}{}) return nil } diff --git a/modules/file/service_minio_integration_test.go b/modules/file/service_minio_integration_test.go index 287a8a10..cee37b2d 100644 --- a/modules/file/service_minio_integration_test.go +++ b/modules/file/service_minio_integration_test.go @@ -336,3 +336,66 @@ func TestPresignedPutURL_ConcurrentBucketBootstrap(t *testing.T) { assert.Equal(t, int32(1), policyCount.Load(), "SetBucketPolicy should run exactly once for a fresh shared bucket; ran %d times", policyCount.Load()) } + +// TestEnsureBucket_SelfHealsPolicyOnExistingBucket reproduces issue #77: when +// a bucket already exists at startup (typically because a `minio-init` +// container or `mc mb` ran before octo-server), ensureBucket must still apply +// the read-only anonymous policy. The old behaviour returned early on +// `exists=true` and never called SetBucketPolicy, leaving pre-provisioned +// buckets policy-less — anonymous GETs against them then returned 403, which +// breaks any browser-direct asset (e.g. group avatars in the `group` bucket). +// +// The fake MinIO server returns 200 on HEAD (bucket already exists) and +// records every PUT request. After triggering ensureBucket via PresignedPutURL +// we assert: +// - MakeBucket was NOT called (no PUT without `policy` query key) +// - SetBucketPolicy WAS called at least once (PUT with `policy` query key) +// +// Before the fix this test fails (policyCount == 0). After the fix it passes +// and the bootstrap is self-healing on every process start. +func TestEnsureBucket_SelfHealsPolicyOnExistingBucket(t *testing.T) { + var ( + makeCount atomic.Int32 + policyCount atomic.Int32 + ) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.Method { + case http.MethodHead: + // Bucket already exists from the deployer's perspective. + w.WriteHeader(http.StatusOK) + case http.MethodPut: + if _, ok := r.URL.Query()["policy"]; ok { + policyCount.Add(1) + } else { + makeCount.Add(1) + } + w.WriteHeader(http.StatusOK) + default: + w.WriteHeader(http.StatusOK) + } + })) + t.Cleanup(srv.Close) + + cfg := config.New() + cfg.Test = true + cfg.Minio.URL = srv.URL + cfg.Minio.UploadURL = srv.URL + cfg.Minio.DownloadURL = "https://public.example.com" + cfg.Minio.AccessKeyID = "test-access-key" + cfg.Minio.SecretAccessKey = "test-secret-access-key-1234567890" + + ctx := testutil.NewTestContext(cfg) + svc := file.NewServiceMinio(ctx) + + // PresignedPutURL invokes ensureBucket on the resolved bucket. Use the + // `group` prefix specifically — that is the bucket #77 reported as + // broken on real deployments (group avatars served from /group/...). + _, _, err := svc.PresignedPutURL("group/avatar/abc.png", "image/png", "", 1024, time.Minute) + require.NoError(t, err) + + assert.Equal(t, int32(0), makeCount.Load(), + "MakeBucket must not run when the bucket already exists; ran %d times", makeCount.Load()) + assert.GreaterOrEqual(t, policyCount.Load(), int32(1), + "SetBucketPolicy must run on every ensureBucket call to self-heal pre-provisioned buckets; ran %d times", policyCount.Load()) +} From cfb20a193150153eb7a59fc1bb15866aa1f0ffbf Mon Sep 17 00:00:00 2001 From: tongjichao Date: Wed, 20 May 2026 10:40:30 +0800 Subject: [PATCH 2/2] fix(file): bootstrap MinIO bucket policies at startup (#77) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - NewServiceMinio 启动 goroutine 对 allowedMinioBuckets 全集跑 ensureBucket - 30s 总 timeout 防 minio 长时间不可达时 goroutine 永生 - 失败 WARN 不阻塞启动,下次 upload 会通过 ensureBucket 重试 - cfg.Test 跳过以保留现有测试 fake-server 计数的确定性 - 新测试 TestNewServiceMinio_BootstrapsAllAllowedBucketsAtStartup 验证 read-only / never-uploaded 场景(#77 真实复现路径) 回应 Jerry-Xin (PR #87) "self-heal on process start" 与实际行为 不符的 review 反馈:先前的 ensureBucket 修复只在 upload 路径上跑, read-only group avatar 重启后仍 403;现在 startup 就把 policy 落齐。 --- modules/file/service_minio.go | 41 ++++++++- .../file/service_minio_integration_test.go | 86 ++++++++++++++++++- 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/modules/file/service_minio.go b/modules/file/service_minio.go index 0ff6514a..ed0f16a1 100644 --- a/modules/file/service_minio.go +++ b/modules/file/service_minio.go @@ -77,13 +77,52 @@ type ServiceMinio struct { // NewServiceMinio NewServiceMinio func NewServiceMinio(ctx *config.Context) *ServiceMinio { - return &ServiceMinio{ + sm := &ServiceMinio{ Log: log.NewTLog("File"), ctx: ctx, downloadClient: &http.Client{ Timeout: time.Second * 30, }, } + // Self-heal MinIO bucket policy on every process start (#77). Without + // this proactive sweep, a pre-provisioned bucket whose policy was never + // applied (e.g. `minio-init` excluded `group` from its anonymous-read + // loop) only gets repaired the next time someone happens to upload to + // it — read-only buckets (group avatars exist but no fresh writes + // happen) stay broken across restarts. The sweep is async + best + // effort: ensureBucket failures are logged but don't block startup, + // and the upload path retries via the same ensureBucket on demand. + // Skipped under cfg.Test=true to keep unit/integration tests + // deterministic — they exercise ensureBucket explicitly via the upload + // entry points. + if !ctx.GetConfig().Test { + go sm.bootstrapAllowedBuckets() + } + return sm +} + +// bootstrapAllowedBuckets runs `ensureBucket` against every bucket in the +// allow-list once at process start. Errors are logged at WARN — the upload +// path will retry on demand via the same ensureBucket call, so a transient +// MinIO outage during boot does not require a restart to recover. A 30s +// total budget keeps the goroutine from leaking if MinIO is unreachable +// indefinitely (10 buckets × ~3s per BucketExists is generous; later +// buckets short-circuit on the per-process memo if earlier ones succeed +// after a slow start). +func (sm *ServiceMinio) bootstrapAllowedBuckets() { + client, err := sm.newClient() + if err != nil { + sm.Warn("MinIO startup bootstrap: 创建 client 失败,跳过预扫", zap.Error(err)) + return + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + for bucket := range allowedMinioBuckets { + if err := sm.ensureBucket(ctx, client, bucket); err != nil { + sm.Warn("MinIO startup bootstrap: bucket 初始化失败,下次上传时会重试", + zap.String("bucket", bucket), zap.Error(err)) + } + } } // ensureBucket guarantees that `bucket` exists on the MinIO server and has diff --git a/modules/file/service_minio_integration_test.go b/modules/file/service_minio_integration_test.go index cee37b2d..5b09ef99 100644 --- a/modules/file/service_minio_integration_test.go +++ b/modules/file/service_minio_integration_test.go @@ -19,8 +19,8 @@ import ( // newFakeMinioServer returns an httptest.Server that answers just enough of // the MinIO HTTP surface to let `ensureBucket` succeed: // -// - HEAD // → 200 (BucketExists returns true, skipping MakeBucket -// and SetBucketPolicy entirely) +// - HEAD // → 200 (BucketExists returns true; ensureBucket then +// reapplies SetBucketPolicy unconditionally per the #77 self-heal fix) // - everything else → 200 with empty body, so the test never panics on an // unexpected request shape // @@ -399,3 +399,85 @@ func TestEnsureBucket_SelfHealsPolicyOnExistingBucket(t *testing.T) { assert.GreaterOrEqual(t, policyCount.Load(), int32(1), "SetBucketPolicy must run on every ensureBucket call to self-heal pre-provisioned buckets; ran %d times", policyCount.Load()) } + +// TestNewServiceMinio_BootstrapsAllAllowedBucketsAtStartup is the proof that +// `NewServiceMinio` actually performs the policy repair *at process start*, +// not just on the first upload (PR#87 review feedback from Jerry-Xin). +// +// The earlier #77 fix made ensureBucket reapply the policy whenever it ran, +// but ensureBucket only runs from UploadFile / PresignedPutURL. A bucket +// that is read-only after seed (e.g. `group` avatars uploaded once at group +// creation and never written to again) would never see a repair across +// restarts. This test pins the new behaviour: NewServiceMinio kicks off an +// async bootstrap goroutine that walks the full `allowedMinioBuckets` set +// and runs ensureBucket against each one. +// +// Setup: fake MinIO answers HEAD with 200 (every bucket pre-exists) and +// records every PUT. After constructing the service we poll until policy +// count reaches the expected total (= 10 = len(allowedMinioBuckets), kept +// as a literal because the var is unexported; helpers.go is the source of +// truth — bump this number if the allow-list changes). MakeBucket must +// stay at zero because BucketExists returned true for all of them. +// +// cfg.Test must be false so the bootstrap goroutine actually runs; the +// other integration tests in this file set cfg.Test=true precisely to +// suppress this side effect and keep their counters deterministic. +func TestNewServiceMinio_BootstrapsAllAllowedBucketsAtStartup(t *testing.T) { + var ( + headCount atomic.Int32 + makeCount atomic.Int32 + policyCount atomic.Int32 + ) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.Method { + case http.MethodHead: + headCount.Add(1) + w.WriteHeader(http.StatusOK) + case http.MethodPut: + if _, ok := r.URL.Query()["policy"]; ok { + policyCount.Add(1) + } else { + makeCount.Add(1) + } + w.WriteHeader(http.StatusOK) + default: + w.WriteHeader(http.StatusOK) + } + })) + t.Cleanup(srv.Close) + + cfg := config.New() + // cfg.Test left as the zero value (false) so NewServiceMinio runs the + // bootstrap goroutine. The other tests in this file pin it true. + cfg.Minio.URL = srv.URL + cfg.Minio.UploadURL = srv.URL + cfg.Minio.DownloadURL = "https://public.example.com" + cfg.Minio.AccessKeyID = "test-access-key" + cfg.Minio.SecretAccessKey = "test-secret-access-key-1234567890" + + ctx := testutil.NewTestContext(cfg) + // testutil.NewTestContext force-sets cfg.Test=true, which by design + // short-circuits the bootstrap goroutine. Flip it back so this test + // can observe the production code path. + ctx.GetConfig().Test = false + _ = file.NewServiceMinio(ctx) + + // Bump this if `allowedMinioBuckets` in helpers.go grows or shrinks. + const wantBuckets int32 = 10 + + deadline := time.Now().Add(5 * time.Second) + for time.Now().Before(deadline) { + if policyCount.Load() >= wantBuckets { + break + } + time.Sleep(20 * time.Millisecond) + } + + assert.Equal(t, wantBuckets, policyCount.Load(), + "startup bootstrap must run SetBucketPolicy once per allowed bucket; got %d, want %d", policyCount.Load(), wantBuckets) + assert.Equal(t, int32(0), makeCount.Load(), + "startup bootstrap must not call MakeBucket when buckets already exist; got %d", makeCount.Load()) + assert.GreaterOrEqual(t, headCount.Load(), wantBuckets, + "startup bootstrap must HEAD each bucket; got %d, want >= %d", headCount.Load(), wantBuckets) +}