From 90af15a349ec8fbcda3fd53e62ffc63ab4834424 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 19:40:07 +0800 Subject: [PATCH 01/15] refactor: drop hardcoded violet package prefix (PR 1 review feedback) The MinIO root for violet packages varies across environments (private vs shared, regional mirrors), so the CLI should not pretend any value is "the default". Make Violet.PackagePrefix a required field; empty value surfaces as a clear "packagePrefix is empty" error at BuildPackageURL time (already covered in PR 1 tests). --- pkg/config/config.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 96c131b..aaa80f5 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -65,7 +65,10 @@ type VioletConfig struct { Bin string `yaml:"bin,omitempty"` // PackagePrefix is the MinIO (or HTTP) root from which the per-version - // .tgz is downloaded. Default: "http://package-minio.alauda.cn:9199/packages/". + // .tgz is downloaded. Required when Violet is configured — the prefix + // varies across environments (private vs shared MinIO, regional mirrors) + // so the CLI deliberately refuses to hardcode a default. Empty prefix + // surfaces as a "packagePrefix is empty" error at URL build time. PackagePrefix string `yaml:"packagePrefix,omitempty"` // SkipPush controls whether `--skip-push` is passed to `violet push`. @@ -106,10 +109,6 @@ type Version struct { ExpectedSha256 string `yaml:"expectedSha256,omitempty"` } -// DefaultVioletPackagePrefix is the MinIO root used when OperatorConfig.Violet -// is configured but its PackagePrefix is left blank. -const DefaultVioletPackagePrefix = "http://package-minio.alauda.cn:9199/packages/" - // LoadConfig loads the configuration from a YAML file func LoadConfig(path string) (*Config, error) { data, err := os.ReadFile(path) @@ -146,9 +145,6 @@ func defaultConfig(config *Config) *Config { } if v := config.OperatorConfig.Violet; v != nil { - if v.PackagePrefix == "" { - v.PackagePrefix = DefaultVioletPackagePrefix - } if v.SkipPush == nil { t := true v.SkipPush = &t From 84d06fd16bb17f9b9dbb9475c05ba9b2b9f65cb2 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 19:46:35 +0800 Subject: [PATCH 02/15] feat(operatorhub): install ArtifactVersion via violet binary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Swaps the InstallArtifactVersion entry point from the in-process dynamic client write path to a child-process invocation of the `violet push` binary. Artifact / ArtifactVersion CR creation is now owned by violet; OLM Subscription, InstallPlan and CSV flows remain in Go and are unchanged. The new flow lives in installViaViolet (pkg/operator/operatorhub/violet.go) and runs eight stages, each with a labelled error prefix so failures are easy to locate: 1. get Artifact CR (must already exist, managed out of band) 2. build the MinIO download URL via BuildPackageURL 3. download .tgz to a per-call temp directory (defer cleanup) 4. verify Version.ExpectedSha256 when non-empty (no-op otherwise) 5. ensure clean ArtifactVersion: delete same-name residue + poll NotFound, so waitArtifactVersionPresent cannot match a stale object from a prior failed upgrade and report false success 6. exec `violet push ` with a minimal-permission child env (KUBECONFIG / PATH / HOME / USER / VIOLET_*); credentials come from VIOLET_REGISTRY_USERNAME / _PASSWORD via BuildVioletPushArgs; the rendered command is mask-logged before exec to keep --password values out of logs 7. wait ArtifactVersion phase=Present (reuses existing wait helper) and cross-check that spec.tag equals the requested BundleVersion — if violet ever changes its naming convention this surfaces loudly rather than silently accepting an unrelated AV 8. wait PackageManifest exposes the CSV (reuses existing wait helper) Supporting changes: - Operator gains a `violet *config.VioletConfig` field, populated by NewOperator from OperatorConfig.Violet. InstallArtifactVersion returns a clear config error when this field is nil. - UpgradeOperator now passes the whole config.Version (not just BundleVersion) into InstallArtifactVersion — channel and ExpectedSha256 are needed for the violet flow. - validateVioletBin requires Violet.Bin (when set) to be an absolute path to an executable regular file, guarding against accidental execution of a user-supplied directory or non-executable path. Empty Bin falls back to looking up `violet` in $PATH. - createArtifactVersion (the in-process write path) is preserved as dead code with a Deprecated comment; PR 3 removes it. OQ1 outcome: the local violet binary does not support --password-stdin or file-based credential reads, so argv passing is the only available path. Mitigations are README documentation (shared-runner warning) and MaskCommand log redaction; OS-level inspection (`ps auxe`) of the credential remains an accepted risk for the first release. Issue tracked in plan §Security. OQ2 outcome: deferred to integration validation. The cross-check on spec.tag (step 7) converts a silent failure into a loud one if violet uses a different AV naming convention. OQ3 outcome: deferred to integration validation. Tekton task image must ship violet; verify before merging the PR. Plan: docs/plans/2026-05-18-refactor-externalize-violet-binary-plan.md --- pkg/operator/operatorhub/artifact_versiong.go | 48 ++-- pkg/operator/operatorhub/operator.go | 8 +- pkg/operator/operatorhub/violet.go | 231 ++++++++++++++++++ 3 files changed, 252 insertions(+), 35 deletions(-) diff --git a/pkg/operator/operatorhub/artifact_versiong.go b/pkg/operator/operatorhub/artifact_versiong.go index 9adac7e..e9456ea 100644 --- a/pkg/operator/operatorhub/artifact_versiong.go +++ b/pkg/operator/operatorhub/artifact_versiong.go @@ -5,49 +5,29 @@ import ( "fmt" "strings" + "github.com/AlaudaDevops/upgrade-test/pkg/config" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/wait" - "knative.dev/pkg/logging" ) -func (o *Operator) InstallArtifactVersion(ctx context.Context, version string) (*unstructured.Unstructured, error) { - log := logging.FromContext(ctx) - log.Infow("installing artifact version", "version", version) - - artifact, err := o.GetResource(ctx, o.artifact, systemNamespace, artifactGVR) - if err != nil { - return nil, fmt.Errorf("failed to get artifact: %v", err) - } - - log.Infow("creating artifact version", "name", artifact.GetName()) - av, err := o.createArtifactVersion(ctx, version, artifact) - if err != nil { - return nil, fmt.Errorf("failed to create artifact version: %v", err) - } - - log.Infow("waiting for artifact version to be present", "name", av.GetName()) - av, err = o.waitArtifactVersionPresent(ctx, av.GetName()) - if err != nil { - return nil, fmt.Errorf("failed to wait for artifact version: %v", err) - } - - csv, found, _ := unstructured.NestedString(av.Object, "status", "version") - if !found || csv == "" { - return nil, fmt.Errorf("failed to get CSV version from artifact version %s: status.version field is empty or not found", av.GetName()) +// InstallArtifactVersion is the stable entry point invoked by UpgradeOperator. +// It needs the whole Version because the channel and (optional) sha256 are +// required by the violet onboarding flow, not just the bundle version. +func (o *Operator) InstallArtifactVersion(ctx context.Context, version config.Version) (*unstructured.Unstructured, error) { + if o.violet == nil { + return nil, fmt.Errorf("operatorConfig.violet must be configured to install ArtifactVersion via the violet binary") } - - log.Infow("waiting for package manifest", "csv", csv) - err = o.waitPackageManifest(ctx, csv) - if err != nil { - return nil, fmt.Errorf("failed to ensure package manifest: %v", err) - } - - log.Infow("artifact version installed successfully", "name", av.GetName()) - return av, nil + return o.installViaViolet(ctx, version) } +// createArtifactVersion is no longer reachable from InstallArtifactVersion as +// of PR 2 — the violet binary owns the write path. Kept here as dead code for +// one release cycle so PR 2 regressions can be diff-compared against the +// previous in-process implementation. PR 3 deletes this function. +// +// Deprecated: use installViaViolet via InstallArtifactVersion. func (o *Operator) createArtifactVersion(ctx context.Context, version string, artifact *unstructured.Unstructured) (*unstructured.Unstructured, error) { avName := fmt.Sprintf("%s.%s", artifact.GetName(), version) av, err := o.GetResource(ctx, avName, systemNamespace, artifactVersionGVR) diff --git a/pkg/operator/operatorhub/operator.go b/pkg/operator/operatorhub/operator.go index 1d32818..b340a11 100644 --- a/pkg/operator/operatorhub/operator.go +++ b/pkg/operator/operatorhub/operator.go @@ -22,6 +22,11 @@ type Operator struct { timeout time.Duration interval time.Duration + + // violet captures how to invoke the external `violet` binary for + // Artifact/ArtifactVersion creation. nil disables the violet path; + // in that case InstallArtifactVersion returns a config error. + violet *config.VioletConfig } const ( @@ -93,6 +98,7 @@ func NewOperator(config *rest.Config, options config.OperatorConfig) (*Operator, artifact: artifact, timeout: options.Timeout, interval: options.Interval, + violet: options.Violet, }, nil } @@ -102,7 +108,7 @@ func (o *Operator) GetResource(ctx context.Context, name, namespace string, gvr func (o *Operator) UpgradeOperator(ctx context.Context, version config.Version) error { // Install artifact version - av, err := o.InstallArtifactVersion(ctx, version.BundleVersion) + av, err := o.InstallArtifactVersion(ctx, version) if err != nil { return fmt.Errorf("failed to prepare operator: %v", err) } diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index 36f1e17..f969545 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -1,12 +1,23 @@ package operatorhub import ( + "context" "crypto/sha256" "encoding/hex" "fmt" "io" + "net/http" "os" + "path/filepath" "strings" + + "github.com/AlaudaDevops/upgrade-test/pkg/config" + "github.com/AlaudaDevops/upgrade-test/pkg/exec" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/util/wait" + "knative.dev/pkg/logging" ) // Environment variables consumed when assembling `violet push`. They are @@ -114,3 +125,223 @@ func VerifySha256(filePath, expected string) error { } return nil } + +// installViaViolet onboards the bundle for `version` by invoking the external +// `violet push` binary, then waits for the resulting ArtifactVersion CR to +// reach phase=Present and for the OLM PackageManifest to expose the matching +// CSV. The upstream Artifact CR is expected to already exist (managed out of +// band); only the ArtifactVersion write path is delegated to violet. +// +// Steps: 1) get Artifact, 2) build URL, 3) download .tgz, 4) verify sha256, +// 5) delete any residue AV with the same name (so wait does not match it), +// 6) exec violet push with allowlist env, 7) wait AV Present, 8) wait +// PackageManifest. Each stage prefixes its error with a short label so the +// caller can locate the failure without re-running. +func (o *Operator) installViaViolet(ctx context.Context, version config.Version) (*unstructured.Unstructured, error) { + log := logging.FromContext(ctx) + log.Infow("installing artifact version via violet", + "bundleVersion", version.BundleVersion, "channel", version.Channel) + + artifact, err := o.GetResource(ctx, o.artifact, systemNamespace, artifactGVR) + if err != nil { + return nil, fmt.Errorf("get artifact %s: %w", o.artifact, err) + } + + url, err := BuildPackageURL(o.violet.PackagePrefix, o.name, version.Channel, version.BundleVersion) + if err != nil { + return nil, fmt.Errorf("build package url: %w", err) + } + log.Infow("downloading violet package", "url", url) + + tgzPath, cleanup, err := downloadToTemp(ctx, url) + if err != nil { + return nil, fmt.Errorf("download %s: %w", url, err) + } + defer cleanup() + + if err := VerifySha256(tgzPath, version.ExpectedSha256); err != nil { + return nil, fmt.Errorf("verify sha256: %w", err) + } + + avName := fmt.Sprintf("%s.%s", artifact.GetName(), version.BundleVersion) + if err := o.deleteArtifactVersionIfExists(ctx, avName); err != nil { + return nil, fmt.Errorf("ensure clean AV %s: %w", avName, err) + } + + if err := o.execVioletPush(ctx, tgzPath); err != nil { + return nil, fmt.Errorf("violet push: %w", err) + } + + log.Infow("waiting for ArtifactVersion to reach Present", "name", avName) + av, err := o.waitArtifactVersionPresent(ctx, avName) + if err != nil { + return nil, fmt.Errorf("wait artifact version present: %w", err) + } + + // Cross-check: violet might use a naming convention we did not expect. If + // the CR we waited on has a different spec.tag than the requested + // BundleVersion, surface it loudly rather than silently accepting a stale + // or unrelated AV. + if tag, _, _ := unstructured.NestedString(av.Object, "spec", "tag"); tag != version.BundleVersion { + return nil, fmt.Errorf("artifact version %s has spec.tag=%q, expected %q (violet may not follow the . naming convention)", + avName, tag, version.BundleVersion) + } + + csv, found, _ := unstructured.NestedString(av.Object, "status", "version") + if !found || csv == "" { + return nil, fmt.Errorf("artifact version %s status.version is empty", avName) + } + log.Infow("waiting for package manifest", "csv", csv) + if err := o.waitPackageManifest(ctx, csv); err != nil { + return nil, fmt.Errorf("wait package manifest for csv %s: %w", csv, err) + } + + log.Infow("artifact version installed via violet", "name", av.GetName()) + return av, nil +} + +// downloadToTemp fetches url into a fresh temporary directory and returns the +// local file path plus a cleanup function. The caller MUST invoke cleanup +// (typically via defer) once the file is no longer needed. The temp dir is +// per-call so concurrent installs cannot race on the same path. +func downloadToTemp(ctx context.Context, url string) (string, func(), error) { + dir, err := os.MkdirTemp("", "upgrade-violet-*") + if err != nil { + return "", nil, fmt.Errorf("mkdir temp: %w", err) + } + cleanup := func() { _ = os.RemoveAll(dir) } + + fileName := filepath.Base(url) + if fileName == "" || fileName == "." || fileName == "/" { + fileName = "package.tgz" + } + filePath := filepath.Join(dir, fileName) + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + cleanup() + return "", nil, fmt.Errorf("build request: %w", err) + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + cleanup() + return "", nil, fmt.Errorf("http get: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + cleanup() + return "", nil, fmt.Errorf("unexpected status %d from %s", resp.StatusCode, url) + } + + f, err := os.Create(filePath) + if err != nil { + cleanup() + return "", nil, fmt.Errorf("create %s: %w", filePath, err) + } + defer f.Close() + + if _, err := io.Copy(f, resp.Body); err != nil { + cleanup() + return "", nil, fmt.Errorf("copy body to %s: %w", filePath, err) + } + + return filePath, cleanup, nil +} + +// deleteArtifactVersionIfExists removes a stale ArtifactVersion with the same +// name and polls until the API confirms the deletion. This prevents +// waitArtifactVersionPresent from instantly matching a residue object from a +// previous upgrade attempt and reporting a false success. +func (o *Operator) deleteArtifactVersionIfExists(ctx context.Context, name string) error { + log := logging.FromContext(ctx) + + existing, err := o.GetResource(ctx, name, systemNamespace, artifactVersionGVR) + if err != nil { + if errors.IsNotFound(err) { + return nil + } + return fmt.Errorf("get existing AV: %w", err) + } + + log.Infow("deleting residue ArtifactVersion before invoking violet", + "name", existing.GetName(), "uid", existing.GetUID()) + + if err := o.client.Resource(artifactVersionGVR).Namespace(systemNamespace). + Delete(ctx, name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("delete: %w", err) + } + + return wait.PollUntilContextTimeout(ctx, o.interval, o.timeout, true, func(ctx context.Context) (bool, error) { + _, err := o.client.Resource(artifactVersionGVR).Namespace(systemNamespace). + Get(ctx, name, metav1.GetOptions{}) + if errors.IsNotFound(err) { + return true, nil + } + return false, err + }) +} + +// violetEnvAllowlist constrains which host environment variables flow into +// the violet child process. KUBECONFIG / PATH / HOME / USER are operational +// essentials; VIOLET_* covers VIOLET_REGISTRY_USERNAME / _PASSWORD and any +// future violet-specific overrides without leaking unrelated CI secrets such +// as GITHUB_TOKEN or AWS_*. +var violetEnvAllowlist = []string{ + "KUBECONFIG", + "PATH", + "HOME", + "USER", + "VIOLET_*", +} + +// execVioletPush runs `violet push ` as a child process. Credentials +// from VIOLET_REGISTRY_USERNAME / _PASSWORD are auto-injected into argv by +// BuildVioletPushArgs; the rendered command is mask-logged before exec. +func (o *Operator) execVioletPush(ctx context.Context, tgzPath string) error { + log := logging.FromContext(ctx) + + bin := o.violet.Bin + if bin == "" { + bin = "violet" + } else if err := validateVioletBin(bin); err != nil { + return err + } + + skipPush := true + if o.violet.SkipPush != nil { + skipPush = *o.violet.SkipPush + } + args := BuildVioletPushArgs(tgzPath, skipPush, o.violet.PushArgs) + + log.Infow("invoking violet", "cmd", MaskCommand(bin, args)) + + result := exec.RunCommand(ctx, exec.Command{ + Name: bin, + Args: args, + EnvAllowlist: violetEnvAllowlist, + }) + return result.Err +} + +// validateVioletBin guards against accidentally executing a binary at an +// arbitrary user-supplied path. When Violet.Bin is non-empty it must be an +// absolute path to an executable regular file; otherwise return a typed +// error so the upgrade fails before any network or cluster action. +func validateVioletBin(path string) error { + if !filepath.IsAbs(path) { + return fmt.Errorf("violet.bin must be an absolute path, got %q", path) + } + fi, err := os.Stat(path) + if err != nil { + return fmt.Errorf("stat violet binary %s: %w", path, err) + } + if fi.IsDir() { + return fmt.Errorf("violet.bin %s is a directory, not an executable", path) + } + if fi.Mode()&0o111 == 0 { + return fmt.Errorf("violet.bin %s is not executable", path) + } + return nil +} From 536844d862b0a0d8c0df14dfe27f5df98944b36e Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 19:48:50 +0800 Subject: [PATCH 03/15] docs: update README and CLAUDE.md for violet onboarding flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - README §编写配置文件: extend the upgrade.yaml example with operatorConfig.violet (required block) and Version.expectedSha256 (optional). Adds a "配置说明" callout that names violet as required, declares packagePrefix has no default, documents the URL convention, and flags expectedSha256 as a defense against HTTP-MITM-induced supply-chain attacks. - README new section §violet 依赖与运行环境: covers binary dependency (PATH or absolute path with executable-bit check), the env allowlist (KUBECONFIG / PATH / HOME / USER / VIOLET_*), registry credential injection via VIOLET_REGISTRY_USERNAME / _PASSWORD with shared-runner argv warning, and KUBECONFIG passthrough for k8s auth. - CLAUDE.md §架构: clarify Artifact / ArtifactVersion writes are owned by violet; Go retains only Get + Delete + waits. References installViaViolet and targetCatalogSource constant. - CLAUDE.md §硬约束: add three rules — violet owns AV writes, registry creds must go through env vars not config, packagePrefix is required (no default). --- CLAUDE.md | 13 ++++++++----- README.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 6bb66e9..016413a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,10 +38,10 @@ export KUBECONFIG= `pkg/operator/interface.go` 定义了仅一个方法的接口 `OperatorInterface.UpgradeOperator`。Factory(`factory.go`)根据 `operatorConfig.type` 选择实现: -- **`operatorhub`(默认)** —— `pkg/operator/operatorhub/`。生产路径。通过 dynamic client + `unstructured` 操作三类资源: - - Alauda 自定义资源 `app.alauda.io/v1alpha1` 下的 `Artifact` / `ArtifactVersion`(系统命名空间硬编码为 `cpaas-system`,OLM 源名硬编码为 `platform`) +- **`operatorhub`(默认)** —— `pkg/operator/operatorhub/`。生产路径。Artifact / ArtifactVersion 的**写路径**已外包给 `violet` 二进制(子进程,见 `violet.go::installViaViolet`);Go 侧仍用 dynamic client 做以下三类只读 / OLM 资源: + - Alauda 自定义资源 `app.alauda.io/v1alpha1` 下的 `Artifact` / `ArtifactVersion`:**Go 端只剩 Get + Delete**(清理残留 AV、等 AV phase=Present、读 status.version 拿 CSV);Create / Patch / Update 全归 violet。命名空间硬编码 `cpaas-system`,OLM 源名硬编码 `platform`(const `targetCatalogSource` in `operator.go`)。 - OLM 标准资源 `Subscription` / `InstallPlan` / `ClusterServiceVersion` / `PackageManifest` - - 升级流程:`InstallArtifactVersion`(创建 ArtifactVersion → 等 phase=Present → 等 PackageManifest 出现对应 CSV)→ `InstallSubscription`(先删旧 Sub + 旧 CSV → 创建 Subscription `installPlanApproval: Manual` → 等 InstallPlan → 把 `spec.approved=true` → 等 CSV phase=Succeeded) + - 升级流程:`InstallArtifactVersion`(下载 .tgz → 可选 sha256 校验 → 删除同名 AV 残留 → 调 `violet push` → 等 AV phase=Present 并 cross-check spec.tag → 等 PackageManifest 出现对应 CSV)→ `InstallSubscription`(先删旧 Sub + 旧 CSV → 创建 Subscription `installPlanApproval: Manual` → 等 InstallPlan → 把 `spec.approved=true` → 等 CSV phase=Succeeded) - **`local`** —— `pkg/operator/local/operator.go`。本地开发用,直接在 workspace 里跑 `make deploy`(可被 `operatorConfig.command` 覆盖),不接 OLM。 新增 Operator 类型时:在 `pkg/operator/` 下加子包实现 `OperatorInterface`,并在 `factory.go` 的 `CreateOperator` switch 中注册。 @@ -54,12 +54,15 @@ Artifact 名称约定:未显式给 `artifact` 字段时,自动拼成 `///.latest.ALL..tgz`。已验证可同时覆盖 `v4.6` 大版本通道和 `rc` 滚动通道两种实际形态。 +- **`expectedSha256` 强烈建议为外网下载场景填上**。HTTP 明文 + DNS 投毒可能注入恶意 tgz,违 violet 会带集群凭证把恶意 bundle 上架——内网 MinIO 也建议加防。 + ### 构建测试镜像 需要在测试镜像中添加升级测试的制品: @@ -149,3 +167,35 @@ CMD ["--godog.concurrency=2", "--godog.format=allure", "--godog.tags=@prepare-17 export KUBECONFIG= ./upgrade --config upgrade.yaml ``` + +## violet 依赖与运行环境 + +upgrade CLI 把 `Artifact` / `ArtifactVersion` CR 的创建步骤外包给 `violet` 二进制,因此运行环境必须满足以下条件。 + +### 二进制依赖 + +- 运行环境 `$PATH` 上需要有可执行的 `violet`,或者通过 `operatorConfig.violet.bin` 指定绝对路径。Bin 非空时 CLI 会做 `filepath.IsAbs` + `os.Stat` + 可执行位校验,不接受相对路径或非可执行文件。 +- 流水线场景:确保 Tekton task 镜像里预装 violet,或在 task 启动 script 里下载安装。本地开发:参考仓库内 `download-violet` skill 或自行从 cloud.alauda.cn 下载。 +- 当前实现不做 `violet --version` preflight 检查,依赖 OS "command not found" 错误兜底。 + +### 子进程环境变量 allowlist + +upgrade CLI 调用 violet 时**只透传**以下宿主环境变量:`KUBECONFIG`、`PATH`、`HOME`、`USER`、`VIOLET_*` 前缀。CI runner 上其他 secret(`GITHUB_TOKEN` / `AWS_*` / 等)不会进入 violet 子进程,符合最小权限原则。 + +### registry 凭证(仅 `skipPush: false` 场景需要) + +私有 registry 推送镜像时 `violet push` 需要 `--username` / `--password`。**不要写进 config.yaml**,改用环境变量: + +```sh +export VIOLET_REGISTRY_USERNAME= +export VIOLET_REGISTRY_PASSWORD= +./upgrade --config upgrade.yaml +``` + +upgrade CLI 检测到非空时会自动追加到 `violet push` 的 argv,并在日志渲染时把 `--password` 之后的值替换成 `***`。 + +⚠️ **共享 CI runner 安全警告**:当前 violet 不支持 stdin / 文件方式读凭证,`--password` 只能进 argv。OS 级 `ps auxe` / `/proc//cmdline` / `strace` 都能看到明文密码。**多租户共享 runner 上禁用 `skipPush: false`**,私有 push 场景务必使用独占的 pipeline runner / 独占的 OS 用户账号。 + +### k8s 凭证 + +violet 通过 `KUBECONFIG` 连集群(与 upgrade CLI 自身共享同一份)。流水线场景在调 upgrade 前先 `export KUBECONFIG=...` 即可,CLI 透传给子进程不需要额外配置。 From 14ba30e18a2343e8af95d22105e9eb4948ad1d98 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 20:02:21 +0800 Subject: [PATCH 04/15] test: cover PR 2 helpers; fix URL base-name extraction bug found by test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add unit tests for the new code introduced in this PR. installViaViolet and execVioletPush remain integration-level (require a real cluster and a stub binary respectively), but everything below them is testable in process and now covered: pkg/exec (PR 1 additions that previously had no test): - TestMatchAllowlist: exact / prefix-glob / mixed / empty allowlist - TestFilterHostEnv_*: empty allowlist preserves full os.Environ() passthrough, non-empty allowlist filters, prefix glob matches by prefix not substring - TestLastLines: trailing-newline trimming, fewer-than-N lines, only-newlines collapses to empty - TestWrapWithStderrTail: empty stderr passes through unchanged; non-empty stderr is appended and errors.Is still identifies the base error - TestRunCommand_*: end-to-end verification that EnvAllowlist actually filters what the child sees (uses /bin/sh -c 'env'), empty allowlist preserves legacy behavior, and a failing child wraps the stderr tail into the error pkg/operator/operatorhub: - TestValidateVioletBin: relative path / missing / directory / non-executable rejections plus the absolute-executable accept path - TestDownloadToTemp_*: 200 OK + filename extracted from URL path; 404 and 500 surface as labelled errors; URL with only "/" falls back to "package.tgz"; context-cancel propagates from net/http (verifies the request honours ctx) - TestDeleteArtifactVersionIfExists_*: backed by k8s client-go dynamic fake. Covers no-residue noop, residue removed + poll-until-NotFound, and the race where the AV vanishes between our Get and Delete (must not propagate as an error) Production fix surfaced by the tests: - downloadToTemp used filepath.Base(rawURL) to pick the local filename, which is wrong: filepath.Base does not understand URL structure and on inputs like "http://host:port/" returns "host:port" rather than "/". Switched to url.Parse(rawURL) + path.Base(parsed.Path), with the same fallback to "package.tgz" for empty/root paths. Tests now lock this behaviour in. go.mod / go.sum: k8s.io/client-go/dynamic/fake transitively pulls a few additional indirect deps (kr/pretty, go-difflib, etc.) — picked up by `go mod tidy`. No first-party deps added. --- go.mod | 10 + go.sum | 12 +- pkg/exec/exec_test.go | 185 ++++++++++++++++ pkg/operator/operatorhub/violet.go | 24 ++- pkg/operator/operatorhub/violet_test.go | 271 ++++++++++++++++++++++++ 5 files changed, 492 insertions(+), 10 deletions(-) create mode 100644 pkg/exec/exec_test.go diff --git a/go.mod b/go.mod index ebb3e8d..6fc328c 100644 --- a/go.mod +++ b/go.mod @@ -20,14 +20,20 @@ require ( github.com/emicklei/go-restful/v3 v3.12.1 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-logr/logr v1.4.2 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect + github.com/go-openapi/swag v0.23.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect + github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/spf13/pflag v1.0.6 // indirect github.com/x448/float16 v0.8.4 // indirect go.uber.org/multierr v1.11.0 // indirect @@ -37,9 +43,13 @@ require ( golang.org/x/term v0.32.0 // indirect golang.org/x/text v0.25.0 // indirect golang.org/x/time v0.9.0 // indirect + google.golang.org/protobuf v1.36.5 // indirect + gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/api v0.33.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect sigs.k8s.io/randfill v1.0.0 // indirect diff --git a/go.sum b/go.sum index 55c3902..1cbf602 100644 --- a/go.sum +++ b/go.sum @@ -17,6 +17,8 @@ github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= @@ -25,6 +27,8 @@ github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo= +github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -50,6 +54,10 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/oliveagle/jsonpath v0.0.0-20180606110733-2e52cf6e6852 h1:Yl0tPBa8QPjGmesFh1D0rDy+q1Twx6FyU7VWHi8wZbI= github.com/oliveagle/jsonpath v0.0.0-20180606110733-2e52cf6e6852/go.mod h1:eqOVx5Vwu4gd2mmMZvVZsgIqNSaW3xxRThUJ0k/TPk4= +github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= +github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= +github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= +github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -86,8 +94,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= diff --git a/pkg/exec/exec_test.go b/pkg/exec/exec_test.go new file mode 100644 index 0000000..4daa849 --- /dev/null +++ b/pkg/exec/exec_test.go @@ -0,0 +1,185 @@ +package exec + +import ( + "context" + "errors" + "strings" + "testing" +) + +func TestMatchAllowlist(t *testing.T) { + tests := []struct { + name string + needle string + allowlist []string + want bool + }{ + {"exact match hits", "KUBECONFIG", []string{"KUBECONFIG", "HOME"}, true}, + {"exact match misses", "USER", []string{"KUBECONFIG", "HOME"}, false}, + {"prefix glob hits", "VIOLET_REGISTRY_USERNAME", []string{"VIOLET_*"}, true}, + {"prefix glob misses on unrelated var", "GITHUB_TOKEN", []string{"VIOLET_*"}, false}, + {"empty allowlist never matches", "ANYTHING", nil, false}, + {"prefix glob and exact mix", "PATH", []string{"VIOLET_*", "PATH"}, true}, + {"prefix glob requires prefix not substring", "MY_VIOLET_TOKEN", []string{"VIOLET_*"}, false}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + if got := matchAllowlist(tc.needle, tc.allowlist); got != tc.want { + t.Errorf("matchAllowlist(%q, %v) = %v, want %v", tc.needle, tc.allowlist, got, tc.want) + } + }) + } +} + +func TestFilterHostEnv_EmptyAllowlistReturnsAll(t *testing.T) { + // Empty allowlist must preserve the legacy "full os.Environ() passthrough" + // behaviour — testCommand callers rely on it. + t.Setenv("FILTER_HOST_ENV_PROBE", "probe-value") + got := filterHostEnv(nil) + if !contains(got, "FILTER_HOST_ENV_PROBE=probe-value") { + t.Errorf("expected probe to survive in empty-allowlist mode; env=%v", got) + } +} + +func TestFilterHostEnv_AllowlistRestricts(t *testing.T) { + // Set two probes and confirm the allowlist drops the one not listed. + t.Setenv("FILTER_HOST_ENV_KEEP", "yes") + t.Setenv("FILTER_HOST_ENV_DROP", "no") + got := filterHostEnv([]string{"FILTER_HOST_ENV_KEEP"}) + if !contains(got, "FILTER_HOST_ENV_KEEP=yes") { + t.Errorf("KEEP probe should be retained; env=%v", got) + } + if contains(got, "FILTER_HOST_ENV_DROP=no") { + t.Errorf("DROP probe should be filtered out; env=%v", got) + } +} + +func TestFilterHostEnv_GlobMatchesPrefix(t *testing.T) { + t.Setenv("FILTER_GLOB_TOKEN_A", "1") + t.Setenv("FILTER_GLOB_TOKEN_B", "2") + t.Setenv("OTHER_VAR", "leave-me") + got := filterHostEnv([]string{"FILTER_GLOB_*"}) + if !contains(got, "FILTER_GLOB_TOKEN_A=1") || !contains(got, "FILTER_GLOB_TOKEN_B=2") { + t.Errorf("both glob-matched vars should be present; env=%v", got) + } + if contains(got, "OTHER_VAR=leave-me") { + t.Errorf("OTHER_VAR should be filtered; env=%v", got) + } +} + +func TestLastLines(t *testing.T) { + tests := []struct { + name string + s string + n int + want string + }{ + {"empty input returns empty", "", 5, ""}, + {"zero n returns empty", "a\nb\nc", 0, ""}, + {"fewer lines than n returns all", "a\nb", 5, "a\nb"}, + {"more lines than n returns tail", "a\nb\nc\nd\ne", 2, "d\ne"}, + {"trailing newline is trimmed", "a\nb\nc\n", 2, "b\nc"}, + {"only newlines collapses to empty", "\n\n\n", 5, ""}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + if got := lastLines(tc.s, tc.n); got != tc.want { + t.Errorf("lastLines(%q, %d) = %q, want %q", tc.s, tc.n, got, tc.want) + } + }) + } +} + +func TestWrapWithStderrTail(t *testing.T) { + base := errors.New("exit status 1") + + t.Run("empty stderr passes through unchanged", func(t *testing.T) { + got := wrapWithStderrTail(base, "", 5) + if got.Error() != base.Error() { + t.Errorf("expected pass-through, got %q", got.Error()) + } + // %w semantics: returned err should still unwrap to base. + if !errors.Is(got, base) { + t.Error("expected errors.Is to identify base error") + } + }) + + t.Run("non-empty stderr appended", func(t *testing.T) { + got := wrapWithStderrTail(base, "line1\nline2\nline3\n", 2) + if !strings.Contains(got.Error(), "stderr tail:") { + t.Errorf("expected 'stderr tail:' prefix, got %q", got.Error()) + } + if !strings.Contains(got.Error(), "line2\nline3") { + t.Errorf("expected last 2 lines in wrapped error, got %q", got.Error()) + } + if !errors.Is(got, base) { + t.Error("wrapped error must still satisfy errors.Is(base)") + } + }) +} + +func TestRunCommand_EnvAllowlistAppliedToChild(t *testing.T) { + // Verify the allowlist actually changes what the child sees. We export + // two probes, restrict the child to one of them, then dump the child's + // env via /bin/sh -c 'env' and check Stdout. + t.Setenv("EXEC_TEST_KEEP", "keep-me") + t.Setenv("EXEC_TEST_DROP", "drop-me") + + res := RunCommand(context.Background(), Command{ + Name: "/bin/sh", + Args: []string{"-c", "env"}, + EnvAllowlist: []string{"EXEC_TEST_KEEP", "PATH"}, + }) + if res.Err != nil { + t.Fatalf("RunCommand failed: %v\nstderr=%s", res.Err, res.Stderr) + } + if !strings.Contains(res.Stdout, "EXEC_TEST_KEEP=keep-me") { + t.Errorf("KEEP probe should reach child; stdout=%s", res.Stdout) + } + if strings.Contains(res.Stdout, "EXEC_TEST_DROP=drop-me") { + t.Errorf("DROP probe should NOT reach child (allowlist filtered); stdout=%s", res.Stdout) + } +} + +func TestRunCommand_EmptyAllowlistFullPassthrough(t *testing.T) { + t.Setenv("EXEC_TEST_PROBE", "should-survive") + res := RunCommand(context.Background(), Command{ + Name: "/bin/sh", + Args: []string{"-c", "env"}, + // EnvAllowlist intentionally nil — legacy behaviour. + }) + if res.Err != nil { + t.Fatalf("RunCommand failed: %v\nstderr=%s", res.Err, res.Stderr) + } + if !strings.Contains(res.Stdout, "EXEC_TEST_PROBE=should-survive") { + t.Errorf("nil allowlist should preserve full os.Environ(); stdout did not contain probe; stdout=%s", res.Stdout) + } +} + +func TestRunCommand_FailingChildWrapsStderrTail(t *testing.T) { + // /bin/sh -c 'echo boom 1>&2; exit 7' — verify CommandResult.Err contains + // the stderr tail wrap, and that errors.Is still works against the + // underlying *exec.ExitError. + res := RunCommand(context.Background(), Command{ + Name: "/bin/sh", + Args: []string{"-c", "echo first-line 1>&2; echo last-line 1>&2; exit 7"}, + }) + if res.Err == nil { + t.Fatal("expected error from exit 7") + } + if !strings.Contains(res.Err.Error(), "last-line") { + t.Errorf("expected stderr tail to be wrapped into err message; got %q", res.Err.Error()) + } + if res.Stderr == "" { + t.Errorf("Stderr field should still hold full captured stderr") + } +} + +func contains(haystack []string, needle string) bool { + for _, s := range haystack { + if s == needle { + return true + } + } + return false +} diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index f969545..6aa66bf 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -7,7 +7,9 @@ import ( "fmt" "io" "net/http" + "net/url" "os" + "path" "path/filepath" "strings" @@ -200,24 +202,30 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) return av, nil } -// downloadToTemp fetches url into a fresh temporary directory and returns the -// local file path plus a cleanup function. The caller MUST invoke cleanup +// downloadToTemp fetches rawURL into a fresh temporary directory and returns +// the local file path plus a cleanup function. The caller MUST invoke cleanup // (typically via defer) once the file is no longer needed. The temp dir is // per-call so concurrent installs cannot race on the same path. -func downloadToTemp(ctx context.Context, url string) (string, func(), error) { +func downloadToTemp(ctx context.Context, rawURL string) (string, func(), error) { dir, err := os.MkdirTemp("", "upgrade-violet-*") if err != nil { return "", nil, fmt.Errorf("mkdir temp: %w", err) } cleanup := func() { _ = os.RemoveAll(dir) } - fileName := filepath.Base(url) - if fileName == "" || fileName == "." || fileName == "/" { - fileName = "package.tgz" + // Use net/url + path.Base to extract the last segment of the URL path + // itself; filepath.Base("http://host/x.tgz") would return the host on + // some platforms and never the file name. path.Base is also the right + // operator for forward-slash URL paths regardless of OS. + fileName := "package.tgz" + if parsed, perr := url.Parse(rawURL); perr == nil { + if base := path.Base(parsed.Path); base != "" && base != "." && base != "/" { + fileName = base + } } filePath := filepath.Join(dir, fileName) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) if err != nil { cleanup() return "", nil, fmt.Errorf("build request: %w", err) @@ -232,7 +240,7 @@ func downloadToTemp(ctx context.Context, url string) (string, func(), error) { if resp.StatusCode < 200 || resp.StatusCode >= 300 { cleanup() - return "", nil, fmt.Errorf("unexpected status %d from %s", resp.StatusCode, url) + return "", nil, fmt.Errorf("unexpected status %d from %s", resp.StatusCode, rawURL) } f, err := os.Create(filePath) diff --git a/pkg/operator/operatorhub/violet_test.go b/pkg/operator/operatorhub/violet_test.go index 66b51ef..f3ca341 100644 --- a/pkg/operator/operatorhub/violet_test.go +++ b/pkg/operator/operatorhub/violet_test.go @@ -1,12 +1,22 @@ package operatorhub import ( + "context" "crypto/sha256" "encoding/hex" + "net/http" + "net/http/httptest" "os" "path/filepath" "strings" "testing" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/dynamic/fake" ) func TestBuildPackageURL(t *testing.T) { @@ -221,3 +231,264 @@ func stringSliceEqual(a, b []string) bool { } return true } + +// --- validateVioletBin -------------------------------------------------------- + +func TestValidateVioletBin(t *testing.T) { + dir := t.TempDir() + + // Prepare three fixtures: an executable file, a non-executable file, and a directory. + exePath := filepath.Join(dir, "violet") + if err := os.WriteFile(exePath, []byte("#!/bin/sh\necho fake"), 0o755); err != nil { + t.Fatal(err) + } + nonExePath := filepath.Join(dir, "not-exe") + if err := os.WriteFile(nonExePath, []byte("data"), 0o644); err != nil { + t.Fatal(err) + } + subDir := filepath.Join(dir, "subdir") + if err := os.Mkdir(subDir, 0o755); err != nil { + t.Fatal(err) + } + + tests := []struct { + name string + path string + wantErrSub string // substring expected in the error message ("" = expect nil) + }{ + {"absolute executable file is accepted", exePath, ""}, + {"relative path is rejected", "violet", "must be an absolute path"}, + {"missing file surfaces stat error", filepath.Join(dir, "missing"), "stat violet binary"}, + {"directory is rejected", subDir, "is a directory"}, + {"non-executable file is rejected", nonExePath, "is not executable"}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + err := validateVioletBin(tc.path) + if tc.wantErrSub == "" { + if err != nil { + t.Errorf("want nil error, got %v", err) + } + return + } + if err == nil { + t.Fatalf("want error containing %q, got nil", tc.wantErrSub) + } + if !strings.Contains(err.Error(), tc.wantErrSub) { + t.Errorf("error %q does not contain %q", err.Error(), tc.wantErrSub) + } + }) + } +} + +// --- downloadToTemp ----------------------------------------------------------- + +func TestDownloadToTemp_Success(t *testing.T) { + payload := []byte("hello violet bundle") + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/pkg/tektoncd.tgz" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + _, _ = w.Write(payload) + })) + defer srv.Close() + + url := srv.URL + "/pkg/tektoncd.tgz" + path, cleanup, err := downloadToTemp(context.Background(), url) + if err != nil { + t.Fatalf("downloadToTemp: %v", err) + } + defer cleanup() + + if filepath.Base(path) != "tektoncd.tgz" { + t.Errorf("expected filename from URL, got %s", filepath.Base(path)) + } + body, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read: %v", err) + } + if string(body) != string(payload) { + t.Errorf("body mismatch: got %q, want %q", string(body), string(payload)) + } + + // cleanup must remove the file (and its temp dir). + cleanup() + if _, err := os.Stat(path); !os.IsNotExist(err) { + t.Errorf("cleanup should have removed %s; stat err=%v", path, err) + } +} + +func TestDownloadToTemp_404(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.NotFound(w, nil) + })) + defer srv.Close() + + _, _, err := downloadToTemp(context.Background(), srv.URL+"/missing.tgz") + if err == nil { + t.Fatal("want error for 404, got nil") + } + if !strings.Contains(err.Error(), "404") { + t.Errorf("error should mention status 404, got %v", err) + } +} + +func TestDownloadToTemp_5xx(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer srv.Close() + + _, _, err := downloadToTemp(context.Background(), srv.URL+"/x.tgz") + if err == nil { + t.Fatal("want error for 500, got nil") + } + if !strings.Contains(err.Error(), "500") { + t.Errorf("error should mention status 500, got %v", err) + } +} + +func TestDownloadToTemp_DefaultFilename(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte("ok")) + })) + defer srv.Close() + + // URL has no path beyond "/" → filepath.Base returns "/", which the + // helper must rewrite to "package.tgz" so io.Copy has a real target. + path, cleanup, err := downloadToTemp(context.Background(), srv.URL+"/") + if err != nil { + t.Fatalf("downloadToTemp: %v", err) + } + defer cleanup() + if filepath.Base(path) != "package.tgz" { + t.Errorf("expected fallback filename package.tgz, got %s", filepath.Base(path)) + } +} + +func TestDownloadToTemp_ContextCancel(t *testing.T) { + // Server blocks long enough for context cancel to fire before headers + // are sent, ensuring the net/http request honours the context. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + time.Sleep(2 * time.Second) + _, _ = w.Write([]byte("late")) + })) + defer srv.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + + _, _, err := downloadToTemp(ctx, srv.URL+"/slow.tgz") + if err == nil { + t.Fatal("want context error, got nil") + } + if !strings.Contains(err.Error(), "context") && !strings.Contains(err.Error(), "deadline") { + t.Errorf("expected context/deadline error, got %v", err) + } +} + +// --- deleteArtifactVersionIfExists -------------------------------------------- + +// newOperatorWithFakeClient builds a minimal *Operator backed by a fake +// dynamic client seeded with the supplied objects. Only fields touched by +// deleteArtifactVersionIfExists need to be populated. +func newOperatorWithFakeClient(t *testing.T, seed ...runtime.Object) *Operator { + t.Helper() + scheme := runtime.NewScheme() + // Register ArtifactVersion as an unstructured list kind so the fake + // client knows how to list/get/delete it. + scheme.AddKnownTypeWithName( + artifactVersionGVR.GroupVersion().WithKind("ArtifactVersion"), + &unstructured.Unstructured{}, + ) + scheme.AddKnownTypeWithName( + artifactVersionGVR.GroupVersion().WithKind("ArtifactVersionList"), + &unstructured.UnstructuredList{}, + ) + + listKinds := map[schemaGVR]string{ + {artifactVersionGVR.Group, artifactVersionGVR.Version, artifactVersionGVR.Resource}: "ArtifactVersionList", + } + _ = listKinds // placeholder if extending + + client := fake.NewSimpleDynamicClient(scheme, seed...) + return &Operator{ + client: client, + name: "tektoncd-operator", + artifact: "operatorhub-tektoncd-operator", + interval: 10 * time.Millisecond, + timeout: 2 * time.Second, + } +} + +// schemaGVR is a local alias used only inside the test helper so we do not +// drag the full k8s schema package into the file headers when not needed. +type schemaGVR struct{ G, V, R string } + +func newAVUnstructured(name string) *unstructured.Unstructured { + return &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": artifactVersionGVR.GroupVersion().String(), + "kind": "ArtifactVersion", + "metadata": map[string]interface{}{ + "name": name, + "namespace": systemNamespace, + }, + "spec": map[string]interface{}{ + "present": true, + "tag": "v4.6.0", + }, + }, + } +} + +func TestDeleteArtifactVersionIfExists_NoResidueIsNoop(t *testing.T) { + op := newOperatorWithFakeClient(t) + if err := op.deleteArtifactVersionIfExists(context.Background(), "missing.av"); err != nil { + t.Errorf("expected nil for missing AV (NotFound is fine), got %v", err) + } +} + +func TestDeleteArtifactVersionIfExists_RemovesExistingAV(t *testing.T) { + avName := "operatorhub-tektoncd-operator.v4.6.0" + op := newOperatorWithFakeClient(t, newAVUnstructured(avName)) + + // Sanity: the seed object is visible before delete. + got, err := op.client.Resource(artifactVersionGVR).Namespace(systemNamespace).Get( + context.Background(), avName, metav1.GetOptions{}, + ) + if err != nil || got == nil { + t.Fatalf("seed AV should be present, got err=%v", err) + } + + if err := op.deleteArtifactVersionIfExists(context.Background(), avName); err != nil { + t.Fatalf("delete: %v", err) + } + + // After delete + poll the resource must be gone. + _, err = op.client.Resource(artifactVersionGVR).Namespace(systemNamespace).Get( + context.Background(), avName, metav1.GetOptions{}, + ) + if !errors.IsNotFound(err) { + t.Errorf("after delete, expected NotFound, got %v", err) + } +} + +func TestDeleteArtifactVersionIfExists_ConcurrentRaceVanishesGracefully(t *testing.T) { + // Simulates "Get says exists, but Delete returns NotFound because something + // else removed it between the two calls". The helper must absorb that + // race rather than propagating it as a hard error. + avName := "operatorhub-tektoncd-operator.v4.6.0" + op := newOperatorWithFakeClient(t, newAVUnstructured(avName)) + + // Pre-delete the object so the helper's own Delete call sees NotFound. + if err := op.client.Resource(artifactVersionGVR).Namespace(systemNamespace). + Delete(context.Background(), avName, metav1.DeleteOptions{}); err != nil { + t.Fatalf("pre-delete: %v", err) + } + + // Helper will Get (NotFound → return nil) and exit cleanly. + if err := op.deleteArtifactVersionIfExists(context.Background(), avName); err != nil { + t.Errorf("expected nil for race-vanished AV, got %v", err) + } +} From a03648762263c12e246f59876d353727923d1b73 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 20:45:28 +0800 Subject: [PATCH 05/15] fix(operatorhub): address P2 correctness findings from review - bound .tgz download with an explicit timeout derived from o.timeout so a stalled MinIO does not pin the upgrade to OS-level TCP keepalive - explicit close-and-check on the downloaded file so a flush error (disk full, NFS reset) does not silently produce a truncated .tgz that bypasses VerifySha256 when ExpectedSha256 is unset - retry transient apiserver errors (timeout / throttle / unavailable) while polling ArtifactVersion and PackageManifest; treat NotFound as keep-polling to absorb the post-violet-push eventual-consistency window - exact-match CSV in PackageManifest entries (was strings.Contains, which matched v0.65.0 against v0.65.0-rc1 and friends) and surface NestedString type errors instead of silently dropping them - return csv from InstallArtifactVersion so UpgradeOperator does not silently feed an empty status.version into InstallSubscription via a second unchecked NestedString extraction - require version.channel up front in UpgradeOperator (was a silent "stable" fallback that contradicted BuildPackageURL's strict policy and could route Subscription through the wrong OLM channel on a typo) Adds TestDownloadToTemp_TimeoutEnforcedWithoutParentDeadline as a regression guard for the download timeout path. --- pkg/operator/operatorhub/artifact_versiong.go | 51 +++++++++++++---- pkg/operator/operatorhub/operator.go | 19 ++++--- pkg/operator/operatorhub/violet.go | 56 ++++++++++++------- pkg/operator/operatorhub/violet_test.go | 33 +++++++++-- 4 files changed, 113 insertions(+), 46 deletions(-) diff --git a/pkg/operator/operatorhub/artifact_versiong.go b/pkg/operator/operatorhub/artifact_versiong.go index e9456ea..e039d38 100644 --- a/pkg/operator/operatorhub/artifact_versiong.go +++ b/pkg/operator/operatorhub/artifact_versiong.go @@ -3,21 +3,22 @@ package operatorhub import ( "context" "fmt" - "strings" "github.com/AlaudaDevops/upgrade-test/pkg/config" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/wait" + "knative.dev/pkg/logging" ) // InstallArtifactVersion is the stable entry point invoked by UpgradeOperator. -// It needs the whole Version because the channel and (optional) sha256 are -// required by the violet onboarding flow, not just the bundle version. -func (o *Operator) InstallArtifactVersion(ctx context.Context, version config.Version) (*unstructured.Unstructured, error) { +// It returns the CSV name (read from av.status.version inside the violet flow) +// so the caller does not have to re-extract it — a second extraction site was +// prone to silently passing an empty csv to InstallSubscription. +func (o *Operator) InstallArtifactVersion(ctx context.Context, version config.Version) (*unstructured.Unstructured, string, error) { if o.violet == nil { - return nil, fmt.Errorf("operatorConfig.violet must be configured to install ArtifactVersion via the violet binary") + return nil, "", fmt.Errorf("operatorConfig.violet must be configured to install ArtifactVersion via the violet binary") } return o.installViaViolet(ctx, version) } @@ -70,11 +71,19 @@ func (o *Operator) createArtifactVersion(ctx context.Context, version string, ar } func (o *Operator) waitArtifactVersionPresent(ctx context.Context, name string) (*unstructured.Unstructured, error) { + log := logging.FromContext(ctx) lastResource := &unstructured.Unstructured{} err := wait.PollUntilContextTimeout(ctx, o.interval, o.timeout, true, func(ctx context.Context) (done bool, err error) { obj, err := o.client.Resource(artifactVersionGVR).Namespace(systemNamespace).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - return false, err + switch { + case errors.IsNotFound(err): + // AV not yet visible — eventual consistency window after violet push. + return false, nil + case isTransientAPIError(err): + log.Warnw("transient API error while polling AV, will retry", "name", name, "err", err) + return false, nil + case err != nil: + return false, fmt.Errorf("get AV %s: %w", name, err) } status, _, _ := unstructured.NestedMap(obj.Object, "status") @@ -93,10 +102,17 @@ func (o *Operator) waitArtifactVersionPresent(ctx context.Context, name string) } func (o *Operator) waitPackageManifest(ctx context.Context, csv string) error { + log := logging.FromContext(ctx) return wait.PollUntilContextTimeout(ctx, o.interval, o.timeout, true, func(ctx context.Context) (done bool, err error) { pm, err := o.client.Resource(packageManifestGVR).Namespace(systemNamespace).Get(ctx, o.name, metav1.GetOptions{}) - if err != nil && !errors.IsNotFound(err) { - return false, err + switch { + case errors.IsNotFound(err): + return false, nil + case isTransientAPIError(err): + log.Warnw("transient API error while polling PackageManifest, will retry", "name", o.name, "err", err) + return false, nil + case err != nil: + return false, fmt.Errorf("get PackageManifest %s: %w", o.name, err) } if pm == nil { @@ -117,8 +133,11 @@ func (o *Operator) waitPackageManifest(ctx context.Context, csv string) error { continue } - csvName, _, _ := unstructured.NestedString(entryMap, "name") - if strings.Contains(csvName, csv) { + csvName, found, err := unstructured.NestedString(entryMap, "name") + if err != nil { + return false, fmt.Errorf("PackageManifest %s entry .name has wrong type: %w", o.name, err) + } + if found && csvName == csv { return true, nil } } @@ -127,3 +146,13 @@ func (o *Operator) waitPackageManifest(ctx context.Context, csv string) error { return false, nil }) } + +// isTransientAPIError reports whether err is a Kubernetes API error worth +// retrying on (apiserver timeout, throttling, brief unavailability). Permanent +// errors (RBAC denied, Forbidden, malformed request) must propagate so the +// poll loop fails fast with the real reason instead of timing out. +func isTransientAPIError(err error) bool { + return errors.IsServerTimeout(err) || + errors.IsTooManyRequests(err) || + errors.IsServiceUnavailable(err) +} diff --git a/pkg/operator/operatorhub/operator.go b/pkg/operator/operatorhub/operator.go index b340a11..7f366c8 100644 --- a/pkg/operator/operatorhub/operator.go +++ b/pkg/operator/operatorhub/operator.go @@ -107,19 +107,20 @@ func (o *Operator) GetResource(ctx context.Context, name, namespace string, gvr } func (o *Operator) UpgradeOperator(ctx context.Context, version config.Version) error { - // Install artifact version - av, err := o.InstallArtifactVersion(ctx, version) + // version.Channel is required end-to-end: BuildPackageURL already refuses + // an empty channel, and InstallSubscription needs an exact channel name + // (a silent "stable" fallback would route the Subscription through the + // wrong OLM channel and install the wrong CSV stream on a typo). + if version.Channel == "" { + return fmt.Errorf("version.channel is required") + } + + _, csv, err := o.InstallArtifactVersion(ctx, version) if err != nil { return fmt.Errorf("failed to prepare operator: %v", err) } - // Get CSV version from artifact version - csv, _, _ := unstructured.NestedString(av.Object, "status", "version") - channel := version.Channel - if channel == "" { - channel = "stable" // default fallback - } - if err := o.InstallSubscription(ctx, csv, channel); err != nil { + if err := o.InstallSubscription(ctx, csv, version.Channel); err != nil { return fmt.Errorf("failed to install subscription: %v", err) } diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index 6aa66bf..ad27516 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -12,6 +12,7 @@ import ( "path" "path/filepath" "strings" + "time" "github.com/AlaudaDevops/upgrade-test/pkg/config" "github.com/AlaudaDevops/upgrade-test/pkg/exec" @@ -134,50 +135,48 @@ func VerifySha256(filePath, expected string) error { // CSV. The upstream Artifact CR is expected to already exist (managed out of // band); only the ArtifactVersion write path is delegated to violet. // -// Steps: 1) get Artifact, 2) build URL, 3) download .tgz, 4) verify sha256, -// 5) delete any residue AV with the same name (so wait does not match it), -// 6) exec violet push with allowlist env, 7) wait AV Present, 8) wait -// PackageManifest. Each stage prefixes its error with a short label so the -// caller can locate the failure without re-running. -func (o *Operator) installViaViolet(ctx context.Context, version config.Version) (*unstructured.Unstructured, error) { +// Returns (av, csv, err) so callers do not have to re-read status.version +// from the unstructured object — a second extraction site was prone to +// silently feeding an empty csv to InstallSubscription. +func (o *Operator) installViaViolet(ctx context.Context, version config.Version) (*unstructured.Unstructured, string, error) { log := logging.FromContext(ctx) log.Infow("installing artifact version via violet", "bundleVersion", version.BundleVersion, "channel", version.Channel) artifact, err := o.GetResource(ctx, o.artifact, systemNamespace, artifactGVR) if err != nil { - return nil, fmt.Errorf("get artifact %s: %w", o.artifact, err) + return nil, "", fmt.Errorf("get artifact %s: %w", o.artifact, err) } url, err := BuildPackageURL(o.violet.PackagePrefix, o.name, version.Channel, version.BundleVersion) if err != nil { - return nil, fmt.Errorf("build package url: %w", err) + return nil, "", fmt.Errorf("build package url: %w", err) } log.Infow("downloading violet package", "url", url) - tgzPath, cleanup, err := downloadToTemp(ctx, url) + tgzPath, cleanup, err := downloadToTemp(ctx, url, o.timeout) if err != nil { - return nil, fmt.Errorf("download %s: %w", url, err) + return nil, "", fmt.Errorf("download %s: %w", url, err) } defer cleanup() if err := VerifySha256(tgzPath, version.ExpectedSha256); err != nil { - return nil, fmt.Errorf("verify sha256: %w", err) + return nil, "", fmt.Errorf("verify sha256: %w", err) } avName := fmt.Sprintf("%s.%s", artifact.GetName(), version.BundleVersion) if err := o.deleteArtifactVersionIfExists(ctx, avName); err != nil { - return nil, fmt.Errorf("ensure clean AV %s: %w", avName, err) + return nil, "", fmt.Errorf("ensure clean AV %s: %w", avName, err) } if err := o.execVioletPush(ctx, tgzPath); err != nil { - return nil, fmt.Errorf("violet push: %w", err) + return nil, "", fmt.Errorf("violet push: %w", err) } log.Infow("waiting for ArtifactVersion to reach Present", "name", avName) av, err := o.waitArtifactVersionPresent(ctx, avName) if err != nil { - return nil, fmt.Errorf("wait artifact version present: %w", err) + return nil, "", fmt.Errorf("wait artifact version present: %w", err) } // Cross-check: violet might use a naming convention we did not expect. If @@ -185,28 +184,35 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) // BundleVersion, surface it loudly rather than silently accepting a stale // or unrelated AV. if tag, _, _ := unstructured.NestedString(av.Object, "spec", "tag"); tag != version.BundleVersion { - return nil, fmt.Errorf("artifact version %s has spec.tag=%q, expected %q (violet may not follow the . naming convention)", + return nil, "", fmt.Errorf("artifact version %s has spec.tag=%q, expected %q (violet may not follow the . naming convention)", avName, tag, version.BundleVersion) } csv, found, _ := unstructured.NestedString(av.Object, "status", "version") if !found || csv == "" { - return nil, fmt.Errorf("artifact version %s status.version is empty", avName) + return nil, "", fmt.Errorf("artifact version %s status.version is empty", avName) } log.Infow("waiting for package manifest", "csv", csv) if err := o.waitPackageManifest(ctx, csv); err != nil { - return nil, fmt.Errorf("wait package manifest for csv %s: %w", csv, err) + return nil, "", fmt.Errorf("wait package manifest for csv %s: %w", csv, err) } log.Infow("artifact version installed via violet", "name", av.GetName()) - return av, nil + return av, csv, nil } // downloadToTemp fetches rawURL into a fresh temporary directory and returns // the local file path plus a cleanup function. The caller MUST invoke cleanup // (typically via defer) once the file is no longer needed. The temp dir is // per-call so concurrent installs cannot race on the same path. -func downloadToTemp(ctx context.Context, rawURL string) (string, func(), error) { +// +// `timeout` bounds the entire HTTP exchange (dial + headers + body). It is +// applied via a derived context so a slow or half-open MinIO peer cannot pin +// the goroutine for hours waiting on OS-level TCP keepalive. +func downloadToTemp(ctx context.Context, rawURL string, timeout time.Duration) (string, func(), error) { + dlCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + dir, err := os.MkdirTemp("", "upgrade-violet-*") if err != nil { return "", nil, fmt.Errorf("mkdir temp: %w", err) @@ -225,7 +231,7 @@ func downloadToTemp(ctx context.Context, rawURL string) (string, func(), error) } filePath := filepath.Join(dir, fileName) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) + req, err := http.NewRequestWithContext(dlCtx, http.MethodGet, rawURL, nil) if err != nil { cleanup() return "", nil, fmt.Errorf("build request: %w", err) @@ -248,12 +254,20 @@ func downloadToTemp(ctx context.Context, rawURL string) (string, func(), error) cleanup() return "", nil, fmt.Errorf("create %s: %w", filePath, err) } - defer f.Close() + // Explicit close-and-check: on many filesystems write buffers are only + // flushed during Close(), so swallowing its error via defer can let a + // truncated .tgz reach VerifySha256 (which is optional today) and then + // violet push, producing a misleading "violet push:" error. if _, err := io.Copy(f, resp.Body); err != nil { + _ = f.Close() cleanup() return "", nil, fmt.Errorf("copy body to %s: %w", filePath, err) } + if err := f.Close(); err != nil { + cleanup() + return "", nil, fmt.Errorf("close %s after download: %w", filePath, err) + } return filePath, cleanup, nil } diff --git a/pkg/operator/operatorhub/violet_test.go b/pkg/operator/operatorhub/violet_test.go index f3ca341..43501b6 100644 --- a/pkg/operator/operatorhub/violet_test.go +++ b/pkg/operator/operatorhub/violet_test.go @@ -294,7 +294,7 @@ func TestDownloadToTemp_Success(t *testing.T) { defer srv.Close() url := srv.URL + "/pkg/tektoncd.tgz" - path, cleanup, err := downloadToTemp(context.Background(), url) + path, cleanup, err := downloadToTemp(context.Background(), url, 5*time.Second) if err != nil { t.Fatalf("downloadToTemp: %v", err) } @@ -324,7 +324,7 @@ func TestDownloadToTemp_404(t *testing.T) { })) defer srv.Close() - _, _, err := downloadToTemp(context.Background(), srv.URL+"/missing.tgz") + _, _, err := downloadToTemp(context.Background(), srv.URL+"/missing.tgz", 5*time.Second) if err == nil { t.Fatal("want error for 404, got nil") } @@ -339,7 +339,7 @@ func TestDownloadToTemp_5xx(t *testing.T) { })) defer srv.Close() - _, _, err := downloadToTemp(context.Background(), srv.URL+"/x.tgz") + _, _, err := downloadToTemp(context.Background(), srv.URL+"/x.tgz", 5*time.Second) if err == nil { t.Fatal("want error for 500, got nil") } @@ -356,7 +356,7 @@ func TestDownloadToTemp_DefaultFilename(t *testing.T) { // URL has no path beyond "/" → filepath.Base returns "/", which the // helper must rewrite to "package.tgz" so io.Copy has a real target. - path, cleanup, err := downloadToTemp(context.Background(), srv.URL+"/") + path, cleanup, err := downloadToTemp(context.Background(), srv.URL+"/", 5*time.Second) if err != nil { t.Fatalf("downloadToTemp: %v", err) } @@ -378,7 +378,7 @@ func TestDownloadToTemp_ContextCancel(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) defer cancel() - _, _, err := downloadToTemp(ctx, srv.URL+"/slow.tgz") + _, _, err := downloadToTemp(ctx, srv.URL+"/slow.tgz", 5*time.Second) if err == nil { t.Fatal("want context error, got nil") } @@ -387,6 +387,29 @@ func TestDownloadToTemp_ContextCancel(t *testing.T) { } } +func TestDownloadToTemp_TimeoutEnforcedWithoutParentDeadline(t *testing.T) { + // Parent ctx has no deadline; downloadToTemp's own timeout must still + // abort the request when the server stalls. Regression guard for "the + // upgrade CLI passes a bare ctx and downloads hang on TCP keepalive". + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + time.Sleep(2 * time.Second) + _, _ = w.Write([]byte("late")) + })) + defer srv.Close() + + start := time.Now() + _, _, err := downloadToTemp(context.Background(), srv.URL+"/slow.tgz", 80*time.Millisecond) + if err == nil { + t.Fatal("want deadline error, got nil") + } + if elapsed := time.Since(start); elapsed > time.Second { + t.Errorf("downloadToTemp should honour its own timeout (~80ms), took %v", elapsed) + } + if !strings.Contains(err.Error(), "context") && !strings.Contains(err.Error(), "deadline") { + t.Errorf("expected context/deadline error, got %v", err) + } +} + // --- deleteArtifactVersionIfExists -------------------------------------------- // newOperatorWithFakeClient builds a minimal *Operator backed by a fake From 4ecd6dac106b585f2b82311d0f229caced997e47 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 20:47:26 +0800 Subject: [PATCH 06/15] chore: archive PR #14 code review findings as pending todos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 17 findings from a 6-agent parallel review (security / silent-failure / simplicity / architecture / performance / type-design) consolidated into the todos/ directory: - 001-012 are P2 — should fix before / with PR 3 - 013-017 are P3 — cleanup follow-ups for the PR 3 cleanup commit 001 + 004-008 are already addressed by the previous commit (a036487); the corresponding files are kept in pending status so triage can flip them to ready / complete when the implementation is reviewed. --- ...001-pending-p2-http-download-no-timeout.md | 73 +++++++++++++ ...ding-p2-password-leak-via-violet-stream.md | 86 +++++++++++++++ ...g-p2-sha256-should-be-required-for-http.md | 96 ++++++++++++++++ ...p2-package-manifest-substring-csv-match.md | 78 +++++++++++++ ...p2-wait-av-no-retry-on-transient-errors.md | 84 ++++++++++++++ ...nding-p2-download-close-error-swallowed.md | 88 +++++++++++++++ ...ending-p2-empty-csv-fed-to-subscription.md | 91 ++++++++++++++++ ...ending-p2-channel-default-inconsistency.md | 80 ++++++++++++++ ...ding-p2-skippush-bool-pointer-overmodel.md | 86 +++++++++++++++ ...ending-p2-violet-config-validate-method.md | 103 ++++++++++++++++++ ...ending-p2-drop-violet-bin-and-validator.md | 64 +++++++++++ todos/012-pending-p2-url-scheme-allowlist.md | 74 +++++++++++++ todos/013-pending-p3-silent-error-cleanup.md | 86 +++++++++++++++ ...14-pending-p3-security-defense-in-depth.md | 92 ++++++++++++++++ ...ding-p3-code-organization-and-constants.md | 68 ++++++++++++ .../016-pending-p3-field-format-validation.md | 80 ++++++++++++++ ...nding-p3-env-allowlist-threat-model-doc.md | 73 +++++++++++++ 17 files changed, 1402 insertions(+) create mode 100644 todos/001-pending-p2-http-download-no-timeout.md create mode 100644 todos/002-pending-p2-password-leak-via-violet-stream.md create mode 100644 todos/003-pending-p2-sha256-should-be-required-for-http.md create mode 100644 todos/004-pending-p2-package-manifest-substring-csv-match.md create mode 100644 todos/005-pending-p2-wait-av-no-retry-on-transient-errors.md create mode 100644 todos/006-pending-p2-download-close-error-swallowed.md create mode 100644 todos/007-pending-p2-empty-csv-fed-to-subscription.md create mode 100644 todos/008-pending-p2-channel-default-inconsistency.md create mode 100644 todos/009-pending-p2-skippush-bool-pointer-overmodel.md create mode 100644 todos/010-pending-p2-violet-config-validate-method.md create mode 100644 todos/011-pending-p2-drop-violet-bin-and-validator.md create mode 100644 todos/012-pending-p2-url-scheme-allowlist.md create mode 100644 todos/013-pending-p3-silent-error-cleanup.md create mode 100644 todos/014-pending-p3-security-defense-in-depth.md create mode 100644 todos/015-pending-p3-code-organization-and-constants.md create mode 100644 todos/016-pending-p3-field-format-validation.md create mode 100644 todos/017-pending-p3-env-allowlist-threat-model-doc.md diff --git a/todos/001-pending-p2-http-download-no-timeout.md b/todos/001-pending-p2-http-download-no-timeout.md new file mode 100644 index 0000000..9c271b6 --- /dev/null +++ b/todos/001-pending-p2-http-download-no-timeout.md @@ -0,0 +1,73 @@ +--- +name: http-download-no-timeout +description: downloadToTemp 使用 http.DefaultClient,无 Client.Timeout,且上游 ctx 也没有 deadline,stalled MinIO 可挂数小时 +status: pending +priority: p2 +issue_id: "001" +tags: [code-review, performance, security, reliability, p2] +dependencies: [] +--- + +# Problem Statement + +`downloadToTemp` 在 `pkg/operator/operatorhub/violet.go:234` 用 `http.DefaultClient.Do(req)` 下载 .tgz 包。`http.DefaultClient.Timeout == 0`(无超时),而上游 `cmd/upgrade_command.go` 传进来的 ctx 也没有 `context.WithTimeout` 包裹(`o.timeout` 只在 `wait.PollUntilContextTimeout` 内被消费,不影响下载阶段)。 + +结果:若 MinIO 半开 TCP / 反向代理挂起 / 一边写一边断流,HTTP 请求会阻塞到 OS 级 TCP keepalive(Linux 默认 ~2 小时)才返回。`TestDownloadToTemp_ContextCancel` 只证明了 ctx-cancel 通路工作,没有强制 deadline。 + +# Findings + +- **security-sentinel** S8(P3)+ **architecture-strategist** A6(P3)+ **performance-oracle** P1(P2) 同时命中。三方独立观察。 +- 文件路径:`pkg/operator/operatorhub/violet.go:234` +- 调用源:`cmd/upgrade_command.go:198` 传入 bare ctx(无 deadline) +- 影响:单次升级阶段挂死数小时,CI runner 阻塞,可用性事故 + +# Proposed Solutions + +**Option A(推荐)**:在 `downloadToTemp` 内部用 `context.WithTimeout(ctx, o.timeout)` 包裹。复用已有 `OperatorConfig.Timeout` 旋钮。 + +```go +func (o *Operator) downloadToTemp(ctx context.Context, rawURL string) (string, func(), error) { + dlCtx, cancel := context.WithTimeout(ctx, o.timeout) + defer cancel() + req, err := http.NewRequestWithContext(dlCtx, http.MethodGet, rawURL, nil) + // ... +} +``` + +- 优点:复用已有配置项,5 行改动,语义清晰 +- 缺点:`downloadToTemp` 需要变成 `*Operator` 方法(或者取一个 timeout 参数) +- effort: Small +- risk: Low + +**Option B**:构造专用 `&http.Client{Timeout: o.timeout, Transport: ...}`,加 `DialContext` / `ResponseHeaderTimeout` / `IdleConnTimeout`。 + +- 优点:粒度更细,能区分 dial / header / read 阶段超时 +- 缺点:相比 Option A 多写 ~15 行 Transport 配置,本 CLI 单次调用收益有限 +- effort: Medium +- risk: Low + +# Recommended Action + +(待 triage 后填写) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/violet.go` +- 单元测试:可在现有 `TestDownloadToTemp_ContextCancel` 旁加 `TestDownloadToTemp_ParentTimeoutEnforced`,验证即使上游 ctx 没 deadline,downloadToTemp 自己也会在 `o.timeout` 后超时 +- 不影响 K8s 资源 + +# Acceptance Criteria + +- [ ] downloadToTemp 在 `o.timeout` 内必返回(即使上游 ctx 没 deadline) +- [ ] 新增测试覆盖该路径 +- [ ] `go test ./pkg/operator/operatorhub/...` 全绿 +- [ ] 现有 `TestDownloadToTemp_ContextCancel` 保持通过 + +# Work Log + +- 2026-05-18: code review 发现 by 3 个 agent(security/architecture/performance) + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- 相关:Release It! "remote calls must assume they will fail" diff --git a/todos/002-pending-p2-password-leak-via-violet-stream.md b/todos/002-pending-p2-password-leak-via-violet-stream.md new file mode 100644 index 0000000..4346cc3 --- /dev/null +++ b/todos/002-pending-p2-password-leak-via-violet-stream.md @@ -0,0 +1,86 @@ +--- +name: password-leak-via-violet-stream +description: MaskCommand 只 mask 父进程自己的 log,violet 子进程的 stderr/stdout 直通 console + 错误包装链,原始密码可能泄露 +status: pending +priority: p2 +issue_id: "002" +tags: [code-review, security, p2] +dependencies: [] +--- + +# Problem Statement + +`MaskCommand` 在 `violet.go:92` 仅对 **父进程自己 log 出来的 argv** 做 mask(替换 `--password` 后的 token 为 `***`)。但 violet 子进程的 stdout/stderr 通过 `io.MultiWriter(os.Stdout, &stdoutBuf)` 实时直通父进程的 console 和 buffer(见 `pkg/exec/exec.go:70-71`),且 `wrapWithStderrTail` 把最后 20 行 stderr 包进 `result.Err`。 + +如果 violet 在 stderr 上打印自己解析过的 argv(usage line、`unknown flag --foo near --password XXX`、panic argv dump 等),明文密码会出现在两个泄露面: +1. 父进程 stdout/stderr → CI log +2. 上层错误链 → knative zap logger 顶层 "upgrade failed: violet push: exit status 1; stderr tail: ..." 输出 + +# Findings + +- **security-sentinel** S1(P2)+ S2(P2)双重命中,根因相同 +- 文件:`pkg/operator/operatorhub/violet.go:326-333`(调用层)、`pkg/exec/exec.go:67-86`(流转层) +- 现有 README 已警告 `ps auxe` 风险,但本路径绕过该缓解措施 +- 现有缓解:`MaskCommand` 仅作用于 `log.Infow("invoking violet", "cmd", MaskCommand(...))` 一处 + +# Proposed Solutions + +**Option A(推荐)**:在 `execVioletPush` 后处理 `result.Err.Error()` 与 stderr 字符串,把当前 `VIOLET_REGISTRY_PASSWORD` 的值替换为 `***`。 + +```go +func (o *Operator) execVioletPush(ctx context.Context, tgzPath string) error { + // ... + result := exec.RunCommand(ctx, exec.Command{ ... }) + if result.Err != nil { + result.Err = scrubSecretFromErr(result.Err, os.Getenv(EnvVioletRegistryPassword)) + } + return result.Err +} +``` + +并对 `os.Stdout` / `os.Stderr` 包装 `sanitizingWriter`,按需替换。 + +- 优点:从源头闭环,覆盖错误链 + 实时流 +- 缺点:~30 行新代码,需要测试覆盖 +- effort: Medium +- risk: Low + +**Option B**:等 violet 上游支持 `--password-stdin` 或 `--password-file`,从根本上消除 argv 暴露。 + +- 优点:消除根因,包括 `ps`/`/proc` 暴露面 +- 缺点:依赖外部仓库进度;OQ1 已确认当前 violet 不支持 +- effort: Large (外部依赖) +- risk: Low + +**Option C**:现状 + 文档加强;要求 violet 在 shared runner 上必须 `skipPush: true` 即不传 `--password`。 + +- 优点:零代码改动 +- 缺点:留下泄露窗口,依靠运维纪律 +- effort: Negligible +- risk: Medium-High + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/violet.go`(execVioletPush 后处理) +- 测试:构造一个 fake violet 脚本把 argv 回显到 stderr,断言上层错误中不含明文密码 +- 与 README "violet 依赖与运行环境" 段落同步 + +# Acceptance Criteria + +- [ ] 当 `VIOLET_REGISTRY_PASSWORD=secret` 设置且子进程在 stderr 回显该值时,`result.Err.Error()` 不含 `secret` +- [ ] 同条件下父进程 stdout/stderr 输出(CI log)不含 `secret` +- [ ] 新增测试用 fake-violet 脚本验证 +- [ ] README 的"shared runner 风险"段落更新到反映新的兜底 + +# Work Log + +- 2026-05-18: code review 发现 by security-sentinel + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- README §violet 依赖与运行环境(已声明 ps 暴露面) diff --git a/todos/003-pending-p2-sha256-should-be-required-for-http.md b/todos/003-pending-p2-sha256-should-be-required-for-http.md new file mode 100644 index 0000000..9cbdffa --- /dev/null +++ b/todos/003-pending-p2-sha256-should-be-required-for-http.md @@ -0,0 +1,96 @@ +--- +name: sha256-should-be-required-for-http +description: ExpectedSha256 为可选字段,HTTP 明文 + 无校验时,on-path 攻击者可注入任意 .tgz 进而以 OLM 权限在 cpaas-system 部署恶意 operator +status: pending +priority: p2 +issue_id: "003" +tags: [code-review, security, p2] +dependencies: [] +--- + +# Problem Statement + +`Version.ExpectedSha256` 在 `pkg/config/config.go:106-109` 是可选字段,空字符串 → 跳过验证(`violet.go:111-114`)。 + +默认部署模式中 `packagePrefix` 是 `http://` 明文: +```yaml +packagePrefix: http://package-minio.alauda.cn:9199/packages/ +# expectedSha256: 留空 +``` + +任何 on-path 攻击者(DNS 投毒 / ARP 投毒 / 上游 MinIO 镜像被污染)可投递恶意 .tgz;violet 把它推成 ArtifactVersion,OLM 把它 install 进 `cpaas-system`,命名空间内 cluster-admin 权限的 operator 代码就在集群里跑起来了。 + +README 中文段落已警告该风险,但代码不强制。 + +# Findings + +- **security-sentinel** S3(P2)独立命中 +- 文件:`pkg/operator/operatorhub/violet.go:111-114`、`pkg/config/config.go:106-109` +- 现有缓解:仅 README 文档警告 +- 攻击成本:低(on-path attacker on CI subnet);影响:critical(cluster compromise) + +# Proposed Solutions + +**Option A(推荐)**:`Config.Validate()` 在 LoadConfig 阶段检查:当 `packagePrefix` 以 `http://` 开头时,所有 `versions[*].expectedSha256` 必须非空且符合 hex64 格式。 + +```go +func (c *Config) Validate() error { + if v := c.OperatorConfig.Violet; v != nil && strings.HasPrefix(v.PackagePrefix, "http://") { + for _, p := range c.UpgradePaths { + for _, ver := range p.Versions { + if ver.ExpectedSha256 == "" { + return fmt.Errorf("expectedSha256 is required for version %q when packagePrefix uses HTTP", ver.BundleVersion) + } + } + } + } + return nil +} +``` + +- 优点:失败立即可见(config 加载时);HTTPS+sha 路径不受影响;显式策略 +- 缺点:会让已有 HTTP-without-sha 的 config 报错——需要发版说明 +- effort: Small +- risk: Low(向后不兼容,但有意为之) + +**Option B**:默认强制 `expectedSha256` 必填(不区分 scheme);HTTPS 路径也强制。 + +- 优点:最强保证 +- 缺点:HTTPS 路径其实有 TLS 已经做 integrity,强制 sha 收益边际;迁移成本大 +- effort: Small +- risk: Medium(破坏现有 config) + +**Option C**:保留可选 + 启动时 WARN log 提示 sha 未设置。 + +- 优点:零破坏性 +- 缺点:log 容易被忽略,攻击窗口仍开 +- effort: Negligible +- risk: Medium-High + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/config/config.go`(新增 Validate),`cmd/upgrade_command.go`(调用 Validate) +- 测试:扩展 config 测试,覆盖 HTTP+sha 缺失报错、HTTPS+sha 缺失放行 +- 配合 todo 010(VioletConfig.Validate 总入口)一并实现更顺手 + +# Acceptance Criteria + +- [ ] `Config.Validate()` 实现并在 `LoadConfig` 之后被调用 +- [ ] HTTP prefix + 缺 sha 时 LoadConfig 失败,错误信息明确指向 bundleVersion +- [ ] HTTPS prefix + 缺 sha 时(暂不强制)通过 +- [ ] README 与 docs/plans/ 中的 config 示例同步 +- [ ] 单元测试覆盖正反向 + +# Work Log + +- 2026-05-18: code review 发现 by security-sentinel + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- README §violet 依赖与运行环境 +- 关联 todo: 010(VioletConfig.Validate) diff --git a/todos/004-pending-p2-package-manifest-substring-csv-match.md b/todos/004-pending-p2-package-manifest-substring-csv-match.md new file mode 100644 index 0000000..857d94a --- /dev/null +++ b/todos/004-pending-p2-package-manifest-substring-csv-match.md @@ -0,0 +1,78 @@ +--- +name: package-manifest-substring-csv-match +description: waitPackageManifest 用 strings.Contains 匹配 CSV 名,v0.65.0 会被 v0.65.0-rc1 / v0.65.0.bak 错误命中,导致 Subscription 绑到错误版本 +status: pending +priority: p2 +issue_id: "004" +tags: [code-review, quality, correctness, p2] +dependencies: [] +--- + +# Problem Statement + +`waitPackageManifest` 在 `pkg/operator/operatorhub/artifact_versiong.go:120-122` 用 substring 匹配 CSV 名: + +```go +csvName, _, _ := unstructured.NestedString(entryMap, "name") +if strings.Contains(csvName, csv) { + return true, nil +} +``` + +`csv` 来自 `av.status.version`,典型值如 `tektoncd-operator.v0.65.0`。如果 PackageManifest 同时存在 `tektoncd-operator.v0.65.0-rc1`、`tektoncd-operator.v0.65.0.bak`、`tektoncd-operator.v0.65.0-canary`,`Contains` 全部命中——首个出现的就会被当成等待目标返回 OK。后续 `InstallSubscription` 拿到 `csv=v0.65.0` 也会被 OLM 正确解析,但等待信号已不可靠。 + +# Findings + +- **silent-failure-hunter** F2(HIGH)独立命中 +- 文件:`pkg/operator/operatorhub/artifact_versiong.go:106-122` +- 三处 `_, _, _` 同时丢弃了 NestedSlice / NestedString 的错误返回,schema drift 也被静默吞掉 + +# Proposed Solutions + +**Option A(推荐)**:用精确匹配(或带 `.` 边界的 prefix-match),并把 NestedString 的 err 传出来。 + +```go +csvName, found, err := unstructured.NestedString(entryMap, "name") +if err != nil { + return false, fmt.Errorf("entry .name has wrong type in PackageManifest %s: %w", o.name, err) +} +if found && csvName == csv { + return true, nil +} +``` + +- 优点:精确 + 错误显式 +- 缺点:如果 OLM 在 CSV 名上加版本后缀(极少见),需要重新评估匹配策略 +- effort: Small +- risk: Low + +**Option B**:保留 substring 但加 `.` 边界检查:`strings.HasPrefix(csvName, csv+".")` 兜底,避免 v0.65.0 命中 v0.65.0-rc1。 + +- 优点:兼容历史命名怪癖 +- 缺点:仍然不精确 +- effort: Small +- risk: Low-Medium + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/artifact_versiong.go` +- 测试:添加单元测试覆盖 (a) 精确匹配命中 (b) `v0.65.0` 不命中 `v0.65.0-rc1` (c) NestedString 类型错误返回 + +# Acceptance Criteria + +- [ ] CSV 名匹配从 substring 改为精确(或带边界的 prefix-match) +- [ ] NestedString 错误不再被静默丢弃 +- [ ] 新增单元测试覆盖三种场景 +- [ ] 现有测试通过 + +# Work Log + +- 2026-05-18: code review 发现 by silent-failure-hunter + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 diff --git a/todos/005-pending-p2-wait-av-no-retry-on-transient-errors.md b/todos/005-pending-p2-wait-av-no-retry-on-transient-errors.md new file mode 100644 index 0000000..b43ca8b --- /dev/null +++ b/todos/005-pending-p2-wait-av-no-retry-on-transient-errors.md @@ -0,0 +1,84 @@ +--- +name: wait-av-no-retry-on-transient-errors +description: waitArtifactVersionPresent 把任何 Get 错误(包括 NotFound 与 5xx/429/timeout)直接终结轮询,apiserver 短暂抖动就杀掉整个升级 +status: pending +priority: p2 +issue_id: "005" +tags: [code-review, reliability, correctness, p2] +dependencies: [] +--- + +# Problem Statement + +`pkg/operator/operatorhub/artifact_versiong.go:75-78`: + +```go +obj, err := o.client.Resource(artifactVersionGVR).Namespace(systemNamespace).Get(ctx, name, metav1.GetOptions{}) +if err != nil { + return false, err +} +``` + +两个问题: +1. **NotFound**:violet push 完成到 ArtifactVersion 在 etcd 可见有个最终一致性窗口;此处 NotFound 不应该终止轮询,应当继续等。 +2. **transient errors**:apiserver leader election、webhook timeout、429(throttle)、5xx、连接 reset——都会立即把 `wait.PollUntilContextTimeout` 杀掉,触发"莫名其妙的升级失败",需要手动重跑。 + +# Findings + +- **silent-failure-hunter** F3(HIGH)独立命中 +- 文件:`pkg/operator/operatorhub/artifact_versiong.go:72-93` +- 影响:CI 升级假阴性、可靠性下降、对工具的信心受损 + +# Proposed Solutions + +**Option A(推荐)**:分类错误,NotFound 与 transient 继续轮询。 + +```go +obj, err := o.client.Resource(...).Get(ctx, name, metav1.GetOptions{}) +switch { +case errors.IsNotFound(err): + return false, nil +case errors.IsServerTimeout(err) || errors.IsTooManyRequests(err) || errors.IsServiceUnavailable(err): + log.Warnw("transient API error while polling AV, will retry", "err", err) + return false, nil +case err != nil: + return false, fmt.Errorf("get AV %s: %w", name, err) +} +``` + +- 优点:贴近 K8s 客户端语义;保留 timeout 作为最终 backstop +- 缺点:需要补测试覆盖每个分支 +- effort: Small +- risk: Low + +**Option B**:所有 Get 错误统一吞为 `(false, nil)`,由 `o.timeout` 兜底。 + +- 优点:极简 +- 缺点:掩盖了真正的非 transient 错误(如 RBAC 拒绝),timeout 后 user 拿不到根因 +- effort: Negligible +- risk: Medium + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/artifact_versiong.go:72-93` +- 同样的模式可顺便审视:`waitPackageManifest` 已经对 NotFound 单独处理(line 98),但其他 transient 没有,建议一并对齐 + +# Acceptance Criteria + +- [ ] NotFound 视为继续轮询 +- [ ] ServerTimeout / TooManyRequests / ServiceUnavailable 视为继续轮询并打 WARN log +- [ ] RBAC denied 等永久错误立即返回 +- [ ] 新增单元测试用 reactor 注入各类错误验证 + +# Work Log + +- 2026-05-18: code review 发现 by silent-failure-hunter + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- K8s client-go errors package diff --git a/todos/006-pending-p2-download-close-error-swallowed.md b/todos/006-pending-p2-download-close-error-swallowed.md new file mode 100644 index 0000000..3d60e34 --- /dev/null +++ b/todos/006-pending-p2-download-close-error-swallowed.md @@ -0,0 +1,88 @@ +--- +name: download-close-error-swallowed +description: downloadToTemp 的 defer f.Close() 吞掉 Close 返回的 flush 错误,磁盘满或 NFS 重置时静默写出截断的 .tgz +status: pending +priority: p2 +issue_id: "006" +tags: [code-review, quality, correctness, p2] +dependencies: [] +--- + +# Problem Statement + +`pkg/operator/operatorhub/violet.go:246-256`: + +```go +f, err := os.Create(filePath) +// ... +defer f.Close() + +if _, err := io.Copy(f, resp.Body); err != nil { + cleanup() + return "", nil, fmt.Errorf("copy body to %s: %w", filePath, err) +} + +return filePath, cleanup, nil +``` + +许多文件系统在 `Close()` 时才真正 flush write buffer(不是 `Write()`)。如果磁盘恰好在最后一段 flush 时撑爆(ENOSPC)/ NFS 服务器重置 / quota exceeded,`io.Copy` 返回 success,函数返回 "success" 但落地的是截断/损坏的 .tgz。 + +如果 `ExpectedSha256` 未设置(todo 003 默认场景),违法包直接进入 violet push,产生一个完全指错方向的 "violet push: ..." 错误(实际根因是磁盘)。 + +# Findings + +- **silent-failure-hunter** F4(HIGH)独立命中 +- 文件:`pkg/operator/operatorhub/violet.go:246-256` +- 与 todo 003(SHA 强制)相互独立但相互补强 + +# Proposed Solutions + +**Option A(推荐)**:显式 close-and-check 替代 defer。 + +```go +if _, err := io.Copy(f, resp.Body); err != nil { + f.Close() + cleanup() + return "", nil, fmt.Errorf("copy body to %s: %w", filePath, err) +} +if err := f.Close(); err != nil { + cleanup() + return "", nil, fmt.Errorf("close %s after download: %w", filePath, err) +} +``` + +- 优点:直击根因;与 io.Copy 错误路径并列,可读性好 +- 缺点:失去 defer 的 panic-safe 兜底(但本函数无 panic 风险点) +- effort: Small +- risk: Low + +**Option B**:用 `defer func() { if cerr := f.Close(); cerr != nil && err == nil { err = cerr } }()` 模式,让命名返回参数捕获 Close 错误。 + +- 优点:保持 defer 的对称性 +- 缺点:需要把返回签名改为命名返回,可读性略降 +- effort: Small +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/violet.go:246-256` +- 测试:mock `os.File` 不容易;可用 `httptest` 配合 `t.TempDir()` 做正向测试。负向测试(Close 失败)可暂缓——主要靠 code review 保证不退化 + +# Acceptance Criteria + +- [ ] Close 返回的错误会被传播为函数返回值 +- [ ] 错误信息明确指向 `close %s after download` 阶段 +- [ ] 现有 `TestDownloadToTemp_Success` / `_404` / `_5xx` / `_DefaultFilename` 保持通过 + +# Work Log + +- 2026-05-18: code review 发现 by silent-failure-hunter + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- 关联 todo: 003(SHA 强制可在配置层兜底,但 Close 修复仍是独立必要) diff --git a/todos/007-pending-p2-empty-csv-fed-to-subscription.md b/todos/007-pending-p2-empty-csv-fed-to-subscription.md new file mode 100644 index 0000000..6b3a3ec --- /dev/null +++ b/todos/007-pending-p2-empty-csv-fed-to-subscription.md @@ -0,0 +1,91 @@ +--- +name: empty-csv-fed-to-subscription +description: operator.go:117 的 `csv, _, _ := NestedString(av.Object, "status", "version")` 完全不检查 csv 是否为空就传给 InstallSubscription,造成下游莫名失败 +status: pending +priority: p2 +issue_id: "007" +tags: [code-review, quality, correctness, p2] +dependencies: [] +--- + +# Problem Statement + +`pkg/operator/operatorhub/operator.go:117-122`: + +```go +csv, _, _ := unstructured.NestedString(av.Object, "status", "version") +channel := version.Channel +if channel == "" { + channel = "stable" // default fallback +} +if err := o.InstallSubscription(ctx, csv, channel); err != nil { +``` + +`csv` 取值返回三元组 `(value, found, err)` 全部被丢弃;如果 AV 的 `status.version` 类型错(schema drift / controller bug 写了 null)或字段不存在,`csv` 就是空字符串,被原样喂给 `InstallSubscription`,后者远远地以一个"no matching CSV"风格的错误失败,根因离失败点很远。 + +`installViaViolet` 内部对 `csv` 已经有 `!found || csv == ""` 的强检查(violet.go:192-195),返回了带 AV 名字的明确错误。但 **`UpgradeOperator` 的第二次提取没复用前面的强校验**,相当于在同一条调用链上做了两次相同操作,第二次完全裸奔。 + +# Findings + +- **silent-failure-hunter** F5(MEDIUM)独立命中 +- 文件:`pkg/operator/operatorhub/operator.go:117` +- 同一字段在 `violet.go:192` 已经强校验过 + +# Proposed Solutions + +**Option A(推荐)**:让 `InstallArtifactVersion` 把 `csv` 也作为返回值传出来,避免 caller 二次提取。 + +```go +func (o *Operator) InstallArtifactVersion(ctx context.Context, version config.Version) (*unstructured.Unstructured, string, error) { + // ... + return av, csv, nil +} + +// UpgradeOperator: +av, csv, err := o.InstallArtifactVersion(ctx, version) +``` + +- 优点:单一来源,杜绝二次提取漂移 +- 缺点:签名变更,影响接口兼容性 +- effort: Small +- risk: Low + +**Option B**:在 `UpgradeOperator` 内复用 NestedString 但走强校验。 + +```go +csv, found, err := unstructured.NestedString(av.Object, "status", "version") +if err != nil { + return fmt.Errorf("AV %s status.version has wrong type: %w", av.GetName(), err) +} +if !found || csv == "" { + return fmt.Errorf("AV %s status.version is empty", av.GetName()) +} +``` + +- 优点:签名不变 +- 缺点:与 `installViaViolet` 内部的同样代码重复 +- effort: Negligible +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/operator.go:109-127`、`pkg/operator/operatorhub/artifact_versiong.go:18` +- 测试:现有 unit 覆盖偏少;可加 fake-client 注入 status.version=null 的 AV,断言 UpgradeOperator 在 AV 阶段就报错 + +# Acceptance Criteria + +- [ ] 空字符串 / 类型错的 `status.version` 在 `UpgradeOperator` 入口就报错(带 AV 名字) +- [ ] 不再有"InstallSubscription with empty csv"的可能路径 +- [ ] 单元测试覆盖 + +# Work Log + +- 2026-05-18: code review 发现 by silent-failure-hunter + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 diff --git a/todos/008-pending-p2-channel-default-inconsistency.md b/todos/008-pending-p2-channel-default-inconsistency.md new file mode 100644 index 0000000..dffc964 --- /dev/null +++ b/todos/008-pending-p2-channel-default-inconsistency.md @@ -0,0 +1,80 @@ +--- +name: channel-default-inconsistency +description: BuildPackageURL 对空 channel fail-fast,但 UpgradeOperator 紧接着的 InstallSubscription 把空 channel 静默回退成 "stable",同一调用链两套策略 +status: pending +priority: p2 +issue_id: "008" +tags: [code-review, quality, correctness, p2] +dependencies: [] +--- + +# Problem Statement + +同一字段 `version.Channel` 在同一次升级流程中被两个 caller 用截然相反的策略处理: + +1. `violet.go:55-56` `BuildPackageURL` ——空 channel **直接报错**:`"channel is empty (Version.Channel is required when using violet)"` +2. `operator.go:118-121` `UpgradeOperator` ——空 channel **静默回退到 "stable"**: + ```go + channel := version.Channel + if channel == "" { + channel = "stable" // default fallback + } + ``` + +当前因为 `installViaViolet` 跑在前面,空 channel 早就被拦在 BuildPackageURL 阶段,所以 (2) 是 dead branch。但只要未来有任何非 violet 的 install 路径(或 violet 路径分支早返回),(2) 立刻生效——用户在 YAML 里手抖打成 `chanel:` 或者忘填,Subscription 静默走 stable channel,**升级在错误的 channel 上"成功"**,后果是装错 bundle。 + +# Findings + +- **silent-failure-hunter** F7(MEDIUM)+ **type-design-analyzer** T5(关于 Channel 字段语义)双重命中 +- 文件:`pkg/operator/operatorhub/operator.go:118-121`、`pkg/operator/operatorhub/violet.go:55-56` +- 类别:同一字段、同一调用链上的策略不一致 + +# Proposed Solutions + +**Option A(推荐)**:移除 operator.go 的 silent fallback;与 BuildPackageURL 对齐到 "channel 必填,空即报错"。 + +```go +if version.Channel == "" { + return fmt.Errorf("version.channel is required") +} +if err := o.InstallSubscription(ctx, csv, version.Channel); err != nil { ... } +``` + +或者更进一步,在 `Config.Validate()`(参见 todo 010)阶段提前拦下。 + +- 优点:单一策略;fail-fast;与 todo 010 天然合流 +- 缺点:现有 config 如果省略 channel 会立即失败(实际上必然已经在 BuildPackageURL 失败了,所以是 no-op 用户感知) +- effort: Negligible +- risk: Low + +**Option B**:反向对齐——`BuildPackageURL` 也回退到 stable。 + +- 优点:宽容 +- 缺点:把 BuildPackageURL 的 fail-fast 设计意图打掉;rejected +- effort: Negligible +- risk: Medium(弱化错误检测) + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/operator.go:118-121` +- 配合 todo 010(VioletConfig.Validate)一并实现,把 `channel != ""` 提到 config 加载阶段 + +# Acceptance Criteria + +- [ ] 移除 operator.go:118-121 的 silent "stable" fallback +- [ ] 空 channel 在 `Config.Validate()` 或 `UpgradeOperator` 入口报错 +- [ ] 错误信息明确指向 bundleVersion + Channel 字段 +- [ ] 现有测试不退化 + +# Work Log + +- 2026-05-18: code review 发现 by silent-failure-hunter + type-design-analyzer + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- 关联 todo: 010(VioletConfig.Validate 总入口) diff --git a/todos/009-pending-p2-skippush-bool-pointer-overmodel.md b/todos/009-pending-p2-skippush-bool-pointer-overmodel.md new file mode 100644 index 0000000..81eadd6 --- /dev/null +++ b/todos/009-pending-p2-skippush-bool-pointer-overmodel.md @@ -0,0 +1,86 @@ +--- +name: skippush-bool-pointer-overmodel +description: VioletConfig.SkipPush *bool 用三态(nil/*false/*true)模拟"unset=true 默认",但实际只用到二态;切成 inverted bool 可省掉 defaultConfig 分支和测试的 pointer-to-bool 样板 +status: pending +priority: p2 +issue_id: "009" +tags: [code-review, quality, type-design, p2] +dependencies: [] +--- + +# Problem Statement + +`pkg/config/config.go:74-77`: + +```go +// SkipPush controls whether `--skip-push` is passed to `violet push`. +// Pointer so we can distinguish "unset" (treated as true) from "explicit +// false" (private-registry scenario that wants violet to also push images). +SkipPush *bool `yaml:"skipPush,omitempty"` +``` + +三个站点参与同一个 dance: +- 结构体字段(`*bool`) +- `defaultConfig` 在 `config.go:147-152` 设默认 +- 调用点 `violet.go:320-323` 解引用 + +三态的唯一收益是区分"用户没写"与"用户写了 false"。本 PR **并未利用** 这个区分——`defaultConfig` 把 nil 也变成 `*true`,于是 caller 看到的只有 `*true` / `*false` 两态。 + +# Findings + +- **code-simplicity-reviewer** C3 + **type-design-analyzer** T1(SkipPush 评分 clarity 2/5、test 2/5)独立命中 +- 文件:`pkg/config/config.go:74-77, 147-152`、`pkg/operator/operatorhub/violet.go:320-323`、测试 `violet_test.go` 多处需要 `t := true; cfg.SkipPush = &t` + +# Proposed Solutions + +**Option A(推荐)**:反转字段,零值即安全默认。 + +```go +// PushImages 控制是否让 violet 把 bundle 内的镜像也推到 registry。 +// 默认 false(仅做 OLM bundle push,--skip-push 启用);私有 registry +// 场景显式设为 true 再配合 PushArgs / 环境变量传凭证。 +PushImages bool `yaml:"pushImages,omitempty"` +``` + +```go +// violet.go: +if !o.violet.PushImages { + args = append(args, flagSkipPush) +} +``` + +- 优点:零值即安全默认;删除 `defaultConfig` 的 SkipPush 分支;测试不再 `t := true`;YAML 行为不变(omitempty) +- 缺点:字段重命名属于 config 不兼容变更,需要 release notes 同步 +- effort: Small +- risk: Low + +**Option B**:保留 `*bool` 但加注释说明"目前 nil/*true 行为等价、保留 *bool 仅为未来扩展"。 + +- 优点:零代码改动 +- 缺点:把不必要的复杂度推给未来——而 PR 1 评审刚刚才删掉一组"为将来准备"的常量 +- effort: Negligible +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/config/config.go`、`pkg/operator/operatorhub/violet.go`、`pkg/operator/operatorhub/violet_test.go`、README config 示例 +- YAML 字段重命名:`skipPush` → `pushImages`,发版说明需要写 + +# Acceptance Criteria + +- [ ] 配置字段从 `SkipPush *bool` 改为 `PushImages bool`(或保留并加上注释,看 triage 决定) +- [ ] `defaultConfig` 的 SkipPush 分支被清除(若选 A) +- [ ] 测试用例不再使用 `t := true; cfg.SkipPush = &t` 样板 +- [ ] README config 示例同步 + +# Work Log + +- 2026-05-18: code review 发现 by code-simplicity-reviewer + type-design-analyzer + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 diff --git a/todos/010-pending-p2-violet-config-validate-method.md b/todos/010-pending-p2-violet-config-validate-method.md new file mode 100644 index 0000000..53ee34f --- /dev/null +++ b/todos/010-pending-p2-violet-config-validate-method.md @@ -0,0 +1,103 @@ +--- +name: violet-config-validate-method +description: VioletConfig 缺少集中校验入口;packagePrefix 空、bin 路径错、operatorhub+violet=nil 等失败都拖到 InstallArtifactVersion 才暴露 +status: pending +priority: p2 +issue_id: "010" +tags: [code-review, type-design, architecture, p2] +dependencies: [] +--- + +# Problem Statement + +VioletConfig 的校验目前分布在三个站点: +- `defaultConfig`(pkg/config/config.go:147-152)—— SkipPush 默认值 +- `BuildPackageURL`(violet.go:49-62)—— PackagePrefix / Channel / BundleVersion 空字符串检查 +- `validateVioletBin`(violet.go:340-355)—— Bin 路径形态 + +`OperatorConfig.Type == "operatorhub"` 且 `Violet == nil` 的非法组合,则要拖到 `InstallArtifactVersion`(artifact_versiong.go:19-21)才报错——用户跑了一半升级才发现 config 没写完。 + +# Findings + +- **type-design-analyzer** T2 + T6 + **architecture-strategist** A7 三方独立命中 +- 类别:失败发现窗口推迟(config 加载 → 实际跑到那一步) + +# Proposed Solutions + +**Option A(推荐)**:在 `pkg/config/config.go` 添加: + +```go +func (c *Config) Validate() error { + if c.OperatorConfig.Type == "operatorhub" && c.OperatorConfig.Violet == nil { + return fmt.Errorf("operatorConfig.violet is required when type=operatorhub") + } + if v := c.OperatorConfig.Violet; v != nil { + if err := v.Validate(); err != nil { + return err + } + } + // Per-version 校验 + for _, p := range c.UpgradePaths { + for _, ver := range p.Versions { + if err := ver.Validate(c.OperatorConfig.Violet); err != nil { + return err + } + } + } + return nil +} + +func (v *VioletConfig) Validate() error { + if v.PackagePrefix == "" { + return fmt.Errorf("violet.packagePrefix is required") + } + if v.Bin != "" { + if err := validateVioletBin(v.Bin); err != nil { + return err + } + } + return nil +} +``` + +并在 `LoadConfig` 末尾调用。这一并兜住: +- todo 003 的 HTTP+sha 强制(嵌入 Version.Validate) +- todo 008 的 channel 必填(嵌入 Version.Validate) +- A7 的 violet 必填(顶层) + +- 优点:单一中央入口;config 加载就 fail-fast +- 缺点:~50 行新代码 + 测试 +- effort: Medium +- risk: Low + +**Option B**:保留分布式校验现状。 + +- 优点:零改动 +- 缺点:失败时机延迟、debug 成本高、违反 Ousterhout "deep modules" 倾向(窄接口 + 集中行为) +- effort: Negligible +- risk: Medium-Long-term + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/config/config.go`、`cmd/upgrade_command.go`(在 LoadConfig 后调 Validate) +- 与 todos 003、008、011、016 自然合并 + +# Acceptance Criteria + +- [ ] `Config.Validate()` 实现并被 `LoadConfig` 或 cmd 调用 +- [ ] 所有校验失败在 CLI 启动初期就报错,不留到 InstallArtifactVersion +- [ ] 单元测试覆盖 happy path + 每个失败分支 +- [ ] 调用点的旁路校验(BuildPackageURL 等)保留为 defense-in-depth + +# Work Log + +- 2026-05-18: code review 发现 by type-design-analyzer + architecture-strategist + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- 关联 todos: 003、008、011、016 diff --git a/todos/011-pending-p2-drop-violet-bin-and-validator.md b/todos/011-pending-p2-drop-violet-bin-and-validator.md new file mode 100644 index 0000000..6fe9dd0 --- /dev/null +++ b/todos/011-pending-p2-drop-violet-bin-and-validator.md @@ -0,0 +1,64 @@ +--- +name: drop-violet-bin-and-validator +description: VioletConfig.Bin + validateVioletBin 是 YAGNI——无现有 caller 设 Bin,自定义路径已经被 $PATH 标准机制满足;删掉可省 ~75 LoC +status: pending +priority: p2 +issue_id: "011" +tags: [code-review, simplicity, yagni, p2] +dependencies: [] +--- + +# Problem Statement + +两个相关字段/函数纯粹为彼此而生: +- `VioletConfig.Bin` (config.go:64-65) +- `validateVioletBin` (violet.go:340-355) + 5 个 sub-test(violet_test.go:237-282,~46 行) + +仓库内 zero caller 设 `Bin`;如果用户想用自定义 violet 路径,`PATH=$HOME/bin:$PATH violet` 是标准 shell 解决方案。`validateVioletBin` 的 "must be absolute" 规则反而阻止了合理的 `./bin/violet` 相对路径开发场景。 + +# Findings + +- **code-simplicity-reviewer** C1 + C2 双重命中(同一根因) +- 可删 LoC 总计 ~75(~30 prod + ~46 test) + +# Proposed Solutions + +**Option A(推荐)**:删 `Bin` 字段 + `validateVioletBin` + 相关测试 + 调用点。直接 `bin := "violet"`。 + +- 优点:~75 LoC 减负、config 表面收缩、消除 todo 010 中 Validate 的一个分支 +- 缺点:未来如果真有"自定义路径"需求需要再加回——按 YAGNI 原则那是未来该解决的问题 +- effort: Small +- risk: Low + +**Option B**:保留 Bin 字段、删 validateVioletBin(让 os/exec 自己报错)。 + +- 优点:保留扩展点 +- 缺点:保留了无人使用的字段;validateVioletBin 的"absolute path 强制"消失也是好事,但 Bin 自己仍是 dead config +- effort: Small +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/config/config.go`、`pkg/operator/operatorhub/violet.go`、`pkg/operator/operatorhub/violet_test.go`、README +- 与 todo 010(Validate 入口)有交集——若选 A,VioletConfig.Validate 不再需要 Bin 分支 + +# Acceptance Criteria + +- [ ] `VioletConfig.Bin` 字段移除(若选 A) +- [ ] `validateVioletBin` 函数 + 测试移除(若选 A) +- [ ] `execVioletPush` 简化为 `bin := "violet"` +- [ ] README config 示例同步 +- [ ] `go test ./pkg/operator/operatorhub/...` 全绿 + +# Work Log + +- 2026-05-18: code review 发现 by code-simplicity-reviewer + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- CLAUDE.md "简洁优先" §7 diff --git a/todos/012-pending-p2-url-scheme-allowlist.md b/todos/012-pending-p2-url-scheme-allowlist.md new file mode 100644 index 0000000..d2160ee --- /dev/null +++ b/todos/012-pending-p2-url-scheme-allowlist.md @@ -0,0 +1,74 @@ +--- +name: url-scheme-allowlist +description: BuildPackageURL / downloadToTemp 不校验 URL scheme,配置被污染时(多租户 pipeline、PR 注入)可走 SSRF 触达 169.254.169.254 等元数据服务 +status: pending +priority: p2 +issue_id: "012" +tags: [code-review, security, defense-in-depth, p2] +dependencies: [] +--- + +# Problem Statement + +`BuildPackageURL` 不检查 scheme;`downloadToTemp` 用 `http.DefaultClient.Do(req)` 跟随默认最多 10 次重定向,无 `CheckRedirect`。 + +攻击场景: +1. 多租户 pipeline / 被污染的 PR / 供应链注入 → 攻击者控制 `config.yaml` → 设 `packagePrefix: http://169.254.169.254/latest/meta-data/iam/security-credentials` → 请求云元数据端点。返回的 IAM token 落入临时文件(后续被 cleanup 删除,但请求本身已发生),在 IMDSv1 环境造成凭证签发或泄露。 +2. 攻击者在外部 server 上配 redirect 链探测内网(虽然 Go 默认 transport 拒 `file://`,但 HTTP 跳板足以扫描内部 IP)。 + +"config 是可信的"——这是个假设,不是固有性质;它会随多租户 pipeline / GitOps 演进而漂移。Defense-in-depth 是廉价兜底。 + +# Findings + +- **security-sentinel** S4(P2)独立命中 +- 文件:`pkg/operator/operatorhub/violet.go:49-62`(BuildPackageURL)、`pkg/operator/operatorhub/violet.go:228-244`(downloadToTemp) + +# Proposed Solutions + +**Option A(推荐)**:在 `BuildPackageURL` 或 `downloadToTemp` 解析 URL 时校验 scheme。 + +```go +parsed, err := url.Parse(rawURL) +if err != nil { return ..., err } +if parsed.Scheme != "http" && parsed.Scheme != "https" { + return ..., fmt.Errorf("unsupported scheme %q (only http/https allowed)", parsed.Scheme) +} +``` + +可选叠加自定义 `http.Client.CheckRedirect`:限制 hops <= 3,每跳重新校验 scheme。 + +- 优点:~5 行;零运行时开销;闭合 SSRF 主要面 +- 缺点:仍不能拦截"http→内网 IP"——但那是另一个问题(host allowlist),属本 PR 范围外 +- effort: Small +- risk: Low + +**Option B**:再加 host allowlist(如必须以 `package-minio.alauda.cn` 结尾)。 + +- 优点:闭合内网探测面 +- 缺点:跨环境的 MinIO host 变化大,难以静态配置;放进 config 又回到信任问题 +- effort: Medium +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/violet.go` +- 测试:扩展 `TestBuildPackageURL` 覆盖 `file://` / `ftp://` / 空 scheme 等 + +# Acceptance Criteria + +- [ ] 非 http/https scheme 在 URL 构建或下载阶段被拒 +- [ ] 错误信息明确说明 scheme 限制 +- [ ] 单元测试覆盖 + +# Work Log + +- 2026-05-18: code review 发现 by security-sentinel + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- OWASP SSRF cheat sheet diff --git a/todos/013-pending-p3-silent-error-cleanup.md b/todos/013-pending-p3-silent-error-cleanup.md new file mode 100644 index 0000000..d788bd8 --- /dev/null +++ b/todos/013-pending-p3-silent-error-cleanup.md @@ -0,0 +1,86 @@ +--- +name: silent-error-cleanup +description: 多处 `_, _, _` 丢弃 unstructured.Nested* 的 err、cleanup 丢弃 RemoveAll 错误、两条 sentinel 错误没有 stage 标签——单个都不致命,叠在一起拉低可调试性 +status: pending +priority: p3 +issue_id: "013" +tags: [code-review, quality, observability, p3] +dependencies: [] +--- + +# Problem Statement + +一组小的 "silent failure" 模式集合,单个都是 P3,但合在一起影响 debug 体验: + +**A. `status, _, _ := unstructured.NestedMap(...)` 把类型错误也吞掉** (`artifact_versiong.go:80-81`) +若 AV `status` 字段不是 map(schema drift / controller bug),Go 读 nil map 不 panic,但 polling 看不到 "Present" 就轮询到 `o.timeout`,最终给用户的是 `context deadline exceeded`——根本没说 "status 类型错"。 + +**B. `cleanup := func() { _ = os.RemoveAll(dir) }`** (`violet.go:214`) +CI runner 长时间 /tmp 撑爆时,留下的孤儿 dir 累积,将来 mkdir 失败时已无线索。最小成本:log.Warnw 即可。 + +**C. Sentinel 错误没有 stage 标签** (`violet.go:188, 194`) +其他 stage 都是 `fmt.Errorf("violet push: %w", ...)` 这样带短标签的 wrap;spec.tag mismatch 和 status.version empty 这两条仅返回数据级断言,grep 失败根因时找不到统一标签。 + +# Findings + +- F1 (silent-failure) + F6 (silent-failure) + A5 (architecture) 三处独立命中 +- 同一类别(silent / 不可 grep)的不同表现形式 + +# Proposed Solutions + +**A**:把 NestedMap 的 err 也接住,类型错立即返回硬错误: + +```go +status, found, err := unstructured.NestedMap(obj.Object, "status") +if err != nil { + return false, fmt.Errorf("AV %s status is malformed: %w", name, err) +} +if !found { return false, nil } +``` + +**B**:closure 捕获 log,best-effort cleanup 加 WARN: + +```go +cleanup := func() { + if err := os.RemoveAll(dir); err != nil { + log.Warnw("failed to remove temp dir; manual cleanup may be needed", "dir", dir, "err", err) + } +} +``` + +**C**:两条 sentinel 加 stage 标签: + +```go +return nil, fmt.Errorf("cross-check spec.tag: AV %s has spec.tag=%q, expected %q ...", ...) +return nil, fmt.Errorf("read status.version: AV %s status.version is empty", avName) +``` + +- effort: Small(三处都是 1-3 行) +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件: + - `pkg/operator/operatorhub/artifact_versiong.go:74-93`(A) + - `pkg/operator/operatorhub/violet.go:214`(B) + - `pkg/operator/operatorhub/violet.go:188, 194`(C) +- 测试:补一个 NestedMap 返回非 map 的负向用例 + +# Acceptance Criteria + +- [ ] (A) NestedMap 错误传播 +- [ ] (B) cleanup 失败有 WARN log +- [ ] (C) 两条 sentinel 错误均带 stage 标签 +- [ ] 现有测试不退化 + +# Work Log + +- 2026-05-18: code review 合并 finding F1+F6+A5 + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 diff --git a/todos/014-pending-p3-security-defense-in-depth.md b/todos/014-pending-p3-security-defense-in-depth.md new file mode 100644 index 0000000..7605093 --- /dev/null +++ b/todos/014-pending-p3-security-defense-in-depth.md @@ -0,0 +1,92 @@ +--- +name: security-defense-in-depth +description: 三个低概率但廉价可关闭的安全空缺——VIOLET_* 允许列表过宽、删 AV 前未校验 ownerRef、下载无 Content-Length 上限 +status: pending +priority: p3 +issue_id: "014" +tags: [code-review, security, defense-in-depth, p3] +dependencies: [] +--- + +# Problem Statement + +三个独立但都属于 "defense-in-depth" 的小空缺: + +**A. `VIOLET_*` 允许列表过宽** (`violet.go:299-305`) +当前 allowlist 用 `VIOLET_*` 前缀通配。攻击模型:未来 violet 引入 `VIOLET_AWS_SESSION_TOKEN` 之类的 env-recognized 名字,或者环境里被人故意命名 `VIOLET_GITHUB_TOKEN`,全部自动透传。 + +**B. `deleteArtifactVersionIfExists` 不校验 ownerRef** (`violet.go:265-292`) +`avName = .`,两者都是 config 控制。如果 `bundleVersion` 拼写或 artifact 命名异常导致与他人 AV 名称冲撞,本工具会无差别删之。 + +**C. `io.Copy(f, resp.Body)` 无大小上限** (`violet.go:253`) +被污染的 MinIO 可以无限流式写 `/tmp` 直到撑爆 tmpfs。todo 001 加了 timeout 后影响降低,但仍是一个独立的失控点。 + +# Findings + +- S5 + S7 + P2(performance Content-Length) 独立命中 +- 现实威胁强度:低;修复成本:低 + +# Proposed Solutions + +**A**:把 allowlist 收窄到精确名字。 + +```go +var violetEnvAllowlist = []string{ + "KUBECONFIG", "PATH", "HOME", "USER", + EnvVioletRegistryUsername, + EnvVioletRegistryPassword, +} +``` + +后续如 violet 引入新的支持 env,**显式**追加。 + +**B**:删前校验 `metadata.labels["cpaas.io/artifact-version"] == o.artifact` 或 ownerRef 指向预期 Artifact。 + +```go +existing, _ := o.GetResource(ctx, name, systemNamespace, artifactVersionGVR) +if existing != nil { + if labels := existing.GetLabels(); labels["cpaas.io/artifact-version"] != o.artifact { + return fmt.Errorf("refusing to delete AV %s: label cpaas.io/artifact-version=%q does not match %q", + name, labels["cpaas.io/artifact-version"], o.artifact) + } + // ... proceed with delete +} +``` + +**C**:`io.LimitReader` 加一个慷慨上限: + +```go +const maxBundleBytes = 2 << 30 // 2 GiB +if resp.ContentLength > maxBundleBytes { + return ..., fmt.Errorf("bundle too large: %d", resp.ContentLength) +} +io.Copy(f, io.LimitReader(resp.Body, maxBundleBytes)) +``` + +- effort: A=Negligible / B=Small / C=Small +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/violet.go:299-305, 265-292, 253` +- A 部分需要测试覆盖 `VIOLET_X` 不在 allowlist 时被过滤(依赖 pkg/exec 的 matchAllowlist 行为,已有测试) +- B 部分需要 fake client 测试覆盖 label 不匹配 → refuse + +# Acceptance Criteria + +- [ ] A: VIOLET_* glob 收窄为精确名字列表 +- [ ] B: deleteArtifactVersionIfExists 在 label/ownerRef 不匹配时拒绝并报错 +- [ ] C: 加大小上限,超限直接报错;io.Copy 用 LimitReader 包裹 +- [ ] 单元测试覆盖 + +# Work Log + +- 2026-05-18: code review 合并 finding S5+S7+P2(perf) + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 diff --git a/todos/015-pending-p3-code-organization-and-constants.md b/todos/015-pending-p3-code-organization-and-constants.md new file mode 100644 index 0000000..26c7255 --- /dev/null +++ b/todos/015-pending-p3-code-organization-and-constants.md @@ -0,0 +1,68 @@ +--- +name: code-organization-and-constants +description: 一组组织/常量化小问题——内联字面量、文件中段 var、test helper 残留、deleteAV 多余 Get、Version 全量传入——单个都琐碎,合起来是文件可读性议题 +status: pending +priority: p3 +issue_id: "015" +tags: [code-review, quality, organization, p3] +dependencies: [] +--- + +# Problem Statement + +一组组织性细节,单个都不重要,但合起来值得在一次小 cleanup commit 里处理: + +**A. `violetEnvAllowlist` 是 var 放在文件中段** (`violet.go:299-305`) +读者搜"violet 看得到哪些 env"会找不到——按 CLAUDE.md 的"常量集中在文件顶部"惯例,应当与文件顶部的 `EnvVioletRegistry*` 和 `flag*` 常量块放在一起。 + +**B. 内联字面量** (`violet.go:210, 220`) +`"upgrade-violet-*"`(temp dir 模式)和 `"package.tgz"`(fallback 文件名)都是有调试意义的常量,应当 hoist 到文件顶部。 + +**C. 测试 dead weight** (`violet_test.go:408-426`) +`schemaGVR` 局部类型 + `listKinds` map + `_ = listKinds` placeholder 全是给"以后扩展"留的——目前不用。fake client 已经通过 `scheme.AddKnownTypeWithName` 学到了 list kind。 + +**D. `deleteArtifactVersionIfExists` 多了一次预 Get** (`violet.go:268-282`) +预 Get 只是为了日志带 uid + happy-path 早退;Delete 本身已经 tolerate NotFound,poll loop 也已等到 NotFound。简化后可少一次往返 + 删一个测试用例。 + +**E. `installViaViolet` 接收 `config.Version` 全量** (`violet.go:142`) +只用 3 个字段(Channel、BundleVersion、ExpectedSha256),但暴露 6 个;窄接口能避免未来 PR 3 / PR 4 引入更多无关字段。可项目化为局部 `bundleSpec` struct。 + +# Findings + +- A1 + A2 + C6 + C7 + A3 + C5 合并 + +# Proposed Solutions + +每项独立、依次实施,建议放在 PR 3 的 cleanup commit 里: + +- **A**:把 `var violetEnvAllowlist` 移到文件顶 `const (...)` 旁 +- **B**:`const (tempDirPattern = "upgrade-violet-*"; fallbackPackageFilename = "package.tgz")` +- **C**:删 `schemaGVR` 类型 + `listKinds` + `_ = listKinds` +- **D**:去掉预 Get,直接 Delete + poll,可顺手删 `TestDeleteArtifactVersionIfExists_ConcurrentRaceVanishesGracefully`(与 no-residue case 等价) +- **E**:引入 `type bundleSpec struct { Channel, BundleVersion, ExpectedSha256 string }`,让 `InstallArtifactVersion` 做投影 + +- effort: 各 Small +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/operator/operatorhub/violet.go`、`pkg/operator/operatorhub/violet_test.go` +- 建议作为 PR 3 cleanup commit 一并打包,避免单独的小 PR + +# Acceptance Criteria + +- [ ] A-E 各项独立勾选 +- [ ] 现有测试不退化 + +# Work Log + +- 2026-05-18: code review 合并 finding A1+A2+C6+C7+A3+C5 + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- CLAUDE.md "硬约束" §"新加 GVR 时统一在 operator.go 顶部声明" diff --git a/todos/016-pending-p3-field-format-validation.md b/todos/016-pending-p3-field-format-validation.md new file mode 100644 index 0000000..a870d38 --- /dev/null +++ b/todos/016-pending-p3-field-format-validation.md @@ -0,0 +1,80 @@ +--- +name: field-format-validation +description: ExpectedSha256 缺 hex64 regex 校验、PushArgs 没有 credential-flag 黑名单——典型 "字段格式正确"但 "字段内容危险"的两个空缺 +status: pending +priority: p3 +issue_id: "016" +tags: [code-review, type-design, security, p3] +dependencies: [] +--- + +# Problem Statement + +两个独立但同源("字段值合法性校验")的小问题: + +**A. `ExpectedSha256` 无格式校验** (`config.go:106-109`) +用户写 `expectedSha256: "abc"` 不会在加载时报错;要等到 `VerifySha256` 真正下载完文件,做 hex 比对失败,才能拿到"sha256 mismatch"——可能是合法 mismatch 也可能是用户手抖,混在一起。规则:64 个 hex 字符,regex `^[0-9a-fA-F]{64}$`。 + +**B. `PushArgs []string` 是凭证黑洞** (`config.go:79-83`) +README 已经明文要求"凭证必须走 env 不进 config",但 `PushArgs` 完全自由——粗心 caller 把 `--password foo` 塞进去就绕过了所有 mask 逻辑(实际 MaskCommand 仍能匹配 `--password`,但 PushArgs 进 YAML 后会留下凭证文件,这是文档层的违例,不是技术层)。 + +# Findings + +- T4 + T3 独立命中 +- 与 todo 010(VioletConfig.Validate)自然合流 + +# Proposed Solutions + +**A**:在 `Version.Validate()`(或 `Config.Validate()`)中加 regex 校验: + +```go +var sha256HexRe = regexp.MustCompile(`^[0-9a-fA-F]{64}$`) + +func (v Version) Validate() error { + if v.ExpectedSha256 != "" && !sha256HexRe.MatchString(v.ExpectedSha256) { + return fmt.Errorf("version %s: expectedSha256 must be 64 hex chars, got %q", v.BundleVersion, v.ExpectedSha256) + } + return nil +} +``` + +**B**:在 `VioletConfig.Validate()` 扫描 `PushArgs`: + +```go +forbidden := []string{"--password", "-p", "--username", "-u"} +for _, arg := range v.PushArgs { + for _, f := range forbidden { + if arg == f { + return fmt.Errorf("credentials must come from env vars, not pushArgs; remove %q", f) + } + } +} +``` + +- effort: Small +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/config/config.go` +- 与 todo 010(Validate 入口)合流 +- 测试覆盖每个失败分支 + +# Acceptance Criteria + +- [ ] A: 非 hex64 的 ExpectedSha256 在 LoadConfig 失败 +- [ ] B: PushArgs 含 `--password` / `--username` 等凭证 flag 时 LoadConfig 失败 +- [ ] 单元测试覆盖 + +# Work Log + +- 2026-05-18: code review 合并 finding T3+T4 + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- 关联 todo: 010 diff --git a/todos/017-pending-p3-env-allowlist-threat-model-doc.md b/todos/017-pending-p3-env-allowlist-threat-model-doc.md new file mode 100644 index 0000000..48fd6fe --- /dev/null +++ b/todos/017-pending-p3-env-allowlist-threat-model-doc.md @@ -0,0 +1,73 @@ +--- +name: env-allowlist-threat-model-doc +description: pkg/exec.Command.EnvAllowlist 默认 nil=全量透传是兼容默认;本身未变更但 PR 让它成为安全边界——值得在包文档里写清威胁模型,方便未来 caller 不重蹈覆辙 +status: pending +priority: p3 +issue_id: "017" +tags: [code-review, documentation, security, p3] +dependencies: [] +--- + +# Problem Statement + +`pkg/exec.Command.EnvAllowlist`(本 PR 未改动,但 violet 流程依赖它)的默认 nil/empty 行为是 **全量透传 os.Environ()**——为了兼容旧的 testCommand caller。当前 violet 路径正确地显式设了 allowlist `["KUBECONFIG", "PATH", "HOME", "USER", "VIOLET_*"]`,但这个"必须设"的纪律只在 code review 里口口相传。 + +未来新加一个 `exec.RunCommand` 调用点的同事可能忘记设 allowlist,子进程立即继承所有 CI secret(GITHUB_TOKEN / AWS_* / NPM_TOKEN ...)——这是一个静默退化路径。 + +# Findings + +- **type-design-analyzer** T7 独立命中 +- 仅文档层;不修改逻辑 + +# Proposed Solutions + +**Option A(推荐)**:在 `pkg/exec/doc.go` 或 `exec.go` 顶部加包级文档,写明: + +``` +Package exec wraps os/exec.CommandContext with stdio capture + tee, and an +opt-in environment allowlist. + +THREAT MODEL: callers that exec third-party binaries (e.g. uploaders, +registries) SHOULD set Command.EnvAllowlist to restrict which host env vars +reach the child process. The default (empty allowlist == full passthrough) +exists for test-harness callers that intentionally inherit the full +environment (e.g. `make test` running under CI with bespoke env wiring). + +NEW CALL SITES: prefer setting EnvAllowlist; the empty default is for +backward compatibility only. +``` + +- 优点:零运行时改动;为未来新 caller 提供 norm +- 缺点:不能强制 +- effort: Negligible +- risk: Zero + +**Option B**:再加一个 `RunStrict` 函数(`EnvAllowlist []string` 必填)作为推荐入口;保留 `RunCommand` 为兼容性 API。 + +- 优点:API 引导 +- 缺点:双入口面,违反"窄接口" +- effort: Small +- risk: Low + +# Recommended Action + +(待 triage) + +# Technical Details + +- 影响文件:`pkg/exec/exec.go`(或新增 `doc.go`) +- 不影响行为 + +# Acceptance Criteria + +- [ ] `pkg/exec` 包有顶部包文档说明 EnvAllowlist 威胁模型 +- [ ] CLAUDE.md 项目根部约束(可选):新 RunCommand 调用点必须明示 EnvAllowlist 决策 + +# Work Log + +- 2026-05-18: code review 发现 by type-design-analyzer + +# Resources + +- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 +- 类似设计:Go x/exp/slog 早期对 default Handler 的文档化做法 From 18f2e9b4780e383e79f6b1f43cba755f616fbe59 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 20:51:51 +0800 Subject: [PATCH 07/15] fix(config): separate MinIO URL channel segment from OLM Subscription channel Real-cluster integration testing on 40-devops surfaced a design defect: the single Version.Channel field was used both as the OLM Subscription channel and as the MinIO URL path segment, but the two are independent on platform packages. On the live cluster: - tektoncd-operator OLM channels are: latest, pipelines-4.0, pipelines-4.2, pipelines-4.6, stable - MinIO URL segments are: v4.0, v4.2, v4.6, rc A user filling `channel: stable` (which is correct for Subscription) made BuildPackageURL produce ".../tektoncd-operator/stable/...tgz" which is 404, and the violet flow could never download anything. Fix: - Add Version.PackageChannel for the MinIO segment. - Channel keeps its OLM Subscription meaning (unchanged for callers). - Version.EffectivePackageChannel() returns PackageChannel when set, falling back to Channel for repositories where they coincide. - installViaViolet now calls BuildPackageURL with EffectivePackageChannel() instead of Channel. - README documents the distinction explicitly and gives both fields in the example config. - New table test TestVersionEffectivePackageChannel covers precedence, fallback, both-empty, and divergent-segments cases. Verified by HEAD-checking the MinIO endpoint: http://.../tektoncd-operator/stable/...v4.0.17.tgz -> 404 http://.../tektoncd-operator/v4.0/...v4.0.17.tgz -> 200 --- README.md | 7 ++++--- pkg/config/config.go | 24 +++++++++++++++++++++++- pkg/operator/operatorhub/violet.go | 2 +- pkg/operator/operatorhub/violet_test.go | 23 +++++++++++++++++++++++ 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 1586ade..2a5e4f8 100644 --- a/README.md +++ b/README.md @@ -106,8 +106,9 @@ upgradePaths: # 定义升级路径,可以包含多个 testCommand: | TAGS=@prepare-17.8 GODOG_ARGS="--godog.format=allure" make test bundleVersion: v17.8.10 - channel: stable - # expectedSha256: a3f... # 可选;非空时强制校验下载的 .tgz + channel: stable # OLM Subscription.spec.channel + # packageChannel: v17 # MinIO URL 段;与 OLM channel 不同时填,省略则 fallback 到 channel + # expectedSha256: a3f... # 可选;非空时强制校验下载的 .tgz - name: v17.11 # 版本名称 testCommand: | TAGS=@upgrade-17.11 GODOG_ARGS="--godog.format=allure --bdd.cleanup=false" make test @@ -119,7 +120,7 @@ upgradePaths: # 定义升级路径,可以包含多个 - **`operatorConfig.violet` 必填**。upgrade CLI 不再在 Go 代码里直接创建 `Artifact`/`ArtifactVersion` CR,而是把这步外包给 `violet` 二进制。`operatorConfig.violet` 为空时 `InstallArtifactVersion` 会立即返回配置错误。 - **`packagePrefix` 没有默认值**。MinIO 根地址跨环境(私有 / 共享 / 区域镜像)不同,CLI 拒绝硬编码任何值。 -- **`.tgz` URL 拼接约定**:`///.latest.ALL..tgz`。已验证可同时覆盖 `v4.6` 大版本通道和 `rc` 滚动通道两种实际形态。 +- **`.tgz` URL 拼接约定**:`///.latest.ALL..tgz`。`packageChannel` 是 MinIO 仓库的路径段(如 `v4.0` / `v4.6` / `rc`),**与 OLM Subscription 的 `channel`(如 `stable` / `pipelines-4.0`)不是同一个概念**。当两者字面相同时(少数情况)可只填 `channel`,CLI 会自动 fallback;当两者不同(tektoncd 这类)必须显式填 `packageChannel`。 - **`expectedSha256` 强烈建议为外网下载场景填上**。HTTP 明文 + DNS 投毒可能注入恶意 tgz,违 violet 会带集群凭证把恶意 bundle 上架——内网 MinIO 也建议加防。 ### 构建测试镜像 diff --git a/pkg/config/config.go b/pkg/config/config.go index aaa80f5..eec4465 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -101,14 +101,36 @@ type Version struct { TestCommand string `yaml:"testCommand,omitempty"` // testSubPath is the path to the test sub-directory, default is "testing" TestSubPath string `yaml:"testSubPath,omitempty"` - // revision is the revision to use for the version + // Channel is the OLM Subscription channel (e.g. "stable", + // "pipelines-4.0", "latest"). Used verbatim as Subscription.spec.channel + // when InstallSubscription installs the operator. Channel string `yaml:"channel,omitempty"` + // PackageChannel is the MinIO repository path segment used by the violet + // download URL — it is NOT the same as the OLM Channel. For example, on + // the platform-tektoncd-operator the OLM channel is "stable" but the + // MinIO segment is "v4.0" / "v4.2" / "rc" / etc. + // + // When empty, BuildPackageURL falls back to Channel for backward + // compatibility with repositories where the two coincide. Set this + // explicitly when the two differ. + PackageChannel string `yaml:"packageChannel,omitempty"` // ExpectedSha256, when non-empty, is the lowercase hex SHA-256 of the // downloaded .tgz. The upgrade CLI verifies the digest after download and // fails fast on mismatch. Optional; leave empty to skip verification. ExpectedSha256 string `yaml:"expectedSha256,omitempty"` } +// EffectivePackageChannel returns the MinIO URL segment to use for this +// version: PackageChannel when set, falling back to Channel when not. This +// keeps simple cases (where OLM channel name == MinIO segment) ergonomic +// while still letting callers split the two when they diverge. +func (v Version) EffectivePackageChannel() string { + if v.PackageChannel != "" { + return v.PackageChannel + } + return v.Channel +} + // LoadConfig loads the configuration from a YAML file func LoadConfig(path string) (*Config, error) { data, err := os.ReadFile(path) diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index ad27516..fc007d9 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -148,7 +148,7 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) return nil, "", fmt.Errorf("get artifact %s: %w", o.artifact, err) } - url, err := BuildPackageURL(o.violet.PackagePrefix, o.name, version.Channel, version.BundleVersion) + url, err := BuildPackageURL(o.violet.PackagePrefix, o.name, version.EffectivePackageChannel(), version.BundleVersion) if err != nil { return nil, "", fmt.Errorf("build package url: %w", err) } diff --git a/pkg/operator/operatorhub/violet_test.go b/pkg/operator/operatorhub/violet_test.go index 43501b6..303d733 100644 --- a/pkg/operator/operatorhub/violet_test.go +++ b/pkg/operator/operatorhub/violet_test.go @@ -12,6 +12,7 @@ import ( "testing" "time" + "github.com/AlaudaDevops/upgrade-test/pkg/config" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -19,6 +20,28 @@ import ( "k8s.io/client-go/dynamic/fake" ) +func TestVersionEffectivePackageChannel(t *testing.T) { + tests := []struct { + name string + channel string + packageChannel string + want string + }{ + {"packageChannel wins when set", "stable", "v4.0", "v4.0"}, + {"falls back to channel when packageChannel empty", "stable", "", "stable"}, + {"both empty returns empty (caller fails loudly downstream)", "", "", ""}, + {"different segments stay independent", "pipelines-4.6", "rc", "rc"}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + v := config.Version{Channel: tc.channel, PackageChannel: tc.packageChannel} + if got := v.EffectivePackageChannel(); got != tc.want { + t.Errorf("EffectivePackageChannel() = %q, want %q", got, tc.want) + } + }) + } +} + func TestBuildPackageURL(t *testing.T) { tests := []struct { name string From 554ff6e40e949ef08aacabbba3b846c7a5c25a63 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 22:54:13 +0800 Subject: [PATCH 08/15] feat(violet): platformAddress/clusters/force fields + platform creds via env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-cluster integration testing on 40-devops surfaced four missing pieces that PR 2 had punted into Violet.PushArgs as a workaround. This commit promotes each one to a first-class part of the schema (or the env-based credential protocol) so production configs no longer have to thread sensitive flags through an opaque string list. (1) Violet.PlatformAddress — required ACP URL the violet binary authenticates against. Lives in config (not a credential). Empty value surfaces at violet exec time, not silently. (2) Violet.Clusters — target subcluster(s) for the Artifact/AV write. violet defaults to "global", but multi-cluster ACP deployments expose a per-workload subcluster (kubectl is connected through /kubernetes/) and the two must match. Without it, violet reports "updated successfully" while the CRs never appear in the cluster the upgrade CLI is watching — a silent assumed-success that only surfaces minutes later as a wait timeout. (3) Violet.Force *bool, defaulting to true — without --force, violet aborts AV upserts with "already exist, skip it" and creates nothing. Verified on tektoncd-operator v4.0.17: same input that succeeded with --force silently no-op'd without it. The pointer keeps the door open for users who genuinely want to preserve a stale AV, but the default matches every real-world flow we have. (4) VIOLET_PLATFORM_USERNAME / VIOLET_PLATFORM_PASSWORD environment variables — auto-injected as --platform-username / --platform-password so platform credentials follow the same env-only contract as registry credentials. MaskCommand now redacts both --password and --platform-password (via a sensitivePasswordFlags map). OS-level argv inspection remains the documented residual risk. Mechanical changes: - BuildVioletPushArgs refactored to take a VioletPushParams struct rather than a growing positional argument list. Keeps violet.go decoupled from the config package (caller dereferences the *bool defaults before populating the struct). - execVioletPush populates VioletPushParams from o.violet, applying the SkipPush=true / Force=true defaults at the boundary via a tiny derefBoolDefault helper. - defaultConfig fills Violet.Force=true when nil, mirroring the existing SkipPush=true default. - TestBuildVioletPushArgs rewritten as a 10-case table covering every new knob, both creds env-var pairs, force on/off, push-args ordering, and the full real-cluster shape. TestMaskCommand grows two cases covering --platform-password redaction alone and the combined registry+platform password mask. README and CLAUDE.md document the new fields, the dual credential env protocol, and the "Force defaults true" / "Clusters required on multi- cluster ACP" hard constraints. --- CLAUDE.md | 4 +- README.md | 28 ++++-- pkg/config/config.go | 30 ++++++- pkg/operator/operatorhub/violet.go | 108 ++++++++++++++++++------ pkg/operator/operatorhub/violet_test.go | 107 ++++++++++++++++------- 5 files changed, 213 insertions(+), 64 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 016413a..2daff63 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -61,7 +61,9 @@ Artifact 名称约定:未显式给 `artifact` 字段时,自动拼成 ` -export VIOLET_REGISTRY_PASSWORD= +export VIOLET_PLATFORM_USERNAME=admin@cpaas.io +export VIOLET_PLATFORM_PASSWORD= +# 仅私有 push 场景需要的额外两行: +# export VIOLET_REGISTRY_USERNAME= +# export VIOLET_REGISTRY_PASSWORD= + ./upgrade --config upgrade.yaml ``` -upgrade CLI 检测到非空时会自动追加到 `violet push` 的 argv,并在日志渲染时把 `--password` 之后的值替换成 `***`。 +upgrade CLI 检测到非空时自动追加到 `violet push` 的 argv,日志渲染时把 `--password` 和 `--platform-password` 之后的值替换成 `***`。 ⚠️ **共享 CI runner 安全警告**:当前 violet 不支持 stdin / 文件方式读凭证,`--password` 只能进 argv。OS 级 `ps auxe` / `/proc//cmdline` / `strace` 都能看到明文密码。**多租户共享 runner 上禁用 `skipPush: false`**,私有 push 场景务必使用独占的 pipeline runner / 独占的 OS 用户账号。 diff --git a/pkg/config/config.go b/pkg/config/config.go index eec4465..9da7df8 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -77,10 +77,32 @@ type VioletConfig struct { SkipPush *bool `yaml:"skipPush,omitempty"` // PushArgs are extra arguments appended verbatim to `violet push`, used to - // inject options such as --dest-repo, --plain, --image-pull-secret, --force + // inject options such as --dest-repo, --plain, --image-pull-secret // in private-registry scenarios. Credentials must come from environment - // variables (VIOLET_REGISTRY_USERNAME / VIOLET_REGISTRY_PASSWORD), not here. + // variables (VIOLET_REGISTRY_USERNAME / VIOLET_REGISTRY_PASSWORD, + // VIOLET_PLATFORM_USERNAME / VIOLET_PLATFORM_PASSWORD), not here. PushArgs []string `yaml:"pushArgs,omitempty"` + + // PlatformAddress is the ACP platform URL violet authenticates against + // (e.g. https://devops-env1-hcvt43--idp.alaudatech.net). Required in + // real-cluster integrations — violet refuses to start without it. Not a + // credential so it lives in config rather than env. + PlatformAddress string `yaml:"platformAddress,omitempty"` + + // Clusters names the target subcluster(s) for the Artifact/AV write — + // violet defaults to "global", but multi-cluster ACP deployments expose + // a per-workload subcluster (e.g. "devops") that kubectl is also pointed + // at. A mismatch silently writes to the wrong place: violet reports + // "updated successfully" while the CRs never appear in the cluster the + // upgrade CLI is watching. + Clusters string `yaml:"clusters,omitempty"` + + // Force defaults to true. Without --force violet aborts AV upserts with + // "already exist, skip it" — verified on tektoncd-operator v4.0.17 on + // 40-devops: the same input that succeeded with --force was a no-op + // without it, leaving wait AV Present to time out. Pointer so users can + // opt out (e.g. when a stale AV must be preserved exactly). + Force *bool `yaml:"force,omitempty"` } // UpgradePath represents a single upgrade path @@ -171,6 +193,10 @@ func defaultConfig(config *Config) *Config { t := true v.SkipPush = &t } + if v.Force == nil { + t := true + v.Force = &t + } } return config diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index fc007d9..8479d1c 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -26,20 +26,37 @@ import ( // Environment variables consumed when assembling `violet push`. They are // intentionally not part of OperatorConfig: pipelines inject them as secrets // and the upgrade CLI must not persist them to disk or echo them back. +// VIOLET_REGISTRY_* covers --skip-push=false push-to-registry flow; +// VIOLET_PLATFORM_* covers ACP API authentication for AV creation. const ( EnvVioletRegistryUsername = "VIOLET_REGISTRY_USERNAME" EnvVioletRegistryPassword = "VIOLET_REGISTRY_PASSWORD" + EnvVioletPlatformUsername = "VIOLET_PLATFORM_USERNAME" + EnvVioletPlatformPassword = "VIOLET_PLATFORM_PASSWORD" ) // violet CLI flag names. Kept private so the consuming code stays expressive // (`flagSkipPush` rather than the raw string everywhere). const ( flagSkipPush = "--skip-push" + flagForce = "--force" flagTargetCatalogSource = "--target-catalog-source" + flagClusters = "--clusters" flagUsername = "--username" flagPassword = "--password" + flagPlatformAddress = "--platform-address" + flagPlatformUsername = "--platform-username" + flagPlatformPassword = "--platform-password" ) +// sensitivePasswordFlags lists every violet flag whose immediately following +// argv token is a credential — MaskCommand redacts each of them in logs. +// argv-mode leakage to OS (ps auxe / /proc) remains as documented in the README. +var sensitivePasswordFlags = map[string]bool{ + flagPassword: true, + flagPlatformPassword: true, +} + // BuildPackageURL composes the .tgz URL by the agreed MinIO convention: // // ///.latest.ALL..tgz @@ -62,39 +79,68 @@ func BuildPackageURL(prefix, name, channel, bundleVersion string) (string, error return fmt.Sprintf("%s/%s/%s/%s.latest.ALL.%s.tgz", p, name, channel, name, bundleVersion), nil } +// VioletPushParams is the resolved, pointer-free shape of `violet push` +// inputs. The caller (execVioletPush) is responsible for dereferencing the +// *bool defaults in config.VioletConfig before populating this struct, so +// BuildVioletPushArgs remains a pure transform — it never touches the +// filesystem, never starts a process, and never reaches into config. +type VioletPushParams struct { + TgzPath string + SkipPush bool + Force bool + PlatformAddress string + Clusters string + PushArgs []string +} + // BuildVioletPushArgs assembles the argv for `violet push `. Credentials -// are read from EnvVioletRegistryUsername / EnvVioletRegistryPassword and -// injected as --username / --password when non-empty; pushArgs is appended -// verbatim. The function is a pure transform — it never touches the filesystem -// or starts a process — so callers can table-test the exact argv shape. +// are read from environment variables and injected when non-empty: // -// The decision of skipPush belongs to the caller (it has already deferenced -// VioletConfig.SkipPush and applied the "nil == true" default in config), so -// this function takes a plain bool. -func BuildVioletPushArgs(tgzPath string, skipPush bool, pushArgs []string) []string { - args := []string{"push", tgzPath, flagTargetCatalogSource, targetCatalogSource} - if skipPush { +// VIOLET_REGISTRY_USERNAME / _PASSWORD → --username / --password +// VIOLET_PLATFORM_USERNAME / _PASSWORD → --platform-username / --platform-password +// +// PushArgs is appended verbatim so unsupported flags can still be threaded +// through without a code change. +func BuildVioletPushArgs(p VioletPushParams) []string { + args := []string{"push", p.TgzPath, flagTargetCatalogSource, targetCatalogSource} + if p.SkipPush { args = append(args, flagSkipPush) } + if p.Force { + args = append(args, flagForce) + } + if p.PlatformAddress != "" { + args = append(args, flagPlatformAddress, p.PlatformAddress) + } + if p.Clusters != "" { + args = append(args, flagClusters, p.Clusters) + } if u := os.Getenv(EnvVioletRegistryUsername); u != "" { args = append(args, flagUsername, u) } - if p := os.Getenv(EnvVioletRegistryPassword); p != "" { - args = append(args, flagPassword, p) + if p2 := os.Getenv(EnvVioletRegistryPassword); p2 != "" { + args = append(args, flagPassword, p2) } - args = append(args, pushArgs...) + if u := os.Getenv(EnvVioletPlatformUsername); u != "" { + args = append(args, flagPlatformUsername, u) + } + if p2 := os.Getenv(EnvVioletPlatformPassword); p2 != "" { + args = append(args, flagPlatformPassword, p2) + } + args = append(args, p.PushArgs...) return args } // MaskCommand renders the command for logging, replacing the token following -// --password with `***`. This only protects log output — the credential is -// still visible to OS-level inspection (e.g. `ps auxe`) once the child process -// runs. The README must document that risk for shared CI runners. +// any sensitive-password flag (--password, --platform-password) with `***`. +// This only protects log output — the credential is still visible to OS-level +// inspection (e.g. `ps auxe`) once the child process runs. The README +// documents that risk for shared CI runners. func MaskCommand(name string, args []string) string { parts := make([]string, 0, len(args)+1) parts = append(parts, name) for i := 0; i < len(args); i++ { - if args[i] == flagPassword && i+1 < len(args) { + if sensitivePasswordFlags[args[i]] && i+1 < len(args) { parts = append(parts, args[i], "***") i++ continue @@ -319,8 +365,9 @@ var violetEnvAllowlist = []string{ } // execVioletPush runs `violet push ` as a child process. Credentials -// from VIOLET_REGISTRY_USERNAME / _PASSWORD are auto-injected into argv by -// BuildVioletPushArgs; the rendered command is mask-logged before exec. +// for the registry (VIOLET_REGISTRY_*) and ACP platform (VIOLET_PLATFORM_*) +// are auto-injected into argv by BuildVioletPushArgs; the rendered command +// is mask-logged before exec. func (o *Operator) execVioletPush(ctx context.Context, tgzPath string) error { log := logging.FromContext(ctx) @@ -331,11 +378,14 @@ func (o *Operator) execVioletPush(ctx context.Context, tgzPath string) error { return err } - skipPush := true - if o.violet.SkipPush != nil { - skipPush = *o.violet.SkipPush - } - args := BuildVioletPushArgs(tgzPath, skipPush, o.violet.PushArgs) + args := BuildVioletPushArgs(VioletPushParams{ + TgzPath: tgzPath, + SkipPush: derefBoolDefault(o.violet.SkipPush, true), + Force: derefBoolDefault(o.violet.Force, true), + PlatformAddress: o.violet.PlatformAddress, + Clusters: o.violet.Clusters, + PushArgs: o.violet.PushArgs, + }) log.Infow("invoking violet", "cmd", MaskCommand(bin, args)) @@ -347,6 +397,16 @@ func (o *Operator) execVioletPush(ctx context.Context, tgzPath string) error { return result.Err } +// derefBoolDefault returns *p when p is non-nil, otherwise the supplied +// default. Used to apply VioletConfig's "unset means on" semantics for +// SkipPush and Force at the call site of BuildVioletPushArgs. +func derefBoolDefault(p *bool, fallback bool) bool { + if p == nil { + return fallback + } + return *p +} + // validateVioletBin guards against accidentally executing a binary at an // arbitrary user-supplied path. When Violet.Bin is non-empty it must be an // absolute path to an executable regular file; otherwise return a typed diff --git a/pkg/operator/operatorhub/violet_test.go b/pkg/operator/operatorhub/violet_test.go index 303d733..6a92c88 100644 --- a/pkg/operator/operatorhub/violet_test.go +++ b/pkg/operator/operatorhub/violet_test.go @@ -104,53 +104,92 @@ func TestBuildPackageURL(t *testing.T) { func TestBuildVioletPushArgs(t *testing.T) { tests := []struct { name string - skipPush bool - pushArgs []string - envUser string - envPass string + params VioletPushParams + regUser string + regPass string + platUser string + platPass string want []string }{ { - name: "skip-push only, no creds, no extra args", - skipPush: true, - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push"}, + name: "minimal: skip-push + force defaults are explicit", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force"}, }, { - name: "skip-push false drops the flag", - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform"}, + name: "force=false drops the flag", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: false}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push"}, }, { - name: "env creds appended after skip-push", - skipPush: true, - envUser: "u", - envPass: "p", - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--username", "u", "--password", "p"}, + name: "skip-push=false drops the flag (push images path)", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: false, Force: true}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force"}, }, { - name: "only username present (rare, but accepted)", - envUser: "u", - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--username", "u"}, + name: "platform-address surfaces between force and creds", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true, PlatformAddress: "https://example.acp"}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--platform-address", "https://example.acp"}, }, { - name: "private-push flow: skip-push=false + pushArgs", - pushArgs: []string{"--dest-repo", "registry.private/devops", "--plain", "--force"}, - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--dest-repo", "registry.private/devops", "--plain", "--force"}, + name: "clusters list lands after platform-address", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true, PlatformAddress: "https://x", Clusters: "devops"}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--platform-address", "https://x", "--clusters", "devops"}, }, { - name: "push args land after credentials", - skipPush: true, - envUser: "u", - envPass: "p", - pushArgs: []string{"--force"}, - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--username", "u", "--password", "p", "--force"}, + name: "registry creds env auto-injected as --username/--password", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true}, + regUser: "u", + regPass: "p", + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--username", "u", "--password", "p"}, + }, + { + name: "platform creds env auto-injected as --platform-username/--platform-password", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true}, + platUser: "admin@cpaas.io", + platPass: "07Apples@", + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--platform-username", "admin@cpaas.io", "--platform-password", "07Apples@"}, + }, + { + name: "full real-cluster shape: address + clusters + both creds + push-args", + params: VioletPushParams{ + TgzPath: "/tmp/x.tgz", SkipPush: false, Force: true, + PlatformAddress: "https://40-devops", + Clusters: "devops", + PushArgs: []string{"--dest-repo", "registry.private/devops"}, + }, + regUser: "r-u", regPass: "r-p", + platUser: "p-u", platPass: "p-p", + want: []string{ + "push", "/tmp/x.tgz", "--target-catalog-source", "platform", + "--force", + "--platform-address", "https://40-devops", + "--clusters", "devops", + "--username", "r-u", "--password", "r-p", + "--platform-username", "p-u", "--platform-password", "p-p", + "--dest-repo", "registry.private/devops", + }, + }, + { + name: "only registry username present (no password) still injects", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true}, + regUser: "u", + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--username", "u"}, + }, + { + name: "pushArgs always land last so user overrides win", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true, PushArgs: []string{"--plain", "--image-pull-secret", "pull"}}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--plain", "--image-pull-secret", "pull"}, }, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - t.Setenv(EnvVioletRegistryUsername, tc.envUser) - t.Setenv(EnvVioletRegistryPassword, tc.envPass) - got := BuildVioletPushArgs("/tmp/x.tgz", tc.skipPush, tc.pushArgs) + t.Setenv(EnvVioletRegistryUsername, tc.regUser) + t.Setenv(EnvVioletRegistryPassword, tc.regPass) + t.Setenv(EnvVioletPlatformUsername, tc.platUser) + t.Setenv(EnvVioletPlatformPassword, tc.platPass) + got := BuildVioletPushArgs(tc.params) if !stringSliceEqual(got, tc.want) { t.Errorf("args mismatch:\n got: %v\nwant: %v", got, tc.want) } @@ -184,6 +223,16 @@ func TestMaskCommand(t *testing.T) { in: []string{"--password", "a", "--password", "b"}, want: "violet --password *** --password ***", }, + { + name: "--platform-password value masked too", + in: []string{"--platform-username", "admin@cpaas.io", "--platform-password", "07Apples@"}, + want: "violet --platform-username admin@cpaas.io --platform-password ***", + }, + { + name: "both --password and --platform-password masked in one command", + in: []string{"--username", "u", "--password", "rp", "--platform-username", "pu", "--platform-password", "pp"}, + want: "violet --username u --password *** --platform-username pu --platform-password ***", + }, } for _, tc := range tests { From a0547ba171dd51aa4fb743b164dc44a5a1f7f349 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 23:02:06 +0800 Subject: [PATCH 09/15] chore: drop P3 todos archived from PR #14 review The five P3 items captured nice-to-have polish (silent error cleanup, defense-in-depth security, organizational tweaks, field-format validation, env-allowlist threat-model doc). None of them are blocking and none have been scheduled, so retiring them keeps todos/ focused on the P2 list that will actually drive a follow-up cycle. --- todos/013-pending-p3-silent-error-cleanup.md | 86 ----------------- ...14-pending-p3-security-defense-in-depth.md | 92 ------------------- ...ding-p3-code-organization-and-constants.md | 68 -------------- .../016-pending-p3-field-format-validation.md | 80 ---------------- ...nding-p3-env-allowlist-threat-model-doc.md | 73 --------------- 5 files changed, 399 deletions(-) delete mode 100644 todos/013-pending-p3-silent-error-cleanup.md delete mode 100644 todos/014-pending-p3-security-defense-in-depth.md delete mode 100644 todos/015-pending-p3-code-organization-and-constants.md delete mode 100644 todos/016-pending-p3-field-format-validation.md delete mode 100644 todos/017-pending-p3-env-allowlist-threat-model-doc.md diff --git a/todos/013-pending-p3-silent-error-cleanup.md b/todos/013-pending-p3-silent-error-cleanup.md deleted file mode 100644 index d788bd8..0000000 --- a/todos/013-pending-p3-silent-error-cleanup.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -name: silent-error-cleanup -description: 多处 `_, _, _` 丢弃 unstructured.Nested* 的 err、cleanup 丢弃 RemoveAll 错误、两条 sentinel 错误没有 stage 标签——单个都不致命,叠在一起拉低可调试性 -status: pending -priority: p3 -issue_id: "013" -tags: [code-review, quality, observability, p3] -dependencies: [] ---- - -# Problem Statement - -一组小的 "silent failure" 模式集合,单个都是 P3,但合在一起影响 debug 体验: - -**A. `status, _, _ := unstructured.NestedMap(...)` 把类型错误也吞掉** (`artifact_versiong.go:80-81`) -若 AV `status` 字段不是 map(schema drift / controller bug),Go 读 nil map 不 panic,但 polling 看不到 "Present" 就轮询到 `o.timeout`,最终给用户的是 `context deadline exceeded`——根本没说 "status 类型错"。 - -**B. `cleanup := func() { _ = os.RemoveAll(dir) }`** (`violet.go:214`) -CI runner 长时间 /tmp 撑爆时,留下的孤儿 dir 累积,将来 mkdir 失败时已无线索。最小成本:log.Warnw 即可。 - -**C. Sentinel 错误没有 stage 标签** (`violet.go:188, 194`) -其他 stage 都是 `fmt.Errorf("violet push: %w", ...)` 这样带短标签的 wrap;spec.tag mismatch 和 status.version empty 这两条仅返回数据级断言,grep 失败根因时找不到统一标签。 - -# Findings - -- F1 (silent-failure) + F6 (silent-failure) + A5 (architecture) 三处独立命中 -- 同一类别(silent / 不可 grep)的不同表现形式 - -# Proposed Solutions - -**A**:把 NestedMap 的 err 也接住,类型错立即返回硬错误: - -```go -status, found, err := unstructured.NestedMap(obj.Object, "status") -if err != nil { - return false, fmt.Errorf("AV %s status is malformed: %w", name, err) -} -if !found { return false, nil } -``` - -**B**:closure 捕获 log,best-effort cleanup 加 WARN: - -```go -cleanup := func() { - if err := os.RemoveAll(dir); err != nil { - log.Warnw("failed to remove temp dir; manual cleanup may be needed", "dir", dir, "err", err) - } -} -``` - -**C**:两条 sentinel 加 stage 标签: - -```go -return nil, fmt.Errorf("cross-check spec.tag: AV %s has spec.tag=%q, expected %q ...", ...) -return nil, fmt.Errorf("read status.version: AV %s status.version is empty", avName) -``` - -- effort: Small(三处都是 1-3 行) -- risk: Low - -# Recommended Action - -(待 triage) - -# Technical Details - -- 影响文件: - - `pkg/operator/operatorhub/artifact_versiong.go:74-93`(A) - - `pkg/operator/operatorhub/violet.go:214`(B) - - `pkg/operator/operatorhub/violet.go:188, 194`(C) -- 测试:补一个 NestedMap 返回非 map 的负向用例 - -# Acceptance Criteria - -- [ ] (A) NestedMap 错误传播 -- [ ] (B) cleanup 失败有 WARN log -- [ ] (C) 两条 sentinel 错误均带 stage 标签 -- [ ] 现有测试不退化 - -# Work Log - -- 2026-05-18: code review 合并 finding F1+F6+A5 - -# Resources - -- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 diff --git a/todos/014-pending-p3-security-defense-in-depth.md b/todos/014-pending-p3-security-defense-in-depth.md deleted file mode 100644 index 7605093..0000000 --- a/todos/014-pending-p3-security-defense-in-depth.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -name: security-defense-in-depth -description: 三个低概率但廉价可关闭的安全空缺——VIOLET_* 允许列表过宽、删 AV 前未校验 ownerRef、下载无 Content-Length 上限 -status: pending -priority: p3 -issue_id: "014" -tags: [code-review, security, defense-in-depth, p3] -dependencies: [] ---- - -# Problem Statement - -三个独立但都属于 "defense-in-depth" 的小空缺: - -**A. `VIOLET_*` 允许列表过宽** (`violet.go:299-305`) -当前 allowlist 用 `VIOLET_*` 前缀通配。攻击模型:未来 violet 引入 `VIOLET_AWS_SESSION_TOKEN` 之类的 env-recognized 名字,或者环境里被人故意命名 `VIOLET_GITHUB_TOKEN`,全部自动透传。 - -**B. `deleteArtifactVersionIfExists` 不校验 ownerRef** (`violet.go:265-292`) -`avName = .`,两者都是 config 控制。如果 `bundleVersion` 拼写或 artifact 命名异常导致与他人 AV 名称冲撞,本工具会无差别删之。 - -**C. `io.Copy(f, resp.Body)` 无大小上限** (`violet.go:253`) -被污染的 MinIO 可以无限流式写 `/tmp` 直到撑爆 tmpfs。todo 001 加了 timeout 后影响降低,但仍是一个独立的失控点。 - -# Findings - -- S5 + S7 + P2(performance Content-Length) 独立命中 -- 现实威胁强度:低;修复成本:低 - -# Proposed Solutions - -**A**:把 allowlist 收窄到精确名字。 - -```go -var violetEnvAllowlist = []string{ - "KUBECONFIG", "PATH", "HOME", "USER", - EnvVioletRegistryUsername, - EnvVioletRegistryPassword, -} -``` - -后续如 violet 引入新的支持 env,**显式**追加。 - -**B**:删前校验 `metadata.labels["cpaas.io/artifact-version"] == o.artifact` 或 ownerRef 指向预期 Artifact。 - -```go -existing, _ := o.GetResource(ctx, name, systemNamespace, artifactVersionGVR) -if existing != nil { - if labels := existing.GetLabels(); labels["cpaas.io/artifact-version"] != o.artifact { - return fmt.Errorf("refusing to delete AV %s: label cpaas.io/artifact-version=%q does not match %q", - name, labels["cpaas.io/artifact-version"], o.artifact) - } - // ... proceed with delete -} -``` - -**C**:`io.LimitReader` 加一个慷慨上限: - -```go -const maxBundleBytes = 2 << 30 // 2 GiB -if resp.ContentLength > maxBundleBytes { - return ..., fmt.Errorf("bundle too large: %d", resp.ContentLength) -} -io.Copy(f, io.LimitReader(resp.Body, maxBundleBytes)) -``` - -- effort: A=Negligible / B=Small / C=Small -- risk: Low - -# Recommended Action - -(待 triage) - -# Technical Details - -- 影响文件:`pkg/operator/operatorhub/violet.go:299-305, 265-292, 253` -- A 部分需要测试覆盖 `VIOLET_X` 不在 allowlist 时被过滤(依赖 pkg/exec 的 matchAllowlist 行为,已有测试) -- B 部分需要 fake client 测试覆盖 label 不匹配 → refuse - -# Acceptance Criteria - -- [ ] A: VIOLET_* glob 收窄为精确名字列表 -- [ ] B: deleteArtifactVersionIfExists 在 label/ownerRef 不匹配时拒绝并报错 -- [ ] C: 加大小上限,超限直接报错;io.Copy 用 LimitReader 包裹 -- [ ] 单元测试覆盖 - -# Work Log - -- 2026-05-18: code review 合并 finding S5+S7+P2(perf) - -# Resources - -- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 diff --git a/todos/015-pending-p3-code-organization-and-constants.md b/todos/015-pending-p3-code-organization-and-constants.md deleted file mode 100644 index 26c7255..0000000 --- a/todos/015-pending-p3-code-organization-and-constants.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -name: code-organization-and-constants -description: 一组组织/常量化小问题——内联字面量、文件中段 var、test helper 残留、deleteAV 多余 Get、Version 全量传入——单个都琐碎,合起来是文件可读性议题 -status: pending -priority: p3 -issue_id: "015" -tags: [code-review, quality, organization, p3] -dependencies: [] ---- - -# Problem Statement - -一组组织性细节,单个都不重要,但合起来值得在一次小 cleanup commit 里处理: - -**A. `violetEnvAllowlist` 是 var 放在文件中段** (`violet.go:299-305`) -读者搜"violet 看得到哪些 env"会找不到——按 CLAUDE.md 的"常量集中在文件顶部"惯例,应当与文件顶部的 `EnvVioletRegistry*` 和 `flag*` 常量块放在一起。 - -**B. 内联字面量** (`violet.go:210, 220`) -`"upgrade-violet-*"`(temp dir 模式)和 `"package.tgz"`(fallback 文件名)都是有调试意义的常量,应当 hoist 到文件顶部。 - -**C. 测试 dead weight** (`violet_test.go:408-426`) -`schemaGVR` 局部类型 + `listKinds` map + `_ = listKinds` placeholder 全是给"以后扩展"留的——目前不用。fake client 已经通过 `scheme.AddKnownTypeWithName` 学到了 list kind。 - -**D. `deleteArtifactVersionIfExists` 多了一次预 Get** (`violet.go:268-282`) -预 Get 只是为了日志带 uid + happy-path 早退;Delete 本身已经 tolerate NotFound,poll loop 也已等到 NotFound。简化后可少一次往返 + 删一个测试用例。 - -**E. `installViaViolet` 接收 `config.Version` 全量** (`violet.go:142`) -只用 3 个字段(Channel、BundleVersion、ExpectedSha256),但暴露 6 个;窄接口能避免未来 PR 3 / PR 4 引入更多无关字段。可项目化为局部 `bundleSpec` struct。 - -# Findings - -- A1 + A2 + C6 + C7 + A3 + C5 合并 - -# Proposed Solutions - -每项独立、依次实施,建议放在 PR 3 的 cleanup commit 里: - -- **A**:把 `var violetEnvAllowlist` 移到文件顶 `const (...)` 旁 -- **B**:`const (tempDirPattern = "upgrade-violet-*"; fallbackPackageFilename = "package.tgz")` -- **C**:删 `schemaGVR` 类型 + `listKinds` + `_ = listKinds` -- **D**:去掉预 Get,直接 Delete + poll,可顺手删 `TestDeleteArtifactVersionIfExists_ConcurrentRaceVanishesGracefully`(与 no-residue case 等价) -- **E**:引入 `type bundleSpec struct { Channel, BundleVersion, ExpectedSha256 string }`,让 `InstallArtifactVersion` 做投影 - -- effort: 各 Small -- risk: Low - -# Recommended Action - -(待 triage) - -# Technical Details - -- 影响文件:`pkg/operator/operatorhub/violet.go`、`pkg/operator/operatorhub/violet_test.go` -- 建议作为 PR 3 cleanup commit 一并打包,避免单独的小 PR - -# Acceptance Criteria - -- [ ] A-E 各项独立勾选 -- [ ] 现有测试不退化 - -# Work Log - -- 2026-05-18: code review 合并 finding A1+A2+C6+C7+A3+C5 - -# Resources - -- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 -- CLAUDE.md "硬约束" §"新加 GVR 时统一在 operator.go 顶部声明" diff --git a/todos/016-pending-p3-field-format-validation.md b/todos/016-pending-p3-field-format-validation.md deleted file mode 100644 index a870d38..0000000 --- a/todos/016-pending-p3-field-format-validation.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -name: field-format-validation -description: ExpectedSha256 缺 hex64 regex 校验、PushArgs 没有 credential-flag 黑名单——典型 "字段格式正确"但 "字段内容危险"的两个空缺 -status: pending -priority: p3 -issue_id: "016" -tags: [code-review, type-design, security, p3] -dependencies: [] ---- - -# Problem Statement - -两个独立但同源("字段值合法性校验")的小问题: - -**A. `ExpectedSha256` 无格式校验** (`config.go:106-109`) -用户写 `expectedSha256: "abc"` 不会在加载时报错;要等到 `VerifySha256` 真正下载完文件,做 hex 比对失败,才能拿到"sha256 mismatch"——可能是合法 mismatch 也可能是用户手抖,混在一起。规则:64 个 hex 字符,regex `^[0-9a-fA-F]{64}$`。 - -**B. `PushArgs []string` 是凭证黑洞** (`config.go:79-83`) -README 已经明文要求"凭证必须走 env 不进 config",但 `PushArgs` 完全自由——粗心 caller 把 `--password foo` 塞进去就绕过了所有 mask 逻辑(实际 MaskCommand 仍能匹配 `--password`,但 PushArgs 进 YAML 后会留下凭证文件,这是文档层的违例,不是技术层)。 - -# Findings - -- T4 + T3 独立命中 -- 与 todo 010(VioletConfig.Validate)自然合流 - -# Proposed Solutions - -**A**:在 `Version.Validate()`(或 `Config.Validate()`)中加 regex 校验: - -```go -var sha256HexRe = regexp.MustCompile(`^[0-9a-fA-F]{64}$`) - -func (v Version) Validate() error { - if v.ExpectedSha256 != "" && !sha256HexRe.MatchString(v.ExpectedSha256) { - return fmt.Errorf("version %s: expectedSha256 must be 64 hex chars, got %q", v.BundleVersion, v.ExpectedSha256) - } - return nil -} -``` - -**B**:在 `VioletConfig.Validate()` 扫描 `PushArgs`: - -```go -forbidden := []string{"--password", "-p", "--username", "-u"} -for _, arg := range v.PushArgs { - for _, f := range forbidden { - if arg == f { - return fmt.Errorf("credentials must come from env vars, not pushArgs; remove %q", f) - } - } -} -``` - -- effort: Small -- risk: Low - -# Recommended Action - -(待 triage) - -# Technical Details - -- 影响文件:`pkg/config/config.go` -- 与 todo 010(Validate 入口)合流 -- 测试覆盖每个失败分支 - -# Acceptance Criteria - -- [ ] A: 非 hex64 的 ExpectedSha256 在 LoadConfig 失败 -- [ ] B: PushArgs 含 `--password` / `--username` 等凭证 flag 时 LoadConfig 失败 -- [ ] 单元测试覆盖 - -# Work Log - -- 2026-05-18: code review 合并 finding T3+T4 - -# Resources - -- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 -- 关联 todo: 010 diff --git a/todos/017-pending-p3-env-allowlist-threat-model-doc.md b/todos/017-pending-p3-env-allowlist-threat-model-doc.md deleted file mode 100644 index 48fd6fe..0000000 --- a/todos/017-pending-p3-env-allowlist-threat-model-doc.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -name: env-allowlist-threat-model-doc -description: pkg/exec.Command.EnvAllowlist 默认 nil=全量透传是兼容默认;本身未变更但 PR 让它成为安全边界——值得在包文档里写清威胁模型,方便未来 caller 不重蹈覆辙 -status: pending -priority: p3 -issue_id: "017" -tags: [code-review, documentation, security, p3] -dependencies: [] ---- - -# Problem Statement - -`pkg/exec.Command.EnvAllowlist`(本 PR 未改动,但 violet 流程依赖它)的默认 nil/empty 行为是 **全量透传 os.Environ()**——为了兼容旧的 testCommand caller。当前 violet 路径正确地显式设了 allowlist `["KUBECONFIG", "PATH", "HOME", "USER", "VIOLET_*"]`,但这个"必须设"的纪律只在 code review 里口口相传。 - -未来新加一个 `exec.RunCommand` 调用点的同事可能忘记设 allowlist,子进程立即继承所有 CI secret(GITHUB_TOKEN / AWS_* / NPM_TOKEN ...)——这是一个静默退化路径。 - -# Findings - -- **type-design-analyzer** T7 独立命中 -- 仅文档层;不修改逻辑 - -# Proposed Solutions - -**Option A(推荐)**:在 `pkg/exec/doc.go` 或 `exec.go` 顶部加包级文档,写明: - -``` -Package exec wraps os/exec.CommandContext with stdio capture + tee, and an -opt-in environment allowlist. - -THREAT MODEL: callers that exec third-party binaries (e.g. uploaders, -registries) SHOULD set Command.EnvAllowlist to restrict which host env vars -reach the child process. The default (empty allowlist == full passthrough) -exists for test-harness callers that intentionally inherit the full -environment (e.g. `make test` running under CI with bespoke env wiring). - -NEW CALL SITES: prefer setting EnvAllowlist; the empty default is for -backward compatibility only. -``` - -- 优点:零运行时改动;为未来新 caller 提供 norm -- 缺点:不能强制 -- effort: Negligible -- risk: Zero - -**Option B**:再加一个 `RunStrict` 函数(`EnvAllowlist []string` 必填)作为推荐入口;保留 `RunCommand` 为兼容性 API。 - -- 优点:API 引导 -- 缺点:双入口面,违反"窄接口" -- effort: Small -- risk: Low - -# Recommended Action - -(待 triage) - -# Technical Details - -- 影响文件:`pkg/exec/exec.go`(或新增 `doc.go`) -- 不影响行为 - -# Acceptance Criteria - -- [ ] `pkg/exec` 包有顶部包文档说明 EnvAllowlist 威胁模型 -- [ ] CLAUDE.md 项目根部约束(可选):新 RunCommand 调用点必须明示 EnvAllowlist 决策 - -# Work Log - -- 2026-05-18: code review 发现 by type-design-analyzer - -# Resources - -- PR: https://github.com/AlaudaDevops/upgrade-test/pull/14 -- 类似设计:Go x/exp/slog 早期对 default Handler 的文档化做法 From e4185e7e52e52e9d4896da46fca24707c5fbe4bb Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Mon, 18 May 2026 23:23:44 +0800 Subject: [PATCH 10/15] fix: address re-review findings (security + simplicity) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six findings from the post-integration re-review, applied as one commit so the result reads coherently — three of them tighten the credential-handling contract and three of them strip complexity that the previous follow-up had left behind. Security -------- (S1) Violet.PushArgs now rejects credential flags. --username, --password, --platform-username, --platform-password (in both "--flag value" and "--flag=value" form) cause BuildVioletPushArgs to return an error before any process is started. The escape hatch stays usable for --plain / --image-pull-secret / --dest-repo and similar, but it can no longer be used to smuggle a real secret into config.yaml and bypass the env-only credential contract. (S2) Replace real credentials in test fixtures and README example with obviously-fake placeholders ("admin@example.invalid", "redacted-password"). Anything that ships in git should never look like a working credential, even when it isn't one anymore. (S3) Add Command.RedactSecrets in pkg/exec. When non-empty, every Write to stdout/stderr (console + capture buffer + the stderr-tail wrapped into Err) is scrubbed via bytes.ReplaceAll before forwarding. The violet caller passes the real credential strings from VIOLET_REGISTRY_PASSWORD / VIOLET_PLATFORM_PASSWORD, so even if violet itself echoes its argv in verbose mode the CI log no longer carries the secret verbatim. MaskCommand was already covering OUR log line; this covers the child's output. Simplicity ---------- (P1) Remove Violet.Force. The CLI always passes --force to violet — the "preserve a stale AV" use case was hypothetical and conflicts with deleteArtifactVersionIfExists already running first. One field, one *bool dance, one default branch, and two test cases gone. (P2) Inline the two single-use helpers: derefBoolDefault becomes three lines at the SkipPush call site, sensitivePasswordFlags becomes an isPasswordFlag function with a one-line || expression. Both helpers were strictly local indirection. (P3) EnvAllowlist drops glob support. matchAllowlist used to interpret a trailing "*" as a prefix match; the only caller (violetEnvAllowlist) used that for "VIOLET_*", but it also masks future unaudited VIOLET_FOO env vars from sneaking through. Replaced with an exact enumeration of the four VIOLET_REGISTRY/PLATFORM_* names. The filterHostEnv loop now does a plain string equality check, which was always the intent. Mechanical follow-through ------------------------- - BuildVioletPushArgs signature is now `(VioletPushParams) ([]string, error)`. - VioletPushParams.Force removed. - pkg/operator/operatorhub/violet.go drops the VIOLET_* glob comment and enumerates the four env names; the test using a glob is removed and a new TestRunCommand_RedactSecrets* trio covers stdout/stderr/Err scrubbing and the no-op fast path. - TestBuildVioletPushArgs rewritten as 8 happy-path cases (no Force knob) plus TestBuildVioletPushArgs_RejectsCredentialFlagsInPushArgs (7 cases) and TestBuildVioletPushArgs_AllowsNonCredentialFlags (1 sanity check). - README documents that credentials in PushArgs are rejected; CLAUDE.md adds the "PushArgs rejects credential flags" and "CLI always sends --force" hard constraints, replacing the prior Force section. --- CLAUDE.md | 3 +- README.md | 11 ++- pkg/config/config.go | 11 --- pkg/exec/exec.go | 95 ++++++++++++++------ pkg/exec/exec_test.go | 94 ++++++++++++-------- pkg/operator/operatorhub/violet.go | 98 ++++++++++++++------- pkg/operator/operatorhub/violet_test.go | 111 ++++++++++++++++-------- 7 files changed, 275 insertions(+), 148 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 2daff63..7638671 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -62,8 +62,9 @@ Artifact 名称约定:未显式给 `artifact` 字段时,自动拼成 ` # 仅私有 push 场景需要的额外两行: # export VIOLET_REGISTRY_USERNAME= diff --git a/pkg/config/config.go b/pkg/config/config.go index 9da7df8..f6ba286 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -96,13 +96,6 @@ type VioletConfig struct { // "updated successfully" while the CRs never appear in the cluster the // upgrade CLI is watching. Clusters string `yaml:"clusters,omitempty"` - - // Force defaults to true. Without --force violet aborts AV upserts with - // "already exist, skip it" — verified on tektoncd-operator v4.0.17 on - // 40-devops: the same input that succeeded with --force was a no-op - // without it, leaving wait AV Present to time out. Pointer so users can - // opt out (e.g. when a stale AV must be preserved exactly). - Force *bool `yaml:"force,omitempty"` } // UpgradePath represents a single upgrade path @@ -193,10 +186,6 @@ func defaultConfig(config *Config) *Config { t := true v.SkipPush = &t } - if v.Force == nil { - t := true - v.Force = &t - } } return config diff --git a/pkg/exec/exec.go b/pkg/exec/exec.go index b6dc325..7665a45 100644 --- a/pkg/exec/exec.go +++ b/pkg/exec/exec.go @@ -10,18 +10,66 @@ import ( "strings" ) +// redactingWriter returns an io.Writer that scans every Write for the given +// secret byte strings and substitutes "***" before forwarding to w. Empty +// secrets are skipped; an empty secrets list returns w unchanged so the +// hot path pays no per-write overhead when no redaction is configured. +func redactingWriter(w io.Writer, secrets []string) io.Writer { + live := make([][]byte, 0, len(secrets)) + for _, s := range secrets { + if s != "" { + live = append(live, []byte(s)) + } + } + if len(live) == 0 { + return w + } + return &scrubbingWriter{w: w, secrets: live} +} + +type scrubbingWriter struct { + w io.Writer + secrets [][]byte +} + +func (s *scrubbingWriter) Write(p []byte) (int, error) { + scrubbed := p + for _, secret := range s.secrets { + scrubbed = bytes.ReplaceAll(scrubbed, secret, []byte("***")) + } + if _, err := s.w.Write(scrubbed); err != nil { + return 0, err + } + // Report the original byte count so callers using io.Copy do not see a + // short write when the replacement is shorter (or longer) than the + // input. + return len(p), nil +} + type Command struct { Name string Args []string Dir string Env []string - // EnvAllowlist limits which host environment variables are passed to the child - // process. Entries support an exact name ("KUBECONFIG") or a "PREFIX_*" pattern - // ("VIOLET_*"). When EnvAllowlist is empty, the child inherits os.Environ() in - // full (backward-compatible default). Env is always appended after the filtered - // host set. + // EnvAllowlist limits which host environment variables are passed to the + // child process — entries are exact names ("KUBECONFIG"). When the list + // is empty, the child inherits os.Environ() in full (backward-compatible + // default). Env is always appended after the filtered host set. EnvAllowlist []string + + // RedactSecrets is a list of literal byte strings that must never appear + // in the child's stdout/stderr stream as captured by this CLI. Any byte + // match is replaced with "***" before the output is written to the + // console, the captured buffer, or the stderr-tail wrapped into Err. + // + // This guards against the child process echoing its own argv when + // verbose/debug logging is on — argv-mode credentials still leak at the + // OS level (ps auxe, /proc), but the CI build log no longer carries the + // secret verbatim. The match is byte-literal and stateless, so a secret + // split across two Write calls can slip through; in practice CI loggers + // are line-buffered and the residual risk is acceptable. + RedactSecrets []string } // CommandResult represents the result of a command execution @@ -66,9 +114,12 @@ func RunCommand(ctx context.Context, cmd Command) CommandResult { // Create buffers to capture output var stdoutBuf, stderrBuf bytes.Buffer - // Create multi-writers to both capture and print output - stdoutWriter := io.MultiWriter(os.Stdout, &stdoutBuf) - stderrWriter := io.MultiWriter(os.Stderr, &stderrBuf) + // Create multi-writers to both capture and print output. When the caller + // supplies RedactSecrets, every writer in the chain (console + capture + // buffer) receives the scrubbed bytes — that way the stderr-tail wrap + // downstream cannot read the unredacted secret back out of the buffer. + stdoutWriter := redactingWriter(io.MultiWriter(os.Stdout, &stdoutBuf), cmd.RedactSecrets) + stderrWriter := redactingWriter(io.MultiWriter(os.Stderr, &stderrBuf), cmd.RedactSecrets) runCmd.Stdout = stdoutWriter runCmd.Stderr = stderrWriter @@ -92,6 +143,10 @@ func filterHostEnv(allowlist []string) []string { if len(allowlist) == 0 { return host } + // Membership lookup over a fixed list of env names. The list is always + // short (operational essentials + an explicit credential enumeration) + // so a linear scan is no worse than a map, and a map would obscure the + // "every name was reviewed explicitly" property the allowlist gives us. out := make([]string, 0, len(host)) for _, entry := range host { eq := strings.IndexByte(entry, '=') @@ -99,28 +154,14 @@ func filterHostEnv(allowlist []string) []string { continue } name := entry[:eq] - if matchAllowlist(name, allowlist) { - out = append(out, entry) - } - } - return out -} - -// matchAllowlist reports whether name matches any pattern in allowlist. A pattern -// ending in "*" matches by prefix; anything else is an exact match. -func matchAllowlist(name string, allowlist []string) bool { - for _, pat := range allowlist { - if strings.HasSuffix(pat, "*") { - if strings.HasPrefix(name, strings.TrimSuffix(pat, "*")) { - return true + for _, want := range allowlist { + if name == want { + out = append(out, entry) + break } - continue - } - if name == pat { - return true } } - return false + return out } // wrapWithStderrTail enriches err with the trailing stderr lines so callers see diff --git a/pkg/exec/exec_test.go b/pkg/exec/exec_test.go index 4daa849..54f458f 100644 --- a/pkg/exec/exec_test.go +++ b/pkg/exec/exec_test.go @@ -7,30 +7,6 @@ import ( "testing" ) -func TestMatchAllowlist(t *testing.T) { - tests := []struct { - name string - needle string - allowlist []string - want bool - }{ - {"exact match hits", "KUBECONFIG", []string{"KUBECONFIG", "HOME"}, true}, - {"exact match misses", "USER", []string{"KUBECONFIG", "HOME"}, false}, - {"prefix glob hits", "VIOLET_REGISTRY_USERNAME", []string{"VIOLET_*"}, true}, - {"prefix glob misses on unrelated var", "GITHUB_TOKEN", []string{"VIOLET_*"}, false}, - {"empty allowlist never matches", "ANYTHING", nil, false}, - {"prefix glob and exact mix", "PATH", []string{"VIOLET_*", "PATH"}, true}, - {"prefix glob requires prefix not substring", "MY_VIOLET_TOKEN", []string{"VIOLET_*"}, false}, - } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - if got := matchAllowlist(tc.needle, tc.allowlist); got != tc.want { - t.Errorf("matchAllowlist(%q, %v) = %v, want %v", tc.needle, tc.allowlist, got, tc.want) - } - }) - } -} - func TestFilterHostEnv_EmptyAllowlistReturnsAll(t *testing.T) { // Empty allowlist must preserve the legacy "full os.Environ() passthrough" // behaviour — testCommand callers rely on it. @@ -54,19 +30,6 @@ func TestFilterHostEnv_AllowlistRestricts(t *testing.T) { } } -func TestFilterHostEnv_GlobMatchesPrefix(t *testing.T) { - t.Setenv("FILTER_GLOB_TOKEN_A", "1") - t.Setenv("FILTER_GLOB_TOKEN_B", "2") - t.Setenv("OTHER_VAR", "leave-me") - got := filterHostEnv([]string{"FILTER_GLOB_*"}) - if !contains(got, "FILTER_GLOB_TOKEN_A=1") || !contains(got, "FILTER_GLOB_TOKEN_B=2") { - t.Errorf("both glob-matched vars should be present; env=%v", got) - } - if contains(got, "OTHER_VAR=leave-me") { - t.Errorf("OTHER_VAR should be filtered; env=%v", got) - } -} - func TestLastLines(t *testing.T) { tests := []struct { name string @@ -156,6 +119,63 @@ func TestRunCommand_EmptyAllowlistFullPassthrough(t *testing.T) { } } +func TestRunCommand_RedactSecretsInChildStdout(t *testing.T) { + // The child literally echoes a "secret" string. Without RedactSecrets it + // would land verbatim in Stdout, the console, and the wrapped stderr + // tail. With RedactSecrets, every place we capture the byte stream sees + // "***" instead. + res := RunCommand(context.Background(), Command{ + Name: "/bin/sh", + Args: []string{"-c", "echo prefix=top-secret-value=suffix"}, + RedactSecrets: []string{"top-secret-value"}, + }) + if res.Err != nil { + t.Fatalf("RunCommand failed: %v", res.Err) + } + if strings.Contains(res.Stdout, "top-secret-value") { + t.Errorf("Stdout buffer should not contain the secret; got %q", res.Stdout) + } + if !strings.Contains(res.Stdout, "prefix=***=suffix") { + t.Errorf("expected mid-string replacement, got %q", res.Stdout) + } +} + +func TestRunCommand_RedactSecretsAppliesToStderrTail(t *testing.T) { + // Failing child emits the secret in stderr. wrapWithStderrTail should + // receive the already-scrubbed buffer, so the *error message* also + // hides the secret. + res := RunCommand(context.Background(), Command{ + Name: "/bin/sh", + Args: []string{"-c", "echo error: secret-payload 1>&2; exit 1"}, + RedactSecrets: []string{"secret-payload"}, + }) + if res.Err == nil { + t.Fatal("expected non-nil err for exit 1") + } + if strings.Contains(res.Err.Error(), "secret-payload") { + t.Errorf("wrapped error should redact the secret; got %q", res.Err.Error()) + } + if strings.Contains(res.Stderr, "secret-payload") { + t.Errorf("captured stderr buffer should redact the secret; got %q", res.Stderr) + } +} + +func TestRunCommand_EmptyRedactSecretsIsNoOp(t *testing.T) { + // All-empty (or nil) RedactSecrets must not wrap the writer — verifies + // the hot path doesn't pay overhead when redaction is unused. + res := RunCommand(context.Background(), Command{ + Name: "/bin/sh", + Args: []string{"-c", "echo unchanged-payload"}, + RedactSecrets: []string{"", ""}, + }) + if res.Err != nil { + t.Fatalf("RunCommand failed: %v", res.Err) + } + if !strings.Contains(res.Stdout, "unchanged-payload") { + t.Errorf("payload should pass through when no live secrets; got %q", res.Stdout) + } +} + func TestRunCommand_FailingChildWrapsStderrTail(t *testing.T) { // /bin/sh -c 'echo boom 1>&2; exit 7' — verify CommandResult.Err contains // the stderr tail wrap, and that errors.Is still works against the diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index 8479d1c..ec6201e 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -49,12 +49,40 @@ const ( flagPlatformPassword = "--platform-password" ) -// sensitivePasswordFlags lists every violet flag whose immediately following -// argv token is a credential — MaskCommand redacts each of them in logs. +// isPasswordFlag reports whether `arg` is a violet flag whose immediately +// following argv token is a credential — MaskCommand redacts each in logs. // argv-mode leakage to OS (ps auxe / /proc) remains as documented in the README. -var sensitivePasswordFlags = map[string]bool{ - flagPassword: true, - flagPlatformPassword: true, +func isPasswordFlag(arg string) bool { + return arg == flagPassword || arg == flagPlatformPassword +} + +// credentialFlagsForbiddenInPushArgs lists violet flags that the upgrade CLI +// will not let through Violet.PushArgs. The contract is "credentials only via +// env vars" — letting a user smuggle --password / --platform-password via +// config.yaml would silently put a real secret into git and into the OS argv +// table without any redaction the env-injection path provides. +var credentialFlagsForbiddenInPushArgs = []string{ + flagUsername, flagPassword, + flagPlatformUsername, flagPlatformPassword, +} + +// validatePushArgs rejects any attempt to smuggle a credential flag through +// Violet.PushArgs, in both the bare "--password value" form and the +// "--password=value" form. Returns an error naming the offending argument. +func validatePushArgs(args []string) error { + for _, arg := range args { + flag := arg + if i := strings.IndexByte(arg, '='); i > 0 { + flag = arg[:i] + } + for _, forbidden := range credentialFlagsForbiddenInPushArgs { + if flag == forbidden { + return fmt.Errorf("violet.pushArgs must not contain credential flag %q; "+ + "set VIOLET_REGISTRY_USERNAME / _PASSWORD / VIOLET_PLATFORM_USERNAME / _PASSWORD environment variables instead", arg) + } + } + } + return nil } // BuildPackageURL composes the .tgz URL by the agreed MinIO convention: @@ -87,7 +115,6 @@ func BuildPackageURL(prefix, name, channel, bundleVersion string) (string, error type VioletPushParams struct { TgzPath string SkipPush bool - Force bool PlatformAddress string Clusters string PushArgs []string @@ -101,14 +128,19 @@ type VioletPushParams struct { // // PushArgs is appended verbatim so unsupported flags can still be threaded // through without a code change. -func BuildVioletPushArgs(p VioletPushParams) []string { - args := []string{"push", p.TgzPath, flagTargetCatalogSource, targetCatalogSource} +func BuildVioletPushArgs(p VioletPushParams) ([]string, error) { + if err := validatePushArgs(p.PushArgs); err != nil { + return nil, err + } + // --force is always set: without it violet aborts AV upserts with + // "already exist, skip it" and creates nothing, defeating the entire + // flow. There is no realistic scenario where the upgrade CLI wants to + // preserve a stale AV — if such a need arises, the caller already + // deletes the residue before invoking violet. + args := []string{"push", p.TgzPath, flagTargetCatalogSource, targetCatalogSource, flagForce} if p.SkipPush { args = append(args, flagSkipPush) } - if p.Force { - args = append(args, flagForce) - } if p.PlatformAddress != "" { args = append(args, flagPlatformAddress, p.PlatformAddress) } @@ -128,7 +160,7 @@ func BuildVioletPushArgs(p VioletPushParams) []string { args = append(args, flagPlatformPassword, p2) } args = append(args, p.PushArgs...) - return args + return args, nil } // MaskCommand renders the command for logging, replacing the token following @@ -140,7 +172,7 @@ func MaskCommand(name string, args []string) string { parts := make([]string, 0, len(args)+1) parts = append(parts, name) for i := 0; i < len(args); i++ { - if sensitivePasswordFlags[args[i]] && i+1 < len(args) { + if isPasswordFlag(args[i]) && i+1 < len(args) { parts = append(parts, args[i], "***") i++ continue @@ -353,15 +385,18 @@ func (o *Operator) deleteArtifactVersionIfExists(ctx context.Context, name strin // violetEnvAllowlist constrains which host environment variables flow into // the violet child process. KUBECONFIG / PATH / HOME / USER are operational -// essentials; VIOLET_* covers VIOLET_REGISTRY_USERNAME / _PASSWORD and any -// future violet-specific overrides without leaking unrelated CI secrets such -// as GITHUB_TOKEN or AWS_*. +// essentials; the four VIOLET_* names cover registry + platform credential +// pairs. Exact enumeration (not a prefix glob) so introducing a new +// VIOLET_* env requires a code change reviewed alongside the env reader. var violetEnvAllowlist = []string{ "KUBECONFIG", "PATH", "HOME", "USER", - "VIOLET_*", + EnvVioletRegistryUsername, + EnvVioletRegistryPassword, + EnvVioletPlatformUsername, + EnvVioletPlatformPassword, } // execVioletPush runs `violet push ` as a child process. Credentials @@ -378,35 +413,38 @@ func (o *Operator) execVioletPush(ctx context.Context, tgzPath string) error { return err } - args := BuildVioletPushArgs(VioletPushParams{ + skipPush := true + if o.violet.SkipPush != nil { + skipPush = *o.violet.SkipPush + } + args, err := BuildVioletPushArgs(VioletPushParams{ TgzPath: tgzPath, - SkipPush: derefBoolDefault(o.violet.SkipPush, true), - Force: derefBoolDefault(o.violet.Force, true), + SkipPush: skipPush, PlatformAddress: o.violet.PlatformAddress, Clusters: o.violet.Clusters, PushArgs: o.violet.PushArgs, }) + if err != nil { + return err + } log.Infow("invoking violet", "cmd", MaskCommand(bin, args)) + // Also redact the credential values themselves from violet's own + // stdout/stderr — MaskCommand only protects the line WE log, not what + // violet itself might echo (e.g. verbose argv dumps). result := exec.RunCommand(ctx, exec.Command{ Name: bin, Args: args, EnvAllowlist: violetEnvAllowlist, + RedactSecrets: []string{ + os.Getenv(EnvVioletRegistryPassword), + os.Getenv(EnvVioletPlatformPassword), + }, }) return result.Err } -// derefBoolDefault returns *p when p is non-nil, otherwise the supplied -// default. Used to apply VioletConfig's "unset means on" semantics for -// SkipPush and Force at the call site of BuildVioletPushArgs. -func derefBoolDefault(p *bool, fallback bool) bool { - if p == nil { - return fallback - } - return *p -} - // validateVioletBin guards against accidentally executing a binary at an // arbitrary user-supplied path. When Violet.Bin is non-empty it must be an // absolute path to an executable regular file; otherwise return a typed diff --git a/pkg/operator/operatorhub/violet_test.go b/pkg/operator/operatorhub/violet_test.go index 6a92c88..29602b9 100644 --- a/pkg/operator/operatorhub/violet_test.go +++ b/pkg/operator/operatorhub/violet_test.go @@ -112,49 +112,44 @@ func TestBuildVioletPushArgs(t *testing.T) { want []string }{ { - name: "minimal: skip-push + force defaults are explicit", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true}, - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force"}, + name: "minimal: force is always present", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force", "--skip-push"}, }, { - name: "force=false drops the flag", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: false}, - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push"}, - }, - { - name: "skip-push=false drops the flag (push images path)", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: false, Force: true}, + name: "skip-push=false drops only that flag (push images path)", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: false}, want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force"}, }, { - name: "platform-address surfaces between force and creds", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true, PlatformAddress: "https://example.acp"}, - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--platform-address", "https://example.acp"}, + name: "platform-address surfaces after skip-push", + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, PlatformAddress: "https://example.acp"}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force", "--skip-push", "--platform-address", "https://example.acp"}, }, { name: "clusters list lands after platform-address", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true, PlatformAddress: "https://x", Clusters: "devops"}, - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--platform-address", "https://x", "--clusters", "devops"}, + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, PlatformAddress: "https://example.acp", Clusters: "devops"}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force", "--skip-push", "--platform-address", "https://example.acp", "--clusters", "devops"}, }, { name: "registry creds env auto-injected as --username/--password", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true}, + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true}, regUser: "u", regPass: "p", - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--username", "u", "--password", "p"}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force", "--skip-push", "--username", "u", "--password", "p"}, }, { name: "platform creds env auto-injected as --platform-username/--platform-password", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true}, - platUser: "admin@cpaas.io", - platPass: "07Apples@", - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--platform-username", "admin@cpaas.io", "--platform-password", "07Apples@"}, + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true}, + platUser: "admin@example.invalid", + platPass: "redacted-password", + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force", "--skip-push", "--platform-username", "admin@example.invalid", "--platform-password", "redacted-password"}, }, { - name: "full real-cluster shape: address + clusters + both creds + push-args", + name: "full shape: address + clusters + both creds + push-args", params: VioletPushParams{ - TgzPath: "/tmp/x.tgz", SkipPush: false, Force: true, - PlatformAddress: "https://40-devops", + TgzPath: "/tmp/x.tgz", SkipPush: false, + PlatformAddress: "https://example.acp", Clusters: "devops", PushArgs: []string{"--dest-repo", "registry.private/devops"}, }, @@ -163,23 +158,17 @@ func TestBuildVioletPushArgs(t *testing.T) { want: []string{ "push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force", - "--platform-address", "https://40-devops", + "--platform-address", "https://example.acp", "--clusters", "devops", "--username", "r-u", "--password", "r-p", "--platform-username", "p-u", "--platform-password", "p-p", "--dest-repo", "registry.private/devops", }, }, - { - name: "only registry username present (no password) still injects", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true}, - regUser: "u", - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--username", "u"}, - }, { name: "pushArgs always land last so user overrides win", - params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, Force: true, PushArgs: []string{"--plain", "--image-pull-secret", "pull"}}, - want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--skip-push", "--force", "--plain", "--image-pull-secret", "pull"}, + params: VioletPushParams{TgzPath: "/tmp/x.tgz", SkipPush: true, PushArgs: []string{"--plain", "--image-pull-secret", "pull"}}, + want: []string{"push", "/tmp/x.tgz", "--target-catalog-source", "platform", "--force", "--skip-push", "--plain", "--image-pull-secret", "pull"}, }, } @@ -189,7 +178,10 @@ func TestBuildVioletPushArgs(t *testing.T) { t.Setenv(EnvVioletRegistryPassword, tc.regPass) t.Setenv(EnvVioletPlatformUsername, tc.platUser) t.Setenv(EnvVioletPlatformPassword, tc.platPass) - got := BuildVioletPushArgs(tc.params) + got, err := BuildVioletPushArgs(tc.params) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } if !stringSliceEqual(got, tc.want) { t.Errorf("args mismatch:\n got: %v\nwant: %v", got, tc.want) } @@ -197,6 +189,53 @@ func TestBuildVioletPushArgs(t *testing.T) { } } +func TestBuildVioletPushArgs_RejectsCredentialFlagsInPushArgs(t *testing.T) { + // PushArgs is a free-form escape hatch but must never carry credentials — + // the env-only contract is what makes log redaction meaningful. Both the + // bare "--flag value" and combined "--flag=value" forms must be rejected. + forbidden := []struct { + name string + pushArgs []string + }{ + {"bare --password", []string{"--password", "secret"}}, + {"bare --platform-password", []string{"--platform-password", "secret"}}, + {"bare --username", []string{"--username", "user"}}, + {"bare --platform-username", []string{"--platform-username", "user"}}, + {"--password=value form", []string{"--password=secret"}}, + {"--platform-password=value form", []string{"--platform-password=secret"}}, + {"credential flag mixed with safe flags", []string{"--plain", "--password", "secret", "--force"}}, + } + for _, tc := range forbidden { + t.Run(tc.name, func(t *testing.T) { + _, err := BuildVioletPushArgs(VioletPushParams{ + TgzPath: "/tmp/x.tgz", + SkipPush: true, + PushArgs: tc.pushArgs, + }) + if err == nil { + t.Fatalf("want rejection for pushArgs=%v, got nil", tc.pushArgs) + } + if !strings.Contains(err.Error(), "credential flag") { + t.Errorf("error should mention 'credential flag', got %v", err) + } + }) + } +} + +func TestBuildVioletPushArgs_AllowsNonCredentialFlags(t *testing.T) { + // Sanity check: --plain / --image-pull-secret / --dest-repo and similar + // non-sensitive flags must continue to flow through PushArgs unchanged + // (the very reason PushArgs exists). + _, err := BuildVioletPushArgs(VioletPushParams{ + TgzPath: "/tmp/x.tgz", + SkipPush: false, + PushArgs: []string{"--plain", "--dest-repo", "registry.private/x", "--image-pull-secret", "pull"}, + }) + if err != nil { + t.Fatalf("non-credential flags should pass: %v", err) + } +} + func TestMaskCommand(t *testing.T) { tests := []struct { name string @@ -225,8 +264,8 @@ func TestMaskCommand(t *testing.T) { }, { name: "--platform-password value masked too", - in: []string{"--platform-username", "admin@cpaas.io", "--platform-password", "07Apples@"}, - want: "violet --platform-username admin@cpaas.io --platform-password ***", + in: []string{"--platform-username", "admin@example.invalid", "--platform-password", "redacted-password"}, + want: "violet --platform-username admin@example.invalid --platform-password ***", }, { name: "both --password and --platform-password masked in one command", From 41bfcd3bf1efe98606d53236dc1fd23ac5db192d Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Tue, 19 May 2026 09:39:26 +0800 Subject: [PATCH 11/15] refactor(operatorhub): in-place OLM upgrade via approve + force-refresh Previously every UpgradePath step deleted the Subscription and prior CSV, then recreated the Subscription from scratch. That was effectively a fresh install on each iteration and did not exercise the OLM replace chain, so data-compatibility tests could miss upgrade regressions. Now InstallSubscription only creates the Subscription on first use; from the second version onward it keeps the existing Subscription and CSV in place, patches spec.channel when it differs, and bumps a refresh annotation to force the OLM controller to re-reconcile (mainly to defeat PackageManifest cache staleness right after a violet push). It then waits for an InstallPlan whose spec.clusterServiceVersionNames contains the target CSV (the old "any installplan name" check returned the previously approved plan on retry and hung waitCSVReady), approves it idempotently, and waits for the new CSV phase=Succeeded. Also removes the now-unused deleteResource helper and its schema / dynamic imports. --- pkg/operator/operatorhub/subscription.go | 208 +++++++++++++++-------- 1 file changed, 133 insertions(+), 75 deletions(-) diff --git a/pkg/operator/operatorhub/subscription.go b/pkg/operator/operatorhub/subscription.go index 0475a7d..6578a58 100644 --- a/pkg/operator/operatorhub/subscription.go +++ b/pkg/operator/operatorhub/subscription.go @@ -5,94 +5,140 @@ import ( "fmt" "time" - "github.com/oliveagle/jsonpath" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/dynamic" "knative.dev/pkg/logging" ) +// InstallSubscription drives an OLM in-place upgrade for one target CSV. +// +// First version on a fresh cluster: Subscription does not yet exist → create +// it (installPlanApproval: Manual, startingCSV: csv). +// +// Subsequent versions: Subscription already exists from the previous step. +// The new bundle has just been pushed to the platform catalog by violet, so +// OLM will produce a fresh InstallPlan whose spec.clusterServiceVersionNames +// targets the new csv. We do NOT delete the Subscription or the prior CSV — +// OLM's replace chain rolls them forward, and tearing them down here loses +// the upgrade-path semantics we want to exercise. We only: +// +// 1. patch spec.channel when the target channel differs from the current one +// (no-op when it matches); +// 2. wait for the InstallPlan whose spec targets the new csv; +// 3. approve it (idempotent — already-approved plans are left alone); +// 4. wait for the new CSV phase=Succeeded. func (o *Operator) InstallSubscription(ctx context.Context, csv string, channel string) error { if csv == "" { return fmt.Errorf("csv is empty") } log := logging.FromContext(ctx) - log.Infow("installing subscription", "csv", csv, "namespace", o.namespace) - // Delete the subscription and csv if they exist - if err := o.deleteResource(ctx, subscriptionGVR, o.name, o.namespace); err != nil { - return fmt.Errorf("failed to delete old subscription: %v", err) - } - - if err := o.deleteResource(ctx, csvGVR, csv, o.namespace); err != nil { - return fmt.Errorf("failed to delete old csv: %v", err) - } - - log.Infow("creating subscription", "name", o.name, "namespace", o.namespace, "csv", csv, "channel", channel) - _, err := o.createSubscription(ctx, o.name, o.namespace, csv, channel) - if err != nil { - return fmt.Errorf("failed to create subscription: %v", err) + log.Infow("ensuring subscription for upgrade", "csv", csv, "channel", channel, "namespace", o.namespace) + + existing, err := o.client.Resource(subscriptionGVR).Namespace(o.namespace).Get(ctx, o.name, metav1.GetOptions{}) + switch { + case errors.IsNotFound(err): + log.Infow("subscription absent, creating fresh", "name", o.name, "csv", csv, "channel", channel) + if _, err := o.createSubscription(ctx, o.name, o.namespace, csv, channel); err != nil { + return fmt.Errorf("failed to create subscription: %v", err) + } + case err != nil: + return fmt.Errorf("failed to get subscription: %v", err) + default: + log.Infow("subscription exists, rolling forward in place", "name", o.name) + if err := o.refreshSubscriptionForUpgrade(ctx, existing, channel); err != nil { + return fmt.Errorf("failed to refresh subscription: %v", err) + } } - log.Infow("waiting for install plan", "name", o.name, "namespace", o.namespace) - installPlanName, err := o.waitInstallPlan(ctx, o.name, o.namespace) + log.Infow("waiting for install plan targeting csv", "csv", csv) + installPlanName, err := o.waitInstallPlanForCSV(ctx, o.name, o.namespace, csv) if err != nil { return fmt.Errorf("failed to wait for install plan: %v", err) } - log.Infow("approving install plan", "name", o.name, "namespace", o.namespace, "installPlanName", installPlanName) - installPlan, err := o.client.Resource(installPlanGVR).Namespace(o.namespace).Get(ctx, installPlanName, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to get install plan: %v", err) - } - - installPlan.Object["spec"].(map[string]interface{})["approved"] = true - _, err = o.client.Resource(installPlanGVR).Namespace(o.namespace).Update(ctx, installPlan, metav1.UpdateOptions{}) - if err != nil { - return fmt.Errorf("failed to update install plan: %v", err) + log.Infow("approving install plan", "installPlan", installPlanName, "csv", csv) + if err := o.approveInstallPlan(ctx, installPlanName, o.namespace); err != nil { + return fmt.Errorf("failed to approve install plan: %v", err) } log.Infow("waiting for csv to be ready", "name", csv, "namespace", o.namespace) - err = o.waitCSVReady(ctx, csv, o.namespace) - if err != nil { + if err := o.waitCSVReady(ctx, csv, o.namespace); err != nil { return fmt.Errorf("failed to wait for csv to be ready: %v", err) } - log.Infow("subscription installed successfully", "name", o.name, "namespace", o.namespace) + log.Infow("subscription rolled forward", "name", o.name, "csv", csv) return nil } -func (o *Operator) deleteResource(ctx context.Context, gvr schema.GroupVersionResource, name, namespace string) error { +// refreshAnnotation is bumped on every in-place upgrade to nudge the OLM +// Subscription controller to re-reconcile, even when spec.channel does not +// change. The annotation lives under our own domain so it can never collide +// with OLM's internal olm.* keys. +const refreshAnnotation = "upgrade-test.alauda.io/refresh-trigger" + +// refreshSubscriptionForUpgrade prepares an already-existing Subscription for +// the next upgrade step. It does two things in a single Update: +// +// 1. If the target channel differs from spec.channel, patch it. +// 2. Always bump refreshAnnotation to a fresh RFC3339Nano timestamp. This is +// a force-refresh nudge: even on same-channel upgrades, mutating the +// object's metadata makes the OLM controller re-evaluate the Subscription +// against the catalog instead of waiting on its internal resync interval. +// Useful when the new CSV has just been pushed by violet and the +// PackageManifest cache is stale. +// +// We deliberately do NOT delete and recreate the Subscription, restart the +// catalog-operator, or touch the CatalogSource — annotation bump is the +// lightest mechanism that still guarantees a reconcile. +func (o *Operator) refreshSubscriptionForUpgrade(ctx context.Context, sub *unstructured.Unstructured, channel string) error { log := logging.FromContext(ctx) - log.Infow("deleting old resource", "gvr", gvr, "name", name, "namespace", namespace) - var rsAbled dynamic.ResourceInterface - nsEnabled := o.client.Resource(gvr) - if namespace != "" { - rsAbled = nsEnabled.Namespace(namespace) + current, _, _ := unstructured.NestedString(sub.Object, "spec", "channel") + if current != channel { + log.Infow("patching subscription channel", "from", current, "to", channel) + if err := unstructured.SetNestedField(sub.Object, channel, "spec", "channel"); err != nil { + return fmt.Errorf("set spec.channel: %w", err) + } + } else { + log.Infow("subscription channel already matches target", "channel", channel) } - err := rsAbled.Delete(ctx, name, metav1.DeleteOptions{}) - if err != nil && !errors.IsNotFound(err) { - return fmt.Errorf("failed to delete resource %s: %v", name, err) + annotations, _, _ := unstructured.NestedStringMap(sub.Object, "metadata", "annotations") + if annotations == nil { + annotations = map[string]string{} } + stamp := time.Now().UTC().Format(time.RFC3339Nano) + annotations[refreshAnnotation] = stamp + if err := unstructured.SetNestedStringMap(sub.Object, annotations, "metadata", "annotations"); err != nil { + return fmt.Errorf("set refresh annotation: %w", err) + } + log.Infow("force-refresh annotation bumped to trigger OLM reconcile", "annotation", refreshAnnotation, "value", stamp) - log.Infow("waiting for resource to be deleted", "name", name, "namespace", namespace) - err = wait.PollUntilContextTimeout(ctx, o.interval, o.timeout, true, func(ctx context.Context) (done bool, err error) { - _, err = rsAbled.Get(ctx, name, metav1.GetOptions{}) - if errors.IsNotFound(err) { - log.Infow("resource not found, deleting resource", "name", name, "namespace", namespace) - return true, nil - } - return false, err - }) + _, err := o.client.Resource(subscriptionGVR).Namespace(o.namespace).Update(ctx, sub, metav1.UpdateOptions{}) + return err +} + +// approveInstallPlan flips spec.approved=true. Idempotent: if the plan is +// already approved (e.g. retry after a prior partial run), it returns nil +// without issuing an Update. +func (o *Operator) approveInstallPlan(ctx context.Context, name, namespace string) error { + log := logging.FromContext(ctx) + ip, err := o.client.Resource(installPlanGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("failed to delete resource %s: %v", name, err) + return fmt.Errorf("get install plan: %w", err) } - return nil + if approved, _, _ := unstructured.NestedBool(ip.Object, "spec", "approved"); approved { + log.Infow("install plan already approved, skipping update", "name", name) + return nil + } + if err := unstructured.SetNestedField(ip.Object, true, "spec", "approved"); err != nil { + return fmt.Errorf("set spec.approved: %w", err) + } + _, err = o.client.Resource(installPlanGVR).Namespace(namespace).Update(ctx, ip, metav1.UpdateOptions{}) + return err } func (o *Operator) createSubscription(ctx context.Context, name, namespace, csv string, channel string) (*unstructured.Unstructured, error) { @@ -160,47 +206,59 @@ func (o *Operator) createSubscription(ctx context.Context, name, namespace, csv return nil, fmt.Errorf("failed to create subscription after 3 attempts: %v", err) } -// waitInstallPlan waits for the subscription to have an install plan and returns the install plan name -func (o *Operator) waitInstallPlan(ctx context.Context, name, namespace string) (string, error) { - var installPlanName string +// waitInstallPlanForCSV waits until Subscription.status.installplan.name +// points at an InstallPlan whose spec.clusterServiceVersionNames contains the +// target CSV, then returns that InstallPlan name. +// +// Why the extra CSV check (vs. the old "any installplan name will do" +// behaviour): on the second+ upgrade step Subscription.status.installplan.name +// briefly still references the *previous* InstallPlan that we already +// approved. Returning it would make the caller no-op-approve the old plan and +// then hang in waitCSVReady waiting for a CSV transition that OLM never +// produces. Matching on spec.clusterServiceVersionNames is the only way the +// API tells us which CSV a given InstallPlan targets, so we poll until the +// referenced plan actually targets the version we just pushed. +func (o *Operator) waitInstallPlanForCSV(ctx context.Context, subName, namespace, csv string) (string, error) { + log := logging.FromContext(ctx) + var matched string err := wait.PollUntilContextTimeout(ctx, o.interval, o.timeout, true, func(ctx context.Context) (done bool, err error) { - obj, err := o.client.Resource(subscriptionGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) + sub, err := o.client.Resource(subscriptionGVR).Namespace(namespace).Get(ctx, subName, metav1.GetOptions{}) if err != nil && !errors.IsNotFound(err) { return false, err } - - if obj == nil { + if sub == nil { return false, nil } - // Use jsonpath to extract status.installplan.name - jsonpathQuery := "$.status.installplan.name" - result, err := jsonpath.JsonPathLookup(obj.Object, jsonpathQuery) - if err != nil { - // Install plan name not found yet, continue waiting + ipName, _, _ := unstructured.NestedString(sub.Object, "status", "installplan", "name") + if ipName == "" { return false, nil } - // Convert result to string - if installPlanNameStr, ok := result.(string); ok && installPlanNameStr != "" { - installPlanName = installPlanNameStr - return true, nil + ip, err := o.client.Resource(installPlanGVR).Namespace(namespace).Get(ctx, ipName, metav1.GetOptions{}) + if errors.IsNotFound(err) { + return false, nil + } + if err != nil { + return false, err } - // Install plan name is empty or not a string, continue waiting + csvs, _, _ := unstructured.NestedStringSlice(ip.Object, "spec", "clusterServiceVersionNames") + for _, name := range csvs { + if name == csv { + matched = ipName + return true, nil + } + } + log.Debugw("install plan does not target desired csv yet", "installPlan", ipName, "wantCSV", csv, "haveCSVs", csvs) return false, nil }) if err != nil { - return "", fmt.Errorf("timeout waiting for subscription %s to have install plan", name) - } - - if installPlanName == "" { - return "", fmt.Errorf("install plan name not found for subscription %s", name) + return "", fmt.Errorf("timeout waiting for install plan targeting csv %s on subscription %s: %w", csv, subName, err) } - - return installPlanName, nil + return matched, nil } func (o *Operator) waitCSVReady(ctx context.Context, name, namespace string) error { From caad477aeccb710f2073abfd7f44b8a3dae95b78 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Tue, 19 May 2026 11:20:41 +0800 Subject: [PATCH 12/15] feat(violet): local package cache + platform credentials via config - VioletConfig.LocalPackageDir: opt-in on-disk cache for downloaded .tgz packages. Layout mirrors the MinIO URL convention so cache hits are trivially diagnosable. SHA-256 verification still runs against cached files; half-written files are removed on download error to prevent false hits on retry. - VioletConfig.PlatformUsername / PlatformPassword: opt-in config-side credentials for environments where VIOLET_PLATFORM_* env injection is awkward. Config wins, env falls back, both feed --platform-*. Resolved password also lands in RedactSecrets so violet's own stdout cannot leak. - Refactor downloadToTemp into downloadFile + per-call wrapper so the cache path and the legacy /tmp path share the same HTTP/error logic. --- CLAUDE.md | 7 +- pkg/config/config.go | 30 ++++++ pkg/operator/operatorhub/violet.go | 154 +++++++++++++++++++++-------- 3 files changed, 147 insertions(+), 44 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 7638671..c3be655 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -41,7 +41,7 @@ export KUBECONFIG= - **`operatorhub`(默认)** —— `pkg/operator/operatorhub/`。生产路径。Artifact / ArtifactVersion 的**写路径**已外包给 `violet` 二进制(子进程,见 `violet.go::installViaViolet`);Go 侧仍用 dynamic client 做以下三类只读 / OLM 资源: - Alauda 自定义资源 `app.alauda.io/v1alpha1` 下的 `Artifact` / `ArtifactVersion`:**Go 端只剩 Get + Delete**(清理残留 AV、等 AV phase=Present、读 status.version 拿 CSV);Create / Patch / Update 全归 violet。命名空间硬编码 `cpaas-system`,OLM 源名硬编码 `platform`(const `targetCatalogSource` in `operator.go`)。 - OLM 标准资源 `Subscription` / `InstallPlan` / `ClusterServiceVersion` / `PackageManifest` - - 升级流程:`InstallArtifactVersion`(下载 .tgz → 可选 sha256 校验 → 删除同名 AV 残留 → 调 `violet push` → 等 AV phase=Present 并 cross-check spec.tag → 等 PackageManifest 出现对应 CSV)→ `InstallSubscription`(先删旧 Sub + 旧 CSV → 创建 Subscription `installPlanApproval: Manual` → 等 InstallPlan → 把 `spec.approved=true` → 等 CSV phase=Succeeded) + - 升级流程:`InstallArtifactVersion`(下载 .tgz → 可选 sha256 校验 → 删除同名 AV 残留 → 调 `violet push` → 等 AV phase=Present 并 cross-check spec.tag → 等 PackageManifest 出现对应 CSV)→ `InstallSubscription`:**Subscription 不存在则创建**(`installPlanApproval: Manual`, `startingCSV=目标 CSV`);**Subscription 已存在则 in-place refresh**(必要时 patch `spec.channel`,并总是 bump `upgrade-test.alauda.io/refresh-trigger` 注解强制 OLM 重新 reconcile,**不删除、不重建**——依赖 OLM replace chain 滚动到新 CSV)→ 等 InstallPlan(用 `waitInstallPlanForCSV` 匹配 `spec.clusterServiceVersionNames` 包含目标 CSV,避免拿到 `status.installplan.name` 短暂指向的旧 IP)→ patch `spec.approved=true` → 等 CSV phase=Succeeded - **`local`** —— `pkg/operator/local/operator.go`。本地开发用,直接在 workspace 里跑 `make deploy`(可被 `operatorConfig.command` 覆盖),不接 OLM。 新增 Operator 类型时:在 `pkg/operator/` 下加子包实现 `OperatorInterface`,并在 `factory.go` 的 `CreateOperator` switch 中注册。 @@ -61,11 +61,12 @@ Artifact 名称约定:未显式给 `artifact` 字段时,自动拼成 `///.latest.ALL..tgz` 这个 mirror MinIO URL 的布局检查缓存:命中跳过 HTTP;miss 直接下载到该路径(父目录自动 mkdir),不再走 `/tmp`,下次自动命中;任一路径下都 **不会清理**(cleanup 为 noop),保留为缓存。`VerifySha256` 即使命中也会执行——避免损坏的缓存文件被静默喂给 violet。留空保持旧行为:下载到一次性 `/tmp/upgrade-violet-*` 并在 `defer cleanup()` 中删除,无跨次复用。下载半途失败时 cache 路径的半成品会被 `os.Remove` 清掉,防止下次假命中。 ## PR / 协作流程 diff --git a/pkg/config/config.go b/pkg/config/config.go index f6ba286..6cc0c58 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -96,6 +96,36 @@ type VioletConfig struct { // "updated successfully" while the CRs never appear in the cluster the // upgrade CLI is watching. Clusters string `yaml:"clusters,omitempty"` + + // PlatformUsername / PlatformPassword authenticate violet against the + // ACP platform for Artifact/AV writes. They take precedence over the + // environment variables VIOLET_PLATFORM_USERNAME / _PASSWORD, which + // remain supported as a fallback for CI/pipeline injection. When both + // are set, the config value wins. + // + // WARNING: writing credentials here will commit them to git if the + // config file is checked in. Prefer environment variables in shared + // repos; reserve this field for local one-off runs or for configs + // stored outside source control. + PlatformUsername string `yaml:"platformUsername,omitempty"` + PlatformPassword string `yaml:"platformPassword,omitempty"` + + // LocalPackageDir, when non-empty, is the on-disk cache root for + // downloaded .tgz packages. Layout mirrors the MinIO URL convention: + // + // ///.latest.ALL..tgz + // + // On cache hit the HTTP download is skipped. On miss the file is + // downloaded directly into the cache path (parent dirs auto-created) + // so subsequent runs hit the cache. SHA-256 verification (when + // ExpectedSha256 is set) still runs against the cached file — a + // corrupted cache entry will surface as a verification failure rather + // than silently propagate. Relative paths resolve to the upgrade CLI + // working directory. + // + // Leave empty to keep the legacy behavior: download to a per-call + // /tmp dir and remove on exit (no cross-run reuse). + LocalPackageDir string `yaml:"localPackageDir,omitempty"` } // UpgradePath represents a single upgrade path diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index ec6201e..f1c01ad 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -117,14 +117,22 @@ type VioletPushParams struct { SkipPush bool PlatformAddress string Clusters string - PushArgs []string + // PlatformUsername / PlatformPassword come from VioletConfig and, when + // non-empty, override the env-var fallback. See BuildVioletPushArgs for + // the precedence rule. + PlatformUsername string + PlatformPassword string + PushArgs []string } -// BuildVioletPushArgs assembles the argv for `violet push `. Credentials -// are read from environment variables and injected when non-empty: +// BuildVioletPushArgs assembles the argv for `violet push `. // -// VIOLET_REGISTRY_USERNAME / _PASSWORD → --username / --password -// VIOLET_PLATFORM_USERNAME / _PASSWORD → --platform-username / --platform-password +// Credentials precedence: +// - Platform: VioletPushParams.PlatformUsername / PlatformPassword win when +// non-empty; otherwise fall back to VIOLET_PLATFORM_USERNAME / _PASSWORD. +// Mapped to --platform-username / --platform-password. +// - Registry: env-only — VIOLET_REGISTRY_USERNAME / _PASSWORD → --username / --password. +// Used in private-registry push flow (skipPush:false). // // PushArgs is appended verbatim so unsupported flags can still be threaded // through without a code change. @@ -153,11 +161,19 @@ func BuildVioletPushArgs(p VioletPushParams) ([]string, error) { if p2 := os.Getenv(EnvVioletRegistryPassword); p2 != "" { args = append(args, flagPassword, p2) } - if u := os.Getenv(EnvVioletPlatformUsername); u != "" { - args = append(args, flagPlatformUsername, u) + platformUser := p.PlatformUsername + if platformUser == "" { + platformUser = os.Getenv(EnvVioletPlatformUsername) } - if p2 := os.Getenv(EnvVioletPlatformPassword); p2 != "" { - args = append(args, flagPlatformPassword, p2) + if platformUser != "" { + args = append(args, flagPlatformUsername, platformUser) + } + platformPass := p.PlatformPassword + if platformPass == "" { + platformPass = os.Getenv(EnvVioletPlatformPassword) + } + if platformPass != "" { + args = append(args, flagPlatformPassword, platformPass) } args = append(args, p.PushArgs...) return args, nil @@ -230,11 +246,10 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) if err != nil { return nil, "", fmt.Errorf("build package url: %w", err) } - log.Infow("downloading violet package", "url", url) - tgzPath, cleanup, err := downloadToTemp(ctx, url, o.timeout) + tgzPath, cleanup, err := o.acquirePackage(ctx, url, version) if err != nil { - return nil, "", fmt.Errorf("download %s: %w", url, err) + return nil, "", fmt.Errorf("acquire package %s: %w", url, err) } defer cleanup() @@ -279,6 +294,49 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) return av, csv, nil } +// acquirePackage returns a local .tgz path for `version` and a cleanup +// callback to release any transient resources. When VioletConfig.LocalPackageDir +// is non-empty the on-disk cache layout mirrors the MinIO URL convention; on +// hit the HTTP fetch is skipped, on miss the file is downloaded directly into +// the cache path (so subsequent runs hit). The returned cleanup is a no-op in +// the cache path because surviving the run is the whole point. +// +// When LocalPackageDir is empty the legacy flow runs: download to a per-call +// /tmp directory and remove on cleanup, no cross-run reuse. +func (o *Operator) acquirePackage(ctx context.Context, rawURL string, version config.Version) (string, func(), error) { + log := logging.FromContext(ctx) + noop := func() {} + + if o.violet.LocalPackageDir != "" { + cachePath := filepath.Join( + o.violet.LocalPackageDir, + o.name, + version.EffectivePackageChannel(), + fmt.Sprintf("%s.latest.ALL.%s.tgz", o.name, version.BundleVersion), + ) + if info, err := os.Stat(cachePath); err == nil && !info.IsDir() { + log.Infow("reusing cached violet package", "path", cachePath, "size", info.Size()) + return cachePath, noop, nil + } else if err != nil && !os.IsNotExist(err) { + return "", nil, fmt.Errorf("stat cache %s: %w", cachePath, err) + } + log.Infow("downloading violet package to cache", "url", rawURL, "path", cachePath) + if err := os.MkdirAll(filepath.Dir(cachePath), 0o755); err != nil { + return "", nil, fmt.Errorf("mkdir cache dir: %w", err) + } + if err := downloadFile(ctx, rawURL, cachePath, o.timeout); err != nil { + // Drop a half-written file so the next run does a clean retry + // instead of falsely "hitting" the cache with a truncated blob. + _ = os.Remove(cachePath) + return "", nil, err + } + return cachePath, noop, nil + } + + log.Infow("downloading violet package", "url", rawURL) + return downloadToTemp(ctx, rawURL, o.timeout) +} + // downloadToTemp fetches rawURL into a fresh temporary directory and returns // the local file path plus a cleanup function. The caller MUST invoke cleanup // (typically via defer) once the file is no longer needed. The temp dir is @@ -288,9 +346,6 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) // applied via a derived context so a slow or half-open MinIO peer cannot pin // the goroutine for hours waiting on OS-level TCP keepalive. func downloadToTemp(ctx context.Context, rawURL string, timeout time.Duration) (string, func(), error) { - dlCtx, cancel := context.WithTimeout(ctx, timeout) - defer cancel() - dir, err := os.MkdirTemp("", "upgrade-violet-*") if err != nil { return "", nil, fmt.Errorf("mkdir temp: %w", err) @@ -309,45 +364,53 @@ func downloadToTemp(ctx context.Context, rawURL string, timeout time.Duration) ( } filePath := filepath.Join(dir, fileName) - req, err := http.NewRequestWithContext(dlCtx, http.MethodGet, rawURL, nil) - if err != nil { + if err := downloadFile(ctx, rawURL, filePath, timeout); err != nil { cleanup() - return "", nil, fmt.Errorf("build request: %w", err) + return "", nil, err } + return filePath, cleanup, nil +} +// downloadFile streams rawURL into destPath. The destination's parent +// directory must already exist (caller is responsible). On any error the +// caller should clean up destPath — downloadFile does not, because it does +// not know whether destPath is a throwaway tmp file or a persistent cache +// entry that the caller wants to retry into. +// +// timeout bounds the entire HTTP exchange (dial + headers + body) via a +// derived context, so a slow or half-open peer cannot pin the goroutine. +func downloadFile(ctx context.Context, rawURL, destPath string, timeout time.Duration) error { + dlCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + req, err := http.NewRequestWithContext(dlCtx, http.MethodGet, rawURL, nil) + if err != nil { + return fmt.Errorf("build request: %w", err) + } resp, err := http.DefaultClient.Do(req) if err != nil { - cleanup() - return "", nil, fmt.Errorf("http get: %w", err) + return fmt.Errorf("http get: %w", err) } defer resp.Body.Close() - if resp.StatusCode < 200 || resp.StatusCode >= 300 { - cleanup() - return "", nil, fmt.Errorf("unexpected status %d from %s", resp.StatusCode, rawURL) + return fmt.Errorf("unexpected status %d from %s", resp.StatusCode, rawURL) } - - f, err := os.Create(filePath) + f, err := os.Create(destPath) if err != nil { - cleanup() - return "", nil, fmt.Errorf("create %s: %w", filePath, err) + return fmt.Errorf("create %s: %w", destPath, err) } - // Explicit close-and-check: on many filesystems write buffers are only // flushed during Close(), so swallowing its error via defer can let a // truncated .tgz reach VerifySha256 (which is optional today) and then // violet push, producing a misleading "violet push:" error. if _, err := io.Copy(f, resp.Body); err != nil { _ = f.Close() - cleanup() - return "", nil, fmt.Errorf("copy body to %s: %w", filePath, err) + return fmt.Errorf("copy body to %s: %w", destPath, err) } if err := f.Close(); err != nil { - cleanup() - return "", nil, fmt.Errorf("close %s after download: %w", filePath, err) + return fmt.Errorf("close %s after download: %w", destPath, err) } - - return filePath, cleanup, nil + return nil } // deleteArtifactVersionIfExists removes a stale ArtifactVersion with the same @@ -418,11 +481,13 @@ func (o *Operator) execVioletPush(ctx context.Context, tgzPath string) error { skipPush = *o.violet.SkipPush } args, err := BuildVioletPushArgs(VioletPushParams{ - TgzPath: tgzPath, - SkipPush: skipPush, - PlatformAddress: o.violet.PlatformAddress, - Clusters: o.violet.Clusters, - PushArgs: o.violet.PushArgs, + TgzPath: tgzPath, + SkipPush: skipPush, + PlatformAddress: o.violet.PlatformAddress, + Clusters: o.violet.Clusters, + PlatformUsername: o.violet.PlatformUsername, + PlatformPassword: o.violet.PlatformPassword, + PushArgs: o.violet.PushArgs, }) if err != nil { return err @@ -432,14 +497,21 @@ func (o *Operator) execVioletPush(ctx context.Context, tgzPath string) error { // Also redact the credential values themselves from violet's own // stdout/stderr — MaskCommand only protects the line WE log, not what - // violet itself might echo (e.g. verbose argv dumps). + // violet itself might echo (e.g. verbose argv dumps). Resolve platform + // password with the same precedence as BuildVioletPushArgs (config wins, + // env falls back) so the redactor sees whichever value will actually be + // passed to violet. + platformPass := o.violet.PlatformPassword + if platformPass == "" { + platformPass = os.Getenv(EnvVioletPlatformPassword) + } result := exec.RunCommand(ctx, exec.Command{ Name: bin, Args: args, EnvAllowlist: violetEnvAllowlist, RedactSecrets: []string{ os.Getenv(EnvVioletRegistryPassword), - os.Getenv(EnvVioletPlatformPassword), + platformPass, }, }) return result.Err From 12f63d2a437c2ab3129da0a38ee2f3ea0d85f5d3 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Tue, 19 May 2026 11:21:24 +0800 Subject: [PATCH 13/15] fix: address PR review critical findings - Drop dead createArtifactVersion: violet binary owns the write path now; git history serves the "diff vs old impl" rationale better than keeping unreachable code in the file. - Wrap refreshSubscriptionForUpgrade and approveInstallPlan in retry.RetryOnConflict so a concurrent OLM status write does not abort the upgrade with a single 409. Get moves inside the retry block so each attempt sees the freshest resourceVersion. - Move Version.Channel required-check from UpgradeOperator runtime path to validateConfig at LoadConfig time. Catches the typo at load instead of after a half-finished upgrade. Gated on Type=="operatorhub" so the local flow (which never reads Channel) stays untouched. Tests: 3 new subscription cases (retry on 409, idempotent skip), 4 new config cases (missing-channel rejection, local exemption, defaulted type still validates, valid path loads cleanly). --- pkg/config/config.go | 40 +++- pkg/config/config_test.go | 129 +++++++++++++ pkg/operator/operatorhub/artifact_versiong.go | 47 ----- pkg/operator/operatorhub/operator.go | 10 +- pkg/operator/operatorhub/subscription.go | 93 ++++++---- pkg/operator/operatorhub/subscription_test.go | 174 ++++++++++++++++++ 6 files changed, 400 insertions(+), 93 deletions(-) create mode 100644 pkg/config/config_test.go create mode 100644 pkg/operator/operatorhub/subscription_test.go diff --git a/pkg/config/config.go b/pkg/config/config.go index 6cc0c58..219be51 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -3,6 +3,7 @@ package config import ( + "fmt" "os" "time" @@ -176,7 +177,11 @@ func (v Version) EffectivePackageChannel() string { return v.Channel } -// LoadConfig loads the configuration from a YAML file +// LoadConfig loads the configuration from a YAML file, fills defaults, and +// validates that the result is internally consistent. Validation runs after +// defaulting so it sees the same shape the runtime will use — e.g. an empty +// OperatorConfig.Type already resolves to "operatorhub" by the time we +// check whether per-version Channel fields are required. func LoadConfig(path string) (*Config, error) { data, err := os.ReadFile(path) if err != nil { @@ -188,7 +193,38 @@ func LoadConfig(path string) (*Config, error) { return nil, err } - return defaultConfig(&config), nil + cfg := defaultConfig(&config) + if err := validateConfig(cfg); err != nil { + return nil, err + } + return cfg, nil +} + +// validateConfig fails fast on shape errors that would otherwise only surface +// mid-upgrade (e.g. when path 2 step 3 happens to need the missing field). +// Catching them at load time avoids partially-completed upgrades that leave +// the cluster in an awkward state. +// +// Today this only enforces "Channel is required for operatorhub-type runs", +// but it is the right home for future cross-field checks (e.g. violet.bin +// existence, bundleVersion uniqueness within a path). +func validateConfig(cfg *Config) error { + // Channel feeds both the MinIO download URL (BuildPackageURL refuses an + // empty channel) and Subscription.spec.channel (no safe fallback — a + // silent "stable" default would install the wrong CSV stream on a typo). + // The local operator type runs `make deploy` and never reads Channel, so + // the requirement is gated on Type to avoid forcing irrelevant fields on + // local-dev configs. + if cfg.OperatorConfig.Type == "operatorhub" { + for i, path := range cfg.UpgradePaths { + for j, v := range path.Versions { + if v.Channel == "" { + return fmt.Errorf("upgradePaths[%d].versions[%d] (%q): channel is required for operatorhub type", i, j, v.Name) + } + } + } + } + return nil } func defaultConfig(config *Config) *Config { diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go new file mode 100644 index 0000000..7edf12c --- /dev/null +++ b/pkg/config/config_test.go @@ -0,0 +1,129 @@ +package config + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +// writeTempYAML drops content into a temp file and returns its path. Kept +// inline (not a shared helper) because each table case is small and a helper +// would obscure what each YAML actually exercises. +func writeTempYAML(t *testing.T, content string) string { + t.Helper() + dir := t.TempDir() + path := filepath.Join(dir, "upgrade.yaml") + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("write fixture: %v", err) + } + return path +} + +// TestLoadConfig_RejectsMissingChannelForOperatorhub locks the §1.3 fix in: +// the upgrade flow used to discover an empty channel mid-run (path N, version +// M), wasting all the prior steps. After defaults run, validateConfig must +// refuse to return a Config that would crash later. +func TestLoadConfig_RejectsMissingChannelForOperatorhub(t *testing.T) { + yaml := ` +operatorConfig: + type: operatorhub + name: tektoncd-operator + namespace: tektoncd-pipelines +upgradePaths: + - name: main + versions: + - name: first + bundleVersion: 4.0.17 + channel: stable + - name: second + bundleVersion: 4.6.3 + # channel deliberately omitted +` + _, err := LoadConfig(writeTempYAML(t, yaml)) + if err == nil { + t.Fatal("LoadConfig must reject operatorhub config with missing channel") + } + // The error should point at the offending version so a real ops user can + // find the line in their YAML quickly — index + name are both useful. + msg := err.Error() + if !strings.Contains(msg, "channel is required") || !strings.Contains(msg, "second") { + t.Errorf("error should name the missing field and the failing version, got: %v", err) + } +} + +// TestLoadConfig_AllowsMissingChannelForLocal documents the boundary of the +// rule: the local operator type runs `make deploy` and never touches Channel, +// so forcing it on local-dev configs would break valid setups. +func TestLoadConfig_AllowsMissingChannelForLocal(t *testing.T) { + yaml := ` +operatorConfig: + type: local + name: tektoncd-operator +upgradePaths: + - name: main + versions: + - name: first + bundleVersion: 4.0.17 + # local flow does not need a channel +` + cfg, err := LoadConfig(writeTempYAML(t, yaml)) + if err != nil { + t.Fatalf("local type with empty channel should load: %v", err) + } + if cfg.OperatorConfig.Type != "local" { + t.Errorf("Type = %q, want %q", cfg.OperatorConfig.Type, "local") + } +} + +// TestLoadConfig_AppliesOperatorhubDefaultWhenTypeOmitted catches a subtle +// failure mode: if validateConfig ran *before* defaultConfig, a config with +// no explicit type would skip channel validation, then crash at runtime when +// operatorhub kicked in. This test pins the ordering down. +func TestLoadConfig_AppliesOperatorhubDefaultWhenTypeOmitted(t *testing.T) { + yaml := ` +operatorConfig: + name: tektoncd-operator +upgradePaths: + - name: main + versions: + - name: first + bundleVersion: 4.0.17 + # missing channel — should be caught because type defaults to operatorhub +` + _, err := LoadConfig(writeTempYAML(t, yaml)) + if err == nil { + t.Fatal("missing channel under defaulted operatorhub type must fail") + } + if !strings.Contains(err.Error(), "channel is required") { + t.Errorf("expected channel-required error, got: %v", err) + } +} + +// TestLoadConfig_ValidOperatorhubLoadsCleanly is the positive-case smoke test +// so future tightening of validateConfig does not accidentally break the +// happy path. +func TestLoadConfig_ValidOperatorhubLoadsCleanly(t *testing.T) { + yaml := ` +operatorConfig: + type: operatorhub + name: tektoncd-operator + namespace: tektoncd-pipelines +upgradePaths: + - name: main + versions: + - name: first + bundleVersion: 4.0.17 + channel: stable + - name: second + bundleVersion: 4.6.3 + channel: stable +` + cfg, err := LoadConfig(writeTempYAML(t, yaml)) + if err != nil { + t.Fatalf("valid config should load: %v", err) + } + if got := len(cfg.UpgradePaths[0].Versions); got != 2 { + t.Errorf("Versions length = %d, want 2", got) + } +} diff --git a/pkg/operator/operatorhub/artifact_versiong.go b/pkg/operator/operatorhub/artifact_versiong.go index e039d38..9bbaad0 100644 --- a/pkg/operator/operatorhub/artifact_versiong.go +++ b/pkg/operator/operatorhub/artifact_versiong.go @@ -23,53 +23,6 @@ func (o *Operator) InstallArtifactVersion(ctx context.Context, version config.Ve return o.installViaViolet(ctx, version) } -// createArtifactVersion is no longer reachable from InstallArtifactVersion as -// of PR 2 — the violet binary owns the write path. Kept here as dead code for -// one release cycle so PR 2 regressions can be diff-compared against the -// previous in-process implementation. PR 3 deletes this function. -// -// Deprecated: use installViaViolet via InstallArtifactVersion. -func (o *Operator) createArtifactVersion(ctx context.Context, version string, artifact *unstructured.Unstructured) (*unstructured.Unstructured, error) { - avName := fmt.Sprintf("%s.%s", artifact.GetName(), version) - av, err := o.GetResource(ctx, avName, systemNamespace, artifactVersionGVR) - if err == nil && av != nil { - return av, nil - } - - av = &unstructured.Unstructured{ - Object: map[string]interface{}{ - "apiVersion": "app.alauda.io/v1alpha1", - "kind": "ArtifactVersion", - "metadata": map[string]interface{}{ - "name": avName, - "namespace": systemNamespace, - "annotations": map[string]interface{}{ - "kubectl-artifact": "kubectl-artfact", - }, - "labels": map[string]interface{}{ - "cpaas.io/artifact-version": artifact.GetName(), - "cpaas.io/library": "platform", - }, - }, - "spec": map[string]interface{}{ - "present": true, - "tag": version, - }, - }, - } - - av.SetOwnerReferences([]metav1.OwnerReference{ - { - APIVersion: artifact.GetAPIVersion(), - Kind: artifact.GetKind(), - Name: artifact.GetName(), - UID: artifact.GetUID(), - }, - }) - - return o.client.Resource(artifactVersionGVR).Namespace(systemNamespace).Create(ctx, av, metav1.CreateOptions{}) -} - func (o *Operator) waitArtifactVersionPresent(ctx context.Context, name string) (*unstructured.Unstructured, error) { log := logging.FromContext(ctx) lastResource := &unstructured.Unstructured{} diff --git a/pkg/operator/operatorhub/operator.go b/pkg/operator/operatorhub/operator.go index 7f366c8..940ad42 100644 --- a/pkg/operator/operatorhub/operator.go +++ b/pkg/operator/operatorhub/operator.go @@ -107,14 +107,8 @@ func (o *Operator) GetResource(ctx context.Context, name, namespace string, gvr } func (o *Operator) UpgradeOperator(ctx context.Context, version config.Version) error { - // version.Channel is required end-to-end: BuildPackageURL already refuses - // an empty channel, and InstallSubscription needs an exact channel name - // (a silent "stable" fallback would route the Subscription through the - // wrong OLM channel and install the wrong CSV stream on a typo). - if version.Channel == "" { - return fmt.Errorf("version.channel is required") - } - + // Channel non-emptiness is guaranteed by config.validateConfig at load + // time — see pkg/config/config.go::validateConfig. _, csv, err := o.InstallArtifactVersion(ctx, version) if err != nil { return fmt.Errorf("failed to prepare operator: %v", err) diff --git a/pkg/operator/operatorhub/subscription.go b/pkg/operator/operatorhub/subscription.go index 6578a58..024bc3e 100644 --- a/pkg/operator/operatorhub/subscription.go +++ b/pkg/operator/operatorhub/subscription.go @@ -9,6 +9,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" "knative.dev/pkg/logging" ) @@ -37,7 +38,7 @@ func (o *Operator) InstallSubscription(ctx context.Context, csv string, channel log := logging.FromContext(ctx) log.Infow("ensuring subscription for upgrade", "csv", csv, "channel", channel, "namespace", o.namespace) - existing, err := o.client.Resource(subscriptionGVR).Namespace(o.namespace).Get(ctx, o.name, metav1.GetOptions{}) + _, err := o.client.Resource(subscriptionGVR).Namespace(o.namespace).Get(ctx, o.name, metav1.GetOptions{}) switch { case errors.IsNotFound(err): log.Infow("subscription absent, creating fresh", "name", o.name, "csv", csv, "channel", channel) @@ -48,7 +49,7 @@ func (o *Operator) InstallSubscription(ctx context.Context, csv string, channel return fmt.Errorf("failed to get subscription: %v", err) default: log.Infow("subscription exists, rolling forward in place", "name", o.name) - if err := o.refreshSubscriptionForUpgrade(ctx, existing, channel); err != nil { + if err := o.refreshSubscriptionForUpgrade(ctx, channel); err != nil { return fmt.Errorf("failed to refresh subscription: %v", err) } } @@ -93,52 +94,72 @@ const refreshAnnotation = "upgrade-test.alauda.io/refresh-trigger" // We deliberately do NOT delete and recreate the Subscription, restart the // catalog-operator, or touch the CatalogSource — annotation bump is the // lightest mechanism that still guarantees a reconcile. -func (o *Operator) refreshSubscriptionForUpgrade(ctx context.Context, sub *unstructured.Unstructured, channel string) error { +// +// Get is performed inside RetryOnConflict so that a 409 (OLM controller +// updated status concurrently) re-reads the latest resourceVersion before +// retrying the mutation. Without this, a busy OLM made the first upgrade +// step flaky on tektoncd-operator and similar high-traffic Subscriptions. +func (o *Operator) refreshSubscriptionForUpgrade(ctx context.Context, channel string) error { log := logging.FromContext(ctx) - current, _, _ := unstructured.NestedString(sub.Object, "spec", "channel") - if current != channel { - log.Infow("patching subscription channel", "from", current, "to", channel) - if err := unstructured.SetNestedField(sub.Object, channel, "spec", "channel"); err != nil { - return fmt.Errorf("set spec.channel: %w", err) + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + sub, err := o.client.Resource(subscriptionGVR).Namespace(o.namespace).Get(ctx, o.name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get subscription: %w", err) } - } else { - log.Infow("subscription channel already matches target", "channel", channel) - } - annotations, _, _ := unstructured.NestedStringMap(sub.Object, "metadata", "annotations") - if annotations == nil { - annotations = map[string]string{} - } - stamp := time.Now().UTC().Format(time.RFC3339Nano) - annotations[refreshAnnotation] = stamp - if err := unstructured.SetNestedStringMap(sub.Object, annotations, "metadata", "annotations"); err != nil { - return fmt.Errorf("set refresh annotation: %w", err) - } - log.Infow("force-refresh annotation bumped to trigger OLM reconcile", "annotation", refreshAnnotation, "value", stamp) + current, _, _ := unstructured.NestedString(sub.Object, "spec", "channel") + if current != channel { + log.Infow("patching subscription channel", "from", current, "to", channel) + if err := unstructured.SetNestedField(sub.Object, channel, "spec", "channel"); err != nil { + return fmt.Errorf("set spec.channel: %w", err) + } + } else { + log.Infow("subscription channel already matches target", "channel", channel) + } - _, err := o.client.Resource(subscriptionGVR).Namespace(o.namespace).Update(ctx, sub, metav1.UpdateOptions{}) - return err + annotations, _, _ := unstructured.NestedStringMap(sub.Object, "metadata", "annotations") + if annotations == nil { + annotations = map[string]string{} + } + stamp := time.Now().UTC().Format(time.RFC3339Nano) + annotations[refreshAnnotation] = stamp + if err := unstructured.SetNestedStringMap(sub.Object, annotations, "metadata", "annotations"); err != nil { + return fmt.Errorf("set refresh annotation: %w", err) + } + log.Infow("force-refresh annotation bumped to trigger OLM reconcile", "annotation", refreshAnnotation, "value", stamp) + + _, err = o.client.Resource(subscriptionGVR).Namespace(o.namespace).Update(ctx, sub, metav1.UpdateOptions{}) + return err + }) } // approveInstallPlan flips spec.approved=true. Idempotent: if the plan is // already approved (e.g. retry after a prior partial run), it returns nil // without issuing an Update. +// +// Get + Update run inside RetryOnConflict so a concurrent OLM status write +// does not turn the approval into a hard failure — the upgrade flow has no +// pre-approved state to roll back to and a single 409 here would otherwise +// abort the entire upgrade path. func (o *Operator) approveInstallPlan(ctx context.Context, name, namespace string) error { log := logging.FromContext(ctx) - ip, err := o.client.Resource(installPlanGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("get install plan: %w", err) - } - if approved, _, _ := unstructured.NestedBool(ip.Object, "spec", "approved"); approved { - log.Infow("install plan already approved, skipping update", "name", name) - return nil - } - if err := unstructured.SetNestedField(ip.Object, true, "spec", "approved"); err != nil { - return fmt.Errorf("set spec.approved: %w", err) - } - _, err = o.client.Resource(installPlanGVR).Namespace(namespace).Update(ctx, ip, metav1.UpdateOptions{}) - return err + + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + ip, err := o.client.Resource(installPlanGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get install plan: %w", err) + } + if approved, _, _ := unstructured.NestedBool(ip.Object, "spec", "approved"); approved { + log.Infow("install plan already approved, skipping update", "name", name) + return nil + } + if err := unstructured.SetNestedField(ip.Object, true, "spec", "approved"); err != nil { + return fmt.Errorf("set spec.approved: %w", err) + } + _, err = o.client.Resource(installPlanGVR).Namespace(namespace).Update(ctx, ip, metav1.UpdateOptions{}) + return err + }) } func (o *Operator) createSubscription(ctx context.Context, name, namespace, csv string, channel string) (*unstructured.Unstructured, error) { diff --git a/pkg/operator/operatorhub/subscription_test.go b/pkg/operator/operatorhub/subscription_test.go new file mode 100644 index 0000000..eaa6cda --- /dev/null +++ b/pkg/operator/operatorhub/subscription_test.go @@ -0,0 +1,174 @@ +package operatorhub + +import ( + "context" + "sync/atomic" + "testing" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic/fake" + k8stesting "k8s.io/client-go/testing" +) + +// newOperatorForSubscriptionTest builds a minimal *Operator whose dynamic +// client knows about Subscription / InstallPlan kinds. We use a single helper +// so the reactor wiring stays in one place and tests stay focused on the +// retry semantics, not on fake-client plumbing. +func newOperatorForSubscriptionTest(t *testing.T, seed ...runtime.Object) (*Operator, *fake.FakeDynamicClient) { + t.Helper() + scheme := runtime.NewScheme() + scheme.AddKnownTypeWithName(subscriptionGVR.GroupVersion().WithKind("Subscription"), &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(subscriptionGVR.GroupVersion().WithKind("SubscriptionList"), &unstructured.UnstructuredList{}) + scheme.AddKnownTypeWithName(installPlanGVR.GroupVersion().WithKind("InstallPlan"), &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName(installPlanGVR.GroupVersion().WithKind("InstallPlanList"), &unstructured.UnstructuredList{}) + + client := fake.NewSimpleDynamicClient(scheme, seed...) + op := &Operator{ + client: client, + namespace: "test-ns", + name: "tektoncd-operator", + artifact: "operatorhub-tektoncd-operator", + interval: 10 * time.Millisecond, + timeout: 2 * time.Second, + } + return op, client +} + +func newSubscriptionUnstructured(name, namespace, channel string) *unstructured.Unstructured { + return &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": subscriptionGVR.GroupVersion().String(), + "kind": "Subscription", + "metadata": map[string]interface{}{ + "name": name, + "namespace": namespace, + }, + "spec": map[string]interface{}{ + "channel": channel, + }, + }, + } +} + +func newInstallPlanUnstructured(name, namespace string, approved bool) *unstructured.Unstructured { + return &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": installPlanGVR.GroupVersion().String(), + "kind": "InstallPlan", + "metadata": map[string]interface{}{ + "name": name, + "namespace": namespace, + }, + "spec": map[string]interface{}{ + "approved": approved, + }, + }, + } +} + +// TestRefreshSubscriptionForUpgrade_RetriesOnConflict simulates the OLM +// controller updating the Subscription mid-flight: the first Update returns +// 409, the second succeeds. RetryOnConflict must keep the upgrade flow +// going instead of aborting the upgrade path. +func TestRefreshSubscriptionForUpgrade_RetriesOnConflict(t *testing.T) { + sub := newSubscriptionUnstructured("tektoncd-operator", "test-ns", "stable") + op, client := newOperatorForSubscriptionTest(t, sub) + + var updates int32 + client.PrependReactor("update", "subscriptions", func(action k8stesting.Action) (bool, runtime.Object, error) { + // First Update → 409 Conflict. Second Update → fall through to the + // real object tracker. RetryOnConflict applies backoff between + // attempts, so this exercises both the retry loop and the fresh-Get + // inside it. + if atomic.AddInt32(&updates, 1) == 1 { + return true, nil, errors.NewConflict( + schema.GroupResource{Group: subscriptionGVR.Group, Resource: subscriptionGVR.Resource}, + "tektoncd-operator", + nil, + ) + } + return false, nil, nil + }) + + if err := op.refreshSubscriptionForUpgrade(context.Background(), "stable-4.6"); err != nil { + t.Fatalf("refresh should succeed after conflict retry: %v", err) + } + if got := atomic.LoadInt32(&updates); got < 2 { + t.Errorf("expected at least 2 Update attempts after one 409, got %d", got) + } + + // Confirm the final state has the new channel and a refresh annotation. + got, err := client.Resource(subscriptionGVR).Namespace("test-ns").Get(context.Background(), "tektoncd-operator", metav1.GetOptions{}) + if err != nil { + t.Fatalf("final get: %v", err) + } + if ch, _, _ := unstructured.NestedString(got.Object, "spec", "channel"); ch != "stable-4.6" { + t.Errorf("spec.channel = %q, want %q", ch, "stable-4.6") + } + if anno, _, _ := unstructured.NestedString(got.Object, "metadata", "annotations", refreshAnnotation); anno == "" { + t.Errorf("refresh annotation missing on final object") + } +} + +// TestApproveInstallPlan_RetriesOnConflict mirrors the Subscription path: the +// IP's status is updated by OLM mid-flight and our spec.approved patch loses +// the version race once. The retry must re-Get and succeed without bubbling +// the 409 up to the upgrade path. +func TestApproveInstallPlan_RetriesOnConflict(t *testing.T) { + ip := newInstallPlanUnstructured("install-abc123", "test-ns", false) + op, client := newOperatorForSubscriptionTest(t, ip) + + var updates int32 + client.PrependReactor("update", "installplans", func(action k8stesting.Action) (bool, runtime.Object, error) { + if atomic.AddInt32(&updates, 1) == 1 { + return true, nil, errors.NewConflict( + schema.GroupResource{Group: installPlanGVR.Group, Resource: installPlanGVR.Resource}, + "install-abc123", + nil, + ) + } + return false, nil, nil + }) + + if err := op.approveInstallPlan(context.Background(), "install-abc123", "test-ns"); err != nil { + t.Fatalf("approve should succeed after conflict retry: %v", err) + } + if got := atomic.LoadInt32(&updates); got < 2 { + t.Errorf("expected at least 2 Update attempts after one 409, got %d", got) + } + + got, err := client.Resource(installPlanGVR).Namespace("test-ns").Get(context.Background(), "install-abc123", metav1.GetOptions{}) + if err != nil { + t.Fatalf("final get: %v", err) + } + if approved, _, _ := unstructured.NestedBool(got.Object, "spec", "approved"); !approved { + t.Errorf("spec.approved = false, want true after retried approval") + } +} + +// TestApproveInstallPlan_AlreadyApprovedSkipsUpdate guards the idempotent +// short-circuit: when the IP is already approved we must not issue an Update, +// otherwise a re-run after a partial upgrade would generate avoidable API +// traffic (and racy 409s on a busy OLM). +func TestApproveInstallPlan_AlreadyApprovedSkipsUpdate(t *testing.T) { + ip := newInstallPlanUnstructured("install-already-ok", "test-ns", true) + op, client := newOperatorForSubscriptionTest(t, ip) + + var updates int32 + client.PrependReactor("update", "installplans", func(action k8stesting.Action) (bool, runtime.Object, error) { + atomic.AddInt32(&updates, 1) + return false, nil, nil + }) + + if err := op.approveInstallPlan(context.Background(), "install-already-ok", "test-ns"); err != nil { + t.Fatalf("approve idempotent path: %v", err) + } + if got := atomic.LoadInt32(&updates); got != 0 { + t.Errorf("Update must not be called for already-approved IP, got %d calls", got) + } +} From 265947cac5d35bc99ff00ff2e439dfe41d7bff73 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Tue, 19 May 2026 13:32:31 +0800 Subject: [PATCH 14/15] fix(violet): file segment tracks OLM channel, not hardcoded 'latest' BuildPackageURL hardcoded the second-to-last filename segment as `latest`, which worked for tektoncd only because that uploader double-wrote both `latest.ALL` and `.ALL` files. Operators like gitlab-ce that upload a single `.ALL` variant hit a 404 in acquirePackage. Split the API into pathChannel (directory segment, can be a release-train name like `v4.0`) and fileChannel (filename segment, the OLM channel like `stable`). acquirePackage's cache layout follows the same split. --- pkg/operator/operatorhub/violet.go | 24 +++++++++----- pkg/operator/operatorhub/violet_test.go | 44 ++++++++++++++++--------- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index f1c01ad..f858af0 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -87,24 +87,32 @@ func validatePushArgs(args []string) error { // BuildPackageURL composes the .tgz URL by the agreed MinIO convention: // -// ///.latest.ALL..tgz +// ///..ALL..tgz // -// A trailing slash on prefix is tolerated. All four inputs are required; any +// pathChannel is the directory segment (Version.EffectivePackageChannel — may +// be a release-train name like "v4.0" / "rc"). fileChannel is the in-filename +// segment, which uploaders set to the OLM channel (Version.Channel, typically +// "stable"). The two diverge when packageChannel is used to fan out path +// hierarchies, but every uploader keeps the filename tied to the OLM channel. +// +// A trailing slash on prefix is tolerated. All five inputs are required; any // empty value returns an error so callers fail loudly instead of producing a // malformed URL that 404s later. -func BuildPackageURL(prefix, name, channel, bundleVersion string) (string, error) { +func BuildPackageURL(prefix, name, pathChannel, fileChannel, bundleVersion string) (string, error) { switch { case prefix == "": return "", fmt.Errorf("packagePrefix is empty") case name == "": return "", fmt.Errorf("operator name is empty") - case channel == "": - return "", fmt.Errorf("channel is empty (Version.Channel is required when using violet)") + case pathChannel == "": + return "", fmt.Errorf("path channel is empty (Version.EffectivePackageChannel is required when using violet)") + case fileChannel == "": + return "", fmt.Errorf("file channel is empty (Version.Channel is required when using violet)") case bundleVersion == "": return "", fmt.Errorf("bundleVersion is empty") } p := strings.TrimRight(prefix, "/") - return fmt.Sprintf("%s/%s/%s/%s.latest.ALL.%s.tgz", p, name, channel, name, bundleVersion), nil + return fmt.Sprintf("%s/%s/%s/%s.%s.ALL.%s.tgz", p, name, pathChannel, name, fileChannel, bundleVersion), nil } // VioletPushParams is the resolved, pointer-free shape of `violet push` @@ -242,7 +250,7 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) return nil, "", fmt.Errorf("get artifact %s: %w", o.artifact, err) } - url, err := BuildPackageURL(o.violet.PackagePrefix, o.name, version.EffectivePackageChannel(), version.BundleVersion) + url, err := BuildPackageURL(o.violet.PackagePrefix, o.name, version.EffectivePackageChannel(), version.Channel, version.BundleVersion) if err != nil { return nil, "", fmt.Errorf("build package url: %w", err) } @@ -312,7 +320,7 @@ func (o *Operator) acquirePackage(ctx context.Context, rawURL string, version co o.violet.LocalPackageDir, o.name, version.EffectivePackageChannel(), - fmt.Sprintf("%s.latest.ALL.%s.tgz", o.name, version.BundleVersion), + fmt.Sprintf("%s.%s.ALL.%s.tgz", o.name, version.Channel, version.BundleVersion), ) if info, err := os.Stat(cachePath); err == nil && !info.IsDir() { log.Infow("reusing cached violet package", "path", cachePath, "size", info.Size()) diff --git a/pkg/operator/operatorhub/violet_test.go b/pkg/operator/operatorhub/violet_test.go index 29602b9..389424f 100644 --- a/pkg/operator/operatorhub/violet_test.go +++ b/pkg/operator/operatorhub/violet_test.go @@ -47,44 +47,58 @@ func TestBuildPackageURL(t *testing.T) { name string prefix string opName string - channel string + pathChannel string + fileChannel string bundleVersion string want string wantErr bool }{ { - name: "v4.6 channel with v-prefixed version", + name: "tektoncd v4.6 stable: path uses train, file uses OLM channel", prefix: "http://package-minio.alauda.cn:9199/packages/", opName: "tektoncd-operator", - channel: "v4.6", + pathChannel: "v4.6", + fileChannel: "stable", bundleVersion: "v4.6.0", - want: "http://package-minio.alauda.cn:9199/packages/tektoncd-operator/v4.6/tektoncd-operator.latest.ALL.v4.6.0.tgz", + want: "http://package-minio.alauda.cn:9199/packages/tektoncd-operator/v4.6/tektoncd-operator.stable.ALL.v4.6.0.tgz", }, { - name: "rc channel with rc-build suffix", + name: "rc train still uses stable filename segment", prefix: "http://package-minio.alauda.cn:9199/packages/", opName: "tektoncd-operator", - channel: "rc", + pathChannel: "rc", + fileChannel: "stable", bundleVersion: "v4.2.5-rc.76.g976cff6", - want: "http://package-minio.alauda.cn:9199/packages/tektoncd-operator/rc/tektoncd-operator.latest.ALL.v4.2.5-rc.76.g976cff6.tgz", + want: "http://package-minio.alauda.cn:9199/packages/tektoncd-operator/rc/tektoncd-operator.stable.ALL.v4.2.5-rc.76.g976cff6.tgz", }, { - name: "prefix without trailing slash", + name: "gitlab v17.11 stable: cross-operator parity", + prefix: "http://package-minio.alauda.cn:9199/packages/", + opName: "gitlab-ce-operator", + pathChannel: "v17.11", + fileChannel: "stable", + bundleVersion: "v17.11.4", + want: "http://package-minio.alauda.cn:9199/packages/gitlab-ce-operator/v17.11/gitlab-ce-operator.stable.ALL.v17.11.4.tgz", + }, + { + name: "prefix without trailing slash, two channels coincide", prefix: "http://example.com/pkgs", opName: "op", - channel: "stable", + pathChannel: "stable", + fileChannel: "stable", bundleVersion: "v1.0.0", - want: "http://example.com/pkgs/op/stable/op.latest.ALL.v1.0.0.tgz", + want: "http://example.com/pkgs/op/stable/op.stable.ALL.v1.0.0.tgz", }, - {name: "empty prefix", prefix: "", opName: "op", channel: "stable", bundleVersion: "v1.0.0", wantErr: true}, - {name: "empty name", prefix: "http://x/", channel: "stable", bundleVersion: "v1.0.0", wantErr: true}, - {name: "empty channel", prefix: "http://x/", opName: "op", bundleVersion: "v1.0.0", wantErr: true}, - {name: "empty bundleVersion", prefix: "http://x/", opName: "op", channel: "stable", wantErr: true}, + {name: "empty prefix", prefix: "", opName: "op", pathChannel: "stable", fileChannel: "stable", bundleVersion: "v1.0.0", wantErr: true}, + {name: "empty name", prefix: "http://x/", pathChannel: "stable", fileChannel: "stable", bundleVersion: "v1.0.0", wantErr: true}, + {name: "empty path channel", prefix: "http://x/", opName: "op", fileChannel: "stable", bundleVersion: "v1.0.0", wantErr: true}, + {name: "empty file channel", prefix: "http://x/", opName: "op", pathChannel: "stable", bundleVersion: "v1.0.0", wantErr: true}, + {name: "empty bundleVersion", prefix: "http://x/", opName: "op", pathChannel: "stable", fileChannel: "stable", wantErr: true}, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - got, err := BuildPackageURL(tc.prefix, tc.opName, tc.channel, tc.bundleVersion) + got, err := BuildPackageURL(tc.prefix, tc.opName, tc.pathChannel, tc.fileChannel, tc.bundleVersion) if tc.wantErr { if err == nil { t.Fatalf("want error, got url=%q", got) From feb6ba561f35a50e756658dbb00144a1f49f5557 Mon Sep 17 00:00:00 2001 From: "huanyang@alauda.io" Date: Tue, 19 May 2026 14:00:33 +0800 Subject: [PATCH 15/15] fix(violet): drop pre-violet-push Get Artifact, let violet create on first onboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Get only used artifact.GetName(), which equals o.artifact (already known from config at construction time). The Get added no information but turned every "first onboard" target — a fresh platform that has never installed this operator — into a fail-fast: violet push itself creates the Artifact CR when missing, so the pre-check was actively blocking a legitimate path. Use o.artifact directly when composing the AV name. --- pkg/operator/operatorhub/violet.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pkg/operator/operatorhub/violet.go b/pkg/operator/operatorhub/violet.go index f858af0..1a627ce 100644 --- a/pkg/operator/operatorhub/violet.go +++ b/pkg/operator/operatorhub/violet.go @@ -245,11 +245,6 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) log.Infow("installing artifact version via violet", "bundleVersion", version.BundleVersion, "channel", version.Channel) - artifact, err := o.GetResource(ctx, o.artifact, systemNamespace, artifactGVR) - if err != nil { - return nil, "", fmt.Errorf("get artifact %s: %w", o.artifact, err) - } - url, err := BuildPackageURL(o.violet.PackagePrefix, o.name, version.EffectivePackageChannel(), version.Channel, version.BundleVersion) if err != nil { return nil, "", fmt.Errorf("build package url: %w", err) @@ -265,7 +260,7 @@ func (o *Operator) installViaViolet(ctx context.Context, version config.Version) return nil, "", fmt.Errorf("verify sha256: %w", err) } - avName := fmt.Sprintf("%s.%s", artifact.GetName(), version.BundleVersion) + avName := fmt.Sprintf("%s.%s", o.artifact, version.BundleVersion) if err := o.deleteArtifactVersionIfExists(ctx, avName); err != nil { return nil, "", fmt.Errorf("ensure clean AV %s: %w", avName, err) }