From d0c58d39e0631a8a311ea1fa1f9963c7099da7a6 Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Sun, 11 Jan 2026 05:17:30 +0000 Subject: [PATCH 1/2] (chore): add e2e tests for workload resilience when catalog is deleted Assisted-by: Cursor --- test/e2e/features/recover.feature | 150 ++++++++++++++++++++++++++++++ test/e2e/steps/hooks.go | 24 +++-- test/e2e/steps/steps.go | 32 +++++++ 3 files changed, 196 insertions(+), 10 deletions(-) diff --git a/test/e2e/features/recover.feature b/test/e2e/features/recover.feature index 0438f2d1a1..4db8bafe75 100644 --- a/test/e2e/features/recover.feature +++ b/test/e2e/features/recover.feature @@ -115,3 +115,153 @@ Feature: Recover cluster extension from errors that might occur during its lifet Then ClusterExtension is available And ClusterExtension reports Progressing as True with Reason Succeeded And ClusterExtension reports Installed as True + + # CATALOG DELETION RESILIENCE SCENARIOS + + Scenario: Extension continues running after catalog deletion + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "deployment/test-operator" is available + And resource "configmap/test-configmap" is available + When ClusterCatalog "test" is deleted + # Verify controller still maintains resources after catalog deletion by removing and restoring a resource. + # This approach avoids race conditions because: + # - We don't rely on status flags that might be unchanged (e.g., Installed=True before and after) + # - Resource restoration is an observable event that PROVES the controller reconciled after deletion + # - The controller must actively apply manifests to restore the removed resource + And resource "configmap/test-configmap" is removed + Then resource "configmap/test-configmap" is eventually restored + And resource "deployment/test-operator" is available + + Scenario: Resources are restored after catalog deletion + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "configmap/test-configmap" exists + And ClusterCatalog "test" is deleted + When resource "configmap/test-configmap" is removed + Then resource "configmap/test-configmap" is eventually restored + + Scenario: Config changes are allowed even when the catalog does not exist anymore + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterCatalog "test" is deleted + When ClusterExtension is updated to add preflight config + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + install: + preflight: + crdUpgradeSafety: + enforcement: None + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + # Wait for reconciliation of the updated spec (config change should succeed without catalog) + # First ensure the controller has reconciled the new generation (spec update) + And ClusterExtension has reconciled the latest generation + # Config-only changes don't trigger resolution failure (bundle unchanged), so resolution succeeds + # using the installed bundle metadata. Verify reconciliation completed successfully. + And ClusterExtension reports Progressing as True with Reason Succeeded + Then ClusterExtension is available + And ClusterExtension reports Installed as True + + Scenario: Version upgrade does not proceed when catalog does not exist + Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + And ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.0" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + When ClusterCatalog "test" is deleted + And ClusterExtension is updated to version "1.0.1" + # Wait for reconciliation after the version change request + # Note: Retrying status means controller will auto-upgrade when catalog becomes available + And ClusterExtension reports Progressing as True with Reason Retrying + # Verify upgrade did not proceed: version remains at 1.0.0 (not 1.0.1) + Then bundle "test-operator.1.0.0" is installed in version "1.0.0" + And ClusterExtension reports Installed as True diff --git a/test/e2e/steps/hooks.go b/test/e2e/steps/hooks.go index c91072947f..0ec6d74468 100644 --- a/test/e2e/steps/hooks.go +++ b/test/e2e/steps/hooks.go @@ -139,28 +139,32 @@ func stderrOutput(err error) string { return "" } -func ScenarioCleanup(ctx context.Context, _ *godog.Scenario, err error) (context.Context, error) { +func ScenarioCleanup(ctx context.Context, _ *godog.Scenario, scenarioErr error) (context.Context, error) { sc := scenarioCtx(ctx) for _, bgCmd := range sc.backGroundCmds { if p := bgCmd.Process; p != nil { _ = p.Kill() } } - if err != nil { - return ctx, err - } + // Run cleanup ALWAYS, even if scenario failed (to prevent resource leaks into next scenario) forDeletion := []resource{} if sc.clusterExtensionName != "" { forDeletion = append(forDeletion, resource{name: sc.clusterExtensionName, kind: "clusterextension"}) } forDeletion = append(forDeletion, resource{name: sc.namespace, kind: "namespace"}) - go func() { - for _, r := range forDeletion { - if _, err := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true"); err != nil { - logger.Info("Error deleting resource", "name", r.name, "namespace", sc.namespace, "stderr", stderrOutput(err)) + + // Cleanup must be synchronous to ensure proper test isolation. + // If cleanup runs in background, the next scenario may start before resources are deleted. + for _, r := range forDeletion { + // Try graceful deletion first (60s timeout), fall back to force if stuck + if _, err := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true", "--wait=true", "--timeout=60s"); err != nil { + // Force delete if stuck on finalizers (test isolation > graceful cleanup) + if _, forceErr := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true", "--force", "--grace-period=0"); forceErr != nil { + logger.Info("Error force deleting resource", "kind", r.kind, "name", r.name, "stderr", stderrOutput(forceErr)) } } - }() - return ctx, nil + } + + return ctx, scenarioErr } diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go index 8a06161975..e316e21513 100644 --- a/test/e2e/steps/steps.go +++ b/test/e2e/steps/steps.go @@ -55,6 +55,7 @@ func RegisterSteps(sc *godog.ScenarioContext) { sc.Step(`^(?i)ClusterExtension is updated(?:\s+.*)?$`, ResourceIsApplied) sc.Step(`^(?i)ClusterExtension is available$`, ClusterExtensionIsAvailable) sc.Step(`^(?i)ClusterExtension is rolled out$`, ClusterExtensionIsRolledOut) + sc.Step(`^(?i)ClusterExtension has reconciled the latest generation$`, ClusterExtensionReconciledLatestGeneration) sc.Step(`^(?i)ClusterExtension reports "([^"]+)" as active revision(s?)$`, ClusterExtensionReportsActiveRevisions) sc.Step(`^(?i)ClusterExtension reports ([[:alnum:]]+) as ([[:alnum:]]+) with Reason ([[:alnum:]]+) and Message:$`, ClusterExtensionReportsCondition) sc.Step(`^(?i)ClusterExtension reports ([[:alnum:]]+) as ([[:alnum:]]+) with Reason ([[:alnum:]]+)$`, ClusterExtensionReportsConditionWithoutMsg) @@ -86,6 +87,7 @@ func RegisterSteps(sc *godog.ScenarioContext) { sc.Step(`^(?i)ClusterCatalog "([^"]+)" serves bundles$`, CatalogServesBundles) sc.Step(`^"([^"]+)" catalog image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage) sc.Step(`^(?i)ClusterCatalog "([^"]+)" image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage) + sc.Step(`^(?i)ClusterCatalog "([^"]+)" is deleted$`, CatalogIsDeleted) sc.Step(`^(?i)operator "([^"]+)" target namespace is "([^"]+)"$`, OperatorTargetNamespace) sc.Step(`^(?i)Prometheus metrics are returned in the response$`, PrometheusMetricsAreReturned) @@ -249,6 +251,25 @@ func ClusterExtensionIsAvailable(ctx context.Context) error { return nil } +func ClusterExtensionReconciledLatestGeneration(ctx context.Context) error { + sc := scenarioCtx(ctx) + require.Eventually(godog.T(ctx), func() bool { + // Get generation from metadata + genOutput, err := k8sClient("get", "clusterextension", sc.clusterExtensionName, "-o", "jsonpath={.metadata.generation}") + if err != nil || genOutput == "" { + return false + } + // Get observedGeneration from Progressing condition (each condition tracks its own observedGeneration) + obsGenOutput, err := k8sClient("get", "clusterextension", sc.clusterExtensionName, "-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].observedGeneration}") + if err != nil || obsGenOutput == "" { + return false + } + // Both exist and are equal means reconciliation happened + return genOutput == obsGenOutput + }, timeout, tick) + return nil +} + func ClusterExtensionIsRolledOut(ctx context.Context) error { sc := scenarioCtx(ctx) require.Eventually(godog.T(ctx), func() bool { @@ -697,6 +718,17 @@ func TagCatalogImage(name, oldTag, newTag string) error { return crane.Tag(imageRef, newTag, crane.Insecure) } +func CatalogIsDeleted(ctx context.Context, catalogName string) error { + catalogFullName := fmt.Sprintf("%s-catalog", catalogName) + // Using --wait=true makes kubectl wait for the resource to be fully deleted, + // eliminating the need for manual polling + _, err := k8sClient("delete", "clustercatalog", catalogFullName, "--ignore-not-found=true", "--wait=true") + if err != nil { + return fmt.Errorf("failed to delete catalog: %v", err) + } + return nil +} + func PrometheusMetricsAreReturned(ctx context.Context) error { sc := scenarioCtx(ctx) for podName, mr := range sc.metricsResponse { From 8846129e89a7dd3e2302cb0507c6bf29121140da Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Tue, 20 Jan 2026 01:12:43 +0000 Subject: [PATCH 2/2] (fix) catalog deletion resilience support Enables installed extensions to continue working when their source catalog becomes unavailable or is deleted. When resolution fails due to catalog unavailability, the operator now continues reconciling with the currently installed bundle instead of failing. Changes: - Resolution falls back to installed bundle when catalog unavailable - Unpacking skipped when maintaining current installed state - Helm and Boxcutter appliers handle nil contentFS gracefully - Version upgrades properly blocked without catalog access This ensures workloads remain stable and operational even when the catalog they were installed from is temporarily unavailable or deleted, while appropriately preventing version changes that require catalog access. Assisted-by: Cursor --- cmd/operator-controller/main.go | 4 +- .../operator-controller/applier/boxcutter.go | 46 +- .../applier/boxcutter_test.go | 6 +- internal/operator-controller/applier/helm.go | 66 +++ .../controllers/boxcutter_reconcile_steps.go | 5 +- .../boxcutter_reconcile_steps_apply_test.go | 4 +- .../clusterextension_admission_test.go | 4 +- .../clusterextension_controller.go | 2 + .../clusterextension_controller_test.go | 440 ++++++++++++++++++ .../clusterextension_reconcile_steps.go | 204 +++++++- .../controllers/suite_test.go | 2 +- test/utils.go | 1 + 12 files changed, 750 insertions(+), 34 deletions(-) diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index 2d78edda4f..eef9dfc03d 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -623,7 +623,7 @@ func (c *boxcutterReconcilerConfigurator) Configure(ceReconciler *controllers.Cl controllers.HandleFinalizers(c.finalizers), controllers.MigrateStorage(storageMigrator), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundleWithBoxcutter(appl.Apply), } @@ -742,7 +742,7 @@ func (c *helmReconcilerConfigurator) Configure(ceReconciler *controllers.Cluster ceReconciler.ReconcileSteps = []controllers.ReconcileStepFunc{ controllers.HandleFinalizers(c.finalizers), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundle(appl), } diff --git a/internal/operator-controller/applier/boxcutter.go b/internal/operator-controller/applier/boxcutter.go index a10f2cbba8..928f042dc1 100644 --- a/internal/operator-controller/applier/boxcutter.go +++ b/internal/operator-controller/applier/boxcutter.go @@ -307,21 +307,37 @@ func (bc *Boxcutter) createOrUpdate(ctx context.Context, obj client.Object) erro return bc.Client.Patch(ctx, obj, client.Apply, client.FieldOwner(bc.FieldOwner), client.ForceOwnership) } -func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error { - // Generate desired revision - desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) +func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error) { + // Note: We list revisions first (before checking contentFS) because we need the revision list + // to determine if we can fall back when contentFS is nil. If the API call fails here, + // it indicates a serious cluster connectivity issue, and we should not proceed even in fallback mode + // since the ClusterExtensionRevision controller also requires API access to maintain resources. + existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) if err != nil { - return err + return false, "", err } - if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { - return fmt.Errorf("set ownerref: %w", err) + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, we should use the existing installed revision without generating a new one. + if contentFS == nil { + if len(existingRevisions) == 0 { + return false, "", fmt.Errorf("catalog content unavailable and no revision installed") + } + // Returning true here signals that the rollout has succeeded using the current revision. The + // ClusterExtensionRevision controller will continue to reconcile, apply, and maintain the + // resources defined in that revision via Server-Side Apply, ensuring the workload keeps running + // even when catalog access (and thus new revision content) is unavailable. + return true, "", nil } - // List all existing revisions - existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) + // Generate desired revision + desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) if err != nil { - return err + return false, "", err + } + + if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { + return false, "", fmt.Errorf("set ownerref: %w", err) } currentRevision := &ocv1.ClusterExtensionRevision{} @@ -343,7 +359,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust // inplace patch was successful, no changes in phases state = StateUnchanged default: - return fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err) + return false, "", fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err) } } @@ -357,7 +373,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust case StateNeedsInstall: err := preflight.Install(ctx, plainObjs) if err != nil { - return err + return false, "", err } // TODO: jlanford's IDE says that "StateNeedsUpgrade" condition is always true, but // it isn't immediately obvious why that is. Perhaps len(existingRevisions) is @@ -366,7 +382,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust case StateNeedsUpgrade: err := preflight.Upgrade(ctx, plainObjs) if err != nil { - return err + return false, "", err } } } @@ -380,15 +396,15 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust desiredRevision.Spec.Revision = revisionNumber if err = bc.garbageCollectOldRevisions(ctx, prevRevisions); err != nil { - return fmt.Errorf("garbage collecting old revisions: %w", err) + return false, "", fmt.Errorf("garbage collecting old revisions: %w", err) } if err := bc.createOrUpdate(ctx, desiredRevision); err != nil { - return fmt.Errorf("creating new Revision: %w", err) + return false, "", fmt.Errorf("creating new Revision: %w", err) } } - return nil + return true, "", nil } // garbageCollectOldRevisions deletes archived revisions beyond ClusterExtensionRevisionRetentionLimit. diff --git a/internal/operator-controller/applier/boxcutter_test.go b/internal/operator-controller/applier/boxcutter_test.go index df3c0a915a..411879b8ab 100644 --- a/internal/operator-controller/applier/boxcutter_test.go +++ b/internal/operator-controller/applier/boxcutter_test.go @@ -986,14 +986,18 @@ func TestBoxcutter_Apply(t *testing.T) { labels.PackageNameKey: "test-package", } } - err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations) + completed, status, err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations) // Assert if tc.expectedErr != "" { require.Error(t, err) assert.Contains(t, err.Error(), tc.expectedErr) + assert.False(t, completed) + assert.Empty(t, status) } else { require.NoError(t, err) + assert.True(t, completed) + assert.Empty(t, status) } if tc.validate != nil { diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index 22ed096fe1..7450872bdd 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -103,6 +103,16 @@ func (h *Helm) runPreAuthorizationChecks(ctx context.Context, ext *ocv1.ClusterE } func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels map[string]string, storageLabels map[string]string) (bool, string, error) { + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, reconcile the existing Helm release if it exists. + if contentFS == nil { + ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext) + if err != nil { + return false, "", err + } + return h.reconcileExistingRelease(ctx, ac, ext) + } + chrt, err := h.buildHelmChart(contentFS, ext) if err != nil { return false, "", err @@ -197,6 +207,62 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte return true, "", nil } +// reconcileExistingRelease reconciles an existing Helm release without catalog access. +// This is used when the catalog is unavailable but we need to maintain the current installation. +// It reconciles the release to actively maintain resources, and sets up watchers for monitoring/observability. +func (h *Helm) reconcileExistingRelease(ctx context.Context, ac helmclient.ActionInterface, ext *ocv1.ClusterExtension) (bool, string, error) { + rel, err := ac.Get(ext.GetName()) + if errors.Is(err, driver.ErrReleaseNotFound) { + return false, "", fmt.Errorf("catalog content unavailable and no release installed") + } + if err != nil { + return false, "", fmt.Errorf("failed to get current release: %w", err) + } + + // Reconcile the existing release to ensure resources are maintained + if err := ac.Reconcile(rel); err != nil { + // Reconcile failed - resources NOT maintained + // Return false (rollout failed) with error + return false, "", err + } + + // At this point: Reconcile succeeded - resources ARE maintained (applied to cluster via Server-Side Apply) + // The operations below are for setting up watches to detect drift (i.e., if someone manually modifies the + // resources). If watch setup fails, the resources are still successfully maintained, but we won't detect + // and auto-correct manual modifications. We return true (rollout succeeded) and log watch errors. + logger := klog.FromContext(ctx) + + relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name)) + if err != nil { + logger.Error(err, "failed to parse manifest objects, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + logger.V(1).Info("setting up drift detection watches on managed objects") + + // Defensive nil checks to prevent panics if Manager or Watcher not properly initialized + if h.Manager == nil { + logger.Error(fmt.Errorf("manager is nil"), "Manager not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + cache, err := h.Manager.Get(ctx, ext) + if err != nil { + logger.Error(err, "failed to get managed content cache, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + if h.Watcher == nil { + logger.Error(fmt.Errorf("watcher is nil"), "Watcher not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + if err := cache.Watch(ctx, h.Watcher, relObjects...); err != nil { + logger.Error(err, "failed to set up drift detection watches (resources are applied but drift detection disabled)") + return true, "", nil + } + + return true, "", nil +} + func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) { if h.HelmChartProvider == nil { return nil, errors.New("HelmChartProvider is nil") diff --git a/internal/operator-controller/controllers/boxcutter_reconcile_steps.go b/internal/operator-controller/controllers/boxcutter_reconcile_steps.go index 4c9eec36ce..5c746c7b0f 100644 --- a/internal/operator-controller/controllers/boxcutter_reconcile_steps.go +++ b/internal/operator-controller/controllers/boxcutter_reconcile_steps.go @@ -94,7 +94,7 @@ func MigrateStorage(m StorageMigrator) ReconcileStepFunc { } } -func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error) ReconcileStepFunc { +func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error)) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) revisionAnnotations := map[string]string{ @@ -109,7 +109,8 @@ func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, e } l.Info("applying bundle contents") - if err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations); err != nil { + _, _, err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations) + if err != nil { // If there was an error applying the resolved bundle, // report the error via the Progressing condition. setStatusProgressing(ext, wrapErrorWithResolutionInfo(state.resolvedRevisionMetadata.BundleMetadata, err)) diff --git a/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go b/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go index 63aff2b0ad..78adb70090 100644 --- a/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go +++ b/internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go @@ -133,8 +133,8 @@ func TestApplyBundleWithBoxcutter(t *testing.T) { imageFS: fstest.MapFS{}, } - stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) error { - return nil + stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) (bool, string, error) { + return true, "", nil }) result, err := stepFunc(ctx, state, ext) require.NoError(t, err) diff --git a/internal/operator-controller/controllers/clusterextension_admission_test.go b/internal/operator-controller/controllers/clusterextension_admission_test.go index 2f8791999f..6ce9fc3c76 100644 --- a/internal/operator-controller/controllers/clusterextension_admission_test.go +++ b/internal/operator-controller/controllers/clusterextension_admission_test.go @@ -13,7 +13,9 @@ import ( ) func TestClusterExtensionSourceConfig(t *testing.T) { - sourceTypeEmptyError := "Invalid value: null" + // NOTE: Kubernetes validation error format for JSON null values varies across K8s versions. + // We check for the common part "Invalid value:" which appears in all versions. + sourceTypeEmptyError := "Invalid value:" sourceTypeMismatchError := "spec.source.sourceType: Unsupported value" sourceConfigInvalidError := "spec.source: Invalid value" // unionField represents the required Catalog or (future) Bundle field required by SourceConfig diff --git a/internal/operator-controller/controllers/clusterextension_controller.go b/internal/operator-controller/controllers/clusterextension_controller.go index 7f3192b0fd..f381152e6c 100644 --- a/internal/operator-controller/controllers/clusterextension_controller.go +++ b/internal/operator-controller/controllers/clusterextension_controller.go @@ -168,6 +168,8 @@ func (r *ClusterExtensionReconciler) Reconcile(ctx context.Context, req ctrl.Req // ensureAllConditionsWithReason checks that all defined condition types exist in the given ClusterExtension, // and assigns a specified reason and custom message to any missing condition. +// +//nolint:unparam // reason parameter is designed to be flexible, even if current callers use the same value func ensureAllConditionsWithReason(ext *ocv1.ClusterExtension, reason v1alpha1.ConditionReason, message string) { for _, condType := range conditionsets.ConditionTypes { cond := apimeta.FindStatusCondition(ext.Status.Conditions, condType) diff --git a/internal/operator-controller/controllers/clusterextension_controller_test.go b/internal/operator-controller/controllers/clusterextension_controller_test.go index 1b09a43adf..eabdba4f94 100644 --- a/internal/operator-controller/controllers/clusterextension_controller_test.go +++ b/internal/operator-controller/controllers/clusterextension_controller_test.go @@ -1611,3 +1611,443 @@ func TestGetInstalledBundleHistory(t *testing.T) { } } } + +// TestResolutionFallbackToInstalledBundle tests the catalog deletion resilience fallback logic +func TestResolutionFallbackToInstalledBundle(t *testing.T) { + t.Run("falls back when catalog unavailable and no version change", func(t *testing.T) { + resolveAttempt := 0 + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // First reconcile: catalog available, second reconcile: catalog unavailable + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolveAttempt++ + if resolveAttempt == 1 { + // First reconcile: catalog available, resolve to version 1.0.0 + v := bundle.VersionRelease{Version: bsemver.MustParse("1.0.0")} + return &declcfg.Bundle{ + Name: "test.1.0.0", + Package: "test-pkg", + Image: "test-image:1.0.0", + }, &v, &declcfg.Deprecation{}, nil + } + // Second reconcile: catalog unavailable + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + // Applier succeeds (resources maintained) + d.Applier = &MockApplier{ + installCompleted: true, + installStatus: "", + err: nil, + } + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension with no version specified + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - should fall back + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // First reconcile: catalog available, install version 1.0.0 + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Verify status after first install + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Verify all conditions are present and valid after first reconcile + verifyInvariants(ctx, t, cl, ext) + + // Second reconcile: catalog unavailable, should fallback to installed version + // Catalog watch will trigger reconciliation when catalog becomes available again + res, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows successful reconciliation after fallback + require.NoError(t, cl.Get(ctx, extKey, ext)) + + // Version should remain 1.0.0 (maintained from fallback) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Progressing should be Succeeded (apply completed successfully) + progCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Installed should be True (maintaining current version) + instCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + // Verify all conditions remain valid after fallback + verifyInvariants(ctx, t, cl, ext) + }) + + t.Run("fails when version upgrade requested without catalog", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension requesting version upgrade + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Version: "1.0.1", // Requesting upgrade + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should fail (can't upgrade without catalog) + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Error(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows Retrying + require.NoError(t, cl.Get(ctx, extKey, ext)) + + // Progressing should be Retrying (can't resolve without catalog) + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonRetrying, progCond.Reason) + + // Installed should be True (v1.0.0 is already installed per RevisionStatesGetter) + // but we can't upgrade to v1.0.1 without catalog + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + + // Verify all conditions are present and valid + verifyInvariants(ctx, t, cl, ext) + }) + + t.Run("auto-updates when catalog becomes available after fallback", func(t *testing.T) { + resolveAttempt := 0 + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // First attempt: catalog unavailable, then becomes available + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolveAttempt++ + if resolveAttempt == 1 { + // First reconcile: catalog unavailable + return nil, nil, nil, fmt.Errorf("catalog temporarily unavailable") + } + // Second reconcile (triggered by catalog watch): catalog available with new version + v := bundle.VersionRelease{Version: bsemver.MustParse("2.0.0")} + return &declcfg.Bundle{ + Name: "test.2.0.0", + Package: "test-pkg", + Image: "test-image:2.0.0", + }, &v, &declcfg.Deprecation{}, nil + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.Applier = &MockApplier{installCompleted: true} + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - auto-update to latest + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // First reconcile: catalog unavailable, falls back to v1.0.0 + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Verify core status after fallback to installed version + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Note: When falling back without catalog access initially, deprecation conditions + // may not be set yet. Full validation happens after catalog is available. + + // Second reconcile: simulating catalog watch trigger, catalog now available with v2.0.0 + res, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Should have upgraded to v2.0.0 + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "2.0.0", ext.Status.Install.Bundle.Version) + + // Verify status after upgrade + instCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + + progCond = apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Verify all conditions remain valid after upgrade + verifyInvariants(ctx, t, cl, ext) + + // Verify resolution was attempted twice (fallback, then success) + require.Equal(t, 2, resolveAttempt) + }) + + t.Run("retries when catalogs exist but resolution fails", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // Resolver fails (transient issue) + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("transient catalog issue: cache stale") + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create a ClusterCatalog matching the extension's selector + catalog := &ocv1.ClusterCatalog{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-catalog", + }, + Spec: ocv1.ClusterCatalogSpec{ + Source: ocv1.CatalogSource{ + Type: ocv1.SourceTypeImage, + Image: &ocv1.ImageSource{ + Ref: "test-registry/catalog:latest", + }, + }, + }, + } + require.NoError(t, cl.Create(ctx, catalog)) + + // Create ClusterExtension with no version specified + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version specified + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should fail and RETRY (not fallback) + // Catalogs exist, so this is likely a transient issue (catalog updating, cache stale, etc.) + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Error(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows Retrying (not falling back to installed bundle) + require.NoError(t, cl.Get(ctx, extKey, ext)) + + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonRetrying, progCond.Reason) + require.Contains(t, progCond.Message, "transient catalog issue") + + // Installed should remain True (existing installation is maintained) + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + + // Verify we did NOT fall back - status should show we're retrying + verifyInvariants(ctx, t, cl, ext) + + // Clean up the catalog so it doesn't affect other tests + require.NoError(t, cl.Delete(ctx, catalog)) + }) +} + +func TestCheckCatalogsExist(t *testing.T) { + t.Run("returns false when no catalogs exist", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, exists, "should return false when no catalogs exist") + }) + + t.Run("returns false when no selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: nil, // No selector + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, exists, "should return false when no catalogs exist (no selector)") + }) + + t.Run("returns false when empty selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{}, // Empty selector (matches everything) + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err, "empty selector should not cause error") + require.False(t, exists, "should return false when no catalogs exist (empty selector)") + }) + + t.Run("returns error for invalid selector", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "invalid", + Operator: "InvalidOperator", // Invalid operator + Values: []string{"value"}, + }, + }, + }, + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.Error(t, err, "should return error for invalid selector") + require.Contains(t, err.Error(), "invalid catalog selector") + require.False(t, exists) + }) +} diff --git a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go index 09dd2ce7d2..e6b9de73df 100644 --- a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go +++ b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go @@ -19,10 +19,12 @@ package controllers import ( "context" "errors" + "fmt" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/log" @@ -83,7 +85,14 @@ func RetrieveRevisionStates(r RevisionStatesGetter) ReconcileStepFunc { } } -func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { +// ResolveBundle resolves the bundle to install or roll out for a ClusterExtension. +// It requires a controller-runtime client (in addition to the resolve.Resolver) to enable +// intelligent error handling when resolution fails. The client is used to check if ClusterCatalogs +// matching the extension's selector still exist in the cluster, allowing the controller to +// distinguish between "ClusterCatalog deleted" (fall back to installed bundle) and "transient failure" +// (retry resolution). This ensures workload resilience during ClusterCatalog outages while maintaining +// responsiveness during ClusterCatalog updates. +func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) var resolvedRevisionMetadata *RevisionMetadata @@ -95,11 +104,7 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } resolvedBundle, resolvedBundleVersion, resolvedDeprecation, err := r.Resolve(ctx, ext, bm) if err != nil { - // Note: We don't distinguish between resolution-specific errors and generic errors - setStatusProgressing(ext, err) - setInstalledStatusFromRevisionStates(ext, state.revisionStates) - ensureAllConditionsWithReason(ext, ocv1.ReasonFailed, err.Error()) - return nil, err + return handleResolutionError(ctx, c, state, ext, err) } // set deprecation status after _successful_ resolution @@ -134,19 +139,198 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } } +// handleResolutionError handles the case when bundle resolution fails. +// If a bundle is already installed and the spec isn't requesting a version change, +// we check if the ClusterCatalogs have been deleted. If so, we fall back to using the +// installed bundle to maintain the current state (ClusterCatalog deletion resilience). +// However, if ClusterCatalogs still exist, we retry to allow for transient failures or ClusterCatalog updates. +// If the spec explicitly requests a different version, we must fail and retry regardless. +func handleResolutionError(ctx context.Context, c client.Client, state *reconcileState, ext *ocv1.ClusterExtension, err error) (*ctrl.Result, error) { + l := log.FromContext(ctx) + + // No installed bundle and resolution failed - cannot proceed + if state.revisionStates.Installed == nil { + msg := fmt.Sprintf("failed to resolve bundle: %v", err) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + // Check if the spec is requesting a specific version that differs from installed + specVersion := "" + if ext.Spec.Source.Catalog != nil { + specVersion = ext.Spec.Source.Catalog.Version + } + installedVersion := state.revisionStates.Installed.Version + + // If spec requests a different version, we cannot fall back - must fail and retry + if specVersion != "" && specVersion != installedVersion { + msg := fmt.Sprintf("unable to upgrade to version %s: %v (currently installed: %s)", specVersion, err, installedVersion) + l.Error(err, "resolution failed and spec requests version change - cannot fall back", + "requestedVersion", specVersion, + "installedVersion", installedVersion) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + // No version change requested - check if ClusterCatalogs exist + // Only fall back if ClusterCatalogs have been deleted + catalogsExist, catalogCheckErr := CheckCatalogsExist(ctx, c, ext) + if catalogCheckErr != nil { + msg := fmt.Sprintf("failed to resolve bundle: %v", err) + var catalogName string + if ext.Spec.Source.Catalog != nil { + catalogName = getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + } + l.Error(catalogCheckErr, "error checking if ClusterCatalogs exist, will retry resolution", + "resolutionError", err, + "packageName", getPackageName(ext), + "catalogName", catalogName) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + if catalogsExist { + // ClusterCatalogs exist but resolution failed - likely a transient issue (ClusterCatalog updating, cache stale, etc.) + // Retry resolution instead of falling back + msg := fmt.Sprintf("failed to resolve bundle, retrying: %v", err) + var catalogName string + if ext.Spec.Source.Catalog != nil { + catalogName = getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + } + l.Error(err, "resolution failed but matching ClusterCatalogs exist - retrying instead of falling back", + "packageName", getPackageName(ext), + "catalogName", catalogName) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, msg) + return nil, err + } + + // ClusterCatalogs don't exist (deleted) - fall back to installed bundle to maintain current state. + // The controller watches ClusterCatalog resources, so when ClusterCatalogs become available again, + // a reconcile will be triggered automatically, allowing the extension to upgrade. + msg := fmt.Sprintf("continuing to maintain current installation at version %s due to catalog unavailability", state.revisionStates.Installed.Version) + var catalogName string + if ext.Spec.Source.Catalog != nil { + catalogName = getCatalogNameFromSelector(ext.Spec.Source.Catalog.Selector) + } + l.Error(err, "matching ClusterCatalogs unavailable or deleted - falling back to installed bundle to maintain workload", + "packageName", getPackageName(ext), + "catalogName", catalogName, + "installedBundle", state.revisionStates.Installed.Name, + "installedVersion", state.revisionStates.Installed.Version) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonSucceeded, msg) + state.resolvedRevisionMetadata = state.revisionStates.Installed + // Return no error to allow Apply step to run and maintain resources. + return nil, nil +} + +// getCatalogNameFromSelector extracts the catalog name from the selector if available. +// Returns empty string if selector is nil or doesn't contain the metadata.name label. +func getCatalogNameFromSelector(selector *metav1.LabelSelector) string { + if selector == nil || selector.MatchLabels == nil { + return "" + } + return selector.MatchLabels["olm.operatorframework.io/metadata.name"] +} + +// getPackageName safely extracts the package name from the extension spec. +// Returns empty string if Catalog source is nil. +func getPackageName(ext *ocv1.ClusterExtension) string { + if ext.Spec.Source.Catalog == nil { + return "" + } + return ext.Spec.Source.Catalog.PackageName +} + +// CheckCatalogsExist checks if any ClusterCatalogs matching the extension's selector exist. +// Returns true if at least one matching ClusterCatalog exists, false if none exist. +// Treats "CRD doesn't exist" errors as "no ClusterCatalogs exist" (returns false, nil). +// Returns an error only if the check itself fails unexpectedly. +func CheckCatalogsExist(ctx context.Context, c client.Client, ext *ocv1.ClusterExtension) (bool, error) { + var catalogList *ocv1.ClusterCatalogList + var listErr error + + if ext.Spec.Source.Catalog == nil || ext.Spec.Source.Catalog.Selector == nil { + // No selector means all ClusterCatalogs match - check if any ClusterCatalogs exist at all + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.Limit(1)) + } else { + // Convert label selector to k8slabels.Selector + // Note: An empty LabelSelector matches everything by default + selector, err := metav1.LabelSelectorAsSelector(ext.Spec.Source.Catalog.Selector) + if err != nil { + return false, fmt.Errorf("invalid catalog selector: %w", err) + } + + // List ClusterCatalogs matching the selector (limit to 1 since we only care if any exist) + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.MatchingLabelsSelector{Selector: selector}, client.Limit(1)) + } + + if listErr != nil { + // Check if the error is because the ClusterCatalog CRD doesn't exist + // This can happen if catalogd is not installed, which means no ClusterCatalogs exist + if apimeta.IsNoMatchError(listErr) { + return false, nil + } + return false, fmt.Errorf("failed to list ClusterCatalogs: %w", listErr) + } + + return len(catalogList.Items) > 0, nil +} + func UnpackBundle(i imageutil.Puller, cache imageutil.Cache) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) - l.Info("unpacking resolved bundle") + + // Defensive check: resolvedRevisionMetadata should be set by ResolveBundle step + if state.resolvedRevisionMetadata == nil { + return nil, fmt.Errorf("unable to retrieve bundle information") + } + + // Always try to pull the bundle content (Pull uses cache-first strategy, so this is efficient) + l.V(1).Info("pulling bundle content") imageFS, _, _, err := i.Pull(ctx, ext.GetName(), state.resolvedRevisionMetadata.Image, cache) + + // Check if resolved bundle matches installed bundle (no version change) + bundleUnchanged := state.revisionStates != nil && + state.revisionStates.Installed != nil && + state.resolvedRevisionMetadata.Name == state.revisionStates.Installed.Name && + state.resolvedRevisionMetadata.Version == state.revisionStates.Installed.Version + if err != nil { - // Wrap the error passed to this with the resolution information until we have successfully - // installed since we intend for the progressing condition to replace the resolved condition - // and will be removing the .status.resolution field from the ClusterExtension status API + if bundleUnchanged { + // Bundle hasn't changed and Pull failed (likely cache miss + catalog unavailable). + // This happens in fallback mode after catalog deletion. Set imageFS to nil so the + // applier can maintain the workload using existing Helm release or ClusterExtensionRevision. + l.V(1).Info("bundle unchanged and content unavailable, continuing with nil contentFS (applier will use existing release/revision)", + "bundle", state.resolvedRevisionMetadata.Name, + "version", state.resolvedRevisionMetadata.Version, + "error", err.Error()) + state.imageFS = nil + return nil, nil + } + // New bundle version but Pull failed - this is an error condition setStatusProgressing(ext, wrapErrorWithResolutionInfo(state.resolvedRevisionMetadata.BundleMetadata, err)) setInstalledStatusFromRevisionStates(ext, state.revisionStates) return nil, err } + + if bundleUnchanged { + l.V(1).Info("bundle unchanged, using cached content for resource reconciliation", + "bundle", state.resolvedRevisionMetadata.Name, + "version", state.resolvedRevisionMetadata.Version) + } + state.imageFS = imageFS return nil, nil } diff --git a/internal/operator-controller/controllers/suite_test.go b/internal/operator-controller/controllers/suite_test.go index 6258a5d307..57305a75fd 100644 --- a/internal/operator-controller/controllers/suite_test.go +++ b/internal/operator-controller/controllers/suite_test.go @@ -106,7 +106,7 @@ func newClientAndReconciler(t *testing.T, opts ...reconcilerOption) (client.Clie } reconciler.ReconcileSteps = []controllers.ReconcileStepFunc{controllers.HandleFinalizers(d.Finalizers), controllers.RetrieveRevisionStates(d.RevisionStatesGetter)} if r := d.Resolver; r != nil { - reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r)) + reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r, cl)) } if i := d.ImagePuller; i != nil { reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.UnpackBundle(i, d.ImageCache)) diff --git a/test/utils.go b/test/utils.go index 22a50b2b8d..60cfaa38a0 100644 --- a/test/utils.go +++ b/test/utils.go @@ -14,6 +14,7 @@ func NewEnv() *envtest.Environment { testEnv := &envtest.Environment{ CRDDirectoryPaths: []string{ pathFromProjectRoot("helm/olmv1/base/operator-controller/crd/experimental"), + pathFromProjectRoot("helm/olmv1/base/catalogd/crd/experimental"), }, ErrorIfCRDPathMissing: true, }