diff --git a/api/v1alpha1/committed_resource_types.go b/api/v1alpha1/committed_resource_types.go index a6f1bd217..31365887f 100644 --- a/api/v1alpha1/committed_resource_types.go +++ b/api/v1alpha1/committed_resource_types.go @@ -147,6 +147,12 @@ const ( // CommittedResourceConditionReady indicates whether the CommittedResource has been // successfully reconciled into active Reservation CRDs. CommittedResourceConditionReady = "Ready" + + // Condition reasons set by the CommittedResource controller. + CommittedResourceReasonAccepted = "Accepted" + CommittedResourceReasonPlanned = "Planned" + CommittedResourceReasonReserving = "Reserving" + CommittedResourceReasonRejected = "Rejected" ) // +kubebuilder:object:root=true diff --git a/api/v1alpha1/reservation_types.go b/api/v1alpha1/reservation_types.go index 21c96efad..988d4b97d 100644 --- a/api/v1alpha1/reservation_types.go +++ b/api/v1alpha1/reservation_types.go @@ -81,6 +81,15 @@ type CommittedResourceReservationSpec struct { // +kubebuilder:validation:Optional Creator string `json:"creator,omitempty"` + // ParentGeneration is the Generation of the CommittedResource CRD at the time this + // reservation was last written by the CommittedResource controller. The Reservation + // controller echoes it to Status.CommittedResourceReservation.ObservedParentGeneration + // once it has processed the reservation, allowing the CR controller to wait until + // all child reservations are up-to-date before accepting. + // Zero means the field is not set (syncer-created reservations, no parent CR). + // +kubebuilder:validation:Optional + ParentGeneration int64 `json:"parentGeneration,omitempty"` + // Allocations maps workload identifiers to their allocation details. // Key: Workload UUID (VM UUID for Nova, Pod UID for Pods, Machine UID for IronCore, etc.) // Value: allocation state and metadata @@ -148,6 +157,12 @@ const ( // CommittedResourceReservationStatus defines the status fields specific to committed resource reservations. type CommittedResourceReservationStatus struct { + // ObservedParentGeneration is the Spec.CommittedResourceReservation.ParentGeneration value + // that this Reservation controller last processed. When it matches ParentGeneration in spec, + // the CR controller knows this reservation is up-to-date for the current CR spec version. + // +kubebuilder:validation:Optional + ObservedParentGeneration int64 `json:"observedParentGeneration,omitempty"` + // Allocations maps VM/instance UUIDs to the host they are currently running on. // Key: VM/instance UUID, Value: Host name where the VM is currently running. 
// +kubebuilder:validation:Optional diff --git a/cmd/manager/main.go b/cmd/manager/main.go index ba1cd52e8..e031366f8 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -367,7 +367,7 @@ func main() { if commitmentsConfig.DatasourceName != "" { commitmentsUsageDB = commitments.NewDBUsageClient(multiclusterClient, commitmentsConfig.DatasourceName) } - commitmentsAPI := commitmentsapi.NewAPIWithConfig(multiclusterClient, commitmentsConfig, commitmentsUsageDB) + commitmentsAPI := commitmentsapi.NewAPIWithConfig(multiclusterClient, commitmentsConfig.API, commitmentsUsageDB) commitmentsAPI.Init(mux, metrics.Registry, ctrl.Log.WithName("commitments-api")) if slices.Contains(mainConfig.EnabledControllers, "nova-pipeline-controllers") { @@ -538,12 +538,11 @@ func main() { monitor := reservations.NewMonitor(multiclusterClient) metrics.Registry.MustRegister(&monitor) commitmentsConfig := conf.GetConfigOrDie[commitments.Config]() - commitmentsConfig.ApplyDefaults() if err := (&commitments.CommitmentReservationController{ Client: multiclusterClient, Scheme: mgr.GetScheme(), - Conf: commitmentsConfig, + Conf: commitmentsConfig.ReservationController, }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "CommitmentReservation") os.Exit(1) @@ -552,7 +551,7 @@ func main() { if err := (&commitments.CommittedResourceController{ Client: multiclusterClient, Scheme: mgr.GetScheme(), - Conf: commitmentsConfig, + Conf: commitmentsConfig.CommittedResourceController, }).SetupWithManager(mgr, multiclusterClient); err != nil { setupLog.Error(err, "unable to create controller", "controller", "CommittedResource") os.Exit(1) @@ -716,13 +715,12 @@ func main() { os.Exit(1) } - syncerMonitor := commitments.NewSyncerMonitor() - must.Succeed(metrics.Registry.Register(syncerMonitor)) if slices.Contains(mainConfig.EnabledTasks, "commitments-sync-task") { setupLog.Info("starting commitments syncer") + syncerMonitor := commitments.NewSyncerMonitor() + must.Succeed(metrics.Registry.Register(syncerMonitor)) syncer := commitments.NewSyncer(multiclusterClient, syncerMonitor) syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]() - syncerConfig.ApplyDefaults() if err := (&task.Runner{ Client: multiclusterClient, Interval: syncerConfig.SyncInterval, diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md index 95f8b8bd5..9359d93f7 100644 --- a/docs/reservations/committed-resource-reservations.md +++ b/docs/reservations/committed-resource-reservations.md @@ -55,8 +55,8 @@ flowchart LR UsageAPI[Usage API] Scheduler[Scheduler API] - ChangeAPI -->|CRUD| CR - Syncer -->|CRUD| CR + ChangeAPI -->|upsert + poll status| CR + Syncer -->|upsert| CR UsageAPI -->|read| CR UsageAPI -->|read| Res CapacityAPI -->|read| Res @@ -117,12 +117,31 @@ The controller's job is to keep child `Reservation` CRDs in sync with the desire - **`pending`**: Cortex is being asked for a yes/no decision. If placement fails for any reason, child Reservations are removed and the CR is marked Rejected. The caller (e.g. the change-commitments API) reads the outcome and reports back to Limes. No retry. -- **`guaranteed` / `confirmed`**: Cortex is expected to honour the commitment. The default is to keep retrying until placement succeeds (`Ready=False, Reason=Reserving`). Callers that can accept "no" as an answer (e.g. 
the change-commitments API on a resize request) set `Spec.AllowRejection=true`; the controller then rejects on failure instead of retrying. +- **`guaranteed` / `confirmed`**: Cortex is expected to honour the commitment. The default is to keep retrying until placement succeeds (`Ready=False, Reason=Reserving`). Callers that can accept "no" as an answer set `Spec.AllowRejection=true` (the change-commitments API sets this for confirming requests — new commitments, resizes); the controller then rejects on failure instead of retrying. - **On rejection**: rolls back child Reservations to the last successfully placed quantity (`Status.AcceptedAmount`). For a CR that was never accepted, this means removing all child Reservations. The controller communicates with the Reservation controller only through CRDs — no direct calls. +**Reconcile trigger flow:** + +```mermaid +sequenceDiagram + participant API as Change-Commitments API + participant CRCtrl as CR Controller + participant CRCRD as CommittedResource CRD + participant ResCRD as Reservation CRD + participant ResCtrl as Reservation Controller + + API->>CRCRD: write (create/update) + CRCRD-->>CRCtrl: watch fires + CRCtrl->>ResCRD: create/update child slots + ResCRD-->>ResCtrl: watch fires + ResCtrl->>ResCRD: update (ObservedParentGeneration, Ready=True/False) + ResCRD-->>CRCtrl: watch fires (Reservation→parent CR lookup) + CRCtrl->>CRCRD: update status (Accepted / Reserving / Rejected) +``` + ### Reservation Lifecycle | Component | Event | Timing | Action | @@ -228,18 +247,20 @@ The `Reservation` controller (`CommitmentReservationController`) watches `Reserv ### Change-Commitments API -The change-commitments API receives batched commitment changes from Limes and manages reservations accordingly. +The change-commitments API receives batched commitment changes from Limes and applies them using a **write-intent, watch-for-outcome** pattern: the handler creates or updates `CommittedResource` CRDs and polls their `Status.Conditions` until each reaches a terminal state — it does not interact with `Reservation` CRDs directly. **Request Semantics**: A request can contain multiple commitment changes across different projects and flavor groups. The semantic is **all-or-nothing** — if any commitment in the batch cannot be fulfilled (e.g., insufficient capacity), the entire request is rejected and rolled back. -**Operations**: Cortex performs CRUD operations on local Reservation CRDs to match the new desired state: -- Creates new reservations for increased commitment amounts -- Deletes existing reservations for decreased commitments -- Preserves existing reservations that already have VMs allocated when possible +**Operations**: +1. For each commitment in the batch, create or update a `CommittedResource` CRD. `Spec.AllowRejection` mirrors the request's `RequiresConfirmation` flag: `true` for changes where Limes needs a yes/no answer (new commitments, resizes), `false` for non-confirming changes (deletions, status-only transitions) where Limes doesn't act on the rejection reason +2. Poll `CommittedResource.Status.Conditions[Ready]` until each reaches a terminal state: `Reason=Accepted` (success), `Reason=Planned` (deferred; accepted), or `Reason=Rejected` (failure) — only for confirming changes; non-confirming changes return immediately without polling +3. 
On any failure or timeout, restore all modified `CommittedResource` CRDs to their pre-request specs (or delete newly-created ones) + +The `CommittedResource` controller handles all downstream `Reservation` CRUD. `AllowRejection=true` tells it to reject and roll back child Reservations on placement failure rather than retrying indefinitely. ### Syncer Task -The syncer task runs periodically and syncs local Reservation CRD state to match Limes' view of commitments, correcting drift from missed API calls or restarts. +The syncer task runs periodically and syncs local `CommittedResource` CRD state to match Limes' view of commitments, correcting drift from missed API calls or restarts. It writes `CommittedResource` CRDs only — Reservation CRUD is the controller's responsibility. ### Usage API diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 75e6deb65..8d3fe878b 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -505,52 +505,10 @@ groups: CRD retrieval. Limes scrapes may time out, affecting capacity reporting. # Committed Resource Syncer Alerts - - alert: CortexNovaCommittedResourceSyncerErrorsHigh - expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 - for: 5m - labels: - context: committed-resource-syncer - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer experiencing errors" - description: > - The committed resource syncer has encountered multiple errors in the last hour. - This may indicate connectivity issues with Limes, malformed API responses, - or failures writing reservation CRDs. Check the syncer logs for error details. - - - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh - expr: | - ( - sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])) - / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) - ) > 0.05 - and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer unit mismatch rate >5%" - description: > - More than 5% of commitments are being skipped due to unit mismatches between - Limes and Cortex flavor groups. This happens when Limes has not yet been - updated to use the new unit format after a flavor group change. The affected - commitments will keep their existing reservations until Limes notices the update. - Check the logs if this error persists for longer time. - - - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh - expr: | - ( - sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])) - / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) - ) > 0 - and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 + # These alerts only fire when the syncer is enabled (metrics are only registered when enabled). 
+ # Absent metrics = syncer disabled = alerts inactive by design. + - alert: CortexNovaCommittedResourceSyncerNotRunning + expr: increase(cortex_committed_resource_syncer_duration_seconds_count{service="cortex-nova-metrics"}[3h]) < 1 for: 15m labels: context: committed-resource-syncer @@ -559,46 +517,15 @@ groups: severity: warning support_group: workload-management annotations: - summary: "Committed Resource syncer unknown flavor group rate >0%" + summary: "Committed Resource syncer has not run in 3 hours" description: > - Some commitments reference flavor groups that don't exist in - Cortex Knowledge (anymore). This may indicate that flavor group configuration is - out of sync between Limes and Cortex, or that Knowledge extraction is failing. - Check the flavor group Knowledge CRD and history to see what was changed. + No commitment sync has completed in the last 3 hours. The syncer runs hourly, + so at least 2 runs should appear in this window. Check that the syncer task + is healthy and Limes is reachable. - - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh - expr: | - ( - ( - rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) - ) > 0.01 - and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 - for: 15m - labels: - context: committed-resource-syncer - dashboard: cortex-status-dashboard/cortex-status-dashboard - service: cortex - severity: warning - support_group: workload-management - annotations: - summary: "Committed Resource syncer local change rate >1%" - description: > - More than 1% of synced commitments are requiring reservation changes - (creates, deletes, or repairs). This is higher than expected for steady-state - operation and may indicate data inconsistencies, external modifications to - reservations, or issues with the CRDs. Check Cortex logs for details. - - - alert: CortexNovaCommittedResourceSyncerRepairRateHigh - expr: | - ( - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) - ) > 0 - and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 - for: 15m + - alert: CortexNovaCommittedResourceSyncerErrors + expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 + for: 5m labels: context: committed-resource-syncer dashboard: cortex-status-dashboard/cortex-status-dashboard @@ -606,13 +533,11 @@ groups: severity: warning support_group: workload-management annotations: - summary: "Committed Resource syncer repair rate >0%" + summary: "Committed Resource syncer is repeatedly failing" description: > - Some commitments have reservations that needed repair - (wrong metadata like project ID or flavor group). This may indicate data - corruption, bugs in reservation creation, or external modifications. - Reservations are automatically repaired, but the root cause should be - investigated if this alert persists. 
+ The committed resource syncer has encountered more than 3 errors in the last + hour. Check syncer logs for details; common causes are connectivity issues + with Limes or failures writing CommittedResource CRDs. - alert: CortexNovaDoesntFindValidKVMHosts expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0 diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 65ad879dd..d694bdfba 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -141,35 +141,39 @@ cortex-scheduling-controllers: # Number of top hosts to shuffle for evacuation requests. # Set to 0 or negative to disable shuffling. evacuationShuffleK: 3 - # CommittedResourceFlavorGroupPipelines maps flavor group IDs to pipeline names for CR reservations - # This allows different scheduling strategies per flavor group (e.g., HANA vs GP) - committedResourceFlavorGroupPipelines: - "2152": "kvm-hana-bin-packing" # HANA flavor group - "2101": "kvm-general-purpose-load-balancing" # General Purpose flavor group - "*": "kvm-general-purpose-load-balancing" # Catch-all fallback - # Default pipeline for CR reservations when no CommittedResourceFlavorGroupPipelines entry matches - committedResourcePipelineDefault: "kvm-general-purpose-load-balancing" - # How often to re-verify active reservations - # 5m = 300000000000 nanoseconds - committedResourceRequeueIntervalActive: 300000000000 - # How often to retry when knowledge is not ready - # 1m = 60000000000 nanoseconds - committedResourceRequeueIntervalRetry: 60000000000 - # Timeout for watching reservations to become ready before rolling back - # 10s = 10000000000 nanoseconds - committedResourceChangeAPIWatchReservationsTimeout: 10000000000 - # How often to poll reservation status during watch - # 500ms = 500000000 nanoseconds - committedResourceChangeAPIWatchReservationsPollInterval: 500000000 - # Whether the change-commitments API endpoint is active - # When false, the endpoint returns HTTP 503. The info endpoint remains available. - committedResourceEnableChangeCommitmentsAPI: true - # Whether the report-usage API endpoint is active - # When false, the endpoint returns HTTP 503. - committedResourceEnableReportUsageAPI: true - # Whether the report-capacity API endpoint is active - # When false, the endpoint returns HTTP 503. 
- committedResourceEnableReportCapacityAPI: true + committedResourceReservationController: + # Maps flavor group IDs to pipeline names; "*" acts as catch-all fallback + flavorGroupPipelines: + "2152": "kvm-hana-bin-packing" # HANA flavor group + "2101": "kvm-general-purpose-load-balancing" # General Purpose flavor group + "*": "kvm-general-purpose-load-balancing" # Catch-all fallback + # Fallback pipeline when no flavorGroupPipelines entry matches + pipelineDefault: "kvm-general-purpose-load-balancing" + # How often to re-verify active Reservation CRDs (healthy state) + requeueIntervalActive: "5m" + # Back-off interval when knowledge is unavailable + requeueIntervalRetry: "1m" + # Back-off interval while a VM allocation is still within allocationGracePeriod + requeueIntervalGracePeriod: "1m" + # How long after a VM is allocated to a reservation before it is expected to appear + # on the target host; allocations not confirmed within this window are removed + allocationGracePeriod: "15m" + # URL of the nova external scheduler API for placement decisions + schedulerURL: "http://localhost:8080/scheduler/nova/external" + committedResourceController: + # Back-off interval while CommittedResource placement is pending or failed + requeueIntervalRetry: "1m" + committedResourceAPI: + # Timeout for watching CommittedResource CRDs before rolling back + watchTimeout: "10s" + # How often to poll CommittedResource CRD conditions during watch + watchPollInterval: "500ms" + # When false, the endpoint returns HTTP 503; the info endpoint remains available. + enableChangeCommitments: true + # When false, the endpoint returns HTTP 503. + enableReportUsage: true + # When false, the endpoint returns HTTP 503. + enableReportCapacity: true # OvercommitMappings is a list of mappings that map hypervisor traits to # overcommit ratios. Note that this list is applied in order, so if there # are multiple mappings applying to the same hypervisors, the last mapping @@ -178,7 +182,7 @@ cortex-scheduling-controllers: # Failover reservations controller configuration # Name of the Datasource CRD that provides database connection info for Nova VM data datasourceName: nova-servers - # URL of the nova external scheduler API for placement decisions + # URL of the nova external scheduler API for placement decisions (used by failover controller) schedulerURL: "http://localhost:8080/scheduler/nova/external" # Maps flavor name patterns (glob) to required failover count # Example: {"hana_*": 2, "m1.xlarge": 1} @@ -205,6 +209,8 @@ cortex-scheduling-controllers: limitOneNewReservationPerHypervisor: false # Size failover reservations based on LargestFlavor in the flavor group useFlavorGroupResources: false + # How often the commitments syncer reconciles Limes commitments to CommittedResource CRDs + committedResourceSyncInterval: "1h" cortex-knowledge-controllers: <<: *cortex diff --git a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml index 686aa60fe..a30b7d221 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_reservations.yaml @@ -138,6 +138,16 @@ spec: type: string domainID: type: string + parentGeneration: + description: |- + ParentGeneration is the Generation of the CommittedResource CRD at the time this + reservation was last written by the CommittedResource controller.
The Reservation + controller echoes it to Status.CommittedResourceReservation.ObservedParentGeneration + once it has processed the reservation, allowing the CR controller to wait until + all child reservations are up-to-date before accepting. + Zero means the field is not set (syncer-created reservations, no parent CR). + format: int64 + type: integer projectID: type: string resourceGroup: @@ -212,6 +222,13 @@ spec: Allocations maps VM/instance UUIDs to the host they are currently running on. Key: VM/instance UUID, Value: Host name where the VM is currently running. type: object + observedParentGeneration: + description: |- + ObservedParentGeneration is the Spec.CommittedResourceReservation.ParentGeneration value + that this Reservation controller last processed. When it matches ParentGeneration in spec, + the CR controller knows this reservation is up-to-date for the current CR spec version. + format: int64 + type: integer type: object conditions: description: |- diff --git a/internal/scheduling/reservations/commitments/api/change_commitments.go b/internal/scheduling/reservations/commitments/api/change_commitments.go index e076e41c9..9849075b9 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments.go @@ -19,10 +19,13 @@ import ( "github.com/go-logr/logr" "github.com/google/uuid" "github.com/sapcc/go-api-declarations/liquid" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) // sortedKeys returns map keys sorted alphabetically for deterministic iteration. @@ -37,42 +40,47 @@ func sortedKeys[K ~string, V any](m map[K]V) []K { return keys } -// implements POST /commitments/v1/change-commitments from Limes LIQUID API: +// crSnapshot captures a CommittedResource CRD's prior state for batch rollback. +// prevSpec is nil when the CRD was newly created (i.e. did not exist before the batch). +// wasDeleted is true when the batch operation deleted the CRD; rollback must re-create it. +type crSnapshot struct { + crName string + prevSpec *v1alpha1.CommittedResourceSpec + wasDeleted bool +} + +// HandleChangeCommitments implements POST /commitments/v1/change-commitments from the Limes LIQUID API. +// It writes CommittedResource CRDs (one per commitment) and polls their status conditions until +// the controller confirms or rejects each one. On any failure the whole batch is rolled back. +// // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid -// -// This endpoint handles commitment changes by creating/updating/deleting Reservation CRDs based on the commitment lifecycle. -// A request may contain multiple commitment changes which are processed in a single transaction. If any change fails, all changes are rolled back. 
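+// Requests are serialized via changeMutex, so a batch never observes a half-applied predecessor.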
func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Request) { startTime := time.Now() - // Initialize resp := liquid.CommitmentChangeResponse{} req := liquid.CommitmentChangeRequest{} statusCode := http.StatusOK - // Extract or generate request ID for tracing - always set in response header requestID := r.Header.Get("X-Request-ID") if requestID == "" { requestID = uuid.New().String() } w.Header().Set("X-Request-ID", requestID) - // Check if API is enabled - if !api.config.EnableChangeCommitmentsAPI { + if !api.config.EnableChangeCommitments { statusCode = http.StatusServiceUnavailable http.Error(w, "change-commitments API is disabled", statusCode) api.recordMetrics(req, resp, statusCode, startTime) return } - // Serialize all change-commitments requests + // Serialize all change-commitments requests so the controller sees a consistent world. api.changeMutex.Lock() defer api.changeMutex.Unlock() ctx := reservations.WithGlobalRequestID(context.Background(), "committed-resource-"+requestID) logger := commitments.LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/commitments/v1/change-commitments") - // Only accept POST method if r.Method != http.MethodPost { statusCode = http.StatusMethodNotAllowed http.Error(w, "Method not allowed", statusCode) @@ -80,7 +88,6 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque return } - // Parse request body if err := json.NewDecoder(r.Body).Decode(&req); err != nil { logger.Error(err, "invalid request body") statusCode = http.StatusBadRequest @@ -91,7 +98,6 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque logger.Info("received change commitments request", "affectedProjects", len(req.ByProject), "dryRun", req.DryRun, "availabilityZone", req.AZ) - // Check for dry run -> early reject, not supported yet if req.DryRun { resp.RejectionReason = "Dry run not supported yet" api.recordMetrics(req, resp, statusCode, startTime) @@ -104,26 +110,17 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque return } - // Process commitment changes - // For now, we'll implement a simplified path that checks capacity for immediate start CRs - if err := api.processCommitmentChanges(ctx, w, logger, req, &resp); err != nil { - // Error already written to response by processCommitmentChanges - // Determine status code from error context (409 or 503) if strings.Contains(err.Error(), "version mismatch") { statusCode = http.StatusConflict } else if strings.Contains(err.Error(), "caches not ready") { statusCode = http.StatusServiceUnavailable } - // Record metrics for error cases api.recordMetrics(req, resp, statusCode, startTime) return } - // Record metrics api.recordMetrics(req, resp, statusCode, startTime) - - // Return response w.Header().Set("Content-Type", "application/json") w.WriteHeader(statusCode) if err := json.NewEncoder(w).Encode(resp); err != nil { @@ -132,11 +129,6 @@ func (api *HTTPAPI) HandleChangeCommitments(w http.ResponseWriter, r *http.Reque } func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.ResponseWriter, logger logr.Logger, req liquid.CommitmentChangeRequest, resp *liquid.CommitmentChangeResponse) error { - manager := commitments.NewReservationManager(api.client) - requireRollback := false - failedCommitments := make(map[string]string) // commitmentUUID to reason for failure, for better response messages in case of rollback - creatorRequestID := reservations.GlobalRequestIDFromContext(ctx) - knowledge 
:= &reservations.FlavorGroupKnowledgeClient{Client: api.client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) if err != nil { @@ -145,12 +137,10 @@ func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.Respons return errors.New("caches not ready") } - // Validate InfoVersion from request matches current version (= last content change of flavor group knowledge) var currentVersion int64 = -1 if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { currentVersion = knowledgeCRD.Status.LastContentChange.Unix() } - if req.InfoVersion != currentVersion { logger.Info("version mismatch in commitment change request", "requestVersion", req.InfoVersion, @@ -160,160 +150,166 @@ func (api *HTTPAPI) processCommitmentChanges(ctx context.Context, w http.Respons return errors.New("version mismatch") } - statesBefore := make(map[string]*commitments.CommitmentState) // map of commitmentID to existing state for rollback - var reservationsToWatch []v1alpha1.Reservation + // If Limes does not require confirmation for this batch (e.g. deletions, status-only transitions), + // the controller must not reject — it must retry until it succeeds (AllowRejection=false). + // Conversely, when Limes requires confirmation, the controller may reject and report back. + allowRejection := req.RequiresConfirmation() - if req.DryRun { - resp.RejectionReason = "Dry run not supported yet" - return nil - } + var ( + toWatch []string // CRD names to poll for terminal conditions (upserts only) + snapshots []crSnapshot // ordered list for deterministic rollback + failedReason string + rollback bool + ) ProcessLoop: for _, projectID := range sortedKeys(req.ByProject) { projectChanges := req.ByProject[projectID] + + // Extract domain ID from Keystone project metadata if Limes provided it. 
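+ // ProjectMetadata is an optional field; UnwrapOr yields the zero value, so domainID stays empty when Limes omits it.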
+ domainID := "" + if pm := projectChanges.ProjectMetadata.UnwrapOr(liquid.ProjectMetadata{}); pm.Domain.UUID != "" { + domainID = pm.Domain.UUID + } + for _, resourceName := range sortedKeys(projectChanges.ByResource) { resourceChanges := projectChanges.ByResource[resourceName] - // Validate resource name pattern (instances_group_*) + flavorGroupName, err := commitments.GetFlavorGroupNameFromResource(string(resourceName)) if err != nil { - resp.RejectionReason = fmt.Sprintf("project with unknown resource name %s: %v", projectID, err) - requireRollback = true + failedReason = fmt.Sprintf("project with unknown resource name %s: %v", projectID, err) + rollback = true break ProcessLoop } - // Verify flavor group exists in Knowledge CRDs - flavorGroup, flavorGroupExists := flavorGroups[flavorGroupName] - if !flavorGroupExists { - resp.RejectionReason = "flavor group not found: " + flavorGroupName - requireRollback = true + flavorGroup, ok := flavorGroups[flavorGroupName] + if !ok { + failedReason = "flavor group not found: " + flavorGroupName + rollback = true break ProcessLoop } - // Reject commitments for flavor groups that don't accept CRs if !commitments.FlavorGroupAcceptsCommitments(&flavorGroup) { - resp.RejectionReason = commitments.FlavorGroupCommitmentRejectionReason(&flavorGroup) - requireRollback = true + failedReason = commitments.FlavorGroupCommitmentRejectionReason(&flavorGroup) + rollback = true break ProcessLoop } for _, commitment := range resourceChanges.Commitments { - logger.V(1).Info("processing commitment", "commitmentUUID", commitment.UUID, "oldStatus", commitment.OldStatus.UnwrapOr("none"), "newStatus", commitment.NewStatus.UnwrapOr("none")) - - // TODO add configurable upper limit validation for commitment size (number of instances) to prevent excessive reservation creation - // TODO add domain - - // List all committed resource reservations, then filter by name prefix - var all_reservations v1alpha1.ReservationList - if err := api.client.List(ctx, &all_reservations, client.MatchingLabels{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }); err != nil { - failedCommitments[string(commitment.UUID)] = "failed to list reservations" - logger.Info("failed to list reservations for commitment", "commitmentUUID", commitment.UUID, "error", err) - requireRollback = true - break ProcessLoop - } - - // Filter by name prefix to find reservations for this commitment - namePrefix := fmt.Sprintf("commitment-%s-", string(commitment.UUID)) - var existing_reservations v1alpha1.ReservationList - for _, res := range all_reservations.Items { - if len(res.Name) >= len(namePrefix) && res.Name[:len(namePrefix)] == namePrefix { - existing_reservations.Items = append(existing_reservations.Items, res) + isDelete := commitment.NewStatus.IsNone() + crName := "commitment-" + string(commitment.UUID) + + logger.V(1).Info("processing commitment", + "commitmentUUID", commitment.UUID, + "oldStatus", commitment.OldStatus.UnwrapOr("none"), + "newStatus", commitment.NewStatus.UnwrapOr("none"), + "delete", isDelete) + + // Snapshot the current spec before mutation so we can restore it on rollback. 
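+ // Snapshots are appended only after the corresponding mutation succeeds, so rollback never touches CRDs this batch left unchanged.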
+ snap := crSnapshot{crName: crName} + existing := &v1alpha1.CommittedResource{} + if err := api.client.Get(ctx, types.NamespacedName{Name: crName}, existing); err != nil { + if !apierrors.IsNotFound(err) { + failedReason = fmt.Sprintf("commitment %s: failed to read pre-update snapshot: %v", commitment.UUID, err) + rollback = true + break ProcessLoop } + // Not found: CR is new (or already absent for deletes), prevSpec stays nil. + } else { + specCopy := existing.Spec + snap.prevSpec = &specCopy } - var stateBefore *commitments.CommitmentState - if len(existing_reservations.Items) == 0 { - stateBefore = &commitments.CommitmentState{ - CommitmentUUID: string(commitment.UUID), - ProjectID: string(projectID), - FlavorGroupName: flavorGroupName, - TotalMemoryBytes: 0, - } - } else { - stateBefore, err = commitments.FromReservations(existing_reservations.Items) - if err != nil { - failedCommitments[string(commitment.UUID)] = "failed to parse existing commitment reservations" - logger.Info("failed to get existing state for commitment", "commitmentUUID", commitment.UUID, "error", err) - requireRollback = true - break ProcessLoop + if isDelete { + // Limes is removing this commitment; delete the CRD if it exists. + snap.wasDeleted = true + if snap.prevSpec != nil { + if err := api.client.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { + failedReason = fmt.Sprintf("commitment %s: failed to delete CommittedResource CRD: %v", commitment.UUID, err) + rollback = true + break ProcessLoop + } + logger.V(1).Info("deleted CommittedResource CRD", "name", crName) } + snapshots = append(snapshots, snap) + continue } - statesBefore[string(commitment.UUID)] = stateBefore - // get desired state - stateDesired, err := commitments.FromChangeCommitmentTargetState(commitment, string(projectID), flavorGroupName, flavorGroup, string(req.AZ)) + stateDesired, err := commitments.FromChangeCommitmentTargetState( + commitment, string(projectID), domainID, flavorGroupName, flavorGroup, string(req.AZ)) if err != nil { - failedCommitments[string(commitment.UUID)] = err.Error() - logger.Info("failed to get desired state for commitment", "commitmentUUID", commitment.UUID, "error", err) - requireRollback = true + failedReason = fmt.Sprintf("commitment %s: %s", commitment.UUID, err) + rollback = true break ProcessLoop } - // Set creator request ID for traceability across controller reconciles - stateDesired.CreatorRequestID = creatorRequestID - - logger.V(1).Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldMemory", stateBefore.TotalMemoryBytes, "desiredMemory", stateDesired.TotalMemoryBytes) - applyResult, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi") - if err != nil { - failedCommitments[string(commitment.UUID)] = "failed to apply commitment state" - logger.Info("failed to apply commitment state for commitment", "commitmentUUID", commitment.UUID, "error", err) - requireRollback = true + cr := &v1alpha1.CommittedResource{} + cr.Name = crName + if _, err := controllerutil.CreateOrUpdate(ctx, api.client, cr, func() error { + applyCRSpec(cr, stateDesired, allowRejection) + return nil + }); err != nil { + failedReason = fmt.Sprintf("commitment %s: failed to write CommittedResource CRD: %v", commitment.UUID, err) + rollback = true break ProcessLoop } - logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(applyResult.TouchedReservations), "deletedReservations", 
len(applyResult.RemovedReservations)) - reservationsToWatch = append(reservationsToWatch, applyResult.TouchedReservations...) + + toWatch = append(toWatch, crName) + snapshots = append(snapshots, snap) + logger.V(1).Info("upserted CommittedResource CRD", "name", crName) } } } - // TODO make the rollback defer safe - if !requireRollback { - logger.Info("applied commitment changes, now watching for reservation readiness", "reservationsToWatch", len(reservationsToWatch)) + if !rollback { + // Non-confirming changes (RequiresConfirmation=false): Limes ignores our RejectionReason, + // so there is no point blocking on the controller outcome. The CRDs are written with + // AllowRejection=false, meaning the controller will retry indefinitely in the background. + if !allowRejection { + logger.Info("non-confirming changes applied, returning without polling", "count", len(toWatch)) + return nil + } + + logger.Info("CommittedResource CRDs written, polling for controller outcome", "count", len(toWatch)) + watchStart := time.Now() - time_start := time.Now() + rejected, watchErrs := watchCRsUntilReady( + ctx, logger, api.client, toWatch, + api.config.WatchTimeout.Duration, + api.config.WatchPollInterval.Duration, + ) - if failedReservations, errors := watchReservationsUntilReady(ctx, logger, api.client, reservationsToWatch, api.config.ChangeAPIWatchReservationsTimeout, api.config.ChangeAPIWatchReservationsPollInterval); len(failedReservations) > 0 || len(errors) > 0 { - logger.Info("reservations failed to become ready, initiating rollback", - "failedReservations", len(failedReservations), - "errors", errors) + logger.Info("polling complete", "duration", time.Since(watchStart).Round(time.Millisecond)) - for _, res := range failedReservations { - failedCommitments[res.Spec.CommittedResourceReservation.CommitmentUUID] = "not sufficient capacity" + switch { + case len(rejected) > 0: + var b strings.Builder + fmt.Fprintf(&b, "%d commitment(s) failed to apply:", len(rejected)) + for _, crName := range toWatch { // iterate toWatch for deterministic order + if reason, ok := rejected[crName]; ok { + fmt.Fprintf(&b, "\n- commitment %s: %s", strings.TrimPrefix(crName, "commitment-"), reason) + } } - if len(failedReservations) == 0 { - resp.RejectionReason += "timeout reached while processing commitment changes" - api.monitor.timeouts.Inc() + failedReason = b.String() + rollback = true + case len(watchErrs) > 0: + msgs := make([]string, len(watchErrs)) + for i, e := range watchErrs { + msgs[i] = e.Error() } - requireRollback = true + failedReason = "timeout reached while processing commitment changes: " + strings.Join(msgs, "; ") + api.monitor.timeouts.Inc() + rollback = true } - - logger.Info("finished watching reservation", "totalSchedulingTimeSeconds", time.Since(time_start).Seconds()) } - if requireRollback { - // Build rejection reason from failed commitments - if len(failedCommitments) > 0 { - var reasonBuilder strings.Builder - fmt.Fprintf(&reasonBuilder, "%d commitment(s) failed to apply: ", len(failedCommitments)) - for commitmentUUID, reason := range failedCommitments { - fmt.Fprintf(&reasonBuilder, "\n- commitment %s: %s", commitmentUUID, reason) - } - resp.RejectionReason = reasonBuilder.String() + if rollback { + resp.RejectionReason = failedReason + logger.Info("rolling back CommittedResource CRDs", "reason", failedReason, "count", len(snapshots)) + for i := len(snapshots) - 1; i >= 0; i-- { + rollbackCR(ctx, logger, api.client, snapshots[i]) } - - logger.Info("rollback of commitment changes") - for 
commitmentUUID, state := range statesBefore { - // Rollback to statesBefore for this commitment - logger.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state) - _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback") - if err != nil { - logger.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err) - // continue with best effort rollback for other projects - } - } - - logger.Info("finished applying rollbacks for commitment changes", "reasonOfRollback", resp.RejectionReason) + logger.Info("rollback complete") return nil } @@ -321,111 +317,139 @@ ProcessLoop: return nil } -// watchReservationsUntilReady polls until all reservations reach Ready=True or timeout. -// Returns failed reservations and any errors encountered. -func watchReservationsUntilReady( +// watchCRsUntilReady polls CommittedResource conditions until each CRD reaches a terminal state: +// - Ready=True (Accepted) — success +// - Ready=False, Reason=Planned — success; controller reserves capacity at activation time +// - Ready=False, Reason=Rejected — failure; reason reported to caller +// +// Returns a map of crName → rejection reason for failed CRDs, and any polling errors (e.g. timeout). +func watchCRsUntilReady( ctx context.Context, logger logr.Logger, k8sClient client.Client, - reservations []v1alpha1.Reservation, + crNames []string, timeout time.Duration, pollInterval time.Duration, -) (failedReservations []v1alpha1.Reservation, errors []error) { +) (rejected map[string]string, errs []error) { - if len(reservations) == 0 { - return failedReservations, nil + if len(crNames) == 0 { + return nil, nil } + rejected = make(map[string]string) deadline := time.Now().Add(timeout) - startTime := time.Now() - totalReservations := len(reservations) - reservationsToWatch := make([]v1alpha1.Reservation, len(reservations)) - copy(reservationsToWatch, reservations) - - // Track successful reservations for summary - var successfulReservations []string - pollCount := 0 + pending := make(map[string]struct{}, len(crNames)) + for _, name := range crNames { + pending[name] = struct{}{} + } for { - pollCount++ - var stillWaiting []v1alpha1.Reservation if time.Now().After(deadline) { - errors = append(errors, fmt.Errorf("timeout after %v waiting for reservations to become ready", timeout)) - // Log summary on timeout - logger.Info("reservation watch completed (timeout)", - "total", totalReservations, - "ready", len(successfulReservations), - "failed", len(failedReservations), - "timedOut", len(reservationsToWatch), - "duration", time.Since(startTime).Round(time.Millisecond), - "polls", pollCount) - return failedReservations, errors + errs = append(errs, fmt.Errorf("timeout after %v waiting for %d CommittedResource CRD(s)", timeout, len(pending))) + return rejected, errs } - for _, res := range reservationsToWatch { - // Fetch current state - var current v1alpha1.Reservation - nn := types.NamespacedName{ - Name: res.Name, - Namespace: res.Namespace, + for name := range pending { + cr := &v1alpha1.CommittedResource{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, cr); err != nil { + continue // transient; keep waiting } - if err := k8sClient.Get(ctx, nn, ¤t); err != nil { - // Reservation is still in process of being created, or there is a transient error - stillWaiting = append(stillWaiting, res) - continue + cond := meta.FindStatusCondition(cr.Status.Conditions, 
v1alpha1.CommittedResourceConditionReady) + if cond == nil { + continue // controller hasn't reconciled yet } - // Check Ready condition - readyCond := meta.FindStatusCondition( - current.Status.Conditions, - v1alpha1.ReservationConditionReady, - ) - - if readyCond == nil { - // Condition not set yet, keep waiting - stillWaiting = append(stillWaiting, res) - continue - } - - switch readyCond.Status { - case metav1.ConditionTrue: - // Only consider truly ready if Status.Host is populated - if current.Spec.TargetHost == "" || current.Status.Host == "" { - stillWaiting = append(stillWaiting, res) - continue - } - // Reservation is successfully scheduled - track for summary - successfulReservations = append(successfulReservations, current.Name) - - case metav1.ConditionFalse: - // Any failure reason counts as failed - failedReservations = append(failedReservations, current) - case metav1.ConditionUnknown: - stillWaiting = append(stillWaiting, res) + switch { + case cond.Status == metav1.ConditionTrue: + delete(pending, name) + case cond.Status == metav1.ConditionFalse && cond.Reason == v1alpha1.CommittedResourceReasonPlanned: + delete(pending, name) // planned = accepted; controller will reserve at activation + case cond.Status == metav1.ConditionFalse && cond.Reason == v1alpha1.CommittedResourceReasonRejected: + delete(pending, name) + rejected[name] = cond.Message + // Reason=Reserving: controller is placing slots; keep waiting. } } - if len(stillWaiting) == 0 { - // All reservations have reached a terminal state - log summary - logger.Info("reservation watch completed", - "total", totalReservations, - "ready", len(successfulReservations), - "failed", len(failedReservations), - "duration", time.Since(startTime).Round(time.Millisecond), - "polls", pollCount) - return failedReservations, errors + if len(pending) == 0 { + return rejected, nil } - reservationsToWatch = stillWaiting - - // Wait before next poll select { case <-time.After(pollInterval): - // Continue polling case <-ctx.Done(): - return failedReservations, append(errors, fmt.Errorf("context cancelled while waiting for reservations: %w", ctx.Err())) + return rejected, append(errs, fmt.Errorf("context cancelled: %w", ctx.Err())) } + logger.V(1).Info("polling CommittedResource CRDs", "pending", len(pending)) + } +} + +// rollbackCR reverses the batch-local change to a single CommittedResource CRD. +// - wasDeleted=true, prevSpec!=nil: CRD was deleted; re-create it from the snapshot. +// - wasDeleted=true, prevSpec==nil: CRD was absent before and after; nothing to do. +// - wasDeleted=false, prevSpec==nil: CRD was newly created; delete it. +// - wasDeleted=false, prevSpec!=nil: CRD was updated; restore its spec. 
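+// Rollback is best-effort: errors are logged rather than returned, so one failing CRD does not stop the remaining snapshots from being restored.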
+func rollbackCR(ctx context.Context, logger logr.Logger, k8sClient client.Client, snap crSnapshot) { + if snap.wasDeleted { + if snap.prevSpec == nil { + return // was absent before deletion attempt; nothing to undo + } + cr := &v1alpha1.CommittedResource{} + cr.Name = snap.crName + cr.Spec = *snap.prevSpec + if err := k8sClient.Create(ctx, cr); client.IgnoreAlreadyExists(err) != nil { + logger.Error(err, "failed to re-create CommittedResource CRD during rollback", "name", snap.crName) + } + return + } + + if snap.prevSpec == nil { + cr := &v1alpha1.CommittedResource{} + cr.Name = snap.crName + if err := k8sClient.Delete(ctx, cr); client.IgnoreNotFound(err) != nil { + logger.Error(err, "failed to delete CommittedResource CRD during rollback", "name", snap.crName) + } + return + } + + cr := &v1alpha1.CommittedResource{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: snap.crName}, cr); err != nil { + logger.Error(err, "failed to fetch CommittedResource CRD for rollback", "name", snap.crName) + return + } + cr.Spec = *snap.prevSpec + if err := k8sClient.Update(ctx, cr); err != nil { + logger.Error(err, "failed to restore CommittedResource CRD spec during rollback", "name", snap.crName) + } +} + +// applyCRSpec writes CommitmentState fields into a CommittedResource CRD spec. +// allowRejection=true for the change-commitments API path: the controller may reject +// on failure and the API reports the outcome to Limes. +func applyCRSpec(cr *v1alpha1.CommittedResource, state *commitments.CommitmentState, allowRejection bool) { + cr.Spec.CommitmentUUID = state.CommitmentUUID + cr.Spec.SchedulingDomain = v1alpha1.SchedulingDomainNova + cr.Spec.FlavorGroupName = state.FlavorGroupName + cr.Spec.ResourceType = v1alpha1.CommittedResourceTypeMemory + cr.Spec.Amount = *resource.NewQuantity(state.TotalMemoryBytes, resource.BinarySI) + cr.Spec.AvailabilityZone = state.AvailabilityZone + cr.Spec.ProjectID = state.ProjectID + cr.Spec.DomainID = state.DomainID + cr.Spec.State = state.State + cr.Spec.AllowRejection = allowRejection + + if state.StartTime != nil { + t := metav1.NewTime(*state.StartTime) + cr.Spec.StartTime = &t + } else { + cr.Spec.StartTime = nil + } + if state.EndTime != nil { + t := metav1.NewTime(*state.EndTime) + cr.Spec.EndTime = &t + } else { + cr.Spec.EndTime = nil } } diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go new file mode 100644 index 000000000..ee546655b --- /dev/null +++ b/internal/scheduling/reservations/commitments/api/change_commitments_e2e_test.go @@ -0,0 +1,411 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package api + +// End-to-end tests: HTTP → CommittedResource CRD → Reservation CRDs → scheduler → controllers → HTTP response. +// +// Unlike change_commitments_test.go which uses fakeControllerClient (which immediately sets +// conditions), these tests wire real CommittedResourceController and CommitmentReservationController +// against a fake k8s client. A background goroutine drives reconcile loops so the API polling +// loop can observe terminal conditions within its timeout window. 
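// The driver is poll-based rather than watch-based: the fake client has no informers, so a ticker re-reconciles every non-terminal object until the API's polling loop observes a terminal condition.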
+ +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "testing" + "time" + + schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/prometheus/client_golang/prometheus" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +// Field index paths for the fake client — must match the unexported constants in the commitments package. +const ( + e2eIdxCommittedResourceByUUID = "spec.commitmentUUID" + e2eIdxReservationByCommitmentUUID = "spec.committedResourceReservation.commitmentUUID" +) + +// e2eEnv is a full end-to-end test environment: real controllers, fake k8s client, +// mock scheduler, and a background reconcile driver goroutine. +type e2eEnv struct { + t *testing.T + k8sClient client.Client + httpServer *httptest.Server + schedulerSrv *httptest.Server + crCtrl *commitments.CommittedResourceController + resCtrl *commitments.CommitmentReservationController + cancelBg context.CancelFunc + bgDone chan struct{} +} + +// newE2EEnv creates an e2eEnv with the given flavors and scheduler handler. +// The scheduler handler controls what the mock Nova scheduler returns. +func newE2EEnv(t *testing.T, flavors []*TestFlavor, infoVersion int64, schedulerHandler http.HandlerFunc) *e2eEnv { + t.Helper() + log.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(true))) + + // Scheme: v1alpha1 for CR/Reservation/Knowledge types; hv1 for Hypervisor. + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("add v1alpha1 scheme: %v", err) + } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("add hv1 scheme: %v", err) + } + + // One hypervisor so the reservation controller can build a non-empty eligible-hosts list. + hypervisor := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}} + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(createKnowledgeCRD(buildFlavorGroupsKnowledge(flavors, infoVersion)), hypervisor). + WithStatusSubresource( + &v1alpha1.CommittedResource{}, + &v1alpha1.Reservation{}, + &v1alpha1.Knowledge{}, + ). + WithIndex(&v1alpha1.Reservation{}, e2eIdxReservationByCommitmentUUID, func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" { + return nil + } + return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} + }). + WithIndex(&v1alpha1.CommittedResource{}, e2eIdxCommittedResourceByUUID, func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.CommitmentUUID == "" { + return nil + } + return []string{cr.Spec.CommitmentUUID} + }). 
+ Build() + + schedulerSrv := httptest.NewServer(schedulerHandler) + + crCtrl := &commitments.CommittedResourceController{ + Client: k8sClient, + Scheme: scheme, + Conf: commitments.CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 100 * time.Millisecond}}, + } + + resCtrl := &commitments.CommitmentReservationController{ + Client: k8sClient, + Scheme: scheme, + Conf: commitments.ReservationControllerConfig{ + SchedulerURL: schedulerSrv.URL, + AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute}, + RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute}, + RequeueIntervalRetry: metav1.Duration{Duration: 100 * time.Millisecond}, + }, + } + if err := resCtrl.Init(context.Background(), resCtrl.Conf); err != nil { + t.Fatalf("resCtrl.Init: %v", err) + } + + // HTTPAPI wired directly to the real k8s client (no fakeControllerClient wrapper). + cfg := commitments.DefaultAPIConfig() + cfg.WatchTimeout = metav1.Duration{Duration: 5 * time.Second} + cfg.WatchPollInterval = metav1.Duration{Duration: 100 * time.Millisecond} + api := NewAPIWithConfig(k8sClient, cfg, nil) + mux := http.NewServeMux() + api.Init(mux, prometheus.NewRegistry(), log.Log) + httpServer := httptest.NewServer(mux) + + ctx, cancel := context.WithCancel(context.Background()) + env := &e2eEnv{ + t: t, + k8sClient: k8sClient, + httpServer: httpServer, + schedulerSrv: schedulerSrv, + crCtrl: crCtrl, + resCtrl: resCtrl, + cancelBg: cancel, + bgDone: make(chan struct{}), + } + go env.driveReconciles(ctx) + return env +} + +func (e *e2eEnv) close() { + e.cancelBg() + <-e.bgDone + e.httpServer.Close() + e.schedulerSrv.Close() +} + +// asCRTestEnv wraps e2eEnv as a CRTestEnv to reuse its HTTP-call and assertion helpers. +func (e *e2eEnv) asCRTestEnv() *CRTestEnv { + return &CRTestEnv{T: e.t, K8sClient: e.k8sClient, HTTPServer: e.httpServer} +} + +// driveReconciles runs in the background, reconciling pending CRs and Reservations until ctx is cancelled. +func (e *e2eEnv) driveReconciles(ctx context.Context) { + defer close(e.bgDone) + ticker := time.NewTicker(50 * time.Millisecond) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + e.reconcileAll(ctx) + } + } +} + +// reconcileAll drives one round of reconciles: +// 1. CR pass 1 — adds finalizer and creates Reservation CRDs. +// 2. Reservation pass — calls the scheduler, sets TargetHost (first reconcile) then Ready=True (second). +// 3. CR pass 2 — re-fetches each CR and picks up Reservation outcomes (placed or rejected). +// +// CRs and Reservations that have already reached a terminal state are skipped to avoid +// overwriting the rejection signal the API polling loop needs to read. +func (e *e2eEnv) reconcileAll(ctx context.Context) { + var crList v1alpha1.CommittedResourceList + if err := e.k8sClient.List(ctx, &crList); err != nil { + return + } + + // CR pass 1. + for _, cr := range crList.Items { + if e2eIsTerminalCR(cr) { + continue + } + e.crCtrl.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}}) //nolint:errcheck + } + + // Reservation pass (two reconciles per slot: first sets TargetHost, second sets Ready=True). 
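+ // The second call matters: a slot placed on the first reconcile would otherwise wait a full ticker interval before turning Ready.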
+ var resList v1alpha1.ReservationList + if err := e.k8sClient.List(ctx, &resList, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + return + } + for _, res := range resList.Items { + if e2eIsTerminalReservation(res) { + continue + } + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} + e.resCtrl.Reconcile(ctx, req) //nolint:errcheck + e.resCtrl.Reconcile(ctx, req) //nolint:errcheck + } + + // CR pass 2: re-fetch so we see any condition changes made during the Reservation pass. + for _, cr := range crList.Items { + var latest v1alpha1.CommittedResource + if err := e.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &latest); err != nil { + continue // deleted or transient + } + if e2eIsTerminalCR(latest) { + continue + } + e.crCtrl.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: latest.Name}}) //nolint:errcheck + } +} + +// e2eIsTerminalCR returns true for states the API polling loop treats as final: +// Accepted (Ready=True), Rejected, or Planned. +// CRs with DeletionTimestamp are never terminal here: they need one more reconcile to remove +// their finalizer (set by the controller on first reconcile) so the fake client can delete them. +func e2eIsTerminalCR(cr v1alpha1.CommittedResource) bool { + if !cr.DeletionTimestamp.IsZero() { + return false + } + cond := apimeta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil { + return false + } + if cond.Status == metav1.ConditionTrue { + return true + } + return cond.Reason == v1alpha1.CommittedResourceReasonRejected || + cond.Reason == v1alpha1.CommittedResourceReasonPlanned +} + +// waitForCRAbsent polls until the named CommittedResource no longer exists or the 1s deadline passes. +// Used after rollback calls because the finalizer removal happens asynchronously in the background reconcile loop. +func (e *e2eEnv) waitForCRAbsent(t *testing.T, crName string) { + t.Helper() + deadline := time.Now().Add(1 * time.Second) + for { + cr := &v1alpha1.CommittedResource{} + err := e.k8sClient.Get(context.Background(), types.NamespacedName{Name: crName}, cr) + if apierrors.IsNotFound(err) { + return + } + if time.Now().After(deadline) { + t.Errorf("expected CommittedResource %q to be absent after rollback, but it still exists", crName) + return + } + time.Sleep(50 * time.Millisecond) + } +} + +// e2eIsTerminalReservation returns true when a Reservation is fully placed (Ready=True). +func e2eIsTerminalReservation(res v1alpha1.Reservation) bool { + cond := apimeta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady) + return cond != nil && cond.Status == metav1.ConditionTrue +} + +// ============================================================================ +// Scheduler handlers +// ============================================================================ + +func e2eAcceptScheduler(t *testing.T) http.HandlerFunc { + t.Helper() + return func(w http.ResponseWriter, r *http.Request) { + resp := &schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{"host-1"}} + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Errorf("scheduler encode: %v", err) + } + } +} + +func e2eRejectScheduler(t *testing.T) http.HandlerFunc { + t.Helper() + return func(w http.ResponseWriter, r *http.Request) { + // Return an empty hosts list — the reservation controller treats this as NoHostsFound. 
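+		// Downstream, the rejection is expected to surface on the parent CR as a
+		// failed Ready condition, roughly like this (sketch of the controller
+		// side; the exact message text is an assumption):
+		//
+		//	apimeta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{
+		//		Type:    v1alpha1.CommittedResourceConditionReady,
+		//		Status:  metav1.ConditionFalse,
+		//		Reason:  v1alpha1.CommittedResourceReasonRejected,
+		//		Message: "no hosts found",
+		//	})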
+ resp := &schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{}} + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Errorf("scheduler encode: %v", err) + } + } +} + +// ============================================================================ +// E2E test cases +// ============================================================================ + +const e2eInfoVersion = int64(1234) + +var e2eFlavor = &TestFlavor{Name: "m1.small", Group: "hana_1", MemoryMB: 1024, VCPUs: 4} + +// TestE2EChangeCommitments is the full end-to-end suite: HTTP → CRD → controller → scheduler → HTTP response. +func TestE2EChangeCommitments(t *testing.T) { + testCases := []struct { + Name string + Scheduler func(*testing.T) http.HandlerFunc + ReqJSON string + WantResp APIResponseExpectation + WantAbsent []string + Verify func(*testing.T, *e2eEnv) + }{ + { + Name: "scheduler accepts: CR placed, Reservation on host-1", + Scheduler: e2eAcceptScheduler, + ReqJSON: buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-ok", "confirmed", 1))), + WantResp: newAPIResponse(), + Verify: func(t *testing.T, env *e2eEnv) { + t.Helper() + env.asCRTestEnv().VerifyCRsExist([]string{"commitment-uuid-e2e-ok"}) + + var cr v1alpha1.CommittedResource + if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: "commitment-uuid-e2e-ok"}, &cr); err != nil { + t.Fatalf("get CR: %v", err) + } + if !apimeta.IsStatusConditionTrue(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Errorf("expected CR Ready=True") + } + + var resList v1alpha1.ReservationList + if err := env.k8sClient.List(context.Background(), &resList, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + if len(resList.Items) != 1 { + t.Fatalf("expected 1 Reservation, got %d", len(resList.Items)) + } + res := resList.Items[0] + if !apimeta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) { + t.Errorf("expected Reservation Ready=True") + } + if res.Status.Host != "host-1" { + t.Errorf("Reservation Status.Host: want host-1, got %q", res.Status.Host) + } + }, + }, + { + Name: "scheduler rejects: rejection propagates to API response, CR rolled back", + Scheduler: e2eRejectScheduler, + ReqJSON: buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-rej", "confirmed", 2))), + WantResp: newAPIResponse("no hosts found"), + WantAbsent: []string{"commitment-uuid-e2e-rej"}, + }, + { + Name: "batch with one rejection: entire batch rolled back", + Scheduler: e2eRejectScheduler, + ReqJSON: buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-batch-a", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-e2e-batch-b", "confirmed", 2), + )), + WantResp: newAPIResponse("no hosts found"), + WantAbsent: []string{"commitment-uuid-e2e-batch-a", "commitment-uuid-e2e-batch-b"}, + }, + { + Name: "lifecycle: create then delete, CR and child Reservations cleaned up", + Scheduler: e2eAcceptScheduler, + ReqJSON: buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-lifecycle", "confirmed", 1))), + WantResp: newAPIResponse(), + Verify: func(t 
*testing.T, env *e2eEnv) { + t.Helper() + env.asCRTestEnv().VerifyCRsExist([]string{"commitment-uuid-e2e-lifecycle"}) + + te := env.asCRTestEnv() + deleteJSON := buildRequestJSON(newCommitmentRequest("az-a", false, e2eInfoVersion, + deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-e2e-lifecycle", "confirmed", 1))) + resp, _, statusCode := te.CallChangeCommitmentsAPI(deleteJSON) + te.VerifyAPIResponse(newAPIResponse(), resp, statusCode) + + env.waitForCRAbsent(t, "commitment-uuid-e2e-lifecycle") + + var resList v1alpha1.ReservationList + if err := env.k8sClient.List(context.Background(), &resList, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + if len(resList.Items) != 0 { + t.Errorf("expected 0 Reservations after delete, got %d", len(resList.Items)) + } + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + env := newE2EEnv(t, []*TestFlavor{e2eFlavor}, e2eInfoVersion, tc.Scheduler(t)) + defer env.close() + + te := env.asCRTestEnv() + resp, _, statusCode := te.CallChangeCommitmentsAPI(tc.ReqJSON) + te.VerifyAPIResponse(tc.WantResp, resp, statusCode) + for _, name := range tc.WantAbsent { + env.waitForCRAbsent(t, name) + } + if tc.Verify != nil { + tc.Verify(t, env) + } + }) + } +} diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go b/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go index 2c9562ee8..1afeea5f5 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_metrics.go @@ -23,7 +23,7 @@ func (api *HTTPAPI) recordMetrics(req liquid.CommitmentChangeRequest, resp liqui commitmentCount := countCommitments(req) // Determine result based on response - result := "success" + result := "accepted" if resp.RejectionReason != "" { result = "rejected" } diff --git a/internal/scheduling/reservations/commitments/api/change_commitments_test.go b/internal/scheduling/reservations/commitments/api/change_commitments_test.go index deffc91c3..579173460 100644 --- a/internal/scheduling/reservations/commitments/api/change_commitments_test.go +++ b/internal/scheduling/reservations/commitments/api/change_commitments_test.go @@ -1,14 +1,13 @@ // Copyright SAP SE // SPDX-License-Identifier: Apache-2.0 -//nolint:unparam,unused // test helper functions have fixed parameters for simplicity +//nolint:unparam // test helper functions have fixed parameters for simplicity package api import ( "bytes" "context" "encoding/json" - "fmt" "io" "net/http" "net/http/httptest" @@ -23,10 +22,11 @@ import ( "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + . 
"github.com/majewsky/gg/option" "github.com/prometheus/client_golang/prometheus" "github.com/sapcc/go-api-declarations/liquid" - corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -40,657 +40,420 @@ import ( // Integration Tests // ============================================================================ -func TestCommitmentChangeIntegration(t *testing.T) { - m1Tiny := &TestFlavor{Name: "m1.tiny", Group: "gp_1", MemoryMB: 256, VCPUs: 1} +func TestHandleChangeCommitments(t *testing.T) { m1Small := &TestFlavor{Name: "m1.small", Group: "hana_1", MemoryMB: 1024, VCPUs: 4} - m1Large := &TestFlavor{Name: "m1.large", Group: "hana_1", MemoryMB: 4096, VCPUs: 16} - m1XL := &TestFlavor{Name: "m1.xl", Group: "hana_1", MemoryMB: 8192, VCPUs: 32} testCases := []CommitmentChangeTestCase{ + // --- Basic flow --- { - Name: "Shrinking CR - unused reservations removed, used reservations untouched", - VMs: []*TestVM{{UUID: "vm-a1", Flavor: m1Large, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, - Flavors: []*TestFlavor{m1Small, m1Large}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}}, - {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, - }, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("hw_version_hana_1_ram", "project-A", "uuid-123", "confirmed", 2)), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-a1"}}, - {CommitmentID: "uuid-123", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, - }, - ExpectedAPIResponse: newAPIResponse(), - }, - { - Name: "Insufficient capacity when increasing CR", - VMs: []*TestVM{}, - Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 3)), - AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, - ExpectedReservations: []*TestReservation{{CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}}, - ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"), - }, - { - Name: "Invalid CR name - too long", - VMs: []*TestVM{}, - Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", strings.Repeat("long-", 13), "confirmed", 3), - ), - AvailableResources: &AvailableResources{}, - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment long-long-long-long-long-long-long-long-long-long-long-long-long-: unexpected commitment format"), - }, - { - Name: "Planned CR is ignored in validation, no scheduling or capacity reservation", - VMs: []*TestVM{}, + Name: "New CR: controller accepts → API returns accepted", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: 
newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "planned", 200), - ), - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: newAPIResponse(), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 2)), + ExpectedAPIResponse: newAPIResponse(), + ExpectedCreatedCRNames: []string{"commitment-uuid-new"}, + ExpectedAllowRejection: map[string]bool{"commitment-uuid-new": true}, }, { - Name: "Invalid CR name - spaces", - VMs: []*TestVM{}, - Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{}, + Name: "New CR: controller rejects → API returns rejection reason", + Flavors: []*TestFlavor{m1Small}, + CROutcomes: map[string]string{ + "commitment-uuid-rej": "not sufficient capacity", + }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid with space", "confirmed", 3), - ), - AvailableResources: &AvailableResources{}, - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid with space: unexpected commitment format"), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-rej", "confirmed", 2)), + ExpectedAPIResponse: newAPIResponse("commitment uuid-rej: not sufficient capacity"), }, + // --- Planned state --- { - Name: "Swap capacity between CRs - order dependent - delete-first succeeds", + Name: "Planned CR: controller sets Ready=False/Planned → API accepts", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-456", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-456", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}}, + CROutcomes: map[string]string{ + "commitment-uuid-plan": v1alpha1.CommittedResourceReasonPlanned, + }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 0), - createCommitment("hw_version_hana_1_ram", "project-B", "uuid-123", "confirmed", 2), - ), - AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, - {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, - ExpectedAPIResponse: newAPIResponse(), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-plan", "planned", 2)), + ExpectedAPIResponse: newAPIResponse(), + ExpectedCreatedCRNames: []string{"commitment-uuid-plan"}, }, + // --- Update existing CR --- { - Name: "Swap capacity between CRs - order dependent - create-first fails", + Name: "Resize up: existing CR updated with new amount, accepted", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, - {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-resize", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}, + }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-456", "confirmed", 2), - createCommitment("hw_version_hana_1_ram", "project-B", "uuid-123", "confirmed", 0), - ), - AvailableResources: 
&AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0}}, - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-123", Host: "host-1", Flavor: m1Small, ProjectID: "project-B"}, - {CommitmentID: "uuid-123", Host: "host-2", Flavor: m1Small, ProjectID: "project-B"}}, - ExpectedAPIResponse: newAPIResponse("1 commitment(s) failed", "commitment uuid-456: not sufficient capacity"), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-resize", "confirmed", 2)), + ExpectedAPIResponse: newAPIResponse(), + ExpectedCreatedCRNames: []string{"commitment-uuid-resize"}, }, + // --- Rollback: new CR deleted on batch failure --- { - Name: "Flavor bin-packing - mixed sizes when largest doesn't fit", - // Greedy selection: 10GB request with 8/4/1GB flavors → picks 1×8GB + 2×1GB - Flavors: []*TestFlavor{m1XL, m1Large, m1Small}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-binpack", "confirmed", 10), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-binpack", Flavor: m1XL, ProjectID: "project-A"}, - {CommitmentID: "uuid-binpack", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-binpack", Flavor: m1Small, ProjectID: "project-A"}, + Name: "Rollback new CR: newly created CRD deleted on rejection", + Flavors: []*TestFlavor{m1Small}, + CROutcomes: map[string]string{ + "commitment-uuid-rollback": "not sufficient capacity", }, - ExpectedAPIResponse: newAPIResponse(), + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-rollback", "confirmed", 2)), + ExpectedAPIResponse: newAPIResponse("uuid-rollback: not sufficient capacity"), + ExpectedDeletedCRs: []string{"commitment-uuid-rollback"}, }, + // --- Rollback: updated CR spec restored on batch failure --- { - Name: "Version mismatch - request rejected with 409 Conflict", - // InfoVersion validation prevents stale requests (1233 vs 1234) + Name: "Rollback updated CR: spec restored on rejection", Flavors: []*TestFlavor{m1Small}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1233, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-version", "confirmed", 2), - ), - EnvInfoVersion: 1234, - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: APIResponseExpectation{StatusCode: 409}, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-restore", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}, + }, + CROutcomes: map[string]string{ + "commitment-uuid-restore": "not sufficient capacity", + }, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-restore", "confirmed", 4)), + ExpectedAPIResponse: newAPIResponse("uuid-restore: not sufficient capacity"), + // CRD still exists but amount restored to 1024 MiB + ExpectedCRSpecs: map[string]int64{"commitment-uuid-restore": 1024 * 1024 * 1024}, }, + // --- Batch rollback: one failure rolls back all --- { - Name: "Multi-project rollback - one failure rolls back all", - // Transactional: project-B fails (insufficient capacity) → both projects rollback + Name: "Batch rollback: project-B fails → project-A new CR also rolled back", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + CROutcomes: map[string]string{ + "commitment-uuid-b": "not 
sufficient capacity", }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-project-a", "confirmed", 2), - createCommitment("hw_version_hana_1_ram", "project-B", "uuid-project-b", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 2), ), - AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 1024, "host-2": 0}}, - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-project-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, - }, - ExpectedAPIResponse: newAPIResponse("uuid-project-b", "not sufficient capacity"), + ExpectedAPIResponse: newAPIResponse("uuid-b: not sufficient capacity"), + ExpectedDeletedCRs: []string{"commitment-uuid-a", "commitment-uuid-b"}, }, + // --- Timeout --- { - Name: "Rollback with VMs allocated - limitation: VM allocations not rolled back", - // Controller will eventually clean up and repair inconsistent state - VMs: []*TestVM{{UUID: "vm-rollback", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Name: "Timeout: no condition set → rollback and timeout error", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-rollback"}}, - {CommitmentID: "commitment-A", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, + CROutcomes: map[string]string{ + "commitment-uuid-timeout": "", // empty string = no condition set (controller not responding) }, + NoCondition: []string{"commitment-uuid-timeout"}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "commitment-A", "confirmed", 0), - createCommitment("hw_version_hana_1_ram", "project-B", "commitment-B", "confirmed", 6), - ), - AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0}}, - ExpectedReservations: []*TestReservation{ - // Rollback creates unscheduled reservations (empty Host accepts any in matching) - {CommitmentID: "commitment-A", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "commitment-A", Flavor: m1Small, ProjectID: "project-A"}, - }, - ExpectedAPIResponse: newAPIResponse("commitment-B", "not sufficient capacity"), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-timeout", "confirmed", 2)), + CustomConfig: func() *commitments.APIConfig { + cfg := commitments.DefaultAPIConfig() + cfg.WatchTimeout = metav1.Duration{} + cfg.WatchPollInterval = metav1.Duration{Duration: 100 * time.Millisecond} + return &cfg + }(), + ExpectedAPIResponse: newAPIResponse("timeout reached while processing commitment changes"), + ExpectedDeletedCRs: []string{"commitment-uuid-timeout"}, }, + // --- Input validation --- { - Name: "New commitment creation - from zero to N reservations", + Name: "Invalid commitment UUID: rejected before CRD write", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 3), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-new", Flavor: m1Small, ProjectID: "project-A"}, - }, - ExpectedAPIResponse: 
newAPIResponse(), + createCommitment("hw_version_hana_1_ram", "project-A", strings.Repeat("x", 50), "confirmed", 2)), + ExpectedAPIResponse: newAPIResponse("unexpected commitment format"), + ExpectedDeletedCRs: []string{"commitment-" + strings.Repeat("x", 50)}, }, { - Name: "New commitment creation - large batch", + Name: "Unknown flavor group: rejected without CRD write", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-new", "confirmed", 200), - ), - ExpectedReservations: func() []*TestReservation { - var reservations []*TestReservation - for range 200 { - reservations = append(reservations, &TestReservation{ - CommitmentID: "uuid-new", - Flavor: m1Small, - ProjectID: "project-A", - }) - } - return reservations - }(), - ExpectedAPIResponse: newAPIResponse(), + createCommitment("hw_version_nonexistent_ram", "project-A", "uuid-unk", "confirmed", 2)), + ExpectedAPIResponse: newAPIResponse("flavor group not found"), }, + // --- Infrastructure --- { - Name: "With reservations of custom size - total unchanged", - // Preserves custom-sized reservations when total matches (2×2GB = 4GB) + Name: "Version mismatch: 409 Conflict", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - }, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 4), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - }, - ExpectedAPIResponse: newAPIResponse(), + CommitmentRequest: newCommitmentRequest("az-a", false, 9999, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-v", "confirmed", 2)), + EnvInfoVersion: 1234, // env is at 1234, request claims 9999 → mismatch + ExpectedAPIResponse: APIResponseExpectation{StatusCode: 409}, }, { - Name: "With reservations of custom size - increase total", - // 4GB (2×2GB custom) → 6GB: preserves custom sizes, adds standard-sized reservations + Name: "API disabled: 503 Service Unavailable", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 6), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, - }, - ExpectedAPIResponse: newAPIResponse(), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-dis", "confirmed", 2)), + CustomConfig: func() *commitments.APIConfig { + cfg := 
commitments.DefaultAPIConfig() + cfg.EnableChangeCommitments = false + return &cfg + }(), + ExpectedAPIResponse: APIResponseExpectation{StatusCode: 503}, }, { - Name: "With reservations of custom size - decrease total", - // 4GB (2×2GB custom) → 3GB: removes 1×2GB custom, adds 1×1GB standard + Name: "Knowledge not ready: 503 Service Unavailable", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-custom", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - {CommitmentID: "uuid-custom", Host: "host-2", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-custom", "confirmed", 3), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A", MemoryMB: 2048}, - {CommitmentID: "uuid-custom", Flavor: m1Small, ProjectID: "project-A"}, - }, - ExpectedAPIResponse: newAPIResponse(), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-kr", "confirmed", 2)), + EnvInfoVersion: -1, // skip Knowledge CRD creation + ExpectedAPIResponse: APIResponseExpectation{StatusCode: 503}, }, { - Name: "Complete commitment deletion - N to zero reservations", + Name: "Dry run: not supported yet", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-delete", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-delete", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-delete", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, - {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, - }, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-delete", "confirmed", 0), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-b-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, - {CommitmentID: "uuid-a-1", Host: "host-3", Flavor: m1Small, ProjectID: "project-A"}, - }, + CommitmentRequest: newCommitmentRequest("az-a", true, 1234, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-dry", "confirmed", 2)), + ExpectedAPIResponse: newAPIResponse("Dry run not supported"), + }, + { + Name: "Empty request: no CRDs created", + Flavors: []*TestFlavor{m1Small}, + CommitmentRequest: newCommitmentRequest("az-a", false, 1234), ExpectedAPIResponse: newAPIResponse(), }, + // --- Deletion --- { - Name: "VM allocation preservation - keep VMs during growth", - VMs: []*TestVM{{UUID: "vm-existing", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Name: "Deletion: existing CRD is deleted", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}}, - {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-del", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-growth", "confirmed", 3), - ), - ExpectedReservations: 
[]*TestReservation{ - {CommitmentID: "uuid-growth", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-existing"}}, - {CommitmentID: "uuid-growth", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-growth", Flavor: m1Small, ProjectID: "project-A"}, - }, + deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-del", "confirmed", 2)), ExpectedAPIResponse: newAPIResponse(), + ExpectedDeletedCRs: []string{"commitment-uuid-del"}, }, { - Name: "Multi-project success - both projects succeed", + Name: "Deletion: non-existing CRD is a no-op", Flavors: []*TestFlavor{m1Small}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 2), - createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 2), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-a", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-b", Flavor: m1Small, ProjectID: "project-B"}, - {CommitmentID: "uuid-b", Flavor: m1Small, ProjectID: "project-B"}, - }, + deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-absent", "confirmed", 2)), ExpectedAPIResponse: newAPIResponse(), }, { - Name: "Multiple flavor groups - hw_version_hana_1_ram and hw_version_hana_2_ram", - // Amount in multiples of smallest flavor: hana_1 (2×1GB), hana_2 (2×2GB) - Flavors: []*TestFlavor{ - m1Small, - {Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, + Name: "Deletion rollback: delete succeeds but later commitment fails → CRD re-created", + Flavors: []*TestFlavor{m1Small}, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-del-rb", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}, }, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-hana1", "confirmed", 2), - createCommitment("hw_version_hana_2_ram", "project-A", "uuid-hana2", "confirmed", 2), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-hana1", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-hana2", Flavor: &TestFlavor{Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, ProjectID: "project-A"}, - {CommitmentID: "uuid-hana2", Flavor: &TestFlavor{Name: "m2.small", Group: "hana_2", MemoryMB: 2048, VCPUs: 8}, ProjectID: "project-A"}, + CROutcomes: map[string]string{ + "commitment-uuid-new-rb": "not enough capacity", }, - ExpectedAPIResponse: newAPIResponse(), - }, - { - Name: "Unknown flavor group - clear rejection message", - Flavors: []*TestFlavor{m1Small}, + // project-A deletion sorts before project-B creation; deletion succeeds then creation fails. 
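+			// Rolling back an already-executed deletion means re-creating the CRD
+			// from a snapshot taken before the call, conceptually (sketch; the
+			// snapshot variable is illustrative):
+			//
+			//	snapshot.ResourceVersion = ""          // must be empty on Create
+			//	err := k8sClient.Create(ctx, snapshot) // restore the deleted CR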
CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_nonexistent_ram", "project-A", "uuid-unknown", "confirmed", 2), + deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-del-rb", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-new-rb", "confirmed", 2), ), - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: newAPIResponse("flavor group not found"), + ExpectedAPIResponse: newAPIResponse("not enough capacity"), + ExpectedCreatedCRNames: []string{"commitment-uuid-del-rb"}, // re-created during rollback }, + // --- Non-confirming changes (RequiresConfirmation=false → AllowRejection=false, no watch) --- { - Name: "Three-way capacity swap - complex reallocation", - // A:2→0, B:1→0, C:0→3 in single transaction + Name: "Non-confirming: guaranteed→confirmed, AllowRejection=false, watch skipped", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-a", Host: "host-1", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-a", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, - {CommitmentID: "uuid-b", Host: "host-3", Flavor: m1Small, ProjectID: "project-B"}, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-guar", State: v1alpha1.CommitmentStatusGuaranteed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}, }, + // Controller would reject, but we skip watching for non-confirming changes. + CROutcomes: map[string]string{"commitment-uuid-guar": "not enough capacity"}, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-a", "confirmed", 0), - createCommitment("hw_version_hana_1_ram", "project-B", "uuid-b", "confirmed", 0), - createCommitment("hw_version_hana_1_ram", "project-C", "uuid-c", "confirmed", 3), - ), - AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 0, "host-3": 0}}, - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-c", Host: "host-1", Flavor: m1Small, ProjectID: "project-C"}, - {CommitmentID: "uuid-c", Host: "host-2", Flavor: m1Small, ProjectID: "project-C"}, - {CommitmentID: "uuid-c", Host: "host-3", Flavor: m1Small, ProjectID: "project-C"}, - }, - ExpectedAPIResponse: newAPIResponse(), + TestCommitment{ + ResourceName: "hw_version_hana_1_ram", + ProjectID: "project-A", + ConfirmationID: "uuid-guar", + OldState: "guaranteed", + State: "confirmed", + Amount: 2, + }), + ExpectedAPIResponse: newAPIResponse(), // no rejection even though controller would reject + ExpectedCreatedCRNames: []string{"commitment-uuid-guar"}, + ExpectedAllowRejection: map[string]bool{"commitment-uuid-guar": false}, }, { - Name: "Reservation repair - existing reservations with wrong metadata", - Flavors: []*TestFlavor{m1Small, m1Large}, - ExistingReservations: []*TestReservation{ - {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, - {CommitmentID: "uuid-repair", Host: "host-1", Flavor: m1Small, ProjectID: "wrong-project", AZ: "az-a"}, - {CommitmentID: "uuid-repair", Host: "host-2", Flavor: &TestFlavor{Name: "m1.small", Group: "hana_13", MemoryMB: 1024, VCPUs: 4}, ProjectID: "project-A", AZ: "az-a"}, - {CommitmentID: "uuid-repair", Host: "host-4", Flavor: m1Small, ProjectID: "project-A", AZ: "wrong-az"}, - }, + Name: "Non-confirming: planned, AllowRejection=false", + Flavors: []*TestFlavor{m1Small}, + // CROutcomes not set: controller accepts (irrelevant since watch is 
skipped). CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-repair", "confirmed", 8, "az-a"), - ), - ExpectedReservations: []*TestReservation{ - {CommitmentID: "uuid-repair", Host: "host-preserved", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, - {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, - {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, - {CommitmentID: "uuid-repair", Flavor: m1Small, ProjectID: "project-A", AZ: "az-a"}, - {CommitmentID: "uuid-repair", Flavor: m1Large, ProjectID: "project-A", AZ: "az-a"}, - }, - ExpectedAPIResponse: newAPIResponse(), - }, - { - Name: "Empty request - no commitment changes", - Flavors: []*TestFlavor{m1Small}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234), - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: newAPIResponse(), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-plan-nc", "planned", 2)), + ExpectedAPIResponse: newAPIResponse(), + ExpectedCreatedCRNames: []string{"commitment-uuid-plan-nc"}, + ExpectedAllowRejection: map[string]bool{"commitment-uuid-plan-nc": false}, }, + // --- Pending state --- { - Name: "Dry run request - feature not yet implemented", + Name: "None→pending: non-confirming, AllowRejection=false, watch skipped", Flavors: []*TestFlavor{m1Small}, - CommitmentRequest: newCommitmentRequest("az-a", true, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-dryrun", "confirmed", 2), - ), - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: newAPIResponse("Dry run not supported"), + // pending creates Reservation slots (like confirmed) but RequiresConfirmation=false. 
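+			// Amount is counted in GiB for this RAM resource (the smallest hana_1
+			// flavor is 1024 MiB), so the ExpectedCRSpecs entry below works out as
+			// (worked example, assuming 1 unit = 1 GiB):
+			//
+			//	2 × 1024³ bytes = 2147483648 = what resource.Quantity "2Gi" parses to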
+ CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-pend", "pending", 2)), + ExpectedAPIResponse: newAPIResponse(), + ExpectedCreatedCRNames: []string{"commitment-uuid-pend"}, + ExpectedAllowRejection: map[string]bool{"commitment-uuid-pend": false}, + ExpectedCRSpecs: map[string]int64{"commitment-uuid-pend": 2 * 1024 * 1024 * 1024}, }, + // --- Inactive state transitions via upsert --- { - Name: "Knowledge not ready - clear rejection with RetryAt", + Name: "confirmed→expired: non-confirming upsert, AllowRejection=false, watch skipped", Flavors: []*TestFlavor{m1Small}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-knowledge", "confirmed", 2), - ), - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: APIResponseExpectation{ - StatusCode: 503, - RetryAtPresent: false, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-to-exp", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}, }, - EnvInfoVersion: -1, // Skip Knowledge CRD creation + CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + TestCommitment{ + ResourceName: "hw_version_hana_1_ram", + ProjectID: "project-A", + ConfirmationID: "uuid-to-exp", + OldState: "confirmed", + State: "expired", + Amount: 1, + }), + ExpectedAPIResponse: newAPIResponse(), + ExpectedCreatedCRNames: []string{"commitment-uuid-to-exp"}, + ExpectedAllowRejection: map[string]bool{"commitment-uuid-to-exp": false}, + ExpectedCRSpecs: map[string]int64{"commitment-uuid-to-exp": 0}, }, { - Name: "API disabled - returns 503 Service Unavailable", + Name: "confirmed→superseded: confirming upsert, AllowRejection=true, controller accepts", Flavors: []*TestFlavor{m1Small}, - CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-disabled", "confirmed", 2), - ), - CustomConfig: func() *commitments.Config { - cfg := commitments.DefaultConfig() - cfg.EnableChangeCommitmentsAPI = false - return &cfg - }(), - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: APIResponseExpectation{ - StatusCode: 503, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-to-sup", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}, }, + // confirmed→superseded is a confirming change (not in the liquid API's free-transition list). 
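+			// Which transitions confirm and which do not, as exercised by this
+			// table (derived from the ExpectedAllowRejection assertions, not from
+			// the liquid spec itself):
+			//
+			//	AllowRejection=true:  none→confirmed, confirmed→confirmed (resize),
+			//	                      confirmed→superseded
+			//	AllowRejection=false: guaranteed→confirmed, none→planned,
+			//	                      none→pending, confirmed→expired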
+ CommitmentRequest: newCommitmentRequest("az-a", false, 1234, + TestCommitment{ + ResourceName: "hw_version_hana_1_ram", + ProjectID: "project-A", + ConfirmationID: "uuid-to-sup", + OldState: "confirmed", + State: "superseded", + Amount: 1, + }), + ExpectedAPIResponse: newAPIResponse(), + ExpectedCreatedCRNames: []string{"commitment-uuid-to-sup"}, + ExpectedAllowRejection: map[string]bool{"commitment-uuid-to-sup": true}, + ExpectedCRSpecs: map[string]int64{"commitment-uuid-to-sup": 0}, }, + // --- Resize --- { - Name: "Multiple commitments insufficient capacity - all listed in error", - // Tests that multiple failed commitments are all mentioned in the rejection reason - Flavors: []*TestFlavor{m1Small, m1Tiny}, + Name: "Resize down: confirmed→confirmed with less capacity, RequiresConfirmation=true", + Flavors: []*TestFlavor{m1Small}, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-dn", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 4 * 1024, ProjectID: "project-A", AZ: "az-a"}, + }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-multi-fail-1", "confirmed", 3), - createCommitment("hw_version_hana_1_ram", "project-B", "uuid-multi-fail-2", "confirmed", 3), - createCommitment("hw_version_gp_1_ram", "project-C", "uuid-would-not-fail", "confirmed", 1), // would be rolled back, but not part of the reject reason - ), - AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 256}}, - ExpectedReservations: []*TestReservation{}, - ExpectedAPIResponse: newAPIResponse("2 commitment(s) failed", "commitment uuid-multi-fail-1: not sufficient capacity", "commitment uuid-multi-fail-2: not sufficient capacity"), + TestCommitment{ + ResourceName: "hw_version_hana_1_ram", + ProjectID: "project-A", + ConfirmationID: "uuid-dn", + OldState: "confirmed", + OldAmount: 4, + State: "confirmed", + Amount: 2, + }), + ExpectedAPIResponse: newAPIResponse(), + ExpectedCreatedCRNames: []string{"commitment-uuid-dn"}, + ExpectedAllowRejection: map[string]bool{"commitment-uuid-dn": true}, + ExpectedCRSpecs: map[string]int64{"commitment-uuid-dn": 2 * 1024 * 1024 * 1024}, }, + // --- Mixed batch success --- { - Name: "Deletion priority during rollback - unscheduled removed first", - // Tests that during rollback, unscheduled reservations (no TargetHost) are deleted first, - // preserving scheduled reservations (with TargetHost), especially those with VM allocations - VMs: []*TestVM{{UUID: "vm-priority", Flavor: m1Small, ProjectID: "project-A", Host: "host-1", AZ: "az-a"}}, + Name: "Mixed batch: delete + create both succeed without rollback", Flavors: []*TestFlavor{m1Small}, - ExistingReservations: []*TestReservation{ - // Reservation with VM allocation - should be preserved (lowest deletion priority) - {CommitmentID: "commitment-1", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-priority"}}, - // Scheduled but unused - medium deletion priority - {CommitmentID: "commitment-1", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, + ExistingCRs: []*TestCR{ + {CommitmentUUID: "uuid-mbdel", State: v1alpha1.CommitmentStatusConfirmed, AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}, }, CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "commitment-1", "confirmed", 4), + deleteCommitment("hw_version_hana_1_ram", "project-A", "uuid-mbdel", "confirmed", 1), + createCommitment("hw_version_hana_1_ram", "project-B", "uuid-mbnew", 
"confirmed", 2), ), - AvailableResources: &AvailableResources{PerHost: map[string]int64{"host-1": 0, "host-2": 1024}}, - ExpectedReservations: []*TestReservation{ - // After rollback, should preserve the scheduled reservations (especially with VMs) - // and remove unscheduled ones first - {CommitmentID: "commitment-1", Host: "host-1", Flavor: m1Small, ProjectID: "project-A", VMs: []string{"vm-priority"}}, - {CommitmentID: "commitment-1", Host: "host-2", Flavor: m1Small, ProjectID: "project-A"}, - }, - ExpectedAPIResponse: newAPIResponse("commitment commitment-1: not sufficient capacity"), + ExpectedAPIResponse: newAPIResponse(), + ExpectedDeletedCRs: []string{"commitment-uuid-mbdel"}, + ExpectedCreatedCRNames: []string{"commitment-uuid-mbnew"}, + ExpectedAllowRejection: map[string]bool{"commitment-uuid-mbnew": true}, }, + // --- Pre-write validation failure rollback --- { - Name: "Watch timeout with custom config - triggers rollback with timeout error", + Name: "Pre-write validation failure: first CR written then rolled back on second CR's unknown flavor group", Flavors: []*TestFlavor{m1Small}, + // project-A (valid) sorts before project-B (invalid): A's CR is written, then B's + // unknown flavor group triggers a pre-watch rollback that deletes A's CR. CommitmentRequest: newCommitmentRequest("az-a", false, 1234, - createCommitment("hw_version_hana_1_ram", "project-A", "uuid-timeout", "confirmed", 2), + createCommitment("hw_version_hana_1_ram", "project-A", "uuid-pva", "confirmed", 2), + createCommitment("hw_version_nonexistent_ram", "project-B", "uuid-pvb", "confirmed", 2), ), - // With 0ms timeout, the watch will timeout immediately before reservations become ready - CustomConfig: func() *commitments.Config { - cfg := commitments.DefaultConfig() - cfg.ChangeAPIWatchReservationsTimeout = 0 * time.Millisecond - cfg.ChangeAPIWatchReservationsPollInterval = 100 * time.Millisecond - return &cfg - }(), - ExpectedReservations: []*TestReservation{}, // Rollback removes all reservations - ExpectedAPIResponse: newAPIResponse("timeout reached while processing commitment changes"), + ExpectedAPIResponse: newAPIResponse("flavor group not found"), + ExpectedDeletedCRs: []string{"commitment-uuid-pva"}, }, } for _, tc := range testCases { t.Run(tc.Name, func(t *testing.T) { - runCommitmentChangeTest(t, tc) + runChangeCommitmentsTest(t, tc) }) } } -// runCommitmentChangeTest executes a single commitment change integration test case. 
-func runCommitmentChangeTest(t *testing.T, tc CommitmentChangeTestCase) { +func runChangeCommitmentsTest(t *testing.T, tc CommitmentChangeTestCase) { t.Helper() - // Convert test types to actual types - var vms []VM - for _, testVM := range tc.VMs { - vms = append(vms, testVM.ToVM()) - } - - var flavorInGroups []compute.FlavorInGroup - for _, testFlavor := range tc.Flavors { - flavorInGroups = append(flavorInGroups, testFlavor.ToFlavorInGroup()) - } - - // Use EnvInfoVersion if specified (non-zero), otherwise default to CommitmentRequest.InfoVersion - envInfoVersion := tc.CommitmentRequest.InfoVersion - if tc.EnvInfoVersion != 0 { - envInfoVersion = tc.EnvInfoVersion - } - - flavorGroups := TestFlavorGroup{ - infoVersion: envInfoVersion, - flavors: flavorInGroups, - }.ToFlavorGroupsKnowledge() - - // Convert existing reservations with auto-numbering per commitment - var existingReservations []*v1alpha1.Reservation - numberCounters := make(map[string]int) - for _, testRes := range tc.ExistingReservations { - number := numberCounters[testRes.CommitmentID] - numberCounters[testRes.CommitmentID]++ - existingReservations = append(existingReservations, testRes.toReservation(number)) - } - - // Create test environment with available resources and custom config if provided - env := newCommitmentTestEnv(t, vms, nil, existingReservations, flavorGroups, tc.AvailableResources, tc.CustomConfig) + env := newCRTestEnv(t, tc) defer env.Close() - t.Log("Initial state:") - env.LogStateSummary() - - // Call commitment change API reqJSON := buildRequestJSON(tc.CommitmentRequest) - resp, respJSON, statusCode := env.CallChangeCommitmentsAPI(reqJSON) + resp, _, statusCode := env.CallChangeCommitmentsAPI(reqJSON) - t.Log("After API call:") - env.LogStateSummary() + env.VerifyAPIResponse(tc.ExpectedAPIResponse, resp, statusCode) - // Verify API response - env.VerifyAPIResponse(tc.ExpectedAPIResponse, resp, respJSON, statusCode) - - // Verify reservations using content-based matching - env.VerifyReservationsMatch(tc.ExpectedReservations) - - // Log final test result - if t.Failed() { - t.Log("❌ Test FAILED") - } else { - t.Log("✅ Test PASSED") + if len(tc.ExpectedCreatedCRNames) > 0 { + env.VerifyCRsExist(tc.ExpectedCreatedCRNames) + } + if tc.ExpectedAllowRejection != nil { + env.VerifyAllowRejection(tc.ExpectedAllowRejection) + } + for crName, expectedAmountBytes := range tc.ExpectedCRSpecs { + env.VerifyCRAmountBytes(crName, expectedAmountBytes) + } + for _, crName := range tc.ExpectedDeletedCRs { + env.VerifyCRAbsent(crName) } } // ============================================================================ -// Test Types & Constants +// Test Types // ============================================================================ const ( - defaultFlavorDiskGB = 40 - flavorGroupsKnowledgeName = "flavor-groups" - knowledgeRecencyDuration = 60 * time.Second - defaultCommitmentExpiryYears = 1 + defaultFlavorDiskGB = 40 + flavorGroupsKnowledgeName = "flavor-groups" + knowledgeRecencyDuration = 60 * time.Second ) type CommitmentChangeTestCase struct { - Name string - VMs []*TestVM - Flavors []*TestFlavor - ExistingReservations []*TestReservation - CommitmentRequest CommitmentChangeRequest - ExpectedReservations []*TestReservation - ExpectedAPIResponse APIResponseExpectation - AvailableResources *AvailableResources // If nil, all reservations accepted without checks - EnvInfoVersion int64 // Override InfoVersion for version mismatch tests - CustomConfig *commitments.Config // Override default config for testing timeout 
behavior -} - -// AvailableResources defines available memory per host (MB). -// Scheduler uses first-come-first-serve. CPU is ignored. -type AvailableResources struct { - PerHost map[string]int64 // host -> available memory MB -} - -type TestFlavorGroup struct { - infoVersion int64 - flavors []compute.FlavorInGroup -} - -func (tfg TestFlavorGroup) ToFlavorGroupsKnowledge() FlavorGroupsKnowledge { - groupMap := make(map[string][]compute.FlavorInGroup) - - for _, flavor := range tfg.flavors { - groupName := flavor.ExtraSpecs["quota:hw_version"] - if groupName == "" { - panic("Flavor " + flavor.Name + " is missing quota:hw_version in extra specs") - } - groupMap[groupName] = append(groupMap[groupName], flavor) - } - - // Sort group names for deterministic iteration - sortedGroupNames := make([]string, 0, len(groupMap)) - for groupName := range groupMap { - sortedGroupNames = append(sortedGroupNames, groupName) - } - sort.Strings(sortedGroupNames) - - var groups []compute.FlavorGroupFeature - for _, groupName := range sortedGroupNames { - groupFlavors := groupMap[groupName] - if len(groupFlavors) == 0 { - continue - } - - // Sort descending: required by reservation manager's flavor selection - sort.Slice(groupFlavors, func(i, j int) bool { - return groupFlavors[i].MemoryMB > groupFlavors[j].MemoryMB - }) - - smallest := groupFlavors[len(groupFlavors)-1] - largest := groupFlavors[0] - - // Compute RAM/core ratio (MiB per vCPU) - var minRatio, maxRatio uint64 = ^uint64(0), 0 - for _, f := range groupFlavors { - if f.VCPUs == 0 { - continue - } - ratio := f.MemoryMB / f.VCPUs - if ratio < minRatio { - minRatio = ratio - } - if ratio > maxRatio { - maxRatio = ratio - } - } - - var ramCoreRatio, ramCoreRatioMin, ramCoreRatioMax *uint64 - if minRatio == maxRatio && maxRatio != 0 { - ramCoreRatio = &minRatio - } else if maxRatio != 0 { - ramCoreRatioMin = &minRatio - ramCoreRatioMax = &maxRatio - } - - groups = append(groups, compute.FlavorGroupFeature{ - Name: groupName, - Flavors: groupFlavors, - SmallestFlavor: smallest, - LargestFlavor: largest, - RamCoreRatio: ramCoreRatio, - RamCoreRatioMin: ramCoreRatioMin, - RamCoreRatioMax: ramCoreRatioMax, - }) - } - - return FlavorGroupsKnowledge{ - InfoVersion: tfg.infoVersion, - Groups: groups, - } + Name string + Flavors []*TestFlavor + // ExistingCRs: CommittedResource CRDs present before the API call. + ExistingCRs []*TestCR + // CROutcomes: what condition the fake controller sets per crName. + // Value = rejection reason if non-empty and not a named reason constant. + // Value = CommittedResourceReasonPlanned to simulate a planned outcome. + // Absent entry = controller accepts (Ready=True). + CROutcomes map[string]string + // NoCondition: crNames for which the fake controller sets no condition (simulate stall/timeout). + NoCondition []string + CommitmentRequest CommitmentChangeRequest + ExpectedAPIResponse APIResponseExpectation + // Post-call assertions. + ExpectedCreatedCRNames []string + ExpectedAllowRejection map[string]bool // crName → expected AllowRejection value + ExpectedCRSpecs map[string]int64 // crName → expected Amount.Value() in bytes + ExpectedDeletedCRs []string + CustomConfig *commitments.APIConfig + EnvInfoVersion int64 } -type FlavorGroupsKnowledge struct { - InfoVersion int64 - Groups []compute.FlavorGroupFeature +// TestCR defines a pre-existing CommittedResource CRD. 
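+// A typical entry mirrors what the change-commitments API writes for one
+// commitment, e.g. (values copied from the deletion case above):
+//
+//	&TestCR{CommitmentUUID: "uuid-del", State: v1alpha1.CommitmentStatusConfirmed,
+//		AmountMiB: 1024, ProjectID: "project-A", AZ: "az-a"}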
+type TestCR struct { + CommitmentUUID string + State v1alpha1.CommitmentStatus + AmountMiB int64 + ProjectID string + AZ string } type CommitmentChangeRequest struct { @@ -704,29 +467,15 @@ type TestCommitment struct { ResourceName liquid.ResourceName ProjectID string ConfirmationID string - State string + OldState string // empty = None (no prior status) + State string // empty = None (deletion) Amount uint64 + OldAmount uint64 // if non-zero, used for TotalBefore totals instead of Amount (for resize-down) } type APIResponseExpectation struct { StatusCode int RejectReasonSubstrings []string - RetryAtPresent bool -} - -type ReservationVerification struct { - Host string - Allocations map[string]string -} - -type VM struct { - UUID string - FlavorName string - ProjectID string - CurrentHypervisor string - AvailabilityZone string - Resources map[string]int64 - FlavorExtraSpecs map[string]string } type TestFlavor struct { @@ -735,7 +484,7 @@ type TestFlavor struct { MemoryMB int64 VCPUs int64 DiskGB uint64 - VideoRAMMiB *uint64 // optional, from flavor extra_specs hw_video:ram_max_mb + VideoRAMMiB *uint64 } func (f *TestFlavor) ToFlavorInGroup() compute.FlavorInGroup { @@ -758,1098 +507,576 @@ func (f *TestFlavor) ToFlavorInGroup() compute.FlavorInGroup { } } -type TestVM struct { - UUID string - Flavor *TestFlavor - ProjectID string - Host string - AZ string -} - -func (vm *TestVM) ToVM() VM { - return VM{ - UUID: vm.UUID, - FlavorName: vm.Flavor.Name, - ProjectID: vm.ProjectID, - CurrentHypervisor: vm.Host, - AvailabilityZone: vm.AZ, - Resources: map[string]int64{ - "memory": vm.Flavor.MemoryMB, - "vcpus": vm.Flavor.VCPUs, - }, - FlavorExtraSpecs: map[string]string{ - "quota:hw_version": vm.Flavor.Group, - }, - } +type FlavorGroupsKnowledge struct { + InfoVersion int64 + Groups []compute.FlavorGroupFeature } -type TestReservation struct { - CommitmentID string - Host string // Empty = any host accepted in matching - Flavor *TestFlavor - ProjectID string - VMs []string // VM UUIDs - MemoryMB int64 // If 0, uses Flavor.MemoryMB; else custom size - AZ string +// TestFlavorGroup groups a flat list of FlavorInGroup by hw_version extra spec +// and builds a FlavorGroupsKnowledge. Used by usage_test.go and report_usage_test.go. 
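+// Example construction (illustrative):
+//
+//	k := TestFlavorGroup{
+//		infoVersion: 1234,
+//		flavors: []compute.FlavorInGroup{
+//			(&TestFlavor{Name: "m1.small", Group: "hana_1", MemoryMB: 1024, VCPUs: 4}).ToFlavorInGroup(),
+//		},
+//	}.ToFlavorGroupsKnowledge()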
+type TestFlavorGroup struct { + infoVersion int64 + flavors []compute.FlavorInGroup } -func (tr *TestReservation) toReservation(number int) *v1alpha1.Reservation { - name := fmt.Sprintf("commitment-%s-%d", tr.CommitmentID, number) - - memoryMB := tr.MemoryMB - if memoryMB == 0 { - memoryMB = tr.Flavor.MemoryMB - } - - specAllocations := make(map[string]v1alpha1.CommittedResourceAllocation) - statusAllocations := make(map[string]string) - for _, vmUUID := range tr.VMs { - specAllocations[vmUUID] = v1alpha1.CommittedResourceAllocation{ - CreationTimestamp: metav1.Now(), - Resources: map[hv1.ResourceName]resource.Quantity{ - "memory": resource.MustParse(strconv.FormatInt(memoryMB, 10) + "Mi"), - "cpu": resource.MustParse(strconv.FormatInt(tr.Flavor.VCPUs, 10)), - }, - } - statusAllocations[vmUUID] = tr.Host - } - - spec := v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - TargetHost: tr.Host, - Resources: map[hv1.ResourceName]resource.Quantity{ - "memory": resource.MustParse(strconv.FormatInt(memoryMB, 10) + "Mi"), - "cpu": resource.MustParse(strconv.FormatInt(tr.Flavor.VCPUs, 10)), - }, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - CommitmentUUID: tr.CommitmentID, - ProjectID: tr.ProjectID, - ResourceName: tr.Flavor.Name, - ResourceGroup: tr.Flavor.Group, - Allocations: specAllocations, - }, +func (tg TestFlavorGroup) ToFlavorGroupsKnowledge() FlavorGroupsKnowledge { + groupMap := make(map[string][]compute.FlavorInGroup) + for _, f := range tg.flavors { + name := f.ExtraSpecs["quota:hw_version"] + groupMap[name] = append(groupMap[name], f) } - if tr.AZ != "" { - spec.AvailabilityZone = tr.AZ + sortedNames := make([]string, 0, len(groupMap)) + for n := range groupMap { + sortedNames = append(sortedNames, n) } + sort.Strings(sortedNames) - return &v1alpha1.Reservation{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Labels: map[string]string{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }, - }, - Spec: spec, - Status: v1alpha1.ReservationStatus{ - Conditions: []metav1.Condition{ - { - Type: v1alpha1.ReservationConditionReady, - Status: metav1.ConditionTrue, - Reason: "ReservationActive", - }, - }, - Host: tr.Host, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationStatus{ - Allocations: statusAllocations, - }, - }, + var groups []compute.FlavorGroupFeature + for _, name := range sortedNames { + gFlavors := groupMap[name] + sort.Slice(gFlavors, func(i, j int) bool { return gFlavors[i].MemoryMB > gFlavors[j].MemoryMB }) + smallest := gFlavors[len(gFlavors)-1] + largest := gFlavors[0] + + var minR, maxR uint64 = ^uint64(0), 0 + for _, f := range gFlavors { + if f.VCPUs == 0 { + continue + } + r := f.MemoryMB / f.VCPUs + if r < minR { + minR = r + } + if r > maxR { + maxR = r + } + } + var ratio, ratioMin, ratioMax *uint64 + if minR == maxR && maxR != 0 { + ratio = &minR + } else if maxR != 0 { + ratioMin = &minR + ratioMax = &maxR + } + groups = append(groups, compute.FlavorGroupFeature{ + Name: name, + Flavors: gFlavors, + SmallestFlavor: smallest, + LargestFlavor: largest, + RamCoreRatio: ratio, + RamCoreRatioMin: ratioMin, + RamCoreRatioMax: ratioMax, + }) } + return FlavorGroupsKnowledge{InfoVersion: tg.infoVersion, Groups: groups} } // ============================================================================ -// Test Environment +// Fake Controller Client // ============================================================================ -type CommitmentTestEnv struct { - T 
*testing.T - Scheme *runtime.Scheme - K8sClient client.Client - VMSource *MockVMSource - FlavorGroups FlavorGroupsKnowledge - HTTPServer *httptest.Server - API *HTTPAPI - availableResources map[string]int64 // host -> available memory MB - processedReserv map[string]bool // track processed reservations - mu sync.Mutex // protects availableResources and processedReserv +// fakeControllerClient wraps a client.Client and simulates the CommittedResource +// controller by immediately setting conditions after any Create or Update of a +// CommittedResource CRD. Entries in noCondition suppress condition-setting to +// simulate a stalled controller (used for timeout tests). +type fakeControllerClient struct { + client.Client + outcomes map[string]string // crName → rejection reason (or reason constant); absent = accept + noCondition map[string]struct{} + mu sync.Mutex } -// FakeReservationController simulates synchronous reservation controller. -type FakeReservationController struct { - env *CommitmentTestEnv +func (c *fakeControllerClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error { + if err := c.Client.Create(ctx, obj, opts...); err != nil { + return err + } + if cr, ok := obj.(*v1alpha1.CommittedResource); ok { + c.setConditionFor(ctx, cr.Name) + } + return nil } -func (c *FakeReservationController) OnReservationCreated(res *v1alpha1.Reservation) { - c.env.processNewReservation(res) +func (c *fakeControllerClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { + if err := c.Client.Update(ctx, obj, opts...); err != nil { + return err + } + if cr, ok := obj.(*v1alpha1.CommittedResource); ok { + c.setConditionFor(ctx, cr.Name) + } + return nil } -func (c *FakeReservationController) OnReservationDeleted(res *v1alpha1.Reservation) { - c.env.mu.Lock() - defer c.env.mu.Unlock() +func (c *fakeControllerClient) setConditionFor(ctx context.Context, crName string) { + c.mu.Lock() + _, skip := c.noCondition[crName] + outcome, hasOutcome := c.outcomes[crName] + c.mu.Unlock() - // Return memory when Delete() is called directly (before deletion timestamp is set) - if c.env.availableResources != nil && res.Status.Host != "" { - memoryQuantity := res.Spec.Resources["memory"] - memoryBytes := memoryQuantity.Value() - memoryMB := memoryBytes / (1024 * 1024) + if skip { + return + } - if _, exists := c.env.availableResources[res.Status.Host]; exists { - c.env.availableResources[res.Status.Host] += memoryMB - c.env.T.Logf("↩ Returned %d MB to %s (now %d MB available) via OnReservationDeleted for %s", - memoryMB, res.Status.Host, c.env.availableResources[res.Status.Host], res.Name) + var cond metav1.Condition + switch { + case !hasOutcome || outcome == "": + // Default: controller accepts. 
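+		// An explicitly empty outcome value is treated the same as an absent entry.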
+ cond = metav1.Condition{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionTrue, + Reason: v1alpha1.CommittedResourceReasonAccepted, + Message: "accepted", + } + case outcome == v1alpha1.CommittedResourceReasonPlanned: + cond = metav1.Condition{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + Reason: v1alpha1.CommittedResourceReasonPlanned, + Message: "commitment is not yet active", + } + default: + cond = metav1.Condition{ + Type: v1alpha1.CommittedResourceConditionReady, + Status: metav1.ConditionFalse, + Reason: v1alpha1.CommittedResourceReasonRejected, + Message: outcome, } } - // Clear tracking so recreated reservations with same name are processed - delete(c.env.processedReserv, res.Name) + cr := &v1alpha1.CommittedResource{} + if err := c.Get(ctx, client.ObjectKey{Name: crName}, cr); err != nil { + return + } + meta.SetStatusCondition(&cr.Status.Conditions, cond) + if err := c.Client.Status().Update(ctx, cr); err != nil { + return // best-effort: if the update races with another write, the polling loop retries + } } -// operationInterceptorClient routes reservation events to FakeReservationController. -type operationInterceptorClient struct { - client.Client - controller *FakeReservationController +// ============================================================================ +// Test Environment +// ============================================================================ + +type CRTestEnv struct { + T *testing.T + K8sClient client.Client + HTTPServer *httptest.Server } -func (d *operationInterceptorClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error { - err := d.Client.Create(ctx, obj, opts...) - if err != nil { - return err - } - - if res, ok := obj.(*v1alpha1.Reservation); ok { - d.controller.OnReservationCreated(res) - } - - return nil -} - -func (d *operationInterceptorClient) Delete(ctx context.Context, obj client.Object, opts ...client.DeleteOption) error { - if res, ok := obj.(*v1alpha1.Reservation); ok { - d.controller.OnReservationDeleted(res) - } - - return d.Client.Delete(ctx, obj, opts...) -} - -func (env *CommitmentTestEnv) Close() { - if env.HTTPServer != nil { - env.HTTPServer.Close() - } -} - -func newCommitmentTestEnv( - t *testing.T, - vms []VM, - hypervisors []*hv1.Hypervisor, - reservations []*v1alpha1.Reservation, - flavorGroups FlavorGroupsKnowledge, - resources *AvailableResources, - customConfig *commitments.Config, -) *CommitmentTestEnv { - +func newCRTestEnv(t *testing.T, tc CommitmentChangeTestCase) *CRTestEnv { t.Helper() - log.SetLogger(zap.New(zap.WriteTo(os.Stderr), zap.UseDevMode(true))) - objects := make([]client.Object, 0, len(hypervisors)+len(reservations)) - for _, hv := range hypervisors { - objects = append(objects, hv) - } - for _, res := range reservations { - objects = append(objects, res) - } - scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add v1alpha1 scheme: %v", err) + t.Fatalf("failed to add v1alpha1 scheme: %v", err) } - if err := hv1.AddToScheme(scheme); err != nil { - t.Fatalf("Failed to add hv1 scheme: %v", err) + + objects := make([]client.Object, 0) + + // Knowledge CRD (InfoVersion=-1 simulates "not ready"). 
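+	// EnvInfoVersion, when non-zero, overrides the request's InfoVersion for the
+	// Knowledge CRD, letting a test create knowledge older or newer than the request claims.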
+ envInfoVersion := tc.CommitmentRequest.InfoVersion + if tc.EnvInfoVersion != 0 { + envInfoVersion = tc.EnvInfoVersion + } + if envInfoVersion != -1 { + objects = append(objects, createKnowledgeCRD(buildFlavorGroupsKnowledge(tc.Flavors, envInfoVersion))) } - // InfoVersion of -1 skips Knowledge CRD creation (tests "not ready" scenario) - if flavorGroups.InfoVersion != -1 { - knowledgeCRD := createKnowledgeCRD(flavorGroups) - objects = append(objects, knowledgeCRD) + // Pre-existing CommittedResource CRDs. + for _, tcr := range tc.ExistingCRs { + objects = append(objects, tcr.toCommittedResource()) } - baseK8sClient := fake.NewClientBuilder(). + baseClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(objects...). - WithStatusSubresource(&v1alpha1.Reservation{}). - WithStatusSubresource(&v1alpha1.Knowledge{}). - WithIndex(&v1alpha1.Reservation{}, "spec.type", func(obj client.Object) []string { - res := obj.(*v1alpha1.Reservation) - return []string{string(res.Spec.Type)} - }). + WithStatusSubresource(&v1alpha1.CommittedResource{}, &v1alpha1.Knowledge{}). Build() - var availableResources map[string]int64 - if resources != nil && resources.PerHost != nil { - availableResources = make(map[string]int64) - for host, memMB := range resources.PerHost { - availableResources[host] = memMB - } + noCondition := make(map[string]struct{}) + for _, name := range tc.NoCondition { + noCondition[name] = struct{}{} } - env := &CommitmentTestEnv{ - T: t, - Scheme: scheme, - K8sClient: nil, // Will be set below - VMSource: NewMockVMSource(vms), - FlavorGroups: flavorGroups, - HTTPServer: nil, // Will be set below - API: nil, // Will be set below - availableResources: availableResources, - processedReserv: make(map[string]bool), + wrapped := &fakeControllerClient{ + Client: baseClient, + outcomes: tc.CROutcomes, + noCondition: noCondition, } - controller := &FakeReservationController{env: env} - wrappedClient := &operationInterceptorClient{ - Client: baseK8sClient, - controller: controller, - } - env.K8sClient = wrappedClient - - // Use custom config if provided, otherwise use default var api *HTTPAPI - if customConfig != nil { - api = NewAPIWithConfig(wrappedClient, *customConfig, nil) + if tc.CustomConfig != nil { + api = NewAPIWithConfig(wrapped, *tc.CustomConfig, nil) } else { - api = NewAPI(wrappedClient) + api = NewAPI(wrapped) } mux := http.NewServeMux() registry := prometheus.NewRegistry() api.Init(mux, registry, log.Log) - httpServer := httptest.NewServer(mux) - - env.HTTPServer = httpServer - env.API = api - - return env -} - -// ============================================================================ -// Environment Helper Methods -// ============================================================================ - -// ListVMs returns all VMs from the VMSource. -func (env *CommitmentTestEnv) ListVMs() []VM { - vms, err := env.VMSource.ListVMs(context.Background()) - if err != nil { - env.T.Fatalf("Failed to list VMs: %v", err) - } - return vms -} - -// ListReservations returns all reservations. -func (env *CommitmentTestEnv) ListReservations() []v1alpha1.Reservation { - var list v1alpha1.ReservationList - if err := env.K8sClient.List(context.Background(), &list); err != nil { - env.T.Fatalf("Failed to list reservations: %v", err) - } - return list.Items -} -// ListHypervisors returns all hypervisors. 
-func (env *CommitmentTestEnv) ListHypervisors() []hv1.Hypervisor { - var list hv1.HypervisorList - if err := env.K8sClient.List(context.Background(), &list); err != nil { - env.T.Fatalf("Failed to list hypervisors: %v", err) + return &CRTestEnv{ + T: t, + K8sClient: wrapped, + HTTPServer: httptest.NewServer(mux), } - return list.Items } -// LogStateSummary logs a summary of the current state. -func (env *CommitmentTestEnv) LogStateSummary() { - env.T.Helper() - - hypervisors := env.ListHypervisors() - vms := env.ListVMs() - reservations := env.ListReservations() - - env.T.Log("=== State Summary ===") - env.T.Logf("Hypervisors: %d", len(hypervisors)) - env.T.Logf("VMs: %d", len(vms)) - env.T.Logf("Reservations: %d", len(reservations)) - - for _, res := range reservations { - allocCount := 0 - if res.Status.CommittedResourceReservation != nil { - allocCount = len(res.Status.CommittedResourceReservation.Allocations) - } - env.T.Logf(" - %s (host: %s, allocations: %d)", res.Name, res.Status.Host, allocCount) +func (env *CRTestEnv) Close() { + if env.HTTPServer != nil { + env.HTTPServer.Close() } - env.T.Log("=====================") } -// CallChangeCommitmentsAPI calls the change commitments API endpoint with JSON. -// Reservation processing is fully synchronous via operationInterceptorClient hooks. -func (env *CommitmentTestEnv) CallChangeCommitmentsAPI(reqJSON string) (resp liquid.CommitmentChangeResponse, respJSON string, statusCode int) { +func (env *CRTestEnv) CallChangeCommitmentsAPI(reqJSON string) (resp liquid.CommitmentChangeResponse, respBody string, statusCode int) { env.T.Helper() - - // Make HTTP request - reservation processing happens synchronously via Create/Delete hooks url := env.HTTPServer.URL + "/commitments/v1/change-commitments" - httpResp, err := http.Post(url, "application/json", bytes.NewReader([]byte(reqJSON))) //nolint:gosec,noctx // test server URL, not user input + httpResp, err := http.Post(url, "application/json", bytes.NewReader([]byte(reqJSON))) //nolint:gosec,noctx if err != nil { - env.T.Fatalf("Failed to make HTTP request: %v", err) + env.T.Fatalf("HTTP request failed: %v", err) } defer httpResp.Body.Close() - - // Read response body - respBytes, err := io.ReadAll(httpResp.Body) + raw, err := io.ReadAll(httpResp.Body) if err != nil { - env.T.Fatalf("Failed to read response body: %v", err) + env.T.Fatalf("failed to read response: %v", err) } - - respJSON = string(respBytes) - - // Parse response - only for 200 OK responses - // Non-200 responses (like 409 Conflict for version mismatch) use plain text via http.Error() if httpResp.StatusCode == http.StatusOK { - if err := json.Unmarshal(respBytes, &resp); err != nil { - env.T.Fatalf("Failed to unmarshal response: %v", err) + if err := json.Unmarshal(raw, &resp); err != nil { + env.T.Fatalf("failed to unmarshal response: %v", err) } } - - // Final pass to handle any deletions (finalizer removal) - env.processReservations() - - statusCode = httpResp.StatusCode - return resp, respJSON, statusCode + return resp, string(raw), httpResp.StatusCode } -// processReservations handles all reservation lifecycle events synchronously. -// This includes marking reservations as Ready/Failed and removing finalizers from deleted reservations. 
-func (env *CommitmentTestEnv) processReservations() { - ctx := context.Background() - reservations := env.ListReservations() - - for _, res := range reservations { - // Handle deletion - return memory to host and remove finalizers - if !res.DeletionTimestamp.IsZero() { - env.T.Logf("Processing deletion for reservation %s (host: %s)", res.Name, res.Status.Host) - - env.mu.Lock() - // Return memory to host if resource tracking is enabled - if env.availableResources != nil { - env.T.Logf("Resource tracking enabled, returning memory for %s", res.Name) - memoryQuantity := res.Spec.Resources["memory"] - memoryBytes := memoryQuantity.Value() - memoryMB := memoryBytes / (1024 * 1024) - - env.T.Logf("Reservation %s has host=%s, memory=%d MB", res.Name, res.Status.Host, memoryMB) - - // Check if host exists in our tracking - if _, exists := env.availableResources[res.Status.Host]; !exists { - env.mu.Unlock() - env.T.Fatalf("Host %s not found in available resources for reservation %s - this indicates an inconsistency", - res.Status.Host, res.Name) - } - - // Return memory to host - env.availableResources[res.Status.Host] += memoryMB - env.T.Logf("↩ Returned %d MB to %s (now %d MB available) from deleted reservation %s", - memoryMB, res.Status.Host, env.availableResources[res.Status.Host], res.Name) - } else { - env.T.Logf("Resource tracking NOT enabled for %s", res.Name) - } - - // Clear tracking so recreated reservations with same name are processed - delete(env.processedReserv, res.Name) - env.mu.Unlock() - - // Remove finalizers to allow deletion - if len(res.Finalizers) > 0 { - res.Finalizers = []string{} - if err := env.K8sClient.Update(ctx, &res); err != nil { - // Ignore errors - might be already deleted - continue - } - } - continue - } - - // Skip if already processed (has a condition set) - if env.hasCondition(&res) { - continue - } - - env.mu.Lock() - alreadyProcessed := env.processedReserv[res.Name] - env.mu.Unlock() - - // Skip if already tracked as processed - if alreadyProcessed { - continue - } - - // Process new reservation with resource-based scheduling - env.processNewReservation(&res) - } -} - -// hasCondition checks if a reservation has any Ready condition set. -func (env *CommitmentTestEnv) hasCondition(res *v1alpha1.Reservation) bool { - for _, cond := range res.Status.Conditions { - if cond.Type == v1alpha1.ReservationConditionReady { - return true - } - } - return false -} - -// processNewReservation implements first-come-first-serve scheduling based on available resources. -// It tries to find a host with enough memory capacity and assigns the reservation to that host. 
-func (env *CommitmentTestEnv) processNewReservation(res *v1alpha1.Reservation) { - env.mu.Lock() - defer env.mu.Unlock() - - env.processedReserv[res.Name] = true - - if res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.ResourceGroup == "" || res.Spec.Resources == nil || res.Spec.Resources["memory"] == (resource.Quantity{}) { - env.markReservationFailedStatus(res, "invalid reservation spec") - env.T.Logf("✗ Invalid reservation spec for %s: marking as failed (resource group: %s, resources: %v)", res.Name, res.Spec.CommittedResourceReservation.ResourceGroup, res.Spec.Resources) - return - } - - // If no available resources configured, accept all reservations without host assignment - if env.availableResources == nil { - env.T.Logf("✓ Scheduled reservation %s - no resource tracking, simply accept", res.Name) - env.markReservationSchedulerProcessedStatus(res, "some-host") - return - } - - // Get required memory from reservation spec - memoryQuantity := res.Spec.Resources["memory"] - memoryBytes := memoryQuantity.Value() - memoryMB := memoryBytes / (1024 * 1024) - - // First-come-first-serve: find first host with enough capacity - // Sort hosts to ensure deterministic behavior (Go map iteration is random) - hosts := make([]string, 0, len(env.availableResources)) - for host := range env.availableResources { - hosts = append(hosts, host) +func (env *CRTestEnv) VerifyAPIResponse(expected APIResponseExpectation, resp liquid.CommitmentChangeResponse, statusCode int) { + env.T.Helper() + expectedCode := expected.StatusCode + if expectedCode == 0 { + expectedCode = http.StatusOK } - sort.Strings(hosts) - - var selectedHost string - for _, host := range hosts { - if env.availableResources[host] >= memoryMB { - selectedHost = host - break - } + if statusCode != expectedCode { + env.T.Errorf("expected status %d, got %d", expectedCode, statusCode) } - - if selectedHost != "" { - // SUCCESS: Schedule on this host - env.availableResources[selectedHost] -= memoryMB - - // Update reservation with selected host - ctx := context.Background() - - // Update spec (TargetHost) - res.Spec.TargetHost = selectedHost - if err := env.K8sClient.Update(ctx, res); err != nil { - env.T.Logf("Warning: Failed to update reservation spec: %v", err) - } - - // Update status (Host) - requires Status().Update - res.Status.Host = selectedHost - if err := env.K8sClient.Status().Update(ctx, res); err != nil { - env.T.Logf("Warning: Failed to update reservation status host: %v", err) + for _, sub := range expected.RejectReasonSubstrings { + if !strings.Contains(resp.RejectionReason, sub) { + env.T.Errorf("rejection reason %q does not contain %q", resp.RejectionReason, sub) } - - env.markReservationSchedulerProcessedStatus(res, selectedHost) - env.T.Logf("✓ Scheduled reservation %s on %s (%d MB used, %d MB remaining)", - res.Name, selectedHost, memoryMB, env.availableResources[selectedHost]) - } else { - env.markReservationSchedulerProcessedStatus(res, "") - env.T.Logf("✗ Failed to schedule reservation %s (needs %d MB, no host has capacity)", - res.Name, memoryMB) } } -// markReservationSchedulerProcessedStatus updates a reservation status based on scheduling result. -// If host is non-empty, sets Ready=True (success). If host is empty, sets Ready=False with NoHostsFound (failure). 
-func (env *CommitmentTestEnv) markReservationSchedulerProcessedStatus(res *v1alpha1.Reservation, host string) { - ctx := context.Background() - - // Update spec first - res.Spec.TargetHost = host - if err := env.K8sClient.Update(ctx, res); err != nil { - env.T.Logf("Warning: Failed to update reservation spec: %v", err) - return - } - - // Then update status - Ready=True only if host was found, Ready=False otherwise - res.Status.Host = host - if host != "" { - res.Status.Conditions = []metav1.Condition{ - { - Type: v1alpha1.ReservationConditionReady, - Status: metav1.ConditionTrue, - Reason: "ReservationActive", - Message: "Reservation is ready (set by test controller)", - LastTransitionTime: metav1.Now(), - }, - } - } else { - res.Status.Conditions = []metav1.Condition{ - { - Type: v1alpha1.ReservationConditionReady, - Status: metav1.ConditionFalse, - Reason: "NoHostsFound", - Message: "No hosts with sufficient capacity (set by test controller)", - LastTransitionTime: metav1.Now(), - }, +func (env *CRTestEnv) VerifyCRsExist(names []string) { + env.T.Helper() + for _, name := range names { + cr := &v1alpha1.CommittedResource{} + if err := env.K8sClient.Get(context.Background(), client.ObjectKey{Name: name}, cr); err != nil { + env.T.Errorf("expected CommittedResource %q to exist, but got: %v", name, err) } } - if err := env.K8sClient.Status().Update(ctx, res); err != nil { - env.T.Logf("Warning: Failed to update reservation status: %v", err) - } } -// markReservationFailedStatus updates a reservation to have Ready=False status -func (env *CommitmentTestEnv) markReservationFailedStatus(res *v1alpha1.Reservation, reason string) { - res.Status.Conditions = []metav1.Condition{ - { - Type: v1alpha1.ReservationConditionReady, - Status: metav1.ConditionFalse, - Reason: "Reservation invalid", - Message: reason, - LastTransitionTime: metav1.Now(), - }, - } - - if err := env.K8sClient.Status().Update(context.Background(), res); err != nil { - // Ignore errors - might be deleted during update - return +func (env *CRTestEnv) VerifyCRAbsent(name string) { + env.T.Helper() + cr := &v1alpha1.CommittedResource{} + err := env.K8sClient.Get(context.Background(), client.ObjectKey{Name: name}, cr) + if err == nil { + env.T.Errorf("expected CommittedResource %q to be absent after rollback, but it still exists", name) + } else if !apierrors.IsNotFound(err) { + env.T.Errorf("unexpected error checking if CommittedResource %q is absent: %v", name, err) } } -// VerifyAPIResponse verifies the API response matches expectations. -// For rejection reasons, it checks if ALL expected substrings are present in the actual rejection reason. 
-func (env *CommitmentTestEnv) VerifyAPIResponse(expected APIResponseExpectation, actual liquid.CommitmentChangeResponse, respJSON string, statusCode int) { +func (env *CRTestEnv) VerifyAllowRejection(expected map[string]bool) { env.T.Helper() - - if statusCode != expected.StatusCode { - env.T.Errorf("Expected status code %d, got %d", expected.StatusCode, statusCode) - } - - if len(expected.RejectReasonSubstrings) > 0 { - if actual.RejectionReason == "" { - env.T.Errorf("Expected rejection reason containing substrings %v, got none", expected.RejectReasonSubstrings) - } else { - // Check that ALL expected substrings are present - for _, substring := range expected.RejectReasonSubstrings { - if !strings.Contains(actual.RejectionReason, substring) { - env.T.Errorf("Expected rejection reason to contain %q, but got %q", substring, actual.RejectionReason) - } - } - } - } else { - if actual.RejectionReason != "" { - env.T.Errorf("Expected no rejection reason, got %q", actual.RejectionReason) - } - } - - // Check RetryAt field presence in JSON (avoids dealing with option.Option type) - retryAtPresent := strings.Contains(respJSON, `"retryAt"`) - if expected.RetryAtPresent { - if !retryAtPresent { - env.T.Error("Expected retryAt field to be present in JSON response, but it was not found") + for crName, want := range expected { + cr := &v1alpha1.CommittedResource{} + if err := env.K8sClient.Get(context.Background(), client.ObjectKey{Name: crName}, cr); err != nil { + env.T.Errorf("CommittedResource %q not found: %v", crName, err) + continue } - } else { - if retryAtPresent { - env.T.Error("Expected retryAt field to be absent from JSON response, but it was found") + if cr.Spec.AllowRejection != want { + env.T.Errorf("CommittedResource %q: AllowRejection=%v, want %v", crName, cr.Spec.AllowRejection, want) } } } -// VerifyReservationsMatch verifies that actual reservations match expected reservations by content. -func (env *CommitmentTestEnv) VerifyReservationsMatch(expected []*TestReservation) { +func (env *CRTestEnv) VerifyCRAmountBytes(crName string, wantBytes int64) { env.T.Helper() - - actualReservations := env.ListReservations() - - // Make copies of both lists so we can remove matched items - expectedCopy := make([]*TestReservation, len(expected)) - copy(expectedCopy, expected) - - actualCopy := make([]v1alpha1.Reservation, len(actualReservations)) - copy(actualCopy, actualReservations) - - // Track unmatched items for detailed reporting - var unmatchedExpected []*TestReservation - var unmatchedActual []v1alpha1.Reservation - - // Greedy matching: while there are expected items, find matches and remove - for len(expectedCopy) > 0 { - exp := expectedCopy[0] - found := false - - // Find first actual that matches this expected - for i, actual := range actualCopy { - if env.reservationMatches(exp, &actual) { - expectedCopy = expectedCopy[1:] - actualCopy = append(actualCopy[:i], actualCopy[i+1:]...) 
- found = true - break - } - } - - if !found { - unmatchedExpected = append(unmatchedExpected, exp) - expectedCopy = expectedCopy[1:] - } + cr := &v1alpha1.CommittedResource{} + if err := env.K8sClient.Get(context.Background(), client.ObjectKey{Name: crName}, cr); err != nil { + env.T.Errorf("CommittedResource %q not found: %v", crName, err) + return } - - unmatchedActual = actualCopy - - // If there are any mismatches, print detailed comparison - if len(unmatchedExpected) > 0 || len(unmatchedActual) > 0 { - env.T.Error("❌ Reservation mismatch detected!") - env.T.Log("") - env.T.Log("═══════════════════════════════════════════════════════════════") - env.T.Log("EXPECTED RESERVATIONS:") - env.T.Log("═══════════════════════════════════════════════════════════════") - env.printExpectedReservations(expected, unmatchedExpected) - - env.T.Log("") - env.T.Log("═══════════════════════════════════════════════════════════════") - env.T.Log("ACTUAL RESERVATIONS:") - env.T.Log("═══════════════════════════════════════════════════════════════") - env.printActualReservations(actualReservations, unmatchedActual) - - env.T.Log("") - env.T.Log("═══════════════════════════════════════════════════════════════") - env.T.Log("DIFF SUMMARY:") - env.T.Log("═══════════════════════════════════════════════════════════════") - env.printDiffSummary(unmatchedExpected, unmatchedActual) - env.T.Log("═══════════════════════════════════════════════════════════════") + got := cr.Spec.Amount.Value() + if got != wantBytes { + env.T.Errorf("CommittedResource %q: Amount=%d bytes, want %d bytes", crName, got, wantBytes) } } -// String returns a compact string representation of a TestReservation. -func (tr *TestReservation) String() string { - flavorName := "" - flavorGroup := "" - if tr.Flavor != nil { - flavorName = tr.Flavor.Name - flavorGroup = tr.Flavor.Group - } - - host := tr.Host - if host == "" { - host = "" - } - - az := tr.AZ - if az == "" { - az = "" - } +// ============================================================================ +// TestCR → v1alpha1.CommittedResource +// ============================================================================ - vmInfo := "" - if len(tr.VMs) > 0 { - vmInfo = fmt.Sprintf(" VMs=%v", tr.VMs) +func (tc *TestCR) toCommittedResource() *v1alpha1.CommittedResource { + amount := resource.NewQuantity(tc.AmountMiB*1024*1024, resource.BinarySI) + return &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{ + Name: "commitment-" + tc.CommitmentUUID, + }, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: tc.CommitmentUUID, + FlavorGroupName: "hana_1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + Amount: *amount, + AvailabilityZone: tc.AZ, + ProjectID: tc.ProjectID, + State: tc.State, + }, } - - return fmt.Sprintf("%s/%s/%s(%s)/%s/az=%s%s", tr.CommitmentID, tr.ProjectID, flavorName, flavorGroup, host, az, vmInfo) } -// compactReservationString returns a compact string representation of an actual Reservation. 
-func compactReservationString(res *v1alpha1.Reservation) string { - commitmentID := "" - projectID := "" - flavorName := "" - flavorGroup := "" - vmCount := 0 - - if res.Spec.CommittedResourceReservation != nil { - commitmentID = res.Spec.CommittedResourceReservation.CommitmentUUID - projectID = res.Spec.CommittedResourceReservation.ProjectID - flavorName = res.Spec.CommittedResourceReservation.ResourceName - flavorGroup = res.Spec.CommittedResourceReservation.ResourceGroup - if res.Status.CommittedResourceReservation != nil { - vmCount = len(res.Status.CommittedResourceReservation.Allocations) - } - } - - host := res.Status.Host - if host == "" { - host = "" - } - - az := res.Spec.AvailabilityZone - if az == "" { - az = "" - } +// ============================================================================ +// Request / Response helpers +// ============================================================================ - vmInfo := "" - if vmCount > 0 { - vmInfo = fmt.Sprintf(" VMs=%d", vmCount) +func newAPIResponse(rejectSubstrings ...string) APIResponseExpectation { + return APIResponseExpectation{ + StatusCode: http.StatusOK, + RejectReasonSubstrings: rejectSubstrings, } - - return fmt.Sprintf("%s/%s/%s(%s)/%s/az=%s%s", commitmentID, projectID, flavorName, flavorGroup, host, az, vmInfo) } -// printExpectedReservations prints all expected reservations with markers for unmatched ones. -func (env *CommitmentTestEnv) printExpectedReservations(all, unmatched []*TestReservation) { - env.T.Helper() - - unmatchedMap := make(map[*TestReservation]bool) - for _, res := range unmatched { - unmatchedMap[res] = true - } - - if len(all) == 0 { - env.T.Log(" (none)") - return - } - - for i, res := range all { - marker := "✓" - if unmatchedMap[res] { - marker = "✗" - } - env.T.Logf(" %s [%d] %s", marker, i+1, res.String()) +func newCommitmentRequest(az string, dryRun bool, infoVersion int64, commitments ...TestCommitment) CommitmentChangeRequest { + return CommitmentChangeRequest{ + AZ: az, + DryRun: dryRun, + InfoVersion: infoVersion, + Commitments: commitments, } - - env.T.Logf(" Total: %d (%d matched, %d missing)", - len(all), len(all)-len(unmatched), len(unmatched)) } -// printActualReservations prints all actual reservations with markers for unmatched ones. -func (env *CommitmentTestEnv) printActualReservations(all, unmatched []v1alpha1.Reservation) { - env.T.Helper() - - unmatchedMap := make(map[string]bool) - for _, res := range unmatched { - unmatchedMap[res.Name] = true - } - - if len(all) == 0 { - env.T.Log(" (none)") - return +func createCommitment(resourceName, projectID, uuid, state string, amount uint64, _ ...string) TestCommitment { + return TestCommitment{ + ResourceName: liquid.ResourceName(resourceName), + ProjectID: projectID, + ConfirmationID: uuid, + State: state, + Amount: amount, } +} - for i, res := range all { - marker := "✓" - if unmatchedMap[res.Name] { - marker = "⊕" - } - env.T.Logf(" %s [%d] %s", marker, i+1, compactReservationString(&res)) +// deleteCommitment builds a TestCommitment representing a removal (OldStatus=oldState, NewStatus=None). 
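+// Illustrative usage (all values arbitrary):
+//
+//	deleteCommitment("memory", "project-a", "uuid-1", "confirmed", 8)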
+func deleteCommitment(resourceName, projectID, uuid, oldState string, amount uint64) TestCommitment { + return TestCommitment{ + ResourceName: liquid.ResourceName(resourceName), + ProjectID: projectID, + ConfirmationID: uuid, + OldState: oldState, + State: "", // NewStatus = None + Amount: amount, } - - env.T.Logf(" Total: %d (%d matched, %d unexpected)", - len(all), len(all)-len(unmatched), len(unmatched)) } -// printDiffSummary prints a summary of differences between expected and actual. -func (env *CommitmentTestEnv) printDiffSummary(unmatchedExpected []*TestReservation, unmatchedActual []v1alpha1.Reservation) { - env.T.Helper() - - if len(unmatchedExpected) > 0 { - env.T.Logf(" MISSING (%d expected, not found):", len(unmatchedExpected)) - for _, res := range unmatchedExpected { - env.T.Logf(" • %s", res.String()) +func buildRequestJSON(req CommitmentChangeRequest) string { + byProject := make(map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset) + for _, tc := range req.Commitments { + pid := liquid.ProjectUUID(tc.ProjectID) + if byProject[pid].ByResource == nil { + byProject[pid] = liquid.ProjectCommitmentChangeset{ + ByResource: make(map[liquid.ResourceName]liquid.ResourceCommitmentChangeset), + } } - } - - if len(unmatchedActual) > 0 { - env.T.Logf(" UNEXPECTED (%d found, not expected):", len(unmatchedActual)) - for _, res := range unmatchedActual { - env.T.Logf(" • %s", compactReservationString(&res)) + var oldStatus Option[liquid.CommitmentStatus] + if tc.OldState != "" { + oldStatus = Some(liquid.CommitmentStatus(tc.OldState)) + } else { + oldStatus = None[liquid.CommitmentStatus]() } - } - - if len(unmatchedExpected) == 0 && len(unmatchedActual) == 0 { - env.T.Log(" ✓ All match!") - } -} - -// reservationMatches checks if an actual reservation matches an expected one. -// All fields are checked comprehensively for complete validation. -func (env *CommitmentTestEnv) reservationMatches(expected *TestReservation, actual *v1alpha1.Reservation) bool { - // Check CommitmentID (from reservation name prefix) - if !strings.HasPrefix(actual.Name, "commitment-"+expected.CommitmentID+"-") { - return false - } - - // Check that CommittedResourceReservation spec exists - if actual.Spec.CommittedResourceReservation == nil { - return false - } - - // Check CommitmentUUID in spec matches - if actual.Spec.CommittedResourceReservation.CommitmentUUID != expected.CommitmentID { - return false - } - - // Check ProjectID - if actual.Spec.CommittedResourceReservation.ProjectID != expected.ProjectID { - return false - } - - // Check ResourceName (flavor name) - if expected.Flavor != nil { - if actual.Spec.CommittedResourceReservation.ResourceName != expected.Flavor.Name { - return false + var newStatus Option[liquid.CommitmentStatus] + if tc.State != "" { + newStatus = Some(liquid.CommitmentStatus(tc.State)) + } else { + newStatus = None[liquid.CommitmentStatus]() } - } - - // Check ResourceGroup (flavor group) - if expected.Flavor != nil { - if actual.Spec.CommittedResourceReservation.ResourceGroup != expected.Flavor.Group { - return false + commitment := liquid.Commitment{ + UUID: liquid.CommitmentUUID(tc.ConfirmationID), + Amount: tc.Amount, + OldStatus: oldStatus, + NewStatus: newStatus, + ExpiresAt: time.Now().Add(365 * 24 * time.Hour), + } + byResource := byProject[pid].ByResource[tc.ResourceName] + byResource.Commitments = append(byResource.Commitments, commitment) + + // Compute per-resource totals so RequiresConfirmation() behaves correctly. 
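+		// Both confirmed and guaranteed totals are tracked, before and after the change.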
+ // OldAmount overrides Amount for TotalBefore (resize-down: old amount != new amount). + oldAmt := tc.Amount + if tc.OldAmount != 0 { + oldAmt = tc.OldAmount + } + if oldStatus == Some(liquid.CommitmentStatusConfirmed) { + byResource.TotalConfirmedBefore += oldAmt + } + if newStatus == Some(liquid.CommitmentStatusConfirmed) { + byResource.TotalConfirmedAfter += tc.Amount + } + if oldStatus == Some(liquid.CommitmentStatusGuaranteed) { + byResource.TotalGuaranteedBefore += oldAmt + } + if newStatus == Some(liquid.CommitmentStatusGuaranteed) { + byResource.TotalGuaranteedAfter += tc.Amount } - } - - // Check Host (if specified in expected) - if expected.Host != "" && actual.Status.Host != expected.Host { - return false - } - // Check AZ (if specified in expected) - if expected.AZ != "" && actual.Spec.AvailabilityZone != expected.AZ { - return false + byProject[pid].ByResource[tc.ResourceName] = byResource } - // Check Memory (use custom MemoryMB if non-zero, otherwise use flavor size) - expectedMemoryMB := expected.MemoryMB - if expectedMemoryMB == 0 && expected.Flavor != nil { - expectedMemoryMB = expected.Flavor.MemoryMB + request := liquid.CommitmentChangeRequest{ + InfoVersion: req.InfoVersion, + AZ: liquid.AvailabilityZone(req.AZ), + DryRun: req.DryRun, + ByProject: byProject, } - memoryQuantity := actual.Spec.Resources["memory"] - actualMemoryBytes := memoryQuantity.Value() - actualMemoryMB := actualMemoryBytes / (1024 * 1024) - if actualMemoryMB != expectedMemoryMB { - return false + raw, err := json.Marshal(request) + if err != nil { + panic("failed to marshal request: " + err.Error()) } + return string(raw) +} - // Check CPU (from flavor if available) - if expected.Flavor != nil { - cpuQuantity := actual.Spec.Resources["cpu"] - actualCPU := cpuQuantity.Value() - if actualCPU != expected.Flavor.VCPUs { - return false - } - } +// ============================================================================ +// FlavorGroup Knowledge helpers +// ============================================================================ - // Check VM allocations (set comparison - order doesn't matter) - if !env.vmAllocationsMatch(expected.VMs, actual) { - return false +func buildFlavorGroupsKnowledge(flavors []*TestFlavor, infoVersion int64) FlavorGroupsKnowledge { + groupMap := make(map[string][]compute.FlavorInGroup) + for _, f := range flavors { + groupMap[f.Group] = append(groupMap[f.Group], f.ToFlavorInGroup()) } - // Check reservation type - if actual.Spec.Type != v1alpha1.ReservationTypeCommittedResource { - return false + sortedNames := make([]string, 0, len(groupMap)) + for n := range groupMap { + sortedNames = append(sortedNames, n) } + sort.Strings(sortedNames) - return true -} + var groups []compute.FlavorGroupFeature + for _, name := range sortedNames { + gFlavors := groupMap[name] + sort.Slice(gFlavors, func(i, j int) bool { return gFlavors[i].MemoryMB > gFlavors[j].MemoryMB }) -// vmAllocationsMatch checks if VM allocations match (set comparison). 
-func (env *CommitmentTestEnv) vmAllocationsMatch(expectedVMs []string, actual *v1alpha1.Reservation) bool { - if actual.Status.CommittedResourceReservation == nil { - return len(expectedVMs) == 0 - } + smallest := gFlavors[len(gFlavors)-1] + largest := gFlavors[0] - actualVMs := make(map[string]bool) - for vmUUID := range actual.Status.CommittedResourceReservation.Allocations { - actualVMs[vmUUID] = true + var minR, maxR uint64 = ^uint64(0), 0 + for _, f := range gFlavors { + if f.VCPUs == 0 { + continue + } + r := f.MemoryMB / f.VCPUs + if r < minR { + minR = r + } + if r > maxR { + maxR = r + } + } + var ratio, ratioMin, ratioMax *uint64 + if minR == maxR && maxR != 0 { + ratio = &minR + } else if maxR != 0 { + ratioMin = &minR + ratioMax = &maxR + } + groups = append(groups, compute.FlavorGroupFeature{ + Name: name, + Flavors: gFlavors, + SmallestFlavor: smallest, + LargestFlavor: largest, + RamCoreRatio: ratio, + RamCoreRatioMin: ratioMin, + RamCoreRatioMax: ratioMax, + }) } + return FlavorGroupsKnowledge{InfoVersion: infoVersion, Groups: groups} +} - // Check counts match - if len(expectedVMs) != len(actualVMs) { - return false +func createKnowledgeCRD(fgk FlavorGroupsKnowledge) *v1alpha1.Knowledge { + raw, err := v1alpha1.BoxFeatureList(fgk.Groups) + if err != nil { + panic("failed to box flavor group features: " + err.Error()) } - // Check all expected VMs are in actual - for _, vmUUID := range expectedVMs { - if !actualVMs[vmUUID] { - return false - } + lastChange := metav1.NewTime(time.Unix(fgk.InfoVersion, 0)) + return &v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: flavorGroupsKnowledgeName, + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []metav1.Condition{{Type: v1alpha1.KnowledgeConditionReady, Status: metav1.ConditionTrue, Reason: "Extracted"}}, + Raw: raw, + LastContentChange: lastChange, + }, } - - return true } // ============================================================================ -// Mock VM Source +// MockVMSource (kept for compatibility with handler.go / report_usage tests) // ============================================================================ -// MockVMSource implements VMSource for testing. type MockVMSource struct { - VMs []VM + vms []VM + mu sync.Mutex +} + +type VM struct { + UUID string + FlavorName string + ProjectID string + CurrentHypervisor string + AvailabilityZone string + Resources map[string]int64 + FlavorExtraSpecs map[string]string } -// NewMockVMSource creates a new MockVMSource with the given VMs. func NewMockVMSource(vms []VM) *MockVMSource { - return &MockVMSource{VMs: vms} + return &MockVMSource{vms: vms} } -// ListVMs returns the configured VMs. -func (s *MockVMSource) ListVMs(_ context.Context) ([]VM, error) { - return s.VMs, nil +func (m *MockVMSource) ListVMs(_ context.Context) ([]VM, error) { + m.mu.Lock() + defer m.mu.Unlock() + result := make([]VM, len(m.vms)) + copy(result, m.vms) + return result, nil } // ============================================================================ -// Helper Functions +// TestVM (kept for tests in other files that still use it) // ============================================================================ -// newHypervisorWithAZ creates a Hypervisor CRD with the given parameters including availability zone. 
-func newHypervisorWithAZ(name string, cpuCap, memoryGi, cpuAlloc, memoryGiAlloc int, instances []hv1.Instance, traits []string, az string) *hv1.Hypervisor { - labels := make(map[string]string) - if az != "" { - labels[corev1.LabelTopologyZone] = az - } - return &hv1.Hypervisor{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Labels: labels, - }, - Status: hv1.HypervisorStatus{ - Capacity: map[hv1.ResourceName]resource.Quantity{ - "cpu": resource.MustParse(strconv.Itoa(cpuCap)), - "memory": resource.MustParse(strconv.Itoa(memoryGi) + "Gi"), - }, - Allocation: map[hv1.ResourceName]resource.Quantity{ - "cpu": resource.MustParse(strconv.Itoa(cpuAlloc)), - "memory": resource.MustParse(strconv.Itoa(memoryGiAlloc) + "Gi"), - }, - NumInstances: len(instances), - Instances: instances, - Traits: traits, - }, - } -} - -// createCommitment creates a TestCommitment for use in test cases. -// The az parameter is optional - if empty string, no AZ constraint is set. -func createCommitment(resourceName, projectID, confirmationID, state string, amount uint64, az ...string) TestCommitment { - return TestCommitment{ - ResourceName: liquid.ResourceName(resourceName), - ProjectID: projectID, - ConfirmationID: confirmationID, - State: state, - Amount: amount, - } -} - -// newCommitmentRequest creates a CommitmentChangeRequest with the given commitments. -func newCommitmentRequest(az string, dryRun bool, infoVersion int64, commitments ...TestCommitment) CommitmentChangeRequest { - return CommitmentChangeRequest{ - AZ: az, - DryRun: dryRun, - InfoVersion: infoVersion, - Commitments: commitments, - } -} - -// newAPIResponse creates an APIResponseExpectation with 200 OK status. -func newAPIResponse(rejectReasonSubstrings ...string) APIResponseExpectation { - return APIResponseExpectation{ - StatusCode: 200, - RejectReasonSubstrings: rejectReasonSubstrings, - } -} - -// buildRequestJSON converts a test CommitmentChangeRequest to JSON string. -// Builds the nested JSON structure directly for simplicity. -// Uses sorted iteration to ensure deterministic JSON output. 
-func buildRequestJSON(req CommitmentChangeRequest) string { - // Group commitments by project and resource for nested structure - type projectResources map[liquid.ResourceName][]TestCommitment - byProject := make(map[string]projectResources) - - for _, commit := range req.Commitments { - if byProject[commit.ProjectID] == nil { - byProject[commit.ProjectID] = make(projectResources) - } - byProject[commit.ProjectID][commit.ResourceName] = append( - byProject[commit.ProjectID][commit.ResourceName], - commit, - ) - } - - // Sort projects for deterministic iteration - sortedProjects := make([]string, 0, len(byProject)) - for projectID := range byProject { - sortedProjects = append(sortedProjects, projectID) - } - sort.Strings(sortedProjects) - - // Build nested JSON structure with sorted iteration - var projectParts []string - for _, projectID := range sortedProjects { - resources := byProject[projectID] - - // Sort resource names for deterministic iteration - sortedResources := make([]liquid.ResourceName, 0, len(resources)) - for resourceName := range resources { - sortedResources = append(sortedResources, resourceName) - } - sort.Slice(sortedResources, func(i, j int) bool { - return string(sortedResources[i]) < string(sortedResources[j]) - }) - - var resourceParts []string - for _, resourceName := range sortedResources { - commits := resources[resourceName] - var commitParts []string - for _, c := range commits { - expiryTime := time.Now().Add(time.Duration(defaultCommitmentExpiryYears) * 365 * 24 * time.Hour) - commitParts = append(commitParts, fmt.Sprintf(`{"uuid":"%s","newStatus":"%s","amount":%d,"expiresAt":"%s"}`, - c.ConfirmationID, c.State, c.Amount, expiryTime.Format(time.RFC3339))) - } - resourceParts = append(resourceParts, fmt.Sprintf(`"%s":{"commitments":[%s]}`, - resourceName, strings.Join(commitParts, ","))) - } - projectParts = append(projectParts, fmt.Sprintf(`"%s":{"byResource":{%s}}`, - projectID, strings.Join(resourceParts, ","))) - } - - return fmt.Sprintf(`{"az":"%s","dryRun":%t,"infoVersion":%d,"byProject":{%s}}`, - req.AZ, req.DryRun, req.InfoVersion, strings.Join(projectParts, ",")) +type TestVM struct { + UUID string + Flavor *TestFlavor + ProjectID string + Host string + AZ string } -// createKnowledgeCRD creates a Knowledge CRD populated with flavor groups. 
-func createKnowledgeCRD(flavorGroups FlavorGroupsKnowledge) *v1alpha1.Knowledge { - rawExt, err := v1alpha1.BoxFeatureList(flavorGroups.Groups) - if err != nil { - panic("Failed to box flavor groups: " + err.Error()) - } - - lastContentChange := time.Unix(flavorGroups.InfoVersion, 0) - - return &v1alpha1.Knowledge{ - ObjectMeta: metav1.ObjectMeta{ - Name: flavorGroupsKnowledgeName, - }, - Spec: v1alpha1.KnowledgeSpec{ - SchedulingDomain: v1alpha1.SchedulingDomainNova, - Extractor: v1alpha1.KnowledgeExtractorSpec{ - Name: flavorGroupsKnowledgeName, - }, - Recency: metav1.Duration{Duration: knowledgeRecencyDuration}, +func (vm *TestVM) ToVM() VM { + return VM{ + UUID: vm.UUID, + FlavorName: vm.Flavor.Name, + ProjectID: vm.ProjectID, + CurrentHypervisor: vm.Host, + AvailabilityZone: vm.AZ, + Resources: map[string]int64{ + "memory": vm.Flavor.MemoryMB, + "vcpus": vm.Flavor.VCPUs, }, - Status: v1alpha1.KnowledgeStatus{ - LastExtracted: metav1.Time{Time: lastContentChange}, - LastContentChange: metav1.Time{Time: lastContentChange}, - Raw: rawExt, - RawLength: len(flavorGroups.Groups), - Conditions: []metav1.Condition{ - { - Type: v1alpha1.KnowledgeConditionReady, - Status: metav1.ConditionTrue, - Reason: "KnowledgeReady", - Message: "Flavor groups knowledge is ready", - LastTransitionTime: metav1.Time{Time: lastContentChange}, - }, - }, + FlavorExtraSpecs: map[string]string{ + "quota:hw_version": vm.Flavor.Group, }, } } diff --git a/internal/scheduling/reservations/commitments/api/handler.go b/internal/scheduling/reservations/commitments/api/handler.go index f0eb24110..051a82fa2 100644 --- a/internal/scheduling/reservations/commitments/api/handler.go +++ b/internal/scheduling/reservations/commitments/api/handler.go @@ -20,7 +20,7 @@ var apiLog = ctrl.Log.WithName("committed-resource") // HTTPAPI implements Limes LIQUID commitment validation endpoints. type HTTPAPI struct { client client.Client - config commitments.Config + config commitments.APIConfig usageDB commitments.UsageDBClient monitor ChangeCommitmentsAPIMonitor usageMonitor ReportUsageAPIMonitor @@ -31,11 +31,11 @@ type HTTPAPI struct { } func NewAPI(client client.Client) *HTTPAPI { - return NewAPIWithConfig(client, commitments.DefaultConfig(), nil) + return NewAPIWithConfig(client, commitments.DefaultAPIConfig(), nil) } // NewAPIWithConfig creates an HTTPAPI with the given config and optional usageDB client. -func NewAPIWithConfig(k8sClient client.Client, config commitments.Config, usageDB commitments.UsageDBClient) *HTTPAPI { +func NewAPIWithConfig(k8sClient client.Client, config commitments.APIConfig, usageDB commitments.UsageDBClient) *HTTPAPI { return &HTTPAPI{ client: k8sClient, config: config, @@ -58,9 +58,9 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log mux.HandleFunc("/commitments/v1/projects/", api.handleProjectEndpoint) // routes to report-usage or quota log.Info("commitments API initialized", - "changeCommitmentsEnabled", api.config.EnableChangeCommitmentsAPI, - "reportUsageEnabled", api.config.EnableReportUsageAPI, - "reportCapacityEnabled", api.config.EnableReportCapacityAPI) + "changeCommitmentsEnabled", api.config.EnableChangeCommitments, + "reportUsageEnabled", api.config.EnableReportUsage, + "reportCapacityEnabled", api.config.EnableReportCapacity) } // handleProjectEndpoint routes /commitments/v1/projects/:project_id/... requests to the appropriate handler. 
diff --git a/internal/scheduling/reservations/commitments/api/info.go b/internal/scheduling/reservations/commitments/api/info.go index 6999b38d6..2e8ddc8a8 100644 --- a/internal/scheduling/reservations/commitments/api/info.go +++ b/internal/scheduling/reservations/commitments/api/info.go @@ -219,7 +219,8 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l "version", version) return liquid.ServiceInfo{ - Version: version, - Resources: resources, + Version: version, + Resources: resources, + CommitmentHandlingNeedsProjectMetadata: true, }, nil } diff --git a/internal/scheduling/reservations/commitments/api/report_capacity.go b/internal/scheduling/reservations/commitments/api/report_capacity.go index f846fea8e..9f0966cce 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity.go @@ -31,7 +31,7 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) w.Header().Set("X-Request-ID", requestID) // Check if API is enabled - if !api.config.EnableReportCapacityAPI { + if !api.config.EnableReportCapacity { statusCode = http.StatusServiceUnavailable http.Error(w, "report-capacity API is disabled", statusCode) api.recordCapacityMetrics(statusCode, startTime) diff --git a/internal/scheduling/reservations/commitments/api/report_usage.go b/internal/scheduling/reservations/commitments/api/report_usage.go index d87f7c24a..bf48dfe00 100644 --- a/internal/scheduling/reservations/commitments/api/report_usage.go +++ b/internal/scheduling/reservations/commitments/api/report_usage.go @@ -36,7 +36,7 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) { log := apiLog.WithValues("requestID", requestID, "endpoint", "report-usage") // Check if API is enabled - if !api.config.EnableReportUsageAPI { + if !api.config.EnableReportUsage { statusCode = http.StatusServiceUnavailable log.Info("report-usage API is disabled, rejecting request") http.Error(w, "report-usage API is disabled", statusCode) diff --git a/internal/scheduling/reservations/commitments/api/report_usage_test.go b/internal/scheduling/reservations/commitments/api/report_usage_test.go index 4cafdc213..719a7bbb1 100644 --- a/internal/scheduling/reservations/commitments/api/report_usage_test.go +++ b/internal/scheduling/reservations/commitments/api/report_usage_test.go @@ -580,7 +580,7 @@ func newUsageTestEnv( } // Create API with mock DB client - api := NewAPIWithConfig(k8sClient, commitments.DefaultConfig(), dbClient) + api := NewAPIWithConfig(k8sClient, commitments.DefaultAPIConfig(), dbClient) mux := http.NewServeMux() registry := prometheus.NewRegistry() api.Init(mux, registry, log.Log) diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller.go b/internal/scheduling/reservations/commitments/committed_resource_controller.go index a25d63e3a..0481395fc 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller.go @@ -29,7 +29,7 @@ const crFinalizer = "committed-resource.reservations.cortex.cloud/cleanup" type CommittedResourceController struct { client.Client Scheme *runtime.Scheme - Conf Config + Conf CommittedResourceControllerConfig } func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -58,7 +58,7 @@ func (r *CommittedResourceController) Reconcile(ctx context.Context, req 
ctrl.Re switch cr.Spec.State { case v1alpha1.CommitmentStatusPlanned: - return ctrl.Result{}, r.setNotReady(ctx, &cr, "Planned", "commitment is not yet active") + return ctrl.Result{}, r.setNotReady(ctx, &cr, v1alpha1.CommittedResourceReasonPlanned, "commitment is not yet active") case v1alpha1.CommitmentStatusPending: return r.reconcilePending(ctx, logger, &cr) case v1alpha1.CommitmentStatusGuaranteed, v1alpha1.CommitmentStatusConfirmed: @@ -71,16 +71,41 @@ func (r *CommittedResourceController) Reconcile(ctx context.Context, req ctrl.Re } } -// reconcilePending handles a one-shot confirmation attempt (Limes state: pending). -// If placement fails for any reason, all partial reservations are removed and the -// CR is marked Rejected so the HTTP API can report the outcome back to Limes. +// reconcilePending handles a confirmation attempt (Limes state: pending). +// If AllowRejection=true (API path), placement failure marks the CR Rejected so the HTTP API +// can report the outcome back to Limes. If AllowRejection=false (syncer path), the controller +// retries indefinitely — Limes does not require confirmation for these transitions. func (r *CommittedResourceController) reconcilePending(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (ctrl.Result, error) { - if applyErr := r.applyReservationState(ctx, logger, cr); applyErr != nil { - logger.Error(applyErr, "pending commitment placement failed, rejecting") - if rollbackErr := r.deleteChildReservations(ctx, cr); rollbackErr != nil { - return ctrl.Result{}, rollbackErr + result, applyErr := r.applyReservationState(ctx, logger, cr) + if applyErr != nil { + if cr.Spec.AllowRejection { + logger.Error(applyErr, "pending commitment placement failed, rejecting") + if rollbackErr := r.deleteChildReservations(ctx, cr); rollbackErr != nil { + return ctrl.Result{}, rollbackErr + } + return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, applyErr.Error()) + } + logger.Error(applyErr, "pending commitment placement failed, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry.Duration) + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, applyErr.Error()) + } + allReady, anyFailed, failReason, err := r.checkChildReservationStatus(ctx, cr, result.TotalSlots) + if err != nil { + return ctrl.Result{}, err + } + if anyFailed { + if cr.Spec.AllowRejection { + logger.Info("pending commitment rejected: reservation placement failed", "reason", failReason) + if rollbackErr := r.deleteChildReservations(ctx, cr); rollbackErr != nil { + return ctrl.Result{}, rollbackErr + } + return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, failReason) } - return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", applyErr.Error()) + logger.Info("pending commitment placement failed, will retry", "reason", failReason, "requeueAfter", r.Conf.RequeueIntervalRetry.Duration) + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, failReason) + } + if !allReady { + // Reservation controller hasn't processed all slots yet; Reservation watch will re-enqueue. 
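+		// Unlike the retry paths above, no RequeueAfter is needed here.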
+ return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, "waiting for reservation placement") } return ctrl.Result{}, r.setAccepted(ctx, cr) } @@ -89,42 +114,113 @@ func (r *CommittedResourceController) reconcileCommitted(ctx context.Context, lo // Spec errors are permanent regardless of AllowRejection — a bad spec won't fix itself. if _, err := FromCommittedResource(*cr); err != nil { logger.Error(err, "invalid commitment spec, rejecting") - return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", err.Error()) + return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, err.Error()) } - if applyErr := r.applyReservationState(ctx, logger, cr); applyErr != nil { + result, applyErr := r.applyReservationState(ctx, logger, cr) + if applyErr != nil { if cr.Spec.AllowRejection { logger.Error(applyErr, "committed placement failed, rolling back to accepted amount") if rollbackErr := r.rollbackToAccepted(ctx, logger, cr); rollbackErr != nil { return ctrl.Result{}, rollbackErr } - return ctrl.Result{}, r.setNotReady(ctx, cr, "Rejected", applyErr.Error()) + return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, applyErr.Error()) + } + logger.Error(applyErr, "committed placement incomplete, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry.Duration) + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, applyErr.Error()) + } + allReady, anyFailed, failReason, err := r.checkChildReservationStatus(ctx, cr, result.TotalSlots) + if err != nil { + return ctrl.Result{}, err + } + if anyFailed { + if cr.Spec.AllowRejection { + logger.Info("committed placement failed, rolling back to accepted amount", "reason", failReason) + if rollbackErr := r.rollbackToAccepted(ctx, logger, cr); rollbackErr != nil { + return ctrl.Result{}, rollbackErr + } + return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonRejected, failReason) } - logger.Error(applyErr, "committed placement incomplete, will retry", "requeueAfter", r.Conf.RequeueIntervalRetry) - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, r.setNotReady(ctx, cr, "Reserving", applyErr.Error()) + logger.Info("committed placement failed, will retry", "reason", failReason, "requeueAfter", r.Conf.RequeueIntervalRetry.Duration) + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, failReason) + } + if !allReady { + // Reservation controller hasn't processed all slots yet; Reservation watch will re-enqueue. 
+		return ctrl.Result{}, r.setNotReady(ctx, cr, v1alpha1.CommittedResourceReasonReserving, "waiting for reservation placement")
 	}
 	return ctrl.Result{}, r.setAccepted(ctx, cr)
 }
 
-func (r *CommittedResourceController) applyReservationState(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) error {
+func (r *CommittedResourceController) applyReservationState(ctx context.Context, logger logr.Logger, cr *v1alpha1.CommittedResource) (*ApplyResult, error) {
 	knowledge := &reservations.FlavorGroupKnowledgeClient{Client: r.Client}
 	flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil)
 	if err != nil {
-		return fmt.Errorf("flavor knowledge not ready: %w", err)
+		return nil, fmt.Errorf("flavor knowledge not ready: %w", err)
 	}
 	state, err := FromCommittedResource(*cr)
 	if err != nil {
-		return fmt.Errorf("invalid commitment spec: %w", err)
+		return nil, fmt.Errorf("invalid commitment spec: %w", err)
 	}
 	state.NamePrefix = cr.Name + "-"
 	state.CreatorRequestID = reservations.GlobalRequestIDFromContext(ctx)
+	state.ParentGeneration = cr.Generation
 	result, err := NewReservationManager(r.Client).ApplyCommitmentState(ctx, logger, state, flavorGroups, "committed-resource-controller")
 	if err != nil {
-		return err
+		return nil, err
 	}
 	logger.Info("commitment state applied", "created", result.Created, "deleted", result.Deleted, "repaired", result.Repaired)
-	return nil
+	return result, nil
+}
+
+// checkChildReservationStatus inspects the Ready conditions of all child Reservations for cr.
+// Returns allReady=true when every child has Ready=True.
+// Returns anyFailed=true (and the first failure message) when any child has Ready=False.
+// Returns allReady=false, anyFailed=false when some children have not yet been processed
+// for the current CR generation (placement pending).
+func (r *CommittedResourceController) checkChildReservationStatus(ctx context.Context, cr *v1alpha1.CommittedResource, expectedSlots int) (allReady, anyFailed bool, failReason string, err error) {
+	var list v1alpha1.ReservationList
+	if err := r.List(ctx, &list,
+		client.MatchingLabels{v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource},
+		client.MatchingFields{idxReservationByCommitmentUUID: cr.Spec.CommitmentUUID},
+	); err != nil {
+		return false, false, "", fmt.Errorf("failed to list reservations: %w", err)
+	}
+
+	// Cache hasn't caught up yet; Reservation watch will re-enqueue.
+	if len(list.Items) < expectedSlots {
+		return false, false, "", nil
+	}
+
+	// No child reservations exist and none are expected; trivially ready.
+	if len(list.Items) == 0 {
+		return true, false, "", nil
+	}
+
+	// First pass: failures take priority over pending — but only for the current generation.
+	// A Ready=False condition from a previous generation means the reservation controller
+	// hasn't reprocessed this slot yet; treat it as still-pending, not as a current failure.
+	for _, res := range list.Items {
+		if res.Status.CommittedResourceReservation == nil ||
+			res.Status.CommittedResourceReservation.ObservedParentGeneration != cr.Generation {
+			continue
+		}
+		cond := meta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady)
+		if cond != nil && cond.Status == metav1.ConditionFalse {
+			return false, true, cond.Message, nil
+		}
+	}
+	// Second pass: check generation and readiness for all slots.
+	for _, res := range list.Items {
+		// ObservedParentGeneration must match cr.Generation before we trust the Ready condition.
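+		// A nil status block means the reservation controller has never processed this slot; treat it as pending as well.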
+ if res.Status.CommittedResourceReservation == nil || + res.Status.CommittedResourceReservation.ObservedParentGeneration != cr.Generation { + return false, false, "", nil + } + cond := meta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady) + if cond == nil || cond.Status != metav1.ConditionTrue { + return false, false, "", nil + } + } + return true, false, "", nil } func (r *CommittedResourceController) setAccepted(ctx context.Context, cr *v1alpha1.CommittedResource) error { @@ -136,7 +232,7 @@ func (r *CommittedResourceController) setAccepted(ctx context.Context, cr *v1alp meta.SetStatusCondition(&cr.Status.Conditions, metav1.Condition{ Type: v1alpha1.CommittedResourceConditionReady, Status: metav1.ConditionTrue, - Reason: "Accepted", + Reason: v1alpha1.CommittedResourceReasonAccepted, Message: "commitment successfully reserved", LastTransitionTime: now, }) @@ -170,17 +266,14 @@ func (r *CommittedResourceController) reconcileDeletion(ctx context.Context, log // identified by matching CommitmentUUID in the reservation spec. func (r *CommittedResourceController) deleteChildReservations(ctx context.Context, cr *v1alpha1.CommittedResource) error { var list v1alpha1.ReservationList - if err := r.List(ctx, &list, client.MatchingLabels{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }); err != nil { + if err := r.List(ctx, &list, + client.MatchingLabels{v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource}, + client.MatchingFields{idxReservationByCommitmentUUID: cr.Spec.CommitmentUUID}, + ); err != nil { return fmt.Errorf("failed to list reservations: %w", err) } for i := range list.Items { res := &list.Items[i] - if res.Spec.CommittedResourceReservation == nil || - res.Spec.CommittedResourceReservation.CommitmentUUID != cr.Spec.CommitmentUUID { - continue - } if err := r.Delete(ctx, res); client.IgnoreNotFound(err) != nil { return fmt.Errorf("failed to delete reservation %s: %w", res.Name, err) } @@ -210,6 +303,7 @@ func (r *CommittedResourceController) rollbackToAccepted(ctx context.Context, lo state.TotalMemoryBytes = cr.Status.AcceptedAmount.Value() state.NamePrefix = cr.Name + "-" state.CreatorRequestID = reservations.GlobalRequestIDFromContext(ctx) + state.ParentGeneration = cr.Generation if _, err := NewReservationManager(r.Client).ApplyCommitmentState(ctx, logger, state, flavorGroups, "committed-resource-controller-rollback"); err != nil { return fmt.Errorf("rollback apply failed: %w", err) } @@ -271,6 +365,9 @@ func (r *CommittedResourceController) SetupWithManager(mgr ctrl.Manager, mcl *mu if err != nil { return err } + // MaxConcurrentReconciles=1: the change-commitments API handler snapshots each CR's spec + // before writing and restores it on rollback. Concurrent reconciles across overlapping + // batch requests could interleave those snapshots and produce incorrect rollback state. return bldr.Named("committed-resource"). 
WithOptions(controller.Options{ MaxConcurrentReconciles: 1, diff --git a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go index 6e6103972..471c013e3 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_controller_test.go +++ b/internal/scheduling/reservations/commitments/committed_resource_controller_test.go @@ -10,6 +10,7 @@ import ( "time" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -106,6 +107,20 @@ func newCRTestClient(scheme *runtime.Scheme, objects ...client.Object) client.Cl WithScheme(scheme). WithObjects(objects...). WithStatusSubresource(&v1alpha1.CommittedResource{}, &v1alpha1.Reservation{}). + WithIndex(&v1alpha1.Reservation{}, idxReservationByCommitmentUUID, func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" { + return nil + } + return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} + }). + WithIndex(&v1alpha1.CommittedResource{}, idxCommittedResourceByUUID, func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.CommitmentUUID == "" { + return nil + } + return []string{cr.Spec.CommitmentUUID} + }). Build() } @@ -153,6 +168,39 @@ func countChildReservations(t *testing.T, k8sClient client.Client, commitmentUUI return count } +// setChildReservationsReady simulates the reservation controller by marking all child +// Reservations for the given commitmentUUID as Ready=True and echoing ParentGeneration +// into ObservedParentGeneration (matching what echoParentGeneration does in production). +func setChildReservationsReady(t *testing.T, k8sClient client.Client, commitmentUUID string) { + t.Helper() + var list v1alpha1.ReservationList + if err := k8sClient.List(context.Background(), &list, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + for i := range list.Items { + res := &list.Items[i] + if res.Spec.CommittedResourceReservation == nil || + res.Spec.CommittedResourceReservation.CommitmentUUID != commitmentUUID { + continue + } + res.Status.Conditions = []metav1.Condition{{ + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "ReservationActive", + LastTransitionTime: metav1.Now(), + }} + if res.Status.CommittedResourceReservation == nil { + res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{} + } + res.Status.CommittedResourceReservation.ObservedParentGeneration = res.Spec.CommittedResourceReservation.ParentGeneration + if err := k8sClient.Status().Update(context.Background(), res); err != nil { + t.Fatalf("set reservation Ready=True: %v", err) + } + } +} + // ============================================================================ // Tests: per-state reconcile paths // ============================================================================ @@ -208,10 +256,21 @@ func TestCommittedResourceController_Reconcile(t *testing.T) { objects = append(objects, newTestFlavorKnowledge()) } k8sClient := newCRTestClient(scheme, objects...) 
- controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}} + controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}} + // First reconcile: creates Reservation CRDs; if slots are expected, controller + // waits for the reservation controller to set Ready=True before accepting. if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { - t.Fatalf("reconcile: %v", err) + t.Fatalf("reconcile 1: %v", err) + } + + if tt.expectedSlots > 0 { + // Simulate reservation controller: mark all child reservations as Ready=True. + setChildReservationsReady(t, k8sClient, cr.Spec.CommitmentUUID) + // Second reconcile: sees all Ready=True and accepts. + if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { + t.Fatalf("reconcile 2: %v", err) + } } assertCondition(t, k8sClient, cr.Name, tt.expectedStatus, tt.expectedReason) @@ -260,7 +319,7 @@ func TestCommittedResourceController_InactiveStates(t *testing.T) { }, } k8sClient := newCRTestClient(scheme, cr, existing) - controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}} + controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}} if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { t.Fatalf("reconcile: %v", err) @@ -288,11 +347,19 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) { expectRequeue bool }{ { - name: "pending: always rejects on failure, no retry", + name: "pending AllowRejection=true: rejects on failure, no retry", state: v1alpha1.CommitmentStatusPending, + allowRejection: true, expectedReason: "Rejected", expectRequeue: false, }, + { + name: "pending AllowRejection=false: retries on failure", + state: v1alpha1.CommitmentStatusPending, + allowRejection: false, + expectedReason: "Reserving", + expectRequeue: true, + }, { name: "guaranteed AllowRejection=true: rejects on failure, no retry", state: v1alpha1.CommitmentStatusGuaranteed, @@ -332,7 +399,7 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) { controller := &CommittedResourceController{ Client: k8sClient, Scheme: scheme, - Conf: Config{RequeueIntervalRetry: 1 * time.Minute}, + Conf: CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 1 * time.Minute}}, } result, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)) @@ -354,8 +421,58 @@ func TestCommittedResourceController_PlacementFailure(t *testing.T) { } } +func TestCommittedResourceController_Rollback(t *testing.T) { + scheme := newCRTestScheme(t) + + // CR at generation 2; AcceptedAmount reflects what was accepted at generation 1. + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Generation = 2 + accepted := resource.MustParse("4Gi") + cr.Status.AcceptedAmount = &accepted + + // Existing reservation with stale ParentGeneration from the previous generation. 
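+	// rollbackToAccepted must rewrite this slot with ParentGeneration bumped to the current generation (asserted below).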
+ existing := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cr-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + AvailabilityZone: "test-az", + Resources: map[hv1.ResourceName]resource.Quantity{ + hv1.ResourceMemory: resource.MustParse("4Gi"), + hv1.ResourceCPU: resource.MustParse("2"), + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + CommitmentUUID: "test-uuid-1234", + ProjectID: "test-project", + DomainID: "test-domain", + ResourceGroup: "test-group", + ParentGeneration: 1, // stale + }, + }, + } + + k8sClient := newCRTestClient(scheme, cr, existing, newTestFlavorKnowledge()) + controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}} + + if err := controller.rollbackToAccepted(context.Background(), logr.Discard(), cr); err != nil { + t.Fatalf("rollbackToAccepted: %v", err) + } + + var res v1alpha1.Reservation + if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: "test-cr-0"}, &res); err != nil { + t.Fatalf("get reservation: %v", err) + } + if got := res.Spec.CommittedResourceReservation.ParentGeneration; got != cr.Generation { + t.Errorf("ParentGeneration: want %d, got %d", cr.Generation, got) + } +} + func TestCommittedResourceController_BadSpec(t *testing.T) { - // Invalid UUID fails commitmentUUIDPattern — permanently broken regardless of AllowRejection. scheme := newCRTestScheme(t) cr := &v1alpha1.CommittedResource{ ObjectMeta: metav1.ObjectMeta{ @@ -374,7 +491,7 @@ func TestCommittedResourceController_BadSpec(t *testing.T) { }, } k8sClient := newCRTestClient(scheme, cr, newTestFlavorKnowledge()) - controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}} + controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}} if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { t.Fatalf("reconcile: %v", err) @@ -390,11 +507,18 @@ func TestCommittedResourceController_Idempotent(t *testing.T) { scheme := newCRTestScheme(t) cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) k8sClient := newCRTestClient(scheme, cr, newTestFlavorKnowledge()) - controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}} + controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}} - for i := range 3 { + // Round 1: creates reservation, waits for placement. + if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { + t.Fatalf("reconcile 1: %v", err) + } + // Simulate reservation controller setting Ready=True. + setChildReservationsReady(t, k8sClient, cr.Spec.CommitmentUUID) + // Rounds 2 and 3: accepts, then stays accepted. 
+ for i := 2; i <= 3; i++ { if _, err := controller.Reconcile(context.Background(), reconcileReq(cr.Name)); err != nil { - t.Fatalf("reconcile %d: %v", i+1, err) + t.Fatalf("reconcile %d: %v", i, err) } } @@ -404,6 +528,111 @@ func TestCommittedResourceController_Idempotent(t *testing.T) { assertCondition(t, k8sClient, cr.Name, metav1.ConditionTrue, "Accepted") } +// ============================================================================ +// Tests: checkChildReservationStatus generation guard +// ============================================================================ + +// TestCheckChildReservationStatus_GenerationGuard verifies the two-pass logic that +// distinguishes a stale Ready=False (previous generation) from a current failure. +func TestCheckChildReservationStatus_GenerationGuard(t *testing.T) { + tests := []struct { + name string + obsGen int64 + condStatus metav1.ConditionStatus // "" = no condition set + condMessage string + wantAllReady bool + wantAnyFailed bool + wantReason string + }{ + { + name: "Ready=False at stale generation: treated as pending", + obsGen: 1, + condStatus: metav1.ConditionFalse, + condMessage: "no hosts available", + wantAllReady: false, + wantAnyFailed: false, + }, + { + name: "Ready=False at current generation: is a current failure", + obsGen: 2, + condStatus: metav1.ConditionFalse, + condMessage: "no hosts available", + wantAllReady: false, + wantAnyFailed: true, + wantReason: "no hosts available", + }, + { + name: "Ready=True at current generation: allReady", + obsGen: 2, + condStatus: metav1.ConditionTrue, + wantAllReady: true, + }, + { + name: "no condition yet at current generation: still pending", + obsGen: 2, + condStatus: "", // no condition + wantAllReady: false, + wantAnyFailed: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := newCRTestScheme(t) + cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) + cr.Generation = 2 + + child := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cr-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + CommitmentUUID: cr.Spec.CommitmentUUID, + ParentGeneration: cr.Generation, + }, + }, + } + k8sClient := newCRTestClient(scheme, child) + + child.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{ + ObservedParentGeneration: tt.obsGen, + } + if tt.condStatus != "" { + child.Status.Conditions = []metav1.Condition{{ + Type: v1alpha1.ReservationConditionReady, + Status: tt.condStatus, + Reason: "Test", + Message: tt.condMessage, + LastTransitionTime: metav1.Now(), + }} + } + if err := k8sClient.Status().Update(context.Background(), child); err != nil { + t.Fatalf("set reservation status: %v", err) + } + + controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme} + allReady, anyFailed, reason, err := controller.checkChildReservationStatus(context.Background(), cr, 1) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if allReady != tt.wantAllReady { + t.Errorf("allReady: want %v, got %v", tt.wantAllReady, allReady) + } + if anyFailed != tt.wantAnyFailed { + t.Errorf("anyFailed: want %v, got %v", tt.wantAnyFailed, anyFailed) + } + if reason != tt.wantReason { + t.Errorf("reason: want %q, got %q", tt.wantReason, reason) + } 
+ }) + } +} + func TestCommittedResourceController_Deletion(t *testing.T) { scheme := newCRTestScheme(t) cr := newTestCommittedResource("test-cr", v1alpha1.CommitmentStatusConfirmed) @@ -422,7 +651,7 @@ func TestCommittedResourceController_Deletion(t *testing.T) { }, } k8sClient := newCRTestClient(scheme, cr, child) - controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: Config{}} + controller := &CommittedResourceController{Client: k8sClient, Scheme: scheme, Conf: CommittedResourceControllerConfig{}} if err := k8sClient.Delete(context.Background(), cr); err != nil { t.Fatalf("delete CR: %v", err) diff --git a/internal/scheduling/reservations/commitments/committed_resource_integration_test.go b/internal/scheduling/reservations/commitments/committed_resource_integration_test.go index 01a0b4199..0090e45f5 100644 --- a/internal/scheduling/reservations/commitments/committed_resource_integration_test.go +++ b/internal/scheduling/reservations/commitments/committed_resource_integration_test.go @@ -24,6 +24,7 @@ import ( hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" @@ -55,6 +56,20 @@ func newCRIntegrationEnv(t *testing.T) *crIntegrationEnv { &v1alpha1.Reservation{}, &v1alpha1.Knowledge{}, ). + WithIndex(&v1alpha1.Reservation{}, idxReservationByCommitmentUUID, func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" { + return nil + } + return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} + }). + WithIndex(&v1alpha1.CommittedResource{}, idxCommittedResourceByUUID, func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.CommitmentUUID == "" { + return nil + } + return []string{cr.Spec.CommitmentUUID} + }). Build() schedulerServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -67,19 +82,19 @@ func newCRIntegrationEnv(t *testing.T) *crIntegrationEnv { crCtrl := &CommittedResourceController{ Client: k8sClient, Scheme: scheme, - Conf: Config{RequeueIntervalRetry: 5 * time.Minute}, + Conf: CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 5 * time.Minute}}, } resCtrl := &CommitmentReservationController{ Client: k8sClient, Scheme: scheme, - Conf: Config{ + Conf: ReservationControllerConfig{ SchedulerURL: schedulerServer.URL, - AllocationGracePeriod: 15 * time.Minute, - RequeueIntervalActive: 5 * time.Minute, + AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute}, + RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute}, }, } - if err := resCtrl.Init(context.Background(), k8sClient, resCtrl.Conf); err != nil { + if err := resCtrl.Init(context.Background(), resCtrl.Conf); err != nil { t.Fatalf("resCtrl.Init: %v", err) } @@ -136,196 +151,483 @@ func (e *crIntegrationEnv) getCR(t *testing.T, name string) v1alpha1.CommittedRe return cr } +// reconcileChildReservations runs the reservation controller twice on every child Reservation +// for crName (first reconcile sets TargetHost, second sets Ready=True), then re-reconciles +// the CR so it can observe the placement outcomes. 
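+// In production the Reservation watch re-enqueues the owning CR automatically; these tests drive that step explicitly.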
+func (e *crIntegrationEnv) reconcileChildReservations(t *testing.T, crName string) { + t.Helper() + for _, res := range e.listChildReservations(t, crName) { + e.reconcileReservation(t, res.Name) // calls scheduler → sets TargetHost + e.reconcileReservation(t, res.Name) // syncs TargetHost to Status → Ready=True + } + e.reconcileCR(t, crName) +} + // ============================================================================ // Integration tests // ============================================================================ -// TestCRLifecycle_PlannedToConfirmed verifies that transitioning a CR from planned -// to confirmed causes the CR controller to create child Reservation CRDs. -func TestCRLifecycle_PlannedToConfirmed(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() +// TestCRLifecycle covers the multi-step state transitions that require imperative +// mid-test patches and cannot be expressed as a purely declarative table. +func TestCRLifecycle(t *testing.T) { + t.Run("planned→confirmed: child Reservations created and placed", func(t *testing.T) { + env := newCRIntegrationEnv(t) + defer env.close() - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusPlanned) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusPlanned) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } - // Reconcile as planned: finalizer added, no Reservations. - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) - if got := env.listChildReservations(t, cr.Name); len(got) != 0 { - t.Fatalf("planned: expected 0 reservations, got %d", len(got)) - } - crState := env.getCR(t, cr.Name) - cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) - if cond == nil || cond.Reason != "Planned" { - t.Errorf("planned: expected Reason=Planned, got %v", cond) - } + // Reconcile as planned: finalizer added, no Reservations. + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + if got := env.listChildReservations(t, cr.Name); len(got) != 0 { + t.Fatalf("planned: expected 0 reservations, got %d", len(got)) + } + crState := env.getCR(t, cr.Name) + cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil || cond.Reason != "Planned" { + t.Errorf("planned: expected Reason=Planned, got %v", cond) + } - // Transition to confirmed. - patch := client.MergeFrom(crState.DeepCopy()) - crState.Spec.State = v1alpha1.CommitmentStatusConfirmed - if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { - t.Fatalf("patch state to confirmed: %v", err) - } + // Transition to confirmed. 
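+		// client.MergeFrom diffs against the DeepCopy snapshot, so the resulting patch carries only the Spec.State change.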
+ patch := client.MergeFrom(crState.DeepCopy()) + crState.Spec.State = v1alpha1.CommitmentStatusConfirmed + if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { + t.Fatalf("patch state to confirmed: %v", err) + } + env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) + children := env.listChildReservations(t, cr.Name) + if len(children) != 1 { + t.Fatalf("confirmed: expected 1 reservation, got %d", len(children)) + } + env.reconcileChildReservations(t, cr.Name) - children := env.listChildReservations(t, cr.Name) - if len(children) != 1 { - t.Fatalf("confirmed: expected 1 reservation, got %d", len(children)) - } - crState = env.getCR(t, cr.Name) - if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { - t.Errorf("confirmed: expected Ready=True") - } -} + crState = env.getCR(t, cr.Name) + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Errorf("confirmed: expected Ready=True") + } + }) -// TestCRLifecycle_ConfirmedToExpired verifies that transitioning a CR to expired -// deletes all child Reservation CRDs and marks Ready=False. -func TestCRLifecycle_ConfirmedToExpired(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() + t.Run("confirmed→expired: child Reservations deleted, CR marked inactive", func(t *testing.T) { + env := newCRIntegrationEnv(t) + defer env.close() - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } - // Bring to confirmed+Ready=True. - env.reconcileCR(t, cr.Name) // adds finalizer - env.reconcileCR(t, cr.Name) // creates Reservations + // Bring to confirmed+Ready=True. + env.reconcileCR(t, cr.Name) // adds finalizer + env.reconcileCR(t, cr.Name) // creates Reservations + env.reconcileChildReservations(t, cr.Name) // places slots → Ready=True - if got := env.listChildReservations(t, cr.Name); len(got) != 1 { - t.Fatalf("pre-expire: expected 1 reservation, got %d", len(got)) - } + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Fatalf("pre-expire: expected 1 reservation, got %d", len(got)) + } - // Transition to expired. - crState := env.getCR(t, cr.Name) - patch := client.MergeFrom(crState.DeepCopy()) - crState.Spec.State = v1alpha1.CommitmentStatusExpired - if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { - t.Fatalf("patch state to expired: %v", err) - } + // Transition to expired. 
+ crState := env.getCR(t, cr.Name) + patch := client.MergeFrom(crState.DeepCopy()) + crState.Spec.State = v1alpha1.CommitmentStatusExpired + if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { + t.Fatalf("patch state to expired: %v", err) + } + env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) + if got := env.listChildReservations(t, cr.Name); len(got) != 0 { + t.Errorf("expired: expected 0 reservations, got %d", len(got)) + } + crState = env.getCR(t, cr.Name) + cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil || cond.Status != metav1.ConditionFalse { + t.Errorf("expired: expected Ready=False, got %v", cond) + } + if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusExpired) { + t.Errorf("expired: expected Reason=%s, got %s", v1alpha1.CommitmentStatusExpired, cond.Reason) + } + }) - if got := env.listChildReservations(t, cr.Name); len(got) != 0 { - t.Errorf("expired: expected 0 reservations, got %d", len(got)) - } - crState = env.getCR(t, cr.Name) - cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) - if cond == nil || cond.Status != metav1.ConditionFalse { - t.Errorf("expired: expected Ready=False, got %v", cond) - } - if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusExpired) { - t.Errorf("expired: expected Reason=%s, got %s", v1alpha1.CommitmentStatusExpired, cond.Reason) - } -} + t.Run("reservation placement: two reconciles set TargetHost then Ready=True", func(t *testing.T) { + env := newCRIntegrationEnv(t) + defer env.close() -// TestCRLifecycle_ReservationControllerPlacesChild verifies that after the CR controller -// creates a child Reservation, the ReservationController can place it (scheduler call → -// TargetHost set → Ready=True on the Reservation). -func TestCRLifecycle_ReservationControllerPlacesChild(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) - // CR controller creates child Reservation. - env.reconcileCR(t, cr.Name) - env.reconcileCR(t, cr.Name) + children := env.listChildReservations(t, cr.Name) + if len(children) != 1 { + t.Fatalf("expected 1 child reservation, got %d", len(children)) + } + child := children[0] - children := env.listChildReservations(t, cr.Name) - if len(children) != 1 { - t.Fatalf("expected 1 child reservation, got %d", len(children)) - } - child := children[0] + // First reconcile: scheduler call → TargetHost written to Spec. + env.reconcileReservation(t, child.Name) + var afterFirst v1alpha1.Reservation + if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterFirst); err != nil { + t.Fatalf("get reservation after first reconcile: %v", err) + } + if afterFirst.Spec.TargetHost == "" { + t.Fatalf("expected TargetHost set after first reservation reconcile") + } - // Reservation controller places it (first reconcile: calls scheduler → sets TargetHost). - env.reconcileReservation(t, child.Name) + // Second reconcile: TargetHost synced to Status, Ready=True. 
+ env.reconcileReservation(t, child.Name) + var afterSecond v1alpha1.Reservation + if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterSecond); err != nil { + t.Fatalf("get reservation after second reconcile: %v", err) + } + if !meta.IsStatusConditionTrue(afterSecond.Status.Conditions, v1alpha1.ReservationConditionReady) { + t.Errorf("expected reservation Ready=True after placement, got %v", afterSecond.Status.Conditions) + } + if afterSecond.Status.Host != "host-1" { + t.Errorf("expected Status.Host=host-1, got %q", afterSecond.Status.Host) + } + }) - var afterFirst v1alpha1.Reservation - if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterFirst); err != nil { - t.Fatalf("get reservation after first reconcile: %v", err) - } - if afterFirst.Spec.TargetHost == "" { - t.Fatalf("expected TargetHost set after first reservation reconcile") - } + t.Run("deletion: finalizer removed, child Reservations cleaned up", func(t *testing.T) { + env := newCRIntegrationEnv(t) + defer env.close() - // Second reconcile: syncs TargetHost to Status, sets Ready=True. - env.reconcileReservation(t, child.Name) + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } - var afterSecond v1alpha1.Reservation - if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: child.Name}, &afterSecond); err != nil { - t.Fatalf("get reservation after second reconcile: %v", err) - } - if !meta.IsStatusConditionTrue(afterSecond.Status.Conditions, v1alpha1.ReservationConditionReady) { - t.Errorf("expected reservation Ready=True after placement, got %v", afterSecond.Status.Conditions) - } - if afterSecond.Status.Host != "host-1" { - t.Errorf("expected Status.Host=host-1, got %q", afterSecond.Status.Host) - } -} + // Pre-create a child Reservation to verify it gets cleaned up on deletion. + // newTestCommittedResource pre-populates the finalizer, so Delete() immediately sets DeletionTimestamp. + child := &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-cr-0", + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + CommitmentUUID: "test-uuid-1234", + }, + }, + } + if err := env.k8sClient.Create(context.Background(), child); err != nil { + t.Fatalf("create child reservation: %v", err) + } -// TestCRLifecycle_Deletion verifies that deleting a CR cleans up all child Reservations. 
-func TestCRLifecycle_Deletion(t *testing.T) { - env := newCRIntegrationEnv(t) - defer env.close() + crState := env.getCR(t, cr.Name) + if err := env.k8sClient.Delete(context.Background(), &crState); err != nil { + t.Fatalf("delete CR: %v", err) + } + env.reconcileCR(t, cr.Name) - cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) - if err := env.k8sClient.Create(context.Background(), cr); err != nil { - t.Fatalf("create CR: %v", err) - } + if got := env.listChildReservations(t, cr.Name); len(got) != 0 { + t.Errorf("post-deletion: expected 0 reservations, got %d", len(got)) + } + var final v1alpha1.CommittedResource + err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &final) + if client.IgnoreNotFound(err) != nil { + t.Fatalf("unexpected error after deletion: %v", err) + } + if err == nil { + for _, f := range final.Finalizers { + if f == crFinalizer { + t.Errorf("finalizer not removed after deletion reconcile") + } + } + } + }) + + t.Run("confirmed→superseded: child Reservations deleted, CR marked inactive", func(t *testing.T) { + env := newCRIntegrationEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } - // newTestCommittedResource pre-populates the finalizer, so Delete() will set - // DeletionTimestamp without needing a prior reconcile. + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + env.reconcileChildReservations(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Fatalf("pre-supersede: expected 1 reservation, got %d", len(got)) + } + + crState := env.getCR(t, cr.Name) + patch := client.MergeFrom(crState.DeepCopy()) + crState.Spec.State = v1alpha1.CommitmentStatusSuperseded + if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { + t.Fatalf("patch state to superseded: %v", err) + } + env.reconcileCR(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 0 { + t.Errorf("superseded: expected 0 reservations, got %d", len(got)) + } + crState = env.getCR(t, cr.Name) + cond := meta.FindStatusCondition(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil || cond.Status != metav1.ConditionFalse { + t.Errorf("superseded: expected Ready=False, got %v", cond) + } + if cond != nil && cond.Reason != string(v1alpha1.CommitmentStatusSuperseded) { + t.Errorf("superseded: expected Reason=%s, got %s", v1alpha1.CommitmentStatusSuperseded, cond.Reason) + } + }) - // Pre-create a child Reservation to verify it gets cleaned up on deletion. 
- child := &v1alpha1.Reservation{ - ObjectMeta: metav1.ObjectMeta{ - Name: "my-cr-0", - Labels: map[string]string{ + t.Run("idempotency: extra reconciles after Accepted do not create extra slots", func(t *testing.T) { + env := newCRIntegrationEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + env.reconcileChildReservations(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Fatalf("pre-idempotency check: expected 1 reservation, got %d", len(got)) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Errorf("idempotency: expected 1 reservation after extra reconciles, got %d", len(got)) + } + crState := env.getCR(t, cr.Name) + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Errorf("idempotency: expected CR to remain Ready=True after extra reconciles") + } + }) + + t.Run("AllowRejection=false: stays Reserving when scheduler rejects", func(t *testing.T) { + hypervisor := &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: "host-1"}} + env := newIntgEnv(t, []client.Object{newTestFlavorKnowledge(), hypervisor}, intgRejectScheduler) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + // AllowRejection stays false (the default), so placement failure must requeue, not reject. + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + ctx := context.Background() + crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} + for range 3 { + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + var resList v1alpha1.ReservationList + env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }, - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - CommitmentUUID: "test-uuid-1234", - }, - }, - } - if err := env.k8sClient.Create(context.Background(), child); err != nil { - t.Fatalf("create child reservation: %v", err) - } + }) + for _, res := range resList.Items { + resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + } + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + } - // Delete sets DeletionTimestamp (object has finalizer, so it is not removed yet). 
- crState := env.getCR(t, cr.Name) - if err := env.k8sClient.Delete(context.Background(), &crState); err != nil { - t.Fatalf("delete CR: %v", err) - } + var final v1alpha1.CommittedResource + if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &final); err != nil { + t.Fatalf("get CR: %v", err) + } + cond := meta.FindStatusCondition(final.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil { + t.Fatalf("no Ready condition") + } + if cond.Reason == v1alpha1.CommittedResourceReasonRejected { + t.Errorf("AllowRejection=false: CR must not transition to Rejected, got Reason=%s", cond.Reason) + } + if cond.Reason != v1alpha1.CommittedResourceReasonReserving { + t.Errorf("AllowRejection=false: expected Reason=Reserving, got %s", cond.Reason) + } + }) - env.reconcileCR(t, cr.Name) + t.Run("externally deleted child Reservation is recreated by CR controller", func(t *testing.T) { + env := newCRIntegrationEnv(t) + defer env.close() - if got := env.listChildReservations(t, cr.Name); len(got) != 0 { - t.Errorf("post-deletion: expected 0 reservations, got %d", len(got)) - } - // Finalizer removed — object either gone or has no finalizer. - var final v1alpha1.CommittedResource - err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &final) - if client.IgnoreNotFound(err) != nil { - t.Fatalf("unexpected error after deletion: %v", err) - } - if err == nil { - for _, f := range final.Finalizers { - if f == crFinalizer { - t.Errorf("finalizer not removed after deletion reconcile") + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + env.reconcileChildReservations(t, cr.Name) + + children := env.listChildReservations(t, cr.Name) + if len(children) != 1 { + t.Fatalf("expected 1 child reservation before deletion, got %d", len(children)) + } + + // Simulate out-of-band deletion of the slot. + child := children[0] + if err := env.k8sClient.Delete(context.Background(), &child); err != nil { + t.Fatalf("delete child reservation: %v", err) + } + + // CR controller detects the missing slot and recreates it. + env.reconcileCR(t, cr.Name) + // Place the new slot. + env.reconcileChildReservations(t, cr.Name) + // CR controller observes Ready=True on the recreated slot. 
+ env.reconcileCR(t, cr.Name) + + if got := env.listChildReservations(t, cr.Name); len(got) != 1 { + t.Errorf("expected 1 reservation after recreation, got %d", len(got)) + } + crState := env.getCR(t, cr.Name) + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Errorf("expected CR to be Ready=True after slot recreation") + } + }) + + t.Run("AcceptedAt: set when CR accepted", func(t *testing.T) { + env := newCRIntegrationEnv(t) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + env.reconcileCR(t, cr.Name) + env.reconcileCR(t, cr.Name) + env.reconcileChildReservations(t, cr.Name) + + crState := env.getCR(t, cr.Name) + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Fatalf("expected CR to be Ready=True") + } + if crState.Status.AcceptedAt == nil { + t.Errorf("expected AcceptedAt to be set on acceptance") + } + if crState.Status.AcceptedAmount == nil { + t.Errorf("expected AcceptedAmount to be set on acceptance") + } else if crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 { + t.Errorf("AcceptedAmount: want 4Gi, got %s", crState.Status.AcceptedAmount.String()) + } + }) + + t.Run("resize failure: rolls back to AcceptedAmount, prior slot preserved", func(t *testing.T) { + // Scheduler: accepts the first placement call (initial 4 GiB slot), rejects all subsequent. + objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")} + env := newIntgEnv(t, objects, intgAcceptFirstScheduler(1)) + defer env.close() + + cr := intgCRAllowRejection("my-cr", "uuid-resize-0001", v1alpha1.CommitmentStatusConfirmed) + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + // Phase 1: accept at 4 GiB (1 slot). Uses 1 scheduler call. + intgDriveToTerminal(t, env, []string{cr.Name}) + var crState v1alpha1.CommittedResource + if err := env.k8sClient.Get(context.Background(), types.NamespacedName{Name: cr.Name}, &crState); err != nil { + t.Fatalf("get CR: %v", err) + } + if !meta.IsStatusConditionTrue(crState.Status.Conditions, v1alpha1.CommittedResourceConditionReady) { + t.Fatalf("phase 1: expected CR to be Ready=True after initial placement") + } + if crState.Status.AcceptedAmount == nil || crState.Status.AcceptedAmount.Cmp(resource.MustParse("4Gi")) != 0 { + t.Fatalf("phase 1: AcceptedAmount must be 4Gi, got %v", crState.Status.AcceptedAmount) + } + + // Phase 2: resize to 8 GiB (needs 2 slots). Scheduler has no more accepts. + patch := client.MergeFrom(crState.DeepCopy()) + crState.Spec.Amount = resource.MustParse("8Gi") + if err := env.k8sClient.Patch(context.Background(), &crState, patch); err != nil { + t.Fatalf("patch CR to 8Gi: %v", err) + } + + ctx := context.Background() + crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} + + // CR controller: applyReservationState bumps gen on existing slot, creates 2nd slot. + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + // Reservation controller: existing slot echoes new ParentGeneration (no scheduler call); + // new slot calls scheduler → rejected. 
+ var resList v1alpha1.ReservationList + env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }) + for _, res := range resList.Items { + resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + } + // CR controller: detects 2nd slot Ready=False → rollbackToAccepted (keeps 1 slot) → Rejected. + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + + // Rollback must preserve 1 slot (matching AcceptedAmount=4Gi), not delete all. + var finalList v1alpha1.ReservationList + if err := env.k8sClient.List(ctx, &finalList, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + if len(finalList.Items) != 1 { + t.Errorf("resize rollback: want 1 slot (AcceptedAmount), got %d", len(finalList.Items)) + } + intgAssertCRCondition(t, env.k8sClient, []string{cr.Name}, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected) + }) + + t.Run("AllowRejection=false: eventually accepted after scheduler starts accepting", func(t *testing.T) { + // Scheduler rejects the first 2 calls (one per reservation controller reconcile pair), + // then accepts all subsequent. AllowRejection=false means the CR controller retries rather + // than rejecting, so the CR must eventually reach Accepted once the scheduler cooperates. + objects := []client.Object{newTestFlavorKnowledge(), intgHypervisor("host-1")} + env := newIntgEnv(t, objects, intgRejectFirstScheduler(2)) + defer env.close() + + cr := newTestCommittedResource("my-cr", v1alpha1.CommitmentStatusConfirmed) + // AllowRejection stays false (default), so placement failure must requeue, not reject. 
+ if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR: %v", err) + } + + ctx := context.Background() + crReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: cr.Name}} + for range 3 { + env.crController.Reconcile(ctx, crReq) //nolint:errcheck + var resList v1alpha1.ReservationList + env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }) + for _, res := range resList.Items { + resReq := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}} + env.resController.Reconcile(ctx, resReq) //nolint:errcheck + env.resController.Reconcile(ctx, resReq) //nolint:errcheck } + env.crController.Reconcile(ctx, crReq) //nolint:errcheck } - } + + var final v1alpha1.CommittedResource + if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: cr.Name}, &final); err != nil { + t.Fatalf("get CR: %v", err) + } + cond := meta.FindStatusCondition(final.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil { + t.Fatalf("no Ready condition after retries") + } + if cond.Reason == v1alpha1.CommittedResourceReasonRejected { + t.Errorf("AllowRejection=false: CR must not be Rejected, got Reason=%s", cond.Reason) + } + if cond.Status != metav1.ConditionTrue || cond.Reason != v1alpha1.CommittedResourceReasonAccepted { + t.Errorf("AllowRejection=false: expected Ready=True/Accepted after retries, got Ready=%s/Reason=%s", cond.Status, cond.Reason) + } + }) } diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index 888d37018..fe05fcc20 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -5,103 +5,73 @@ package commitments import ( "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +// Config aggregates configuration for all commitments components. +// Each controller and the API have their own sub-struct so that unrelated +// fields are never visible to the wrong component. type Config struct { + ReservationController ReservationControllerConfig `json:"committedResourceReservationController"` + CommittedResourceController CommittedResourceControllerConfig `json:"committedResourceController"` + API APIConfig `json:"committedResourceAPI"` - // RequeueIntervalActive is the interval for requeueing active reservations for periodic verification. - RequeueIntervalActive time.Duration `json:"committedResourceRequeueIntervalActive"` - // RequeueIntervalRetry is the interval for requeueing when retrying after knowledge is not ready. - RequeueIntervalRetry time.Duration `json:"committedResourceRequeueIntervalRetry"` - // AllocationGracePeriod is the time window after a VM is allocated to a reservation - // during which it's expected to appear on the target host. VMs not confirmed within - // this period are considered stale and removed from the reservation. - AllocationGracePeriod time.Duration `json:"committedResourceAllocationGracePeriod"` - // RequeueIntervalGracePeriod is the interval for requeueing when VMs are in grace period. - // Shorter than RequeueIntervalActive for faster verification of new allocations. - RequeueIntervalGracePeriod time.Duration `json:"committedResourceRequeueIntervalGracePeriod"` - // PipelineDefault is the default pipeline used for scheduling committed resource reservations. 
- PipelineDefault string `json:"committedResourcePipelineDefault"` - - // SchedulerURL is the endpoint of the nova external scheduler - SchedulerURL string `json:"schedulerURL"` - - // DatasourceName is the name of the Datasource CRD that provides database connection info. - // Used to query VM state for report-usage. If empty, report-usage returns an error. + // DatasourceName is the name of the Datasource CRD that provides database + // connection info. Used to construct the UsageDBClient for report-usage. DatasourceName string `json:"datasourceName,omitempty"` +} - // FlavorGroupPipelines maps flavor group names to pipeline names. - // Example: {"2152": "kvm-hana-bin-packing", "2101": "kvm-general-purpose-load-balancing", "*": "kvm-general-purpose-load-balancing"} - // Used to select different scheduling pipelines based on flavor group characteristics. - FlavorGroupPipelines map[string]string `json:"committedResourceFlavorGroupPipelines,omitempty"` - - // API configuration - - // ChangeAPIWatchReservationsTimeout defines how long to wait for reservations to become ready before timing out and rolling back. - ChangeAPIWatchReservationsTimeout time.Duration `json:"committedResourceChangeAPIWatchReservationsTimeout"` - - // ChangeAPIWatchReservationsPollInterval defines how frequently to poll reservation status during watch. - ChangeAPIWatchReservationsPollInterval time.Duration `json:"committedResourceChangeAPIWatchReservationsPollInterval"` - - // EnableChangeCommitmentsAPI controls whether the change-commitments API endpoint is active. - // When false, the endpoint will return HTTP 503 Service Unavailable. - // The info endpoint remains available for health checks. - EnableChangeCommitmentsAPI bool `json:"committedResourceEnableChangeCommitmentsAPI"` - - // EnableReportUsageAPI controls whether the report-usage API endpoint is active. - // When false, the endpoint will return HTTP 503 Service Unavailable. - // This can be used as an emergency switch if the usage reporting is causing issues. - EnableReportUsageAPI bool `json:"committedResourceEnableReportUsageAPI"` +// ReservationControllerConfig holds tuning knobs for the Reservation CRD controller. +type ReservationControllerConfig struct { + // RequeueIntervalActive is how often to re-verify a healthy Reservation CRD. + RequeueIntervalActive metav1.Duration `json:"requeueIntervalActive"` + // RequeueIntervalRetry is the back-off interval when knowledge is unavailable. + RequeueIntervalRetry metav1.Duration `json:"requeueIntervalRetry"` + // RequeueIntervalGracePeriod is how often to re-check while a VM allocation + // is still within AllocationGracePeriod. Shorter than RequeueIntervalActive. + RequeueIntervalGracePeriod metav1.Duration `json:"requeueIntervalGracePeriod"` + // AllocationGracePeriod is the time window after a VM is allocated to a + // reservation during which it's expected to appear on the target host. + // VMs not confirmed within this period are considered stale and removed. + AllocationGracePeriod metav1.Duration `json:"allocationGracePeriod"` + // SchedulerURL is the endpoint of the nova external scheduler. + SchedulerURL string `json:"schedulerURL"` + // PipelineDefault is the fallback pipeline when no FlavorGroupPipelines entry matches. + PipelineDefault string `json:"pipelineDefault"` + // FlavorGroupPipelines maps flavor group IDs to pipeline names; "*" acts as catch-all. 
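+	// Example: {"2152": "kvm-hana-bin-packing", "2101": "kvm-general-purpose-load-balancing", "*": "kvm-general-purpose-load-balancing"}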
+ FlavorGroupPipelines map[string]string `json:"flavorGroupPipelines,omitempty"` +} - // EnableReportCapacityAPI controls whether the report-capacity API endpoint is active. - // When false, the endpoint will return HTTP 503 Service Unavailable. - // This can be used as an emergency switch if the capacity reporting is causing issues. - EnableReportCapacityAPI bool `json:"committedResourceEnableReportCapacityAPI"` +// CommittedResourceControllerConfig holds tuning knobs for the CommittedResource CRD controller. +type CommittedResourceControllerConfig struct { + // RequeueIntervalRetry is the back-off interval when placement is pending or failed. + RequeueIntervalRetry metav1.Duration `json:"requeueIntervalRetry"` } -// ApplyDefaults fills in any unset values with defaults. -func (c *Config) ApplyDefaults() { - defaults := DefaultConfig() - if c.RequeueIntervalActive == 0 { - c.RequeueIntervalActive = defaults.RequeueIntervalActive - } - if c.RequeueIntervalRetry == 0 { - c.RequeueIntervalRetry = defaults.RequeueIntervalRetry - } - if c.RequeueIntervalGracePeriod == 0 { - c.RequeueIntervalGracePeriod = defaults.RequeueIntervalGracePeriod - } - if c.AllocationGracePeriod == 0 { - c.AllocationGracePeriod = defaults.AllocationGracePeriod - } - if c.PipelineDefault == "" { - c.PipelineDefault = defaults.PipelineDefault - } - if c.SchedulerURL == "" { - c.SchedulerURL = defaults.SchedulerURL - } - if c.ChangeAPIWatchReservationsTimeout == 0 { - c.ChangeAPIWatchReservationsTimeout = defaults.ChangeAPIWatchReservationsTimeout - } - if c.ChangeAPIWatchReservationsPollInterval == 0 { - c.ChangeAPIWatchReservationsPollInterval = defaults.ChangeAPIWatchReservationsPollInterval - } - // Note: EnableChangeCommitmentsAPI, EnableReportUsageAPI, EnableReportCapacityAPI - // are booleans where false is a valid value, so we don't apply defaults for them +// APIConfig holds configuration for the LIQUID commitment HTTP endpoints. +type APIConfig struct { + // EnableChangeCommitments controls whether the change-commitments endpoint is active. + // When false the endpoint returns HTTP 503; the info endpoint remains available. + EnableChangeCommitments bool `json:"enableChangeCommitments"` + // EnableReportUsage controls whether the report-usage endpoint is active. + EnableReportUsage bool `json:"enableReportUsage"` + // EnableReportCapacity controls whether the report-capacity endpoint is active. + EnableReportCapacity bool `json:"enableReportCapacity"` + // WatchTimeout is how long the change-commitments handler polls CommittedResource + // CRD conditions before giving up and rolling back. + WatchTimeout metav1.Duration `json:"watchTimeout"` + // WatchPollInterval is how frequently the change-commitments handler polls + // CommittedResource CRD conditions while waiting for the controller outcome. 
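+	// With the defaults from DefaultAPIConfig below (10s timeout, 500ms poll)
+	// that amounts to at most ~20 status polls per change-commitments call.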
+ WatchPollInterval metav1.Duration `json:"watchPollInterval"` } -func DefaultConfig() Config { - return Config{ - RequeueIntervalActive: 5 * time.Minute, - RequeueIntervalRetry: 1 * time.Minute, - RequeueIntervalGracePeriod: 1 * time.Minute, - AllocationGracePeriod: 15 * time.Minute, - PipelineDefault: "kvm-general-purpose-load-balancing", - SchedulerURL: "http://localhost:8080/scheduler/nova/external", - ChangeAPIWatchReservationsTimeout: 10 * time.Second, - ChangeAPIWatchReservationsPollInterval: 500 * time.Millisecond, - EnableChangeCommitmentsAPI: true, - EnableReportUsageAPI: true, - EnableReportCapacityAPI: true, +func DefaultAPIConfig() APIConfig { + return APIConfig{ + EnableChangeCommitments: true, + EnableReportUsage: true, + EnableReportCapacity: true, + WatchTimeout: metav1.Duration{Duration: 10 * time.Second}, + WatchPollInterval: metav1.Duration{Duration: 500 * time.Millisecond}, } } diff --git a/internal/scheduling/reservations/commitments/e2e_checks.go b/internal/scheduling/reservations/commitments/e2e_checks.go index 2292bcaa1..cd4b15d05 100644 --- a/internal/scheduling/reservations/commitments/e2e_checks.go +++ b/internal/scheduling/reservations/commitments/e2e_checks.go @@ -4,13 +4,17 @@ package commitments import ( + "bytes" "context" "encoding/json" "fmt" "io" "log/slog" "net/http" + "strings" + "time" + . "github.com/majewsky/gg/option" liquid "github.com/sapcc/go-api-declarations/liquid" "github.com/sapcc/go-bits/must" ) @@ -19,38 +23,46 @@ const ( // Default URL for the commitments API endpoint. // This should match the service name in the helm chart. defaultCommitmentsAPIURL = "http://cortex-nova-scheduler:8080" + + // defaultE2EProjectUUID is a well-known fake project UUID used when no TestProjectID is configured. + // It is intentionally not a real OpenStack project — commitments created under it self-expire. + defaultE2EProjectUUID = "00000000-0000-0000-0000-000000000e2e" ) // E2EChecksConfig holds the configuration for CR e2e checks. type E2EChecksConfig struct { - // Base URL for the commitments API. If empty, defaults to defaultCommitmentsAPIURL. + // BaseURL for the commitments API. If empty, defaults to defaultCommitmentsAPIURL. BaseURL string `json:"baseURL"` + // RoundTripCheck holds optional overrides for the round-trip check. + // If nil, defaults are used: testProjectID = defaultE2EProjectUUID, az = "". + RoundTripCheck *E2ERoundTripConfig `json:"roundTripCheck,omitempty"` +} + +// E2ERoundTripConfig holds optional overrides for the create→delete round-trip e2e check. +type E2ERoundTripConfig struct { + // AZ is the availability zone to use (e.g. "qa-de-1d"). Defaults to "" if not set. + AZ string `json:"az"` + // TestProjectID is the OpenStack project UUID to create test commitments under. + // Defaults to defaultE2EProjectUUID if not set. + TestProjectID string `json:"testProjectID"` } -// CheckCommitmentsInfoEndpoint sends a GET request to the /commitments/v1/info endpoint -// and verifies that it returns HTTP 200 with a valid ServiceInfo response. +// CheckCommitmentsInfoEndpoint verifies that GET /commitments/v1/info returns 200 with a valid ServiceInfo. 
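+//
+// For a manual spot check, the equivalent request against the in-cluster
+// default URL is simply:
+//
+//	curl -s http://cortex-nova-scheduler:8080/commitments/v1/info
+//
+// A healthy deployment answers 200 with a liquid.ServiceInfo JSON body.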
func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) { - baseURL := config.BaseURL - if baseURL == "" { - baseURL = defaultCommitmentsAPIURL - } + baseURL := e2eBaseURL(config) apiURL := baseURL + "/commitments/v1/info" slog.Info("checking commitments info endpoint", "apiURL", apiURL) httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodGet, apiURL, http.NoBody)) httpReq.Header.Set("Accept", "application/json") - //nolint:bodyclose // Body is closed in the deferred function below. + //nolint:bodyclose resp := must.Return(http.DefaultClient.Do(httpReq)) defer resp.Body.Close() if resp.StatusCode != http.StatusOK { bodyBytes := must.Return(io.ReadAll(resp.Body)) - slog.Error("commitments info API returned non-200 status code", - "statusCode", resp.StatusCode, - "responseBody", string(bodyBytes), - ) - panic(fmt.Sprintf("commitments info API returned status %d, expected 200", resp.StatusCode)) + panic(fmt.Sprintf("commitments info API returned status %d: %s", resp.StatusCode, bodyBytes)) } var serviceInfo liquid.ServiceInfo @@ -58,20 +70,208 @@ func CheckCommitmentsInfoEndpoint(ctx context.Context, config E2EChecksConfig) { panic(fmt.Sprintf("failed to decode ServiceInfo response: %v", err)) } - // Basic validation of the response if serviceInfo.Version < 0 { slog.Warn("commitments info returned version -1, knowledge may not be ready yet") } - slog.Info("commitments info endpoint check passed", "version", serviceInfo.Version, "resourceCount", len(serviceInfo.Resources), ) } +// CheckCommitmentsRoundTrip iterates all HandlesCommitments resources from /info and for each one: +// 1. Creates a confirmed test commitment (amount=2, expires in 5 minutes) +// 2. If accepted: calls the usage API to verify it returns 200, then deletes the commitment +// 3. If rejected: logs the reason and continues — capacity rejection is not an error +// +// Panics on infrastructure failures (non-200 from the API, deletion failure after acceptance). +func CheckCommitmentsRoundTrip(ctx context.Context, config E2EChecksConfig) { + baseURL := e2eBaseURL(config) + az := liquid.AvailabilityZone("") + projectID := liquid.ProjectUUID(defaultE2EProjectUUID) + if rt := config.RoundTripCheck; rt != nil { + if rt.AZ != "" { + az = liquid.AvailabilityZone(rt.AZ) + } + if rt.TestProjectID != "" { + projectID = liquid.ProjectUUID(rt.TestProjectID) + } + } + + serviceInfo := e2eFetchServiceInfo(ctx, baseURL) + + checked := 0 + for resourceName, resInfo := range serviceInfo.Resources { + if !resInfo.HandlesCommitments { + continue + } + e2eRoundTripResource(ctx, baseURL, serviceInfo.Version, az, projectID, resourceName) + checked++ + } + + if checked == 0 { + slog.Warn("round-trip check: no HandlesCommitments resources found in /info — nothing checked") + } +} + +// e2eRoundTripResource runs the create→usageCheck→delete cycle for one resource. 
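+//
+// Schematically, the happy path for one resource is:
+//
+//	change-commitments (NewStatus=confirmed, amount=2)       -> accepted
+//	report-usage                                             -> HTTP 200
+//	change-commitments (OldStatus=confirmed, NewStatus=none) -> accepted (delete)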
+func e2eRoundTripResource( + ctx context.Context, + baseURL string, + infoVersion int64, + az liquid.AvailabilityZone, + projectID liquid.ProjectUUID, + resourceName liquid.ResourceName, +) { + + testUUID := liquid.CommitmentUUID(fmt.Sprintf("e2e-%d", time.Now().UnixMilli())) + expiresAt := time.Now().Add(5 * time.Minute) + const amount = uint64(2) + + createReq := liquid.CommitmentChangeRequest{ + InfoVersion: infoVersion, + AZ: az, + ByProject: map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset{ + projectID: { + ByResource: map[liquid.ResourceName]liquid.ResourceCommitmentChangeset{ + resourceName: { + TotalConfirmedAfter: amount, + Commitments: []liquid.Commitment{{ + UUID: testUUID, + Amount: amount, + NewStatus: Some(liquid.CommitmentStatusConfirmed), + ExpiresAt: expiresAt, + }}, + }, + }, + }, + }, + } + + slog.Info("round-trip check: creating test commitment", + "resource", resourceName, "uuid", testUUID, "project", projectID, "az", az) + + rejectionReason := e2eSendChangeCommitments(ctx, baseURL, createReq) + if rejectionReason != "" { + // Only capacity rejections (no hosts available) are expected in production clusters. + // Any other reason (flavor group ineligible, config error, timeout) indicates a + // regression and should surface as a failure. + if !strings.Contains(rejectionReason, "no hosts found") { + panic(fmt.Sprintf("round-trip check: commitment rejected with unexpected reason for resource %s: %s", resourceName, rejectionReason)) + } + slog.Info("round-trip check: commitment rejected — no capacity, continuing", + "resource", resourceName, "reason", rejectionReason) + return + } + slog.Info("round-trip check: commitment accepted", "resource", resourceName, "uuid", testUUID) + + // Register cleanup immediately so it runs even if the usage check panics. + defer func() { + deleteReq := liquid.CommitmentChangeRequest{ + InfoVersion: infoVersion, + AZ: az, + ByProject: map[liquid.ProjectUUID]liquid.ProjectCommitmentChangeset{ + projectID: { + ByResource: map[liquid.ResourceName]liquid.ResourceCommitmentChangeset{ + resourceName: { + TotalConfirmedBefore: amount, + Commitments: []liquid.Commitment{{ + UUID: testUUID, + Amount: amount, + OldStatus: Some(liquid.CommitmentStatusConfirmed), + NewStatus: None[liquid.CommitmentStatus](), + ExpiresAt: expiresAt, + }}, + }, + }, + }, + }, + } + slog.Info("round-trip check: deleting test commitment", "resource", resourceName, "uuid", testUUID) + if reason := e2eSendChangeCommitments(ctx, baseURL, deleteReq); reason != "" { + panic(fmt.Sprintf("round-trip check: delete of test commitment %s was rejected: %s", testUUID, reason)) + } + slog.Info("round-trip check: commitment deleted", "resource", resourceName, "uuid", testUUID) + }() + + // Smoke-check the usage API: verifies the usage calculation pipeline works for this project. + e2eCheckUsageAPI(ctx, baseURL, az, projectID) +} + +// e2eCheckUsageAPI calls POST /commitments/v1/projects/:id/report-usage and verifies 200. +// The usage report for a project with no VMs will show zero usage — we only verify the endpoint works. 
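+//
+// The request body is a liquid.ServiceUsageRequest carrying just the AZ under
+// test, i.e. (AZ value illustrative):
+//
+//	liquid.ServiceUsageRequest{AllAZs: []liquid.AvailabilityZone{"qa-de-1d"}}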
+func e2eCheckUsageAPI(ctx context.Context, baseURL string, az liquid.AvailabilityZone, projectID liquid.ProjectUUID) { + usageReq := liquid.ServiceUsageRequest{AllAZs: []liquid.AvailabilityZone{az}} + body := must.Return(json.Marshal(usageReq)) + url := fmt.Sprintf("%s/commitments/v1/projects/%s/report-usage", baseURL, projectID) + httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))) + httpReq.Header.Set("Content-Type", "application/json") + + //nolint:bodyclose + resp := must.Return(http.DefaultClient.Do(httpReq)) + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + bodyBytes := must.Return(io.ReadAll(resp.Body)) + panic(fmt.Sprintf("usage API returned %d: %s", resp.StatusCode, bodyBytes)) + } + slog.Info("round-trip check: usage API returned 200", "project", projectID) +} + +// e2eSendChangeCommitments sends a change-commitments request. +// Panics on HTTP non-200 (infrastructure error). +// Returns the rejection reason on 200+rejection (expected for capacity-constrained clusters). +// Returns "" on success. +func e2eSendChangeCommitments(ctx context.Context, baseURL string, req liquid.CommitmentChangeRequest) string { + body := must.Return(json.Marshal(req)) + httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodPost, + baseURL+"/commitments/v1/change-commitments", bytes.NewReader(body))) + httpReq.Header.Set("Content-Type", "application/json") + + //nolint:bodyclose + resp := must.Return(http.DefaultClient.Do(httpReq)) + defer resp.Body.Close() + respBody := must.Return(io.ReadAll(resp.Body)) + + if resp.StatusCode != http.StatusOK { + panic(fmt.Sprintf("change-commitments returned %d: %s", resp.StatusCode, respBody)) + } + var result liquid.CommitmentChangeResponse + if err := json.Unmarshal(respBody, &result); err != nil { + panic(fmt.Sprintf("failed to decode change-commitments response: %v", err)) + } + return result.RejectionReason +} + +// e2eFetchServiceInfo fetches and decodes /info. Panics on failure. +func e2eFetchServiceInfo(ctx context.Context, baseURL string) liquid.ServiceInfo { + httpReq := must.Return(http.NewRequestWithContext(ctx, http.MethodGet, + baseURL+"/commitments/v1/info", http.NoBody)) + httpReq.Header.Set("Accept", "application/json") + //nolint:bodyclose + resp := must.Return(http.DefaultClient.Do(httpReq)) + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + bodyBytes := must.Return(io.ReadAll(resp.Body)) + panic(fmt.Sprintf("info endpoint returned %d: %s", resp.StatusCode, bodyBytes)) + } + var info liquid.ServiceInfo + if err := json.NewDecoder(resp.Body).Decode(&info); err != nil { + panic(fmt.Sprintf("failed to decode ServiceInfo: %v", err)) + } + return info +} + +func e2eBaseURL(config E2EChecksConfig) string { + if config.BaseURL != "" { + return config.BaseURL + } + return defaultCommitmentsAPIURL +} + // RunCommitmentsE2EChecks runs all e2e checks for the commitments API. 
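+//
+// A hypothetical checks configuration exercising both checks (keys follow the
+// JSON tags declared on E2EChecksConfig and E2ERoundTripConfig above):
+//
+//	{"baseURL": "http://cortex-nova-scheduler:8080", "roundTripCheck": {"az": "qa-de-1d"}}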
func RunCommitmentsE2EChecks(ctx context.Context, config E2EChecksConfig) { slog.Info("running commitments e2e checks") CheckCommitmentsInfoEndpoint(ctx, config) + CheckCommitmentsRoundTrip(ctx, config) slog.Info("all commitments e2e checks passed") } diff --git a/internal/scheduling/reservations/commitments/field_index.go b/internal/scheduling/reservations/commitments/field_index.go index 9e3fde378..40760655d 100644 --- a/internal/scheduling/reservations/commitments/field_index.go +++ b/internal/scheduling/reservations/commitments/field_index.go @@ -14,6 +14,7 @@ import ( ) const idxCommittedResourceByUUID = "spec.commitmentUUID" +const idxReservationByCommitmentUUID = "spec.committedResourceReservation.commitmentUUID" // IndexFields registers field indexes required by the CommittedResource controller. func IndexFields(ctx context.Context, mcl *multicluster.Client) error { @@ -38,6 +39,25 @@ func IndexFields(ctx context.Context, mcl *multicluster.Client) error { log.Error(err, "failed to set up index for commitmentUUID") return err } - log.Info("Successfully set up index for commitmentUUID") + if err := mcl.IndexField(ctx, + &v1alpha1.Reservation{}, + &v1alpha1.ReservationList{}, + idxReservationByCommitmentUUID, + func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok { + log.Error(errors.New("unexpected type"), "expected Reservation", "object", obj) + return nil + } + if res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" { + return nil + } + return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} + }, + ); err != nil { + log.Error(err, "failed to set up index for reservation commitmentUUID") + return err + } + log.Info("Successfully set up field indexes") return nil } diff --git a/internal/scheduling/reservations/commitments/integration_test.go b/internal/scheduling/reservations/commitments/integration_test.go new file mode 100644 index 000000000..138f3c74c --- /dev/null +++ b/internal/scheduling/reservations/commitments/integration_test.go @@ -0,0 +1,618 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package commitments + +// Table-driven integration tests for the committed-resource lifecycle. +// +// Each test case wires CommittedResourceController and CommitmentReservationController +// against a shared fake k8s client and a mock Nova scheduler, then drives both +// controllers synchronously until every CR reaches a terminal condition. 
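+//
+// One synchronous drive pass mirrors what the controller manager does
+// asynchronously: CR controller, then the Reservation controller per child
+// slot, then the CR controller again to observe slot outcomes (see
+// intgDriveToTerminal below).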
+// +// Terminal conditions (no further reconcile expected without external input): +// - Ready=True / Accepted +// - Ready=False / Rejected +// - Ready=False / Planned (controller waits for StartTime) +// - Ready=False / Expired (controller has cleaned up children) +// - Ready=False / Superseded + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// ============================================================================ +// Test cases +// ============================================================================ + +// CRIntegrationTestCase defines one end-to-end scenario for the committed-resource +// lifecycle spanning both controllers and the mock scheduler. +type CRIntegrationTestCase struct { + Name string + + // Initial cluster state. + Hypervisors []*hv1.Hypervisor + ExistingReservations []*v1alpha1.Reservation // pre-placed slots (for expiry/supersede scenarios) + + // CRs to create and drive to terminal state. + CommittedResources []*v1alpha1.CommittedResource + + // When true the mock scheduler returns an empty hosts list (NoHostsFound). + SchedulerRejects bool + // SchedulerAcceptFirst, when > 0, makes the mock scheduler accept only the first N + // placement calls and reject all subsequent ones. Used to test partial placement + // (e.g. first slot placed, second slot rejected). Takes precedence over SchedulerRejects. + SchedulerAcceptFirst int + + // Expected state after all CRs reach a terminal condition. 
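+	// Each CR created by the case should be named in exactly one of the five
+	// lists below; CRs named in none of them are simply not asserted on.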
+ ExpectedSlots int // total Reservation CRDs remaining in the store + AcceptedCRs []string // CRs expected Ready=True / Accepted + RejectedCRs []string // CRs expected Ready=False / Rejected + PlannedCRs []string // CRs expected Ready=False / Planned + ExpiredCRs []string // CRs expected Ready=False / Expired + SupersededCRs []string // CRs expected Ready=False / Superseded +} + +func TestCRIntegration(t *testing.T) { + testCases := []CRIntegrationTestCase{ + // ------------------------------------------------------------------ + // Acceptance: slot count from commitment amount + // ------------------------------------------------------------------ + { + Name: "single confirmed CR: one slot placed, CR accepted", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCR("cr-1", "uuid-intg-0001", v1alpha1.CommitmentStatusConfirmed), + }, + ExpectedSlots: 1, + AcceptedCRs: []string{"cr-1"}, + }, + { + // 8 GiB commitment with the default 4 GiB test flavor → 2 slots + Name: "large CR: commitment amount spans multiple flavors, two slots placed", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCRAmount("cr-large", "uuid-intg-0002", v1alpha1.CommitmentStatusConfirmed, "8Gi"), + }, + ExpectedSlots: 2, + AcceptedCRs: []string{"cr-large"}, + }, + // ------------------------------------------------------------------ + // Pending / guaranteed: same placement path as confirmed + // ------------------------------------------------------------------ + { + Name: "pending CR: slot placed, CR accepted", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCR("cr-pending", "uuid-intg-0003", v1alpha1.CommitmentStatusPending), + }, + ExpectedSlots: 1, + AcceptedCRs: []string{"cr-pending"}, + }, + { + Name: "guaranteed CR: slot placed, CR accepted", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCR("cr-guaranteed", "uuid-intg-0004", v1alpha1.CommitmentStatusGuaranteed), + }, + ExpectedSlots: 1, + AcceptedCRs: []string{"cr-guaranteed"}, + }, + // ------------------------------------------------------------------ + // Planned: no slots, condition stays Planned + // ------------------------------------------------------------------ + { + Name: "planned CR: no slots created, condition stays Planned", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCR("cr-planned", "uuid-intg-0005", v1alpha1.CommitmentStatusPlanned), + }, + ExpectedSlots: 0, + PlannedCRs: []string{"cr-planned"}, + }, + // ------------------------------------------------------------------ + // Rejection paths + // ------------------------------------------------------------------ + { + Name: "scheduler returns no hosts: CR rejected and slots cleaned up", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCRAllowRejection("cr-rej", "uuid-intg-0006", v1alpha1.CommitmentStatusConfirmed), + }, + SchedulerRejects: true, + ExpectedSlots: 0, + RejectedCRs: []string{"cr-rej"}, + }, + { + // Reservation controller detects the empty hosts list before calling the scheduler. 
+ Name: "no hypervisors in cluster: CR rejected with NoHostsAvailable", + Hypervisors: []*hv1.Hypervisor{}, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCRAllowRejection("cr-nohosts", "uuid-intg-0007", v1alpha1.CommitmentStatusConfirmed), + }, + ExpectedSlots: 0, + RejectedCRs: []string{"cr-nohosts"}, + }, + // ------------------------------------------------------------------ + // Multiple independent CRs + // ------------------------------------------------------------------ + { + Name: "two CRs with different UUIDs: each gets its own slot, both accepted", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + intgHypervisor("host-2"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCR("cr-a", "uuid-intg-0008", v1alpha1.CommitmentStatusConfirmed), + intgCR("cr-b", "uuid-intg-0009", v1alpha1.CommitmentStatusConfirmed), + }, + ExpectedSlots: 2, + AcceptedCRs: []string{"cr-a", "cr-b"}, + }, + { + // One CR in planned state should not block the other from being accepted. + Name: "one planned CR and one confirmed CR: only confirmed CR gets a slot", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCR("cr-plan", "uuid-intg-0010", v1alpha1.CommitmentStatusPlanned), + intgCR("cr-conf", "uuid-intg-0011", v1alpha1.CommitmentStatusConfirmed), + }, + ExpectedSlots: 1, + PlannedCRs: []string{"cr-plan"}, + AcceptedCRs: []string{"cr-conf"}, + }, + // ------------------------------------------------------------------ + // Inactive states: existing slots must be cleaned up + // ------------------------------------------------------------------ + { + Name: "expired CR with existing slot: slot deleted, CR marked inactive", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + ExistingReservations: []*v1alpha1.Reservation{ + intgExistingReservation("cr-expire-0", "uuid-intg-0012"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCR("cr-expire", "uuid-intg-0012", v1alpha1.CommitmentStatusExpired), + }, + ExpectedSlots: 0, + ExpiredCRs: []string{"cr-expire"}, + }, + { + Name: "superseded CR with existing slot: slot deleted, CR marked inactive", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + ExistingReservations: []*v1alpha1.Reservation{ + intgExistingReservation("cr-supersede-0", "uuid-intg-0013"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCR("cr-supersede", "uuid-intg-0013", v1alpha1.CommitmentStatusSuperseded), + }, + ExpectedSlots: 0, + SupersededCRs: []string{"cr-supersede"}, + }, + // ------------------------------------------------------------------ + // Spec validation: unknown flavor group + // ------------------------------------------------------------------ + { + // ApplyCommitmentState returns "flavor group not found" which triggers + // rollback+Rejected (AllowRejection=true); no child slots are ever created. + Name: "unknown flavor group: CR rejected, no slots created", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCRUnknownFlavorGroup("cr-unk", "uuid-intg-0014", v1alpha1.CommitmentStatusConfirmed), + }, + ExpectedSlots: 0, + RejectedCRs: []string{"cr-unk"}, + }, + // ------------------------------------------------------------------ + // Partial placement: first slot placed, second slot rejected + // ------------------------------------------------------------------ + { + // 8 GiB CR needs 2 slots. 
Scheduler accepts the first call (slot 0 placed) + // then rejects the second (slot 1 gets NoHostsFound). With AllowRejection=true + // the CR controller rolls back: deletes both slots and sets Rejected. + Name: "partial placement: first slot placed, second slot rejected, CR rolled back", + Hypervisors: []*hv1.Hypervisor{ + intgHypervisor("host-1"), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + intgCRAmountAllowRejection("cr-partial", "uuid-intg-0015", v1alpha1.CommitmentStatusConfirmed, "8Gi"), + }, + SchedulerAcceptFirst: 1, + ExpectedSlots: 0, + RejectedCRs: []string{"cr-partial"}, + }, + } + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + runCRIntegrationTestCase(t, tc) + }) + } +} + +// ============================================================================ +// Runner +// ============================================================================ + +func runCRIntegrationTestCase(t *testing.T, tc CRIntegrationTestCase) { + t.Helper() + + schedulerFn := intgAcceptScheduler + switch { + case tc.SchedulerAcceptFirst > 0: + schedulerFn = intgAcceptFirstScheduler(tc.SchedulerAcceptFirst) + case tc.SchedulerRejects: + schedulerFn = intgRejectScheduler + } + + objects := []client.Object{newTestFlavorKnowledge()} + for _, hv := range tc.Hypervisors { + objects = append(objects, hv) + } + for _, res := range tc.ExistingReservations { + objects = append(objects, res) + } + + env := newIntgEnv(t, objects, schedulerFn) + defer env.close() + + crNames := make([]string, len(tc.CommittedResources)) + for i, cr := range tc.CommittedResources { + if err := env.k8sClient.Create(context.Background(), cr); err != nil { + t.Fatalf("create CR %s: %v", cr.Name, err) + } + crNames[i] = cr.Name + } + + intgDriveToTerminal(t, env, crNames) + + // Assert total reservation slot count. + var resList v1alpha1.ReservationList + if err := env.k8sClient.List(context.Background(), &resList, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + t.Fatalf("list reservations: %v", err) + } + if len(resList.Items) != tc.ExpectedSlots { + t.Errorf("reservation slots: want %d, got %d", tc.ExpectedSlots, len(resList.Items)) + } + + // Assert CR conditions. + intgAssertCRCondition(t, env.k8sClient, tc.AcceptedCRs, metav1.ConditionTrue, v1alpha1.CommittedResourceReasonAccepted) + intgAssertCRCondition(t, env.k8sClient, tc.RejectedCRs, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonRejected) + intgAssertCRCondition(t, env.k8sClient, tc.PlannedCRs, metav1.ConditionFalse, v1alpha1.CommittedResourceReasonPlanned) + intgAssertCRCondition(t, env.k8sClient, tc.ExpiredCRs, metav1.ConditionFalse, string(v1alpha1.CommitmentStatusExpired)) + intgAssertCRCondition(t, env.k8sClient, tc.SupersededCRs, metav1.ConditionFalse, string(v1alpha1.CommitmentStatusSuperseded)) +} + +// ============================================================================ +// Integration environment +// ============================================================================ + +type intgEnv struct { + k8sClient client.Client + crController *CommittedResourceController + resController *CommitmentReservationController + schedulerSrv *httptest.Server +} + +func newIntgEnv(t *testing.T, initialObjects []client.Object, schedulerFn http.HandlerFunc) *intgEnv { + t.Helper() + scheme := newCRTestScheme(t) + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(initialObjects...). 
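+		// Status subresources and field indexes must be registered by hand on the
+		// fake client; the two indexes mirror the ones IndexFields sets up for the
+		// real manager.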
+ WithStatusSubresource( + &v1alpha1.CommittedResource{}, + &v1alpha1.Reservation{}, + &v1alpha1.Knowledge{}, + ). + WithIndex(&v1alpha1.Reservation{}, idxReservationByCommitmentUUID, func(obj client.Object) []string { + res, ok := obj.(*v1alpha1.Reservation) + if !ok || res.Spec.CommittedResourceReservation == nil || res.Spec.CommittedResourceReservation.CommitmentUUID == "" { + return nil + } + return []string{res.Spec.CommittedResourceReservation.CommitmentUUID} + }). + WithIndex(&v1alpha1.CommittedResource{}, idxCommittedResourceByUUID, func(obj client.Object) []string { + cr, ok := obj.(*v1alpha1.CommittedResource) + if !ok || cr.Spec.CommitmentUUID == "" { + return nil + } + return []string{cr.Spec.CommitmentUUID} + }). + Build() + + schedulerSrv := httptest.NewServer(schedulerFn) + + crCtrl := &CommittedResourceController{ + Client: k8sClient, + Scheme: scheme, + Conf: CommittedResourceControllerConfig{RequeueIntervalRetry: metav1.Duration{Duration: 5 * time.Minute}}, + } + resCtrl := &CommitmentReservationController{ + Client: k8sClient, + Scheme: scheme, + Conf: ReservationControllerConfig{ + SchedulerURL: schedulerSrv.URL, + AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute}, + RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute}, + }, + } + if err := resCtrl.Init(context.Background(), resCtrl.Conf); err != nil { + t.Fatalf("resCtrl.Init: %v", err) + } + return &intgEnv{k8sClient: k8sClient, crController: crCtrl, resController: resCtrl, schedulerSrv: schedulerSrv} +} + +func (e *intgEnv) close() { e.schedulerSrv.Close() } + +// ============================================================================ +// Reconcile driver +// ============================================================================ + +// intgDriveToTerminal runs reconcile passes until every named CR has a terminal +// condition or the 5 s deadline is reached. +// +// One pass: +// 1. CR controller (adds finalizer / creates Reservation CRDs / handles inactive states) +// 2. Reservation controller ×2 per slot (first call sets TargetHost, second sets Ready=True) +// 3. CR controller again (picks up placement outcomes: Accepted or Rejected) +func intgDriveToTerminal(t *testing.T, env *intgEnv, crNames []string) { + t.Helper() + ctx := context.Background() + deadline := time.Now().Add(5 * time.Second) + + for { + if time.Now().After(deadline) { + for _, name := range crNames { + var cr v1alpha1.CommittedResource + if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr); err == nil { + t.Logf("CR %s: conditions=%v", name, cr.Status.Conditions) + } + } + t.Fatal("timed out waiting for CRs to reach terminal state") + } + + allDone := true + for _, name := range crNames { + var cr v1alpha1.CommittedResource + if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr); err != nil { + continue // deleted = done + } + if !intgIsTerminalCR(cr) { + allDone = false + } + } + if allDone { + return + } + + // Pass 1: CR controller. + for _, name := range crNames { + var cr v1alpha1.CommittedResource + if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr); err != nil { + continue + } + if intgIsTerminalCR(cr) { + continue + } + env.crController.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: name}}) //nolint:errcheck + } + + // Pass 2: Reservation controller (two reconciles per slot). 
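+		// Two reconciles per slot, per the driver contract above: the first call
+		// performs placement and writes the target host, the second observes the
+		// placed spec and sets the Ready condition.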
+		var resList v1alpha1.ReservationList
+		env.k8sClient.List(ctx, &resList, client.MatchingLabels{ //nolint:errcheck
+			v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
+		})
+		for _, res := range resList.Items {
+			if intgIsTerminalReservation(res) {
+				continue
+			}
+			req := ctrl.Request{NamespacedName: types.NamespacedName{Name: res.Name}}
+			env.resController.Reconcile(ctx, req) //nolint:errcheck
+			env.resController.Reconcile(ctx, req) //nolint:errcheck
+		}
+
+		// Pass 3: CR controller picks up Reservation outcomes.
+		for _, name := range crNames {
+			var cr v1alpha1.CommittedResource
+			if err := env.k8sClient.Get(ctx, types.NamespacedName{Name: name}, &cr); err != nil {
+				continue
+			}
+			if intgIsTerminalCR(cr) {
+				continue
+			}
+			env.crController.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: name}}) //nolint:errcheck
+		}
+	}
+}
+
+func intgIsTerminalCR(cr v1alpha1.CommittedResource) bool {
+	if !cr.DeletionTimestamp.IsZero() {
+		return false // needs one more reconcile to remove its finalizer
+	}
+	cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+	if cond == nil {
+		return false
+	}
+	if cond.Status == metav1.ConditionTrue {
+		return true
+	}
+	return cond.Reason == v1alpha1.CommittedResourceReasonRejected ||
+		cond.Reason == v1alpha1.CommittedResourceReasonPlanned ||
+		cond.Reason == string(v1alpha1.CommitmentStatusExpired) ||
+		cond.Reason == string(v1alpha1.CommitmentStatusSuperseded)
+}
+
+// intgIsTerminalReservation returns true once the Reservation controller has set the
+// Ready condition (True after placement, False after rejection).
+func intgIsTerminalReservation(res v1alpha1.Reservation) bool {
+	return meta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady) != nil
+}
+
+// ============================================================================
+// Assertion helpers
+// ============================================================================
+
+func intgAssertCRCondition(t *testing.T, k8sClient client.Client, crNames []string, wantStatus metav1.ConditionStatus, wantReason string) {
+	t.Helper()
+	for _, name := range crNames {
+		var cr v1alpha1.CommittedResource
+		if err := k8sClient.Get(context.Background(), types.NamespacedName{Name: name}, &cr); err != nil {
+			t.Errorf("CR %s not found: %v", name, err)
+			continue
+		}
+		cond := meta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady)
+		if cond == nil {
+			t.Errorf("CR %s: no Ready condition", name)
+			continue
+		}
+		if cond.Status != wantStatus || cond.Reason != wantReason {
+			t.Errorf("CR %s: want Ready=%s/Reason=%s, got Ready=%s/Reason=%s", name, wantStatus, wantReason, cond.Status, cond.Reason)
+		}
+	}
+}
+
+// ============================================================================
+// Scheduler handlers
+// ============================================================================
+
+func intgAcceptScheduler(w http.ResponseWriter, r *http.Request) {
+	resp := &schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{"host-1"}}
+	json.NewEncoder(w).Encode(resp) //nolint:errcheck
+}
+
+func intgRejectScheduler(w http.ResponseWriter, r *http.Request) {
+	resp := &schedulerdelegationapi.ExternalSchedulerResponse{Hosts: []string{}}
+	json.NewEncoder(w).Encode(resp) //nolint:errcheck
+}
+
+// intgAcceptFirstScheduler returns a handler that accepts the first count placement calls
+// and rejects all subsequent ones.
Uses an atomic counter so concurrent calls are safe. +func intgAcceptFirstScheduler(count int) http.HandlerFunc { + var calls atomic.Int32 + return func(w http.ResponseWriter, r *http.Request) { + if int(calls.Add(1)) <= count { + intgAcceptScheduler(w, r) + } else { + intgRejectScheduler(w, r) + } + } +} + +// intgRejectFirstScheduler returns a handler that rejects the first count placement calls +// and accepts all subsequent ones. Used to test AllowRejection=false retry-until-success paths. +func intgRejectFirstScheduler(count int) http.HandlerFunc { + var calls atomic.Int32 + return func(w http.ResponseWriter, r *http.Request) { + if int(calls.Add(1)) <= count { + intgRejectScheduler(w, r) + } else { + intgAcceptScheduler(w, r) + } + } +} + +// ============================================================================ +// Test object builders +// ============================================================================ + +// intgHypervisor returns a minimal Hypervisor with the given name. +func intgHypervisor(name string) *hv1.Hypervisor { + return &hv1.Hypervisor{ObjectMeta: metav1.ObjectMeta{Name: name}} +} + +// intgCR returns a CommittedResource with the default 4 GiB amount. +// commitmentUUID must be unique per test case to avoid field-index collisions. +func intgCR(name, commitmentUUID string, state v1alpha1.CommitmentStatus) *v1alpha1.CommittedResource { + cr := newTestCommittedResource(name, state) + cr.Spec.CommitmentUUID = commitmentUUID + return cr +} + +// intgCRAmount returns a CommittedResource with a custom amount string (e.g. "8Gi"). +func intgCRAmount(name, commitmentUUID string, state v1alpha1.CommitmentStatus, amount string) *v1alpha1.CommittedResource { + cr := intgCR(name, commitmentUUID, state) + cr.Spec.Amount = resource.MustParse(amount) + return cr +} + +// intgCRAllowRejection returns a CommittedResource with AllowRejection=true so the +// controller rolls back and sets Rejected (rather than retrying indefinitely). +func intgCRAllowRejection(name, commitmentUUID string, state v1alpha1.CommitmentStatus) *v1alpha1.CommittedResource { + cr := intgCR(name, commitmentUUID, state) + cr.Spec.AllowRejection = true + return cr +} + +// intgCRAmountAllowRejection returns a CommittedResource with a custom amount and AllowRejection=true. +func intgCRAmountAllowRejection(name, commitmentUUID string, state v1alpha1.CommitmentStatus, amount string) *v1alpha1.CommittedResource { + cr := intgCRAmount(name, commitmentUUID, state, amount) + cr.Spec.AllowRejection = true + return cr +} + +// intgCRUnknownFlavorGroup returns a CommittedResource referencing a flavor group +// that does not exist in the Knowledge CRD, with AllowRejection=true so the +// controller reaches Rejected rather than retrying indefinitely. +func intgCRUnknownFlavorGroup(name, commitmentUUID string, state v1alpha1.CommitmentStatus) *v1alpha1.CommittedResource { + cr := intgCRAllowRejection(name, commitmentUUID, state) + cr.Spec.FlavorGroupName = "nonexistent-group" + return cr +} + +// intgExistingReservation returns a pre-placed Reservation tied to the given commitment UUID, +// used to verify that expiry/supersede paths delete children. 
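+//
+// The type label is load-bearing: the drive loop and the slot-count assertion
+// both list reservations via LabelReservationType, so a slot without it would
+// be invisible to the test.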
+func intgExistingReservation(name, commitmentUUID string) *v1alpha1.Reservation { + return &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }, + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + CommitmentUUID: commitmentUUID, + }, + }, + } +} diff --git a/internal/scheduling/reservations/commitments/reservation_controller.go b/internal/scheduling/reservations/commitments/reservation_controller.go index 312a65530..96d86aeb2 100644 --- a/internal/scheduling/reservations/commitments/reservation_controller.go +++ b/internal/scheduling/reservations/commitments/reservation_controller.go @@ -37,11 +37,24 @@ type CommitmentReservationController struct { // Kubernetes scheme to use for the reservations. Scheme *runtime.Scheme // Configuration for the controller. - Conf Config + Conf ReservationControllerConfig // SchedulerClient for making scheduler API calls. SchedulerClient *reservations.SchedulerClient } +// echoParentGeneration copies Spec.CommittedResourceReservation.ParentGeneration to +// Status.CommittedResourceReservation.ObservedParentGeneration so the CommittedResource +// controller can confirm this reservation was processed for the current CR generation. +func echoParentGeneration(res *v1alpha1.Reservation) { + if res.Spec.CommittedResourceReservation == nil { + return + } + if res.Status.CommittedResourceReservation == nil { + res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{} + } + res.Status.CommittedResourceReservation.ObservedParentGeneration = res.Spec.CommittedResourceReservation.ParentGeneration +} + // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. // Note: This controller only handles commitment reservations, as filtered by the predicate. @@ -76,6 +89,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr Reason: "MissingResourceName", Message: "reservation has no resource name", }) + echoParentGeneration(&res) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion @@ -92,6 +106,19 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr if res.IsReady() { logger.V(1).Info("reservation is active, verifying allocations") + // Sync ObservedParentGeneration if the CR controller bumped ParentGeneration since + // the last time this reservation was processed (e.g. after a spec update). Without + // this patch the CR controller would spin in Reserving forever for already-ready slots. 
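+		// Schematically, with an illustrative generation number:
+		//
+		//	CR Generation=4
+		//	  -> slot Spec.CommittedResourceReservation.ParentGeneration=4 (CR controller)
+		//	  -> slot Status.CommittedResourceReservation.ObservedParentGeneration=4 (this patch)
+		//	  -> CR controller sees all slots up-to-date and can accept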
+ if res.Spec.CommittedResourceReservation != nil && + (res.Status.CommittedResourceReservation == nil || + res.Status.CommittedResourceReservation.ObservedParentGeneration != res.Spec.CommittedResourceReservation.ParentGeneration) { + old := res.DeepCopy() + echoParentGeneration(&res) + if err := r.Status().Patch(ctx, &res, client.MergeFrom(old)); client.IgnoreNotFound(err) != nil { + return ctrl.Result{}, err + } + } + // Verify all allocations in Spec against actual VM state result, err := r.reconcileAllocations(ctx, &res) if err != nil { @@ -102,9 +129,9 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr // Requeue with appropriate interval based on allocation state // Use shorter interval if there are allocations in grace period for faster verification if result.HasAllocationsInGracePeriod { - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalGracePeriod}, nil + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalGracePeriod.Duration}, nil } - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalActive}, nil + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalActive.Duration}, nil } // TODO trigger re-placement of unused reservations over time @@ -126,6 +153,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr Reason: "PreAllocated", Message: "reservation pre-allocated with VM allocations", }) + echoParentGeneration(&res) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion @@ -155,6 +183,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr Reason: "ReservationActive", Message: "reservation is successfully scheduled", }) + echoParentGeneration(&res) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion @@ -189,7 +218,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr logger.Info("flavor knowledge not ready, requeueing", "resourceName", resourceName, "error", err) - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, nil + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, nil } // Search for the flavor across all flavor groups @@ -224,11 +253,12 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr Reason: "NoHostsAvailable", Message: "no hypervisors available for scheduling", }) + echoParentGeneration(&res) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { return ctrl.Result{}, client.IgnoreNotFound(err) } - return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry}, nil + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalRetry.Duration}, nil } // Select appropriate pipeline based on flavor group @@ -271,6 +301,7 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr Reason: "NoHostsFound", Message: "no hosts found for reservation", }) + echoParentGeneration(&res) patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, &res, patch); err != nil { // Ignore not-found errors during background deletion @@ -370,7 +401,7 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte for vmUUID, allocation := range res.Spec.CommittedResourceReservation.Allocations { allocationAge := now.Sub(allocation.CreationTimestamp.Time) - isInGracePeriod := allocationAge < r.Conf.AllocationGracePeriod + 
isInGracePeriod := allocationAge < r.Conf.AllocationGracePeriod.Duration if isInGracePeriod { // New allocation: VM may not yet appear in the HV CRD (still spawning). @@ -395,7 +426,7 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte "reservation", res.Name, "expectedHost", expectedHost, "allocationAge", allocationAge, - "gracePeriod", r.Conf.AllocationGracePeriod) + "gracePeriod", r.Conf.AllocationGracePeriod.Duration) } } @@ -498,11 +529,9 @@ func (r *CommitmentReservationController) hypervisorToReservations(ctx context.C } // Init initializes the reconciler with required clients and DB connection. -func (r *CommitmentReservationController) Init(ctx context.Context, client client.Client, conf Config) error { - // Initialize scheduler client +func (r *CommitmentReservationController) Init(ctx context.Context, conf ReservationControllerConfig) error { r.SchedulerClient = reservations.NewSchedulerClient(conf.SchedulerURL) logf.FromContext(ctx).Info("scheduler client initialized for commitment reservation controller", "url", conf.SchedulerURL) - return nil } @@ -543,7 +572,7 @@ var commitmentReservationPredicate = predicate.Funcs{ // SetupWithManager sets up the controller with the Manager. func (r *CommitmentReservationController) SetupWithManager(mgr ctrl.Manager, mcl *multicluster.Client) error { if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { - if err := r.Init(ctx, mgr.GetClient(), r.Conf); err != nil { + if err := r.Init(ctx, r.Conf); err != nil { return err } return nil @@ -580,7 +609,10 @@ func (r *CommitmentReservationController) SetupWithManager(mgr ctrl.Manager, mcl return bldr.Named("commitment-reservation"). WithOptions(controller.Options{ - // We want to process reservations one at a time to avoid overbooking. + // MaxConcurrentReconciles=1: conservative default. Note that this does NOT prevent + // the cache-staleness race where two back-to-back reconciles both pick the same host + // before the first write is visible to the capacity filter — that requires pessimistic + // blocking at the scheduler level. MaxConcurrentReconciles: 1, }). 
Complete(r) diff --git a/internal/scheduling/reservations/commitments/reservation_controller_test.go b/internal/scheduling/reservations/commitments/reservation_controller_test.go index 7c0d63ee7..df6316d46 100644 --- a/internal/scheduling/reservations/commitments/reservation_controller_test.go +++ b/internal/scheduling/reservations/commitments/reservation_controller_test.go @@ -80,8 +80,8 @@ func TestCommitmentReservationController_Reconcile(t *testing.T) { reconciler := &CommitmentReservationController{ Client: k8sClient, Scheme: scheme, - Conf: Config{ - RequeueIntervalActive: 5 * time.Minute, + Conf: ReservationControllerConfig{ + RequeueIntervalActive: metav1.Duration{Duration: 5 * time.Minute}, }, } @@ -139,7 +139,7 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) { recentTime := metav1.NewTime(now.Add(-5 * time.Minute)) // 5 minutes ago (within grace period) oldTime := metav1.NewTime(now.Add(-30 * time.Minute)) // 30 minutes ago (past grace period) - config := Config{AllocationGracePeriod: 15 * time.Minute} + config := ReservationControllerConfig{AllocationGracePeriod: metav1.Duration{Duration: 15 * time.Minute}} tests := []struct { name string @@ -474,7 +474,7 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t })) defer server.Close() - config := Config{ + config := ReservationControllerConfig{ SchedulerURL: server.URL, } @@ -485,7 +485,7 @@ func TestCommitmentReservationController_reconcileInstanceReservation_Success(t } // Initialize the reconciler (this sets up SchedulerClient) - if err := reconciler.Init(context.Background(), k8sClient, config); err != nil { + if err := reconciler.Init(context.Background(), config); err != nil { t.Fatalf("Failed to initialize reconciler: %v", err) } diff --git a/internal/scheduling/reservations/commitments/reservation_manager.go b/internal/scheduling/reservations/commitments/reservation_manager.go index d7a75cc7a..d1fa28fda 100644 --- a/internal/scheduling/reservations/commitments/reservation_manager.go +++ b/internal/scheduling/reservations/commitments/reservation_manager.go @@ -25,6 +25,9 @@ type ApplyResult struct { Deleted int // Repaired is the number of reservations repaired (metadata sync or recreated due to wrong config) Repaired int + // TotalSlots is the total number of reservation slots that should exist after the apply. + // Used by the CR controller to wait for the correct number of children in the cache. 
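+	// (Computed as the number of pre-existing slots plus Created.)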
+ TotalSlots int // TouchedReservations are reservations that were created or updated TouchedReservations []v1alpha1.Reservation // RemovedReservations are reservations that were deleted @@ -92,6 +95,9 @@ func (m *ReservationManager) ApplyCommitmentState( if !exists { return nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName) } + if len(flavorGroup.Flavors) == 0 { + return nil, fmt.Errorf("flavor group %s has no flavors", desiredState.FlavorGroupName) + } deltaMemoryBytes := desiredState.TotalMemoryBytes for _, res := range existing { memoryQuantity := res.Spec.Resources[hv1.ResourceMemory] @@ -210,6 +216,7 @@ func (m *ReservationManager) ApplyCommitmentState( "total", len(existing)+result.Created) } + result.TotalSlots = len(existing) + result.Created return result, nil } @@ -225,7 +232,8 @@ func (m *ReservationManager) syncReservationMetadata( if (state.CommitmentUUID != "" && reservation.Spec.CommittedResourceReservation.CommitmentUUID != state.CommitmentUUID) || (state.AvailabilityZone != "" && reservation.Spec.AvailabilityZone != state.AvailabilityZone) || (state.StartTime != nil && (reservation.Spec.StartTime == nil || !reservation.Spec.StartTime.Time.Equal(*state.StartTime))) || - (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) { + (state.EndTime != nil && (reservation.Spec.EndTime == nil || !reservation.Spec.EndTime.Time.Equal(*state.EndTime))) || + (state.ParentGeneration != 0 && reservation.Spec.CommittedResourceReservation.ParentGeneration != state.ParentGeneration) { // Apply patch logger.V(1).Info("syncing reservation metadata", "reservation", reservation.Name, @@ -236,6 +244,9 @@ func (m *ReservationManager) syncReservationMetadata( if state.CommitmentUUID != "" { reservation.Spec.CommittedResourceReservation.CommitmentUUID = state.CommitmentUUID } + if state.ParentGeneration != 0 { + reservation.Spec.CommittedResourceReservation.ParentGeneration = state.ParentGeneration + } if state.AvailabilityZone != "" { reservation.Spec.AvailabilityZone = state.AvailabilityZone @@ -301,13 +312,14 @@ func (m *ReservationManager) newReservation( ), }, CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - ProjectID: state.ProjectID, - CommitmentUUID: state.CommitmentUUID, - DomainID: state.DomainID, - ResourceGroup: state.FlavorGroupName, - ResourceName: flavorInGroup.Name, - Creator: creator, - Allocations: nil, + ProjectID: state.ProjectID, + CommitmentUUID: state.CommitmentUUID, + DomainID: state.DomainID, + ResourceGroup: state.FlavorGroupName, + ResourceName: flavorInGroup.Name, + Creator: creator, + ParentGeneration: state.ParentGeneration, + Allocations: nil, }, } diff --git a/internal/scheduling/reservations/commitments/state.go b/internal/scheduling/reservations/commitments/state.go index 96ede88ac..149cdfc03 100644 --- a/internal/scheduling/reservations/commitments/state.go +++ b/internal/scheduling/reservations/commitments/state.go @@ -97,6 +97,13 @@ type CommitmentState struct { // When set (e.g. "-"), Reservation CRDs are named "". // Used by the CommittedResource controller; leave empty for the legacy syncer path. NamePrefix string + // ParentGeneration is the Generation of the parent CommittedResource CRD. Written into + // Reservation spec so the Reservation controller can echo it back in status, letting + // the CR controller detect when all children have been processed for the current spec. + // Zero for syncer-created reservations (no parent CR). 
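+	// (Kubernetes object generations start at 1, so 0 is a safe "unset" sentinel.)
+	// The echo happens in echoParentGeneration in reservation_controller.go.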
+ ParentGeneration int64 + // State is the lifecycle state from Limes (planned/pending/guaranteed/confirmed/superseded/expired). + State v1alpha1.CommitmentStatus } // FromCommitment converts Limes commitment to CommitmentState. @@ -144,6 +151,7 @@ func FromCommitment( AvailabilityZone: commitment.AvailabilityZone, StartTime: startTime, EndTime: endTime, + State: v1alpha1.CommitmentStatus(commitment.Status), }, nil } @@ -151,6 +159,7 @@ func FromCommitment( func FromChangeCommitmentTargetState( commitment liquid.Commitment, projectID string, + domainID string, flavorGroupName string, flavorGroup compute.FlavorGroupFeature, az string, @@ -166,8 +175,8 @@ func FromChangeCommitmentTargetState( var endTime *time.Time switch commitment.NewStatus.UnwrapOr("none") { - // guaranteed and confirmed commitments are honored with start time now - case liquid.CommitmentStatusGuaranteed, liquid.CommitmentStatusConfirmed: + // pending, guaranteed, confirmed commitments are honored with Reservation slots. + case liquid.CommitmentStatusPending, liquid.CommitmentStatusGuaranteed, liquid.CommitmentStatusConfirmed: amountMultiple = commitment.Amount // Set start time: use ConfirmBy if available (when the commitment was confirmed), // otherwise use time.Now() for immediate confirmation @@ -187,7 +196,7 @@ func FromChangeCommitmentTargetState( if !commitment.ExpiresAt.IsZero() { endTime = &commitment.ExpiresAt // check expiry time - if commitment.ExpiresAt.Before(time.Now()) || commitment.ExpiresAt.Equal(time.Now()) { + if !commitment.ExpiresAt.After(time.Now()) { // commitment is already expired, ignore capacity amountMultiple = 0 } @@ -203,11 +212,13 @@ func FromChangeCommitmentTargetState( return &CommitmentState{ CommitmentUUID: string(commitment.UUID), ProjectID: projectID, + DomainID: domainID, FlavorGroupName: flavorGroupName, TotalMemoryBytes: totalMemoryBytes, AvailabilityZone: az, StartTime: startTime, EndTime: endTime, + State: v1alpha1.CommitmentStatus(commitment.NewStatus.UnwrapOr("")), }, nil } diff --git a/internal/scheduling/reservations/commitments/syncer.go b/internal/scheduling/reservations/commitments/syncer.go index 60c450b9a..8d3a43adf 100644 --- a/internal/scheduling/reservations/commitments/syncer.go +++ b/internal/scheduling/reservations/commitments/syncer.go @@ -13,7 +13,10 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) var ( @@ -30,21 +33,6 @@ type SyncerConfig struct { SyncInterval time.Duration `json:"committedResourceSyncInterval"` } -func DefaultSyncerConfig() SyncerConfig { - return SyncerConfig{ - SyncInterval: time.Hour, - } -} - -// ApplyDefaults fills in any unset values with defaults. 
-func (c *SyncerConfig) ApplyDefaults() { - defaults := DefaultSyncerConfig() - if c.SyncInterval == 0 { - c.SyncInterval = defaults.SyncInterval - } - // Note: KeystoneSecretRef and SSOSecretRef are not defaulted as they require explicit configuration -} - type Syncer struct { // Client to fetch commitments from Limes CommitmentsClient @@ -97,11 +85,6 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo skippedUUIDs: make(map[string]bool), } for id, commitment := range commitments { - // Record each commitment seen from Limes - if s.monitor != nil { - s.monitor.RecordCommitmentSeen() - } - if commitment.ServiceType != "compute" { log.Info("skipping non-compute commitment", "id", id, "serviceType", commitment.ServiceType) if s.monitor != nil { @@ -110,12 +93,19 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo continue } - // Only process commitments that are active (confirmed or guaranteed). - // planned/pending are not yet accepted by Cortex; superseded/expired are done. - if commitment.Status != "confirmed" && commitment.Status != "guaranteed" { - log.Info("skipping non-active commitment", "id", id, "status", commitment.Status) - if s.monitor != nil { - s.monitor.RecordCommitmentSkipped(SkipReasonNonActive) + // Validate that the commitment state is a known enum value. + switch v1alpha1.CommitmentStatus(commitment.Status) { + case v1alpha1.CommitmentStatusPlanned, + v1alpha1.CommitmentStatusPending, + v1alpha1.CommitmentStatusGuaranteed, + v1alpha1.CommitmentStatusConfirmed, + v1alpha1.CommitmentStatusSuperseded, + v1alpha1.CommitmentStatusExpired: + // valid, continue processing + default: + log.Info("skipping commitment with unknown status", "id", id, "status", commitment.Status) + if commitment.UUID != "" { + result.skippedUUIDs[commitment.UUID] = true } continue } @@ -194,11 +184,6 @@ func (s *Syncer) getCommitmentStates(ctx context.Context, log logr.Logger, flavo "totalMemoryBytes", state.TotalMemoryBytes) result.states = append(result.states, state) - - // Record successfully processed commitment - if s.monitor != nil { - s.monitor.RecordCommitmentProcessed() - } } return result, nil @@ -215,16 +200,21 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { logger.Info("starting commitment sync") - // Record sync run - if s.monitor != nil { - s.monitor.RecordSyncRun() - } + startTime := time.Now() + defer func() { + if s.monitor != nil { + s.monitor.RecordDuration(time.Since(startTime).Seconds()) + } + }() // Check if flavor group knowledge is ready knowledge := &reservations.FlavorGroupKnowledgeClient{Client: s.Client} knowledgeCRD, err := knowledge.Get(ctx) if err != nil { logger.Error(err, "failed to check flavor group knowledge readiness") + if s.monitor != nil { + s.monitor.RecordError() + } return err } if knowledgeCRD == nil { @@ -236,6 +226,9 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, knowledgeCRD) if err != nil { logger.Error(err, "failed to get flavor groups from knowledge") + if s.monitor != nil { + s.monitor.RecordError() + } return err } @@ -243,42 +236,48 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { commitmentResult, err := s.getCommitmentStates(ctx, logger, flavorGroups) if err != nil { logger.Error(err, "failed to get compute commitments") + if s.monitor != nil { + s.monitor.RecordError() + } return err } - // Create ReservationManager to handle state application - manager := 
NewReservationManager(s.Client) + if s.monitor != nil { + s.monitor.SetLimesCommitmentsActive(len(commitmentResult.states)) + } - // Apply each commitment state using the manager - var totalCreated, totalDeleted, totalRepaired int + // Upsert CommittedResource CRDs for each commitment + var totalCreated, totalUpdated int for _, state := range commitmentResult.states { - logger.Info("applying commitment state", + logger.Info("upserting committed resource CRD", "commitmentUUID", state.CommitmentUUID, "projectID", state.ProjectID, "flavorGroup", state.FlavorGroupName, - "totalMemoryBytes", state.TotalMemoryBytes) - - applyResult, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, CreatorValue) + "state", state.State) + + var ( + op controllerutil.OperationResult + err error + ) + if isTerminalCommitment(state) { + // Terminal commitments (superseded/expired state, or EndTime in the past): update + // existing CRD so the controller can clean up Reservations, but do not create a + // new one — if no CRD exists locally there are no Reservation slots to clean up. + op, err = s.updateCommittedResourceIfExists(ctx, logger, state) + } else { + op, err = s.upsertCommittedResource(ctx, logger, state) + } if err != nil { - logger.Error(err, "failed to apply commitment state", + logger.Error(err, "failed to upsert committed resource CRD", "commitmentUUID", state.CommitmentUUID) - // Continue with other commitments even if one fails continue } - - totalCreated += applyResult.Created - totalDeleted += applyResult.Deleted - totalRepaired += applyResult.Repaired - } - - // Delete reservations that are no longer in commitments - // Only query committed resource reservations using labels for efficiency - var existingReservations v1alpha1.ReservationList - if err := s.List(ctx, &existingReservations, client.MatchingLabels{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }); err != nil { - logger.Error(err, "failed to list existing committed resource reservations") - return err + switch op { + case controllerutil.OperationResultCreated: + totalCreated++ + case controllerutil.OperationResultUpdated: + totalUpdated++ + } } // Build set of commitment UUIDs we should have (processed + skipped) @@ -286,51 +285,173 @@ func (s *Syncer) SyncReservations(ctx context.Context) error { for _, state := range commitmentResult.states { activeCommitments[state.CommitmentUUID] = true } - // Also include skipped commitments - don't delete their CRDs for uuid := range commitmentResult.skippedUUIDs { activeCommitments[uuid] = true } - // Delete reservations for commitments that no longer exist - for _, existing := range existingReservations.Items { - // Extract commitment UUID from reservation name - commitmentUUID := extractCommitmentUUID(existing.Name) - if commitmentUUID == "" { - logger.Info("skipping reservation with unparseable name", "name", existing.Name) + // Count CommittedResource CRDs present locally but absent from Limes (do not delete — Limes + // responses may be transient and deleting active CRDs would drop Reservation slots). + // Also GC CRDs whose EndTime has passed: the commitment is over, the controller's finalizer + // will clean up child Reservations on deletion. 
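The two checks in the loop below are independent: absence from Limes only flags a CR as stale, while a past EndTime alone triggers GC. A compact restatement of the same rules (hypothetical helper, not part of this patch; the Nova scheduling-domain filter is omitted):

```go
package commitments // illustrative sketch only

import (
	"time"

	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
)

// classifyCR restates the syncer's bookkeeping: "stale" means Limes no longer
// reports the commitment but it has not expired yet (reported, never deleted);
// "gc" means the EndTime has passed (deleted, regardless of what Limes returns).
func classifyCR(cr *v1alpha1.CommittedResource, active map[string]bool, now time.Time) (stale, gc bool) {
	expired := cr.Spec.EndTime != nil && !cr.Spec.EndTime.After(now)
	return !active[cr.Spec.CommitmentUUID] && !expired, expired
}
```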
+ var existingCRs v1alpha1.CommittedResourceList + if err := s.List(ctx, &existingCRs); err != nil { + logger.Error(err, "failed to list existing committed resource CRDs") + if s.monitor != nil { + s.monitor.RecordError() + } + return err + } + staleCRCount, gcDeleted := 0, 0 + for i := range existingCRs.Items { + cr := &existingCRs.Items[i] + if cr.Spec.SchedulingDomain != v1alpha1.SchedulingDomainNova { continue } + isExpired := cr.Spec.EndTime != nil && !cr.Spec.EndTime.After(time.Now()) + if !activeCommitments[cr.Spec.CommitmentUUID] && !isExpired { + staleCRCount++ + } + if isExpired { + if err := s.Delete(ctx, cr); client.IgnoreNotFound(err) != nil { + logger.Error(err, "failed to GC expired committed resource CRD", "name", cr.Name) + return err + } + logger.Info("GC'd expired committed resource CRD", + "name", cr.Name, "endTime", cr.Spec.EndTime) + gcDeleted++ + } + } + // Delete orphaned Reservation CRDs: type=committed-resource but commitment no longer active. + // These are left over from the pre-refactor path where the syncer wrote Reservations directly. + var existingReservations v1alpha1.ReservationList + if err := s.List(ctx, &existingReservations, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + }); err != nil { + logger.Error(err, "failed to list committed resource reservations") + return err + } + var totalReservationDeleted int + for i := range existingReservations.Items { + res := &existingReservations.Items[i] + if res.Spec.CommittedResourceReservation == nil { + logger.Info("skipping reservation without committed resource spec", "name", res.Name) + continue + } + commitmentUUID := res.Spec.CommittedResourceReservation.CommitmentUUID + if commitmentUUID == "" { + logger.Info("skipping reservation with empty commitment UUID", "name", res.Name) + continue + } if !activeCommitments[commitmentUUID] { - // This commitment no longer exists, delete the reservation - if err := s.Delete(ctx, &existing); err != nil { - logger.Error(err, "failed to delete reservation", "name", existing.Name) + if err := s.Delete(ctx, res); client.IgnoreNotFound(err) != nil { + logger.Error(err, "failed to delete orphaned reservation", "name", res.Name) return err } - logger.Info("deleted reservation for expired commitment", - "name", existing.Name, - "commitmentUUID", commitmentUUID) - totalDeleted++ + logger.Info("deleted orphaned reservation", "name", res.Name, "commitmentUUID", commitmentUUID) + totalReservationDeleted++ } } - // Record reservation change metrics if s.monitor != nil { + s.monitor.RecordStaleCRs(staleCRCount) if totalCreated > 0 { - s.monitor.RecordReservationsCreated(totalCreated) + s.monitor.RecordCRCreates(totalCreated) } - if totalDeleted > 0 { - s.monitor.RecordReservationsDeleted(totalDeleted) + if totalUpdated > 0 { + s.monitor.RecordCRUpdates(totalUpdated) } - if totalRepaired > 0 { - s.monitor.RecordReservationsRepaired(totalRepaired) + if gcDeleted > 0 { + s.monitor.RecordCRDeletes(gcDeleted) } } - logger.Info("synced reservations", + if staleCRCount > 0 { + logger.Info("WARNING: committed resource CRDs present locally but absent from Limes — review for manual cleanup", + "staleCRs", staleCRCount) + } + + logger.Info("synced committed resource CRDs", "processedCount", len(commitmentResult.states), "skippedCount", len(commitmentResult.skippedUUIDs), "created", totalCreated, - "deleted", totalDeleted, - "repaired", totalRepaired) + "updated", totalUpdated, + "staleCRs", staleCRCount, + "expiredCRsGCd", gcDeleted, + 
"orphanReservationsDeleted", totalReservationDeleted) return nil } + +func (s *Syncer) applyCommittedResourceSpec(cr *v1alpha1.CommittedResource, state *CommitmentState) { + cr.Spec.CommitmentUUID = state.CommitmentUUID + cr.Spec.SchedulingDomain = v1alpha1.SchedulingDomainNova + cr.Spec.FlavorGroupName = state.FlavorGroupName + cr.Spec.ResourceType = v1alpha1.CommittedResourceTypeMemory + cr.Spec.Amount = *resource.NewQuantity(state.TotalMemoryBytes, resource.BinarySI) + cr.Spec.AvailabilityZone = state.AvailabilityZone + cr.Spec.ProjectID = state.ProjectID + cr.Spec.DomainID = state.DomainID + cr.Spec.State = state.State + cr.Spec.AllowRejection = false + + if state.StartTime != nil { + t := metav1.NewTime(*state.StartTime) + cr.Spec.StartTime = &t + } else { + cr.Spec.StartTime = nil + } + if state.EndTime != nil { + t := metav1.NewTime(*state.EndTime) + cr.Spec.EndTime = &t + } else { + cr.Spec.EndTime = nil + } +} + +func (s *Syncer) upsertCommittedResource(ctx context.Context, logger logr.Logger, state *CommitmentState) (controllerutil.OperationResult, error) { + cr := &v1alpha1.CommittedResource{} + cr.Name = "commitment-" + state.CommitmentUUID + + op, err := controllerutil.CreateOrUpdate(ctx, s.Client, cr, func() error { + s.applyCommittedResourceSpec(cr, state) + return nil + }) + if err != nil { + return op, err + } + logger.V(1).Info("upserted committed resource CRD", "name", cr.Name, "op", op) + return op, nil +} + +// updateCommittedResourceIfExists updates an existing CommittedResource CRD but does not +// create one if it is absent. Used for terminal states (superseded/expired): we want the +// controller to see the state transition and clean up child Reservations, but there is no +// point creating a CRD for a commitment Cortex has never tracked. +func (s *Syncer) updateCommittedResourceIfExists(ctx context.Context, logger logr.Logger, state *CommitmentState) (controllerutil.OperationResult, error) { + cr := &v1alpha1.CommittedResource{} + name := "commitment-" + state.CommitmentUUID + if err := s.Get(ctx, client.ObjectKey{Name: name}, cr); err != nil { + if client.IgnoreNotFound(err) == nil { + logger.V(1).Info("skipping terminal state — CRD does not exist locally", + "commitmentUUID", state.CommitmentUUID, "state", state.State) + return controllerutil.OperationResultNone, nil + } + return controllerutil.OperationResultNone, err + } + s.applyCommittedResourceSpec(cr, state) + if err := s.Update(ctx, cr); err != nil { + return controllerutil.OperationResultNone, err + } + logger.V(1).Info("updated committed resource CRD (terminal state)", "name", name, "state", state.State) + return controllerutil.OperationResultUpdated, nil +} + +// isTerminalCommitment returns true when a commitment should not result in new Reservation +// slots: either its Limes state is already terminal, or its EndTime has passed. 
+func isTerminalCommitment(state *CommitmentState) bool { + switch state.State { + case v1alpha1.CommitmentStatusSuperseded, v1alpha1.CommitmentStatusExpired: + return true + } + return state.EndTime != nil && !state.EndTime.After(time.Now()) +} diff --git a/internal/scheduling/reservations/commitments/syncer_monitor.go b/internal/scheduling/reservations/commitments/syncer_monitor.go index 853518f81..7e13478bb 100644 --- a/internal/scheduling/reservations/commitments/syncer_monitor.go +++ b/internal/scheduling/reservations/commitments/syncer_monitor.go @@ -14,60 +14,55 @@ const ( SkipReasonInvalidResource = "invalid_resource_name" SkipReasonEmptyUUID = "empty_uuid" SkipReasonNonCompute = "non_compute" - SkipReasonNonActive = "non_active" ) // SyncerMonitor provides metrics for the commitment syncer. type SyncerMonitor struct { - // Sync lifecycle - syncRuns prometheus.Counter - syncErrors prometheus.Counter - - // Commitment processing - commitmentsTotal prometheus.Counter // all commitments seen from Limes - commitmentsProcessed prometheus.Counter // successfully processed - commitmentsSkipped *prometheus.CounterVec // skipped with reason label - - // Reservation changes - reservationsCreated prometheus.Counter - reservationsDeleted prometheus.Counter - reservationsRepaired prometheus.Counter + syncErrors prometheus.Counter + syncDuration prometheus.Histogram + limesCommitmentsActive prometheus.Gauge + staleCRs prometheus.Gauge + commitmentsSkipped *prometheus.CounterVec + crCreates prometheus.Counter + crUpdates prometheus.Counter + crDeletes prometheus.Counter } // NewSyncerMonitor creates a new monitor with Prometheus metrics. func NewSyncerMonitor() *SyncerMonitor { m := &SyncerMonitor{ - syncRuns: prometheus.NewCounter(prometheus.CounterOpts{ - Name: "cortex_committed_resource_syncer_runs_total", - Help: "Total number of commitment syncer runs", - }), syncErrors: prometheus.NewCounter(prometheus.CounterOpts{ Name: "cortex_committed_resource_syncer_errors_total", - Help: "Total number of commitment syncer errors", + Help: "Total number of commitment syncer runs that failed", + }), + syncDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "cortex_committed_resource_syncer_duration_seconds", + Help: "Duration of each commitment syncer run", + Buckets: []float64{0.5, 1, 5, 10, 30, 60, 120}, }), - commitmentsTotal: prometheus.NewCounter(prometheus.CounterOpts{ - Name: "cortex_committed_resource_syncer_commitments_total", - Help: "Total number of commitments seen from Limes", + limesCommitmentsActive: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_syncer_limes_commitments_active", + Help: "Number of commitments from Limes that passed filtering and should have CR CRDs", }), - commitmentsProcessed: prometheus.NewCounter(prometheus.CounterOpts{ - Name: "cortex_committed_resource_syncer_commitments_processed_total", - Help: "Total number of commitments successfully processed", + staleCRs: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_syncer_crd_unmatched", + Help: "Number of CommittedResource CRDs present locally but absent from Limes", }), commitmentsSkipped: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "cortex_committed_resource_syncer_commitments_skipped_total", Help: "Total number of commitments skipped during sync", }, []string{"reason"}), - reservationsCreated: prometheus.NewCounter(prometheus.CounterOpts{ - Name: "cortex_committed_resource_syncer_reservations_created_total", - Help: "Total number of 
reservations created during sync", + crCreates: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_cr_creates_total", + Help: "Total number of CommittedResource CRDs created by the syncer", }), - reservationsDeleted: prometheus.NewCounter(prometheus.CounterOpts{ - Name: "cortex_committed_resource_syncer_reservations_deleted_total", - Help: "Total number of reservations deleted during sync", + crUpdates: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_cr_updates_total", + Help: "Total number of CommittedResource CRDs updated by the syncer", }), - reservationsRepaired: prometheus.NewCounter(prometheus.CounterOpts{ - Name: "cortex_committed_resource_syncer_reservations_repaired_total", - Help: "Total number of reservations repaired during sync (wrong metadata)", + crDeletes: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "cortex_committed_resource_syncer_cr_deletes_total", + Help: "Total number of CommittedResource CRDs deleted by the syncer (expired GC)", }), } @@ -78,7 +73,6 @@ func NewSyncerMonitor() *SyncerMonitor { SkipReasonInvalidResource, SkipReasonEmptyUUID, SkipReasonNonCompute, - SkipReasonNonActive, } { m.commitmentsSkipped.WithLabelValues(reason) } @@ -86,66 +80,58 @@ func NewSyncerMonitor() *SyncerMonitor { return m } -// RecordSyncRun records a syncer run. -func (m *SyncerMonitor) RecordSyncRun() { - m.syncRuns.Inc() +func (m *SyncerMonitor) RecordError() { + m.syncErrors.Inc() } -// RecordSyncError records a syncer error. -func (m *SyncerMonitor) RecordSyncError() { - m.syncErrors.Inc() +func (m *SyncerMonitor) RecordDuration(seconds float64) { + m.syncDuration.Observe(seconds) } -// RecordCommitmentSeen records a commitment seen from Limes. -func (m *SyncerMonitor) RecordCommitmentSeen() { - m.commitmentsTotal.Inc() +func (m *SyncerMonitor) SetLimesCommitmentsActive(count int) { + m.limesCommitmentsActive.Set(float64(count)) } -// RecordCommitmentProcessed records a commitment successfully processed. -func (m *SyncerMonitor) RecordCommitmentProcessed() { - m.commitmentsProcessed.Inc() +func (m *SyncerMonitor) RecordStaleCRs(count int) { + m.staleCRs.Set(float64(count)) } -// RecordCommitmentSkipped records a commitment skipped with a reason. func (m *SyncerMonitor) RecordCommitmentSkipped(reason string) { m.commitmentsSkipped.WithLabelValues(reason).Inc() } -// RecordReservationsCreated records reservations created. -func (m *SyncerMonitor) RecordReservationsCreated(count int) { - m.reservationsCreated.Add(float64(count)) +func (m *SyncerMonitor) RecordCRCreates(count int) { + m.crCreates.Add(float64(count)) } -// RecordReservationsDeleted records reservations deleted. -func (m *SyncerMonitor) RecordReservationsDeleted(count int) { - m.reservationsDeleted.Add(float64(count)) +func (m *SyncerMonitor) RecordCRUpdates(count int) { + m.crUpdates.Add(float64(count)) } -// RecordReservationsRepaired records reservations repaired. -func (m *SyncerMonitor) RecordReservationsRepaired(count int) { - m.reservationsRepaired.Add(float64(count)) +func (m *SyncerMonitor) RecordCRDeletes(count int) { + m.crDeletes.Add(float64(count)) } // Describe implements prometheus.Collector. 
func (m *SyncerMonitor) Describe(ch chan<- *prometheus.Desc) { - m.syncRuns.Describe(ch) m.syncErrors.Describe(ch) - m.commitmentsTotal.Describe(ch) - m.commitmentsProcessed.Describe(ch) + m.syncDuration.Describe(ch) + m.limesCommitmentsActive.Describe(ch) + m.staleCRs.Describe(ch) m.commitmentsSkipped.Describe(ch) - m.reservationsCreated.Describe(ch) - m.reservationsDeleted.Describe(ch) - m.reservationsRepaired.Describe(ch) + m.crCreates.Describe(ch) + m.crUpdates.Describe(ch) + m.crDeletes.Describe(ch) } // Collect implements prometheus.Collector. func (m *SyncerMonitor) Collect(ch chan<- prometheus.Metric) { - m.syncRuns.Collect(ch) m.syncErrors.Collect(ch) - m.commitmentsTotal.Collect(ch) - m.commitmentsProcessed.Collect(ch) + m.syncDuration.Collect(ch) + m.limesCommitmentsActive.Collect(ch) + m.staleCRs.Collect(ch) m.commitmentsSkipped.Collect(ch) - m.reservationsCreated.Collect(ch) - m.reservationsDeleted.Collect(ch) - m.reservationsRepaired.Collect(ch) + m.crCreates.Collect(ch) + m.crUpdates.Collect(ch) + m.crDeletes.Collect(ch) } diff --git a/internal/scheduling/reservations/commitments/syncer_monitor_test.go b/internal/scheduling/reservations/commitments/syncer_monitor_test.go index 853524a70..d973a95e9 100644 --- a/internal/scheduling/reservations/commitments/syncer_monitor_test.go +++ b/internal/scheduling/reservations/commitments/syncer_monitor_test.go @@ -36,14 +36,12 @@ func TestSyncerMonitor_MetricsRegistration(t *testing.T) { name string metricType dto.MetricType }{ - {"cortex_committed_resource_syncer_runs_total", dto.MetricType_COUNTER}, {"cortex_committed_resource_syncer_errors_total", dto.MetricType_COUNTER}, - {"cortex_committed_resource_syncer_commitments_total", dto.MetricType_COUNTER}, - {"cortex_committed_resource_syncer_commitments_processed_total", dto.MetricType_COUNTER}, {"cortex_committed_resource_syncer_commitments_skipped_total", dto.MetricType_COUNTER}, - {"cortex_committed_resource_syncer_reservations_created_total", dto.MetricType_COUNTER}, - {"cortex_committed_resource_syncer_reservations_deleted_total", dto.MetricType_COUNTER}, - {"cortex_committed_resource_syncer_reservations_repaired_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_cr_creates_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_cr_updates_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_cr_deletes_total", dto.MetricType_COUNTER}, + {"cortex_committed_resource_syncer_crd_unmatched", dto.MetricType_GAUGE}, } for _, tc := range cases { @@ -100,7 +98,6 @@ func TestSyncerMonitor_SkipReasonsPreInitialized(t *testing.T) { SkipReasonInvalidResource, SkipReasonEmptyUUID, SkipReasonNonCompute, - SkipReasonNonActive, } { if !presentReasons[reason] { t.Errorf("skip reason %q not pre-initialized in commitments_skipped_total", reason) diff --git a/internal/scheduling/reservations/commitments/syncer_test.go b/internal/scheduling/reservations/commitments/syncer_test.go index e30f286c7..28a464d1e 100644 --- a/internal/scheduling/reservations/commitments/syncer_test.go +++ b/internal/scheduling/reservations/commitments/syncer_test.go @@ -7,14 +7,15 @@ import ( "context" "sort" "testing" + "time" "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "k8s.io/apimachinery/pkg/api/resource" metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) @@ -273,44 +274,31 @@ func TestSyncer_SyncReservations_InstanceCommitments(t *testing.T) { return } - // Verify that reservations were created - var reservations v1alpha1.ReservationList - err = k8sClient.List(context.Background(), &reservations) - if err != nil { - t.Errorf("Failed to list reservations: %v", err) - return + // Verify one CommittedResource CRD was created with the correct spec + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list committed resources: %v", err) } - - // Should have 2 reservations (Amount = 2, each for smallest flavor) - if len(reservations.Items) != 2 { - t.Errorf("Expected 2 reservations, got %d", len(reservations.Items)) - return + if len(crList.Items) != 1 { + t.Fatalf("Expected 1 CommittedResource, got %d", len(crList.Items)) } - - // Verify the first reservation - res := reservations.Items[0] - if res.Spec.CommittedResourceReservation == nil { - t.Errorf("Expected CommittedResourceReservation to be set") - return + cr := crList.Items[0] + if cr.Name != "commitment-12345-67890-abcdef" { + t.Errorf("Expected name commitment-12345-67890-abcdef, got %s", cr.Name) } - if res.Spec.CommittedResourceReservation.ProjectID != "test-project-1" { - t.Errorf("Expected project ID test-project-1, got %v", res.Spec.CommittedResourceReservation.ProjectID) + if cr.Spec.ProjectID != "test-project-1" { + t.Errorf("Expected projectID test-project-1, got %s", cr.Spec.ProjectID) } - - if res.Spec.CommittedResourceReservation.ResourceGroup != "test_group_v1" { - t.Errorf("Expected resource group test_group_v1, got %v", res.Spec.CommittedResourceReservation.ResourceGroup) + if cr.Spec.FlavorGroupName != "test_group_v1" { + t.Errorf("Expected flavorGroupName test_group_v1, got %s", cr.Spec.FlavorGroupName) } - - // Check resource values - should be sized for the flavor that fits - // With 2048MB total capacity, we can fit 2x 1024MB flavors - expectedMemory := resource.MustParse("1073741824") // 1024MB in bytes - if !res.Spec.Resources[hv1.ResourceMemory].Equal(expectedMemory) { - t.Errorf("Expected memory %v, got %v", expectedMemory, res.Spec.Resources[hv1.ResourceMemory]) + if cr.Spec.State != v1alpha1.CommitmentStatusConfirmed { + t.Errorf("Expected state confirmed, got %s", cr.Spec.State) } - - expectedVCPUs := resource.MustParse("2") - if !res.Spec.Resources[hv1.ResourceCPU].Equal(expectedVCPUs) { - t.Errorf("Expected vCPUs %v, got %v", expectedVCPUs, res.Spec.Resources[hv1.ResourceCPU]) + // Amount = 2 slots × 1024 MiB = 2 GiB + expectedAmount := resource.NewQuantity(2*1024*1024*1024, resource.BinarySI) + if !cr.Spec.Amount.Equal(*expectedAmount) { + t.Errorf("Expected amount %v, got %v", expectedAmount, cr.Spec.Amount) } } @@ -320,7 +308,6 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { t.Fatalf("Failed to add scheme: %v", err) } - // Create flavor group knowledge CRD flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ "new_group_v1": { LargestFlavorName: "new-flavor", @@ -332,37 +319,26 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { }, }) - // Create an existing reservation with mismatched project/flavor group - // The ReservationManager will delete this and create a new one - existingReservation := 
&v1alpha1.Reservation{ - ObjectMeta: ctrl.ObjectMeta{ - Name: "commitment-12345-67890-abcdef-0", - Labels: map[string]string{ - v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, - }, - }, - Spec: v1alpha1.ReservationSpec{ - Type: v1alpha1.ReservationTypeCommittedResource, - CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ - CommitmentUUID: "12345-67890-abcdef", - ProjectID: "old-project", - ResourceName: "old-flavor", - ResourceGroup: "old_group", - Creator: CreatorValue, - }, - Resources: map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceMemory: resource.MustParse("512Mi"), - hv1.ResourceCPU: resource.MustParse("1"), - }, + // Pre-existing CommittedResource CRD with stale spec; syncer should update it. + existingCR := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: "commitment-12345-67890-abcdef"}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: "12345-67890-abcdef", + FlavorGroupName: "old_group", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + Amount: *resource.NewQuantity(512*1024*1024, resource.BinarySI), + ProjectID: "old-project", + DomainID: "old-domain", + AvailabilityZone: "az1", + State: v1alpha1.CommitmentStatusConfirmed, }, } k8sClient := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(existingReservation, flavorGroupsKnowledge). + WithObjects(existingCR, flavorGroupsKnowledge). Build() - // Create mock commitment that will replace the existing reservation mockCommitments := []Commitment{ { ID: 1, @@ -387,57 +363,29 @@ func TestSyncer_SyncReservations_UpdateExisting(t *testing.T) { return result, nil }, listProjectsFunc: func(ctx context.Context) ([]Project, error) { - return []Project{ - {ID: "new-project", DomainID: "new-domain", Name: "New Project"}, - }, nil - }, - listServersFunc: func(ctx context.Context, projects ...Project) (map[string][]Server, error) { - return map[string][]Server{}, nil // No active servers - }, - initFunc: func(ctx context.Context, client client.Client, conf SyncerConfig) error { - // No-op for init - return nil + return []Project{{ID: "new-project", DomainID: "new-domain"}}, nil }, } - syncer := &Syncer{ - CommitmentsClient: mockClient, - Client: k8sClient, - } - - err := syncer.SyncReservations(context.Background()) - if err != nil { - t.Errorf("SyncReservations() error = %v", err) - return - } + syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient} - // Verify that reservations were updated (old one deleted, new one created) - // The new reservation will be at index 0 since the old one was deleted first - var reservations v1alpha1.ReservationList - err = k8sClient.List(context.Background(), &reservations) - if err != nil { - t.Errorf("Failed to list reservations: %v", err) - return + if err := syncer.SyncReservations(context.Background()); err != nil { + t.Fatalf("SyncReservations() error = %v", err) } - if len(reservations.Items) != 1 { - t.Errorf("Expected 1 reservation, got %d", len(reservations.Items)) - return + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list committed resources: %v", err) } - - newReservation := reservations.Items[0] - - // Verify the new reservation has correct values - if newReservation.Spec.CommittedResourceReservation == nil { - t.Errorf("Expected CommittedResourceReservation to be set") - return + if len(crList.Items) != 1 { + t.Fatalf("Expected 1 CommittedResource, got %d", len(crList.Items)) } - if 
newReservation.Spec.CommittedResourceReservation.ProjectID != "new-project" { - t.Errorf("Expected project ID new-project, got %v", newReservation.Spec.CommittedResourceReservation.ProjectID) + cr := crList.Items[0] + if cr.Spec.ProjectID != "new-project" { + t.Errorf("Expected projectID new-project, got %s", cr.Spec.ProjectID) } - - if newReservation.Spec.CommittedResourceReservation.ResourceGroup != "new_group_v1" { - t.Errorf("Expected resource group new_group_v1, got %v", newReservation.Spec.CommittedResourceReservation.ResourceGroup) + if cr.Spec.FlavorGroupName != "new_group_v1" { + t.Errorf("Expected flavorGroupName new_group_v1, got %s", cr.Spec.FlavorGroupName) } } @@ -511,19 +459,14 @@ func TestSyncer_SyncReservations_UnitMismatch(t *testing.T) { return } - // Verify that NO reservations were created due to unit mismatch - // The commitment is skipped and Cortex trusts existing CRDs - var reservations v1alpha1.ReservationList - err = k8sClient.List(context.Background(), &reservations) - if err != nil { - t.Errorf("Failed to list reservations: %v", err) - return + // Verify that NO CommittedResource CRDs were created due to unit mismatch. + // The commitment is skipped and Cortex trusts existing CRDs. + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list committed resources: %v", err) } - - // Should have 0 reservations - commitment is skipped due to unit mismatch - // Cortex waits for Limes to update the unit before processing - if len(reservations.Items) != 0 { - t.Errorf("Expected 0 reservations (commitment skipped due to unit mismatch), got %d", len(reservations.Items)) + if len(crList.Items) != 0 { + t.Errorf("Expected 0 CommittedResource CRDs (commitment skipped due to unit mismatch), got %d", len(crList.Items)) } } @@ -595,16 +538,13 @@ func TestSyncer_SyncReservations_UnitMatch(t *testing.T) { return } - // Verify that reservations were created - var reservations v1alpha1.ReservationList - err = k8sClient.List(context.Background(), &reservations) - if err != nil { - t.Errorf("Failed to list reservations: %v", err) - return + // Verify that one CommittedResource CRD was created + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list committed resources: %v", err) } - - if len(reservations.Items) != 2 { - t.Errorf("Expected 2 reservations, got %d", len(reservations.Items)) + if len(crList.Items) != 1 { + t.Errorf("Expected 1 CommittedResource CRD, got %d", len(crList.Items)) } } @@ -680,16 +620,13 @@ func TestSyncer_SyncReservations_EmptyUUID(t *testing.T) { return } - // Verify that no reservations were created due to empty UUID - var reservations v1alpha1.ReservationList - err = k8sClient.List(context.Background(), &reservations) - if err != nil { - t.Errorf("Failed to list reservations: %v", err) - return + // Verify that no CommittedResource CRDs were created due to empty UUID + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list committed resources: %v", err) } - - if len(reservations.Items) != 0 { - t.Errorf("Expected 0 reservations due to empty UUID, got %d", len(reservations.Items)) + if len(crList.Items) != 0 { + t.Errorf("Expected 0 CommittedResource CRDs due to empty UUID, got %d", len(crList.Items)) } } @@ -711,16 +648,16 @@ func TestSyncer_SyncReservations_StatusFilter(t *testing.T) { }) tests := 
[]struct { - name string - status string - expectReservation bool + name string + status string + expectCR bool }{ - {"confirmed is processed", "confirmed", true}, - {"guaranteed is processed", "guaranteed", true}, - {"planned is skipped", "planned", false}, - {"pending is skipped", "pending", false}, - {"superseded is skipped", "superseded", false}, - {"expired is skipped", "expired", false}, + {"confirmed creates CR", "confirmed", true}, + {"guaranteed creates CR", "guaranteed", true}, + {"planned creates CR", "planned", true}, + {"pending creates CR", "pending", true}, + {"superseded does not create CR", "superseded", false}, + {"expired does not create CR", "expired", false}, {"empty status is skipped", "", false}, } @@ -769,17 +706,288 @@ func TestSyncer_SyncReservations_StatusFilter(t *testing.T) { t.Fatalf("SyncReservations() error = %v", err) } - var reservations v1alpha1.ReservationList - if err := k8sClient.List(context.Background(), &reservations); err != nil { - t.Fatalf("Failed to list reservations: %v", err) + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list committed resources: %v", err) } - if tc.expectReservation && len(reservations.Items) == 0 { - t.Errorf("status=%q: expected reservation to be created, got none", tc.status) + if tc.expectCR && len(crList.Items) == 0 { + t.Errorf("status=%q: expected CommittedResource CRD to be created, got none", tc.status) } - if !tc.expectReservation && len(reservations.Items) != 0 { - t.Errorf("status=%q: expected no reservation, got %d", tc.status, len(reservations.Items)) + if !tc.expectCR && len(crList.Items) != 0 { + t.Errorf("status=%q: expected no CommittedResource CRD, got %d", tc.status, len(crList.Items)) } }) } } + +func TestSyncer_SyncReservations_StaleCRCount(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": { + LargestFlavorName: "test-flavor", + LargestFlavorVCPUs: 2, + LargestFlavorMemoryMB: 1024, + SmallestFlavorName: "test-flavor", + SmallestFlavorVCPUs: 2, + SmallestFlavorMemoryMB: 1024, + }, + }) + + // Pre-existing CRD whose commitment no longer appears in Limes + staleCR := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: "commitment-stale-uuid-1234"}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: "stale-uuid-1234", + FlavorGroupName: "test_group_v1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + Amount: *resource.NewQuantity(1024*1024*1024, resource.BinarySI), + ProjectID: "test-project", + DomainID: "test-domain", + AvailabilityZone: "az1", + State: v1alpha1.CommitmentStatusConfirmed, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + }, + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(staleCR, flavorGroupsKnowledge). 
+ Build() + + // Limes returns no commitments (stale-uuid-1234 is gone) + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + return map[string]Commitment{}, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{{ID: "test-project", DomainID: "test-domain"}}, nil + }, + } + + monitor := NewSyncerMonitor() + syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient, monitor: monitor} + + if err := syncer.SyncReservations(context.Background()); err != nil { + t.Fatalf("SyncReservations() error = %v", err) + } + + // Stale CRD must still exist (syncer does not delete) + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list committed resources: %v", err) + } + if len(crList.Items) != 1 { + t.Errorf("Expected stale CRD to be preserved, got %d CRDs", len(crList.Items)) + } + + // Gauge must reflect the stale count + ch := make(chan prometheus.Metric, 10) + monitor.staleCRs.Collect(ch) + close(ch) + m := <-ch + var metric dto.Metric + if err := m.Write(&metric); err != nil { + t.Fatalf("failed to read metric: %v", err) + } + if got := metric.GetGauge().GetValue(); got != 1 { + t.Errorf("Expected staleCRs gauge=1, got %v", got) + } +} + +func TestSyncer_SyncReservations_TerminalState_NoCRDExists(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": {SmallestFlavorName: "f", SmallestFlavorVCPUs: 2, SmallestFlavorMemoryMB: 1024, + LargestFlavorName: "f", LargestFlavorVCPUs: 2, LargestFlavorMemoryMB: 1024}, + }) + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(flavorGroupsKnowledge).Build() + + for _, status := range []string{"superseded", "expired"} { + t.Run(status, func(t *testing.T) { + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + return map[string]Commitment{ + "term-uuid-1234": { + ID: 1, UUID: "term-uuid-1234", ServiceType: "compute", + ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", + Amount: 1, Status: status, ProjectID: "p", DomainID: "d", + }, + }, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{{ID: "p", DomainID: "d"}}, nil + }, + } + syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient} + if err := syncer.SyncReservations(context.Background()); err != nil { + t.Fatalf("SyncReservations() error = %v", err) + } + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list: %v", err) + } + if len(crList.Items) != 0 { + t.Errorf("status=%q: expected no CRD to be created, got %d", status, len(crList.Items)) + } + }) + } +} + +func TestSyncer_SyncReservations_TerminalState_ExistingCRDUpdated(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": {SmallestFlavorName: "f", SmallestFlavorVCPUs: 2, SmallestFlavorMemoryMB: 1024, + LargestFlavorName: "f", LargestFlavorVCPUs: 2, LargestFlavorMemoryMB:
1024}, + }) + + existingCR := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: "commitment-term-uuid-1234"}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: "term-uuid-1234", FlavorGroupName: "test_group_v1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + Amount: *resource.NewQuantity(1024*1024*1024, resource.BinarySI), + ProjectID: "p", DomainID: "d", AvailabilityZone: "az1", + State: v1alpha1.CommitmentStatusConfirmed, + }, + } + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingCR, flavorGroupsKnowledge).Build() + + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + return map[string]Commitment{ + "term-uuid-1234": { + ID: 1, UUID: "term-uuid-1234", ServiceType: "compute", + ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", + Amount: 1, Status: "superseded", ProjectID: "p", DomainID: "d", + }, + }, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{{ID: "p", DomainID: "d"}}, nil + }, + } + + syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient} + if err := syncer.SyncReservations(context.Background()); err != nil { + t.Fatalf("SyncReservations() error = %v", err) + } + + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list: %v", err) + } + if len(crList.Items) != 1 { + t.Fatalf("Expected CRD to be preserved, got %d", len(crList.Items)) + } + if crList.Items[0].Spec.State != v1alpha1.CommitmentStatusSuperseded { + t.Errorf("Expected state superseded, got %s", crList.Items[0].Spec.State) + } +} + +func TestSyncer_SyncReservations_ExpiredByTime_NoCRDCreated(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": {SmallestFlavorName: "f", SmallestFlavorVCPUs: 2, SmallestFlavorMemoryMB: 1024, + LargestFlavorName: "f", LargestFlavorVCPUs: 2, LargestFlavorMemoryMB: 1024}, + }) + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(flavorGroupsKnowledge).Build() + + pastTime := uint64(1) // Unix epoch — well in the past + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + return map[string]Commitment{ + "exp-uuid-1234": { + ID: 1, UUID: "exp-uuid-1234", ServiceType: "compute", + ResourceName: "hw_version_test_group_v1_ram", AvailabilityZone: "az1", + Amount: 1, Status: "confirmed", ExpiresAt: pastTime, + ProjectID: "p", DomainID: "d", + }, + }, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{{ID: "p", DomainID: "d"}}, nil + }, + } + + syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient} + if err := syncer.SyncReservations(context.Background()); err != nil { + t.Fatalf("SyncReservations() error = %v", err) + } + + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list: %v", err) + } + if len(crList.Items) != 0 { + t.Errorf("Expected no CRD created for past-expiry confirmed commitment, got %d", len(crList.Items)) + } +} + +func TestSyncer_SyncReservations_GC_ExpiredEndTime(t *testing.T) { + scheme := runtime.NewScheme() + 
if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + + flavorGroupsKnowledge := createFlavorGroupKnowledge(t, map[string]FlavorGroupData{ + "test_group_v1": {SmallestFlavorName: "f", SmallestFlavorVCPUs: 2, SmallestFlavorMemoryMB: 1024, + LargestFlavorName: "f", LargestFlavorVCPUs: 2, LargestFlavorMemoryMB: 1024}, + }) + + pastTime := metav1.NewTime(time.Now().Add(-time.Hour)) + expiredCR := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: "commitment-gc-uuid-1234"}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: "gc-uuid-1234", FlavorGroupName: "test_group_v1", + ResourceType: v1alpha1.CommittedResourceTypeMemory, + Amount: *resource.NewQuantity(1024*1024*1024, resource.BinarySI), + ProjectID: "p", DomainID: "d", AvailabilityZone: "az1", + State: v1alpha1.CommitmentStatusConfirmed, + EndTime: &pastTime, + SchedulingDomain: v1alpha1.SchedulingDomainNova, + }, + } + + k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(expiredCR, flavorGroupsKnowledge).Build() + + // Limes no longer returns this commitment + mockClient := &mockCommitmentsClient{ + listCommitmentsByIDFunc: func(ctx context.Context, projects ...Project) (map[string]Commitment, error) { + return map[string]Commitment{}, nil + }, + listProjectsFunc: func(ctx context.Context) ([]Project, error) { + return []Project{{ID: "p", DomainID: "d"}}, nil + }, + } + + syncer := &Syncer{CommitmentsClient: mockClient, Client: k8sClient} + if err := syncer.SyncReservations(context.Background()); err != nil { + t.Fatalf("SyncReservations() error = %v", err) + } + + var crList v1alpha1.CommittedResourceList + if err := k8sClient.List(context.Background(), &crList); err != nil { + t.Fatalf("Failed to list: %v", err) + } + if len(crList.Items) != 0 { + t.Errorf("Expected expired CRD to be GC'd, got %d CRDs", len(crList.Items)) + } +} diff --git a/tools/visualize-committed-resources/main.go b/tools/visualize-committed-resources/main.go new file mode 100644 index 000000000..afa16a372 --- /dev/null +++ b/tools/visualize-committed-resources/main.go @@ -0,0 +1,596 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +// Tool to visualize CommittedResource CRDs and their child Reservation slots. +// +// Usage: +// +// go run tools/visualize-committed-resources/main.go [flags] +// +// Flags: +// +// --context=ctx Kubernetes context (default: current context) +// --filter-project=id Show only CRs for this project ID (substring match) +// --filter-az=az Show only CRs in this availability zone (substring match) +// --filter-group=name Show only CRs for this flavor group (substring match) +// --filter-state=state Show only CRs in this state (e.g. confirmed, pending) +// --active Shorthand: show only confirmed/guaranteed CRs +// --views=v1,v2,... Views to show (default: all). Available: summary, commitments, reservations, allocations +// --hide=v1,v2,... Views to hide (applied after --views) +// --watch=interval Refresh interval (e.g. 2s, 5s; 0 runs once). Re-renders only when the watched objects change. +// --limit=n Max CRs/Reservations to fetch per list (default 200; 0 = unlimited)
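+// Example: a self-refreshing view of only the active commitments and their slots: +// +// go run tools/visualize-committed-resources/main.go --active --views=summary,reservations --watch=5s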
+package main + +import ( + "context" + "flag" + "fmt" + "os" + "sort" + "strconv" + "strings" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/tools/clientcmd" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" +) + +var scheme = runtime.NewScheme() + +func init() { + utilruntime.Must(v1alpha1.AddToScheme(scheme)) +} + +// ── ANSI colours ────────────────────────────────────────────────────────────── + +const ( + colReset = "\033[0m" + colBold = "\033[1m" + colGreen = "\033[32m" + colYellow = "\033[33m" + colRed = "\033[31m" + colCyan = "\033[36m" + colGray = "\033[90m" +) + +func green(s string) string { return colGreen + s + colReset } +func yellow(s string) string { return colYellow + s + colReset } +func red(s string) string { return colRed + s + colReset } +func cyan(s string) string { return colCyan + s + colReset } +func gray(s string) string { return colGray + s + colReset } +func bold(s string) string { return colBold + s + colReset } + +// ── Views ───────────────────────────────────────────────────────────────────── + +const ( + viewSummary = "summary" + viewCommitments = "commitments" + viewReservations = "reservations" + viewAllocations = "allocations" +) + +var allViews = []string{viewSummary, viewCommitments, viewReservations, viewAllocations} + +type viewSet map[string]bool + +func parseViews(s string) viewSet { + vs := make(viewSet) + if s == "all" || s == "" { + for _, v := range allViews { + vs[v] = true + } + return vs + } + for _, v := range strings.Split(s, ",") { + vs[strings.TrimSpace(v)] = true + } + return vs +} + +func (vs viewSet) hide(s string) { + if s == "" { + return + } + for _, v := range strings.Split(s, ",") { + delete(vs, strings.TrimSpace(v)) + } +} + +func (vs viewSet) has(v string) bool { return vs[v] } + +// ── k8s client ──────────────────────────────────────────────────────────────── + +func newClient(contextName string) (client.Client, error) { + if contextName == "" { + c, err := config.GetConfig() + if err != nil { + return nil, fmt.Errorf("getting kubeconfig: %w", err) + } + return client.New(c, client.Options{Scheme: scheme}) + } + loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() + kubeConfig := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + loadingRules, + &clientcmd.ConfigOverrides{CurrentContext: contextName}, + ) + c, err := kubeConfig.ClientConfig() + if err != nil { + return nil, fmt.Errorf("getting kubeconfig for context %q: %w", contextName, err) + } + return client.New(c, client.Options{Scheme: scheme}) +} + +// ── helpers ─────────────────────────────────────────────────────────────────── + +func printHeader(title string) { + line := strings.Repeat("─", 80) + fmt.Println() + fmt.Println(bold(line)) + fmt.Printf("%s %s\n", bold("▶"), bold(title)) + fmt.Println(bold(line)) +} + +func truncate(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n-1] + "…" +} + +func age(t *metav1.Time) string { + if t == nil { + return gray("—") + } + d := time.Since(t.Time).Round(time.Second) + switch { + case d < time.Minute: + return fmt.Sprintf("%ds", int(d.Seconds())) + case d < time.Hour: + return fmt.Sprintf("%dm", int(d.Minutes())) + case d < 24*time.Hour: + return fmt.Sprintf("%dh", int(d.Hours())) + default: + return fmt.Sprintf("%dd", 
int(d.Hours()/24)) + } +} + +func crReadyStatus(cr v1alpha1.CommittedResource) string { + cond := apimeta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil { + return gray("unknown") + } + switch cond.Reason { + case v1alpha1.CommittedResourceReasonAccepted: + return green("Accepted") + case v1alpha1.CommittedResourceReasonRejected: + return red("Rejected") + case v1alpha1.CommittedResourceReasonReserving: + return yellow("Reserving") + case v1alpha1.CommittedResourceReasonPlanned: + return gray("Planned") + default: + return yellow(cond.Reason) + } +} + +func resReadyStatus(res v1alpha1.Reservation) string { + cond := apimeta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady) + if cond == nil { + return gray("pending") + } + if cond.Status == metav1.ConditionTrue { + return green("Ready") + } + return red("NotReady: " + truncate(cond.Message, 40)) +} + +func stateColour(state v1alpha1.CommitmentStatus) string { + switch state { + case v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommitmentStatusGuaranteed: + return green(string(state)) + case v1alpha1.CommitmentStatusPlanned, v1alpha1.CommitmentStatusPending: + return yellow(string(state)) + case v1alpha1.CommitmentStatusExpired, v1alpha1.CommitmentStatusSuperseded: + return gray(string(state)) + default: + return string(state) + } +} + +// ── filters ─────────────────────────────────────────────────────────────────── + +type filters struct { + project string + az string + group string + state string + active bool +} + +func (f filters) match(cr v1alpha1.CommittedResource) bool { + if f.project != "" && !strings.Contains(cr.Spec.ProjectID, f.project) { + return false + } + if f.az != "" && !strings.Contains(cr.Spec.AvailabilityZone, f.az) { + return false + } + if f.group != "" && !strings.Contains(cr.Spec.FlavorGroupName, f.group) { + return false + } + if f.state != "" && !strings.EqualFold(string(cr.Spec.State), f.state) { + return false + } + if f.active { + s := cr.Spec.State + if s != v1alpha1.CommitmentStatusConfirmed && s != v1alpha1.CommitmentStatusGuaranteed { + return false + } + } + return true +} + +// ── views ───────────────────────────────────────────────────────────────────── + +func printSummary(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation) { + printHeader("Summary") + + byState := make(map[v1alpha1.CommitmentStatus]int) + byReady := map[string]int{"Accepted": 0, "Reserving": 0, "Rejected": 0, "Planned": 0, "Unknown": 0} + for _, cr := range crs { + byState[cr.Spec.State]++ + cond := apimeta.FindStatusCondition(cr.Status.Conditions, v1alpha1.CommittedResourceConditionReady) + if cond == nil { + byReady["Unknown"]++ + } else { + byReady[cond.Reason]++ + } + } + + resReady, resNotReady, resPending := 0, 0, 0 + for _, res := range reservations { + cond := apimeta.FindStatusCondition(res.Status.Conditions, v1alpha1.ReservationConditionReady) + switch { + case cond == nil: + resPending++ + case cond.Status == metav1.ConditionTrue: + resReady++ + default: + resNotReady++ + } + } + + fmt.Printf(" CommittedResources : %s\n", bold(fmt.Sprintf("%d total", len(crs)))) + for _, s := range []v1alpha1.CommitmentStatus{ + v1alpha1.CommitmentStatusConfirmed, + v1alpha1.CommitmentStatusGuaranteed, + v1alpha1.CommitmentStatusPending, + v1alpha1.CommitmentStatusPlanned, + v1alpha1.CommitmentStatusExpired, + v1alpha1.CommitmentStatusSuperseded, + } { + if n := byState[s]; n > 0 { + fmt.Printf(" %-14s %d\n", string(s)+":", n) + } + } + 
fmt.Println() + fmt.Printf(" Ready conditions : %s accepted, %s reserving, %s rejected\n", + green(strconv.Itoa(byReady["Accepted"])), + yellow(strconv.Itoa(byReady["Reserving"])), + red(strconv.Itoa(byReady["Rejected"])), + ) + fmt.Println() + fmt.Printf(" Reservation slots : %s total — %s ready, %s not-ready, %s pending\n", + bold(strconv.Itoa(len(reservations))), + green(strconv.Itoa(resReady)), + red(strconv.Itoa(resNotReady)), + yellow(strconv.Itoa(resPending)), + ) +} + +func printCommitments(crs []v1alpha1.CommittedResource) { + printHeader(fmt.Sprintf("CommittedResources (%d)", len(crs))) + + if len(crs) == 0 { + fmt.Println(gray(" (none)")) + return + } + + for _, cr := range crs { + fmt.Printf("\n %s %s\n", + bold(cyan(cr.Spec.CommitmentUUID)), + crReadyStatus(cr), + ) + fmt.Printf(" project=%-36s group=%-20s az=%s\n", + cr.Spec.ProjectID, cr.Spec.FlavorGroupName, cr.Spec.AvailabilityZone) + fmt.Printf(" state=%-14s amount=%-10s accepted=%s\n", + stateColour(cr.Spec.State), + cr.Spec.Amount.String(), + func() string { + if cr.Status.AcceptedAmount == nil { + return gray("—") + } + return cr.Status.AcceptedAmount.String() + }(), + ) + + if cr.Status.UsedAmount != nil { + fmt.Printf(" used=%-12s\n", cr.Status.UsedAmount.String()) + } + + endStr := gray("no expiry") + if cr.Spec.EndTime != nil { + remaining := time.Until(cr.Spec.EndTime.Time).Round(time.Minute) + if remaining < 0 { + endStr = red(fmt.Sprintf("expired %s ago", age(cr.Spec.EndTime))) + } else { + endStr = fmt.Sprintf("expires in %s (at %s)", remaining, cr.Spec.EndTime.Format(time.RFC3339)) + } + } + fmt.Printf(" age=%-8s %s\n", age(&cr.CreationTimestamp), endStr) + } +} + +func printReservations(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation, showAllocations bool) { + // Index reservations by CommitmentUUID for display under each CR. 
+ byUUID := make(map[string][]v1alpha1.Reservation) + for _, res := range reservations { + if res.Spec.CommittedResourceReservation == nil { + continue + } + uuid := res.Spec.CommittedResourceReservation.CommitmentUUID + byUUID[uuid] = append(byUUID[uuid], res) + } + + printHeader("Reservation Slots") + + if len(reservations) == 0 { + fmt.Println(gray(" (none)")) + return + } + + for _, cr := range crs { + slots := byUUID[cr.Spec.CommitmentUUID] + if len(slots) == 0 { + continue + } + fmt.Printf("\n %s %s %s\n", + bold(cyan(cr.Spec.CommitmentUUID)), + gray(cr.Spec.FlavorGroupName), + gray(fmt.Sprintf("%d slot(s)", len(slots))), + ) + + sort.Slice(slots, func(i, j int) bool { + return slots[i].Name < slots[j].Name + }) + + for _, res := range slots { + targetHost := res.Spec.TargetHost + statusHost := res.Status.Host + var hostStr string + switch { + case statusHost == "": + hostStr = yellow(targetHost) + gray(" (not yet placed)") + case statusHost != targetHost: + hostStr = red(fmt.Sprintf("target=%s status=%s (migrating?)", targetHost, statusHost)) + default: + hostStr = green(targetHost) + } + + genWarn := "" + if s := res.Status.CommittedResourceReservation; s != nil { + spec := res.Spec.CommittedResourceReservation + if spec != nil && s.ObservedParentGeneration != spec.ParentGeneration { + genWarn = yellow(fmt.Sprintf(" [gen: spec=%d observed=%d]", + spec.ParentGeneration, s.ObservedParentGeneration)) + } + } + + var resourcesSb strings.Builder + for rname, qty := range res.Spec.Resources { + fmt.Fprintf(&resourcesSb, "%s=%s ", rname, qty.String()) + } + resources := resourcesSb.String() + + fmt.Printf(" %s host=%s %s %s%s\n", + truncate(res.Name, 40), + hostStr, + resReadyStatus(res), + gray(strings.TrimSpace(resources)), + genWarn, + ) + + if showAllocations { + specAllocs := 0 + statusAllocs := 0 + if res.Spec.CommittedResourceReservation != nil { + specAllocs = len(res.Spec.CommittedResourceReservation.Allocations) + } + if res.Status.CommittedResourceReservation != nil { + statusAllocs = len(res.Status.CommittedResourceReservation.Allocations) + } + + if specAllocs > 0 || statusAllocs > 0 { + fmt.Printf(" allocations: spec=%d confirmed=%d\n", specAllocs, statusAllocs) + if res.Spec.CommittedResourceReservation != nil { + statusAlloc := map[string]string{} + if res.Status.CommittedResourceReservation != nil { + statusAlloc = res.Status.CommittedResourceReservation.Allocations + } + for vmUUID, alloc := range res.Spec.CommittedResourceReservation.Allocations { + var allocSb strings.Builder + for rname, qty := range alloc.Resources { + fmt.Fprintf(&allocSb, "%s=%s ", rname, qty.String()) + } + allocResources := allocSb.String() + confirmedHost, confirmed := statusAlloc[vmUUID] + var state string + if confirmed { + state = green("confirmed on " + confirmedHost) + } else { + state = yellow(fmt.Sprintf("spec-only (grace since %s)", age(&alloc.CreationTimestamp))) + } + fmt.Printf(" vm=%s %s %s\n", + truncate(vmUUID, 36), + gray(strings.TrimSpace(allocResources)), + state, + ) + } + } + } + } + } + } +} + +// ── main ────────────────────────────────────────────────────────────────────── + +func main() { + k8sContext := flag.String("context", "", "Kubernetes context (default: current context)") + filterProject := flag.String("filter-project", "", "Show only CRs for this project ID (substring match)") + filterAZ := flag.String("filter-az", "", "Show only CRs in this availability zone (substring match)") + filterGroup :=
flag.String("filter-group", "", "Show only CRs for this flavor group (substring match)") + filterState := flag.String("filter-state", "", "Show only CRs in this state") + activeOnly := flag.Bool("active", false, "Show only confirmed/guaranteed CRs") + viewsFlag := flag.String("views", "all", "Views: all, summary, commitments, reservations, allocations") + hideFlag := flag.String("hide", "", "Views to hide (applied after --views)") + watchInterval := flag.Duration("watch", 0, "Refresh interval (e.g. 2s, 5s). 0 = run once.") + limitFlag := flag.Int("limit", 200, "Max CRs to fetch (0 = unlimited)") + flag.Parse() + + views := parseViews(*viewsFlag) + views.hide(*hideFlag) + + f := filters{ + project: *filterProject, + az: *filterAZ, + group: *filterGroup, + state: *filterState, + active: *activeOnly, + } + + cl, err := newClient(*k8sContext) + if err != nil { + fmt.Fprintf(os.Stderr, "error: %v\n", err) + os.Exit(1) + } + + ctx := context.Background() + var prevDigest string + first := true + for { + crs, reservations := fetchSnapshot(ctx, cl, f, *limitFlag) + if d := snapshotDigest(crs, reservations); first || d != prevDigest { + if !first { + fmt.Printf("\n%s %s %s\n", + bold("━━━ changed at"), + bold(time.Now().Format(time.RFC3339)), + bold("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"), + ) + } + printSnapshot(crs, reservations, f, views) + prevDigest = d + first = false + } + if *watchInterval == 0 { + break + } + time.Sleep(*watchInterval) + } +} + +// snapshotDigest returns a string that changes whenever any CRD is added, removed, or updated. +func snapshotDigest(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation) string { + var b strings.Builder + for _, cr := range crs { + fmt.Fprintf(&b, "%s:%s ", cr.Name, cr.ResourceVersion) + } + for _, res := range reservations { + fmt.Fprintf(&b, "%s:%s ", res.Name, res.ResourceVersion) + } + return b.String() +} + +func fetchSnapshot(ctx context.Context, cl client.Client, f filters, limit int) ([]v1alpha1.CommittedResource, []v1alpha1.Reservation) { + var listOpts []client.ListOption + if limit > 0 { + listOpts = append(listOpts, client.Limit(int64(limit))) + } + + var crList v1alpha1.CommittedResourceList + if err := cl.List(ctx, &crList, listOpts...); err != nil { + fmt.Fprintf(os.Stderr, "error listing CommittedResources: %v\n", err) + os.Exit(1) + } + + var resList v1alpha1.ReservationList + if err := cl.List(ctx, &resList, append(listOpts, client.MatchingLabels{ + v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource, + })...); err != nil { + fmt.Fprintf(os.Stderr, "error listing Reservations: %v\n", err) + os.Exit(1) + } + + if crList.Continue != "" { + fmt.Fprintf(os.Stderr, yellow("warning: CR list truncated at %d — use --limit=0 or a higher value to see all\n"), limit) + } + if resList.Continue != "" { + fmt.Fprintf(os.Stderr, yellow("warning: Reservation list truncated at %d — use --limit=0 or a higher value to see all\n"), limit) + } + var crs []v1alpha1.CommittedResource + for _, cr := range crList.Items { + if f.match(cr) { + crs = append(crs, cr) + } + } + sort.Slice(crs, func(i, j int) bool { + if crs[i].Spec.FlavorGroupName != crs[j].Spec.FlavorGroupName { + return crs[i].Spec.FlavorGroupName < crs[j].Spec.FlavorGroupName + } + return crs[i].Spec.CommitmentUUID < crs[j].Spec.CommitmentUUID + }) + + matchedUUIDs := make(map[string]bool, len(crs)) + for _, cr := range crs { + matchedUUIDs[cr.Spec.CommitmentUUID] = true + } + var reservations []v1alpha1.Reservation + for _, res := 
+func printSnapshot(crs []v1alpha1.CommittedResource, reservations []v1alpha1.Reservation, f filters, views viewSet) {
+	fmt.Printf("\n%s — %s\n",
+		bold("visualize-committed-resources"),
+		gray(time.Now().Format(time.RFC3339)),
+	)
+	if f.project != "" || f.az != "" || f.group != "" || f.state != "" || f.active {
+		fmt.Printf("%s project=%q az=%q group=%q state=%q active=%v\n",
+			gray("filters:"), f.project, f.az, f.group, f.state, f.active)
+	}
+
+	if views.has(viewSummary) {
+		printSummary(crs, reservations)
+	}
+	if views.has(viewCommitments) {
+		printCommitments(crs)
+	}
+	if views.has(viewReservations) {
+		printReservations(crs, reservations, views.has(viewAllocations))
+	}
+
+	fmt.Println()
+}
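+
+// Example invocations (the package path is hypothetical; run from wherever
+// this tool lives in the repo):
+//
+//	go run ./cmd/visualize-committed-resources --filter-project=myproj --watch=5s
+//	go run ./cmd/visualize-committed-resources --views=summary --active
+//	go run ./cmd/visualize-committed-resources --hide=allocations --limit=0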