cobaltcore-dev · mblos · May 7, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
@@ -106,38 +106,42 @@ type CommittedResourceSpec struct {
 
 // CommittedResourceStatus defines the observed state of CommittedResource.
 type CommittedResourceStatus struct {
-	// AcceptedAmount is the quantity the controller last successfully provisioned as Reservation slots.
-	// Nil if the spec has never been successfully reconciled.
+	// AcceptedSpec is a snapshot of Spec from the last successful reconcile.
+	// Used by rollbackToAccepted to restore the exact previously-accepted placement (AZ, amount,
+	// project, domain, flavor group) even when the current spec has already been mutated to a new value.
 	// +kubebuilder:validation:Optional
-	AcceptedAmount *resource.Quantity `json:"acceptedAmount,omitempty"`
+	AcceptedSpec *CommittedResourceSpec `json:"acceptedSpec,omitempty"`
 
 	// AcceptedAt is when the controller last successfully reconciled the spec into Reservation slots.
 	// +kubebuilder:validation:Optional
 	AcceptedAt *metav1.Time `json:"acceptedAt,omitempty"`
 
-	// LastChanged is when the spec was last written by the syncer.
-	// When AcceptedAt is older than LastChanged, the controller has pending work.
-	// +kubebuilder:validation:Optional
-	LastChanged *metav1.Time `json:"lastChanged,omitempty"`
-
 	// LastReconcileAt is when the controller last ran its reconcile loop for this resource.
 	// +kubebuilder:validation:Optional
 	LastReconcileAt *metav1.Time `json:"lastReconcileAt,omitempty"`
 
-	// AssignedVMs holds the UUIDs of VMs deterministically assigned to this committed resource.
-	// Populated by the usage reconciler; used to compute UsedAmount and drive the quota controller.
+	// AssignedInstances holds the UUIDs of VM instances deterministically assigned to this committed resource.
+	// Populated by the usage reconciler; used to compute UsedResources and drive the quota controller.
 	// +kubebuilder:validation:Optional
-	AssignedVMs []string `json:"assignedVMs,omitempty"`
+	AssignedInstances []string `json:"assignedInstances,omitempty"`
 
-	// UsedAmount is the sum of assigned VM resources expressed in the same units as Spec.Amount.
-	// Populated by the usage reconciler.
+	// UsedResources is the total resource consumption of the assigned VM instances, keyed by resource
+	// type (e.g. "memory" as a binary-SI quantity in MiB, "cpu" as a core count). Populated by the usage reconciler.
 	// +kubebuilder:validation:Optional
-	UsedAmount *resource.Quantity `json:"usedAmount,omitempty"`
+	UsedResources map[string]resource.Quantity `json:"usedResources,omitempty"`
 
-	// LastUsageReconcileAt is when the usage reconciler last updated AssignedVMs and UsedAmount.
+	// LastUsageReconcileAt is when the usage reconciler last updated AssignedInstances and UsedResources.
 	// +kubebuilder:validation:Optional
 	LastUsageReconcileAt *metav1.Time `json:"lastUsageReconcileAt,omitempty"`
 
+	// UsageObservedGeneration is the CR generation that the usage reconciler last processed.
+	// Follows the Kubernetes observedGeneration pattern: when this differs from
+	// metadata.generation the cooldown is bypassed so spec changes (e.g. shrink) are reflected
+	// immediately rather than waiting for the next cooldown interval.
+	// +kubebuilder:validation:Optional
+	// +kubebuilder:validation:Minimum=0
+	UsageObservedGeneration *int64 `json:"usageObservedGeneration,omitempty"`
+
 	// Conditions holds the current status conditions.
 	// +kubebuilder:validation:Optional
 	Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
@@ -163,8 +167,8 @@ const (
 // +kubebuilder:printcolumn:name="ResourceType",type="string",JSONPath=".spec.resourceType"
 // +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone"
 // +kubebuilder:printcolumn:name="Amount",type="string",JSONPath=".spec.amount"
-// +kubebuilder:printcolumn:name="AcceptedAmount",type="string",JSONPath=".status.acceptedAmount"
-// +kubebuilder:printcolumn:name="UsedAmount",type="string",JSONPath=".status.usedAmount"
+// +kubebuilder:printcolumn:name="UsedMemory",type="string",JSONPath=".status.usedResources.memory",priority=1
+// +kubebuilder:printcolumn:name="UsedCPU",type="string",JSONPath=".status.usedResources.cpu",priority=1
 // +kubebuilder:printcolumn:name="State",type="string",JSONPath=".spec.state"
 // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status"
 // +kubebuilder:printcolumn:name="StartTime",type="date",JSONPath=".spec.startTime",priority=1

@@ -52,6 +52,7 @@ const (
 	NovaDatasourceTypeFlavors        NovaDatasourceType = "flavors"
 	NovaDatasourceTypeMigrations     NovaDatasourceType = "migrations"
 	NovaDatasourceTypeAggregates     NovaDatasourceType = "aggregates"
+	NovaDatasourceTypeImages         NovaDatasourceType = "images"
 )
 
 type NovaDatasource struct {

@@ -6,6 +6,7 @@ package main
 import (
 	"context"
 	"crypto/tls"
+	"encoding/json"
 	"flag"
 	"fmt"
 	"log/slog"
@@ -102,7 +103,10 @@ func main() {
 	restConfig := ctrl.GetConfigOrDie()
 
 	// Custom entrypoint for scheduler e2e tests.
-	if len(os.Args) == 2 {
+	// Usage: /main <subcommand> [json-override]
+	// The optional json-override is merged on top of the ConfigMap config, e.g.:
+	//   /main e2e-commitments '{"noCleanup":true,"azs":["qa-de-1a"]}'
+	if len(os.Args) >= 2 {
 		copts := client.Options{Scheme: scheme}
 		client := must.Return(client.New(restConfig, copts))
 		switch os.Args[1] {
@@ -119,7 +123,21 @@ func main() {
 			return
 		case "e2e-commitments":
 			commitmentsChecksConfig := conf.GetConfigOrDie[commitments.E2EChecksConfig]()
-			commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
+			if len(os.Args) >= 3 {
+				if err := json.Unmarshal([]byte(os.Args[2]), &commitmentsChecksConfig); err != nil {
+					slog.Error("invalid json override for e2e-commitments", "err", err)
+					os.Exit(1)
+				}
+			}
+			func() {
+				defer func() {
+					if r := recover(); r != nil {
+						slog.Error("e2e check failed", "reason", r)
+						os.Exit(1)
+					}
+				}()
+				commitments.RunCommitmentsE2EChecks(ctx, commitmentsChecksConfig)
+			}()
 			return
 		}
 	}
@@ -549,14 +567,34 @@ func main() {
 			os.Exit(1)
 		}
 
+		crControllerConf := commitmentsConfig.CommittedResourceController
+		crControllerConf.ApplyDefaults()
 		if err := (&commitments.CommittedResourceController{
 			Client: multiclusterClient,
 			Scheme: mgr.GetScheme(),
-			Conf:   commitmentsConfig.CommittedResourceController,
+			Conf:   crControllerConf,
 		}).SetupWithManager(mgr, multiclusterClient); err != nil {
 			setupLog.Error(err, "unable to create controller", "controller", "CommittedResource")
 			os.Exit(1)
 		}
+
+		usageReconcilerMonitor := commitments.NewUsageReconcilerMonitor()
+		metrics.Registry.MustRegister(&usageReconcilerMonitor)
+		if commitmentsUsageDB == nil {
+			setupLog.Error(nil, "UsageReconciler requires a datasource but commitments.datasourceName is not configured — skipping")
+		} else {
+			usageReconcilerConf := commitmentsConfig.UsageReconciler
+			usageReconcilerConf.ApplyDefaults()
+			if err := (&commitments.UsageReconciler{
+				Client:  multiclusterClient,
+				Conf:    usageReconcilerConf,
+				UsageDB: commitmentsUsageDB,
+				Monitor: usageReconcilerMonitor,
+			}).SetupWithManager(mgr, multiclusterClient); err != nil {
+				setupLog.Error(err, "unable to create controller", "controller", "CommittedResourceUsage")
+				os.Exit(1)
+			}
+		}
 	}
 	if slices.Contains(mainConfig.EnabledControllers, "datasource-controllers") {
 		setupLog.Info("enabling controller", "controller", "datasource-controllers")

@@ -337,6 +337,30 @@ spec:
 ---
 apiVersion: cortex.cloud/v1alpha1
 kind: Datasource
+metadata:
+  name: nova-images
+spec:
+  schedulingDomain: nova
+  databaseSecretRef:
+    name: cortex-nova-postgres
+    namespace: {{ .Release.Namespace }}
+  {{- if .Values.openstack.sso.enabled }}
+  ssoSecretRef:
+    name: cortex-nova-openstack-sso
+    namespace: {{ .Release.Namespace }}
+  {{- end }}
+  type: openstack
+  openstack:
+    syncInterval: 3600s
+    secretRef:
+      name: cortex-nova-openstack-keystone
+      namespace: {{ .Release.Namespace }}
+    type: nova
+    nova:
+      type: images
+---
+apiVersion: cortex.cloud/v1alpha1
+kind: Datasource
 metadata:
   name: limes-project-commitments
 spec:

@@ -167,8 +167,10 @@ cortex-scheduling-controllers:
       # URL of the nova external scheduler API for placement decisions
       schedulerURL: "http://localhost:8080/scheduler/nova/external"
     committedResourceController:
-      # Back-off interval while CommittedResource placement is pending or failed
+      # Back-off interval while CommittedResource placement is pending or failed (base for exponential backoff)
       requeueIntervalRetry: "1m"
+      # Upper bound (cap) on the exponential retry back-off delay
+      maxRequeueInterval: "30m"
     committedResourceAPI:
       # Timeout for watching CommittedResource CRDs before rolling back
       watchTimeout: "10s"
@@ -196,6 +198,11 @@ cortex-scheduling-controllers:
             handlesCommitments: false
             hasCapacity: true
             hasQuota: false
+    committedResourceUsageReconciler:
+      # Minimum time between usage reconcile runs for the same CommittedResource.
+      # Also acts as the periodic fallback interval: a successful reconcile schedules
+      # the next run after this duration, which bounds how stale the status can become.
+      cooldownInterval: "5m"
     # OvercommitMappings is a list of mappings that map hypervisor traits to
     # overcommit ratios. Note that this list is applied in order, so if there
     # are multiple mappings applying to the same hypervisors, the last mapping