From 94ec411e73a69c58a8de4859cd31998a7a9e7980 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Tue, 28 Apr 2026 13:22:18 +0200 Subject: [PATCH 01/10] feat: Add quota crd and update quota api --- api/v1alpha1/project_quota_types.go | 122 ++++++ api/v1alpha1/zz_generated.deepcopy.go | 158 ++++++++ .../crds/cortex.cloud_projectquotas.yaml | 212 +++++++++++ .../reservations/commitments/api/handler.go | 3 + .../reservations/commitments/api/info.go | 12 +- .../reservations/commitments/api/info_test.go | 43 ++- .../reservations/commitments/api/quota.go | 156 +++++++- .../commitments/api/quota_monitor.go | 47 +++ .../commitments/api/quota_test.go | 354 ++++++++++++++++++ .../reservations/commitments/config.go | 6 + .../reservations/commitments/usage.go | 17 +- 11 files changed, 1113 insertions(+), 17 deletions(-) create mode 100644 api/v1alpha1/project_quota_types.go create mode 100644 helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml create mode 100644 internal/scheduling/reservations/commitments/api/quota_monitor.go create mode 100644 internal/scheduling/reservations/commitments/api/quota_test.go diff --git a/api/v1alpha1/project_quota_types.go b/api/v1alpha1/project_quota_types.go new file mode 100644 index 000000000..cf61585c6 --- /dev/null +++ b/api/v1alpha1/project_quota_types.go @@ -0,0 +1,122 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ResourceQuota holds the quota for a single resource with per-AZ breakdown. +// Maps to liquid.ResourceQuotaRequest from the LIQUID API. +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ResourceQuotaRequest +type ResourceQuota struct { + // Quota is the total quota across all AZs (for compatibility). + // Corresponds to liquid.ResourceQuotaRequest.Quota. + // +kubebuilder:validation:Required + Quota int64 `json:"quota"` + + // PerAZ holds the per-availability-zone quota breakdown. 
+ // Key: availability zone name, Value: quota for that AZ. + // Only populated for AZSeparatedTopology resources. + // Corresponds to liquid.ResourceQuotaRequest.PerAZ[az].Quota. + // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest + // +kubebuilder:validation:Optional + PerAZ map[string]int64 `json:"perAZ,omitempty"` +} + +// ResourceQuotaUsage holds per-AZ PAYG usage for a single resource. +type ResourceQuotaUsage struct { + // PerAZ holds per-availability-zone PAYG usage values. + // Key: availability zone name, Value: PAYG usage in that AZ. + // +kubebuilder:validation:Optional + PerAZ map[string]int64 `json:"perAZ,omitempty"` +} + +// ProjectQuotaSpec defines the desired state of ProjectQuota. +// Populated from PUT /v1/projects/:uuid/quota payloads (liquid.ServiceQuotaRequest). +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest +type ProjectQuotaSpec struct { + // ProjectID of the OpenStack project this quota belongs to. + // Corresponds to the :uuid in the PUT URL path. + // +kubebuilder:validation:Required + ProjectID string `json:"projectID"` + + // ProjectName is the human-readable name of the OpenStack project. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Name. + // +kubebuilder:validation:Optional + ProjectName string `json:"projectName,omitempty"` + + // DomainID of the OpenStack domain this project belongs to. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.UUID. + // +kubebuilder:validation:Required + DomainID string `json:"domainID"` + + // DomainName is the human-readable name of the OpenStack domain. + // Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.Name. + // +kubebuilder:validation:Optional + DomainName string `json:"domainName,omitempty"` + + // Quota maps LIQUID resource names to their per-AZ quota. + // Key: liquid.ResourceName (e.g. 
"hw_version_hana_v2_ram") + // Mirrors liquid.ServiceQuotaRequest.Resources with AZSeparatedTopology. + // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + // +kubebuilder:validation:Optional + Quota map[string]ResourceQuota `json:"quota,omitempty"` +} + +// ProjectQuotaStatus defines the observed state of ProjectQuota. +// Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. +// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport +type ProjectQuotaStatus struct { + // PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + // Key: liquid.ResourceName + // +kubebuilder:validation:Optional + PaygUsage map[string]ResourceQuotaUsage `json:"paygUsage,omitempty"` + + // LastReconcileAt is when the controller last reconciled this project's quota. + // +kubebuilder:validation:Optional + LastReconcileAt *metav1.Time `json:"lastReconcileAt,omitempty"` + + // Conditions holds the current status conditions. + // +kubebuilder:validation:Optional + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster +// +kubebuilder:printcolumn:name="Project",type="string",JSONPath=".spec.projectID" +// +kubebuilder:printcolumn:name="Domain",type="string",JSONPath=".spec.domainID" +// +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" +// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" + +// ProjectQuota is the Schema for the projectquotas API. +// It persists quota values pushed by Limes via the LIQUID quota endpoint +// (PUT /v1/projects/:uuid/quota → liquid.ServiceQuotaRequest). 
+// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest +type ProjectQuota struct { + metav1.TypeMeta `json:",inline"` + + // +optional + metav1.ObjectMeta `json:"metadata,omitempty,omitzero"` + + // +required + Spec ProjectQuotaSpec `json:"spec"` + + // +optional + Status ProjectQuotaStatus `json:"status,omitempty,omitzero"` +} + +// +kubebuilder:object:root=true + +// ProjectQuotaList contains a list of ProjectQuota +type ProjectQuotaList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ProjectQuota `json:"items"` +} + +func init() { + SchemeBuilder.Register(&ProjectQuota{}, &ProjectQuotaList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index d9daa7aab..1a4bc222a 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1420,6 +1420,120 @@ func (in *PlacementDatasource) DeepCopy() *PlacementDatasource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProjectQuota) DeepCopyInto(out *ProjectQuota) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuota. +func (in *ProjectQuota) DeepCopy() *ProjectQuota { + if in == nil { + return nil + } + out := new(ProjectQuota) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ProjectQuota) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ProjectQuotaList) DeepCopyInto(out *ProjectQuotaList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ProjectQuota, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaList. +func (in *ProjectQuotaList) DeepCopy() *ProjectQuotaList { + if in == nil { + return nil + } + out := new(ProjectQuotaList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ProjectQuotaList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProjectQuotaSpec) DeepCopyInto(out *ProjectQuotaSpec) { + *out = *in + if in.Quota != nil { + in, out := &in.Quota, &out.Quota + *out = make(map[string]ResourceQuota, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaSpec. +func (in *ProjectQuotaSpec) DeepCopy() *ProjectQuotaSpec { + if in == nil { + return nil + } + out := new(ProjectQuotaSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ProjectQuotaStatus) DeepCopyInto(out *ProjectQuotaStatus) { + *out = *in + if in.PaygUsage != nil { + in, out := &in.PaygUsage, &out.PaygUsage + *out = make(map[string]ResourceQuotaUsage, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } + if in.LastReconcileAt != nil { + in, out := &in.LastReconcileAt, &out.LastReconcileAt + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProjectQuotaStatus. +func (in *ProjectQuotaStatus) DeepCopy() *ProjectQuotaStatus { + if in == nil { + return nil + } + out := new(ProjectQuotaStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PrometheusDatasource) DeepCopyInto(out *PrometheusDatasource) { *out = *in @@ -1570,6 +1684,50 @@ func (in *ReservationStatus) DeepCopy() *ReservationStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceQuota) DeepCopyInto(out *ResourceQuota) { + *out = *in + if in.PerAZ != nil { + in, out := &in.PerAZ, &out.PerAZ + *out = make(map[string]int64, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQuota. +func (in *ResourceQuota) DeepCopy() *ResourceQuota { + if in == nil { + return nil + } + out := new(ResourceQuota) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ResourceQuotaUsage) DeepCopyInto(out *ResourceQuotaUsage) { + *out = *in + if in.PerAZ != nil { + in, out := &in.PerAZ, &out.PerAZ + *out = make(map[string]int64, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceQuotaUsage. +func (in *ResourceQuotaUsage) DeepCopy() *ResourceQuotaUsage { + if in == nil { + return nil + } + out := new(ResourceQuotaUsage) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingHistoryEntry) DeepCopyInto(out *SchedulingHistoryEntry) { *out = *in diff --git a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml new file mode 100644 index 000000000..07e39aaa0 --- /dev/null +++ b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml @@ -0,0 +1,212 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.1 + name: projectquotas.cortex.cloud +spec: + group: cortex.cloud + names: + kind: ProjectQuota + listKind: ProjectQuotaList + plural: projectquotas + singular: projectquota + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.projectID + name: Project + type: string + - jsonPath: .spec.domainID + name: Domain + type: string + - jsonPath: .status.lastReconcileAt + name: LastReconcile + type: date + - jsonPath: .status.conditions[?(@.type=='Ready')].status + name: Ready + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + ProjectQuota is the Schema for the projectquotas API. + It persists quota values pushed by Limes via the LIQUID quota endpoint + (PUT /v1/projects/:uuid/quota → liquid.ServiceQuotaRequest). 
+ See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + ProjectQuotaSpec defines the desired state of ProjectQuota. + Populated from PUT /v1/projects/:uuid/quota payloads (liquid.ServiceQuotaRequest). + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + properties: + domainID: + description: |- + DomainID of the OpenStack domain this project belongs to. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.UUID. + type: string + domainName: + description: |- + DomainName is the human-readable name of the OpenStack domain. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Domain.Name. + type: string + projectID: + description: |- + ProjectID of the OpenStack project this quota belongs to. + Corresponds to the :uuid in the PUT URL path. + type: string + projectName: + description: |- + ProjectName is the human-readable name of the OpenStack project. + Extracted from liquid.ServiceQuotaRequest.ProjectMetadata.Name. + type: string + quota: + additionalProperties: + description: |- + ResourceQuota holds the quota for a single resource with per-AZ breakdown. + Maps to liquid.ResourceQuotaRequest from the LIQUID API. 
+ See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ResourceQuotaRequest + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds the per-availability-zone quota breakdown. + Key: availability zone name, Value: quota for that AZ. + Only populated for AZSeparatedTopology resources. + Corresponds to liquid.ResourceQuotaRequest.PerAZ[az].Quota. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceQuotaRequest + type: object + quota: + description: |- + Quota is the total quota across all AZs (for compatibility). + Corresponds to liquid.ResourceQuotaRequest.Quota. + format: int64 + type: integer + required: + - quota + type: object + description: |- + Quota maps LIQUID resource names to their per-AZ quota. + Key: liquid.ResourceName (e.g. "hw_version_hana_v2_ram") + Mirrors liquid.ServiceQuotaRequest.Resources with AZSeparatedTopology. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#ServiceQuotaRequest + type: object + required: + - domainID + - projectID + type: object + status: + description: |- + ProjectQuotaStatus defines the observed state of ProjectQuota. + Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. + See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport + properties: + conditions: + description: Conditions holds the current status conditions. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. 
+ format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + lastReconcileAt: + description: LastReconcileAt is when the controller last reconciled + this project's quota. + format: date-time + type: string + paygUsage: + additionalProperties: + description: ResourceQuotaUsage holds per-AZ PAYG usage for a single + resource. + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds per-availability-zone PAYG usage values. 
+ Key: availability zone name, Value: PAYG usage in that AZ. + type: object + type: object + description: |- + PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + Key: liquid.ResourceName + type: object + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/internal/scheduling/reservations/commitments/api/handler.go b/internal/scheduling/reservations/commitments/api/handler.go index f0eb24110..413bad31f 100644 --- a/internal/scheduling/reservations/commitments/api/handler.go +++ b/internal/scheduling/reservations/commitments/api/handler.go @@ -26,6 +26,7 @@ type HTTPAPI struct { usageMonitor ReportUsageAPIMonitor capacityMonitor ReportCapacityAPIMonitor infoMonitor InfoAPIMonitor + quotaMonitor QuotaAPIMonitor // Mutex to serialize change-commitments requests changeMutex sync.Mutex } @@ -44,6 +45,7 @@ func NewAPIWithConfig(k8sClient client.Client, config commitments.Config, usageD usageMonitor: NewReportUsageAPIMonitor(), capacityMonitor: NewReportCapacityAPIMonitor(), infoMonitor: NewInfoAPIMonitor(), + quotaMonitor: NewQuotaAPIMonitor(), } } @@ -52,6 +54,7 @@ func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer, log registry.MustRegister(&api.usageMonitor) registry.MustRegister(&api.capacityMonitor) registry.MustRegister(&api.infoMonitor) + registry.MustRegister(&api.quotaMonitor) mux.HandleFunc("/commitments/v1/change-commitments", api.HandleChangeCommitments) mux.HandleFunc("/commitments/v1/report-capacity", api.HandleReportCapacity) mux.HandleFunc("/commitments/v1/info", api.HandleInfo) diff --git a/internal/scheduling/reservations/commitments/api/info.go b/internal/scheduling/reservations/commitments/api/info.go index 6999b38d6..f16f71301 100644 --- a/internal/scheduling/reservations/commitments/api/info.go +++ b/internal/scheduling/reservations/commitments/api/info.go @@ -151,6 +151,12 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger 
logr.Logger) (l return liquid.ServiceInfo{}, fmt.Errorf("%w: failed to create unit for flavor group %q: %w", errInternalServiceInfo, groupName, err) } + // Determine topology: AZSeparatedTopology only for groups that accept commitments + // (AZSeparatedTopology means quota is also AZ-aware, required when HasQuota=true) + ramTopology := liquid.AZAwareTopology + if handlesCommitments { + ramTopology = liquid.AZSeparatedTopology + } resources[ramResourceName] = liquid.ResourceInfo{ DisplayName: fmt.Sprintf( "multiples of %d MiB (usable by: %s)", @@ -158,10 +164,10 @@ func (api *HTTPAPI) buildServiceInfo(ctx context.Context, logger logr.Logger) (l flavorListStr, ), Unit: ramUnit, // Non-standard unit: multiples of smallest flavor RAM - Topology: liquid.AZAwareTopology, + Topology: ramTopology, NeedsResourceDemand: false, - HasCapacity: true, // We report capacity via /commitments/v1/report-capacity - HasQuota: false, + HasCapacity: true, // We report capacity via /commitments/v1/report-capacity + HasQuota: handlesCommitments, // true only for groups that accept commitments HandlesCommitments: handlesCommitments, // Only groups with fixed ratio accept commitments Attributes: attrsJSON, } diff --git a/internal/scheduling/reservations/commitments/api/info_test.go b/internal/scheduling/reservations/commitments/api/info_test.go index 48e12fd2c..3ca0bd11c 100644 --- a/internal/scheduling/reservations/commitments/api/info_test.go +++ b/internal/scheduling/reservations/commitments/api/info_test.go @@ -224,7 +224,7 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { t.Fatalf("expected 6 resources (3 per flavor group), got %d", len(serviceInfo.Resources)) } - // Test RAM resource: hw_version_hana_fixed_ram + // Test RAM resource: hw_version_hana_fixed_ram (fixed ratio → commitments + quota) ramResource, ok := serviceInfo.Resources["hw_version_hana_fixed_ram"] if !ok { t.Fatal("expected hw_version_hana_fixed_ram resource to exist") @@ -235,8 +235,14 @@ func 
TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if !ramResource.HandlesCommitments { t.Error("hw_version_hana_fixed_ram: expected HandlesCommitments=true (RAM is primary commitment resource)") } + if ramResource.Topology != liquid.AZSeparatedTopology { + t.Errorf("hw_version_hana_fixed_ram: expected Topology=%q, got %q", liquid.AZSeparatedTopology, ramResource.Topology) + } + if !ramResource.HasQuota { + t.Error("hw_version_hana_fixed_ram: expected HasQuota=true (fixed ratio groups accept quotas)") + } - // Test Cores resource: hw_version_hana_fixed_cores + // Test Cores resource: hw_version_hana_fixed_cores (always AZAwareTopology, no quota) coresResource, ok := serviceInfo.Resources["hw_version_hana_fixed_cores"] if !ok { t.Fatal("expected hw_version_hana_fixed_cores resource to exist") @@ -247,8 +253,14 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if coresResource.HandlesCommitments { t.Error("hw_version_hana_fixed_cores: expected HandlesCommitments=false (cores are derived)") } + if coresResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_hana_fixed_cores: expected Topology=%q, got %q", liquid.AZAwareTopology, coresResource.Topology) + } + if coresResource.HasQuota { + t.Error("hw_version_hana_fixed_cores: expected HasQuota=false") + } - // Test Instances resource: hw_version_hana_fixed_instances + // Test Instances resource: hw_version_hana_fixed_instances (always AZAwareTopology, no quota) instancesResource, ok := serviceInfo.Resources["hw_version_hana_fixed_instances"] if !ok { t.Fatal("expected hw_version_hana_fixed_instances resource to exist") @@ -259,8 +271,15 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if instancesResource.HandlesCommitments { t.Error("hw_version_hana_fixed_instances: expected HandlesCommitments=false (instances are derived)") } + if instancesResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_hana_fixed_instances: expected 
Topology=%q, got %q", liquid.AZAwareTopology, instancesResource.Topology) + } + if instancesResource.HasQuota { + t.Error("hw_version_hana_fixed_instances: expected HasQuota=false") + } // Variable ratio group DOES have resources now, but HandlesCommitments=false for RAM + // Variable ratio → AZAwareTopology, no quota v2RamResource, ok := serviceInfo.Resources["hw_version_v2_variable_ram"] if !ok { t.Fatal("expected hw_version_v2_variable_ram resource to exist (all groups included)") @@ -271,6 +290,12 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if v2RamResource.HandlesCommitments { t.Error("hw_version_v2_variable_ram: expected HandlesCommitments=false (variable ratio)") } + if v2RamResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_v2_variable_ram: expected Topology=%q, got %q", liquid.AZAwareTopology, v2RamResource.Topology) + } + if v2RamResource.HasQuota { + t.Error("hw_version_v2_variable_ram: expected HasQuota=false (variable ratio)") + } v2CoresResource, ok := serviceInfo.Resources["hw_version_v2_variable_cores"] if !ok { @@ -282,6 +307,12 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if v2CoresResource.HandlesCommitments { t.Error("hw_version_v2_variable_cores: expected HandlesCommitments=false") } + if v2CoresResource.Topology != liquid.AZAwareTopology { + t.Errorf("hw_version_v2_variable_cores: expected Topology=%q, got %q", liquid.AZAwareTopology, v2CoresResource.Topology) + } + if v2CoresResource.HasQuota { + t.Error("hw_version_v2_variable_cores: expected HasQuota=false") + } v2InstancesResource, ok := serviceInfo.Resources["hw_version_v2_variable_instances"] if !ok { @@ -293,4 +324,10 @@ func TestHandleInfo_HasCapacityEqualsHandlesCommitments(t *testing.T) { if v2InstancesResource.HandlesCommitments { t.Error("hw_version_v2_variable_instances: expected HandlesCommitments=false") } + if v2InstancesResource.Topology != liquid.AZAwareTopology { + 
t.Errorf("hw_version_v2_variable_instances: expected Topology=%q, got %q", liquid.AZAwareTopology, v2InstancesResource.Topology) + } + if v2InstancesResource.HasQuota { + t.Error("hw_version_v2_variable_instances: expected HasQuota=false") + } } diff --git a/internal/scheduling/reservations/commitments/api/quota.go b/internal/scheduling/reservations/commitments/api/quota.go index c77fdf1a6..37b57d22a 100644 --- a/internal/scheduling/reservations/commitments/api/quota.go +++ b/internal/scheduling/reservations/commitments/api/quota.go @@ -4,19 +4,35 @@ package api import ( + "encoding/json" + "fmt" + "math" "net/http" + "strconv" + "time" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/google/uuid" + "github.com/sapcc/go-api-declarations/liquid" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" ) +// projectQuotaCRDName returns the CRD object name for a given project UUID. +// Convention: "quota-" +func projectQuotaCRDName(projectID string) string { + return "quota-" + projectID +} + // HandleQuota implements PUT /commitments/v1/projects/:project_id/quota from Limes LIQUID API. // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // -// This is a no-op endpoint that accepts quota requests but doesn't store them. -// Cortex does not enforce quotas for committed resources - quota enforcement -// happens through commitment validation at change-commitments time. -// The endpoint exists for API compatibility with the LIQUID specification. +// This endpoint receives quota requests from Limes and persists them as ProjectQuota CRDs. +// One CRD per project, named "quota-". 
func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { + startTime := time.Now() + // Extract or generate request ID for tracing requestID := r.Header.Get("X-Request-ID") if requestID == "" { @@ -27,14 +43,138 @@ func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { log := apiLog.WithValues("requestID", requestID, "endpoint", "quota") if r.Method != http.MethodPut { - http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + api.quotaError(w, http.StatusMethodNotAllowed, "Method not allowed", startTime) + return + } + + // Check if quota API is enabled + if !api.config.EnableQuotaAPI { + api.quotaError(w, http.StatusServiceUnavailable, "Quota API is disabled", startTime) + return + } + + // Extract project UUID from URL path + projectID, err := extractProjectIDFromPath(r.URL.Path) + if err != nil { + log.Error(err, "failed to extract project ID from path") + api.quotaError(w, http.StatusBadRequest, "Invalid URL path: "+err.Error(), startTime) + return + } + + // Parse request body + var req liquid.ServiceQuotaRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + log.Error(err, "failed to decode quota request body") + api.quotaError(w, http.StatusBadRequest, "Invalid request body: "+err.Error(), startTime) return } - // No-op: Accept the quota request but don't store it - // Cortex handles capacity through commitments, not quotas - log.V(1).Info("received quota request (no-op)", "path", r.URL.Path) + // Extract project/domain metadata if available + var projectName, domainID, domainName string + if meta, ok := req.ProjectMetadata.Unpack(); ok { + // Consistency check: metadata UUID must match URL path UUID + if meta.UUID != "" && meta.UUID != projectID { + log.Info("project UUID mismatch", "urlProjectID", projectID, "metadataUUID", meta.UUID) + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Project UUID mismatch: URL has %q but metadata has %q", projectID, meta.UUID), startTime) + return + } + 
projectName = meta.Name + domainID = meta.Domain.UUID + domainName = meta.Domain.Name + } + + // Build the spec quota map from the liquid request. + // liquid API uses uint64; our CRD uses int64 (K8s convention). + // Guard against overflow: uint64 values > MaxInt64 would wrap to negative. + specQuota := make(map[string]v1alpha1.ResourceQuota, len(req.Resources)) + for resourceName, resQuota := range req.Resources { + if resQuota.Quota > math.MaxInt64 { + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q exceeds int64 max", resourceName), startTime) + return + } + rq := v1alpha1.ResourceQuota{ + Quota: int64(resQuota.Quota), + } + if len(resQuota.PerAZ) > 0 { + rq.PerAZ = make(map[string]int64, len(resQuota.PerAZ)) + for az, azQuota := range resQuota.PerAZ { + if azQuota.Quota > math.MaxInt64 { + api.quotaError(w, http.StatusBadRequest, fmt.Sprintf("Quota value for resource %q in AZ %q exceeds int64 max", resourceName, az), startTime) + return + } + rq.PerAZ[string(az)] = int64(azQuota.Quota) + } + } + specQuota[string(resourceName)] = rq + } + + // Create or update ProjectQuota CRD + crdName := projectQuotaCRDName(projectID) + ctx := r.Context() + + var existing v1alpha1.ProjectQuota + err = api.client.Get(ctx, client.ObjectKey{Name: crdName}, &existing) + if err != nil { + if !apierrors.IsNotFound(err) { + // Real error + log.Error(err, "failed to get existing ProjectQuota", "name", crdName) + api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to check existing quota: %v", err), startTime) + return + } + // Not found — create new + pq := &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{ + Name: crdName, + }, + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: projectID, + ProjectName: projectName, + DomainID: domainID, + DomainName: domainName, + Quota: specQuota, + }, + } + if err := api.client.Create(ctx, pq); err != nil { + log.Error(err, "failed to create ProjectQuota", "name", crdName) + api.quotaError(w, 
http.StatusInternalServerError, fmt.Sprintf("Failed to create quota: %v", err), startTime) + return + } + log.V(1).Info("created ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) + } else { + // Update existing + existing.Spec.Quota = specQuota + if projectName != "" { + existing.Spec.ProjectName = projectName + } + if domainID != "" { + existing.Spec.DomainID = domainID + } + if domainName != "" { + existing.Spec.DomainName = domainName + } + if err := api.client.Update(ctx, &existing); err != nil { + log.Error(err, "failed to update ProjectQuota", "name", crdName) + api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to update quota: %v", err), startTime) + return + } + log.V(1).Info("updated ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) + } // Return 204 No Content as expected by the LIQUID API w.WriteHeader(http.StatusNoContent) + api.recordQuotaMetrics(http.StatusNoContent, startTime) +} + +// quotaError writes an HTTP error response and records metrics. Used for error paths in HandleQuota. +func (api *HTTPAPI) quotaError(w http.ResponseWriter, statusCode int, msg string, startTime time.Time) { + http.Error(w, msg, statusCode) + api.recordQuotaMetrics(statusCode, startTime) +} + +// recordQuotaMetrics records Prometheus metrics for a quota API request. 
+func (api *HTTPAPI) recordQuotaMetrics(statusCode int, startTime time.Time) { + duration := time.Since(startTime).Seconds() + statusCodeStr := strconv.Itoa(statusCode) + api.quotaMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc() + api.quotaMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration) } diff --git a/internal/scheduling/reservations/commitments/api/quota_monitor.go b/internal/scheduling/reservations/commitments/api/quota_monitor.go new file mode 100644 index 000000000..c06d4b788 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api/quota_monitor.go @@ -0,0 +1,47 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package api + +import "github.com/prometheus/client_golang/prometheus" + +// QuotaAPIMonitor provides metrics for the CR quota API. +type QuotaAPIMonitor struct { + requestCounter *prometheus.CounterVec + requestDuration *prometheus.HistogramVec +} + +// NewQuotaAPIMonitor creates a new monitor with Prometheus metrics. +// Metrics are pre-initialized with zero values for common HTTP status codes +// to ensure they appear in Prometheus before the first request. 
+func NewQuotaAPIMonitor() QuotaAPIMonitor { + m := QuotaAPIMonitor{ + requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_committed_resource_quota_api_requests_total", + Help: "Total number of quota API requests by status code.", + }, []string{"status_code"}), + requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "cortex_committed_resource_quota_api_request_duration_seconds", + Help: "Duration of quota API requests in seconds.", + Buckets: prometheus.DefBuckets, + }, []string{"status_code"}), + } + // Pre-initialize common status codes so they appear in Prometheus before the first request + for _, statusCode := range []string{"204", "400", "405", "500"} { + m.requestCounter.WithLabelValues(statusCode) + m.requestDuration.WithLabelValues(statusCode) + } + return m +} + +// Describe implements prometheus.Collector. +func (m *QuotaAPIMonitor) Describe(ch chan<- *prometheus.Desc) { + m.requestCounter.Describe(ch) + m.requestDuration.Describe(ch) +} + +// Collect implements prometheus.Collector. 
+func (m *QuotaAPIMonitor) Collect(ch chan<- prometheus.Metric) { + m.requestCounter.Collect(ch) + m.requestDuration.Collect(ch) +} diff --git a/internal/scheduling/reservations/commitments/api/quota_test.go b/internal/scheduling/reservations/commitments/api/quota_test.go new file mode 100644 index 000000000..218bc0815 --- /dev/null +++ b/internal/scheduling/reservations/commitments/api/quota_test.go @@ -0,0 +1,354 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package api + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" + "github.com/majewsky/gg/option" + "github.com/sapcc/go-api-declarations/liquid" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// newTestScheme returns a scheme with v1alpha1 types registered. +func newTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add scheme: %v", err) + } + return scheme +} + +// marshalQuotaReq marshals a ServiceQuotaRequest, failing the test on error. 
+func marshalQuotaReq(t *testing.T, req liquid.ServiceQuotaRequest) []byte { + t.Helper() + body, err := json.Marshal(req) + if err != nil { + t.Fatalf("failed to marshal request: %v", err) + } + return body +} + +func TestHandleQuota_ErrorCases(t *testing.T) { + tests := []struct { + name string + method string + path string + body []byte + metadata *liquid.ProjectMetadata + enableQuota *bool // nil = default (enabled) + expectedStatus int + }{ + { + name: "MethodNotAllowed_GET", + method: http.MethodGet, + path: "/commitments/v1/projects/project-abc/quota", + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + }, + { + name: "MethodNotAllowed_POST", + method: http.MethodPost, + path: "/commitments/v1/projects/project-abc/quota", + body: nil, + expectedStatus: http.StatusMethodNotAllowed, + }, + { + name: "DisabledAPI", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte(`{"resources":{}}`), + enableQuota: boolPtr(false), + expectedStatus: http.StatusServiceUnavailable, + }, + { + name: "InvalidBody", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte("{invalid"), + expectedStatus: http.StatusBadRequest, + }, + { + name: "EmptyBody", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + body: []byte(""), + expectedStatus: http.StatusBadRequest, + }, + { + name: "UUIDMismatch", + method: http.MethodPut, + path: "/commitments/v1/projects/project-abc/quota", + metadata: &liquid.ProjectMetadata{ + UUID: "different-uuid", + Name: "my-project", + Domain: liquid.DomainMetadata{UUID: "domain-123", Name: "my-domain"}, + }, + expectedStatus: http.StatusBadRequest, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + scheme := newTestScheme(t) + k8sClient := fake.NewClientBuilder().WithScheme(scheme).Build() + + var httpAPI *HTTPAPI + if tc.enableQuota != nil && !*tc.enableQuota { + config := commitments.DefaultConfig() + 
config.EnableQuotaAPI = false + httpAPI = NewAPIWithConfig(k8sClient, config, nil) + } else { + httpAPI = NewAPI(k8sClient) + } + + // Build body: use provided bytes or construct from metadata + var bodyReader *bytes.Reader + switch { + case tc.body != nil: + bodyReader = bytes.NewReader(tc.body) + case tc.metadata != nil: + quotaReq := liquid.ServiceQuotaRequest{ + Resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 100}, + }, + } + quotaReq.ProjectMetadata = option.Some(*tc.metadata) + bodyReader = bytes.NewReader(marshalQuotaReq(t, quotaReq)) + default: + bodyReader = bytes.NewReader([]byte{}) + } + + req := httptest.NewRequest(tc.method, tc.path, bodyReader) + w := httptest.NewRecorder() + + httpAPI.HandleQuota(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != tc.expectedStatus { + t.Errorf("expected status %d, got %d", tc.expectedStatus, resp.StatusCode) + } + }) + } +} + +func TestHandleQuota_CreateAndUpdate(t *testing.T) { + tests := []struct { + name string + // existing is a pre-existing CRD to seed (nil = create, non-nil = update) + existing *v1alpha1.ProjectQuota + projectID string + resources map[liquid.ResourceName]liquid.ResourceQuotaRequest + metadata *liquid.ProjectMetadata + expectQuota map[string]int64 // resource name → expected total quota + expectPerAZ map[string]map[string]int64 // resource name → az → expected quota + expectName string + expectDomain string + expectDomName string + }{ + { + name: "Create_WithPerAZ", + projectID: "project-abc-123", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": { + Quota: 100, + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 60}, + "az-b": {Quota: 40}, + }, + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 100}, + expectPerAZ: map[string]map[string]int64{ + "hw_version_hana_1_ram": {"az-a": 60, "az-b": 40}, + }, + }, + { + name: 
"Create_EmptyResources", + projectID: "project-empty", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{}, + expectQuota: map[string]int64{}, + }, + { + name: "Create_WithMetadata", + projectID: "project-meta-test", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 50}, + }, + metadata: &liquid.ProjectMetadata{ + UUID: "project-meta-test", + Name: "my-project-name", + Domain: liquid.DomainMetadata{ + UUID: "domain-uuid-456", + Name: "my-domain-name", + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 50}, + expectName: "my-project-name", + expectDomain: "domain-uuid-456", + expectDomName: "my-domain-name", + }, + { + name: "Update_QuotaValues", + existing: &v1alpha1.ProjectQuota{ + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-xyz", + DomainID: "original-domain", + DomainName: "original-domain-name", + ProjectName: "original-project-name", + Quota: map[string]v1alpha1.ResourceQuota{ + "hw_version_hana_1_ram": {Quota: 50, PerAZ: map[string]int64{"az-a": 50}}, + }, + }, + }, + projectID: "project-xyz", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": { + Quota: 200, + PerAZ: map[liquid.AvailabilityZone]liquid.AZResourceQuotaRequest{ + "az-a": {Quota: 120}, + "az-b": {Quota: 80}, + }, + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 200}, + expectPerAZ: map[string]map[string]int64{ + "hw_version_hana_1_ram": {"az-a": 120, "az-b": 80}, + }, + // Metadata should be preserved when not provided in update + expectDomain: "original-domain", + expectDomName: "original-domain-name", + expectName: "original-project-name", + }, + { + name: "Update_WithNewMetadata", + existing: &v1alpha1.ProjectQuota{ + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: "project-update-meta", + DomainID: "old-domain", + DomainName: "old-domain-name", + ProjectName: "old-project-name", + Quota: map[string]v1alpha1.ResourceQuota{ + 
"hw_version_hana_1_ram": {Quota: 10}, + }, + }, + }, + projectID: "project-update-meta", + resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ + "hw_version_hana_1_ram": {Quota: 99}, + }, + metadata: &liquid.ProjectMetadata{ + UUID: "project-update-meta", + Name: "new-project-name", + Domain: liquid.DomainMetadata{ + UUID: "new-domain", + Name: "new-domain-name", + }, + }, + expectQuota: map[string]int64{"hw_version_hana_1_ram": 99}, + expectName: "new-project-name", + expectDomain: "new-domain", + expectDomName: "new-domain-name", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + scheme := newTestScheme(t) + builder := fake.NewClientBuilder().WithScheme(scheme) + + if tc.existing != nil { + tc.existing.Name = projectQuotaCRDName(tc.projectID) + builder = builder.WithObjects(tc.existing) + } + k8sClient := builder.Build() + httpAPI := NewAPI(k8sClient) + + quotaReq := liquid.ServiceQuotaRequest{ + Resources: tc.resources, + } + if tc.metadata != nil { + quotaReq.ProjectMetadata = option.Some(*tc.metadata) + } + body := marshalQuotaReq(t, quotaReq) + + path := "/commitments/v1/projects/" + tc.projectID + "/quota" + req := httptest.NewRequest(http.MethodPut, path, bytes.NewReader(body)) + w := httptest.NewRecorder() + + httpAPI.HandleQuota(w, req) + + resp := w.Result() + defer resp.Body.Close() + + if resp.StatusCode != http.StatusNoContent { + t.Fatalf("expected status %d (No Content), got %d", http.StatusNoContent, resp.StatusCode) + } + + // Verify the ProjectQuota CRD + var pq v1alpha1.ProjectQuota + crdName := projectQuotaCRDName(tc.projectID) + if err := k8sClient.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + t.Fatalf("failed to get ProjectQuota CRD %q: %v", crdName, err) + } + + if pq.Spec.ProjectID != tc.projectID { + t.Errorf("expected ProjectID %q, got %q", tc.projectID, pq.Spec.ProjectID) + } + + // Verify quota totals + for resName, expectedTotal := range tc.expectQuota { + 
actual, ok := pq.Spec.Quota[resName] + if !ok { + t.Errorf("expected resource %q in quota spec", resName) + continue + } + if actual.Quota != expectedTotal { + t.Errorf("resource %q: expected quota %d, got %d", resName, expectedTotal, actual.Quota) + } + } + + // Verify per-AZ quotas + for resName, azMap := range tc.expectPerAZ { + actual, ok := pq.Spec.Quota[resName] + if !ok { + t.Errorf("expected resource %q in quota spec for per-AZ check", resName) + continue + } + for az, expectedAZ := range azMap { + if actual.PerAZ[az] != expectedAZ { + t.Errorf("resource %q AZ %q: expected %d, got %d", resName, az, expectedAZ, actual.PerAZ[az]) + } + } + } + + // Verify metadata + if tc.expectName != "" && pq.Spec.ProjectName != tc.expectName { + t.Errorf("expected ProjectName %q, got %q", tc.expectName, pq.Spec.ProjectName) + } + if tc.expectDomain != "" && pq.Spec.DomainID != tc.expectDomain { + t.Errorf("expected DomainID %q, got %q", tc.expectDomain, pq.Spec.DomainID) + } + if tc.expectDomName != "" && pq.Spec.DomainName != tc.expectDomName { + t.Errorf("expected DomainName %q, got %q", tc.expectDomName, pq.Spec.DomainName) + } + }) + } +} + +func boolPtr(b bool) *bool { + return &b +} diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index 888d37018..36c3ec00b 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -57,6 +57,11 @@ type Config struct { // When false, the endpoint will return HTTP 503 Service Unavailable. // This can be used as an emergency switch if the capacity reporting is causing issues. EnableReportCapacityAPI bool `json:"committedResourceEnableReportCapacityAPI"` + + // EnableQuotaAPI controls whether the quota API endpoint is active. + // When false, the endpoint will return HTTP 503 Service Unavailable. + // This can be used as an emergency switch if quota persistence is causing issues. 
+ EnableQuotaAPI bool `json:"committedResourceEnableQuotaAPI"` } // ApplyDefaults fills in any unset values with defaults. @@ -103,5 +108,6 @@ func DefaultConfig() Config { EnableChangeCommitmentsAPI: true, EnableReportUsageAPI: true, EnableReportCapacityAPI: true, + EnableQuotaAPI: true, } } diff --git a/internal/scheduling/reservations/commitments/usage.go b/internal/scheduling/reservations/commitments/usage.go index d634fc2a0..14dbfa482 100644 --- a/internal/scheduling/reservations/commitments/usage.go +++ b/internal/scheduling/reservations/commitments/usage.go @@ -471,22 +471,33 @@ func (c *UsageCalculator) buildUsageResponse( } // Build ResourceUsageReport for all flavor groups (not just those with fixed ratio) - for flavorGroupName := range flavorGroups { + for flavorGroupName, groupData := range flavorGroups { // All flavor groups are included in usage reporting. // === 1. RAM Resource === ramResourceName := liquid.ResourceName(ResourceNameRAM(flavorGroupName)) ramPerAZ := make(map[liquid.AvailabilityZone]*liquid.AZResourceUsageReport) + // For AZSeparatedTopology resources (fixed-ratio groups), per-AZ Quota must be non-null. + // Use -1 ("infinite quota") as default until actual quota is read from ProjectQuota CRD. 
+ ramHasAZQuota := FlavorGroupAcceptsCommitments(&groupData) for _, az := range allAZs { - ramPerAZ[az] = &liquid.AZResourceUsageReport{ + report := &liquid.AZResourceUsageReport{ Usage: 0, Subresources: []liquid.Subresource{}, } + if ramHasAZQuota { + report.Quota = Some(int64(-1)) // infinite — will be overridden by ProjectQuota CRD + } + ramPerAZ[az] = report } if azData, exists := usageByFlavorGroupAZ[flavorGroupName]; exists { for az, data := range azData { if _, known := ramPerAZ[az]; !known { - ramPerAZ[az] = &liquid.AZResourceUsageReport{} + report := &liquid.AZResourceUsageReport{} + if ramHasAZQuota { + report.Quota = Some(int64(-1)) + } + ramPerAZ[az] = report } ramPerAZ[az].Usage = data.ramUsage ramPerAZ[az].PhysicalUsage = Some(data.ramUsage) // No overcommit for RAM From 661917648516c55af15c95167dd0aa2f4af69f27 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Wed, 29 Apr 2026 16:47:33 +0200 Subject: [PATCH 02/10] WIP: add quota controller --- .claude/settings.local.json | 8 + api/v1alpha1/project_quota_types.go | 7 + api/v1alpha1/zz_generated.deepcopy.go | 7 + .../crds/cortex.cloud_projectquotas.yaml | 20 + internal/scheduling/external/nova.go | 17 + .../reservations/failover/integration_test.go | 15 + .../reservations/failover/vm_source.go | 65 + .../reservations/failover/vm_source_test.go | 4 + .../scheduling/reservations/quota/config.go | 44 + .../scheduling/reservations/quota/context.go | 27 + .../reservations/quota/controller.go | 950 +++++++++++++ .../reservations/quota/controller_test.go | 598 ++++++++ .../reservations/quota/integration_test.go | 1232 +++++++++++++++++ .../scheduling/reservations/quota/metrics.go | 98 ++ 14 files changed, 3092 insertions(+) create mode 100644 .claude/settings.local.json create mode 100644 internal/scheduling/reservations/quota/config.go create mode 100644 internal/scheduling/reservations/quota/context.go create mode 100644 internal/scheduling/reservations/quota/controller.go create mode 100644 
internal/scheduling/reservations/quota/controller_test.go create mode 100644 internal/scheduling/reservations/quota/integration_test.go create mode 100644 internal/scheduling/reservations/quota/metrics.go diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 000000000..36798fdd7 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,8 @@ +{ + "permissions": { + "allow": [ + "Read(//root/**)", + "Bash(go doc:*)" + ] + } +} diff --git a/api/v1alpha1/project_quota_types.go b/api/v1alpha1/project_quota_types.go index cf61585c6..715b6e728 100644 --- a/api/v1alpha1/project_quota_types.go +++ b/api/v1alpha1/project_quota_types.go @@ -69,7 +69,14 @@ type ProjectQuotaSpec struct { // Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport type ProjectQuotaStatus struct { + // TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). + // Persisted by the quota controller; updated by full reconcile and HV instance diffs. + // Key: liquid.ResourceName + // +kubebuilder:validation:Optional + TotalUsage map[string]ResourceQuotaUsage `json:"totalUsage,omitempty"` + // PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + // Derived as TotalUsage - CRUsage (clamped >= 0). // Key: liquid.ResourceName // +kubebuilder:validation:Optional PaygUsage map[string]ResourceQuotaUsage `json:"paygUsage,omitempty"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 1a4bc222a..873beb73c 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1504,6 +1504,13 @@ func (in *ProjectQuotaSpec) DeepCopy() *ProjectQuotaSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *ProjectQuotaStatus) DeepCopyInto(out *ProjectQuotaStatus) { *out = *in + if in.TotalUsage != nil { + in, out := &in.TotalUsage, &out.TotalUsage + *out = make(map[string]ResourceQuotaUsage, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } if in.PaygUsage != nil { in, out := &in.PaygUsage, &out.PaygUsage *out = make(map[string]ResourceQuotaUsage, len(*in)) diff --git a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml index 07e39aaa0..7bebbdb6c 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml @@ -200,6 +200,26 @@ spec: type: object description: |- PaygUsage tracks per-resource per-AZ pay-as-you-go usage. + Derived as TotalUsage - CRUsage (clamped >= 0). + Key: liquid.ResourceName + type: object + totalUsage: + additionalProperties: + description: ResourceQuotaUsage holds per-AZ PAYG usage for a single + resource. + properties: + perAZ: + additionalProperties: + format: int64 + type: integer + description: |- + PerAZ holds per-availability-zone PAYG usage values. + Key: availability zone name, Value: PAYG usage in that AZ. + type: object + type: object + description: |- + TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). + Persisted by the quota controller; updated by full reconcile and HV instance diffs. 
Key: liquid.ResourceName type: object type: object diff --git a/internal/scheduling/external/nova.go b/internal/scheduling/external/nova.go index b59a37d5b..741c5659c 100644 --- a/internal/scheduling/external/nova.go +++ b/internal/scheduling/external/nova.go @@ -17,6 +17,9 @@ type NovaReaderInterface interface { GetAllFlavors(ctx context.Context) ([]nova.Flavor, error) GetServerByID(ctx context.Context, serverID string) (*nova.Server, error) GetFlavorByName(ctx context.Context, flavorName string) (*nova.Flavor, error) + // GetDeletedServerByID returns a deleted server by its ID from the deleted_servers table. + // Returns nil, nil if the server is not found in the deleted_servers table. + GetDeletedServerByID(ctx context.Context, serverID string) (*nova.DeletedServer, error) } // NovaReader provides read access to Nova data stored in the database. @@ -107,3 +110,17 @@ func (r *NovaReader) GetFlavorByName(ctx context.Context, flavorName string) (*n } return &flavors[0], nil } + +// GetDeletedServerByID returns a deleted Nova server by its ID from the deleted_servers table. +// Returns nil, nil if the server is not found in the deleted_servers table. 
+func (r *NovaReader) GetDeletedServerByID(ctx context.Context, serverID string) (*nova.DeletedServer, error) { + var servers []nova.DeletedServer + query := "SELECT * FROM " + nova.DeletedServer{}.TableName() + " WHERE id = $1" + if err := r.Select(ctx, &servers, query, serverID); err != nil { + return nil, fmt.Errorf("failed to query deleted server by ID: %w", err) + } + if len(servers) == 0 { + return nil, nil + } + return &servers[0], nil +} diff --git a/internal/scheduling/reservations/failover/integration_test.go b/internal/scheduling/reservations/failover/integration_test.go index 66d5733bb..df1354be6 100644 --- a/internal/scheduling/reservations/failover/integration_test.go +++ b/internal/scheduling/reservations/failover/integration_test.go @@ -1068,6 +1068,21 @@ func (s *MockVMSource) GetVM(_ context.Context, vmUUID string) (*VM, error) { return nil, nil } +// IsServerActive returns true if the server is found in the mock VMs. +func (s *MockVMSource) IsServerActive(_ context.Context, vmUUID string) (bool, error) { + for i := range s.VMs { + if s.VMs[i].UUID == vmUUID { + return true, nil + } + } + return false, nil +} + +// GetDeletedVMInfo returns nil, nil (no deleted VMs in mock). +func (s *MockVMSource) GetDeletedVMInfo(_ context.Context, _ string) (*DeletedVMInfo, error) { + return nil, nil +} + // newIntegrationTestEnv creates a complete test environment with HTTP server and VMSource. func newIntegrationTestEnv(t *testing.T, vms []VM, hypervisors []*hv1.Hypervisor, reservations []*v1alpha1.Reservation) *IntegrationTestEnv { t.Helper() diff --git a/internal/scheduling/reservations/failover/vm_source.go b/internal/scheduling/reservations/failover/vm_source.go index 4d5c3f210..bcf935798 100644 --- a/internal/scheduling/reservations/failover/vm_source.go +++ b/internal/scheduling/reservations/failover/vm_source.go @@ -26,6 +26,9 @@ type VM struct { // AvailabilityZone is the availability zone where the VM is located. 
// This is used to ensure failover reservations are created in the same AZ. AvailabilityZone string + // CreatedAt is the ISO 8601 timestamp when the VM was created in Nova. + // Used by the quota controller to distinguish new VMs from migrations. + CreatedAt string // Resources contains the VM's resource allocations (e.g., "memory", "vcpus"). Resources map[string]resource.Quantity // FlavorExtraSpecs contains the flavor's extra specifications (e.g., traits, capabilities). @@ -46,6 +49,22 @@ type VMSource interface { // GetVM returns a specific VM by UUID. // Returns nil, nil if the VM is not found (not an error, just doesn't exist). GetVM(ctx context.Context, vmUUID string) (*VM, error) + // IsServerActive returns true if the server exists in the servers table (still running somewhere). + // Returns false if not found. Used by quota controller to determine if a removed HV instance was deleted vs migrated. + IsServerActive(ctx context.Context, vmUUID string) (bool, error) + // GetDeletedVMInfo returns metadata about a deleted VM (from deleted_servers table), + // including resolved flavor resources. Returns nil, nil if not found. + // Used by quota controller for incremental usage decrements. + GetDeletedVMInfo(ctx context.Context, vmUUID string) (*DeletedVMInfo, error) +} + +// DeletedVMInfo contains the metadata needed to compute resource decrements for a deleted VM. +type DeletedVMInfo struct { + ProjectID string + AvailabilityZone string + FlavorName string + RAMMiB uint64 + VCPUs uint64 } // DBVMSource implements VMSource by reading directly from the database. 
@@ -122,6 +141,7 @@ func (s *DBVMSource) ListVMs(ctx context.Context) ([]VM, error) { ProjectID: server.TenantID, CurrentHypervisor: server.OSEXTSRVATTRHost, AvailabilityZone: server.OSEXTAvailabilityZone, + CreatedAt: server.Created, Resources: resources, FlavorExtraSpecs: extraSpecs, }) @@ -208,6 +228,7 @@ func (s *DBVMSource) GetVM(ctx context.Context, vmUUID string) (*VM, error) { ProjectID: server.TenantID, CurrentHypervisor: server.OSEXTSRVATTRHost, AvailabilityZone: server.OSEXTAvailabilityZone, + CreatedAt: server.Created, Resources: resources, FlavorExtraSpecs: extraSpecs, }, nil @@ -397,6 +418,50 @@ func filterVMsOnKnownHypervisors(vms []VM, hypervisorList *hv1.HypervisorList) [ return result } +// IsServerActive returns true if the server exists in the servers table and is not DELETED. +// VMs in any other status (ACTIVE, SHUTOFF, MIGRATING, ERROR, etc.) still consume resources +// and should NOT be decremented from quota usage. +// Used by the quota controller to distinguish deleted VMs from migrated/existing ones. +func (s *DBVMSource) IsServerActive(ctx context.Context, vmUUID string) (bool, error) { + server, err := s.NovaReader.GetServerByID(ctx, vmUUID) + if err != nil { + return false, fmt.Errorf("failed to check server existence: %w", err) + } + if server == nil { + return false, nil + } + return server.Status != "DELETED", nil +} + +// GetDeletedVMInfo returns metadata about a deleted VM from the deleted_servers table, +// including resolved flavor resources. Returns nil, nil if the VM is not found in deleted_servers. 
+func (s *DBVMSource) GetDeletedVMInfo(ctx context.Context, vmUUID string) (*DeletedVMInfo, error) { + deletedServer, err := s.NovaReader.GetDeletedServerByID(ctx, vmUUID) + if err != nil { + return nil, fmt.Errorf("failed to get deleted server: %w", err) + } + if deletedServer == nil { + return nil, nil + } + + // Resolve the flavor to get RAM/VCPUs + flavor, err := s.NovaReader.GetFlavorByName(ctx, deletedServer.FlavorName) + if err != nil { + return nil, fmt.Errorf("failed to get flavor for deleted server: %w", err) + } + if flavor == nil { + return nil, fmt.Errorf("flavor %q not found for deleted server %s", deletedServer.FlavorName, vmUUID) + } + + return &DeletedVMInfo{ + ProjectID: deletedServer.TenantID, + AvailabilityZone: deletedServer.OSEXTAvailabilityZone, + FlavorName: deletedServer.FlavorName, + RAMMiB: flavor.RAM, + VCPUs: flavor.VCPUs, + }, nil +} + // warnUnknownVMsOnHypervisors logs a warning for VMs that are on hypervisors but not in the ListVMs (i.e. nova) result. // This can indicate a data sync issue between the hypervisor operator and the VM datasource. 
func warnUnknownVMsOnHypervisors(hypervisors *hv1.HypervisorList, vms []VM) { diff --git a/internal/scheduling/reservations/failover/vm_source_test.go b/internal/scheduling/reservations/failover/vm_source_test.go index 0b30af0e5..a710c5658 100644 --- a/internal/scheduling/reservations/failover/vm_source_test.go +++ b/internal/scheduling/reservations/failover/vm_source_test.go @@ -399,3 +399,7 @@ func (m *mockNovaReader) GetFlavorByName(ctx context.Context, flavorName string) } return nil, nil } + +func (m *mockNovaReader) GetDeletedServerByID(_ context.Context, _ string) (*nova.DeletedServer, error) { + return nil, nil +} diff --git a/internal/scheduling/reservations/quota/config.go b/internal/scheduling/reservations/quota/config.go new file mode 100644 index 000000000..b7314f595 --- /dev/null +++ b/internal/scheduling/reservations/quota/config.go @@ -0,0 +1,44 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// QuotaControllerConfig defines the configuration for the quota controller. +type QuotaControllerConfig struct { + // FullReconcileInterval is the periodic full reconcile interval. + // Full reconcile re-reads all VMs from Postgres and recomputes all usage. Default: 5m. + FullReconcileInterval metav1.Duration `json:"fullReconcileInterval"` + + // CRStateFilter defines which CommittedResource states to include + // when summing cr_actual_usage. Default: ["confirmed", "guaranteed"] + CRStateFilter []v1alpha1.CommitmentStatus `json:"crStateFilter"` +} + +// ApplyDefaults fills in any unset values with defaults. 
+func (c *QuotaControllerConfig) ApplyDefaults() { + defaults := DefaultQuotaControllerConfig() + if c.FullReconcileInterval.Duration == 0 { + c.FullReconcileInterval = defaults.FullReconcileInterval + } + if len(c.CRStateFilter) == 0 { + c.CRStateFilter = defaults.CRStateFilter + } +} + +// DefaultQuotaControllerConfig returns a default configuration. +func DefaultQuotaControllerConfig() QuotaControllerConfig { + return QuotaControllerConfig{ + FullReconcileInterval: metav1.Duration{Duration: 5 * time.Minute}, + CRStateFilter: []v1alpha1.CommitmentStatus{ + v1alpha1.CommitmentStatusConfirmed, + v1alpha1.CommitmentStatusGuaranteed, + }, + } +} diff --git a/internal/scheduling/reservations/quota/context.go b/internal/scheduling/reservations/quota/context.go new file mode 100644 index 000000000..8352a1934 --- /dev/null +++ b/internal/scheduling/reservations/quota/context.go @@ -0,0 +1,27 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + "github.com/go-logr/logr" + "github.com/google/uuid" +) + +// WithNewGlobalRequestID creates a new context with a quota-prefixed global request ID. +func WithNewGlobalRequestID(ctx context.Context) context.Context { + return reservations.WithGlobalRequestID(ctx, "quota-"+uuid.New().String()) +} + +// LoggerFromContext returns a logger with greq and req values from the context. +// This creates a child logger with the request tracking values pre-attached, +// so you don't need to repeat them in every log call. 
+func LoggerFromContext(ctx context.Context) logr.Logger { + return log.WithValues( + "greq", reservations.GlobalRequestIDFromContext(ctx), + "req", reservations.RequestIDFromContext(ctx), + ) +} diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go new file mode 100644 index 000000000..052b2b685 --- /dev/null +++ b/internal/scheduling/reservations/quota/controller.go @@ -0,0 +1,950 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "context" + "fmt" + "time" + + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" + commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" +) + +var log = ctrl.Log.WithName("quota-controller").WithValues("module", "quota") + +// QuotaController manages quota usage tracking for projects. 
+// It provides three reconciliation modes: +// - Periodic full reconcile: recomputes all TotalUsage from Postgres +// - Incremental HV diff: delta-updates TotalUsage on HV instance changes +// - PaygUsage-only recompute: triggered by CR or ProjectQuota spec changes +type QuotaController struct { + client.Client + VMSource failover.VMSource + Config QuotaControllerConfig + Metrics *QuotaMetrics +} + +// NewQuotaController creates a new QuotaController. +func NewQuotaController( + c client.Client, + vmSource failover.VMSource, + config QuotaControllerConfig, + metrics *QuotaMetrics, +) *QuotaController { + + return &QuotaController{ + Client: c, + VMSource: vmSource, + Config: config, + Metrics: metrics, + } +} + +// ============================================================================ +// Periodic Full Reconciliation +// ============================================================================ + +// ReconcilePeriodic performs a full reconcile of all project quota usage. +// It reads all VMs from Postgres, computes TotalUsage per project/AZ/resource, +// then derives PaygUsage = TotalUsage - CRUsage for each existing ProjectQuota CRD. 
+func (c *QuotaController) ReconcilePeriodic(ctx context.Context) error { + ctx = WithNewGlobalRequestID(ctx) + startTime := time.Now() + logger := LoggerFromContext(ctx).WithValues("mode", "full-reconcile") + logger.Info("starting full quota reconcile") + + // Fetch flavor groups from Knowledge CRD + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + logger.Error(err, "failed to get flavor groups") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to get flavor groups: %w", err) + } + + // Build flavorName → flavorGroup lookup + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + // Fetch all VMs using VMSource (reads from Postgres via DBVMSource) + vms, err := c.VMSource.ListVMs(ctx) + if err != nil { + logger.Error(err, "failed to list VMs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list VMs: %w", err) + } + + // Compute totalUsage per project/AZ/resource + totalUsageByProject := c.computeTotalUsage(vms, flavorToGroup, flavorGroups) + + // List all existing ProjectQuota CRDs + var pqList v1alpha1.ProjectQuotaList + if err := c.List(ctx, &pqList); err != nil { + logger.Error(err, "failed to list ProjectQuota CRDs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list ProjectQuota CRDs: %w", err) + } + + // List all CommittedResource CRDs and pre-group by project ID + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CommittedResource CRDs") + c.Metrics.RecordReconcileResult(false) + return fmt.Errorf("failed to list CommittedResource CRDs: %w", err) + } + crsByProject := groupCRsByProject(crList.Items) + + // For each ProjectQuota CRD, write TotalUsage + PaygUsage + var updated, skipped int + for i := range pqList.Items { + pq := &pqList.Items[i] + projectID := pq.Spec.ProjectID + + // Get 
totalUsage for this project (may be empty if project has no VMs) + projectTotalUsage := totalUsageByProject[projectID] + + // Compute CRUsage for this project (using pre-grouped CRs) + crUsage := c.computeCRUsage(crsByProject[projectID]) + + // Derive PaygUsage = TotalUsage - CRUsage (clamp >= 0) + paygUsage := derivePaygUsage(projectTotalUsage, crUsage) + + // Write status with conflict retry + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, projectTotalUsage, paygUsage); err != nil { + logger.Error(err, "failed to update ProjectQuota status", "project", projectID) + skipped++ + continue + } + + // Record metrics + c.recordUsageMetrics(projectID, projectTotalUsage, paygUsage, crUsage) + updated++ + } + + duration := time.Since(startTime) + c.Metrics.RecordReconcileDuration(duration.Seconds()) + c.Metrics.RecordReconcileResult(true) + logger.Info("full quota reconcile completed", + "duration", duration.Round(time.Millisecond), + "totalVMs", len(vms), + "projectQuotas", len(pqList.Items), + "updated", updated, + "skipped", skipped) + + return nil +} + +// ============================================================================ +// Watch-based Reconciliation (PaygUsage-only recompute) +// ============================================================================ + +// Reconcile handles watch-based reconciliation for a single ProjectQuota. +// Triggered by: CR Status.UsedAmount changes or ProjectQuota spec changes. +// It reads the persisted TotalUsage, re-lists CRs, and recomputes PaygUsage. 
+func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + ctx = WithNewGlobalRequestID(ctx) + logger := LoggerFromContext(ctx).WithValues("projectQuota", req.Name, "mode", "payg-recompute") + logger.V(1).Info("reconciling ProjectQuota (PaygUsage recompute)") + + // Fetch the ProjectQuota + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, req.NamespacedName, &pq); err != nil { + if client.IgnoreNotFound(err) == nil { + logger.V(1).Info("ProjectQuota not found, likely deleted") + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + projectID := pq.Spec.ProjectID + ctx = reservations.WithRequestID(ctx, projectID) + + // Read persisted TotalUsage (already computed by full reconcile or incremental) + totalUsage := pq.Status.TotalUsage + if totalUsage == nil { + // No TotalUsage yet — full reconcile hasn't run. Skip. + logger.V(1).Info("no TotalUsage persisted yet, skipping PaygUsage recompute") + return ctrl.Result{}, nil + } + + // List CRs for this project (from local cache) + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CommittedResource CRDs") + return ctrl.Result{}, err + } + crsByProject := groupCRsByProject(crList.Items) + + // Compute CRUsage + crUsage := c.computeCRUsage(crsByProject[projectID]) + + // Derive PaygUsage + paygUsage := derivePaygUsage(totalUsage, crUsage) + + // Write updated PaygUsage with conflict retry (keep TotalUsage unchanged) + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage); err != nil { + logger.Error(err, "failed to update ProjectQuota status") + return ctrl.Result{}, err + } + + // Record metrics + c.recordUsageMetrics(projectID, totalUsage, paygUsage, crUsage) + + logger.V(1).Info("PaygUsage recomputed", "project", projectID) + return ctrl.Result{}, nil +} + +// ============================================================================ +// Incremental Update (HV 
Instance Diff) +// ============================================================================ + +// usageDelta tracks resource deltas for a single project during incremental reconciliation. +type usageDelta struct { + // increments[resourceName][az] = amount to add + increments map[string]map[string]int64 + // decrements[resourceName][az] = amount to subtract + decrements map[string]map[string]int64 +} + +func newUsageDelta() *usageDelta { + return &usageDelta{ + increments: make(map[string]map[string]int64), + decrements: make(map[string]map[string]int64), + } +} + +func (d *usageDelta) addIncrement(resourceName, az string, amount int64) { + if d.increments[resourceName] == nil { + d.increments[resourceName] = make(map[string]int64) + } + d.increments[resourceName][az] += amount +} + +func (d *usageDelta) addDecrement(resourceName, az string, amount int64) { + if d.decrements[resourceName] == nil { + d.decrements[resourceName] = make(map[string]int64) + } + d.decrements[resourceName][az] += amount +} + +// ReconcileHVDiff handles incremental updates when HV instance lists change. +// It diffs old vs new instances to delta-update TotalUsage for affected projects. +// Deltas are batched per project and applied in a single status update per project +// to avoid race conditions from multiple updates. 
+func (c *QuotaController) ReconcileHVDiff(ctx context.Context, oldHV, newHV *hv1.Hypervisor) error { + ctx = WithNewGlobalRequestID(ctx) + logger := LoggerFromContext(ctx).WithValues("hypervisor", newHV.Name, "mode", "incremental") + + // Diff old vs new instances + oldInstances := make(map[string]bool) + for _, inst := range oldHV.Status.Instances { + if inst.Active { + oldInstances[inst.ID] = true + } + } + newInstances := make(map[string]bool) + for _, inst := range newHV.Status.Instances { + if inst.Active { + newInstances[inst.ID] = true + } + } + + // Find added and removed UUIDs + var added, removed []string + for id := range newInstances { + if !oldInstances[id] { + added = append(added, id) + } + } + for id := range oldInstances { + if !newInstances[id] { + removed = append(removed, id) + } + } + + if len(added) == 0 && len(removed) == 0 { + return nil + } + + logger.V(1).Info("HV instance diff detected", "added", len(added), "removed", len(removed)) + + // Get flavor groups for mapping + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + logger.Error(err, "failed to get flavor groups for incremental update") + return err + } + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + // Accumulate deltas per project (batched to avoid per-VM persist race) + projectDeltas := make(map[string]*usageDelta) + + // Process added instances + for _, vmUUID := range added { + c.accumulateAddedVM(ctx, vmUUID, flavorToGroup, flavorGroups, projectDeltas) + } + + // Process removed instances + for _, vmUUID := range removed { + c.accumulateRemovedVM(ctx, vmUUID, flavorToGroup, flavorGroups, projectDeltas) + } + + // Apply batched deltas and recompute PaygUsage for affected projects + var crList v1alpha1.CommittedResourceList + if err := c.List(ctx, &crList); err != nil { + logger.Error(err, "failed to list CRs for PaygUsage recompute") + return err + } 
+ crsByProject := groupCRsByProject(crList.Items) + + for projectID, delta := range projectDeltas { + if err := c.applyDeltaAndUpdateStatus(ctx, projectID, delta, crsByProject[projectID]); err != nil { + logger.Error(err, "failed to apply delta for project", "project", projectID) + // Continue with other projects + } + } + + return nil +} + +// accumulateAddedVM looks up a VM and accumulates its resource contribution as a delta. +// It checks whether the VM is truly new (created after last full reconcile) vs a migration +// (already counted in TotalUsage). Only new VMs get incremented. +func (c *QuotaController) accumulateAddedVM( + ctx context.Context, + vmUUID string, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, + projectDeltas map[string]*usageDelta, +) { + + logger := LoggerFromContext(ctx).WithValues("vmUUID", vmUUID) + + vm, err := c.VMSource.GetVM(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to get VM for increment") + return + } + if vm == nil { + return // VM not found in DB, skip + } + + // Check if this VM was already counted in the last full reconcile. + // If the VM was created BEFORE the last full reconcile, it's a migration + // (already in TotalUsage) and we should NOT increment again. 
+ if !c.isVMNewSinceLastReconcile(ctx, vm) { + logger.V(1).Info("VM already counted (created before last reconcile), skipping increment", + "vmCreatedAt", vm.CreatedAt, "project", vm.ProjectID) + return + } + + groupName, ok := flavorToGroup[vm.FlavorName] + if !ok { + return // Flavor not in any group + } + fg, ok := flavorGroups[groupName] + if !ok { + return + } + + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // MemoryMB is always within int64 range + if unitSizeMiB == 0 { + return + } + + ramUnits, coresAmount := vmResourceUnits(vm.Resources, unitSizeMiB) + + delta := projectDeltas[vm.ProjectID] + if delta == nil { + delta = newUsageDelta() + projectDeltas[vm.ProjectID] = delta + } + + delta.addIncrement(commitments.ResourceNameRAM(groupName), vm.AvailabilityZone, ramUnits) + delta.addIncrement(commitments.ResourceNameCores(groupName), vm.AvailabilityZone, coresAmount) +} + +// isVMNewSinceLastReconcile checks if a VM was created after the last full reconcile. +// Returns true if the VM is new and should be incrementally added to TotalUsage. +// Returns false if the VM already existed at the last full reconcile (migration, not new). +// +// NOTE: There is a known timing gap -- the postgres servers table is only refreshed every +// N minutes by the datasource poller. A VM that was created shortly BEFORE the last reconcile +// might not have been visible in postgres yet (sync delay), so the full reconcile may have +// missed it. In that case we would also skip the increment here (CreatedAt <= LastReconcileAt) +// and the VM would only be counted on the NEXT full reconcile cycle. This is acceptable for +// now and will be resolved when we move to a CRD-based VM source with real-time events. +func (c *QuotaController) isVMNewSinceLastReconcile(ctx context.Context, vm *failover.VM) bool { + if vm.CreatedAt == "" { + // No creation time available -- be conservative, skip increment. + // The next full reconcile will pick it up. 
+ return false + } + + // Look up the ProjectQuota for this VM's project + crdName := "quota-" + vm.ProjectID + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { + // If we can't find the ProjectQuota, skip (full reconcile will handle it) + return false + } + + if pq.Status.LastReconcileAt == nil { + // No full reconcile has run yet -- skip incremental updates + return false + } + + // Parse the VM's creation time and compare with last reconcile + vmCreatedAt, err := time.Parse("2006-01-02T15:04:05Z", vm.CreatedAt) + if err != nil { + // Try alternative format with timezone offset + vmCreatedAt, err = time.Parse(time.RFC3339, vm.CreatedAt) + if err != nil { + // Cannot parse -- be conservative, skip + return false + } + } + + return vmCreatedAt.After(pq.Status.LastReconcileAt.Time) +} + +// accumulateRemovedVM looks up a deleted VM and accumulates its resource contribution as a decrement. +func (c *QuotaController) accumulateRemovedVM( + ctx context.Context, + vmUUID string, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, + projectDeltas map[string]*usageDelta, +) { + + logger := LoggerFromContext(ctx).WithValues("vmUUID", vmUUID) + + // Check if the VM still exists in the servers table (migrated away = still running) + active, err := c.VMSource.IsServerActive(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to check server for decrement") + return + } + if active { + // VM still exists (either ACTIVE on another HV, or in non-ACTIVE state). + // Don't decrement — the full reconcile handles these correctly. 
+ return + } + + // Not found in servers table — check deleted_servers + info, err := c.VMSource.GetDeletedVMInfo(ctx, vmUUID) + if err != nil { + logger.Error(err, "failed to get deleted VM info for decrement") + return + } + if info == nil { + // Not found anywhere — cannot determine what to decrement + logger.V(1).Info("removed VM not found in servers or deleted_servers") + return + } + + groupName, ok := flavorToGroup[info.FlavorName] + if !ok { + return // Flavor not in any group + } + fg, ok := flavorGroups[groupName] + if !ok { + return + } + + // Compute commitment units from the resolved flavor resources + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // MemoryMB is always within int64 range + if unitSizeMiB == 0 { + return + } + + ramUnits := int64(info.RAMMiB) / unitSizeMiB //nolint:gosec // safe + coresAmount := int64(info.VCPUs) //nolint:gosec // safe + + delta := projectDeltas[info.ProjectID] + if delta == nil { + delta = newUsageDelta() + projectDeltas[info.ProjectID] = delta + } + + delta.addDecrement(commitments.ResourceNameRAM(groupName), info.AvailabilityZone, ramUnits) + delta.addDecrement(commitments.ResourceNameCores(groupName), info.AvailabilityZone, coresAmount) +} + +// applyDeltaAndUpdateStatus fetches the ProjectQuota, applies the batched delta to TotalUsage, +// recomputes PaygUsage, and persists with conflict retry. 
+func (c *QuotaController) applyDeltaAndUpdateStatus( + ctx context.Context, + projectID string, + delta *usageDelta, + projectCRs []v1alpha1.CommittedResource, +) error { + + crdName := "quota-" + projectID + + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Re-fetch fresh copy on each retry + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: crdName}, &pq); err != nil { + if client.IgnoreNotFound(err) == nil { + return nil // PQ deleted, nothing to do + } + return err + } + + if pq.Status.TotalUsage == nil { + pq.Status.TotalUsage = make(map[string]v1alpha1.ResourceQuotaUsage) + } + + // Apply increments + for resourceName, azAmounts := range delta.increments { + for az, amount := range azAmounts { + incrementUsage(pq.Status.TotalUsage, resourceName, az, amount) + } + } + + // Apply decrements + for resourceName, azAmounts := range delta.decrements { + for az, amount := range azAmounts { + decrementUsage(pq.Status.TotalUsage, resourceName, az, amount) + } + } + + // Recompute PaygUsage + crUsage := c.computeCRUsage(projectCRs) + paygUsage := derivePaygUsage(pq.Status.TotalUsage, crUsage) + + pq.Status.PaygUsage = paygUsage + now := metav1.Now() + pq.Status.LastReconcileAt = &now + + if err := c.Status().Update(ctx, &pq); err != nil { + return err + } + + c.recordUsageMetrics(projectID, pq.Status.TotalUsage, paygUsage, crUsage) + return nil + }) +} + +// ============================================================================ +// Manager Setup +// ============================================================================ + +// SetupWithManager sets up the watch-based reconciler for PaygUsage recomputes. +func (c *QuotaController) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + Named("quota-controller"). + // Watch ProjectQuota for spec changes (Limes pushes quota) + For(&v1alpha1.ProjectQuota{}). 
+ // Watch CommittedResource for status changes (UsedAmount updates) + Watches( + &v1alpha1.CommittedResource{}, + handler.EnqueueRequestsFromMapFunc(c.mapCRToProjectQuota), + builder.WithPredicates(crUsedAmountChangePredicate()), + ). + WithOptions(controller.Options{ + MaxConcurrentReconciles: 1, + }). + Complete(c) +} + +// SetupHVWatcher sets up a separate controller to watch HV CRD changes +// for incremental TotalUsage updates. +func (c *QuotaController) SetupHVWatcher(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + Named("quota-hv-watcher"). + WatchesRawSource(source.Kind( + mgr.GetCache(), + &hv1.Hypervisor{}, + &hvInstanceDiffHandler{controller: c}, + hvInstanceChangePredicate(), + )). + WithOptions(controller.Options{ + MaxConcurrentReconciles: 1, + }). + Complete(reconcile.Func(func(_ context.Context, _ ctrl.Request) (ctrl.Result, error) { + // The actual work is done in the event handler + return ctrl.Result{}, nil + })) +} + +// Start implements manager.Runnable for the periodic reconciliation loop. +// It does not block manager startup — the first reconcile fires after a short +// initial delay to allow cache sync. 
+func (c *QuotaController) Start(ctx context.Context) error { + log.Info("starting quota controller (periodic)", + "fullReconcileInterval", c.Config.FullReconcileInterval.Duration, + "crStateFilter", c.Config.CRStateFilter) + + // Use a short initial delay to allow cache sync before first reconcile + initialDelay := 5 * time.Second + timer := time.NewTimer(initialDelay) + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + log.Info("stopping quota controller") + return nil + case <-timer.C: + if err := c.ReconcilePeriodic(ctx); err != nil { + log.Error(err, "periodic full reconcile failed") + } + timer.Reset(c.Config.FullReconcileInterval.Duration) + } + } +} + +// ============================================================================ +// Internal Helpers +// ============================================================================ + +// computeTotalUsage aggregates VM resources by project/AZ/resource. +// +// The RAM calculation converts server RAM into LIQUID commitment units: +// - Each flavor group has a "smallest flavor" defining the unit size (e.g., 32768 MiB) +// - A VM's RAM usage in units = VM_RAM_MiB / unit_size_MiB +// - Example: a 64 GiB VM in a group with 32 GiB smallest flavor = 2 units +// +// This matches the unit system used by LIQUID for commitment tracking. +// The per-AZ breakdown allows Limes to enforce AZ-level quota limits. 
+func (c *QuotaController) computeTotalUsage( + vms []failover.VM, + flavorToGroup map[string]string, + flavorGroups map[string]compute.FlavorGroupFeature, +) map[string]map[string]v1alpha1.ResourceQuotaUsage { + // result[projectID][resourceName] = ResourceQuotaUsage{PerAZ: {az: amount}} + result := make(map[string]map[string]v1alpha1.ResourceQuotaUsage) + + for _, vm := range vms { + groupName, ok := flavorToGroup[vm.FlavorName] + if !ok { + continue // Flavor not in any tracked group + } + fg, ok := flavorGroups[groupName] + if !ok { + continue + } + if fg.SmallestFlavor.MemoryMB == 0 { + continue // Invalid group config + } + + ramResourceName := commitments.ResourceNameRAM(groupName) + coresResourceName := commitments.ResourceNameCores(groupName) + + unitSizeMiB := int64(fg.SmallestFlavor.MemoryMB) //nolint:gosec // safe + ramUnits, coresAmount := vmResourceUnits(vm.Resources, unitSizeMiB) + + if _, ok := result[vm.ProjectID]; !ok { + result[vm.ProjectID] = make(map[string]v1alpha1.ResourceQuotaUsage) + } + + // Accumulate RAM usage for this project + AZ + ramUsage := result[vm.ProjectID][ramResourceName] + if ramUsage.PerAZ == nil { + ramUsage.PerAZ = make(map[string]int64) + } + ramUsage.PerAZ[vm.AvailabilityZone] += ramUnits + result[vm.ProjectID][ramResourceName] = ramUsage + + // Accumulate cores usage for this project + AZ + coresUsage := result[vm.ProjectID][coresResourceName] + if coresUsage.PerAZ == nil { + coresUsage.PerAZ = make(map[string]int64) + } + coresUsage.PerAZ[vm.AvailabilityZone] += coresAmount + result[vm.ProjectID][coresResourceName] = coresUsage + } + + return result +} + +// groupCRsByProject groups CommittedResources by project ID for efficient lookup. 
+func groupCRsByProject(crs []v1alpha1.CommittedResource) map[string][]v1alpha1.CommittedResource { + result := make(map[string][]v1alpha1.CommittedResource) + for i := range crs { + projectID := crs[i].Spec.ProjectID + result[projectID] = append(result[projectID], crs[i]) + } + return result +} + +// computeCRUsage computes the committed resource usage from a pre-filtered slice of CRs for one project. +func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource) map[string]v1alpha1.ResourceQuotaUsage { + result := make(map[string]v1alpha1.ResourceQuotaUsage) + + for i := range crs { + cr := &crs[i] + + // Filter: only matching states + if !c.isCRStateIncluded(cr.Spec.State) { + continue + } + + // Get UsedAmount from status + if cr.Status.UsedAmount == nil { + continue + } + usedAmount := cr.Status.UsedAmount.Value() + if usedAmount <= 0 { + continue + } + + // Map ResourceType to resource name + var resourceName string + switch cr.Spec.ResourceType { + case v1alpha1.CommittedResourceTypeMemory: + resourceName = commitments.ResourceNameRAM(cr.Spec.FlavorGroupName) + case v1alpha1.CommittedResourceTypeCores: + resourceName = commitments.ResourceNameCores(cr.Spec.FlavorGroupName) + default: + continue + } + + // Accumulate per AZ + usage := result[resourceName] + if usage.PerAZ == nil { + usage.PerAZ = make(map[string]int64) + } + usage.PerAZ[cr.Spec.AvailabilityZone] += usedAmount + result[resourceName] = usage + } + + return result +} + +// isCRStateIncluded checks if a commitment state is in the configured filter. +func (c *QuotaController) isCRStateIncluded(state v1alpha1.CommitmentStatus) bool { + for _, s := range c.Config.CRStateFilter { + if s == state { + return true + } + } + return false +} + +// derivePaygUsage computes PaygUsage = TotalUsage - CRUsage (clamped >= 0). 
+func derivePaygUsage( + totalUsage map[string]v1alpha1.ResourceQuotaUsage, + crUsage map[string]v1alpha1.ResourceQuotaUsage, +) map[string]v1alpha1.ResourceQuotaUsage { + + result := make(map[string]v1alpha1.ResourceQuotaUsage) + + for resourceName, total := range totalUsage { + payg := v1alpha1.ResourceQuotaUsage{ + PerAZ: make(map[string]int64), + } + for az, totalAmount := range total.PerAZ { + crAmount := int64(0) + if cr, ok := crUsage[resourceName]; ok { + if azAmount, ok := cr.PerAZ[az]; ok { + crAmount = azAmount + } + } + paygAmount := totalAmount - crAmount + if paygAmount < 0 { + paygAmount = 0 // Clamp >= 0 + } + payg.PerAZ[az] = paygAmount + } + result[resourceName] = payg + } + + return result +} + +// updateProjectQuotaStatusWithRetry writes TotalUsage + PaygUsage + LastReconcileAt +// with retry-on-conflict to handle concurrent updates. +func (c *QuotaController) updateProjectQuotaStatusWithRetry( + ctx context.Context, + pqName string, + totalUsage map[string]v1alpha1.ResourceQuotaUsage, + paygUsage map[string]v1alpha1.ResourceQuotaUsage, +) error { + + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + // Re-fetch fresh copy on each retry + var pq v1alpha1.ProjectQuota + if err := c.Get(ctx, client.ObjectKey{Name: pqName}, &pq); err != nil { + return err + } + + pq.Status.TotalUsage = totalUsage + pq.Status.PaygUsage = paygUsage + now := metav1.Now() + pq.Status.LastReconcileAt = &now + + return c.Status().Update(ctx, &pq) + }) +} + +// vmResourceUnits computes RAM commitment units and cores from a VM's resources. +// RAM is converted from bytes (resource.Quantity) to MiB, then divided by unitSizeMiB +// (the smallest flavor's memory in MiB for the flavor group) to get commitment units. 
+func vmResourceUnits(resources map[string]resource.Quantity, unitSizeMiB int64) (ramUnits, cores int64) { + memQty := resources["memory"] + serverRAMMiB := memQty.Value() / (1024 * 1024) // bytes to MiB + ramUnits = serverRAMMiB / unitSizeMiB // commitment units + vcpuQty := resources["vcpus"] + cores = vcpuQty.Value() + return ramUnits, cores +} + +// buildFlavorToGroupMap builds a flavorName → flavorGroupName lookup from flavor groups. +func buildFlavorToGroupMap(flavorGroups map[string]compute.FlavorGroupFeature) map[string]string { + result := make(map[string]string) + for groupName, group := range flavorGroups { + for _, flavor := range group.Flavors { + result[flavor.Name] = groupName + } + } + return result +} + +// incrementUsage increments a usage value in the map. +func incrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName, az string, amount int64) { + u := usage[resourceName] + if u.PerAZ == nil { + u.PerAZ = make(map[string]int64) + } + u.PerAZ[az] += amount + usage[resourceName] = u +} + +// decrementUsage decrements a usage value in the map (clamp >= 0). +func decrementUsage(usage map[string]v1alpha1.ResourceQuotaUsage, resourceName, az string, amount int64) { + u := usage[resourceName] + if u.PerAZ == nil { + return + } + u.PerAZ[az] -= amount + if u.PerAZ[az] < 0 { + u.PerAZ[az] = 0 + } + usage[resourceName] = u +} + +// recordUsageMetrics emits Prometheus metrics for all resources in a project. 
+func (c *QuotaController) recordUsageMetrics(
+	projectID string,
+	totalUsage map[string]v1alpha1.ResourceQuotaUsage,
+	paygUsage map[string]v1alpha1.ResourceQuotaUsage,
+	crUsage map[string]v1alpha1.ResourceQuotaUsage,
+) {
+
+	// Iteration is driven by totalUsage: resources/AZs present only in
+	// paygUsage or crUsage are not emitted. Reading a missing AZ from a
+	// (possibly nil) PerAZ map yields 0, so absent breakdowns report zero.
+	for resourceName, total := range totalUsage {
+		for az, totalAmount := range total.PerAZ {
+			paygAmount := int64(0)
+			if payg, ok := paygUsage[resourceName]; ok {
+				paygAmount = payg.PerAZ[az]
+			}
+			crAmount := int64(0)
+			if cr, ok := crUsage[resourceName]; ok {
+				crAmount = cr.PerAZ[az]
+			}
+			c.Metrics.RecordUsage(projectID, az, resourceName, totalAmount, paygAmount, crAmount)
+		}
+	}
+}
+
+// ============================================================================
+// Predicates & Event Handlers
+// ============================================================================
+
+// mapCRToProjectQuota maps a CommittedResource change to the affected ProjectQuota reconcile request.
+// Objects of any other type are ignored (nil request list).
+func (c *QuotaController) mapCRToProjectQuota(_ context.Context, obj client.Object) []reconcile.Request {
+	cr, ok := obj.(*v1alpha1.CommittedResource)
+	if !ok {
+		return nil
+	}
+	// Map to the ProjectQuota for this project.
+	// Naming convention "quota-<projectID>" must match how the ProjectQuota
+	// objects are created elsewhere (the tests use makePQ with the same scheme).
+	crdName := "quota-" + cr.Spec.ProjectID
+	return []reconcile.Request{
+		{NamespacedName: client.ObjectKey{Name: crdName}},
+	}
+}
+
+// crUsedAmountChangePredicate triggers only when Status.UsedAmount changes on a CommittedResource.
+func crUsedAmountChangePredicate() predicate.Predicate {
+	return predicate.Funcs{
+		CreateFunc: func(_ event.CreateEvent) bool { return false },
+		UpdateFunc: func(e event.UpdateEvent) bool {
+			oldCR, ok1 := e.ObjectOld.(*v1alpha1.CommittedResource)
+			newCR, ok2 := e.ObjectNew.(*v1alpha1.CommittedResource)
+			if !ok1 || !ok2 {
+				return false
+			}
+			// Trigger if UsedAmount changed. A nil UsedAmount compares as the
+			// empty string, so nil→set and set→nil transitions also trigger.
+			oldUsed := ""
+			newUsed := ""
+			if oldCR.Status.UsedAmount != nil {
+				oldUsed = oldCR.Status.UsedAmount.String()
+			}
+			if newCR.Status.UsedAmount != nil {
+				newUsed = newCR.Status.UsedAmount.String()
+			}
+			return oldUsed != newUsed
+		},
+		// Deletions always trigger so usage derived from the removed CR can be
+		// recomputed; generic events are ignored.
+		DeleteFunc:  func(_ event.DeleteEvent) bool { return true },
+		GenericFunc: func(_ event.GenericEvent) bool { return false },
+	}
+}
+
+// hvInstanceChangePredicate always returns true for updates.
+// ReconcileHVDiff performs its own set-diff and exits early if there are no
+// actual additions/removals. This ensures instance swaps (same count, different IDs)
+// are not missed.
+func hvInstanceChangePredicate() predicate.TypedPredicate[*hv1.Hypervisor] {
+	return predicate.TypedFuncs[*hv1.Hypervisor]{
+		CreateFunc: func(_ event.TypedCreateEvent[*hv1.Hypervisor]) bool { return true },
+		UpdateFunc: func(_ event.TypedUpdateEvent[*hv1.Hypervisor]) bool {
+			return true
+		},
+		DeleteFunc:  func(_ event.TypedDeleteEvent[*hv1.Hypervisor]) bool { return true },
+		GenericFunc: func(_ event.TypedGenericEvent[*hv1.Hypervisor]) bool { return false },
+	}
+}
+
+// hvInstanceDiffHandler handles HV instance diff events by calling ReconcileHVDiff.
+type hvInstanceDiffHandler struct {
+	controller *QuotaController
+}
+
+func (h *hvInstanceDiffHandler) Create(_ context.Context, _ event.TypedCreateEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) {
+	// On create, no diff needed (full reconcile will catch up)
+}
+
+func (h *hvInstanceDiffHandler) Update(ctx context.Context, e event.TypedUpdateEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) {
+	// Errors are logged, not retried: the handler bypasses the workqueue, so a
+	// failed diff is only corrected by the next event or a full reconcile.
+	if err := h.controller.ReconcileHVDiff(ctx, e.ObjectOld, e.ObjectNew); err != nil {
+		log.Error(err, "failed to process HV instance diff", "hypervisor", e.ObjectNew.Name)
+	}
+}
+
+func (h *hvInstanceDiffHandler) Delete(_ context.Context, _ event.TypedDeleteEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) {
+	// On delete, full reconcile will correct
+}
+
+func (h *hvInstanceDiffHandler) Generic(_ context.Context, _ event.TypedGenericEvent[*hv1.Hypervisor], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) {
+	// No-op
+}
diff --git a/internal/scheduling/reservations/quota/controller_test.go b/internal/scheduling/reservations/quota/controller_test.go
new file mode 100644
index 000000000..4005af326
--- /dev/null
+++ b/internal/scheduling/reservations/quota/controller_test.go
@@ -0,0 +1,598 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package quota
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+	"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover"
+	hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+// TestComputeTotalUsage verifies per-project, per-AZ aggregation of VM RAM
+// units and cores, and that VMs with unknown flavors are skipped entirely.
+func TestComputeTotalUsage(t *testing.T) {
+	ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()}
+
+	flavorGroups := map[string]compute.FlavorGroupFeature{
+		"hana_v2": {
+			SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768},
+			Flavors: []compute.FlavorInGroup{
+				{Name: "m1.hana_v2.small", MemoryMB: 32768},
+				{Name: "m1.hana_v2.large", MemoryMB: 65536},
+			},
+		},
+		"general": {
+			SmallestFlavor: compute.FlavorInGroup{MemoryMB: 4096},
+			Flavors: []compute.FlavorInGroup{
+				{Name: "m1.general.small", MemoryMB: 4096},
+			},
+		},
+	}
+	flavorToGroup := buildFlavorToGroupMap(flavorGroups)
+
+	vms := []failover.VM{
+		{
+			UUID:             "vm-1",
+			FlavorName:       "m1.hana_v2.small",
+			ProjectID:        "project-a",
+			AvailabilityZone: "az-1",
+			Resources: map[string]resource.Quantity{
+				"memory": resource.MustParse("34359738368"), // 32768 MiB in bytes
+				"vcpus":  resource.MustParse("8"),
+			},
+		},
+		{
+			UUID:             "vm-2",
+			FlavorName:       "m1.hana_v2.large",
+			ProjectID:        "project-a",
+			AvailabilityZone: "az-1",
+			Resources: map[string]resource.Quantity{
+				"memory": resource.MustParse("68719476736"), // 65536 MiB in bytes
+				"vcpus":  resource.MustParse("16"),
+			},
+		},
+		{
+			UUID:             "vm-3",
+			FlavorName:       "m1.hana_v2.small",
+			ProjectID:        "project-a",
+			AvailabilityZone: "az-2",
+			Resources: map[string]resource.Quantity{
+				"memory": resource.MustParse("34359738368"),
+				"vcpus":  resource.MustParse("8"),
+			},
+		},
+		{
+			UUID:             "vm-4",
+			FlavorName:       "m1.general.small",
+			ProjectID:        "project-b",
+			AvailabilityZone: "az-1",
+			Resources: map[string]resource.Quantity{
+				"memory": resource.MustParse("4294967296"), // 4096 MiB in bytes
+				"vcpus":  resource.MustParse("2"),
+			},
+		},
+		{
+			UUID:             "vm-5",
+			FlavorName:       "unknown-flavor",
+			ProjectID:        "project-c",
+			AvailabilityZone: "az-1",
+			Resources: map[string]resource.Quantity{
+				"memory": resource.MustParse("4294967296"),
+				"vcpus":  resource.MustParse("2"),
+			},
+		},
+	}
+
+	result := ctrl.computeTotalUsage(vms, flavorToGroup, flavorGroups)
+
+	// project-a: hana_v2 in az-1: 32768+65536 = 98304 MiB / 32768 = 3 units RAM, 8+16=24 cores
+	// project-a: hana_v2 in az-2: 32768 MiB / 32768 = 1 unit RAM, 8 cores
+	projectA := result["project-a"]
+	if projectA == nil {
+		t.Fatal("expected project-a in results")
+	}
+
+	ramUsage := projectA["hw_version_hana_v2_ram"]
+	if ramUsage.PerAZ["az-1"] != 3 {
+		t.Errorf("expected project-a az-1 hana_v2_ram = 3, got %d", ramUsage.PerAZ["az-1"])
+	}
+	if ramUsage.PerAZ["az-2"] != 1 {
+		t.Errorf("expected project-a az-2 hana_v2_ram = 1, got %d", ramUsage.PerAZ["az-2"])
+	}
+
+	coresUsage := projectA["hw_version_hana_v2_cores"]
+	if coresUsage.PerAZ["az-1"] != 24 {
+		t.Errorf("expected project-a az-1 hana_v2_cores = 24, got %d", coresUsage.PerAZ["az-1"])
+	}
+	if coresUsage.PerAZ["az-2"] != 8 {
+		t.Errorf("expected project-a az-2 hana_v2_cores = 8, got %d", coresUsage.PerAZ["az-2"])
+	}
+
+	// project-b: general in az-1: 4096/4096=1 unit RAM, 2 cores
+	projectB := result["project-b"]
+	if projectB == nil {
+		t.Fatal("expected project-b in results")
+	}
+	if projectB["hw_version_general_ram"].PerAZ["az-1"] != 1 {
+		t.Errorf("expected project-b az-1 general_ram = 1, got %d", projectB["hw_version_general_ram"].PerAZ["az-1"])
+	}
+	if projectB["hw_version_general_cores"].PerAZ["az-1"] != 2 {
+		t.Errorf("expected project-b az-1 general_cores = 2, got %d", projectB["hw_version_general_cores"].PerAZ["az-1"])
+	}
+
+	// project-c: unknown flavor → not in results
+	if _, exists := result["project-c"]; exists {
+		t.Error("expected project-c to NOT be in results (unknown flavor)")
+	}
+}
+
+// TestComputeCRUsage verifies that committed-resource usage sums only
+// confirmed/guaranteed CRs of the pre-grouped project, excluding pending ones.
+func TestComputeCRUsage(t *testing.T) {
+	ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()}
+
+	usedAmount5 := resource.MustParse("5")
+	usedAmount3 := resource.MustParse("3")
+	usedAmount2 := resource.MustParse("2")
+
+	allCRs := []v1alpha1.CommittedResource{
+		{
+			Spec: v1alpha1.CommittedResourceSpec{
+				ProjectID:        "project-a",
+				FlavorGroupName:  "hana_v2",
+				AvailabilityZone: "az-1",
+				ResourceType:     v1alpha1.CommittedResourceTypeMemory,
+				State:            v1alpha1.CommitmentStatusConfirmed,
+			},
+			Status: v1alpha1.CommittedResourceStatus{
+				UsedAmount: &usedAmount5,
+			},
+		},
+		{
+			Spec: v1alpha1.CommittedResourceSpec{
+				ProjectID:        "project-a",
+				FlavorGroupName:  "hana_v2",
+				AvailabilityZone: "az-1",
+				ResourceType:     v1alpha1.CommittedResourceTypeMemory,
+				State:            v1alpha1.CommitmentStatusGuaranteed,
+			},
+			Status: v1alpha1.CommittedResourceStatus{
+				UsedAmount: &usedAmount3,
+			},
+		},
+		{
+			Spec: v1alpha1.CommittedResourceSpec{
+				ProjectID:        "project-a",
+				FlavorGroupName:  "hana_v2",
+				AvailabilityZone: "az-1",
+				ResourceType:     v1alpha1.CommittedResourceTypeCores,
+				State:            v1alpha1.CommitmentStatusConfirmed,
+			},
+			Status: v1alpha1.CommittedResourceStatus{
+				UsedAmount: &usedAmount2,
+			},
+		},
+		// Different project — should be excluded by groupCRsByProject
+		{
+			Spec: v1alpha1.CommittedResourceSpec{
+				ProjectID:        "project-b",
+				FlavorGroupName:  "hana_v2",
+				AvailabilityZone: "az-1",
+				ResourceType:     v1alpha1.CommittedResourceTypeMemory,
+				State:            v1alpha1.CommitmentStatusConfirmed,
+			},
+			Status: v1alpha1.CommittedResourceStatus{
+				UsedAmount: &usedAmount5,
+			},
+		},
+		// Pending state — should be excluded by state filter
+		{
+			Spec: v1alpha1.CommittedResourceSpec{
+				ProjectID:        "project-a",
+				FlavorGroupName:  "hana_v2",
+				AvailabilityZone: "az-2",
+				ResourceType:     v1alpha1.CommittedResourceTypeMemory,
+				State:            v1alpha1.CommitmentStatusPending,
+			},
+			Status: v1alpha1.CommittedResourceStatus{
+				UsedAmount: &usedAmount2,
+			},
+		},
+	}
+
+	// Pre-group and pass only project-a's CRs
+	crsByProject := groupCRsByProject(allCRs)
+	result := ctrl.computeCRUsage(crsByProject["project-a"])
+
+	// Should include confirmed + guaranteed for project-a only
+	ramUsage := result["hw_version_hana_v2_ram"]
+	if ramUsage.PerAZ["az-1"] != 8 { // 5 + 3
+		t.Errorf("expected cr ram usage az-1 = 8, got %d", ramUsage.PerAZ["az-1"])
+	}
+
+	coresUsage := result["hw_version_hana_v2_cores"]
+	if coresUsage.PerAZ["az-1"] != 2 {
+		// NOTE(review): message says az-2 but the assertion checks az-1 —
+		// likely a typo in the message text.
+		t.Errorf("expected cr cores usage az-2 = 2, got %d", coresUsage.PerAZ["az-1"])
+	}
+
+	// az-2 should NOT be included (pending state)
+	if ramUsage.PerAZ["az-2"] != 0 {
+		t.Errorf("expected cr ram usage az-2 = 0 (pending excluded), got %d", ramUsage.PerAZ["az-2"])
+	}
+}
+
+// TestDerivePaygUsage is a table-driven test of total-minus-committed
+// derivation, including the clamp-to-zero and empty-input edge cases.
+func TestDerivePaygUsage(t *testing.T) {
+	tests := []struct {
+		name       string
+		totalUsage map[string]v1alpha1.ResourceQuotaUsage
+		crUsage    map[string]v1alpha1.ResourceQuotaUsage
+		expected   map[string]map[string]int64 // resourceName -> az -> amount
+	}{
+		{
+			name: "basic subtraction",
+			totalUsage: map[string]v1alpha1.ResourceQuotaUsage{
+				"hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 10, "az-2": 5}},
+			},
+			crUsage: map[string]v1alpha1.ResourceQuotaUsage{
+				"hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3}},
+			},
+			expected: map[string]map[string]int64{
+				"hw_version_hana_v2_ram": {"az-1": 7, "az-2": 5},
+			},
+		},
+		{
+			name: "clamp to zero",
+			totalUsage: map[string]v1alpha1.ResourceQuotaUsage{
+				"hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2}},
+			},
+			crUsage: map[string]v1alpha1.ResourceQuotaUsage{
+				"hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 10}},
+			},
+			expected: map[string]map[string]int64{
+				"hw_version_hana_v2_ram": {"az-1": 0},
+			},
+		},
+		{
+			name: "no CR usage",
+			totalUsage: map[string]v1alpha1.ResourceQuotaUsage{
+				"hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 5}},
+			},
+			crUsage: map[string]v1alpha1.ResourceQuotaUsage{},
+			expected: map[string]map[string]int64{
+				"hw_version_hana_v2_ram": {"az-1": 5},
+			},
+		},
+		{
+			name:       "empty total usage",
+			totalUsage: map[string]v1alpha1.ResourceQuotaUsage{},
+			crUsage: map[string]v1alpha1.ResourceQuotaUsage{
+				"hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 5}},
+			},
+			expected: map[string]map[string]int64{},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := derivePaygUsage(tt.totalUsage, tt.crUsage)
+
+			for resourceName, expectedAZ := range tt.expected {
+				resUsage, ok := result[resourceName]
+				if !ok {
+					t.Errorf("expected resource %s in result", resourceName)
+					continue
+				}
+				for az, expectedAmount := range expectedAZ {
+					if resUsage.PerAZ[az] != expectedAmount {
+						t.Errorf("resource=%s az=%s: expected %d, got %d",
+							resourceName, az, expectedAmount, resUsage.PerAZ[az])
+					}
+				}
+			}
+
+			// Check no extra resources in result
+			for resourceName := range result {
+				if _, ok := tt.expected[resourceName]; !ok {
+					t.Errorf("unexpected resource %s in result", resourceName)
+				}
+			}
+		})
+	}
+}
+
+// TestBuildFlavorToGroupMap verifies the flavor-name → group-name inversion.
+func TestBuildFlavorToGroupMap(t *testing.T) {
+	flavorGroups := map[string]compute.FlavorGroupFeature{
+		"hana_v2": {
+			Flavors: []compute.FlavorInGroup{
+				{Name: "m1.hana_v2.small"},
+				{Name: "m1.hana_v2.large"},
+			},
+		},
+		"general": {
+			Flavors: []compute.FlavorInGroup{
+				{Name: "m1.general.small"},
+			},
+		},
+	}
+
+	result := buildFlavorToGroupMap(flavorGroups)
+
+	if result["m1.hana_v2.small"] != "hana_v2" {
+		t.Errorf("expected hana_v2 for m1.hana_v2.small, got %s", result["m1.hana_v2.small"])
+	}
+	if result["m1.hana_v2.large"] != "hana_v2" {
+		t.Errorf("expected hana_v2 for m1.hana_v2.large, got %s", result["m1.hana_v2.large"])
+	}
+	if result["m1.general.small"] != "general" {
+		t.Errorf("expected general for m1.general.small, got %s", result["m1.general.small"])
+	}
+	if _, exists := result["unknown"]; exists {
+		t.Error("expected unknown flavor not to be in map")
+	}
+}
+
+// TestIncrementDecrementUsage exercises the in-place usage mutators,
+// including the clamp-at-zero and decrement-of-untracked-resource paths.
+func TestIncrementDecrementUsage(t *testing.T) {
+	usage := make(map[string]v1alpha1.ResourceQuotaUsage)
+
+	// Increment from empty
+	incrementUsage(usage, "res1", "az-1", 5)
+	if usage["res1"].PerAZ["az-1"] != 5 {
+		t.Errorf("expected 5 after increment, got %d", usage["res1"].PerAZ["az-1"])
+	}
+
+	// Increment again
+	incrementUsage(usage, "res1", "az-1", 3)
+	if usage["res1"].PerAZ["az-1"] != 8 {
+		t.Errorf("expected 8 after second increment, got %d", usage["res1"].PerAZ["az-1"])
+	}
+
+	// Decrement
+	decrementUsage(usage, "res1", "az-1", 2)
+	if usage["res1"].PerAZ["az-1"] != 6 {
+		t.Errorf("expected 6 after decrement, got %d", usage["res1"].PerAZ["az-1"])
+	}
+
+	// Decrement below zero → clamp to 0
+	decrementUsage(usage, "res1", "az-1", 100)
+	if usage["res1"].PerAZ["az-1"] != 0 {
+		t.Errorf("expected 0 after over-decrement, got %d", usage["res1"].PerAZ["az-1"])
+	}
+
+	// Decrement non-existent resource (no-op)
+	decrementUsage(usage, "res2", "az-1", 5)
+	// Should not panic, and res2 should not exist
+	if _, exists := usage["res2"]; exists {
+		if usage["res2"].PerAZ != nil {
+			t.Error("expected res2 to not have PerAZ after decrement on non-existent")
+		}
+	}
+}
+
+// TestIsCRStateIncluded pins which commitment states count toward CR usage.
+func TestIsCRStateIncluded(t *testing.T) {
+	ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()}
+
+	if !ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusConfirmed) {
+		t.Error("expected confirmed to be included")
+	}
+	if !ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusGuaranteed) {
+		t.Error("expected guaranteed to be included")
+	}
+	if ctrl.isCRStateIncluded(v1alpha1.CommitmentStatusPending) {
+		t.Error("expected pending to NOT be included")
+	}
+}
+
+// TestGroupCRsByProject verifies bucketing of CRs by Spec.ProjectID.
+func TestGroupCRsByProject(t *testing.T) {
+	crs := []v1alpha1.CommittedResource{
+		{Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p1"}},
+		{Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p2"}},
+		{Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p1"}},
+		{Spec: v1alpha1.CommittedResourceSpec{ProjectID: "p3"}},
+	}
+
+	grouped := groupCRsByProject(crs)
+
+	if len(grouped["p1"]) != 2 {
+		t.Errorf("expected 2 CRs for p1, got %d", len(grouped["p1"]))
+	}
+	if len(grouped["p2"]) != 1 {
+		t.Errorf("expected 1 CR for p2, got %d", len(grouped["p2"]))
+	}
+	if len(grouped["p3"]) != 1 {
+		t.Errorf("expected 1 CR for p3, got %d", len(grouped["p3"]))
+	}
+	if len(grouped["nonexistent"]) != 0 {
+		t.Error("expected 0 CRs for nonexistent project")
+	}
+}
+
+// TestUsageDelta verifies that increments and decrements accumulate
+// independently per resource and AZ.
+func TestUsageDelta(t *testing.T) {
+	delta := newUsageDelta()
+
+	delta.addIncrement("res1", "az-1", 5)
+	delta.addIncrement("res1", "az-1", 3)
+	delta.addIncrement("res1", "az-2", 2)
+	delta.addDecrement("res1", "az-1", 1)
+
+	if delta.increments["res1"]["az-1"] != 8 {
+		t.Errorf("expected increment res1/az-1 = 8, got %d", delta.increments["res1"]["az-1"])
+	}
+	if delta.increments["res1"]["az-2"] != 2 {
+		t.Errorf("expected increment res1/az-2 = 2, got %d", delta.increments["res1"]["az-2"])
+	}
+	if delta.decrements["res1"]["az-1"] != 1 {
+		t.Errorf("expected decrement res1/az-1 = 1, got %d", delta.decrements["res1"]["az-1"])
+	}
+}
+
+func TestReconcile_NilTotalUsage(t *testing.T) {
+	// When TotalUsage is nil, Reconcile should skip and return no error.
+	// This validates the early-return branch logic used in Reconcile().
+	ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()}
+
+	// computeCRUsage on nil slice should return empty map (no panic)
+	result := ctrl.computeCRUsage(nil)
+	if len(result) != 0 {
+		t.Errorf("expected empty result for nil CRs, got %d entries", len(result))
+	}
+
+	// derivePaygUsage on nil totalUsage should return empty map
+	payg := derivePaygUsage(nil, result)
+	if len(payg) != 0 {
+		t.Errorf("expected empty payg for nil totalUsage, got %d entries", len(payg))
+	}
+}
+
+func TestAccumulateAddedVM_UnknownFlavor(t *testing.T) {
+	// Verifies that accumulateAddedVM gracefully handles a VM with an unknown flavor
+	ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()}
+
+	flavorGroups := map[string]compute.FlavorGroupFeature{
+		"hana_v2": {
+			SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768},
+			Flavors:        []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 32768}},
+		},
+	}
+	flavorToGroup := buildFlavorToGroupMap(flavorGroups)
+	projectDeltas := make(map[string]*usageDelta)
+
+	// Use a mock VMSource that returns a VM with unknown flavor
+	ctrl.VMSource = &mockVMSource{
+		getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) {
+			return &failover.VM{
+				UUID:             vmUUID,
+				FlavorName:       "unknown-flavor",
+				ProjectID:        "project-a",
+				AvailabilityZone: "az-1",
+				Resources: map[string]resource.Quantity{
+					"memory": resource.MustParse("4294967296"),
+					"vcpus":  resource.MustParse("2"),
+				},
+			}, nil
+		},
+	}
+
+	ctrl.accumulateAddedVM(context.Background(), "vm-1", flavorToGroup, flavorGroups, projectDeltas)
+
+	// Should not have added any delta (unknown flavor)
+	if len(projectDeltas) != 0 {
+		t.Errorf("expected no deltas for unknown flavor, got %d projects", len(projectDeltas))
+	}
+}
+
+func TestAccumulateAddedVM_KnownFlavor(t *testing.T) {
+	// Set up a fake client with a ProjectQuota that has LastReconcileAt in the past.
+	// The VM's CreatedAt must be AFTER LastReconcileAt for it to be considered new.
+	lastReconcile := metav1.NewTime(time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC))
+	vmCreatedAt := "2026-01-02T00:00:00Z" // After lastReconcile
+
+	scheme := runtime.NewScheme()
+	if err := v1alpha1.AddToScheme(scheme); err != nil {
+		t.Fatalf("failed to add scheme: %v", err)
+	}
+
+	pq := &v1alpha1.ProjectQuota{
+		ObjectMeta: metav1.ObjectMeta{Name: "quota-project-a"},
+		Spec:       v1alpha1.ProjectQuotaSpec{ProjectID: "project-a"},
+		Status: v1alpha1.ProjectQuotaStatus{
+			LastReconcileAt: &lastReconcile,
+		},
+	}
+
+	k8sClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(pq).
+		WithStatusSubresource(&v1alpha1.ProjectQuota{}).
+		Build()
+
+	qc := &QuotaController{
+		Client: k8sClient,
+		Config: DefaultQuotaControllerConfig(),
+	}
+
+	flavorGroups := map[string]compute.FlavorGroupFeature{
+		"hana_v2": {
+			SmallestFlavor: compute.FlavorInGroup{MemoryMB: 32768},
+			Flavors:        []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 32768}},
+		},
+	}
+	flavorToGroup := buildFlavorToGroupMap(flavorGroups)
+	projectDeltas := make(map[string]*usageDelta)
+
+	qc.VMSource = &mockVMSource{
+		getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) {
+			return &failover.VM{
+				UUID:             vmUUID,
+				FlavorName:       "m1.hana_v2.small",
+				ProjectID:        "project-a",
+				AvailabilityZone: "az-1",
+				CreatedAt:        vmCreatedAt,
+				Resources: map[string]resource.Quantity{
+					"memory": resource.MustParse("34359738368"), // 32768 MiB
+					"vcpus":  resource.MustParse("8"),
+				},
+			}, nil
+		},
+	}
+
+	qc.accumulateAddedVM(context.Background(), "vm-1", flavorToGroup, flavorGroups, projectDeltas)
+
+	delta, ok := projectDeltas["project-a"]
+	if !ok {
+		t.Fatal("expected delta for project-a")
+	}
+
+	// 32768 MiB / 32768 = 1 unit RAM
+	if delta.increments["hw_version_hana_v2_ram"]["az-1"] != 1 {
+		t.Errorf("expected ram increment = 1, got %d", delta.increments["hw_version_hana_v2_ram"]["az-1"])
+	}
+	if delta.increments["hw_version_hana_v2_cores"]["az-1"] != 8 {
+		t.Errorf("expected cores increment = 8, got %d", delta.increments["hw_version_hana_v2_cores"]["az-1"])
+	}
+}
+
+// mockVMSource is a test helper for VMSource.
+type mockVMSource struct {
+	// Each field overrides the corresponding method; a nil field makes the
+	// method return zero values (nil/false) with no error.
+	listVMs        func(ctx context.Context) ([]failover.VM, error)
+	getVM          func(ctx context.Context, vmUUID string) (*failover.VM, error)
+	isServerActive func(ctx context.Context, vmUUID string) (bool, error)
+	getDeletedVM   func(ctx context.Context, vmUUID string) (*failover.DeletedVMInfo, error)
+}
+
+func (m *mockVMSource) ListVMs(ctx context.Context) ([]failover.VM, error) {
+	if m.listVMs != nil {
+		return m.listVMs(ctx)
+	}
+	return nil, nil
+}
+
+func (m *mockVMSource) GetVM(ctx context.Context, vmUUID string) (*failover.VM, error) {
+	if m.getVM != nil {
+		return m.getVM(ctx, vmUUID)
+	}
+	return nil, nil
+}
+
+// ListVMsOnHypervisors is not overridable in this mock; it always returns no VMs.
+func (m *mockVMSource) ListVMsOnHypervisors(_ context.Context, _ *hv1.HypervisorList, _ bool) ([]failover.VM, error) {
+	return nil, nil
+}
+
+func (m *mockVMSource) IsServerActive(ctx context.Context, vmUUID string) (bool, error) {
+	if m.isServerActive != nil {
+		return m.isServerActive(ctx, vmUUID)
+	}
+	return false, nil
+}
+
+func (m *mockVMSource) GetDeletedVMInfo(ctx context.Context, vmUUID string) (*failover.DeletedVMInfo, error) {
+	if m.getDeletedVM != nil {
+		return m.getDeletedVM(ctx, vmUUID)
+	}
+	return nil, nil
+}
diff --git a/internal/scheduling/reservations/quota/integration_test.go b/internal/scheduling/reservations/quota/integration_test.go
new file mode 100644
index 000000000..36977203e
--- /dev/null
+++ b/internal/scheduling/reservations/quota/integration_test.go
@@ -0,0 +1,1232 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package quota
+
+import (
+	"context"
+	"encoding/json"
+	"testing"
+	"time"
+
+	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
+	"github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
+	"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover"
+	hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// ============================================================================ +// Integration Tests +// ============================================================================ + +func TestIntegration(t *testing.T) { + lastReconcileTime := metav1.NewTime(time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)) + + tests := []IntegrationTestCase{ + { + Name: "full reconcile - basic usage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // project-a: hana_v2 az-1: (32768+65536)/32768 = 3 RAM units, 8+16=24 cores + // project-a: hana_v2 az-2: 32768/32768 = 1 RAM unit, 8 cores + // project-a: general az-1: 4096/4096 = 1 RAM unit, 2 cores + // project-b: general az-1: 4096/4096 = 1 RAM unit, 2 cores + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + // No CRs -> PaygUsage == TotalUsage + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 
2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "full reconcile - with CRs reduces PaygUsage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + // 2 units of hana_v2 RAM committed in az-1 for project-a + makeCR("cr-1", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusConfirmed, int64Ptr(2)), + // 10 cores committed in az-1 for project-a + makeCR("cr-2", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeCores, v1alpha1.CommitmentStatusConfirmed, int64Ptr(10)), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + // PaygUsage = TotalUsage - CRUsage + // hana_v2 RAM: 3-2=1 in az-1, 1-0=1 in az-2 + // hana_v2 Cores: 24-10=14 in az-1, 8-0=8 in az-2 + // general: no CRs so PaygUsage == TotalUsage + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 1, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 14, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental add - new VM after last reconcile", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: 
[]*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile to establish baseline + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a NEW VM (created after last reconcile) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new"), // new instance + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", // far future, always AFTER last reconcile + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + // vm-new is created AFTER last reconcile, so it gets incremented + // +1 RAM unit (32768/32768), +8 cores + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental add - migration skipped (VM created before last reconcile)", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", 
nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds vm-1 (which was created BEFORE last reconcile = migration) + { + Type: "hv_diff", + OldHV: makeHV("hv-2", []hv1.Instance{}), + NewHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-1"), // migrated here, created before reconcile + }), + // Should NOT increment -- vm-1 CreatedAt is 2025-12-01 which is before reconcile time + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental remove - deleted VM decrements usage", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + // vm-del is not in VMs (deleted), but has info in DeletedVMs + DeletedVMs: map[string]*failover.DeletedVMInfo{ + "vm-del": { + ProjectID: "project-a", + FlavorName: "m1.hana_v2.small", + AvailabilityZone: "az-1", + RAMMiB: 32768, + VCPUs: 8, + }, + }, + ActiveVMs: map[string]bool{ + "vm-del": false, // not active (truly deleted) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile (vm-del not in VMs so not counted) + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + 
"hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff removes vm-del (was on HV before, now gone) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-del"), // was here + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + // vm-del gone + }), + // vm-del: IsServerActive=false, deleted info found + // Decrement: -1 RAM unit, -8 cores in az-1 + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 16, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "incremental remove - migrated VM not decremented", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ActiveVMs: map[string]bool{ + "vm-1": true, // still active (migrated to another HV) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // HV reports vm-1 removed (migrated away) + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + 
activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-2"), + // vm-1 gone from this HV + }), + // vm-1: IsServerActive=true, so NOT decremented + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "CR update triggers PaygUsage recompute", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + makeCR("cr-ram-1", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusConfirmed, int64Ptr(1)), + }, + Actions: []TestAction{ + // Step 1: full reconcile with initial CR (UsedAmount=1) + { + Type: "full_reconcile", + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 2, "az-2": 1}}, // 3-1=2 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: CR UsedAmount increases to 3 -> PaygUsage should drop + { + Type: "cr_update", + CRName: "cr-ram-1", + UsedAmount: 3, + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 0, "az-2": 1}}, // 3-3=0 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": 
{PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "unknown flavor VMs are skipped", + FlavorGroups: testFlavorGroups, + VMs: []failover.VM{ + { + UUID: "vm-unknown", FlavorName: "nonexistent-flavor", + ProjectID: "project-x", AvailabilityZone: "az-1", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), + "vcpus": resource.MustParse("2"), + }, + }, + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-x", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // No usage for project-x (unknown flavor skipped) + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-x": {}, + }, + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-x": {}, + }, + }, + }, + }, + { + Name: "multiple full reconciles are idempotent", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Second full reconcile - same result + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: 
map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "pending CRs are excluded from PaygUsage deduction", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + CommittedResources: []*v1alpha1.CommittedResource{ + // Pending CR should NOT reduce PaygUsage + makeCR("cr-pending", "project-a", "hana_v2", "az-1", + v1alpha1.CommittedResourceTypeMemory, v1alpha1.CommitmentStatusPending, int64Ptr(5)), + }, + Actions: []TestAction{ + { + Type: "full_reconcile", + // PaygUsage == TotalUsage because pending CRs are excluded + ExpectedPaygUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "full reconcile corrects incremental drift", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile establishes correct baseline + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a short-lived "phantom" VM (created after 
reconcile, + // but deleted before the next full reconcile runs). The incremental path + // bumps TotalUsage by +1 RAM / +8 cores. + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-phantom"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-phantom", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", // after last reconcile + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + // TotalUsage now has phantom's contribution (drift) + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // 3+1 drift + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // 24+8 drift + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 3: full reconcile re-scans all VMs. Reset VM list to baseline + // (vm-phantom is gone). This corrects the drift back to the ground truth. 
+ { + Type: "full_reconcile", + OverrideVMs: baseVMsPtr(), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // corrected + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // corrected + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + { + Name: "complex multi-project scenario with adds, removes, and reconcile corrections", + FlavorGroups: testFlavorGroups, + VMs: testVMs, + DeletedVMs: map[string]*failover.DeletedVMInfo{ + "vm-del": { + ProjectID: "project-a", + FlavorName: "m1.hana_v2.small", + AvailabilityZone: "az-1", + RAMMiB: 32768, + VCPUs: 8, + }, + }, + ActiveVMs: map[string]bool{ + "vm-del": false, // truly deleted + "vm-1": true, // still active (for migration scenario) + }, + ProjectQuotas: []*v1alpha1.ProjectQuota{ + makePQ("project-a", nil), + makePQ("project-b", nil), + }, + Actions: []TestAction{ + // Step 1: full reconcile establishes baseline for both projects + // project-a hana_v2: az-1=3 RAM / 24 cores, az-2=1 RAM / 8 cores; general: az-1=1 RAM / 2 cores + // project-b general: az-1=1 RAM / 2 cores + { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 2: HV diff adds a genuine new VM to project-a (hana_v2 small, az-1) + // +1 RAM unit, +8 cores + { + Type: 
"hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), // 32768 MiB = 1 RAM unit + "vcpus": resource.MustParse("8"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 3: HV diff adds a phantom VM to project-b (general, az-1) + // This is a short-lived VM that will disappear -- DRIFT for project-b + { + Type: "hv_diff", + OldHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-5"), + }), + NewHV: makeHV("hv-2", []hv1.Instance{ + activeInstance("vm-5"), + activeInstance("vm-phantom-b"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-phantom-b", FlavorName: "m1.general.small", + ProjectID: "project-b", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("4294967296"), // 4096 MiB = 1 RAM unit + "vcpus": resource.MustParse("2"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 2}}, // 1+1 drift + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 4}}, // 2+2 drift + }, + }, + }, + // Step 4: HV diff removes vm-del from project-a 
(truly deleted) + // -1 RAM unit, -8 cores in az-1 + { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + activeInstance("vm-del"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + ), + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // 4-1=3 + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // 32-8=24 + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 5: full reconcile with OverrideVMs that includes vm-new-a + // (vm-new-a is now "real" and appears in the VM list). 
+ // This reconcile: + // - project-a: FIXES drift -- truth is 4 (vm-new-a in list), delta said 3 + // - project-b: FIXES drift -- truth is 1, delta said 2 (phantom gone) + { + Type: "full_reconcile", + OverrideVMs: &[]failover.VM{ + // testVMs + vm-new-a + testVMs[0], testVMs[1], testVMs[2], testVMs[3], testVMs[4], + { + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + }, + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // corrected up + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // corrected up + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, // corrected down + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, // corrected down + }, + }, + }, + // Step 6: another HV diff removes vm-1 from a HV (migration, not deletion). + // vm-1 is still active (ActiveVMs["vm-1"]=true), so NOT decremented. 
+ { + Type: "hv_diff", + OldHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-1"), + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + NewHV: makeHV("hv-1", []hv1.Instance{ + activeInstance("vm-2"), + activeInstance("vm-new-a"), + }), + OverrideVMs: withExtraVMs( + failover.VM{ + UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", + ProjectID: "project-a", AvailabilityZone: "az-1", + CreatedAt: "2099-01-01T00:00:00Z", + Resources: map[string]resource.Quantity{ + "memory": resource.MustParse("34359738368"), + "vcpus": resource.MustParse("8"), + }, + }, + ), + // vm-1 migrated, NOT decremented -- totals unchanged + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + // Step 7: final full reconcile confirms everything matches (no drift). + // This is the "reconcile that matches the deltas" -- nothing to fix. 
+ { + Type: "full_reconcile", + ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ + "project-a": { + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, + "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + "project-b": { + "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, + "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, + }, + }, + }, + }, + }, + } + + _ = lastReconcileTime // referenced by test data (VM CreatedAt values) + + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + env := newIntegrationTestEnv(t, tc) + + for i, action := range tc.Actions { + t.Logf(" action %d: %s", i+1, action.Type) + env.executeAction(action) + } + }) + } +} + +// ============================================================================ +// Test Data +// ============================================================================ + +var testFlavorGroups = map[string]compute.FlavorGroupFeature{ + "hana_v2": { + Name: "hana_v2", + SmallestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.small", MemoryMB: 32768, VCPUs: 8}, + LargestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.large", MemoryMB: 65536, VCPUs: 16}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.hana_v2.small", MemoryMB: 32768, VCPUs: 8}, + {Name: "m1.hana_v2.large", MemoryMB: 65536, VCPUs: 16}, + }, + }, + "general": { + Name: "general", + SmallestFlavor: compute.FlavorInGroup{Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + LargestFlavor: compute.FlavorInGroup{Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + Flavors: []compute.FlavorInGroup{ + {Name: "m1.general.small", MemoryMB: 4096, VCPUs: 2}, + }, + }, +} + +// Standard VM set for most tests. +// project-a has VMs in BOTH flavor groups (hana_v2 and general). +// project-b has only general VMs. 
var testVMs = []failover.VM{
	// vm-1: hana_v2, 1 RAM unit (32768/32768), 8 cores
	{
		UUID: "vm-1", FlavorName: "m1.hana_v2.small",
		ProjectID: "project-a", AvailabilityZone: "az-1",
		// Predates every reconcile in the scenarios, so incremental hv_diff
		// steps treat this VM as "created before last reconcile" (migration).
		CreatedAt: "2025-12-01T00:00:00Z",
		Resources: map[string]resource.Quantity{
			"memory": resource.MustParse("34359738368"), // 32768 MiB (value is in bytes)
			"vcpus":  resource.MustParse("8"),
		},
	},
	// vm-2: hana_v2, 2 RAM units (65536/32768), 16 cores
	{
		UUID: "vm-2", FlavorName: "m1.hana_v2.large",
		ProjectID: "project-a", AvailabilityZone: "az-1",
		CreatedAt: "2025-12-01T00:00:00Z",
		Resources: map[string]resource.Quantity{
			"memory": resource.MustParse("68719476736"), // 65536 MiB (bytes)
			"vcpus":  resource.MustParse("16"),
		},
	},
	// vm-3: hana_v2, 1 RAM unit (32768/32768), 8 cores
	{
		UUID: "vm-3", FlavorName: "m1.hana_v2.small",
		ProjectID: "project-a", AvailabilityZone: "az-2",
		CreatedAt: "2025-12-01T00:00:00Z",
		Resources: map[string]resource.Quantity{
			"memory": resource.MustParse("34359738368"), // 32768 MiB (bytes)
			"vcpus":  resource.MustParse("8"),
		},
	},
	// vm-4: general, 1 RAM unit (4096/4096), 2 cores
	{
		UUID: "vm-4", FlavorName: "m1.general.small",
		ProjectID: "project-a", AvailabilityZone: "az-1",
		CreatedAt: "2025-12-01T00:00:00Z",
		Resources: map[string]resource.Quantity{
			"memory": resource.MustParse("4294967296"), // 4096 MiB (bytes)
			"vcpus":  resource.MustParse("2"),
		},
	},
	// vm-5: general, 1 RAM unit (4096/4096), 2 cores
	{
		UUID: "vm-5", FlavorName: "m1.general.small",
		ProjectID: "project-b", AvailabilityZone: "az-1",
		CreatedAt: "2025-12-01T00:00:00Z",
		Resources: map[string]resource.Quantity{
			"memory": resource.MustParse("4294967296"), // 4096 MiB (bytes)
			"vcpus":  resource.MustParse("2"),
		},
	},
}

// ============================================================================
// Integration Test Framework
// ============================================================================

// TestAction defines a single step in an integration test scenario.
type TestAction struct {
	// Type of action to perform.
	// "full_reconcile" - run ReconcilePeriodic
	// "hv_diff" - run ReconcileHVDiff with OldHV/NewHV
	// "cr_update" - update a CR's UsedAmount, then run Reconcile (watch-triggered)
	Type string

	// For hv_diff actions: hypervisor state before (OldHV) and after (NewHV)
	// the observed change.
	OldHV *hv1.Hypervisor
	NewHV *hv1.Hypervisor

	// OverrideVMs, when non-nil, replaces the VMSource (ListVMs + GetVM) for
	// THIS action and all subsequent actions. Use to simulate VMs appearing or
	// disappearing between steps. To "undo" a temporary VM, set OverrideVMs
	// again in a later action without that VM.
	OverrideVMs *[]failover.VM

	// For cr_update actions: name of the CommittedResource to patch and the
	// new Status.UsedAmount value to write before triggering Reconcile.
	CRName     string
	UsedAmount int64

	// Optional: verify state AFTER this action completes.
	// Keys are project IDs. If nil, no verification for this step.
	ExpectedTotalUsage map[string]map[string]v1alpha1.ResourceQuotaUsage
	ExpectedPaygUsage  map[string]map[string]v1alpha1.ResourceQuotaUsage
}

// IntegrationTestCase defines a complete integration test scenario.
type IntegrationTestCase struct {
	Name string

	// Initial state seeded into the fake client and mock VMSource
	VMs        []failover.VM
	DeletedVMs map[string]*failover.DeletedVMInfo // UUID -> deleted VM info
	ActiveVMs  map[string]bool                    // UUID -> IsServerActive response

	FlavorGroups       map[string]compute.FlavorGroupFeature
	ProjectQuotas      []*v1alpha1.ProjectQuota
	CommittedResources []*v1alpha1.CommittedResource

	// Ordered actions with per-step verification
	Actions []TestAction
}

// integrationTestEnv holds the test environment for a single test case.
+type integrationTestEnv struct { + t *testing.T + client client.Client + controller *QuotaController + vmSource *mockVMSource +} + +func newIntegrationTestEnv(t *testing.T, tc IntegrationTestCase) *integrationTestEnv { + t.Helper() + + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add v1alpha1 to scheme: %v", err) + } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("failed to add hv1 to scheme: %v", err) + } + + // Build initial objects list + var objects []client.Object + + // Create Knowledge CRD with flavor groups + knowledgeCRD := buildKnowledgeCRD(t, tc.FlavorGroups) + objects = append(objects, knowledgeCRD) + + // Add ProjectQuotas + for _, pq := range tc.ProjectQuotas { + objects = append(objects, pq) + } + + // Add CommittedResources + for _, cr := range tc.CommittedResources { + objects = append(objects, cr) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + WithStatusSubresource( + &v1alpha1.ProjectQuota{}, + &v1alpha1.CommittedResource{}, + &v1alpha1.Knowledge{}, + ). 
+ Build() + + // Build mock VMSource + vmSrc := &mockVMSource{ + listVMs: func(_ context.Context) ([]failover.VM, error) { + return tc.VMs, nil + }, + getVM: func(_ context.Context, vmUUID string) (*failover.VM, error) { + for i := range tc.VMs { + if tc.VMs[i].UUID == vmUUID { + return &tc.VMs[i], nil + } + } + return nil, nil + }, + isServerActive: func(_ context.Context, vmUUID string) (bool, error) { + if tc.ActiveVMs != nil { + if active, ok := tc.ActiveVMs[vmUUID]; ok { + return active, nil + } + } + return false, nil + }, + getDeletedVM: func(_ context.Context, vmUUID string) (*failover.DeletedVMInfo, error) { + if tc.DeletedVMs != nil { + if info, ok := tc.DeletedVMs[vmUUID]; ok { + return info, nil + } + } + return nil, nil + }, + } + + controller := &QuotaController{ + Client: k8sClient, + VMSource: vmSrc, + Config: DefaultQuotaControllerConfig(), + Metrics: NewQuotaMetrics(nil), // no-op metrics + } + + return &integrationTestEnv{ + t: t, + client: k8sClient, + controller: controller, + vmSource: vmSrc, + } +} + +func (env *integrationTestEnv) verifyTotalUsage(projectID string, expected map[string]v1alpha1.ResourceQuotaUsage) { + env.t.Helper() + crdName := "quota-" + projectID + var pq v1alpha1.ProjectQuota + if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) + } + + if expected == nil && pq.Status.TotalUsage == nil { + return // both nil, ok + } + + for resourceName, expectedUsage := range expected { + actual, ok := pq.Status.TotalUsage[resourceName] + if !ok { + env.t.Errorf("project %s: expected TotalUsage resource %q not found", projectID, resourceName) + continue + } + for az, expectedAmount := range expectedUsage.PerAZ { + if actual.PerAZ[az] != expectedAmount { + env.t.Errorf("project %s: TotalUsage[%s][%s] = %d, want %d", + projectID, resourceName, az, actual.PerAZ[az], expectedAmount) + } + } + } + + // Check no unexpected resources 
+ for resourceName := range pq.Status.TotalUsage { + if _, ok := expected[resourceName]; !ok { + env.t.Errorf("project %s: unexpected TotalUsage resource %q", projectID, resourceName) + } + } +} + +func (env *integrationTestEnv) verifyPaygUsage(projectID string, expected map[string]v1alpha1.ResourceQuotaUsage) { + env.t.Helper() + crdName := "quota-" + projectID + var pq v1alpha1.ProjectQuota + if err := env.client.Get(context.Background(), client.ObjectKey{Name: crdName}, &pq); err != nil { + env.t.Fatalf("failed to get ProjectQuota %s: %v", crdName, err) + } + + if expected == nil && pq.Status.PaygUsage == nil { + return + } + + for resourceName, expectedUsage := range expected { + actual, ok := pq.Status.PaygUsage[resourceName] + if !ok { + env.t.Errorf("project %s: expected PaygUsage resource %q not found", projectID, resourceName) + continue + } + for az, expectedAmount := range expectedUsage.PerAZ { + if actual.PerAZ[az] != expectedAmount { + env.t.Errorf("project %s: PaygUsage[%s][%s] = %d, want %d", + projectID, resourceName, az, actual.PerAZ[az], expectedAmount) + } + } + } + + for resourceName := range pq.Status.PaygUsage { + if _, ok := expected[resourceName]; !ok { + env.t.Errorf("project %s: unexpected PaygUsage resource %q", projectID, resourceName) + } + } +} + +func (env *integrationTestEnv) executeAction(action TestAction) { + env.t.Helper() + ctx := context.Background() + + // Apply OverrideVMs if set (persists for all subsequent actions) + if action.OverrideVMs != nil { + vms := *action.OverrideVMs + env.vmSource.listVMs = func(_ context.Context) ([]failover.VM, error) { + return vms, nil + } + env.vmSource.getVM = func(_ context.Context, vmUUID string) (*failover.VM, error) { + for i := range vms { + if vms[i].UUID == vmUUID { + return &vms[i], nil + } + } + return nil, nil + } + } + + switch action.Type { + case "full_reconcile": + if err := env.controller.ReconcilePeriodic(ctx); err != nil { + env.t.Fatalf("ReconcilePeriodic failed: %v", err) 
+ } + + case "hv_diff": + if err := env.controller.ReconcileHVDiff(ctx, action.OldHV, action.NewHV); err != nil { + env.t.Fatalf("ReconcileHVDiff failed: %v", err) + } + + case "cr_update": + // Fetch the CR, update UsedAmount, then call Reconcile + var cr v1alpha1.CommittedResource + if err := env.client.Get(ctx, client.ObjectKey{Name: action.CRName}, &cr); err != nil { + env.t.Fatalf("failed to get CR %s: %v", action.CRName, err) + } + usedQty := resource.NewQuantity(action.UsedAmount, resource.DecimalSI) + cr.Status.UsedAmount = usedQty + if err := env.client.Status().Update(ctx, &cr); err != nil { + env.t.Fatalf("failed to update CR %s status: %v", action.CRName, err) + } + + // Simulate watch trigger: call Reconcile for the affected project + pqName := "quota-" + cr.Spec.ProjectID + _, err := env.controller.Reconcile(ctx, reconcileRequest(pqName)) + if err != nil { + env.t.Fatalf("Reconcile failed after CR update: %v", err) + } + + default: + env.t.Fatalf("unknown action type: %s", action.Type) + } + + // Verify expected state after this action + if action.ExpectedTotalUsage != nil { + for projectID, expected := range action.ExpectedTotalUsage { + env.verifyTotalUsage(projectID, expected) + } + } + if action.ExpectedPaygUsage != nil { + for projectID, expected := range action.ExpectedPaygUsage { + env.verifyPaygUsage(projectID, expected) + } + } +} + +// ============================================================================ +// Helpers +// ============================================================================ + +func buildKnowledgeCRD(t *testing.T, flavorGroups map[string]compute.FlavorGroupFeature) *v1alpha1.Knowledge { + t.Helper() + + // Convert map to slice for BoxFeatureList + var features []compute.FlavorGroupFeature + for _, fg := range flavorGroups { + features = append(features, fg) + } + + raw, err := boxFlavorGroupFeatures(features) + if err != nil { + t.Fatalf("failed to box flavor group features: %v", err) + } + + return 
&v1alpha1.Knowledge{ + ObjectMeta: metav1.ObjectMeta{ + Name: "flavor-groups", + }, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: "nova", + }, + Status: v1alpha1.KnowledgeStatus{ + Raw: raw, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.KnowledgeConditionReady, + Status: metav1.ConditionTrue, + LastTransitionTime: metav1.Now(), + Reason: "Ready", + }, + }, + }, + } +} + +func boxFlavorGroupFeatures(features []compute.FlavorGroupFeature) (runtime.RawExtension, error) { + rawSerialized := struct { + Features []compute.FlavorGroupFeature `json:"features"` + }{ + Features: features, + } + data, err := json.Marshal(rawSerialized) + if err != nil { + return runtime.RawExtension{}, err + } + return runtime.RawExtension{Raw: data}, nil +} + +func reconcileRequest(name string) ctrl.Request { + return ctrl.Request{NamespacedName: client.ObjectKey{Name: name}} +} + +func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuota { + return &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{Name: "quota-" + projectID}, + Spec: v1alpha1.ProjectQuotaSpec{ProjectID: projectID, DomainID: "domain-1"}, + Status: v1alpha1.ProjectQuotaStatus{ + LastReconcileAt: lastReconcileAt, + }, + } +} + +func makeCR(name, projectID, flavorGroup, az string, resourceType v1alpha1.CommittedResourceType, state v1alpha1.CommitmentStatus, usedAmount *int64) *v1alpha1.CommittedResource { + cr := &v1alpha1.CommittedResource{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1alpha1.CommittedResourceSpec{ + CommitmentUUID: name + "-uuid", + FlavorGroupName: flavorGroup, + ResourceType: resourceType, + AvailabilityZone: az, + ProjectID: projectID, + DomainID: "domain-1", + Amount: resource.MustParse("10"), + State: state, + }, + } + if usedAmount != nil { + qty := resource.NewQuantity(*usedAmount, resource.DecimalSI) + cr.Status.UsedAmount = qty + } + return cr +} + +func int64Ptr(v int64) *int64 { return &v } + +// withExtraVMs returns a pointer to testVMs + 
additional VMs. +// Used with OverrideVMs to add VMs to the "world" for an action. +func withExtraVMs(extra ...failover.VM) *[]failover.VM { + vms := append(append([]failover.VM{}, testVMs...), extra...) + return &vms +} + +// baseVMsPtr returns a pointer to a copy of testVMs (resets to baseline). +func baseVMsPtr() *[]failover.VM { + vms := append([]failover.VM{}, testVMs...) + return &vms +} + +func makeHV(name string, instances []hv1.Instance) *hv1.Hypervisor { + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Status: hv1.HypervisorStatus{ + Instances: instances, + }, + } +} + +func activeInstance(id string) hv1.Instance { + return hv1.Instance{ID: id, Active: true} +} diff --git a/internal/scheduling/reservations/quota/metrics.go b/internal/scheduling/reservations/quota/metrics.go new file mode 100644 index 000000000..7263ab1fd --- /dev/null +++ b/internal/scheduling/reservations/quota/metrics.go @@ -0,0 +1,98 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package quota + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +// QuotaMetrics holds Prometheus metrics for the quota controller. +type QuotaMetrics struct { + totalUsageGauge *prometheus.GaugeVec + paygUsageGauge *prometheus.GaugeVec + crUsageGauge *prometheus.GaugeVec + reconcileDuration prometheus.Histogram + reconcileResultVec *prometheus.CounterVec +} + +// NewQuotaMetrics creates a new QuotaMetrics instance and registers with the given registerer. 
+func NewQuotaMetrics(reg prometheus.Registerer) *QuotaMetrics { + m := &QuotaMetrics{ + totalUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_total_usage", + Help: "Total resource usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + paygUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_payg_usage", + Help: "Pay-as-you-go usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + crUsageGauge: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "cortex_quota_cr_usage", + Help: "Committed resource usage per project/AZ/resource", + }, + []string{"project_id", "availability_zone", "resource"}, + ), + reconcileDuration: prometheus.NewHistogram( + prometheus.HistogramOpts{ + Name: "cortex_quota_reconcile_duration_seconds", + Help: "Duration of quota controller full reconcile", + Buckets: prometheus.ExponentialBuckets(0.1, 2, 10), + }, + ), + reconcileResultVec: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "cortex_quota_reconcile_total", + Help: "Total number of periodic reconcile attempts by result (success/failure)", + }, + []string{"result"}, + ), + } + + if reg != nil { + reg.MustRegister(m.totalUsageGauge) + reg.MustRegister(m.paygUsageGauge) + reg.MustRegister(m.crUsageGauge) + reg.MustRegister(m.reconcileDuration) + reg.MustRegister(m.reconcileResultVec) + } + + return m +} + +// RecordUsage records usage metrics for a project/AZ/resource. 
+func (m *QuotaMetrics) RecordUsage(projectID, az, resource string, totalUsage, paygUsage, crUsage int64) { + if m == nil { + return + } + m.totalUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(totalUsage)) + m.paygUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(paygUsage)) + m.crUsageGauge.WithLabelValues(projectID, az, resource).Set(float64(crUsage)) +} + +// RecordReconcileDuration records the duration of a full reconcile. +func (m *QuotaMetrics) RecordReconcileDuration(seconds float64) { + if m == nil { + return + } + m.reconcileDuration.Observe(seconds) +} + +// RecordReconcileResult increments the success or failure counter for periodic reconciles. +func (m *QuotaMetrics) RecordReconcileResult(success bool) { + if m == nil { + return + } + result := "failure" + if success { + result = "success" + } + m.reconcileResultVec.WithLabelValues(result).Inc() +} From 8b1a17f56f4254958ebecd45f7c71dfd8aca3448 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Tue, 5 May 2026 07:43:40 +0000 Subject: [PATCH 03/10] test: simplify integration test OverrideVMs, fix lint --- .../reservations/quota/integration_test.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/scheduling/reservations/quota/integration_test.go b/internal/scheduling/reservations/quota/integration_test.go index 36977203e..32ffadc7c 100644 --- a/internal/scheduling/reservations/quota/integration_test.go +++ b/internal/scheduling/reservations/quota/integration_test.go @@ -191,7 +191,7 @@ func TestIntegration(t *testing.T) { }, // Step 2: HV diff adds vm-1 (which was created BEFORE last reconcile = migration) { - Type: "hv_diff", + Type: "hv_diff", OldHV: makeHV("hv-2", []hv1.Instance{}), NewHV: makeHV("hv-2", []hv1.Instance{ activeInstance("vm-1"), // migrated here, created before reconcile @@ -498,7 +498,7 @@ func TestIntegration(t *testing.T) { // TotalUsage now has phantom's contribution (drift) ExpectedTotalUsage: 
map[string]map[string]v1alpha1.ResourceQuotaUsage{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // 3+1 drift + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // 3+1 drift "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // 24+8 drift "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, @@ -512,7 +512,7 @@ func TestIntegration(t *testing.T) { OverrideVMs: baseVMsPtr(), ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // corrected + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // corrected "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // corrected "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, @@ -638,7 +638,7 @@ func TestIntegration(t *testing.T) { activeInstance("vm-2"), activeInstance("vm-new-a"), }), - + OverrideVMs: withExtraVMs( failover.VM{ UUID: "vm-new-a", FlavorName: "m1.hana_v2.small", @@ -652,7 +652,7 @@ func TestIntegration(t *testing.T) { ), ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // 4-1=3 + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 3, "az-2": 1}}, // 4-1=3 "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 24, "az-2": 8}}, // 32-8=24 "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, @@ -681,7 +681,7 @@ func TestIntegration(t *testing.T) { }, ExpectedTotalUsage: map[string]map[string]v1alpha1.ResourceQuotaUsage{ "project-a": { - "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, 
"az-2": 1}}, // corrected up + "hw_version_hana_v2_ram": {PerAZ: map[string]int64{"az-1": 4, "az-2": 1}}, // corrected up "hw_version_hana_v2_cores": {PerAZ: map[string]int64{"az-1": 32, "az-2": 8}}, // corrected up "hw_version_general_ram": {PerAZ: map[string]int64{"az-1": 1}}, "hw_version_general_cores": {PerAZ: map[string]int64{"az-1": 2}}, @@ -1172,7 +1172,7 @@ func reconcileRequest(name string) ctrl.Request { return ctrl.Request{NamespacedName: client.ObjectKey{Name: name}} } -func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuota { +func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuota { //nolint:unparam return &v1alpha1.ProjectQuota{ ObjectMeta: metav1.ObjectMeta{Name: "quota-" + projectID}, Spec: v1alpha1.ProjectQuotaSpec{ProjectID: projectID, DomainID: "domain-1"}, @@ -1182,7 +1182,7 @@ func makePQ(projectID string, lastReconcileAt *metav1.Time) *v1alpha1.ProjectQuo } } -func makeCR(name, projectID, flavorGroup, az string, resourceType v1alpha1.CommittedResourceType, state v1alpha1.CommitmentStatus, usedAmount *int64) *v1alpha1.CommittedResource { +func makeCR(name, projectID, flavorGroup, az string, resourceType v1alpha1.CommittedResourceType, state v1alpha1.CommitmentStatus, usedAmount *int64) *v1alpha1.CommittedResource { //nolint:unparam cr := &v1alpha1.CommittedResource{ ObjectMeta: metav1.ObjectMeta{Name: name}, Spec: v1alpha1.CommittedResourceSpec{ From 8045609e18f6bc9980a419a0078fdf5da4dbed8b Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Tue, 5 May 2026 07:56:29 +0000 Subject: [PATCH 04/10] quota controller: add LastFullReconcileAt, domainID validation, RetryOnConflict in API, generation predicate - Add LastFullReconcileAt to ProjectQuotaStatus (separate watermark for incremental add detection, not advanced by PaygUsage-only recomputes) - isVMNewSinceLastReconcile now uses LastFullReconcileAt instead of LastReconcileAt to avoid false skips after CR-triggered recomputes - Add domainID 
validation in HandleQuota (400 if missing) - Wrap HandleQuota create/update in RetryOnConflict to handle concurrent status updates from the quota controller - Add projectQuotaGenerationChangePredicate (GenerationChangedPredicate) on ProjectQuota For() watch to prevent infinite reconcile loops from status-only updates - Add CreatedAt field to VM struct, populated in buildVMsFromHypervisors Ref: BLI #376 --- api/v1alpha1/project_quota_types.go | 8 +- api/v1alpha1/zz_generated.deepcopy.go | 4 + .../crds/cortex.cloud_projectquotas.yaml | 9 ++- .../reservations/commitments/api/quota.go | 81 +++++++++++-------- .../commitments/api/quota_test.go | 26 +++++- .../reservations/failover/vm_source.go | 1 + .../reservations/quota/controller.go | 29 ++++--- .../reservations/quota/controller_test.go | 3 +- 8 files changed, 111 insertions(+), 50 deletions(-) diff --git a/api/v1alpha1/project_quota_types.go b/api/v1alpha1/project_quota_types.go index 715b6e728..113a0aa1f 100644 --- a/api/v1alpha1/project_quota_types.go +++ b/api/v1alpha1/project_quota_types.go @@ -81,10 +81,16 @@ type ProjectQuotaStatus struct { // +kubebuilder:validation:Optional PaygUsage map[string]ResourceQuotaUsage `json:"paygUsage,omitempty"` - // LastReconcileAt is when the controller last reconciled this project's quota. + // LastReconcileAt is when the controller last reconciled this project's quota (any path). // +kubebuilder:validation:Optional LastReconcileAt *metav1.Time `json:"lastReconcileAt,omitempty"` + // LastFullReconcileAt is when the periodic full reconcile last completed for this project. + // Used as the watermark for isVMNewSinceLastReconcile (incremental add detection). + // Only updated by ReconcilePeriodic, NOT by PaygUsage recomputes or incremental deltas. + // +kubebuilder:validation:Optional + LastFullReconcileAt *metav1.Time `json:"lastFullReconcileAt,omitempty"` + // Conditions holds the current status conditions. 
// +kubebuilder:validation:Optional Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 873beb73c..d72f4d078 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1522,6 +1522,10 @@ func (in *ProjectQuotaStatus) DeepCopyInto(out *ProjectQuotaStatus) { in, out := &in.LastReconcileAt, &out.LastReconcileAt *out = (*in).DeepCopy() } + if in.LastFullReconcileAt != nil { + in, out := &in.LastFullReconcileAt, &out.LastFullReconcileAt + *out = (*in).DeepCopy() + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]v1.Condition, len(*in)) diff --git a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml index 7bebbdb6c..c196daf42 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml @@ -179,9 +179,16 @@ spec: - type type: object type: array + lastFullReconcileAt: + description: |- + LastFullReconcileAt is when the periodic full reconcile last completed for this project. + Used as the watermark for isVMNewSinceLastReconcile (incremental add detection). + Only updated by ReconcilePeriodic, NOT by PaygUsage recomputes or incremental deltas. + format: date-time + type: string lastReconcileAt: description: LastReconcileAt is when the controller last reconciled - this project's quota. + this project's quota (any path). 
format: date-time type: string paygUsage: diff --git a/internal/scheduling/reservations/commitments/api/quota.go b/internal/scheduling/reservations/commitments/api/quota.go index 37b57d22a..9c34e879c 100644 --- a/internal/scheduling/reservations/commitments/api/quota.go +++ b/internal/scheduling/reservations/commitments/api/quota.go @@ -16,6 +16,7 @@ import ( "github.com/sapcc/go-api-declarations/liquid" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -83,6 +84,11 @@ func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { domainName = meta.Domain.Name } + if domainID == "" { + api.quotaError(w, http.StatusBadRequest, "missing domain UUID in project metadata", startTime) + return + } + // Build the spec quota map from the liquid request. // liquid API uses uint64; our CRD uses int64 (K8s convention). // Guard against overflow: uint64 values > MaxInt64 would wrap to negative. @@ -108,40 +114,43 @@ func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { specQuota[string(resourceName)] = rq } - // Create or update ProjectQuota CRD + // Create or update ProjectQuota CRD with retry-on-conflict to handle + // concurrent status updates from the quota controller. 
crdName := projectQuotaCRDName(projectID) ctx := r.Context() - var existing v1alpha1.ProjectQuota - err = api.client.Get(ctx, client.ObjectKey{Name: crdName}, &existing) - if err != nil { - if !apierrors.IsNotFound(err) { - // Real error - log.Error(err, "failed to get existing ProjectQuota", "name", crdName) - api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to check existing quota: %v", err), startTime) - return - } - // Not found — create new - pq := &v1alpha1.ProjectQuota{ - ObjectMeta: metav1.ObjectMeta{ - Name: crdName, - }, - Spec: v1alpha1.ProjectQuotaSpec{ - ProjectID: projectID, - ProjectName: projectName, - DomainID: domainID, - DomainName: domainName, - Quota: specQuota, - }, - } - if err := api.client.Create(ctx, pq); err != nil { - log.Error(err, "failed to create ProjectQuota", "name", crdName) - api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to create quota: %v", err), startTime) - return + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + var existing v1alpha1.ProjectQuota + getErr := api.client.Get(ctx, client.ObjectKey{Name: crdName}, &existing) + if getErr != nil { + if !apierrors.IsNotFound(getErr) { + return getErr + } + // Not found -- create new + pq := &v1alpha1.ProjectQuota{ + ObjectMeta: metav1.ObjectMeta{ + Name: crdName, + }, + Spec: v1alpha1.ProjectQuotaSpec{ + ProjectID: projectID, + ProjectName: projectName, + DomainID: domainID, + DomainName: domainName, + Quota: specQuota, + }, + } + if createErr := api.client.Create(ctx, pq); createErr != nil { + // NOTE: AlreadyExists is not a Conflict, so RetryOnConflict will NOT retry it; surface the error + if apierrors.IsAlreadyExists(createErr) { + return createErr + } + return createErr + } + log.V(1).Info("created ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) + return nil } - log.V(1).Info("created ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) - } else { - // Update existing + + // 
Update existing (re-fetched on each retry to get fresh resourceVersion) existing.Spec.Quota = specQuota if projectName != "" { existing.Spec.ProjectName = projectName @@ -152,12 +161,16 @@ func (api *HTTPAPI) HandleQuota(w http.ResponseWriter, r *http.Request) { if domainName != "" { existing.Spec.DomainName = domainName } - if err := api.client.Update(ctx, &existing); err != nil { - log.Error(err, "failed to update ProjectQuota", "name", crdName) - api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to update quota: %v", err), startTime) - return + if updateErr := api.client.Update(ctx, &existing); updateErr != nil { + return updateErr } log.V(1).Info("updated ProjectQuota", "name", crdName, "projectID", projectID, "resources", len(specQuota)) + return nil + }) + if err != nil { + log.Error(err, "failed to create/update ProjectQuota", "name", crdName) + api.quotaError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to persist quota: %v", err), startTime) + return } // Return 204 No Content as expected by the LIQUID API diff --git a/internal/scheduling/reservations/commitments/api/quota_test.go b/internal/scheduling/reservations/commitments/api/quota_test.go index 218bc0815..cf66e39f5 100644 --- a/internal/scheduling/reservations/commitments/api/quota_test.go +++ b/internal/scheduling/reservations/commitments/api/quota_test.go @@ -171,16 +171,26 @@ func TestHandleQuota_CreateAndUpdate(t *testing.T) { }, }, }, + metadata: &liquid.ProjectMetadata{ + UUID: "project-abc-123", + Domain: liquid.DomainMetadata{UUID: "domain-1"}, + }, expectQuota: map[string]int64{"hw_version_hana_1_ram": 100}, expectPerAZ: map[string]map[string]int64{ "hw_version_hana_1_ram": {"az-a": 60, "az-b": 40}, }, + expectDomain: "domain-1", }, { - name: "Create_EmptyResources", - projectID: "project-empty", - resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{}, - expectQuota: map[string]int64{}, + name: "Create_EmptyResources", + projectID: "project-empty", + 
resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{}, + metadata: &liquid.ProjectMetadata{ + UUID: "project-empty", + Domain: liquid.DomainMetadata{UUID: "domain-1"}, + }, + expectQuota: map[string]int64{}, + expectDomain: "domain-1", }, { name: "Create_WithMetadata", @@ -215,6 +225,14 @@ func TestHandleQuota_CreateAndUpdate(t *testing.T) { }, }, projectID: "project-xyz", + metadata: &liquid.ProjectMetadata{ + UUID: "project-xyz", + Name: "original-project-name", + Domain: liquid.DomainMetadata{ + UUID: "original-domain", + Name: "original-domain-name", + }, + }, resources: map[liquid.ResourceName]liquid.ResourceQuotaRequest{ "hw_version_hana_1_ram": { Quota: 200, diff --git a/internal/scheduling/reservations/failover/vm_source.go b/internal/scheduling/reservations/failover/vm_source.go index bcf935798..5a9603b79 100644 --- a/internal/scheduling/reservations/failover/vm_source.go +++ b/internal/scheduling/reservations/failover/vm_source.go @@ -329,6 +329,7 @@ func buildVMsFromHypervisors(hypervisorList *hv1.HypervisorList, postgresVMs []V ProjectID: pgVM.ProjectID, CurrentHypervisor: hv.Name, // Use hypervisor CRD location, not postgres AvailabilityZone: pgVM.AvailabilityZone, + CreatedAt: pgVM.CreatedAt, Resources: pgVM.Resources, FlavorExtraSpecs: pgVM.FlavorExtraSpecs, } diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go index 052b2b685..53e87c51c 100644 --- a/internal/scheduling/reservations/quota/controller.go +++ b/internal/scheduling/reservations/quota/controller.go @@ -127,8 +127,8 @@ func (c *QuotaController) ReconcilePeriodic(ctx context.Context) error { // Derive PaygUsage = TotalUsage - CRUsage (clamp >= 0) paygUsage := derivePaygUsage(projectTotalUsage, crUsage) - // Write status with conflict retry - if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, projectTotalUsage, paygUsage); err != nil { + // Write status with conflict retry (full reconcile sets 
LastFullReconcileAt) + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, projectTotalUsage, paygUsage, true); err != nil { logger.Error(err, "failed to update ProjectQuota status", "project", projectID) skipped++ continue @@ -200,7 +200,7 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl paygUsage := derivePaygUsage(totalUsage, crUsage) // Write updated PaygUsage with conflict retry (keep TotalUsage unchanged) - if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage); err != nil { + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage, false); err != nil { logger.Error(err, "failed to update ProjectQuota status") return ctrl.Result{}, err } @@ -408,12 +408,12 @@ func (c *QuotaController) isVMNewSinceLastReconcile(ctx context.Context, vm *fai return false } - if pq.Status.LastReconcileAt == nil { + if pq.Status.LastFullReconcileAt == nil { // No full reconcile has run yet -- skip incremental updates return false } - // Parse the VM's creation time and compare with last reconcile + // Parse the VM's creation time and compare with last FULL reconcile vmCreatedAt, err := time.Parse("2006-01-02T15:04:05Z", vm.CreatedAt) if err != nil { // Try alternative format with timezone offset @@ -424,7 +424,7 @@ func (c *QuotaController) isVMNewSinceLastReconcile(ctx context.Context, vm *fai } } - return vmCreatedAt.After(pq.Status.LastReconcileAt.Time) + return vmCreatedAt.After(pq.Status.LastFullReconcileAt.Time) } // accumulateRemovedVM looks up a deleted VM and accumulates its resource contribution as a decrement. @@ -554,8 +554,9 @@ func (c *QuotaController) applyDeltaAndUpdateStatus( func (c *QuotaController) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). Named("quota-controller"). - // Watch ProjectQuota for spec changes (Limes pushes quota) - For(&v1alpha1.ProjectQuota{}). 
+ // Watch ProjectQuota for spec/generation changes only (ignore status-only updates + // to avoid infinite reconcile loops since Reconcile() updates status). + For(&v1alpha1.ProjectQuota{}, builder.WithPredicates(projectQuotaGenerationChangePredicate())). // Watch CommittedResource for status changes (UsedAmount updates) Watches( &v1alpha1.CommittedResource{}, @@ -776,11 +777,13 @@ func derivePaygUsage( // updateProjectQuotaStatusWithRetry writes TotalUsage + PaygUsage + LastReconcileAt // with retry-on-conflict to handle concurrent updates. +// If fullReconcile is true, also updates LastFullReconcileAt. func (c *QuotaController) updateProjectQuotaStatusWithRetry( ctx context.Context, pqName string, totalUsage map[string]v1alpha1.ResourceQuotaUsage, paygUsage map[string]v1alpha1.ResourceQuotaUsage, + fullReconcile bool, ) error { return retry.RetryOnConflict(retry.DefaultRetry, func() error { @@ -794,7 +797,9 @@ func (c *QuotaController) updateProjectQuotaStatusWithRetry( pq.Status.PaygUsage = paygUsage now := metav1.Now() pq.Status.LastReconcileAt = &now - + if fullReconcile { + pq.Status.LastFullReconcileAt = &now + } return c.Status().Update(ctx, &pq) }) } @@ -911,6 +916,12 @@ func crUsedAmountChangePredicate() predicate.Predicate { } } +// projectQuotaGenerationChangePredicate triggers only when the ProjectQuota's generation changes +// (i.e., spec was modified). This prevents infinite reconcile loops from status-only updates. +func projectQuotaGenerationChangePredicate() predicate.Predicate { + return predicate.GenerationChangedPredicate{} +} + // hvInstanceChangePredicate always returns true for updates. // ReconcileHVDiff performs its own set-diff and exits early if there are no // actual additions/removals. 
This ensures instance swaps (same count, different IDs) diff --git a/internal/scheduling/reservations/quota/controller_test.go b/internal/scheduling/reservations/quota/controller_test.go index 4005af326..68eb8dd33 100644 --- a/internal/scheduling/reservations/quota/controller_test.go +++ b/internal/scheduling/reservations/quota/controller_test.go @@ -501,7 +501,8 @@ func TestAccumulateAddedVM_KnownFlavor(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "quota-project-a"}, Spec: v1alpha1.ProjectQuotaSpec{ProjectID: "project-a"}, Status: v1alpha1.ProjectQuotaStatus{ - LastReconcileAt: &lastReconcile, + LastReconcileAt: &lastReconcile, + LastFullReconcileAt: &lastReconcile, }, } From 53898cfc63e57c6a67b6b6380821e0db48d3f840 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Tue, 5 May 2026 17:22:17 +0200 Subject: [PATCH 05/10] wip local testing --- cmd/manager/main.go | 62 +++++++++++++++++++ helm/bundles/cortex-nova/values.yaml | 6 ++ helm/library/cortex/templates/rbac/role.yaml | 3 + .../reservations/quota/controller.go | 43 +++++++++++-- 4 files changed, 108 insertions(+), 6 deletions(-) diff --git a/cmd/manager/main.go b/cmd/manager/main.go index ba1cd52e8..026801b5b 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -59,6 +59,7 @@ import ( "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" commitmentsapi "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments/api" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover" + "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/quota" "github.com/cobaltcore-dev/cortex/pkg/conf" "github.com/cobaltcore-dev/cortex/pkg/monitoring" "github.com/cobaltcore-dev/cortex/pkg/multicluster" @@ -689,6 +690,67 @@ func main() { "vmSelectionRotationInterval", failoverConfig.VMSelectionRotationInterval) } + if slices.Contains(mainConfig.EnabledControllers, "quota-controller") { + setupLog.Info("enabling controller", "controller", 
"quota-controller") + quotaConfig := conf.GetConfigOrDie[quota.QuotaControllerConfig]() + quotaConfig.ApplyDefaults() + + // Get datasource name from the failover/commitments config (shared dependency) + datasourceName := conf.GetConfigOrDie[failover.FailoverConfig]().DatasourceName + if datasourceName == "" { + setupLog.Error(nil, "quota-controller requires datasourceName to be configured") + os.Exit(1) + } + + quotaMetrics := quota.NewQuotaMetrics(metrics.Registry) + + // Defer initialization until the manager starts (cache must be ready for postgres reader) + if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + // Create PostgresReader from the configured Datasource CRD + postgresReader, err := external.NewPostgresReader(ctx, multiclusterClient, datasourceName) + if err != nil { + setupLog.Error(err, "unable to create postgres reader for quota controller", + "datasourceName", datasourceName) + return err + } + + // Create NovaReader and DBVMSource + novaReader := external.NewNovaReader(postgresReader) + vmSource := failover.NewDBVMSource(novaReader) + + // Create the quota controller + quotaController := quota.NewQuotaController( + multiclusterClient, + vmSource, + quotaConfig, + quotaMetrics, + ) + + // Set up the watch-based reconciler (ProjectQuota spec changes, CR changes) + if err := quotaController.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to set up quota controller") + return err + } + + // Set up the HV watcher for incremental TotalUsage updates + if err := quotaController.SetupHVWatcher(mgr); err != nil { + setupLog.Error(err, "unable to set up quota HV watcher") + return err + } + + setupLog.Info("quota-controller starting", + "fullReconcileInterval", quotaConfig.FullReconcileInterval.Duration, + "crStateFilter", quotaConfig.CRStateFilter) + + // Start the periodic full reconciliation loop + return quotaController.Start(ctx) + })); err != nil { + setupLog.Error(err, "unable to add quota controller to 
manager") + os.Exit(1) + } + setupLog.Info("quota-controller registered") + } + // +kubebuilder:scaffold:builder if metricsCertWatcher != nil { diff --git a/helm/bundles/cortex-nova/values.yaml b/helm/bundles/cortex-nova/values.yaml index 65ad879dd..75a02115c 100644 --- a/helm/bundles/cortex-nova/values.yaml +++ b/helm/bundles/cortex-nova/values.yaml @@ -95,6 +95,8 @@ cortex: &cortex - cortex.cloud/v1alpha1/ReservationList - cortex.cloud/v1alpha1/CommittedResource - cortex.cloud/v1alpha1/CommittedResourceList + - cortex.cloud/v1alpha1/ProjectQuota + - cortex.cloud/v1alpha1/ProjectQuotaList - kvm.cloud.sap/v1/Hypervisor - kvm.cloud.sap/v1/HypervisorList - v1/Secret @@ -133,6 +135,7 @@ cortex-scheduling-controllers: - hypervisor-overcommit-controller - committed-resource-reservations-controller - failover-reservations-controller + - quota-controller enabledTasks: - nova-history-cleanup-task # If true, the external scheduler API will limit the list of hosts in its @@ -170,6 +173,9 @@ cortex-scheduling-controllers: # Whether the report-capacity API endpoint is active # When false, the endpoint returns HTTP 503. committedResourceEnableReportCapacityAPI: true + # Whether the quota API endpoint is active + # When false, the endpoint returns HTTP 503. + committedResourceEnableQuotaAPI: true # OvercommitMappings is a list of mappings that map hypervisor traits to # overcommit ratios. 
Note that this list is applied in order, so if there # are multiple mappings applying to the same hypervisors, the last mapping diff --git a/helm/library/cortex/templates/rbac/role.yaml b/helm/library/cortex/templates/rbac/role.yaml index ea75c6897..9d4a6903c 100644 --- a/helm/library/cortex/templates/rbac/role.yaml +++ b/helm/library/cortex/templates/rbac/role.yaml @@ -14,6 +14,7 @@ rules: - datasources - reservations - committedresources + - projectquotas - decisions - deschedulings - pipelines @@ -34,6 +35,7 @@ rules: - datasources/finalizers - reservations/finalizers - committedresources/finalizers + - projectquotas/finalizers - decisions/finalizers - deschedulings/finalizers - pipelines/finalizers @@ -48,6 +50,7 @@ rules: - datasources/status - reservations/status - committedresources/status + - projectquotas/status - decisions/status - deschedulings/status - pipelines/status diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go index 53e87c51c..2a63e8a24 100644 --- a/internal/scheduling/reservations/quota/controller.go +++ b/internal/scheduling/reservations/quota/controller.go @@ -180,9 +180,14 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Read persisted TotalUsage (already computed by full reconcile or incremental) totalUsage := pq.Status.TotalUsage if totalUsage == nil { - // No TotalUsage yet — full reconcile hasn't run. Skip. - logger.V(1).Info("no TotalUsage persisted yet, skipping PaygUsage recompute") - return ctrl.Result{}, nil + // No TotalUsage yet — compute it now for this single project (bootstrap case). 
+ logger.Info("no TotalUsage persisted yet, computing for this project") + var err error + totalUsage, err = c.computeTotalUsageForProject(ctx, projectID) + if err != nil { + logger.Error(err, "failed to compute TotalUsage for project") + return ctrl.Result{}, err + } } // List CRs for this project (from local cache) @@ -199,8 +204,9 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Derive PaygUsage paygUsage := derivePaygUsage(totalUsage, crUsage) - // Write updated PaygUsage with conflict retry (keep TotalUsage unchanged) - if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage, false); err != nil { + // Write updated status with conflict retry (full=true for bootstrap to set LastFullReconcileAt) + isBootstrap := pq.Status.TotalUsage == nil + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage, isBootstrap); err != nil { logger.Error(err, "failed to update ProjectQuota status") return ctrl.Result{}, err } @@ -208,10 +214,35 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Record metrics c.recordUsageMetrics(projectID, totalUsage, paygUsage, crUsage) - logger.V(1).Info("PaygUsage recomputed", "project", projectID) + logger.V(1).Info("PaygUsage recomputed", "project", projectID, "bootstrap", isBootstrap) return ctrl.Result{}, nil } +// computeTotalUsageForProject computes TotalUsage for a single project by reading +// all VMs from Postgres and filtering to the target project. Used as bootstrap when +// a ProjectQuota is first created and has no persisted TotalUsage yet. 
+func (c *QuotaController) computeTotalUsageForProject(ctx context.Context, projectID string) (map[string]v1alpha1.ResourceQuotaUsage, error) { + // Fetch flavor groups from Knowledge CRD + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + return nil, fmt.Errorf("failed to get flavor groups: %w", err) + } + + // Build flavorName → flavorGroup lookup + flavorToGroup := buildFlavorToGroupMap(flavorGroups) + + // Fetch all VMs and compute usage (only the target project's data will be used) + vms, err := c.VMSource.ListVMs(ctx) + if err != nil { + return nil, fmt.Errorf("failed to list VMs: %w", err) + } + + // Compute totalUsage for all projects and return just this one + totalUsageByProject := c.computeTotalUsage(vms, flavorToGroup, flavorGroups) + return totalUsageByProject[projectID], nil +} + // ============================================================================ // Incremental Update (HV Instance Diff) // ============================================================================ From 9552b02b5989001eacdf4e2d1d6268157e4524c6 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Thu, 7 May 2026 11:54:39 +0200 Subject: [PATCH 06/10] misc from local testing --- api/v1alpha1/project_quota_types.go | 6 +++ .../crds/cortex.cloud_projectquotas.yaml | 7 +++ .../reservations/quota/controller.go | 44 ++++++++++++++----- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/api/v1alpha1/project_quota_types.go b/api/v1alpha1/project_quota_types.go index 113a0aa1f..fecac57cc 100644 --- a/api/v1alpha1/project_quota_types.go +++ b/api/v1alpha1/project_quota_types.go @@ -69,6 +69,12 @@ type ProjectQuotaSpec struct { // Usage values correspond to liquid.AZResourceUsageReport fields reported via /report-usage. 
// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid#AZResourceUsageReport type ProjectQuotaStatus struct { + // ObservedGeneration is the most recent spec generation that the controller has processed. + // Used to distinguish spec changes (which require TotalUsage recompute) from + // CommittedResource changes (which only need PaygUsage recompute). + // +kubebuilder:validation:Optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + // TotalUsage tracks per-resource per-AZ total resource consumption (all VMs in this project). // Persisted by the quota controller; updated by full reconcile and HV instance diffs. // Key: liquid.ResourceName diff --git a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml index c196daf42..c9183638b 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_projectquotas.yaml @@ -191,6 +191,13 @@ spec: this project's quota (any path). format: date-time type: string + observedGeneration: + description: |- + ObservedGeneration is the most recent spec generation that the controller has processed. + Used to distinguish spec changes (which require TotalUsage recompute) from + CommittedResource changes (which only need PaygUsage recompute). + format: int64 + type: integer paygUsage: additionalProperties: description: ResourceQuotaUsage holds per-AZ PAYG usage for a single diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go index 2a63e8a24..6deecce71 100644 --- a/internal/scheduling/reservations/quota/controller.go +++ b/internal/scheduling/reservations/quota/controller.go @@ -158,11 +158,14 @@ func (c *QuotaController) ReconcilePeriodic(ctx context.Context) error { // Reconcile handles watch-based reconciliation for a single ProjectQuota. 
// Triggered by: CR Status.UsedAmount changes or ProjectQuota spec changes. -// It reads the persisted TotalUsage, re-lists CRs, and recomputes PaygUsage. +// +// Behavior depends on what changed: +// - Spec change (Generation > ObservedGeneration): recomputes TotalUsage from Postgres + PaygUsage +// - CR UsedAmount change (Generation == ObservedGeneration): reads persisted TotalUsage, recomputes PaygUsage only func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { ctx = WithNewGlobalRequestID(ctx) logger := LoggerFromContext(ctx).WithValues("projectQuota", req.Name, "mode", "payg-recompute") - logger.V(1).Info("reconciling ProjectQuota (PaygUsage recompute)") + logger.V(1).Info("reconciling ProjectQuota") // Fetch the ProjectQuota var pq v1alpha1.ProjectQuota @@ -177,17 +180,34 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl projectID := pq.Spec.ProjectID ctx = reservations.WithRequestID(ctx, projectID) - // Read persisted TotalUsage (already computed by full reconcile or incremental) - totalUsage := pq.Status.TotalUsage - if totalUsage == nil { - // No TotalUsage yet — compute it now for this single project (bootstrap case). - logger.Info("no TotalUsage persisted yet, computing for this project") + // Determine if this is a spec change (new CRD or quota update) vs. 
a CR UsedAmount change + specChanged := pq.Generation > pq.Status.ObservedGeneration + + + var totalUsage map[string]v1alpha1.ResourceQuotaUsage + if specChanged { + // Spec changed (new CRD or quota update) — recompute TotalUsage from Postgres + logger.Info("spec changed, recomputing TotalUsage from Postgres", + "generation", pq.Generation, "observedGeneration", pq.Status.ObservedGeneration) var err error totalUsage, err = c.computeTotalUsageForProject(ctx, projectID) if err != nil { logger.Error(err, "failed to compute TotalUsage for project") return ctrl.Result{}, err } + } else { + // CR UsedAmount changed — read persisted TotalUsage, only recompute PaygUsage + totalUsage = pq.Status.TotalUsage + if totalUsage == nil { + // Safety fallback: TotalUsage should always be set after first spec reconcile + logger.Info("no TotalUsage persisted, computing as fallback") + var err error + totalUsage, err = c.computeTotalUsageForProject(ctx, projectID) + if err != nil { + logger.Error(err, "failed to compute TotalUsage for project") + return ctrl.Result{}, err + } + } } // List CRs for this project (from local cache) @@ -204,9 +224,8 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Derive PaygUsage paygUsage := derivePaygUsage(totalUsage, crUsage) - // Write updated status with conflict retry (full=true for bootstrap to set LastFullReconcileAt) - isBootstrap := pq.Status.TotalUsage == nil - if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage, isBootstrap); err != nil { + // Write updated status with conflict retry + if err := c.updateProjectQuotaStatusWithRetry(ctx, pq.Name, totalUsage, paygUsage, specChanged); err != nil { logger.Error(err, "failed to update ProjectQuota status") return ctrl.Result{}, err } @@ -214,7 +233,7 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Record metrics c.recordUsageMetrics(projectID, totalUsage, paygUsage, crUsage) - 
logger.V(1).Info("PaygUsage recomputed", "project", projectID, "bootstrap", isBootstrap) + logger.V(1).Info("reconcile completed", "project", projectID, "specChanged", specChanged) return ctrl.Result{}, nil } @@ -808,7 +827,7 @@ func derivePaygUsage( // updateProjectQuotaStatusWithRetry writes TotalUsage + PaygUsage + LastReconcileAt // with retry-on-conflict to handle concurrent updates. -// If fullReconcile is true, also updates LastFullReconcileAt. +// If fullReconcile is true, also updates LastFullReconcileAt and ObservedGeneration. func (c *QuotaController) updateProjectQuotaStatusWithRetry( ctx context.Context, pqName string, @@ -826,6 +845,7 @@ func (c *QuotaController) updateProjectQuotaStatusWithRetry( pq.Status.TotalUsage = totalUsage pq.Status.PaygUsage = paygUsage + pq.Status.ObservedGeneration = pq.Generation now := metav1.Now() pq.Status.LastReconcileAt = &now if fullReconcile { From 6ece7da0bf5f4bbda6669469f206848c01e2db28 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Thu, 7 May 2026 14:06:21 +0200 Subject: [PATCH 07/10] . 
--- cmd/manager/main.go | 4 +++- internal/scheduling/reservations/quota/controller.go | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 026801b5b..0fe26ce9b 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -696,7 +696,9 @@ func main() { quotaConfig.ApplyDefaults() // Get datasource name from the failover/commitments config (shared dependency) - datasourceName := conf.GetConfigOrDie[failover.FailoverConfig]().DatasourceName + failoverCfg := conf.GetConfigOrDie[failover.FailoverConfig]() + failoverCfg.ApplyDefaults() + datasourceName := failoverCfg.DatasourceName if datasourceName == "" { setupLog.Error(nil, "quota-controller requires datasourceName to be configured") os.Exit(1) diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go index 6deecce71..22403d18f 100644 --- a/internal/scheduling/reservations/quota/controller.go +++ b/internal/scheduling/reservations/quota/controller.go @@ -183,7 +183,6 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Determine if this is a spec change (new CRD or quota update) vs. a CR UsedAmount change specChanged := pq.Generation > pq.Status.ObservedGeneration - var totalUsage map[string]v1alpha1.ResourceQuotaUsage if specChanged { // Spec changed (new CRD or quota update) — recompute TotalUsage from Postgres From a24e58823d905e79fc704cda84d22a30f891ee2f Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Thu, 7 May 2026 14:11:18 +0200 Subject: [PATCH 08/10] . 
--- .claude/settings.local.json | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 .claude/settings.local.json diff --git a/.claude/settings.local.json b/.claude/settings.local.json deleted file mode 100644 index 36798fdd7..000000000 --- a/.claude/settings.local.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "permissions": { - "allow": [ - "Read(//root/**)", - "Bash(go doc:*)" - ] - } -} From 03c3f58954f6880c323435120ef149bc8e989c86 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Thu, 7 May 2026 12:57:45 +0000 Subject: [PATCH 09/10] fix: migrate quota controller from UsedAmount to UsedResources after merge After merging main, the CommittedResource CRD changed from Status.UsedAmount (*resource.Quantity) to Status.UsedResources (map[string]resource.Quantity). This commit updates: - computeCRUsage: reads UsedResources["memory"] and UsedResources["cpu"] and converts memory bytes to commitment units (multiples of smallest flavor) - applyDeltaAndUpdateStatus: accepts flavorGroups parameter for unit conversion - Reconcile: fetches flavorGroups before computing CRUsage - crUsedAmountChangePredicate: compares UsedResources map instead of UsedAmount - Tests: updated to use UsedResources with proper byte/core values --- .../reservations/quota/controller.go | 73 +++++++++++++------ .../reservations/quota/controller_test.go | 25 ++++--- .../reservations/quota/integration_test.go | 31 ++++++-- 3 files changed, 92 insertions(+), 37 deletions(-) diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go index 22403d18f..6e2d3630c 100644 --- a/internal/scheduling/reservations/quota/controller.go +++ b/internal/scheduling/reservations/quota/controller.go @@ -122,9 +122,9 @@ func (c *QuotaController) ReconcilePeriodic(ctx context.Context) error { projectTotalUsage := totalUsageByProject[projectID] // Compute CRUsage for this project (using pre-grouped CRs) - crUsage := c.computeCRUsage(crsByProject[projectID]) + 
crUsage := c.computeCRUsage(crsByProject[projectID], flavorGroups) - // Derive PaygUsage = TotalUsage - CRUsage (clamp >= 0) + // Derive PaygUsage paygUsage := derivePaygUsage(projectTotalUsage, crUsage) // Write status with conflict retry (full reconcile sets LastFullReconcileAt) @@ -209,6 +209,14 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl } } + // Fetch flavor groups for CRUsage computation + flavorGroupClient := &reservations.FlavorGroupKnowledgeClient{Client: c.Client} + flavorGroups, err := flavorGroupClient.GetAllFlavorGroups(ctx, nil) + if err != nil { + logger.Error(err, "failed to get flavor groups") + return ctrl.Result{}, err + } + // List CRs for this project (from local cache) var crList v1alpha1.CommittedResourceList if err := c.List(ctx, &crList); err != nil { @@ -218,7 +226,7 @@ func (c *QuotaController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl crsByProject := groupCRsByProject(crList.Items) // Compute CRUsage - crUsage := c.computeCRUsage(crsByProject[projectID]) + crUsage := c.computeCRUsage(crsByProject[projectID], flavorGroups) // Derive PaygUsage paygUsage := derivePaygUsage(totalUsage, crUsage) @@ -366,7 +374,7 @@ func (c *QuotaController) ReconcileHVDiff(ctx context.Context, oldHV, newHV *hv1 crsByProject := groupCRsByProject(crList.Items) for projectID, delta := range projectDeltas { - if err := c.applyDeltaAndUpdateStatus(ctx, projectID, delta, crsByProject[projectID]); err != nil { + if err := c.applyDeltaAndUpdateStatus(ctx, projectID, delta, crsByProject[projectID], flavorGroups); err != nil { logger.Error(err, "failed to apply delta for project", "project", projectID) // Continue with other projects } @@ -546,6 +554,7 @@ func (c *QuotaController) applyDeltaAndUpdateStatus( projectID string, delta *usageDelta, projectCRs []v1alpha1.CommittedResource, + flavorGroups map[string]compute.FlavorGroupFeature, ) error { crdName := "quota-" + projectID @@ -579,7 +588,7 @@ func (c 
*QuotaController) applyDeltaAndUpdateStatus( } // Recompute PaygUsage - crUsage := c.computeCRUsage(projectCRs) + crUsage := c.computeCRUsage(projectCRs, flavorGroups) paygUsage := derivePaygUsage(pq.Status.TotalUsage, crUsage) pq.Status.PaygUsage = paygUsage @@ -740,7 +749,8 @@ func groupCRsByProject(crs []v1alpha1.CommittedResource) map[string][]v1alpha1.C } // computeCRUsage computes the committed resource usage from a pre-filtered slice of CRs for one project. -func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource) map[string]v1alpha1.ResourceQuotaUsage { +// It reads UsedResources from each CR's status and converts to commitment units (multiples for RAM, raw for cores). +func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavorGroups map[string]compute.FlavorGroupFeature) map[string]v1alpha1.ResourceQuotaUsage { result := make(map[string]v1alpha1.ResourceQuotaUsage) for i := range crs { @@ -751,26 +761,44 @@ func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource) map[s continue } - // Get UsedAmount from status - if cr.Status.UsedAmount == nil { - continue - } - usedAmount := cr.Status.UsedAmount.Value() - if usedAmount <= 0 { + // Get used amount from UsedResources map + if len(cr.Status.UsedResources) == 0 { continue } - // Map ResourceType to resource name + // Map ResourceType to resource name and extract used amount var resourceName string + var usedAmount int64 switch cr.Spec.ResourceType { case v1alpha1.CommittedResourceTypeMemory: resourceName = commitments.ResourceNameRAM(cr.Spec.FlavorGroupName) + memQty, ok := cr.Status.UsedResources["memory"] + if !ok { + continue + } + // Convert bytes to commitment units (multiples of smallest flavor) + usedBytes := memQty.Value() + fg, ok := flavorGroups[cr.Spec.FlavorGroupName] + if !ok || fg.SmallestFlavor.MemoryMB == 0 { + continue + } + unitSizeBytes := int64(fg.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // safe + usedAmount = 
usedBytes / unitSizeBytes case v1alpha1.CommittedResourceTypeCores: resourceName = commitments.ResourceNameCores(cr.Spec.FlavorGroupName) + cpuQty, ok := cr.Status.UsedResources["cpu"] + if !ok { + continue + } + usedAmount = cpuQty.Value() default: continue } + if usedAmount <= 0 { + continue + } + // Accumulate per AZ usage := result[resourceName] if usage.PerAZ == nil { usage.PerAZ = make(map[string]int64) } @@ -940,7 +968,7 @@ func (c *QuotaController) mapCRToProjectQuota(_ context.Context, obj client.Obje } } -// crUsedAmountChangePredicate triggers only when Status.UsedAmount changes on a CommittedResource. +// crUsedAmountChangePredicate triggers only when Status.UsedResources changes on a CommittedResource. func crUsedAmountChangePredicate() predicate.Predicate { return predicate.Funcs{ CreateFunc: func(_ event.CreateEvent) bool { return false }, @@ -950,16 +978,17 @@ func crUsedAmountChangePredicate() predicate.Predicate { if !ok1 || !ok2 { return false } - // Trigger if UsedAmount changed - oldUsed := "" - newUsed := "" - if oldCR.Status.UsedAmount != nil { - oldUsed = oldCR.Status.UsedAmount.String() + // Trigger if UsedResources changed + if len(oldCR.Status.UsedResources) != len(newCR.Status.UsedResources) { + return true } - if newCR.Status.UsedAmount != nil { - newUsed = newCR.Status.UsedAmount.String() + for key, oldQty := range oldCR.Status.UsedResources { + newQty, ok := newCR.Status.UsedResources[key] + if !ok || oldQty.Cmp(newQty) != 0 { + return true + } } - return oldUsed != newUsed + return false }, DeleteFunc: func(_ event.DeleteEvent) bool { return true }, GenericFunc: func(_ event.GenericEvent) bool { return false }, diff --git a/internal/scheduling/reservations/quota/controller_test.go b/internal/scheduling/reservations/quota/controller_test.go index 68eb8dd33..d503b363f 100644 --- a/internal/scheduling/reservations/quota/controller_test.go +++ b/internal/scheduling/reservations/quota/controller_test.go @@ -137,9 +137,14 @@ func TestComputeTotalUsage(t *testing.T) { func 
TestComputeCRUsage(t *testing.T) { ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} - usedAmount5 := resource.MustParse("5") - usedAmount3 := resource.MustParse("3") - usedAmount2 := resource.MustParse("2") + // Flavor groups with SmallestFlavor.MemoryMB = 1 for simple unit conversion in tests + // (1 multiple = 1 MiB = 1048576 bytes) + testFlavorGroups := map[string]compute.FlavorGroupFeature{ + "hana_v2": { + SmallestFlavor: compute.FlavorInGroup{Name: "m1.hana_v2.small", MemoryMB: 1}, + Flavors: []compute.FlavorInGroup{{Name: "m1.hana_v2.small", MemoryMB: 1}}, + }, + } allCRs := []v1alpha1.CommittedResource{ { @@ -151,7 +156,7 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusConfirmed, }, Status: v1alpha1.CommittedResourceStatus{ - UsedAmount: &usedAmount5, + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("5Mi")}, }, }, { @@ -163,7 +168,7 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusGuaranteed, }, Status: v1alpha1.CommittedResourceStatus{ - UsedAmount: &usedAmount3, + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("3Mi")}, }, }, { @@ -175,7 +180,7 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusConfirmed, }, Status: v1alpha1.CommittedResourceStatus{ - UsedAmount: &usedAmount2, + UsedResources: map[string]resource.Quantity{"cpu": resource.MustParse("2")}, }, }, // Different project — should be excluded by groupCRsByProject @@ -188,7 +193,7 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusConfirmed, }, Status: v1alpha1.CommittedResourceStatus{ - UsedAmount: &usedAmount5, + UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("5Mi")}, }, }, // Pending state — should be excluded by state filter @@ -201,14 +206,14 @@ func TestComputeCRUsage(t *testing.T) { State: v1alpha1.CommitmentStatusPending, }, Status: v1alpha1.CommittedResourceStatus{ - UsedAmount: &usedAmount2, + 
UsedResources: map[string]resource.Quantity{"memory": resource.MustParse("2Mi")}, }, }, } // Pre-group and pass only project-a's CRs crsByProject := groupCRsByProject(allCRs) - result := ctrl.computeCRUsage(crsByProject["project-a"]) + result := ctrl.computeCRUsage(crsByProject["project-a"], testFlavorGroups) // Should include confirmed + guaranteed for project-a only ramUsage := result["hw_version_hana_v2_ram"] @@ -437,7 +442,7 @@ func TestReconcile_NilTotalUsage(t *testing.T) { ctrl := &QuotaController{Config: DefaultQuotaControllerConfig()} // computeCRUsage on nil slice should return empty map (no panic) - result := ctrl.computeCRUsage(nil) + result := ctrl.computeCRUsage(nil, nil) if len(result) != 0 { t.Errorf("expected empty result for nil CRs, got %d entries", len(result)) } diff --git a/internal/scheduling/reservations/quota/integration_test.go b/internal/scheduling/reservations/quota/integration_test.go index 32ffadc7c..740341d9a 100644 --- a/internal/scheduling/reservations/quota/integration_test.go +++ b/internal/scheduling/reservations/quota/integration_test.go @@ -1081,13 +1081,12 @@ func (env *integrationTestEnv) executeAction(action TestAction) { } case "cr_update": - // Fetch the CR, update UsedAmount, then call Reconcile + // Fetch the CR, update UsedResources, then call Reconcile var cr v1alpha1.CommittedResource if err := env.client.Get(ctx, client.ObjectKey{Name: action.CRName}, &cr); err != nil { env.t.Fatalf("failed to get CR %s: %v", action.CRName, err) } - usedQty := resource.NewQuantity(action.UsedAmount, resource.DecimalSI) - cr.Status.UsedAmount = usedQty + cr.Status.UsedResources = usedResourcesFromMultiples(cr.Spec.ResourceType, cr.Spec.FlavorGroupName, action.UsedAmount) if err := env.client.Status().Update(ctx, &cr); err != nil { env.t.Fatalf("failed to update CR %s status: %v", action.CRName, err) } @@ -1197,12 +1196,34 @@ func makeCR(name, projectID, flavorGroup, az string, resourceType v1alpha1.Commi }, } if usedAmount != nil { - 
qty := resource.NewQuantity(*usedAmount, resource.DecimalSI) - cr.Status.UsedAmount = qty + cr.Status.UsedResources = usedResourcesFromMultiples(resourceType, flavorGroup, *usedAmount) } return cr } +// usedResourcesFromMultiples converts a "multiples" value (the old UsedAmount unit) to UsedResources. +// For memory: multiples * smallestFlavorMB * 1024 * 1024 = bytes. +// For cores: the value is used directly. +func usedResourcesFromMultiples(resourceType v1alpha1.CommittedResourceType, flavorGroup string, multiples int64) map[string]resource.Quantity { + switch resourceType { + case v1alpha1.CommittedResourceTypeMemory: + fg, ok := testFlavorGroups[flavorGroup] + if !ok || fg.SmallestFlavor.MemoryMB == 0 { + return nil + } + bytesVal := multiples * int64(fg.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // test only + return map[string]resource.Quantity{ + "memory": *resource.NewQuantity(bytesVal, resource.BinarySI), + } + case v1alpha1.CommittedResourceTypeCores: + return map[string]resource.Quantity{ + "cpu": *resource.NewQuantity(multiples, resource.DecimalSI), + } + default: + return nil + } +} + func int64Ptr(v int64) *int64 { return &v } // withExtraVMs returns a pointer to testVMs + additional VMs. From 1a2b1f024064d1df69e3ed70e6f541c13ba58a69 Mon Sep 17 00:00:00 2001 From: Malte Viering Date: Thu, 7 May 2026 13:29:42 +0000 Subject: [PATCH 10/10] fix: use AcceptedSpec in computeCRUsage to avoid mis-bucketing during spec transitions Addresses CodeRabbit review feedback: during spec transitions, reading cr.Spec can bucket usage into the wrong AZ/flavor-group. Now prefers cr.Status.AcceptedSpec (the last successful reconcile snapshot) when it exists, falling back to cr.Spec only when AcceptedSpec is nil. 
--- .../reservations/quota/controller.go | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/internal/scheduling/reservations/quota/controller.go b/internal/scheduling/reservations/quota/controller.go index 6e2d3630c..f00d040dc 100644 --- a/internal/scheduling/reservations/quota/controller.go +++ b/internal/scheduling/reservations/quota/controller.go @@ -756,8 +756,15 @@ func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavo for i := range crs { cr := &crs[i] + // Prefer AcceptedSpec (last successful reconcile snapshot) over Spec + // to avoid mis-bucketing during spec transitions. + spec := &cr.Spec + if cr.Status.AcceptedSpec != nil { + spec = cr.Status.AcceptedSpec + } + // Filter: only matching states - if !c.isCRStateIncluded(cr.Spec.State) { + if !c.isCRStateIncluded(spec.State) { continue } @@ -769,23 +776,23 @@ func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavo // Map ResourceType to resource name and extract used amount var resourceName string var usedAmount int64 - switch cr.Spec.ResourceType { + switch spec.ResourceType { case v1alpha1.CommittedResourceTypeMemory: - resourceName = commitments.ResourceNameRAM(cr.Spec.FlavorGroupName) + resourceName = commitments.ResourceNameRAM(spec.FlavorGroupName) memQty, ok := cr.Status.UsedResources["memory"] if !ok { continue } // Convert bytes to commitment units (multiples of smallest flavor) usedBytes := memQty.Value() - fg, ok := flavorGroups[cr.Spec.FlavorGroupName] + fg, ok := flavorGroups[spec.FlavorGroupName] if !ok || fg.SmallestFlavor.MemoryMB == 0 { continue } unitSizeBytes := int64(fg.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec // safe usedAmount = usedBytes / unitSizeBytes case v1alpha1.CommittedResourceTypeCores: - resourceName = commitments.ResourceNameCores(cr.Spec.FlavorGroupName) + resourceName = commitments.ResourceNameCores(spec.FlavorGroupName) cpuQty, ok := cr.Status.UsedResources["cpu"] if !ok { 
continue @@ -804,7 +811,7 @@ func (c *QuotaController) computeCRUsage(crs []v1alpha1.CommittedResource, flavo if usage.PerAZ == nil { usage.PerAZ = make(map[string]int64) } - usage.PerAZ[cr.Spec.AvailabilityZone] += usedAmount + usage.PerAZ[spec.AvailabilityZone] += usedAmount result[resourceName] = usage }